我有一个具有以下结构的数据帧:
# A tibble: 95 x 7
# Groups: WallReg_2p5 [19]
CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1 piC_2 piC_3 piC_4
<int> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 6561 0.915 African 6.55 6.63 5.84 0.766
2 6278 0.947 African 15.1 5.59 2.15 2.01
3 4394 0.971 African 11.4 3.92 0.774 1.47
4 4840 0.994 African 4.70 0.962 6.21 3.54
5 4105 0.947 African 6.35 2.10 2.25 3.24
6 5228 1.000 Amazonian 8.49 5.00 1.92 2.42
7 5089 1.000 Amazonian 15.6 6.48 2.53 2.89
8 4939 0.998 Amazonian 5.56 2.94 0.389 2.44
9 5088 1.000 Amazonian 12.9 5.16 1.99 3.13
10 4947 0.998 Amazonian 8.05 11.2 2.54 4.61
# ... with 85 more rows
这是数据帧子集的 dput()
. 我的真实数据集包含10,368行和255,611列
structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L,
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L,
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L,
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L,
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L,
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L,
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L,
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L,
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L,
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L,
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106,
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909,
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603,
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702,
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261,
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545,
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545,
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069,
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603,
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232,
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882,
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799,
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929,
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947,
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947,
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447,
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447,
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688,
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261,
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603,
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447,
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545,
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929,
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508,
0.831469612302545), WallReg_2p5 = c("African", "African", "African",
"African", "African", "Amazonian", "Amazonian", "Amazonian",
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian",
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian",
"Australian", "Australian", "Australian", "Australian", "Chinese",
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian",
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian",
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan",
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan",
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan",
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican",
"Mexican", "Mexican", "Mexican", "Mexican", "North American",
"North American", "North American", "North American", "North American",
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic",
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental",
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian",
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian",
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian",
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American",
"South American", "South American", "South American", "South American",
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684,
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572,
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577,
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576,
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176,
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767,
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198,
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335,
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193,
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283,
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873,
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809,
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784,
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095,
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443,
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618,
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367,
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012,
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888,
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212,
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805,
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088,
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873,
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355,
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613,
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488,
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422,
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539,
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119,
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407,
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303,
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346,
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869,
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828,
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928,
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788,
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704,
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585,
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207,
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562,
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437,
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713,
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509,
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068,
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422,
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917,
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119,
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123,
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781,
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968,
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616,
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917,
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346,
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805,
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413,
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992,
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209,
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967,
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531,
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385,
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844,
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772,
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857,
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059,
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296,
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745,
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558,
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331,
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209,
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029,
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229,
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262,
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234,
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363,
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348,
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061,
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015,
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442,
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402,
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654,
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112,
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219,
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595,
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995,
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808,
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109,
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584,
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103,
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674,
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275,
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956,
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043,
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166,
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414,
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877,
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643,
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting",
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44,
45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89,
90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian",
"Australian", "Chinese", "Eurasian", "Guineo-Congolian",
"Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American",
"Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian",
"Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA,
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))
我想要做的是为每个区域生成所有 piC_
列的加权值 . 每列( x
)的过程包括3个步骤:
_999_将 piC_x
列中的每一行乘以 Y_Coord_2p5Weighting
中的值
-
对每个
WallReg_2p5
组中的加权piC_x
值求和 -
将
piC_x
值的总和除以Y_Coord_2p5Weighting
中每个WallReg_2p5
组的值之和
经过一些阅读后,似乎 data.table
在大型数据集上比 dplyr
更快,但我愿意使用任何一个包,甚至是 r
. 我试图同时做这两件事,但是在使用 data.table
时得到的结果不正确,当我将这个应用到我的完整数据帧时,我担心 dplyr
的速度 . 这是我到目前为止所尝试的
dplyr
df <- df %>% tbl_df() %>%
group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
weighted <- df %>%
mutate_at(.funs = funs(.*Y_Coord_2p5Weighting), .vars = vars(starts_with("piC_"))) %>% ## multiply by lat weight
mutate_at(.funs = funs(sum), .vars = vars(starts_with("piC_"))) %>% ## sum the weighted values
mutate_at(.funs = funs(./sum(Y_Coord_2p5Weighting)), .vars = vars(starts_with("piC_"))) ## divide weighted values by sum of weights
weighted %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.82 8.82 8.82
2 Amazonian 10.1 10.1 10.1
3 Arctico-Siberian 14.5 14.5 14.5
4 Australian 8.21 8.21 8.21
5 Chinese 9.32 9.32 9.32
6 Eurasian 9.86 9.86 9.86
7 Guineo-Congolian 7.41 7.41 7.41
8 Indo-Malayan 13.4 13.4 13.4
9 Japanese 5.47 5.47 5.47
10 Madagascan 4.38 4.38 4.38
11 Mexican 6.10 6.10 6.10
12 North American 5.09 5.09 5.09
13 Novozelandic 14.6 14.6 14.6
14 Oriental 15.2 15.2 15.2
15 Panamanian 13.2 13.2 13.2
16 Papua-Melanesian 6.36 6.36 6.36
17 Saharo-Arabian 5.22 5.22 5.22
18 South American 13.2 13.2 13.2
19 Tibetan 7.01 7.01 7.01
使用 dplyr
我得到正确的值 . 但是,当我使用 data.table
时,我得到的值不正确 . 我的代码基于here,但显然我做错了 .
data.table
df <- df %>% group_by(WallReg_2p5) %>%
as.data.table(.) %>% setkey(., WallReg_2p5)
is.data.table(df); haskey(df)
[1] TRUE
[1] TRUE
## same as above
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
# https://stackoverflow.com/q/28123098/1710632
indx <- grep("piC_", colnames(df))
for (j in indx) {
set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights
set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values
set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights
}
## wrong values
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 9.27 9.27 9.27
2 Amazonian 9.27 9.27 9.27
3 Arctico-Siberian 9.27 9.27 9.27
4 Australian 9.27 9.27 9.27
5 Chinese 9.27 9.27 9.27
6 Eurasian 9.27 9.27 9.27
7 Guineo-Congolian 9.27 9.27 9.27
8 Indo-Malayan 9.27 9.27 9.27
9 Japanese 9.27 9.27 9.27
10 Madagascan 9.27 9.27 9.27
11 Mexican 9.27 9.27 9.27
12 North American 9.27 9.27 9.27
13 Novozelandic 9.27 9.27 9.27
14 Oriental 9.27 9.27 9.27
15 Panamanian 9.27 9.27 9.27
16 Papua-Melanesian 9.27 9.27 9.27
17 Saharo-Arabian 9.27 9.27 9.27
18 South American 9.27 9.27 9.27
19 Tibetan 9.27 9.27 9.27
阅读 ?set()
,声明它不能执行分组操作,但我认为,因为我已经定义了我的组,这个过程将起作用 . 我以前从未使用 data.table
,所以任何指导都会非常感激 .