我有一个具有以下结构的数据帧:

# A tibble: 95 x 7
# Groups:   WallReg_2p5 [19]
   CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1  piC_2 piC_3 piC_4
        <int>                <dbl> <chr>       <dbl>  <dbl> <dbl> <dbl>
 1       6561                0.915 African      6.55  6.63  5.84  0.766
 2       6278                0.947 African     15.1   5.59  2.15  2.01 
 3       4394                0.971 African     11.4   3.92  0.774 1.47 
 4       4840                0.994 African      4.70  0.962 6.21  3.54 
 5       4105                0.947 African      6.35  2.10  2.25  3.24 
 6       5228                1.000 Amazonian    8.49  5.00  1.92  2.42 
 7       5089                1.000 Amazonian   15.6   6.48  2.53  2.89 
 8       4939                0.998 Amazonian    5.56  2.94  0.389 2.44 
 9       5088                1.000 Amazonian   12.9   5.16  1.99  3.13 
10       4947                0.998 Amazonian    8.05 11.2   2.54  4.61 
# ... with 85 more rows

这是数据帧子集的 dput() . 我的真实数据集包含10,368行和255,611列

structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L, 
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L, 
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L, 
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L, 
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L, 
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L, 
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L, 
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L, 
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L, 
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L, 
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106, 
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909, 
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603, 
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702, 
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261, 
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545, 
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545, 
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069, 
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603, 
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232, 
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882, 
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799, 
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929, 
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947, 
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947, 
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447, 
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447, 
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688, 
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261, 
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603, 
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447, 
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545, 
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929, 
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508, 
0.831469612302545), WallReg_2p5 = c("African", "African", "African", 
"African", "African", "Amazonian", "Amazonian", "Amazonian", 
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian", 
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian", 
"Australian", "Australian", "Australian", "Australian", "Chinese", 
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian", 
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian", 
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan", 
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan", 
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan", 
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican", 
"Mexican", "Mexican", "Mexican", "Mexican", "North American", 
"North American", "North American", "North American", "North American", 
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic", 
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental", 
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian", 
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian", 
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian", 
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American", 
"South American", "South American", "South American", "South American", 
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684, 
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572, 
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577, 
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576, 
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176, 
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767, 
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198, 
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335, 
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193, 
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283, 
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873, 
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809, 
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784, 
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095, 
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443, 
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618, 
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367, 
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012, 
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888, 
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212, 
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805, 
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088, 
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873, 
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355, 
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613, 
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488, 
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422, 
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539, 
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119, 
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407, 
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303, 
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346, 
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869, 
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828, 
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928, 
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788, 
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704, 
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585, 
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207, 
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562, 
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437, 
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713, 
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509, 
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068, 
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422, 
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917, 
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119, 
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123, 
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781, 
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968, 
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616, 
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917, 
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346, 
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805, 
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413, 
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992, 
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209, 
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967, 
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531, 
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385, 
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844, 
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772, 
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857, 
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059, 
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296, 
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745, 
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558, 
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331, 
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209, 
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029, 
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229, 
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262, 
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234, 
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363, 
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348, 
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061, 
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015, 
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442, 
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402, 
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654, 
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112, 
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219, 
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595, 
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995, 
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808, 
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109, 
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584, 
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103, 
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674, 
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275, 
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956, 
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043, 
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166, 
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414, 
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877, 
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643, 
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting", 
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
    0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 
    45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89, 
    90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
    WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian", 
    "Australian", "Chinese", "Eurasian", "Guineo-Congolian", 
    "Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American", 
    "Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian", 
    "Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA, 
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))

我想要做的是为每个区域生成所有 piC_ 列的加权值 . 每列( x )的过程包括3个步骤:

_999_将 piC_x 列中的每一行乘以 Y_Coord_2p5Weighting 中的值

  • 对每个 WallReg_2p5 组中的加权 piC_x 值求和

  • piC_x 值的总和除以 Y_Coord_2p5Weighting 中每个 WallReg_2p5 组的值之和

经过一些阅读后,似乎 data.table 在大型数据集上比 dplyr 更快,但我愿意使用任何一个包,甚至是 r . 我试图同时做这两件事,但是在使用 data.table 时得到的结果不正确,当我将这个应用到我的完整数据帧时,我担心 dplyr 的速度 . 这是我到目前为止所尝试的

dplyr

df <- df %>% tbl_df() %>% 
  group_by(WallReg_2p5) %>% 
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

weighted <- df %>%
  mutate_at(.funs = funs(.*Y_Coord_2p5Weighting), .vars = vars(starts_with("piC_"))) %>% ## multiply by lat weight
  mutate_at(.funs = funs(sum), .vars = vars(starts_with("piC_"))) %>% ## sum the weighted values
  mutate_at(.funs = funs(./sum(Y_Coord_2p5Weighting)), .vars = vars(starts_with("piC_"))) ## divide weighted values by sum of weights

weighted %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           8.82  8.82  8.82
 2 Amazonian        10.1  10.1  10.1 
 3 Arctico-Siberian 14.5  14.5  14.5 
 4 Australian        8.21  8.21  8.21
 5 Chinese           9.32  9.32  9.32
 6 Eurasian          9.86  9.86  9.86
 7 Guineo-Congolian  7.41  7.41  7.41
 8 Indo-Malayan     13.4  13.4  13.4 
 9 Japanese          5.47  5.47  5.47
10 Madagascan        4.38  4.38  4.38
11 Mexican           6.10  6.10  6.10
12 North American    5.09  5.09  5.09
13 Novozelandic     14.6  14.6  14.6 
14 Oriental         15.2  15.2  15.2 
15 Panamanian       13.2  13.2  13.2 
16 Papua-Melanesian  6.36  6.36  6.36
17 Saharo-Arabian    5.22  5.22  5.22
18 South American   13.2  13.2  13.2 
19 Tibetan           7.01  7.01  7.01

使用 dplyr 我得到正确的值 . 但是,当我使用 data.table 时,我得到的值不正确 . 我的代码基于here,但显然我做错了 .

data.table

df <- df %>% group_by(WallReg_2p5) %>%
  as.data.table(.) %>% setkey(., WallReg_2p5)
is.data.table(df); haskey(df)
[1] TRUE
[1] TRUE

## same as above
df %>% tbl_df %>% group_by(WallReg_2p5) %>% 
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

# https://stackoverflow.com/q/28123098/1710632
indx <- grep("piC_", colnames(df))
for (j in indx) {
  set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights
  set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values
  set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights
}
## wrong values
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           9.27  9.27  9.27
 2 Amazonian         9.27  9.27  9.27
 3 Arctico-Siberian  9.27  9.27  9.27
 4 Australian        9.27  9.27  9.27
 5 Chinese           9.27  9.27  9.27
 6 Eurasian          9.27  9.27  9.27
 7 Guineo-Congolian  9.27  9.27  9.27
 8 Indo-Malayan      9.27  9.27  9.27
 9 Japanese          9.27  9.27  9.27
10 Madagascan        9.27  9.27  9.27
11 Mexican           9.27  9.27  9.27
12 North American    9.27  9.27  9.27
13 Novozelandic      9.27  9.27  9.27
14 Oriental          9.27  9.27  9.27
15 Panamanian        9.27  9.27  9.27
16 Papua-Melanesian  9.27  9.27  9.27
17 Saharo-Arabian    9.27  9.27  9.27
18 South American    9.27  9.27  9.27
19 Tibetan           9.27  9.27  9.27

阅读 ?set() ,声明它不能执行分组操作,但我认为,因为我已经定义了我的组,这个过程将起作用 . 我以前从未使用 data.table ,所以任何指导都会非常感激 .