首页 文章

在R中的某些观察之前选择组

提问于
浏览
0

# Sample data: 26 observations of `x1` plus a sex factor `group`.
# The rows form three consecutive runs: 11 males, 7 females, 8 males.
data <- data.frame(
  x1 = c(
    88L, 88L, 94L, 82L, 68L, 72L, 43L, 84L, 65L, 91L, 65L,
    80L, 82L, 63L, 67L, 58L, 100L, 32L,
    75L, 66L, 30L, 12L, 97L, 58L, 14L, 64L
  ),
  group = factor(
    rep(c("male", "female", "male"), times = c(11L, 7L, 8L)),
    levels = c("female", "male")
  )
)

在这个数据中有一个分组变量(性别:男性和女性)。我需要对出现在女性之前的所有男性计算均值和 25% 分位数。出现在女性之后的男性不做处理,女性也不做处理。因此期望的输出为:

x1  group   mean    25%
88  male    76,36   66,5
88  male    76,36   66,5
94  male    76,36   66,5
82  male    76,36   66,5
68  male    76,36   66,5
72  male    76,36   66,5
43  male    76,36   66,5
84  male    76,36   66,5
65  male    76,36   66,5
91  male    76,36   66,5
65  male    76,36   66,5
80  female      
82  female      
63  female      
67  female      
58  female      
100 female      
32  female      
75  male        
66  male        
30  male        
12  male        
97  male        
58  male        
14  male        
64  male

怎么做?

编辑

x1  group
88  male
88  male
94  male
82  male
68  male
72  male
43  male
84  male
65  male
91  male
65  male
80  female
82  female
63  female
67  female
58  female
100 female
32  female
**76,36 male
**76,36 male
30  male
12  male
**76,36 male
58  male
14  male
64  male

结果 .

4 回答

  • 0
    library(dplyr)
    library(data.table)

    # Number each consecutive run of equal `group` values (group2), then
    # compute the statistics per (group, run). Only the first run of males
    # yields values; every other run subsets to an empty vector, giving
    # NaN for the mean and NA for the quantile.
    data %>%
      mutate(group2 = rleid(group)) %>%
      group_by(group, group2) %>%
      mutate(
        MEAN = mean(x1[group2 == 1 & group == "male"]),
        Q25 = quantile(x1[group2 == 1 & group == "male"], probs = 0.25)
      ) %>%
      ungroup() %>%
      select(-group2) %>%   # drop the helper run id
      data.frame()          # plain data frame, only for display
    
    #     x1  group     MEAN  Q25
    # 1   88   male 76.36364 66.5
    # 2   88   male 76.36364 66.5
    # 3   94   male 76.36364 66.5
    # 4   82   male 76.36364 66.5
    # 5   68   male 76.36364 66.5
    # 6   72   male 76.36364 66.5
    # 7   43   male 76.36364 66.5
    # 8   84   male 76.36364 66.5
    # 9   65   male 76.36364 66.5
    # 10  91   male 76.36364 66.5
    # 11  65   male 76.36364 66.5
    # 12  80 female      NaN   NA
    # 13  82 female      NaN   NA
    # 14  63 female      NaN   NA
    # 15  67 female      NaN   NA
    # 16  58 female      NaN   NA
    # 17 100 female      NaN   NA
    # 18  32 female      NaN   NA
    # 19  75   male      NaN   NA
    # 20  66   male      NaN   NA
    # 21  30   male      NaN   NA
    # 22  12   male      NaN   NA
    # 23  97   male      NaN   NA
    # 24  58   male      NaN   NA
    # 25  14   male      NaN   NA
    # 26  64   male      NaN   NA
    

    要根据您提到的逻辑更新 x1 列,您可以使用:

    # Same statistics as above, then overwrite x1 in the second male run
    # (run id 3) with the first-run mean wherever x1 exceeds the first-run
    # 25% quantile. unique(...[!is.na(...)]) extracts the single non-missing
    # statistic so that it can be compared against every row.
    data %>%
      mutate(group2 = rleid(group)) %>%
      group_by(group, group2) %>%
      mutate(
        MEAN = mean(x1[group2 == 1 & group == "male"]),
        Q25 = quantile(x1[group2 == 1 & group == "male"], probs = 0.25)
      ) %>%
      ungroup() %>%
      mutate(
        x1 = ifelse(
          group2 == 3 & group == "male" & x1 > unique(Q25[!is.na(Q25)]),
          unique(MEAN[!is.na(MEAN)]),
          x1
        )
      ) %>%
      select(-group2) %>%
      data.frame()
    
    #     x1  group     MEAN  Q25
    # 1   88.00000   male 76.36364 66.5
    # 2   88.00000   male 76.36364 66.5
    # 3   94.00000   male 76.36364 66.5
    # 4   82.00000   male 76.36364 66.5
    # 5   68.00000   male 76.36364 66.5
    # 6   72.00000   male 76.36364 66.5
    # 7   43.00000   male 76.36364 66.5
    # 8   84.00000   male 76.36364 66.5
    # 9   65.00000   male 76.36364 66.5
    # 10  91.00000   male 76.36364 66.5
    # 11  65.00000   male 76.36364 66.5
    # 12  80.00000 female      NaN   NA
    # 13  82.00000 female      NaN   NA
    # 14  63.00000 female      NaN   NA
    # 15  67.00000 female      NaN   NA
    # 16  58.00000 female      NaN   NA
    # 17 100.00000 female      NaN   NA
    # 18  32.00000 female      NaN   NA
    # 19  76.36364   male      NaN   NA
    # 20  66.00000   male      NaN   NA
    # 21  30.00000   male      NaN   NA
    # 22  12.00000   male      NaN   NA
    # 23  76.36364   male      NaN   NA
    # 24  58.00000   male      NaN   NA
    # 25  14.00000   male      NaN   NA
    # 26  64.00000   male      NaN   NA
    

    我添加的额外代码( mutate )仅更新女性之后的男性的 x1 (即 `group2 == 3` 的行),并且仅当 `x1` 大于分位数值时才更新 .

  • 0

    在 data.table 中,您可以只修改 rleid(group) == 1 的行,即按 group 的连续取值划分后的第一组行 .

    library(data.table)
    # Convert `data` (the data frame from the question) to a data.table by
    # reference. The snippet previously used `df`, which is never defined
    # here; unless the reader created one, that name resolves to the
    # stats::df function and setDT() fails.
    setDT(data)

    # rleid(group) numbers consecutive runs of equal `group` values, so
    # `rleid(group) == 1` selects only the first run of rows. `:=` adds both
    # columns by reference; rows outside the selection are left as NA.
    data[rleid(group) == 1, `:=`(mean = mean(x1), Q25 = quantile(x1, 0.25))]
    

    结果

    #      x1  group     mean  Q25
    #  1:  88   male 76.36364 66.5
    #  2:  88   male 76.36364 66.5
    #  3:  94   male 76.36364 66.5
    #  4:  82   male 76.36364 66.5
    #  5:  68   male 76.36364 66.5
    #  6:  72   male 76.36364 66.5
    #  7:  43   male 76.36364 66.5
    #  8:  84   male 76.36364 66.5
    #  9:  65   male 76.36364 66.5
    # 10:  91   male 76.36364 66.5
    # 11:  65   male 76.36364 66.5
    # 12:  80 female       NA   NA
    # 13:  82 female       NA   NA
    # 14:  63 female       NA   NA
    # 15:  67 female       NA   NA
    # 16:  58 female       NA   NA
    # 17: 100 female       NA   NA
    # 18:  32 female       NA   NA
    # 19:  75   male       NA   NA
    # 20:  66   male       NA   NA
    # 21:  30   male       NA   NA
    # 22:  12   male       NA   NA
    # 23:  97   male       NA   NA
    # 24:  58   male       NA   NA
    # 25:  14   male       NA   NA
    # 26:  64   male       NA   NA
    #      x1  group     mean  Q25
    
  • 2

    这是另一种 dplyr 方法,它按 rleid(group) 分组汇总,并使用 left_join() 附加结果列:

    library(dplyr)
    # Group rows by run id, summarise only the first run, and join the
    # one-row summary back onto all rows. Rows from other runs find no match
    # and therefore get NA in `mean` and `q25`.
    result <- data %>%
      group_by(rleid = data.table::rleid(group)) %>%
      left_join(
        .,
        filter(., rleid == 1) %>%
          summarise(mean = mean(x1), q25 = quantile(x1, 0.25)),
        # Explicit key: avoids the "Joining, by" message and keeps the join
        # from silently picking up any other column names the inputs share.
        by = "rleid"
      ) %>%
      ungroup() %>%
      select(-rleid)
    result %>%
      print(n = Inf)   # make sure to print all rows
    

    # A tibble: 26 x 4
          x1 group   mean   q25
       <int> <fct>  <dbl> <dbl>
     1    88 male    76.4  66.5
     2    88 male    76.4  66.5
     3    94 male    76.4  66.5
     4    82 male    76.4  66.5
     5    68 male    76.4  66.5
     6    72 male    76.4  66.5
     7    43 male    76.4  66.5
     8    84 male    76.4  66.5
     9    65 male    76.4  66.5
    10    91 male    76.4  66.5
    11    65 male    76.4  66.5
    12    80 female  NA    NA
    13    82 female  NA    NA
    14    63 female  NA    NA
    15    67 female  NA    NA
    16    58 female  NA    NA
    17   100 female  NA    NA
    18    32 female  NA    NA
    19    75 male    NA    NA
    20    66 male    NA    NA
    21    30 male    NA    NA
    22    12 male    NA    NA
    23    97 male    NA    NA
    24    58 male    NA    NA
    25    14 male    NA    NA
    26    64 male    NA    NA

    请注意,除非将结果分配回 data ,否则不会修改 data .

  • 4

    这也是一种替代方法,它既可以回答 OP 的原始问题,也可以回答 OP 在评论中(两处)提出的其他问题 .

    对于这两个问题,我们都需要先计算第一组男性的聚合值,然后通过引用更新 data :第一个问题使用更新连接(update join),第二个问题使用非等值更新连接(non-equi update join).

    计算第一组男性的聚合

    library(data.table)
    # Convert to data.table by reference.
    setDT(data)
    # Run id of consecutive equal `group` values; used as join key later.
    data[, rleid := rleid(group)]
    # Store x1 as double so it has the same type as mean(x1).
    data[, x1 := as.double(x1)]
    # Mean and 25% quantile of the first run only. The columns are left
    # unnamed on purpose, so they come out as V1 and V2.
    agg <- data[rleid == 1, .(mean(x1), quantile(x1, .25)), by = rleid]
    agg
    

       rleid       V1   V2
    1:     1 76.36364 66.5

    原始问题:附加第一个男性组的统计数据

    这是通过更新连接实现的

    # Update join: rows of `data` whose rleid matches a row of `agg` receive
    # that row's V1 / V2 as new columns `mean` / `q25`, added by reference.
    # The i. prefix marks columns taken from `agg`.
    data[agg, on = .(rleid), `:=`(mean = i.V1, q25 = i.V2)]
    data[]
    

         x1  group rleid     mean  q25
     1:  88   male     1 76.36364 66.5
     2:  88   male     1 76.36364 66.5
     3:  94   male     1 76.36364 66.5
     4:  82   male     1 76.36364 66.5
     5:  68   male     1 76.36364 66.5
     6:  72   male     1 76.36364 66.5
     7:  43   male     1 76.36364 66.5
     8:  84   male     1 76.36364 66.5
     9:  65   male     1 76.36364 66.5
    10:  91   male     1 76.36364 66.5
    11:  65   male     1 76.36364 66.5
    12:  80 female     2       NA   NA
    13:  82 female     2       NA   NA
    14:  63 female     2       NA   NA
    15:  67 female     2       NA   NA
    16:  58 female     2       NA   NA
    17: 100 female     2       NA   NA
    18:  32 female     2       NA   NA
    19:  75   male     3       NA   NA
    20:  66   male     3       NA   NA
    21:  30   male     3       NA   NA
    22:  12   male     3       NA   NA
    23:  97   male     3       NA   NA
    24:  58   male     3       NA   NA
    25:  14   male     3       NA   NA
    26:  64   male     3       NA   NA
         x1  group rleid     mean  q25

    请注意, data 已通过引用更新,即无需复制 .

    其他问题:修改第二个男性组中的选定值

    OP 要求把第二个男性组中所有超过 q25 (由第一个男性组计算得到的 25% 分位数)的 x1 值,替换为第一个男性组的平均值 . 请注意,第二个男性组由 rleid == 3L 识别,因为女性组介于两个男性组之间 .

    这可以通过非等值(non-equi)更新连接来实现 . 连接条件只选择满足 rleid == 3L 且 x1 大于 q25 的行 .

    # Non-equi update join: the lookup table built from `agg` carries
    # rleid = 3 together with V1 and V2. Rows of `data` that match on rleid
    # and satisfy x1 > V2 have x1 overwritten with V1, by reference.
    # The trailing [] prints the updated table.
    data[agg[, .(rleid = 3, V1, V2)], on = .(rleid, x1 > V2), x1 := V1][]
    # remove helper column no longer needed
    data[, rleid := NULL]
    data[]
    

               x1  group     mean  q25
     1:  88.00000   male 76.36364 66.5
     2:  88.00000   male 76.36364 66.5
     3:  94.00000   male 76.36364 66.5
     4:  82.00000   male 76.36364 66.5
     5:  68.00000   male 76.36364 66.5
     6:  72.00000   male 76.36364 66.5
     7:  43.00000   male 76.36364 66.5
     8:  84.00000   male 76.36364 66.5
     9:  65.00000   male 76.36364 66.5
    10:  91.00000   male 76.36364 66.5
    11:  65.00000   male 76.36364 66.5
    12:  80.00000 female       NA   NA
    13:  82.00000 female       NA   NA
    14:  63.00000 female       NA   NA
    15:  67.00000 female       NA   NA
    16:  58.00000 female       NA   NA
    17: 100.00000 female       NA   NA
    18:  32.00000 female       NA   NA
    19:  76.36364   male       NA   NA
    20:  66.00000   male       NA   NA
    21:  30.00000   male       NA   NA
    22:  12.00000   male       NA   NA
    23:  76.36364   male       NA   NA
    24:  58.00000   male       NA   NA
    25:  14.00000   male       NA   NA
    26:  64.00000   male       NA   NA
               x1  group     mean  q25

    请注意第19行和第23行已按要求更新 . 同样, data 通过引用更新 .

相关问题