首页 文章

在通过将R中的var与NA对照分组来分离某些观察之前选择组

提问于
浏览
1

我的样本 .

data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
"y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
"x1", "group"), class = "data.frame", row.names = c(NA, -52L))

在这个数据中有组变量(性别(男性和女性)我需要获得统计学平均值和25%的所有男性在女性之前 . 男性在女性之后,我不会触摸 . 这是分析 xy 分析来自添加栏 . 如果对于男性来说,女性 Value 超过25%,这是我们在女性之前为男性计算的,那么这个值必须用男性的平均值代替女性“女性类别我们不接触” .

AntoniosK的解决方案非常好

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                           # for each add do the below...
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                            # for each add update x1 values....
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1)) %>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

但现在我想将x1替换为0到Na .

data$x1[data$x1 == 0] <- NA

在它之后,当我取消脚本时,我得到 error

mutate_impl(.data,dots)中的错误:评估错误:缺少值,如果'na.rm'为FALSE则不允许NaN .

该怎么做,该脚本通过NA并仅使用int值?

编辑

data=structure(list(add = c(11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L), x1 = c(NA, 
                                                                                       2L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, 1L, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 1L, 1L, NA, NA, 
                                                                                       NA, NA, NA), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female", 
                                                                                                                                                                                        "male"), class = "factor")), .Names = c("add", "x1", "group"), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                           -52L))

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
         x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
  ungroup() %>%
  select(-group2) %>%
  data.frame()

Edit2

代码的结果

add x1  group   MEAN    Q25
x   14.00000    male    23.72727    5.0
x   15.00000    male    23.72727    5.0
x   36.00000    male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   53.00000    male    23.72727    5.0
x   10.00000    male    23.72727    5.0
x   39.00000    male    23.72727    5.0
x   27.00000    male    23.72727    5.0
x   67.00000    male    23.72727    5.0
x   25.00000    female  NaN NA
x   19.00000    female  NaN NA
x   49.00000    female  NaN NA
x   53.00000    female  NaN NA
x   64.00000    female  NaN NA
x   61.00000    female  NaN NA
x   12.00000    female  NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA

add x1     group
x   94.90   male

女性的前4名男性总和= 94.90

1 回答

  • 1

    我添加了一段代码来解决您的问题并简要解释错误 .

    Updated code

    data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
    "y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
    10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
    88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
    23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
    58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
    "x1", "group"), class = "data.frame", row.names = c(NA, -52L))
    
    library(tidyverse)
    library(data.table)
    
    data %>%  
      group_by(add) %>%                                          
      mutate(group2 = rleid(group)) %>% 
      group_by(add, group, group2) %>%
      mutate(MEAN = mean(x1[group=="male" & group2==1]),               
             Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
      group_by(add) %>%                                           
      mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
             x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
      ungroup() %>%
      select(-group2) %>%
      data.frame()
    

    Error explanation

    您必须运行代码的前一部分,最后只需更新 x1 列 . 您收到该错误,因为 NA 值会破坏您需要执行的 meanquantile 计算 .

    另一种方法是在开头更新 x1 然后使用 na.rm=T 进行计算 .

    对于你提到的 new case ,你从 x1NA 值开始,试试这个:

    data %>%  
      group_by(add) %>%                                          
      mutate(group2 = rleid(group)) %>% 
      group_by(add, group, group2) %>%
      mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T),      ## extra code here ##    
             Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>%  ## extra code here ##
      group_by(add) %>%                                           
      mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
      ungroup() %>%
      select(-group2) %>%
      data.frame()
    

    对于您提到的 new case (编辑2),首先将前一代码的输出保存为 data2

    data2 = data %>% ...
    

    然后运行这个:

    data2 %>%
      group_by(add) %>%                           # for each add value                      
      mutate(group2 = rleid(group)) %>%           # created group2
      filter(group=="male" & group2==3) %>%       # keep only male after female
      summarise(SUM = sum(x1[row_number() <= 4])) # get sum of x1 for first 4 rows
    
    # # A tibble: 2 x 2
    #   add     SUM
    #   <fct> <dbl>
    # 1 x      94.9
    # 2 y     107.
    

相关问题