首页 文章

逐列减去数据帧的行,保留多因子列

提问于
浏览
2

我有两个列数不同的数据帧 . 我想逐列(即逐个样本)从 df1 各行的强度值中减去 df2 对应行的强度值 . 我的条件是:

  • 在 df1 中,每个基因(gene_nm)对应多行:每行是一个肽序列(pep_seq)及其在各个样品中的强度(int_samp) . 也就是说,同一基因会出现多次,占据若干行 .

  • 在df2中,基因(行)仅出现一次,具有相应的强度值

  • 因此,df1比df2长得多(例如,55000行对6000行)

  • 强度列数(int_samp)可以很多 . 我在这个例子中有3个

Dataframe 1

# Peptide-level example table: one row per peptide, several rows per gene.
# Intensities are deliberately kept as strings, with missing values spelled
# as the literal text "NA", exactly as in the posted example.
pep_seq <- c(
  "aaaaaaaaa", "ababababba", "dfsfsfsfds", "xbbcbcncncc",
  "fbbdsgffhhh", "dggdgdgegeggerr", "dfgthrgfgf", "wegregegg",
  "egegegergewge", "sfngegebser", "qegqeefbew", "qegqetegqt",
  "qwtqtewr", "etghsfrgf", "sfsdfbdfbergeagaegr", "wasfqertsdfaefwe"
)
int_samp_1 <- c(
  "2421432", "24242424", "NA", "4684757849",
  "NA", "10485040", "NA", "6849400",
  "40300", "NA", "NA", "NA",
  "556456466", "4646456466", "246464266", "4564242646"
)
int_samp_2 <- c(
  "NA", "5342353", "14532556", "43566",
  "46367367", "768769769", "797899", "NA",
  "NA", "NA", "686899", "7898979",
  "678568", "NA", "68886", "488"
)
int_samp_3 <- c(
  "11351", "NA", "NA", "NA",
  "1354151345", "1351351354", "314534", "1535",
  "3145354", "4353455", "324535", "3543445",
  "34535", "34535534", "NA", "NA"
)
# Gene label per peptide row: 6 x "A", 3 x "B", 7 x "C".
gene_nm <- rep(c("A", "B", "C"), times = c(6, 3, 7))
df_1 <- cbind.data.frame(pep_seq, int_samp_1, int_samp_2, int_samp_3, gene_nm)

Dataframe 2

# Gene-level example table: exactly one row per gene, with the same
# string-encoded intensities (missing values are the literal text "NA").
int_samp_1a <- c("2421432", "24242424", "NA")
int_samp_2a <- c("NA", "5342353", "14532556")
int_samp_3a <- c("11351", "NA", "NA")
gene_nm.a <- LETTERS[1:3]
df_2 <- cbind.data.frame(gene_nm.a, int_samp_1a, int_samp_2a, int_samp_3a)

请建议 .

2 回答

  • 2

    如果我理解正确(IIUC), df_1 和 df_2 中有相同名称的列(例如 int_samp_X ,其中 X 为某个整数),并且您希望按 gene_nm 分组,对名称匹配的列求差(例如 df_1[df_1$gene_nm == 'A', int_samp_1] - df_2[df_2$gene_nm == 'A', int_samp_1] ) .

    我们可以使用 tidyverse 系列包来解决这个问题,特别是 dplyr 和 purrr .

    首先,用 left_join 将 df_1 与 df_2 合并,以确保 df_1 中同一基因的多个条目在与 df_2 中的基因级条目匹配时全部保留:

    library(tidyverse)
    
    # df_2 as posted names its columns "gene_nm.a", "int_samp_1a", ...; strip the
    # trailing "a" / ".a" so they line up with df_1. Without this,
    # left_join(by = "gene_nm") fails because df_2 has no "gene_nm" column.
    # If df_2 already uses the df_1 names, this rename is a no-op.
    df_2_renamed <- setNames(df_2, sub("\\.?a$", "", names(df_2)))
    
    # The left join keeps every peptide row of df_1 and attaches the single
    # gene-level row of df_2. Clashing int_samp_* names get the suffixes
    # .x (from df_1) and .y (from df_2).
    df_3 <- df_1 %>% left_join(df_2_renamed, by = "gene_nm")
    
    df_3
                   pep_seq int_samp_1.x int_samp_2.x int_samp_3.x gene_nm int_samp_1.y int_samp_2.y int_samp_3.y
    1            aaaaaaaaa      2421432           NA        11351       A      2421432           NA        11351
    2           ababababba     24242424      5342353           NA       A      2421432           NA        11351
    3           dfsfsfsfds           NA     14532556           NA       A      2421432           NA        11351
    4          xbbcbcncncc   4684757849        43566           NA       A      2421432           NA        11351
    5          fbbdsgffhhh           NA     46367367   1354151345       A      2421432           NA        11351
    6      dggdgdgegeggerr     10485040    768769769   1351351354       A      2421432           NA        11351
    7           dfgthrgfgf           NA       797899       314534       B     24242424      5342353           NA
    8            wegregegg      6849400           NA         1535       B     24242424      5342353           NA
    9        egegegergewge        40300           NA      3145354       B     24242424      5342353           NA
    10         sfngegebser           NA           NA      4353455       C           NA     14532556           NA
    11          qegqeefbew           NA       686899       324535       C           NA     14532556           NA
    12          qegqetegqt           NA      7898979      3543445       C           NA     14532556           NA
    13            qwtqtewr    556456466       678568        34535       C           NA     14532556           NA
    14           etghsfrgf   4646456466           NA     34535534       C           NA     14532556           NA
    15 sfsdfbdfbergeagaegr    246464266        68886           NA       C           NA     14532556           NA
    16    wasfqertsdfaefwe   4564242646          488           NA       C           NA     14532556           NA
    

    然后在感兴趣的列名称上 map ,从每个列对中获取差异 . (请注意,您需要先将 int_samp 列从 factor 转换为 numeric . )

    Update (根据OP评论):要在计算差异之前将 NA 转换为 0 ,我们可以使用 mutate_if() 和 replace() ,将以下内容添加到方法链中:

    # Pipeline step: in every numeric column, replace NA entries with 0.
    mutate_if(is.numeric,  funs(replace(., is.na(.), 0)))
    

    最后, join 回到 df_1

    # Names of the intensity columns as they appear in df_1.
    var_names <- df_1 %>% select(starts_with("int_samp")) %>% names()
    
    var_names # [1] "int_samp_1" "int_samp_2" "int_samp_3"
    
    # For each intensity column, compute (df_1 value) - (df_2 value) from the
    # joined table df_3, then bind the "<name>_diff" columns onto df_1.
    var_names %>%
      map_dfc(function(nm) {
        # Anchored pattern: selects exactly "<nm>.x" (df_1) and "<nm>.y" (df_2).
        # An unanchored matches(nm) would make "int_samp_1" also pick up
        # "int_samp_10", "int_samp_11", ... once there are ten or more samples.
        pair <- paste0("^", nm, "\\.[xy]$")
        df_3 %>%
          # Values are factors/strings; go through character to get the numbers.
          mutate_at(vars(matches(pair)), funs(as.numeric(as.character(.)))) %>%
          # Treat missing intensities as 0 on both sides of the subtraction.
          mutate_if(is.numeric, funs(replace(., is.na(.), 0))) %>%
          select(matches(pair)) %>%
          # Columns come out in table order (.x, then .y), so this is .x - .y.
          reduce(`-`)
      }) %>%
      set_names(paste0(var_names, "_diff")) %>%
      bind_cols(df_1)
    

    输出:

    int_samp_1_diff int_samp_2_diff int_samp_3_diff pep_seq             int_samp_1 int_samp_2 int_samp_3 gene_nm
                 <dbl>           <dbl>           <dbl> <fct>               <fct>      <fct>      <fct>      <fct>  
     1              0.              0.              0. aaaaaaaaa           2421432    NA         11351      A      
     2       21820992.        5342353.         -11351. ababababba          24242424   5342353    NA         A      
     3       -2421432.       14532556.         -11351. dfsfsfsfds          NA         14532556   NA         A      
     4     4682336417.          43566.         -11351. xbbcbcncncc         4684757849 43566      NA         A      
     5       -2421432.       46367367.     1354139994. fbbdsgffhhh         NA         46367367   1354151345 A      
     6        8063608.      768769769.     1351340003. dggdgdgegeggerr     10485040   768769769  1351351354 A      
     7      -24242424.       -4544454.         314534. dfgthrgfgf          NA         797899     314534     B      
     8      -17393024.       -5342353.           1535. wegregegg           6849400    NA         1535       B      
     9      -24202124.       -5342353.        3145354. egegegergewge       40300      NA         3145354    B      
    10              0.      -14532556.        4353455. sfngegebser         NA         NA         4353455    C      
    11              0.      -13845657.         324535. qegqeefbew          NA         686899     324535     C      
    12              0.       -6633577.        3543445. qegqetegqt          NA         7898979    3543445    C      
    13      556456466.      -13853988.          34535. qwtqtewr            556456466  678568     34535      C      
    14     4646456466.      -14532556.       34535534. etghsfrgf           4646456466 NA         34535534   C      
    15      246464266.      -14463670.              0. sfsdfbdfbergeagaegr 246464266  68886      NA         C      
    16     4564242646.      -14532068.              0. wasfqertsdfaefwe    4564242646 488        NA         C
    

    注意:这个答案主要来自akrun的答案here .

  • 2

    一个选项是使用 dplyr 连接 df_1 和 df_2 ,然后执行简单的矩阵减法 .

    Note: 数据框中的强度读数是因子(factor)类型 . 既然要做减法,我认为把测量值保留为因子并不合适,因此我先把它们转换为数值类型 .

    library(dplyr)
    
    # NA values coming from df_2 are replaced by 0: keeping them would turn the
    # matching df_1 values into NA for no reason. NA values in df_1 are left as
    # NA, so those differences stay NA.
    mod <- df_1 %>% left_join(df_2, by= c("gene_nm" = "gene_nm.a")) %>% # join on gene
      # Convert to double, not integer: intensities such as 4684757849 exceed
      # .Machine$integer.max (2147483647), and as.integer() would silently turn
      # them into NA.
      mutate_at(vars(starts_with("int_samp")), funs(as.numeric(as.character(.)))) %>%
      # Only the df_2 columns end in "a" (int_samp_1a, ...); zero-fill their NAs.
      mutate_at(vars(ends_with("a")), funs(ifelse(is.na(.), 0, .)))
    
    # The joined data.frame holds columns from both df_1 (int_samp_1, ...) and
    # df_2 (int_samp_1a, ...). Both selections keep table order, so the i-th
    # df_1 column is paired with the i-th df_2 column.
    own_cols  <- grepl("^int_samp_\\d+$", names(mod))
    gene_cols <- grepl("^int_samp_\\d+[a-z]+$", names(mod))
    mod[, own_cols] <- mod[, own_cols] - mod[, gene_cols]
    
    # Take columns from df_1. 
    mod[names(df_1)]
    #                pep_seq int_samp_1 int_samp_2 int_samp_3 gene_nm
    # 1            aaaaaaaaa          0         NA          0       A
    # 2           ababababba   21820992    5342353         NA       A
    # 3           dfsfsfsfds         NA   14532556         NA       A
    # 4          xbbcbcncncc 4682336417      43566         NA       A
    # 5          fbbdsgffhhh         NA   46367367 1354139994       A
    # 6      dggdgdgegeggerr    8063608  768769769 1351340003       A
    # 7           dfgthrgfgf         NA   -4544454     314534       B
    # 8            wegregegg  -17393024         NA       1535       B
    # 9        egegegergewge  -24202124         NA    3145354       B
    # 10         sfngegebser         NA         NA    4353455       C
    # 11          qegqeefbew         NA  -13845657     324535       C
    # 12          qegqetegqt         NA   -6633577    3543445       C
    # 13            qwtqtewr  556456466  -13853988      34535       C
    # 14           etghsfrgf 4646456466         NA   34535534       C
    # 15 sfsdfbdfbergeagaegr  246464266  -14463670         NA       C
    # 16    wasfqertsdfaefwe 4564242646  -14532068         NA       C
    

相关问题