首页 文章

从另一个数据帧替换dataframe中的某个值

提问于
浏览
0

我有两个数据帧:

df1 <- data.frame(id = c("LABEL1", "LABEL2", "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(1:60,6,10))
df1[c(4:6), c(2:4)] = NA

df2 = data.frame(id = c( "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(seq(100,10000, length.out = 32),4,8))

我想使用键值= 'id'来查找DF2中DF1的缺失值 . 这是所需的输出:enter image description here

这是我尝试的方法:1 . 合并:但我得到X1:X3的重复列 . 匹配:

df1[,2]= df2[,2][match(df1$id, df2$id)]

但我会在DF1中获得标签3 . 3.从qdap包查找:

library(qdap)
apply(df1, 2, lookup, df2)

与方法2相同的结果 .

谢谢!

2 回答

  • 2

    您可以使用 tidyr 以整洁的数据形式工作,然后使用 dplyr 来组合表格

    library(dplyr)
    library(tidyr)
    

    In one way with pipe

    df1 %>% 
      mutate(id = as.character(id)) %>%
      gather(key = "col", value = "val", -id) %>%
      left_join(df2 %>% 
                    mutate(id = as.character(id)) %>%
                    gather(key = "col", value = "val", -id), 
                by =c("id", "col")) %>%
      transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x)) %>%
      spread(col, val) %>%
      select(id, num_range("X", 1:10))
    #>       id        X1       X2       X3 X4 X5 X6 X7 X8 X9 X10
    #> 1 LABEL1    1.0000    7.000   13.000 19 25 31 37 43 49  55
    #> 2 LABEL2    2.0000    8.000   14.000 20 26 32 38 44 50  56
    #> 3 LABEL3    3.0000    9.000   15.000 21 27 33 39 45 51  57
    #> 4 LABEL4  419.3548 1696.774 2974.194 22 28 34 40 46 52  58
    #> 5 LABEL5  738.7097 2016.129 3293.548 23 29 35 41 47 53  59
    #> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54  60
    

    Step by step for explanation

    # id as character instead of factor
    df1 <- df1 %>% mutate(id = as.character(id))
    # tidy data
    df1 <- df1 %>% gather(key = "col", value = "val", -id)
    # print result as dplyr tbl
    df1 %>% as.tbl()
    #> Source: local data frame [60 x 3]
    #> 
    #>        id   col   val
    #>     (chr) (chr) (int)
    #> 1  LABEL1    X1     1
    #> 2  LABEL2    X1     2
    #> 3  LABEL3    X1     3
    #> 4  LABEL4    X1    NA
    #> 5  LABEL5    X1    NA
    #> 6  LABEL6    X1    NA
    #> 7  LABEL1    X2     7
    #> 8  LABEL2    X2     8
    #> 9  LABEL3    X2     9
    #> 10 LABEL4    X2    NA
    #> ..    ...   ...   ...
    # idem on df2
    df2 <- df2 %>% 
      mutate(id = as.character(id)) %>%
      tidyr::gather(key = "col", value = "val", -id)
    # print result as dplyr tbl
    df2 %>% as.tbl()
    #> Source: local data frame [32 x 3]
    #> 
    #>        id   col       val
    #>     (chr) (chr)     (dbl)
    #> 1  LABEL3    X1  100.0000
    #> 2  LABEL4    X1  419.3548
    #> 3  LABEL5    X1  738.7097
    #> 4  LABEL6    X1 1058.0645
    #> 5  LABEL3    X2 1377.4194
    #> 6  LABEL4    X2 1696.7742
    #> 7  LABEL5    X2 2016.1290
    #> 8  LABEL6    X2 2335.4839
    #> 9  LABEL3    X3 2654.8387
    #> 10 LABEL4    X3 2974.1935
    #> ..    ...   ...       ...
    
    # join only id and col level of df1 with df2
    new.df <- left_join(df1, df2, by = c("id", "col"))
    # print result as dplyr tbl
    new.df %>% as.tbl()
    #> Source: local data frame [60 x 4]
    #> 
    #>        id   col val.x     val.y
    #>     (chr) (chr) (int)     (dbl)
    #> 1  LABEL1    X1     1        NA
    #> 2  LABEL2    X1     2        NA
    #> 3  LABEL3    X1     3  100.0000
    #> 4  LABEL4    X1    NA  419.3548
    #> 5  LABEL5    X1    NA  738.7097
    #> 6  LABEL6    X1    NA 1058.0645
    #> 7  LABEL1    X2     7        NA
    #> 8  LABEL2    X2     8        NA
    #> 9  LABEL3    X2     9 1377.4194
    #> 10 LABEL4    X2    NA 1696.7742
    #> ..    ...   ...   ...       ...
    
    #replace NA in col val.x from df1 by value val.y of df2
    # and only keep id, col and new column val
    new.df <- new.df %>% transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x)) 
    new.df %>% as.tbl()
    #> Source: local data frame [60 x 3]
    #> 
    #>        id   col       val
    #>     (chr) (chr)     (dbl)
    #> 1  LABEL1    X1    1.0000
    #> 2  LABEL2    X1    2.0000
    #> 3  LABEL3    X1    3.0000
    #> 4  LABEL4    X1  419.3548
    #> 5  LABEL5    X1  738.7097
    #> 6  LABEL6    X1 1058.0645
    #> 7  LABEL1    X2    7.0000
    #> 8  LABEL2    X2    8.0000
    #> 9  LABEL3    X2    9.0000
    #> 10 LABEL4    X2 1696.7742
    #> ..    ...   ...       ...
    
    # put back data in wide format
    new.df %>% 
      spread(col, val) %>%
      select(id, num_range("X", 1:10)) # put column in same order as df1
    #>       id        X1       X2       X3 X4 X5 X6 X7 X8 X9 X10
    #> 1 LABEL1    1.0000    7.000   13.000 19 25 31 37 43 49  55
    #> 2 LABEL2    2.0000    8.000   14.000 20 26 32 38 44 50  56
    #> 3 LABEL3    3.0000    9.000   15.000 21 27 33 39 45 51  57
    #> 4 LABEL4  419.3548 1696.774 2974.194 22 28 34 40 46 52  58
    #> 5 LABEL5  738.7097 2016.129 3293.548 23 29 35 41 47 53  59
    #> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54  60
    
  • 0

    可能有人有比我更好的方法,但这应该有效 . 这确实假设列的顺序是相同的,所以要小心 .

    row.matches = match(df1$id, df2$id)
    nas = which(is.na(df1), arr.ind = TRUE)
    replacements = nas
    replacements[ ,1] = row.matches[nas[ ,1]]
    df1[nas] = df2[replacements]
    

    基本上,我所做的就是找到匹配的行和 df1 中出现NA的索引 . 使用匹配的行向量替换那些NA索引的行索引,并将 df1 中的这些值替换为 df2 中的相应值 .

相关问题