首页 文章

data.table内部/外部连接,在double类型的连接列中使用NA?

提问于
浏览
10

在这篇维基百科文章SQL join之后,我希望能够清楚地看到我们如何与data.table Build 联接 . 在这个过程中,我们可能在加入NAs时发现了一个错误 . 以wiki为例:

R) X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),depID=c(31,33,33,34,34,NA),key="depID")
R) Y = data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID")
R) X
   name depID
1:  Joh    NA
2:  Raf    31
3:  Jon    33
4:  Ste    33
5:  Rob    34
6:  Smi    34
R) Y
   depID depName
1:    31     Sal
2:    33     Eng
3:    34     Cle
4:    35     Mar

LEFT OUTER JOIN

R) merge.data.frame(X,Y,all.x=TRUE)
  depID name depName
1    31  Raf     Sal
2    33  Jon     Eng
3    33  Ste     Eng
4    34  Rob     Cle
5    34  Smi     Cle
6    NA  Joh    <NA>

merge.data.table 不输出相同的结果,并显示我认为是lign 2上的错误 .

R) merge(X,Y,all.x=TRUE)
   depID name depName
1:    NA  Joh     Eng
2:    31  Raf      NA
3:    33  Jon     Eng
4:    33  Ste     Eng
5:    34  Rob     Cle
6:    34  Smi     Cle
R) Y[X] #same -> :(
   depID depName name
1:    NA     Eng  Joh
2:    31      NA  Raf
3:    33     Eng  Jon
4:    33     Eng  Ste
5:    34     Cle  Rob
6:    34     Cle  Smi

RIGHT OUTER JOIN 看起来一样

R) merge.data.frame(X,Y,all.y=TRUE)
  depID name depName
1    31  Raf     Sal
2    33  Jon     Eng
3    33  Ste     Eng
4    34  Rob     Cle
5    34  Smi     Cle
6    35 <NA>     Mar

R) merge(X,Y,all.y=TRUE)
   depID name depName
1:    NA  Joh     Eng
2:    31   NA     Sal
3:    33  Jon     Eng
4:    33  Ste     Eng
5:    34  Rob     Cle 
6:    34  Smi     Cle
7:    35   NA     Mar

INNER (NATURAL) JOIN

R) merge.data.frame(X,Y)
  depID name depName
1    31  Raf     Sal
2    33  Jon     Eng
3    33  Ste     Eng
4    34  Rob     Cle
5    34  Smi     Cle
R) merge(X,Y)
   depID name depName
1:    NA  Joh     Eng
2:    33  Jon     Eng
3:    33  Ste     Eng
4:    34  Rob     Cle
5:    34  Smi     Cle

3 回答

  • 1

    是的它看起来像是一个(令人尴尬的)与关键字NA相关的新bug . 在关键不可能的情况下,还有其他关于NA的讨论,但我没有意识到它会以这种方式搞砸 . 将调查 . 谢谢 ...

    #2453 NA in double key column messes up joins (NA in integer and character ok)

    现在修复于1.8.7(提交780),来自NEWS:

    double类型的连接列中的NA可能导致X [Y]和合并(X,Y)返回不正确的结果,#2453 . 由于C源中的错误x == NA_REAL应该是ISNA(x) . 对键控连接的双重支持是对data.table的一个相对较新的补充,但同样令人尴尬 . 固定和测试添加 . 非常感谢有关彻底和可重复报告的统计数据 .

  • 8

    跟进其他答案中的评论,是的,这里证明它只影响 double 类型的列( integercharacter 列中的NA都可以) .

    X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),
                   depID=as.integer(c(31,33,33,34,34,NA)),key="depID")
    Y = data.table(depID=as.integer(c(31,33,34,35)),
                   depName=c("Sal","Eng","Cle","Mar"),key="depID")
    Y[X]
       depID depName name
    1:    NA      NA  Joh
    2:    31     Sal  Raf
    3:    33     Eng  Jon
    4:    33     Eng  Ste
    5:    34     Cle  Rob
    6:    34     Cle  Smi
    
    merge.data.frame(X,Y,all.x=T)
      depID name depName
    1    31  Raf     Sal
    2    33  Jon     Eng
    3    33  Ste     Eng
    4    34  Rob     Cle
    5    34  Smi     Cle
    6    NA  Joh    <NA>
    
    Y = data.table(depID=as.character(c(31,33,34,35)),
                   depName=c("Sal","Eng","Cle","Mar"),key="depID")
    X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),
                   depID=as.character(c(31,33,33,34,34,NA)),key="depID")
    X
       name depID
    1:  Raf    31
    2:  Jon    33
    3:  Ste    33
    4:  Rob    34
    5:  Smi    34
    6:  Joh    NA
    Y
       depID depName
    1:    31     Sal
    2:    33     Eng
    3:    34     Cle
    4:    35     Mar
    str(X)
    Classes ‘data.table’ and 'data.frame':  6 obs. of  2 variables:
     $ name : chr  "Raf" "Jon" "Ste" "Rob" ...
     $ depID: chr  "31" "33" "33" "34" ...
     - attr(*, "sorted")= chr "depID"
     - attr(*, ".internal.selfref")=<externalptr> 
    
    merge.data.frame(X,Y,all.x=T)
      depID name depName
    1    31  Raf     Sal
    2    33  Jon     Eng
    3    33  Ste     Eng
    4    34  Rob     Cle
    5    34  Smi     Cle
    6  <NA>  Joh    <NA>
    
    Y[X]
       depID depName name
    1:    31     Sal  Raf
    2:    33     Eng  Jon
    3:    33     Eng  Ste
    4:    34     Cle  Rob
    5:    34     Cle  Smi
    6:    NA      NA  Joh
    

    问题在第1.8节中由MATTHEW DOWLE修复

  • 2

    一些有用的信息:

    library(data.table);
    
    X <- data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),depID=c(31,33,33,34,34,NA),key="depID")
    #R) X
       #name depID
    #1:  Joh    NA
    #2:  Raf    31
    #3:  Jon    33
    #4:  Ste    33
    #5:  Rob    34
    #6:  Smi    34
    
    Y <- data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID")
    #R) Y
       #depID depName
    #1:    31     Sal
    #2:    33     Eng
    #3:    34     Cle
    #4:    35     Mar
    
    #################
    #LEFT OUTER JOIN#
    #################
    LJ <- merge.data.frame(X,Y,by="depID",all.x=TRUE); #by is implicit (see ?merge.data.frame)
    #R) LJ
      #depID name depName
    #1    31  Raf     Sal
    #2    33  Jon     Eng
    #3    33  Ste     Eng
    #4    34  Rob     Cle
    #5    34  Smi     Cle
    #6    NA  Joh    <NA>
    
    LJ2 <- Y[X];
    #R) LJ2
       #depID depName name
    #1:    NA      NA  Joh
    #2:    31     Sal  Raf
    #3:    33     Eng  Jon
    #4:    33     Eng  Ste
    #5:    34     Cle  Rob
    #6:    34     Cle  Smi
    
    ##################
    #RIGHT OUTER JOIN#
    ##################
    RJ <- merge.data.frame(X,Y,by="depID",all.y=TRUE); #by is implicit (see ?merge.data.frame)
    #R) RJ 
      #depID name depName
    #1    31  Raf     Sal
    #2    33  Jon     Eng
    #3    33  Ste     Eng
    #4    34  Rob     Cle
    #5    34  Smi     Cle
    #6    35 <NA>     Mar
    
    RJ2 <- X[Y];
    #R) RJ2
       #depID name depName
    #1:    31  Raf     Sal
    #2:    33  Jon     Eng
    #3:    33  Ste     Eng
    #4:    34  Rob     Cle
    #5:    34  Smi     Cle
    #6:    35   NA     Mar
    
    #################
    #FULL OUTER JOIN#
    #################
    FJ <- merge.data.frame(X,Y,all=T)
    #R) FJ
      #depID name depName
    #1    31  Raf     Sal
    #2    33  Jon     Eng
    #3    33  Ste     Eng
    #4    34  Rob     Cle
    #5    34  Smi     Cle
    #6    35 <NA>     Mar
    #7    NA  Joh    <NA>
    
    FJ2 <- merge(X,Y,all=T)
    #R) FJ2
       #depID name depName
    #1:    NA  Joh      NA
    #2:    31  Raf     Sal
    #3:    33  Jon     Eng
    #4:    33  Ste     Eng
    #5:    34  Rob     Cle
    #6:    34  Smi     Cle
    #7:    35   NA     Mar
    
    ####################
    #NATURAL INNER JOIN#
    ####################
    IJ <- merge.data.frame(X,Y)
    #R) IJ
      #depID name depName
    #1    31  Raf     Sal
    #2    33  Jon     Eng
    #3    33  Ste     Eng
    #4    34  Rob     Cle
    #5    34  Smi     Cle
    
    IJ2 <- merge(X,Y)
    #R) IJ2
       #depID name depName
    #1:    31  Raf     Sal
    #2:    33  Jon     Eng
    #3:    33  Ste     Eng
    #4:    34  Rob     Cle
    #5:    34  Smi     Cle
    
    
    A <- data.table(time=as.POSIXct(c("10:01:01","10:01:02","10:01:04","10:01:05","10:01:02","10:01:01","10:01:01"),format="%H:%M:%S"),
                    b=c("a","a","a","a","b","c","c"), 
                    d=c(1,1.9,2,1.8,5,4.1,4.2));
    B <- data.table(time=as.POSIXct(c("10:01:01","10:01:03","10:01:00","10:01:01"),format="%H:%M:%S"),b=c("a","a","c","d"), e=c(1L,2L,3L,4L));
    setkey(A,b,time)
    setkey(B,b,time)
    
    
    ###########
    #ASOF JOIN#
    ###########
    AOJ <- B[A,roll=T]
    #R) AOJ
       #b                time  e   d
    #1: a 2013-01-11 10:01:01  1 1.0
    #2: a 2013-01-11 10:01:02  1 1.9
    #3: a 2013-01-11 10:01:04  2 2.0
    #4: a 2013-01-11 10:01:05  2 1.8
    #5: b 2013-01-11 10:01:02 NA 5.0
    #6: c 2013-01-11 10:01:01  3 4.1
    #7: c 2013-01-11 10:01:01  3 4.2
    

相关问题