Fastest way to determine the most frequent factor in a grouped data frame in dplyr

While summarising a data frame in dplyr, I am trying to find, for several factor variables, the most common value within each group. I need an expression that does the following:

  • Find the most frequent factor level among all values of a variable within a group (so essentially max() applied to the factor level counts).

  • If several factor levels tie for most frequent, pick any one of them.

  • Return the factor level name (not the count).

Several expressions work. However, the ones I can come up with are slow, and the fast ones are awkward to apply to several variables of a data frame at once. I am wondering whether anyone knows a fast approach that integrates well with dplyr.
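
To make the requirements concrete, they can be wrapped in a small helper (a sketch only; the name stat_mode is mine, and it is essentially the table()-based logic used in the slow approaches below):

# name of the most frequent level of x; which.max() returns the
# first maximum, so ties are broken by taking the earliest level
stat_mode <- function(x) {
  counts <- table(x)
  names(counts)[which.max(counts)]
}

# hypothetical usage inside a dplyr pipeline:
# z %>% group_by(a) %>% summarise(c = stat_mode(b))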

I tried the following approaches:

Generating sample data (50,000 groups of 100 random letters each)

z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))

str(z)
'data.frame':   5000000 obs. of  2 variables:
$ a: int  1 2 3 4 5 6 7 8 9 10 ...
$ b: Factor w/ 26 levels "A","B","C","D",..: 6 4 14 12 3 19 17 19 15 20 ...

"Clean"-but-slow approach 1

y <- z %>% 
    group_by(a) %>% 
    summarise(c = names(table(b))[which.max(table(b))])

user    system  elapsed 
26.772  2.011   29.568

"Clean"-but-slow approach 2

y <- z %>% 
    group_by(a) %>% 
    summarise(c = names(which(table(b) == max(table(b)))[1]))

user    system  elapsed 
29.329  2.029   32.361

"Clean"-but-slow approach 3

y <- z %>% 
    group_by(a) %>% 
    summarise(c = names(sort(table(b),decreasing = TRUE)[1]))

user    system  elapsed 
35.086  6.905   42.485

"Messy"-but-fast approach

y <- z %>% 
     group_by(a,b) %>% 
     summarise(counter = n()) %>% 
     group_by(a) %>% 
     filter(counter == max(counter))
y <- y[!duplicated(y$a),]
y$counter <- NULL

user   system  elapsed 
7.061  0.330   7.664
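
A slightly tidier variant of the same fast idea, folding the de-duplication and the column drop into the pipe (a sketch; ties are again broken by keeping the first row per group):

y <- z %>% 
     group_by(a, b) %>% 
     summarise(counter = n()) %>%          # count each (a, b) combination
     group_by(a) %>% 
     filter(counter == max(counter)) %>%   # keep the (possibly tied) maxima
     slice(1) %>%                          # break ties: first row per group
     select(-counter)                      # return only the group and the level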

4 Answers

  • 8

    Here's another option with dplyr:

    set.seed(123)
    z <- data.frame(a = rep(1:50000,100), 
                    b = sample(LETTERS, 5000000, replace = TRUE), 
                    stringsAsFactors = FALSE)
    
    a <- z %>% group_by(a, b) %>% summarise(c=n()) %>% filter(row_number(desc(c))==1) %>% .$b 
    b <- z %>% group_by(a) %>% summarise(c=names(which(table(b) == max(table(b)))[1])) %>% .$c
    

    We make sure these approaches are equivalent:

    > identical(a, b)
    #[1] TRUE
    

    Update

    As @docendodiscimus mentioned, you could also do:

    count(z, a, b) %>% slice(which.max(n))
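
    If only the group and the level are needed (without the count), the n column can be dropped afterwards (a sketch):

    count(z, a, b) %>%          # counts per (a, b), result grouped by a
      slice(which.max(n)) %>%   # most frequent b within each group
      select(a, b)              # drop the count column, keep group and level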
    

    Here are the results of the benchmark:

    library(microbenchmark)
    mbm <- microbenchmark(
      steven = z %>% group_by(a, b) %>% summarise(c = n()) %>% filter(row_number(desc(c))==1),
      phil = z %>% group_by(a) %>% summarise(c = names(which(table(b) == max(table(b)))[1])),
      docendo = count(z, a, b) %>% slice(which.max(n)),
      times = 10
    )
    

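    A plot of these timings can be drawn from the mbm object via ggplot2's autoplot method for microbenchmark results (a sketch; output not shown):

    library(ggplot2)
    autoplot(mbm)   # per-expression timing distributions collected above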

    #Unit: seconds
    #    expr       min        lq      mean    median        uq       max neval cld
    #  steven  4.752168  4.789564  4.815986  4.813686  4.847964  4.875109    10  b 
    #    phil 15.356051 15.378914 15.467534 15.458844 15.533385 15.606690    10   c
    # docendo  4.586096  4.611401  4.669375  4.688420  4.702352  4.753583    10 a
    
  • 6

    Why dplyr?

    #dummy data
    set.seed(123)
    z <- data.frame(a = rep(1:50000,100),
                    b = sample(LETTERS, 5000000, replace = TRUE))
    
    #result
    names(sort(table(z$b),decreasing = TRUE)[1])
    # [1] "S"
    
    #time it
    system.time(
      names(sort(table(z$b),decreasing = TRUE)[1])
    )
    
    # user  system elapsed 
    # 0.36    0.00    0.36
    

    Edit: multiple columns

    #dummy data
    set.seed(123)
    z <- data.frame(a = rep(1:50000,100),
                    b = sample(LETTERS, 5000000, replace = TRUE),
                    c = sample(LETTERS, 5000000, replace = TRUE),
                    d = sample(LETTERS, 5000000, replace = TRUE))
    
    # check for multiple columns
    sapply(c("b","c","d"), function(i)
      names(sort(table(z[,i]),decreasing = TRUE)[1])
      )
    # b   c   d 
    #"S" "N" "G" 
    
    #time it
    system.time(
      sapply(c("b","c","d"), function(i)
        names(sort(table(z[,i]),decreasing = TRUE)[1]))
      )
    # user  system elapsed 
    # 0.61    0.17    0.78
    
  • 4

    data.table is still the fastest option:

    z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
    

    Benchmarking:

    library(data.table)
    library(dplyr)
    
    #dplyr
    system.time({
      y <- z %>% 
        group_by(a) %>% 
        summarise(c = names(which(table(b) == max(table(b)))[1]))  
    })
     user  system elapsed 
    14.52    0.01   14.70 
    
    #data.table
    system.time(
      setDT(z)[, .N, by=b][order(N),][.N,]
    )
     user  system elapsed 
     0.05    0.02    0.06 
    
    #@zx8754's way - base R
    system.time(
      names(sort(table(z$b),decreasing = TRUE)[1])
    )
       user  system elapsed 
       0.73    0.06    0.81
    

    As you can see, using data.table:

    setDT(z)[, .N, by=b][order(N),][.N,]
    

    or

    #just to get the name
      setDT(z)[, .N, by=b][order(N),][.N, b]
    

    seems to be the fastest.

    Update for all columns:

    Using @zx8754's data:

    set.seed(123)
    z2 <- data.frame(a = rep(1:50000,100),
                    b = sample(LETTERS, 5000000, replace = TRUE),
                    c = sample(LETTERS, 5000000, replace = TRUE),
                    d = sample(LETTERS, 5000000, replace = TRUE))
    

    you can do:

    #with data.table
    system.time(
     sapply(c('b','c','d'), function(x) {
      data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x] 
     }))
     user  system elapsed 
     0.34    0.00    0.34 
    
    #with base-R
    system.time(
      sapply(c("b","c","d"), function(i)
        names(sort(table(z2[,i]),decreasing = TRUE)[1]))
    )
     user  system elapsed 
     4.14    0.11    4.26
    

    Just to confirm that the results are the same:

    sapply(c('b','c','d'), function(x) {
      data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x] 
    })
    b c d 
    S N G 
    
    sapply(c("b","c","d"), function(i)
        names(sort(table(z2[,i]),decreasing = TRUE)[1]))
    b   c   d 
    "S" "N" "G"
    
  • 6

    Following LyzandeR's advice, I'll add this as a separate answer:

    require(data.table)
    setDT(z)[, .N, by=.(a,b)][order(-N), .(b=b[1L]), keyby=a]
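
    For reference, a sketch of how this grouped data.table result could be timed and inspected on the same sample data (data.table loaded as above; the object name res is mine, and no timings are shown):

    system.time(
      # count each (a, b) pair, sort by descending count,
      # then keep the first (i.e. most frequent) level b per group a
      res <- setDT(z)[, .N, by = .(a, b)][order(-N), .(b = b[1L]), keyby = a]
    )
    head(res)   # one row per group: the group id a and its most frequent level b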
    
