首页 文章

计算组内的实例(子集)

提问于
浏览
1

我为我的数据做了一个小例子:

# Toy data: a single month (mth = 1), two days (10 and 11), and five hourly
# readings (hr 3..7) per day, each with a value v.
mth <- rep(1, 10)
day <- rep(c(10, 11), each = 5)
hr <- rep(c(3, 4, 5, 6, 7), times = 2)
v <- rep(c(3, 4, 5, 4, 3), times = 2)
A <- data.frame(mth, day, hr, v)

我需要统计每天有多少个值小于 4。我尝试过使用 rle 函数,但没有得到想要的结果。输出应如下所示:

# Expected result: one row per (mth, day) holding the count of values below 4.
mth <- rep(1, 2)
day <- c(10, 11)
v <- c(2, 2)  # each 2 counts the values below 4 (the two 3s) on day 10 and on day 11
A <- data.frame(mth, day, v)

谢谢您的帮助!

4 回答

  • 1

    使用 plyr 包的 ddply 很容易实现:

    library(plyr)
    # For every (mth, day) group, count the rows whose v is below 4.
    ddply(A, .(mth, day), function(grp) sum(grp[["v"]] < 4))
    #    mth day V1
    # 1   1  10  2
    # 2   1  11  2
    

    或者你可以使用 summarize

    # Same per-group count, with summarize naming the result column less4.
    ddply(
      .data = A,
      .variables = .(mth, day),
      .fun = summarize,
      less4 = sum(v < 4)
    )
    #   mth day less4
    # 1   1  10  2
    # 2   1  11  2
    
  • 2

    这是使用 tapply 的 base R 解决方案:

    > # Sum the logical vector v < 4 within each "mth_day" key.
    > with(A, tapply(v < 4, paste(mth, day, sep = "_"), sum))
    1_10 1_11 
       2    2
    

    (它会比 plyr 方案快得多,但很可能仍比 data.table 方法慢,只能排在第二位。)

  • 2

    一个 data.table 解决方案

    library(data.table)
    # Convert A to a data.table, then count the values below 4 per (mth, day).
    A <- as.data.table(A)
    A[, sum(v < 4), by = .(mth, day)]
    
    ##    mth day V1
    ## 1:   1  10 2
    ## 2:   1  11 2
    
    # or 
    
    # Keep only the rows with v < 4, then count the remaining rows per group.
    # Note: a (mth, day) group with no value below 4 is absent from this result
    # rather than appearing with a count of 0.
    A[v < 4, .N, by = .(mth, day)]
    
    ##    mth day N
    ## 1:   1  10 2
    ## 2:   1  11 2
    

    基准测试

    # Build a mock data set: 365 consecutive days starting 2000-01-01, with 24
    # hourly rows per day and a random integer value v in 1..10 for every row.
    
    library(rbenchmark)
    library(plyr)   # ddply(), used by ddply1() and ddply2()
    library(sqldf)  # sqldf(), used by sqldf_()
    daily <- seq(as.Date("2000/1/1"), by = "day", length.out = 365)
    
    A <- data.table(mth = month(daily), day = mday(daily))
    A <- A[, list(hr = 1:24), by = list(mth, day)]
    A[, v := sample(1:10, .N, replace = TRUE)]
    
    # Wrap each candidate in a zero-argument function so benchmark() can call it.
    # Every candidate counts the values below 4 per (mth, day) group.
    ddply1 <- function() ddply(A, .(mth, day), function(x) sum(x$v < 4))
    ddply2 <- function() ddply(A, .(mth, day), summarize, less4 = sum(v < 4))
    base_tapply <- function() {
      with(A, tapply(v, paste(mth, day, sep = "_"), function(x) sum(x < 4)))
    }
    dt1 <- function() A[, sum(v < 4), by = list(mth, day)]
    # dt2 filters first, so a group without any v < 4 is dropped instead of
    # being reported with a count of 0.
    dt2 <- function() A[v < 4, .N, by = list(mth, day)]
    # Group by mth AND day: grouping by day alone would merge the same day
    # number across all months, so sqldf_ would solve a different (smaller)
    # problem than the other candidates.
    # NOTE(review): any previously recorded sqldf_() timing presumably predates
    # this GROUP BY fix; re-run the benchmark to refresh it.
    sqldf_ <- function() {
      sqldf("SELECT A.mth, A.day, sum(A.v < 4) AS sum FROM A GROUP BY A.mth, A.day")
    }
    
    benchmark(ddply1(), ddply2(), base_tapply(), dt1(), dt2(), sqldf_(),
              replications = 5,
              columns = c("test", "replications", "elapsed", "relative", "user.self"))
    
    ##            test replications elapsed relative user.self
    ## 3 base_tapply()            5    0.08        8      0.08
    ## 1      ddply1()            5    0.72       72      0.72
    ## 2      ddply2()            5    1.04      104      1.03
    ## 4         dt1()            5    0.01        1      0.02
    ## 5         dt2()            5    0.00        0      0.00
    ## 6      sqldf_()            5    0.21       21      0.20
    
  • 3

    使用 sqldf:

    library(sqldf)
    
    # Count the values below 4 per (mth, day). Group by both columns: with
    # GROUP BY day alone, equal day numbers from different months would be
    # merged, and the selected mth would come from an arbitrary row of each
    # group (SQLite, sqldf's default backend, permits such bare columns).
    sqldf("SELECT A.mth, A.day, sum(A.v < 4) AS sum FROM A GROUP BY A.mth, A.day")
    #  mth day sum
    #1   1  10   2
    #2   1  11   2
    

相关问题