首页 文章

如何在每个组中创建滞后变量?

提问于
浏览
46

我有一个data.table:

set.seed(1)
data <- data.table(time = c(1:3, 1:4),
                   groups = c(rep(c("b", "a"), c(3, 4))),
                   value = rnorm(7))

data
#    groups time      value
# 1:      b    1 -0.6264538
# 2:      b    2  0.1836433
# 3:      b    3 -0.8356286
# 4:      a    1  1.5952808
# 5:      a    2  0.3295078
# 6:      a    3 -0.8204684
# 7:      a    4  0.4874291

我想在"groups"的每个级别内计算"value"列的滞后版本 .

结果应该是这样的

#   groups time      value  lag.value
# 1      a    1  1.5952808         NA
# 2      a    2  0.3295078  1.5952808
# 3      a    3 -0.8204684  0.3295078
# 4      a    4  0.4874291 -0.8204684
# 5      b    1 -0.6264538         NA
# 6      b    2  0.1836433 -0.6264538
# 7      b    3 -0.8356286  0.1836433

我试图直接使用 lag

data$lag.value <- lag(data$value)

......显然不行 .

我也尝试过:

unlist(tapply(data$value, data$groups, lag))
 a1         a2         a3         a4         b1         b2         b3 
 NA -0.1162932  0.4420753  2.1505440         NA  0.5894583 -0.2890288

这几乎是我想要的 . 但是,生成的向量的排序与data.table中的排序不同,这是有问题的 .

在基础R,plyr,dplyr和data.table中执行此操作的最有效方法是什么?

5 回答

  • 55

    你可以在 data.table 内做到这一点

    library(data.table)
     data[, lag.value:=c(NA, value[-.N]), by=groups]
      data
     #   time groups       value   lag.value
     #1:    1      a  0.02779005          NA
     #2:    2      a  0.88029938  0.02779005
     #3:    3      a -1.69514201  0.88029938
     #4:    1      b -1.27560288          NA
     #5:    2      b -0.65976434 -1.27560288
     #6:    3      b -1.37804943 -0.65976434
     #7:    4      b  0.12041778 -1.37804943
    

    对于多列:

    nm1 <- grep("^value", colnames(data), value=TRUE)
    nm2 <- paste("lag", nm1, sep=".")
    data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
     data
    #    time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    更新

    data.table 版本> = v1.9.5 ,我们可以使用 shifttype 作为 laglead . 默认情况下,类型为 lag .

    data[, (nm2) :=  shift(.SD), by=groups, .SDcols=nm1]
    #   time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    如果您需要反向,请使用 type=lead

    nm3 <- paste("lead", nm1, sep=".")
    

    使用原始数据集

    data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
      #  time groups      value     value1      value2 lead.value lead.value1
      #1:    1      b -0.6264538  0.7383247  1.12493092  0.1836433   0.5757814
      #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.8356286  -0.3053884
      #3:    3      b -0.8356286 -0.3053884 -0.01619026         NA          NA
      #4:    1      a  1.5952808  1.5117812  0.94383621  0.3295078   0.3898432
      #5:    2      a  0.3295078  0.3898432  0.82122120 -0.8204684  -0.6212406
      #6:    3      a -0.8204684 -0.6212406  0.59390132  0.4874291  -2.2146999
      #7:    4      a  0.4874291 -2.2146999  0.91897737         NA          NA
     #   lead.value2
     #1: -0.04493361
     #2: -0.01619026
     #3:          NA
     #4:  0.82122120
     #5:  0.59390132
     #6:  0.91897737
     #7:          NA
    

    数据

    set.seed(1)
     data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
                 value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
    
  • 73

    使用包 dplyr

    library(dplyr)
    data <- 
        data %>%
        group_by(groups) %>%
        mutate(lag.value = dplyr::lag(value, n = 1, default = NA))
    

    > data
    Source: local data table [7 x 4]
    Groups: groups
    
      time groups       value   lag.value
    1    1      a  0.07614866          NA
    2    2      a -0.02784712  0.07614866
    3    3      a  1.88612245 -0.02784712
    4    1      b  0.26526825          NA
    5    2      b  1.23820506  0.26526825
    6    3      b  0.09276648  1.23820506
    7    4      b -0.09253594  0.09276648
    

    正如@BrianD所指出的,这隐含地假设值已经按组排序 . 如果不是,请按组排序,或使用 lag 中的 order_by 参数 . 另请注意,由于existing issue具有某些版本的dplyr,为了安全起见,应明确给出参数和命名空间 .

  • 2

    在基地R,这将完成工作:

    data$lag.value <- c(NA, data$value[-nrow(data)])
    data$lag.value[which(!duplicated(data$groups))] <- NA
    

    第一行添加了一串滞后(1)观测值 . 第二个字符串校正每个组的第一个条目,因为滞后观察来自前一个组 .

    请注意 data 的格式为 data.frame ,不使用 data.table .

  • 5

    如果您想确保在排序数据时避免任何问题,可以使用dplyr手动执行此操作,例如:

    df <- data.frame(Names = c(rep('Dan',50),rep('Dave',100)),
                Dates = c(seq(1,100,by=2),seq(1,100,by=1)),
                Values = rnorm(150,0,1))
    
    df <- df %>% group_by(Names) %>% mutate(Rank=rank(Dates),
                                        RankDown=Rank-1)
    
    df <- df %>% left_join(select(df,Rank,ValueDown=Values,Names),by=c('RankDown'='Rank','Names')
    ) %>% select(-Rank,-RankDown)
    
    head(df)
    

    或者我喜欢把它放在一个带有所选分组变量,排序列(如Date或其他)和所选滞后数的函数中的想法 . 这也需要lazyeval以及dplyr .

    groupLag <- function(mydf,grouping,ranking,lag){
      df <- mydf
      groupL <- lapply(grouping,as.symbol)
    
      names <- c('Rank','RankDown')
      foos <- list(interp(~rank(var),var=as.name(ranking)),~Rank-lag)
    
      df <- df %>% group_by_(.dots=groupL) %>% mutate_(.dots=setNames(foos,names))
    
      selectedNames <- c('Rank','Values',grouping)
      df2 <- df %>% select_(.dots=selectedNames)
      colnames(df2) <- c('Rank','ValueDown',grouping)
    
      df <- df %>% left_join(df2,by=c('RankDown'='Rank',grouping)) %>% select(-Rank,-RankDown)
    
      return(df)
    }
    
    groupLag(df,c('Names'),c('Dates'),1)
    
  • 2

    我想通过两种方式来补充之前的答案,我在重要的案例 when you are not guaranteed that each group has data for every time period 中处理这个问题 . 也就是说,你仍然有一个有规律的间隔时间序列,但是这里和那里可能会有缺失 . 我将重点介绍两种改进 dplyr 解决方案的方法 .

    我们从您使用的相同数据开始...

    library(dplyr)
    library(tidyr)
    
    set.seed(1)
    data_df = data.frame(time   = c(1:3, 1:4),
                         groups = c(rep(c("b", "a"), c(3, 4))),
                         value  = rnorm(7))
    data_df
    #>   time groups      value
    #> 1    1      b -0.6264538
    #> 2    2      b  0.1836433
    #> 3    3      b -0.8356286
    #> 4    1      a  1.5952808
    #> 5    2      a  0.3295078
    #> 6    3      a -0.8204684
    #> 7    4      a  0.4874291
    

    ...但现在我们删除了几行

    data_df = data_df[-c(2, 6), ]
    data_df
    #>   time groups      value
    #> 1    1      b -0.6264538
    #> 3    3      b -0.8356286
    #> 4    1      a  1.5952808
    #> 5    2      a  0.3295078
    #> 7    4      a  0.4874291
    

    简单的dplyr解决方案不再有效

    data_df %>% 
      arrange(groups, time) %>% 
      group_by(groups) %>% 
      mutate(lag.value = lag(value)) %>% 
      ungroup()
    #> # A tibble: 5 x 4
    #>    time groups  value lag.value
    #>   <int> <fct>   <dbl>     <dbl>
    #> 1     1 a       1.60     NA    
    #> 2     2 a       0.330     1.60 
    #> 3     4 a       0.487     0.330
    #> 4     1 b      -0.626    NA    
    #> 5     3 b      -0.836    -0.626
    

    你看,虽然我们没有案例 (group = 'a', time = '3') 的值,但在 (group = 'a', time = '4') 的情况下,上面仍然显示滞后值,这实际上是 time = 2 的值 .

    正确的dplyr解决方案

    我们的想法是添加缺失的(组,时间)组合 . 当你有很多可能的(组,时间)组合时,这是低效率的,但是稀疏地捕获了这些值 .

    dplyr_correct_df = expand.grid(
      groups = sort(unique(data_df$groups)),
      time   = seq(from = min(data_df$time), to = max(data_df$time))
    ) %>% 
      left_join(data_df, by = c("groups", "time")) %>% 
      arrange(groups, time) %>% 
      group_by(groups) %>% 
      mutate(lag.value = lag(value)) %>% 
      ungroup()
    dplyr_correct_df
    #> # A tibble: 8 x 4
    #>   groups  time   value lag.value
    #>   <fct>  <int>   <dbl>     <dbl>
    #> 1 a          1   1.60     NA    
    #> 2 a          2   0.330     1.60 
    #> 3 a          3  NA         0.330
    #> 4 a          4   0.487    NA    
    #> 5 b          1  -0.626    NA    
    #> 6 b          2  NA        -0.626
    #> 7 b          3  -0.836    NA    
    #> 8 b          4  NA        -0.836
    

    请注意,我们现在在 (group = 'a', time = '4') 处有一个NA,这应该是预期的行为 . 与 (group = 'b', time = '3') 相同 .

    繁琐但也正确的解决方案使用类zoo :: zooreg

    在案例数量非常大的情况下,此解决方案应该在内存方面更好地工作,因为它不使用NA来填充缺失的案例,而是使用索引 .

    library(zoo)
    
    zooreg_correct_df = data_df %>% 
      as_tibble() %>% 
      # nest the data for each group
      # should work for multiple groups variables
      nest(-groups, .key = "zoo_ob") %>%
      mutate(zoo_ob = lapply(zoo_ob, function(d) {
    
        # create zooreg objects from the individual data.frames created by nest
        z = zoo::zooreg(
          data      = select(d,-time),
          order.by  = d$time,
          frequency = 1
        ) %>% 
          # calculate lags
          # we also ask for the 0'th order lag so that we keep the original value
          zoo:::lag.zooreg(k = (-1):0) # note the sign convention is different
    
        # recover df's from zooreg objects
        cbind(
          time = as.integer(zoo::index(z)),
          zoo:::as.data.frame.zoo(z)
        )
    
      })) %>% 
      unnest() %>% 
      # format values
      select(groups, time, value = value.lag0, lag.value = `value.lag-1`) %>% 
      arrange(groups, time) %>% 
      # eliminate additional periods created by lag
      filter(time <= max(data_df$time))
    zooreg_correct_df
    #> # A tibble: 8 x 4
    #>   groups  time   value lag.value
    #>   <fct>  <int>   <dbl>     <dbl>
    #> 1 a          1   1.60     NA    
    #> 2 a          2   0.330     1.60 
    #> 3 a          3  NA         0.330
    #> 4 a          4   0.487    NA    
    #> 5 b          1  -0.626    NA    
    #> 6 b          2  NA        -0.626
    #> 7 b          3  -0.836    NA    
    #> 8 b          4  NA        -0.836
    

    最后,让我们检查两个正确的解决方案是否真的相同:

    all.equal(dplyr_correct_df, zooreg_correct_df)
    #> [1] TRUE
    

相关问题