首页 文章

使用dplyr和for循环添加多个滞后变量

提问于
浏览
1

我有时间序列数据,我正在预测,所以我创建滞后变量用于我的统计分析 . 我想在给定特定输入的情况下快速创建多个变量,以便我可以轻松地交叉验证和比较模型 .

以下是给定特定类别(A,B,C)的2个不同变量(总共4个)增加2个滞后的示例代码:

# Load dplyr
library(dplyr)

# create day, category, and 2 value vectors
days = 1:9
cats = rep(c('A','B','C'),3)
set.seed = 19
values1 = round(rnorm(9, 16, 4))
values2 = round(rnorm(9, 16, 16))

# create data frame
data = data.frame(days, cats, values1, values2)

# mutate new lag variables 
LagVal = data %>% arrange(days) %>% group_by(cats) %>% 
  mutate(LagVal1.1 = lag(values1, 1)) %>%
  mutate(LagVal1.2 = lag(values1, 2)) %>%
  mutate(LagVal2.1 = lag(values2, 1)) %>%
  mutate(LagVal2.2 = lag(values2, 2))

LagVal

       days   cats values1 values2 LagVal1.1 LagVal1.2 LagVal2.1 LagVal2.2
  <int> <fctr>   <dbl>   <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
1     1      A      16     -10        NA        NA        NA        NA
2     2      B      14      24        NA        NA        NA        NA
3     3      C      16      -6        NA        NA        NA        NA
4     4      A      12      25        16        NA       -10        NA
5     5      B      20      14        14        NA        24        NA
6     6      C      18      -5        16        NA        -6        NA
7     7      A      21       2        12        16        25       -10
8     8      B      19       5        20        14        14        24
9     9      C      18      -3        18        16        -5        -6

我的问题出现在 # mutate new lag variables 步骤,因为我有大约十二个预测变量,我可能想要滞后10倍(~13k行数据集),而且我没有心脏创建120个新变量 .

这是我尝试编写一个函数,该函数在给定 data (数据集为mutate), variables (您希望滞后的变量)和 lags (每个变量的滞后数)的输入的情况下改变新变量:

MultiMutate = function(data, variables, lags){
  # select the data to be working with
  FuncData = data
  # Loop through desired variables to mutate
  for (i in variables){
    # Loop through number of desired lags
    for (u in 1:lags){
      FuncData = FuncData %>% arrange(days) %>% group_by(cats) %>%
        # Mutate new variable for desired number of lags. Give new variable a name with the lag number appended
        mutate(paste(i, u) = lag(i, u))
    }
  }
  FuncData
}

说实话,我只是迷失了如何让它发挥作用 . 我的for循环和整体逻辑的排序是有意义的,但函数将字符转换为变量的方式和整体语法似乎有点偏离 . 有没有一种简单的方法来修复此功能以获得我想要的结果?

In particular, I'm looking for:

  • MultiMutate(data = data, variables = c(values1, values2), lags = 2) 这样的函数可以从上面创建 LagVal 的精确结果 .

  • 根据变量及其滞后动态命名变量 . 即value1.1,value1.2,value2.1,value2.2等

提前感谢您,如果您需要其他信息,请告诉我们 . 如果有一种更简单的方法来获得我正在寻找的东西,那么我全都是耳朵 .

2 回答

  • 7

    您必须深入到tidyverse工具箱中才能一次性添加它们 . 如果为 cats 的每个值嵌套数据,则可以迭代嵌套数据帧,迭代每个 values* 列的滞后 .

    library(tidyverse)
    set.seed(47)
    
    df <- data_frame(days = 1:9,
                     cats = rep(c('A','B','C'),3),
                     values1 = round(rnorm(9, 16, 4)),
                     values2 = round(rnorm(9, 16, 16)))
    
    
    df %>% nest(-cats) %>% 
        mutate(lags = map(data, function(dat) {
            imap_dfc(dat[-1], ~set_names(map(1:2, lag, x = .x), 
                                         paste0(.y, '_lag', 1:2)))
            })) %>% 
        unnest() %>% 
        arrange(days)
    #> # A tibble: 9 x 8
    #>   cats   days values1 values2 values1_lag1 values1_lag2 values2_lag1
    #>   <chr> <int>   <dbl>   <dbl>        <dbl>        <dbl>        <dbl>
    #> 1 A         1     24.     -7.          NA           NA           NA 
    #> 2 B         2     19.      1.          NA           NA           NA 
    #> 3 C         3     17.     17.          NA           NA           NA 
    #> 4 A         4     15.     24.          24.          NA           -7.
    #> 5 B         5     16.    -13.          19.          NA            1.
    #> 6 C         6     12.     17.          17.          NA           17.
    #> 7 A         7     12.     27.          15.          24.          24.
    #> 8 B         8     16.     15.          16.          19.         -13.
    #> 9 C         9     15.     36.          12.          17.          17.
    #> # ... with 1 more variable: values2_lag2 <dbl>
    

    data.table::shift 使这更简单,因为它是矢量化的 . 命名比实际滞后需要更多的工作:

    library(data.table)
    
    setDT(df)
    
    df[, sapply(1:2, function(x){paste0('values', x, '_lag', 1:2)}) := shift(.SD, 1:2), 
       by = cats, .SDcols = values1:values2][]
    #>    days cats values1 values2 values1_lag1 values1_lag2 values2_lag1
    #> 1:    1    A      24      -7           NA           NA           NA
    #> 2:    2    B      19       1           NA           NA           NA
    #> 3:    3    C      17      17           NA           NA           NA
    #> 4:    4    A      15      24           24           NA           -7
    #> 5:    5    B      16     -13           19           NA            1
    #> 6:    6    C      12      17           17           NA           17
    #> 7:    7    A      12      27           15           24           24
    #> 8:    8    B      16      15           16           19          -13
    #> 9:    9    C      15      36           12           17           17
    #>    values2_lag2
    #> 1:           NA
    #> 2:           NA
    #> 3:           NA
    #> 4:           NA
    #> 5:           NA
    #> 6:           NA
    #> 7:           -7
    #> 8:            1
    #> 9:           17
    
  • 3

    在这些情况下,我依靠 dplyrtidyr 的魔力:

    library(dplyr)
    library(tidyr)
    
    set.seed(47)
    
    # create data
    s_data = data_frame(
      days = 1:9,
      cats = rep(c('A', 'B', 'C'), 3),
      values1 = round(rnorm(9, 16, 4)),
      values2 = round(rnorm(9, 16, 16))
    )
    
    max_lag = 2 # define max number of lags
    
    # create lags
    s_data %>% 
      gather(select = -c("days", "cats")) %>% # gather all variables that will be lagged
      mutate(n_lag = list(0:max_lag)) %>% # add list-column with lag numbers
      unnest() %>% # unnest the list column
      arrange(cats, key, n_lag, days) %>% # order the data.frame
      group_by(cats, key, n_lag) %>% # group by relevant variables
      # create lag. when grouped by vars above, n_lag is a constant vector, take 1st value
      mutate(lag_val = lag(value, n_lag[1])) %>% 
      ungroup() %>% 
      # create some fancy labels 
      mutate(var_name = ifelse(n_lag == 0, key, paste0("Lag", key, ".", n_lag))) %>% 
      select(-c(key, value, n_lag)) %>% # drop unnecesary data
      spread(var_name, lag_val) %>% # spread your newly created variables
      select(days, cats, starts_with("val"), starts_with("Lag")) # reorder
    
    ## # A tibble: 9 x 8
    ##    days cats  values1 values2 Lagvalues1.1 Lagvalues1.2 Lagvalues2.1 Lagvalues2.2
    ##   <int> <chr>   <dbl>   <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
    ## 1     1 A         24.     -7.          NA           NA           NA           NA 
    ## 2     2 B         19.      1.          NA           NA           NA           NA 
    ## 3     3 C         17.     17.          NA           NA           NA           NA 
    ## 4     4 A         15.     24.          24.          NA           -7.          NA 
    ## 5     5 B         16.    -13.          19.          NA            1.          NA 
    ## 6     6 C         12.     17.          17.          NA           17.          NA 
    ## 7     7 A         12.     27.          15.          24.          24.          -7.
    ## 8     8 B         16.     15.          16.          19.         -13.           1.
    ## 9     9 C         15.     36.          12.          17.          17.          17.
    

相关问题