Home Articles

并行化R中的滚动窗口回归

Asked
Viewed 1364 times
11

我正在运行一个非常类似于以下代码的滚动回归:

library(PerformanceAnalytics)
library(quantmod)
data(managers)

FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
  df <- as.data.frame(df)
  model <- lm(FL,data=df[1:30,])
  predict(model,newdata=df[31,])
}

system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
    by.column = FALSE, align = "right", na.pad = TRUE))

我有一些额外的处理器,所以我试图找到一种方法来并行化滚动窗口 . 如果这是一个非滚动回归,我可以使用apply系列函数轻松地并行化它...

2 Answers

  • 9

    显而易见的是使用 lm.fit() 而不是 lm() ,因此您不会因处理公式等而产生所有开销 .

    Update: 所以,当我说明白我的意思是 blindingly obvious but deceptively difficult to implement

    经过一番摆弄后,我想出了这个

    library(PerformanceAnalytics)
    library(quantmod)
    data(managers)
    

    第一阶段是要意识到模型矩阵可以预先构建,所以我们这样做并将其转换回Zoo对象以与 rollapply() 一起使用:

    mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4, data = managers, 
                         na.action = na.pass)
    mmat2 <- cbind.data.frame(mmat2[,1], Intercept = 1, mmat2[,-1])
    mmatZ <- as.zoo(mmat2)
    

    现在我们需要一个函数来使用 lm.fit() 来完成繁重的工作,而不必在每次迭代时创建设计矩阵:

    MyRegression2 <- function(Z) {
        ## store value we want to predict for
        pred <- Z[31, -1, drop = FALSE]
        ## get rid of any rows with NA in training data
        Z <- Z[1:30, ][!rowSums(is.na(Z[1:30,])) > 0, ]
        ## Next() would lag and leave NA in row 30 for response
        ## but we precomputed model matrix, so drop last row still in Z
        Z <- Z[-nrow(Z),]
        ## fit the model
        fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
        ## get things we need to predict, in case pivoting turned on in lm.fit
        p <- fit$rank
        p1 <- seq_len(p)
        piv <- fit$qr$pivot[p1]
        ## model coefficients
        beta <- fit$coefficients
        ## this gives the predicted value for row 31 of data passed in
        drop(pred[, piv, drop = FALSE] %*% beta[piv])
    }
    

    时间比较:

    > system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
    +                                 by.column = FALSE, align = "right", 
    +                                 na.pad = TRUE))
       user  system elapsed 
      0.925   0.002   1.020 
    > 
    > system.time(Result2 <- rollapply(mmatZ, 31, FUN = MyRegression2,
    +                                  by.column = FALSE,  align = "right",
    +                                  na.pad = TRUE))
       user  system elapsed 
      0.048   0.000   0.05
    

    这比原版提供了相当合理的改进 . 现在检查生成的对象是否相同:

    > all.equal(Result, Result2)
    [1] TRUE
    

    请享用!

  • 1

    新答案

    我写了一个包, rollRegres ,这样做得更快 . 它比Gavin Simpson's answer快〜58倍 . 这是一个例子

    # simulate data
    set.seed(101)
    n <- 10000
    wdth <- 50
    X <- matrix(rnorm(10 * n), n, 10)
    y <- drop(X %*% runif(10)) + rnorm(n)
    Z <- cbind(y, X)
    
    # assign other function
    lm_version <- function(Z, width = wdth) {
      pred <- Z[width + 1, -1, drop = FALSE]
      ## fit the model
      Z <- Z[-nrow(Z), ]
      fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
      ## get things we need to predict, in case pivoting turned on in lm.fit
      p <- fit$rank
      p1 <- seq_len(p)
      piv <- fit$qr$pivot[p1]
      ## model coefficients
      beta <- fit$coefficients
      ## this gives the predicted value for next obs
      drop(pred[, piv, drop = FALSE] %*% beta[piv])
    }
    
    # show that they yield the same
    library(rollRegres) # the new package
    library(zoo)
    all.equal(
      rollapply(Z, wdth + 1, FUN = lm_version,
                by.column = FALSE,  align = "right", fill = NA_real_),
      roll_regres.fit(
        x = X, y = y, width = wdth, do_compute = "1_step_forecasts"
        )$one_step_forecasts)
    #R [1] TRUE
    
    # benchmark
    library(compiler)
    lm_version <- cmpfun(lm_version)
    microbenchmark::microbenchmark(
      newnew = roll_regres.fit(
        x = X, y = y, width = wdth, do_compute = "1_step_forecasts"),
      prev   = rollapply(Z, wdth + 1, FUN = lm_version,
                         by.column = FALSE,  align = "right", fill = NA_real_),
      times = 10)
    #R Unit: milliseconds
    #R   expr       min        lq      mean    median        uq       max neval
    #R newnew  10.27279  10.48929  10.91631  11.04139  11.13877  11.87121    10
    #R   prev 555.45898 565.02067 582.60309 582.22285 602.73091 605.39481    10
    

    老答案

    您可以通过更新分解来缩短运行时间 . 这会在每次迭代时产生
    frm1
    成本而不是
    frm1
    ,其中n是窗口宽度 . 下面是一个比较两者的代码 . 在C中执行此操作可能要快得多,但R不包含LINPACK dchuddchdd ,因此您必须编写一个包来执行此操作 . 此外,我记得读到你可以更快地使用其他实现,而不是用于R更新的LINPACK dchuddchdd

    library(SamplerCompare) # for LINPACK `chdd` and `chud`
    roll_forcast <- function(X, y, width){
      n <- nrow(X)
      p <- ncol(X)
      out <- rep(NA_real_, n)
    
      is_first <- TRUE
      i <- width 
      while(i < n){
        if(is_first){
          is_first <- FALSE
          qr. <- qr(X[1:width, ])
          R <- qr.R(qr.)
    
          # Use X^T for the rest
          X <- t(X)
    
          XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
        } else {
          x_new <- X[, i]
          x_old <- X[, i - width]
    
          # update R 
          R <- .Fortran(
            "dchud", R, p, p, x_new, 0., 0L, 0L, 
            0., 0., numeric(p), numeric(p), 
            PACKAGE = "SamplerCompare")[[1]]
    
          # downdate R
          R <- .Fortran(
            "dchdd", R, p, p, x_old, 0., 0L, 0L, 
            0., 0., numeric(p), numeric(p), integer(1),
            PACKAGE = "SamplerCompare")[[1]]
    
          # update XtY
          XtY <- XtY + y[i] * x_new - y[i - width] * x_old
        }
    
        coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE))
        coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE))
    
        i <- i + 1
        out[i] <- X[, i] %*% coef.
      }
    
      out
    }
    
    # simulate data
    set.seed(101)
    n <- 10000
    wdth = 50
    X <- matrix(rnorm(10 * n), n, 10)
    y <- drop(X %*% runif(10)) + rnorm(n)
    Z <- cbind(y, X)
    
    # assign other function
    lm_version <- function(Z, width = wdth) {
      pred <- Z[width + 1, -1, drop = FALSE]
      ## fit the model
      Z <- Z[-nrow(Z), ]
      fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
      ## get things we need to predict, in case pivoting turned on in lm.fit
      p <- fit$rank
      p1 <- seq_len(p)
      piv <- fit$qr$pivot[p1]
      ## model coefficients
      beta <- fit$coefficients
      ## this gives the predicted value for row 31 of data passed in
      drop(pred[, piv, drop = FALSE] %*% beta[piv])
    }
    
    # show that they yield the same
    library(zoo)
    all.equal(
      rollapply(Z, wdth + 1, FUN = lm_version,  
                by.column = FALSE,  align = "right", fill = NA_real_),
      roll_forcast(X, y, wdth))
    #R> [1] TRUE
    
    # benchmark
    library(compiler)
    roll_forcast <- cmpfun(roll_forcast)
    lm_version <- cmpfun(lm_version)
    microbenchmark::microbenchmark(
      new =  roll_forcast(X, y, wdth),
      prev = rollapply(Z, wdth + 1, FUN = lm_version,  
                       by.column = FALSE,  align = "right", fill = NA_real_), 
      times = 10)
    #R> Unit: milliseconds
    #R> expr      min       lq     mean   median       uq      max neval cld
    #R>  new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414    10  a 
    #R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034    10   b
    

Related