Home Articles

对数据框(data frame)进行滚动回归

Asked
Viewed 553 times
2

我知道这个问题可能以前有人问过,但我还没有找到针对数据框(data frame)的明确解决方案。

我希望以 5 天的回溯窗口运行滚动线性回归(这里用较小的窗口以便说明)。

到目前为止,我正在尝试:

# Rolling 5-row regression of y_Close on x_Close with no intercept.
#
# Only the two numeric columns are passed in: rollapply() hands each window
# to FUN as a matrix, so including the Date column would coerce every value
# to character and lm() would no longer see numeric variables (which is what
# produced many output columns instead of a single beta).
rollingbeta <- rollapply(
  df[, c("x_Close", "y_Close")],
  width = 5,
  FUN = function(Z) {
    fit <- lm(y_Close ~ x_Close + 0, data = as.data.frame(Z))
    # Return the single slope as a plain scalar so the result has one column.
    # (In the earlier `return(t$coef)[1]` the `[1]` was never evaluated,
    # because return() exits the function first.)
    unname(coef(fit)[1])
  },
  by.column = FALSE, align = "right", fill = NA
)

head(rollingbeta, 100)

但是,我希望得到每个滚动回溯窗口的 beta(回归系数)。结果却得到了 10 列输出。

> NCOL(rollingbeta)
[1] 10

有人可以帮忙吗?

这是示例数据(保存为 .txt 文件后再读取):

# Read the whitespace-separated sample file. A forward slash is used as the
# path separator because "\d" is not a valid escape sequence in an R string
# and makes the original line a parse error.
df <- read.table("your_dir/df.txt", header = TRUE, sep = "",
                 stringsAsFactors = FALSE)

           Date open.x high.x low.x x_Close volume.x open.y high.y low.y y_Close volume.y x.y.cor
1451 2010-01-04  57.32  58.13 57.32   57.85   442900   6.61 6.8400  6.61    6.83   833100      NA
1452 2010-01-05  57.90  58.33 57.54   58.20   436900   6.82 7.1200  6.80    7.12   904500      NA
1453 2010-01-06  58.20  58.56 58.01   58.42   850600   7.05 7.3800  7.05    7.27   759800      NA
1454 2010-01-07  58.31  58.41 57.14   57.90   463600   7.24 7.3000  7.06    7.11   557800      NA
1455 2010-01-08  57.45  58.62 57.45   58.47   206500   7.08 7.3500  6.95    7.29   588100      NA
1456 2010-01-11  58.79  59.00 57.22   57.73   331900   7.38 7.4500  7.17    7.22   450500      NA
1457 2010-01-12  57.20  57.21 56.15   56.34   428500   7.15 7.1900  6.87    7.00   694700      NA
1458 2010-01-13  56.32  56.66 54.83   56.56   577500   7.05 7.1700  6.98    7.15   528800      NA
1459 2010-01-14  56.51  57.05 55.37   55.53   368100   7.08 7.1701  7.08    7.11   279900      NA
1460 2010-01-15  56.59  56.59 55.19   55.84   417900   7.03 7.0500  6.95    7.03   407600      NA

第一次滚动线性回归的输出应为:

NA NA NA NA NA 0.1229065

2 Answers

  • 1

    可以考虑使用 roll 包。

    # Attach magrittr and check that the roll namespace can be loaded; roll is
    # called below through the explicit roll:: prefix.
    library(magrittr); requireNamespace("roll")
    # Parse the inline CSV text (the sample data from the question) into `ds`.
    ds <- readr::read_csv(
      "     Date, open.x, high.x, low.x, x_Close, volume.x, open.y, high.y, low.y, y_Close, volume.y
      2010-01-04,  57.32,  58.13, 57.32,   57.85,   442900,   6.61, 6.8400,  6.61,    6.83,   833100
      2010-01-05,  57.90,  58.33, 57.54,   58.20,   436900,   6.82, 7.1200,  6.80,    7.12,   904500
      2010-01-06,  58.20,  58.56, 58.01,   58.42,   850600,   7.05, 7.3800,  7.05,    7.27,   759800
      2010-01-07,  58.31,  58.41, 57.14,   57.90,   463600,   7.24, 7.3000,  7.06,    7.11,   557800
      2010-01-08,  57.45,  58.62, 57.45,   58.47,   206500,   7.08, 7.3500,  6.95,    7.29,   588100
      2010-01-11,  58.79,  59.00, 57.22,   57.73,   331900,   7.38, 7.4500,  7.17,    7.22,   450500
      2010-01-12,  57.20,  57.21, 56.15,   56.34,   428500,   7.15, 7.1900,  6.87,    7.00,   694700
      2010-01-13,  56.32,  56.66, 54.83,   56.56,   577500,   7.05, 7.1700,  6.98,    7.15,   528800
      2010-01-14,  56.51,  57.05, 55.37,   55.53,   368100,   7.08, 7.1701,  7.08,    7.11,   279900
      2010-01-15,  56.59,  56.59, 55.19,   55.84,   417900,   7.03, 7.0500,  6.95,    7.03,   407600"
    )
    
    # Rolling regression of y_Close on x_Close over windows of 5 rows,
    # fitted without an intercept. Both inputs are passed as one-column
    # matrices.
    runs <- roll::roll_lm(
      x         = as.matrix(ds$x_Close),
      y         = as.matrix(ds$y_Close), 
      width     = 5, 
      intercept = FALSE
    )
    
    # The slope sits in the "x1" column of the `coefficients` matrix, which is
    # itself one element of the list returned by roll_lm().
    ds$beta <- runs$coefficients[, "x1"]
    
    # Recorded output: the first four entries are NA, then one slope per row.
    ds$beta 
    #  [1]        NA        NA        NA        NA 0.1224813
    #  [6] 0.1238653 0.1242478 0.1246279 0.1256553 0.1259121
    

    请仔细检查数据集中各变量的对齐方式。x_Close 约为 50,而 y_Close 约为 7。这或许可以解释预期值 0.1229065 与计算结果 0.1224813 之间的微小差异。

  • 1

    比 wibeasley 的回答更快的替代方法是使用 rollRegres 包,如下所示:

    # The sample data written out as a literal data.frame with 10 rows.
    # The Date column is stored as day counts carrying class "Date".
    ds <- structure(list(Date = structure(
      c(14613, 14614, 14615, 14616, 14617, 14620, 14621, 14622, 14623, 14624), class = "Date"),
      open.x = c(57.32, 57.9, 58.2, 58.31, 57.45, 58.79, 57.2, 56.32, 56.51, 56.59),
      high.x = c(58.13, 58.33, 58.56, 58.41, 58.62, 59, 57.21, 56.66, 57.05, 56.59),
      low.x = c(57.32, 57.54, 58.01, 57.14, 57.45, 57.22, 56.15, 54.83, 55.37, 55.19),
      x_Close = c(57.85, 58.2, 58.42, 57.9, 58.47, 57.73, 56.34, 56.56, 55.53, 55.84),
      volume.x = c(442900L, 436900L, 850600L, 463600L, 206500L, 331900L, 428500L, 577500L, 368100L, 417900L),
      open.y = c(6.61, 6.82, 7.05, 7.24, 7.08, 7.38, 7.15, 7.05, 7.08, 7.03),
      high.y = c(6.84, 7.12, 7.38, 7.3, 7.35, 7.45, 7.19, 7.17, 7.1701, 7.05),
      low.y = c(6.61, 6.8, 7.05, 7.06, 6.95, 7.17, 6.87, 6.98, 7.08, 6.95),
      y_Close = c(6.83, 7.12, 7.27, 7.11, 7.29, 7.22, 7, 7.15, 7.11, 7.03),
      volume.y = c(833100L, 904500L, 759800L, 557800L, 588100L, 450500L, 694700L, 528800L, 279900L, 407600L)),
      row.names = c(NA, -10L), class = "data.frame")
    
    # Check that roll::roll_lm() and rollRegres::roll_regres.fit() give the
    # same rolling slopes for a 5-row window without an intercept.
    library(roll)
    library(rollRegres)
    # roll_lm() is given x and y as one-column matrices; roll_regres.fit() is
    # given the same x matrix and y as a plain vector.
    X <- as.matrix(ds$x_Close)
    Y <- ds$y_Close
    Ymat <- as.matrix(Y)
    
    # NOTE(review): `[, 2]` selects the slope by position, which presumes the
    # coefficients matrix has a column before it even though intercept = FALSE.
    # Selecting by column name would not depend on that layout; confirm
    # against the installed roll version.
    # drop() removes the extra dimension from `$coefs`, and
    # check.attributes = FALSE makes all.equal() ignore names/dim attributes.
    all.equal(
      roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L)$coefficients[, 2],
      drop(roll_regres.fit(x = X, y = Y, width = 5L)$coefs),
      check.attributes = FALSE)
    #R [1] TRUE
    

    使用 roll_regres 函数时,您也可以像 lm 那样通过公式(formula)来指定模型:

    # Same comparison through the formula interface of roll_regres(); the
    # `- 1` term removes the intercept, matching intercept = FALSE in
    # roll_lm(). Reuses X, Ymat and ds from the preceding snippet.
    # NOTE(review): `[, 2]` is a positional index into the roll_lm()
    # coefficients; confirm the column layout for the installed roll version.
    all.equal(
      roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L)$coefficients[, 2],
      drop(roll_regres(y_Close ~ x_Close - 1, ds, width = 5L)$coefs),
      check.attributes = FALSE)
    #R [1] TRUE
    

    下面是计算速度的基准测试:

    # We add a few more observations to get an interesting example
    # Fixed seed so the simulated series are reproducible.
    set.seed(1)
    n <- 250 * 5 # 5 years of trading data
    # Independent random normal draws stand in for the regressor and the
    # response; only the run time matters here, not the fitted values.
    X <- as.matrix(rnorm(n))
    Y <- rnorm(n)
    Ymat <- as.matrix(Y)
    
    # Time both implementations on the same inputs, 1000 evaluations each.
    microbenchmark::microbenchmark(
      roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L),
      roll_regres.fit(x = X, y = Y, width = 5L),
      times = 1e3)
    # Recorded result: median 777.1 microseconds for roll_lm() versus
    # 224.8 microseconds for roll_regres.fit().
    #R Unit: microseconds
    #R                                                     expr   min    lq  mean median    uq    max neval
    #R  roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L) 663.7 739.9 834.2  777.1 860.2 3972.3  1000
    #R                roll_regres.fit(x = X, y = Y, width = 5L) 186.9 204.6 237.4  224.8 248.3  546.4  1000
    

Related