Home Articles

用dplyr滚动回归

Asked
Viewed 171 times
3

我有一个“日期”,“公司”和“返回”的数据框,可通过以下代码重现:

library(dplyr)
n.dates <- 60
n.stocks <- 2
date <- seq(as.Date("2011-07-01"), by=1, len=n.dates)
symbol <- replicate(n.stocks, paste0(sample(LETTERS, 5), collapse = ""))
x <- expand.grid(date, symbol)
x$return <- rnorm(n.dates*n.stocks, 0, sd = 0.05)
names(x) <- c("date", "company", "return")

使用此数据框,我可以计算每日市场平均回报,并将该结果添加到新列“market.ret”中 .

x <- group_by(x, date)    
x <- mutate(x, market.ret = mean(x$return, na.rm = TRUE))

现在我想将不同公司的所有数据分组(在本例中为2) .

x <- group_by(x, company)

在这样做之后,我想通过“market.ret”拟合“返回”并计算线性回归系数并将斜率存储在新列中 . 如果我想对给定公司内的整个数据集进行拟合,那么我可以简单地调用lm():

group_by(x, company) %>%
do(data.frame(beta = coef(lm(return ~ market.ret,data = .))[2])) %>%
left_join(x,.)

但是,我实际上想要在“滚动”的基础上进行线性回归,即在20天的尾随期间分别进行每一天 . 我想使用rollapply()但不知道如何将两列传递给函数 . 非常感谢任何帮助或建议 .

注意:下面是我用来计算20天回滚标准差的代码,可能会有所帮助:

sdnoNA <- function(x){return(sd(x, na.rm = TRUE))}
x <- mutate(x, sd.20.0.d = rollapply(return, FUN = sdnoNA, width = 20, fill = NA))

2 Answers

  • 2
    ## lms is a function which calculate the linear regression coefficient
    lms <- function(y, x){
    s = which(is.finite(x * y))
    y = y[s]
    x = x[s]
    return(cov(x, y)/var(x))
    }
    
    ## z is a dataframe which stores our final result
    z <- data.frame()
    
    ## x has to be ungrouped
    x <- ungroup(x)
    
    ## subset with "filter" and roll with "rollapply"
    symbols <- unique(x$company)
    for(i in 1:length(symbols)){
    temp <- filter(x, company == symbols[i])
    z <- rbind(z, mutate(temp, beta = rollapply(temp[, c(3, 4)], 
                                              FUN = function(x) lms(x[, 1], x[, 2]),
                                              width = 20, fill = NA,
                                              by.column = FALSE, align = "right")))
    }
    
    ## final result
    print(z)
    
  • 0

    这是一个 dplyr 解决方案

    #####
    # setup data as OP (notice the fix when computing the market return)
    library(dplyr)
    set.seed(41797642)
    n.dates <- 60
    n.stocks <- 2
    date <- seq(as.Date("2011-07-01"), by=1, len=n.dates)
    symbol <- replicate(n.stocks, paste0(sample(LETTERS, 5), collapse = ""))
    x <- expand.grid(date, symbol)
    x$return <- rnorm(n.dates*n.stocks, 0, sd = 0.05)
    names(x) <- c("date", "company", "return")
    
    x <- x %>%
      group_by(date) %>%
      mutate(market.ret = mean(return))
    
    #####
    # compute coefs using rollRegres
    library(rollRegres)
    func <- . %>% {
        roll_regres.fit(x = cbind(1, .$market.ret),
                        y = .$return, width = 20L)$coefs }
    out <- x %>%
      group_by(company) %>%
      # make it explicit that data needs to be sorted
      arrange(date, .by_group = TRUE) %>%
      do(cbind(reg_col = select(., market.ret, return) %>% func,
               date_col = select(., date))) %>%
      ungroup
    
    head(out[!is.na(out$reg_col.1), ], 5)
    #R # A tibble: 5 x 4
    #R company reg_col.1 reg_col.2 date
    #R   <fct>       <dbl>     <dbl> <date>
    #R 1 SNXAD    -0.0104      0.746 2011-07-20
    #R 2 SNXAD    -0.00953     0.755 2011-07-21
    #R 3 SNXAD    -0.0124      0.784 2011-07-22
    #R 4 SNXAD    -0.0167      0.709 2011-07-23
    #R 5 SNXAD    -0.0148      0.691 2011-07-24
    tail(out[!is.na(out$reg_col.1), ], 5)
    #R # A tibble: 5 x 4
    #R company  reg_col.1 reg_col.2 date
    #R   <fct>        <dbl>     <dbl> <date>
    #R 1 UYLTS   -0.00276       0.837 2011-08-25
    #R 2 UYLTS    0.0000438     0.928 2011-08-26
    #R 3 UYLTS    0.000250      0.936 2011-08-27
    #R 4 UYLTS   -0.000772      0.886 2011-08-28
    #R 5 UYLTS    0.00173       0.902 2011-08-29
    

    尽管使用了 rollRegres 包,它非常接近the answer here,这与this answer相当接近 .

Related