首页 文章

在过去365天窗口中执行运行总计的有效方法

提问于
浏览
4

这就是我的数据框架:

库(data.table)

df <- fread('
                Name  EventType  Date  SalesAmount RunningTotal Runningtotal(prior365Days)
                John    Email      1/1/2014      0          0            0
                John    Sale       2/1/2014     10          10           10
                John    Sale       7/1/2014     20          30           30
                John    Sale       4/1/2015     30          60           50 
                John    Webinar    5/1/2015      0          60           50
                Tom     Email      1/1/2014      0          0            0
                Tom     Sale       2/1/2014     15          15           15
                Tom     Sale       7/1/2014     10          25           25
                Tom     Sale       4/1/2015     25          50           35 
                Tom     Webinar    5/1/2015      0          50           35
                ')
    df[,Date:= as.Date(Date, format="%m/%d/%Y")]

最后一列是我想要的列,它是过去365天滚动窗口中SalesAmount(对于每个Name)的累积总和,我在@ 6pool的帮助下执行了此操作 . 他的解决方案是:

df$EventDate <- as.Date(df$EventDate, format="%d/%m/%Y")
df <- df %>%
   group_by (Name) %>%
   arrange(EventDate) %>% 
   mutate(day = EventDate - EventDate[1])

f <- Vectorize(function(i)
    sum(df[df$Name[i] == df$Name & df$day[i] - df$day >= 0 & 
             df$day[i] - df$day <= 365, "SalesAmount"]), vec="i")
df$RunningTotal365 <- f(1:nrow(df))

但是,当我的数据帧超过150万行时,df $ RunningTotal365 < - f(1:nrow(df))需要很长时间(目前为止超过1.5天) . 我在最初的问题中被建议“rollapply”但我在这个例子中一直在努力弄清楚如何使用它 . 请帮助 .

3 回答

  • 4

    尝试一下:

    DF <- read.table(text = "Name  EventType  EventDate  SalesAmount RunningTotal Runningtotal(prior365Days)
    John    Email      1/1/2014      0          0            0
    John    Sale       2/1/2014     10          10           10
    John    Sale       7/1/2014     20          30           30
    John    Sale       4/1/2015     30          60           50 
    John    Webinar    5/1/2015      0          60           50
    Tom     Email      1/1/2014      0          0            0
    Tom     Sale       2/1/2014     15          15           15
    Tom     Sale       7/1/2014     10          25           25
    Tom     Sale       4/1/2015     25          50           35 
    Tom     Webinar    5/1/2015      0          50           35", header = TRUE)
    
    
    fun <- function(x, date, thresh) {
      D <- as.matrix(dist(date)) #distance matrix between dates
      D <- D <= thresh
      D[lower.tri(D)] <- FALSE #don't sum to future
      R <- D * x #FALSE is treated as 0
      colSums(R)
    }
    
    
    library(data.table)
    setDT(DF)
    DF[, EventDate := as.Date(EventDate, format = "%m/%d/%Y")]
    setkey(DF, Name, EventDate)
    
    DF[, RT365 := fun(SalesAmount, EventDate, 365), by = Name]
    
    #    Name EventType  EventDate SalesAmount RunningTotal Runningtotal.prior365Days. RT365
    # 1: John     Email 2014-01-01           0            0                          0     0
    # 2: John      Sale 2014-02-01          10           10                         10    10
    # 3: John      Sale 2014-07-01          20           30                         30    30
    # 4: John      Sale 2015-04-01          30           60                         50    50
    # 5: John   Webinar 2015-05-01           0           60                         50    50
    # 6:  Tom     Email 2014-01-01           0            0                          0     0
    # 7:  Tom      Sale 2014-02-01          15           15                         15    15
    # 8:  Tom      Sale 2014-07-01          10           25                         25    25
    # 9:  Tom      Sale 2015-04-01          25           50                         35    35
    #10:  Tom   Webinar 2015-05-01           0           50                         35    35
    
  • 0

    这是使用 data.table 包中的 foverlaps 函数的方法:

    require(data.table)
    setDT(df)[, end := as.Date(EventDate, format="%d/%m/%Y")
            ][, start := end - 365L]
    setkey(df, Name, start, end)
    olaps = foverlaps(df, df, nomatch=0L, which=TRUE)
    olaps = olaps[xid >= yid, .(ans = sum(dt$SalesAmount[yid])), by=xid]
    
    df[olaps$xid, Runningtotal := olaps$ans]
    

    如有必要,您可以通过执行以下操作删除 startend 列:

    df[, c("start", "end") := NULL]
    

    很高兴知道它有多快/慢..

  • 4

    在data.table中使用更新的非equi连接功能:

    df1 = df[.(iName=Name,start = Date - 365L, end = Date),
        on=.(Name=iName,Date >= start, Date <= end),nomatch = 0, allow.cart=TRUE][,
      .(MyTotal = sum(SalesAmount)), by=.(Name,Date = Date.1)]
    
    
        df[df1, on = .(Name,Date)]
    

相关问题