首页 文章

从开始和结束时间之间的间隔计算的R组变量

提问于
浏览
5

我有一个数据框如下

tmpdf <- data.frame(licensePlate = c("Y80901", "Y80901", "Y80901", "AMG-999", "AMG-999", "W3188", "W3188"),  
starttime= c("2015-09-18 09:55", "2015-09-18 23:00", "2015-09-20 15:00", "2015-09-17 15:42", "2015-09-21 09:22", "2015-09-17 09:00", "2015-09-21 14:00"),
endtime = c("2015-09-18 17:55", "2015-09-20 11:00", "2015-09-21 12:00",  "2015-09-18 13:00",  "2015-09-21 14:22", "2015-09-21 12:00", "2015-09-21 16:00"))
    tmpdf
      licensePlate        starttime          endtime
    1       Y80901 2015-09-18 09:55 2015-09-18 17:55
    2       Y80901 2015-09-18 23:00 2015-09-20 11:00
    3       Y80901 2015-09-20 15:00 2015-09-21 12:00
    4      AMG-999 2015-09-17 15:42 2015-09-18 13:00
    5      AMG-999 2015-09-21 09:22 2015-09-21 14:22
    6        W3188 2015-09-17 09:00 2015-09-21 12:00
    7        W3188 2015-09-21 14:00 2015-09-21 16:00

我想计算每个licensePlate每天使用的最后n天(例如,从9月17日到9月21日的最后5天),我的预期结果如下:

Period            LicensePlate        Used Time   

1 2015-09-17         Y80901              0
2 2015-09-17         AMG-999             8.3     
3 2015-09-17         W3188               15
4 2015-09-18         Y80901              9
5 2015-09-18         AMG-999             13
6 2015-09-18         W3188               24
7 2015-09-19         Y80901              24
8 2015-09-19         AMG-999             0
9 2015-09-19         W3188               24
10 2015-09-20        Y80901              20
11 2015-09-20        AMG-999             0
12 2015-09-20        W3188               24
13 2015-09-21        Y80901              12
14 2015-09-21        AMG-999             5
15 2015-09-21        W3188               14

我认为dplyr / data.table和lubridate可用于获取我的结果,我可能需要以天为单位测量时间段,但我不知道如何在开始/结束时间间隔内开始/结束时切入行 .

4 回答

  • 3

    这是让你入门的东西 . 这几乎是您想要的输出,因为它没有显示每个时期缺少的 licensePlate .

    第一步是将您的日期转换为有效的 POSIXct 类,然后将数据扩展到每分钟级别(可能是此解决方案中成本最高的部分),并将 licensePlatePeriod 汇总,同时总结结果(我是这里没有使用 as.Date 因为它处理的值非常糟糕,值在00和凌晨1点之间 .

    library(data.table)
    setDT(tmpdf)[, `:=`(starttime = as.POSIXct(starttime), endtime = as.POSIXct(endtime))]
    res <- tmpdf[, .(licensePlate, Period = seq(starttime, endtime, by = "1 min")), by = 1:nrow(tmpdf)]
    res[, .(Used_Time = round(.N/60L, 1L)), keyby = .(Period = substr(Period, 1L, 10L), licensePlate)]
    #         Period licensePlate Used_Time
    #  1: 2015-09-17      AMG-999       8.3
    #  2: 2015-09-17        W3188      15.0
    #  3: 2015-09-18      AMG-999      13.0
    #  4: 2015-09-18        W3188      24.0
    #  5: 2015-09-18       Y80901       9.0
    #  6: 2015-09-19        W3188      24.0
    #  7: 2015-09-19       Y80901      24.0
    #  8: 2015-09-20        W3188      24.0
    #  9: 2015-09-20       Y80901      20.0
    # 10: 2015-09-21      AMG-999       5.0
    # 11: 2015-09-21        W3188      14.0
    # 12: 2015-09-21       Y80901      12.0
    
  • 0

    深吸一口气 . 这是我的解决方案

    初始化数据

    tmpdf <- data.frame(licensePlate = c("Y80901", "Y80901", "Y80901", "AMG-999", "AMG-999", "W3188", "W3188"),  
                    starttime= c("2015-09-18 09:55", "2015-09-18 23:00", "2015-09-20 15:00", "2015-09-17 15:42", "2015-09-21 09:22", "2015-09-17 09:00", "2015-09-21 14:00"),
                    endtime = c("2015-09-18 17:55", "2015-09-20 11:00", "2015-09-21 12:00",  "2015-09-18 13:00",  "2015-09-21 14:22", "2015-09-21 12:00", "2015-09-21 16:00"))
    
    'converting to POSIXct for better date/time handling'
        tmpdf$starttime <- as.POSIXct(tmpdf$starttime, tz = "GMT")
        tmpdf$endtime <- as.POSIXct(tmpdf$endtime, tz = "GMT")
    

    数据准备

    要执行所需的操作,必须将完整的使用数据转换为每日使用数据 . 所以我编写了以下函数来将数据准备为所需的格式 .

    #splits single usage data into two  
    splitToTwo <- function(list){
    newList <- NULL
    
    for ( i in 1:nrow(list)){
    
     tmp <- list[i,]
    
     # set the end time of the first split as 23:59:59
     list[i,]$endtime <- as.Date(list[i,]$starttime) + hours(23) + minutes(59) + seconds(59)
    
     # set the start time of the second split as 00:00:01
     tmp$starttime <- list[i,]$endtime + seconds(2)
    
     # add the new df to the list  
     tmp <-  rbind(tmp,list[i,])
     newList <- rbind(newList,tmp)
     }
     return(newList)
    }
    
    
    #recursive function. Split the usage data into two till there are completely normalised to daily usage data
    setDailyUsage <- function(tmpdf){
    
      # create a exclusive subset where the usage spawns more than a day   
      list <- tmpdf[as.Date(tmpdf$endtime) - as.Date(tmpdf$starttime) > 0,   ]
    
      # replace tmpdf with usage that started and ended the same day   
      tmpdf <- tmpdf[ as.Date(tmpdf$endtime) - as.Date(tmpdf$starttime) == 0,]  
    
      # call to our split function to split the dataset with usage spawning more than one day  
      split <- splitToTwo(list)
    
      # add the now split data to our exclusive
      tmpdf <- rbind(tmpdf,split)
    
      if (nrow(tmpdf[as.Date(tmpdf$endtime) - as.Date(tmpdf$starttime) > 0,   ])>0){
          tmpdf <- setDailyUsage(tmpdf)
      }
    
    return(tmpdf)
    
    }
    

    准备好的数据

    我们准备的数据

    preparedData <- setDailyUsage(tmpdf)
        licensePlate           starttime             endtime
    1         Y80901 2015-09-18 09:55:00 2015-09-18 17:55:00
    5        AMG-999 2015-09-21 09:22:00 2015-09-21 14:22:00
    7          W3188 2015-09-21 14:00:00 2015-09-21 16:00:00
    21        Y80901 2015-09-18 23:00:00 2015-09-18 23:59:59
    3         Y80901 2015-09-21 00:00:01 2015-09-21 12:00:00
    31        Y80901 2015-09-20 15:00:00 2015-09-20 23:59:59
    4        AMG-999 2015-09-18 00:00:01 2015-09-18 13:00:00
    41       AMG-999 2015-09-17 15:42:00 2015-09-17 23:59:59
    61         W3188 2015-09-17 09:00:00 2015-09-17 23:59:59
    2         Y80901 2015-09-20 00:00:01 2015-09-20 11:00:00
    211       Y80901 2015-09-19 00:00:01 2015-09-19 23:59:59
    611        W3188 2015-09-18 00:00:01 2015-09-18 23:59:59
    612        W3188 2015-09-19 00:00:01 2015-09-19 23:59:59
    6          W3188 2015-09-21 00:00:01 2015-09-21 12:00:00
    613        W3188 2015-09-20 00:00:01 2015-09-20 23:59:59
    

    数据操作

    现在我们创建一个新的DF,它代表所需格式的数据 . 这将最初在UsedTime列中具有空值 .

    preparedData$duration <- preparedData$endtime - preparedData$starttime
    noOfUniquePlates <- length(unique(preparedData$licensePlate))
    Period <- rep(seq(from=(min(as.Date(preparedData$starttime))),to=(max(as.Date(preparedData$starttime))), by="day"),noOfUniquePlates)
    noOfUniqueDays <- length(unique(Period))
    LicensePlate <- rep(unique(preparedData$licensePlate),each=noOfUniqueDays)
    UsedTime <- 0
    
    newDF <- data.frame(Period,LicensePlate,UsedTime)
    

    现在,newDF的每一行都有一个简单的mapply函数,在preparedData df中搜索正确的用法数据 .

    findUsage <- function(p,l){
      sum(preparedData[as.Date(preparedData$starttime) == p & as.Date(preparedData$endtime) == p & preparedData$licensePlate == l ,  ]$duration)
    }
    newDF$UsedTime <- mapply( findUsage, newDF$Period, newDF$LicensePlate)
    newDF$UsedTime <- newDF$UsedTime/60
    
        > newDF[with(newDF,order(Period)),]
           Period LicensePlate  UsedTime
    1  2015-09-17       Y80901  0.000000
    6  2015-09-17      AMG-999  8.299722
    11 2015-09-17        W3188 14.999722
    2  2015-09-18       Y80901  8.999722
    7  2015-09-18      AMG-999 12.999722
    12 2015-09-18        W3188 23.999444
    3  2015-09-19       Y80901 23.999444
    8  2015-09-19      AMG-999  0.000000
    13 2015-09-19        W3188 23.999444
    4  2015-09-20       Y80901 19.999444
    9  2015-09-20      AMG-999  0.000000
    14 2015-09-20        W3188 23.999444
    5  2015-09-21       Y80901 11.999722
    10 2015-09-21      AMG-999  5.000000
    15 2015-09-21        W3188 13.999722
    

    我不得不限制解释以保持答案简短 . 如果您在评论中需要任何澄清,请与我们联系 .

  • 0

    你是对的,plyr可以用来解决这个问题 . 一种可能的方案:

    tmpdf$starttime <- as.POSIXct(tmpdf$starttime) #convert date/time columns to date/time values in R
    tmpdf$endtime <- as.POSIXct(tmpdf$endtime) #convert date/time columns to date/time values in R
    newdf <- ddply(tmpdf,.(as.Date(starttime),licensePlate),function(df){
    df$diffdays <- as.double(difftime(df$endtime,df$starttime,units='days'))
    df
    })
    #If you want to only have the Period, LicensePlate, and Used Time columns remaining:
    newdf <- subset(newdf,select=c(1,2,5))
    colnames(newdf) <- c('Period','LicensePlate','UsedTime')
    

    希望能帮助到你!

  • 1

    试试这个 - 它有帮助吗?

    `tmpdf <- data.frame(licensePlate = c("Y80901", "Y80901", "Y80901", "AMG-999", "AMG-999", "W3188", "W3188"),  
                        starttime= c("2015-09-18 09:55", "2015-09-18 23:00", "2015-09-20 15:00", "2015-09-17 15:42", "2015-09-21 09:22", "2015-09-17 09:00", "2015-09-21 14:00"),
                        endtime = c("2015-09-18 17:55", "2015-09-20 11:00", "2015-09-21 12:00",  "2015-09-18 13:00",  "2015-09-21 14:22", "2015-09-21 12:00", "2015-09-21 16:00"))
    
    tmpdf
    str(tmpdf)
    library(lubridate)
    tmpdf$starttime=ymd_hm(paste(tmpdf$starttime))
    tmpdf$endtime=ymd_hm(paste(tmpdf$endtime))
    tmpdf$Period=day(tmpdf$starttime)
    tmpdf$diff=difftime(tmpdf$endtime,tmpdf$starttime)
    tmpdf`
    

    enter image description here

相关问题