首页 文章

从CSV文件中提取JSON数据

提问于
浏览
0

我'm trying to extract a JSON data which is a column in a CSV file. So far I'已经到了使用 jsonlite 包将一个因子转换为json文件的地步 .

[1] {"id":509746197991998767,"visibility":{"percentage":100,"time":149797,"visible1":true,"visible2":false,"visible3":false,"activetab":true},"interaction":{"mouseovercount":1,"mouseovertime":1426,"videoplaytime":0,"engagementtime":0,"expandtime":0,"exposuretime":35192}}

另一种方法是在导入时使用 stringsAsFactors = F ,但我正在努力使格式正确,其中每个条目如下所示:

[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"

我错过了一些明显的东西吗?我只是想要提取位于CSV文件中的 JSON 文件 .

下面是一个CSV文件的小例子:

"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"

问候,

弗雷德里克 .

3 回答

  • 0
    df <- readr::read_csv('"","CookieID","UnloadVars"
    "1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
    "2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
    "3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
    "4",292660729552227520,""
    "5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"',
    col_types = "-cc")
    

    在每个单独的值上使用 jsonlite::fromJSON ,然后 tidyr::unnest

    library(dplyr)
    
    f <- function(.x) 
      if (is.na(.x) || .x == "") data.frame()[1, ] else 
        as.data.frame(jsonlite::fromJSON(.x))
    
    df %>% 
      tidyr::unnest(UnloadVars = lapply(UnloadVars, f)) %>% 
      mutate_at(vars(ends_with("id")), as.character)
    
    # A tibble: 6 x 16
    #               CookieID                  id visibility.percentage visibility.time visibility.visible1 visibility.visible2 visibility.visible3 visibility.activetab interaction.mouseovercount interaction.mouseovertime interaction.videoplaytime interaction.engagementtime interaction.expandtime interaction.exposuretime clicks.x clicks.y
    #                  <chr>               <chr>                 <int>           <int>               <lgl>               <lgl>               <lgl>                <lgl>                      <int>                     <int>                     <int>                      <int>                  <int>                    <int>    <int>    <int>
    # 1 -8857188784608690176  509746197991998784                   100          149797                TRUE               FALSE               FALSE                 TRUE                          1                      1426                         0                          0                      0                    35192       NA       NA
    # 2 -1695626857458244096 2917654329769114112                    46               0               FALSE               FALSE               FALSE                 TRUE                         NA                        NA                        NA                         NA                     NA                       NA       NA       NA
    # 3   437299165071669184 2252707957388071936                    99           10168                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                      542      105      449
    # 4   292660729552227520                <NA>                    NA              NA                  NA                  NA                  NA                   NA                         NA                        NA                        NA                         NA                     NA                       NA       NA       NA
    # 5  7036383942916227072 2299674593327687168                    76            1145                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                       74      197      135
    # 6  7036383942916227072 2299674593327687168                    76            1145                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                       74      197      135
    
  • 1

    我使用readr :: read_csv读取您的示例数据集 .

    > df <- readr::read_csv('~/sample.csv')
    Parsed with column specification:
    cols(
      CookieID = col_double(),
      UnloadVars = col_character()
    )
    

    正如您所看到的,UnloadVars是作为字符而不是因素读入的 . 如果我现在检查UnloadVars列中的第一个值,我会看到以下内容与您获得的内容相匹配,

    > df$UnloadVars[1]
    [1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"
    

    现在,我使用jsonlite :: fromJSON,

    > j <- jsonlite::fromJSON(df$UnloadVars[1])
    > j
    $id
    [1] 5.097462e+17
    
    $visibility
    $visibility$percentage
    [1] 100
    
    $visibility$time
    [1] 149797
    
    $visibility$visible1
    [1] TRUE
    
    $visibility$visible2
    [1] FALSE
    
    $visibility$visible3
    [1] FALSE
    
    $visibility$activetab
    [1] TRUE
    
    
    $interaction
    $interaction$mouseovercount
    [1] 1
    
    $interaction$mouseovertime
    [1] 1426
    
    $interaction$videoplaytime
    [1] 0
    
    $interaction$engagementtime
    [1] 0
    
    $interaction$expandtime
    [1] 0
    
    $interaction$exposuretime
    [1] 35192
    

    我相信你需要的是因为JSON被解析为R中的列表 .

  • 0

    处理JSON数据可能非常棘手 . 作为一般指导,您应始终努力将数据放在数据框中 . 然而,这并非总是可行的 . 在特定情况下,我没有看到在格式良好的数据框中同时具有 visibilityinteraction 值的方法 .

    接下来我要做的是将 interaction 中的信息提取到数据框中 .

    加载所需的包并读取数据

    library(purrr)
    library(dplyr)
    library(tidyr)
    df <- read.csv("sample.csv", stringsAsFactors = FALSE)
    

    然后删除无效的JSON

    # remove rows without JSON (in this case, the 4th row)
    df <- df %>% 
      dplyr::filter(UnloadVars != "")
    

    将每个JSON转换为一个列表并将它们放入 UnloadVars 列 . 如果您不知道,可以在数据框中包含列表列 . 这非常有用 .

    out <- data_frame(CookieID = numeric(), 
                      UnloadVars = list())
    
    for (row in 1:nrow(df)) {
      new_row <- data_frame(CookieID = df[row, ]$CookieID,
                            UnloadVars = list(jsonlite::fromJSON(df[row, ]$UnloadVars)))
    
      out <- bind_rows(out, new_row)
    }
    
    out
    

    我们现在可以从 Unload Vars 中的列表中提取ID . 这很简单,因为每个列表只有一个ID .

    out <- out %>% 
      mutate(id = map_chr(UnloadVars, ~ .$id))
    

    最后一部分看起来有点令人生畏 . 但我在这里做的是从 UnloadVars 列获取交互部分并将其放入 interaction 列 . 然后,我将 interaction (列表)中的每一行转换为具有两列的数据框: keyvalue . key 包含交互度量标准的名称, value 包含其值 . 我终于把它取消了,所以我们摆脱列表列,最后得到一个格式很好的数据框 .

    unpack_list <- function(obj, key_name) {
      as.data.frame(obj) %>% 
        gather(key) %>% 
        return()
    }
    
    df_interaction <- out %>% 
        mutate(interaction = map(UnloadVars, ~ .$interaction)) %>% 
        mutate(interaction = map(interaction, ~ unpack_list(.x, key))) %>% 
        unnest(interaction)
    
    df_interaction
    

    解决方案不是很优雅,但可以完成工作 . 您可以应用相同的逻辑从可见性中提取信息 .

相关问题