我开始使用 sparklyr 处理大规模数据,因此只能通过管道(`%>%`)来操作数据。


> csj %>% head()

# Source:   lazy query [?? x 8]
# Database: spark_connection
  `_c0` reviewerID     asin  helpful length_of_review overall unixReviewTime category             
  <int> <chr>          <chr> <chr>              <dbl> <chr>   <chr>          <chr>                
1     0 A1KLRMWW2FWPL4 31887 [0, 0]               172 5       1297468800     Clothes_shoes_jewelry
2     1 A2G5TCU2WDFZ65 31887 [0, 0]               306 5       1358553600     Clothes_shoes_jewelry
3     2 A1RLQXYNCMWRWN 31887 [0, 0]               312 5       1357257600     Clothes_shoes_jewelry
4     3 A8U3FAMSJVHS5  31887 [0, 0]               405 5       1398556800     Clothes_shoes_jewelry
5     4 A3GEOILWLK86XM 31887 [0, 0]               453 5       1394841600     Clothes_shoes_jewelry
6     5 A27UF1MSF3DB2  31887 [0, 0]               375 4       1396224000     Clothes_shoes_jewelry

我想创建一个名为 help 的新列,其值为 help = helpful 中的第一个数字 / helpful 中的第二个数字。我之前在这里就这段代码提问过:

csj %>%
    +   mutate(col1 = as.numeric(stringi::stri_extract_first_regex(csj$helpful, pattern = "[0-9]")),#extract first number
    +          col2 = as.numeric(stringi::stri_extract_last_regex(csj$helpful, pattern = "[0-9]")),#extract second
    +          col3 = ifelse(col2 == 0, 1, col2 ),#change 0s to 1
    +          help = col1/col3) #divide col1 and 3

但 col1、col2、col3 和 help 这几列全都变成了 NaN。

# Source:   lazy query [?? x 12]
        # Database: spark_connection
           `_c0` reviewerID     asin  helpful length_of_review overall unixReviewTime category    col1  col2  col3  help
           <int> <chr>          <chr> <chr>              <dbl> <chr>   <chr>          <chr>      <dbl> <dbl> <dbl> <dbl>
         1     0 A1KLRMWW2FWPL4 31887 [0, 0]               172 5       1297468800     Clothes_s~   NaN   NaN   NaN   NaN
         2     1 A2G5TCU2WDFZ65 31887 [0, 0]               306 5       1358553600     Clothes_s~   NaN   NaN   NaN   NaN
         3     2 A1RLQXYNCMWRWN 31887 [0, 0]               312 5       1357257600     Clothes_s~   NaN   NaN   NaN   NaN
         4     3 A8U3FAMSJVHS5  31887 [0, 0]               405 5       1398556800     Clothes_s~   NaN   NaN   NaN   NaN
         5     4 A3GEOILWLK86XM 31887 [0, 0]               453 5       1394841600     Clothes_s~   NaN   NaN   NaN   NaN
         6     5 A27UF1MSF3DB2  31887 [0, 0]               375 4       1396224000     Clothes_s~   NaN   NaN   NaN   NaN
         7     6 A16GFPNVF4Y816 31887 [0, 0]               334 5       1399075200     Clothes_s~   NaN   NaN   NaN   NaN
         8     7 A2M2APVYIB2U6K 31887 [0, 0]               158 5       1356220800     Clothes_s~   NaN   NaN   NaN   NaN
         9     8 A1NJ71X3YPQNQ9 31887 [0, 0]                96 4       1384041600     Clothes_s~   NaN   NaN   NaN   NaN
        10     9 A3EERSWHAI6SO  31887 [7, 8]               532 5       1349568000     Clothes_s~   NaN   NaN   NaN   NaN
        # ... with more rows

所以,我尝试了 stringi::stri_extract_first_regex(helpful, pattern = "[0-9]")) 而不是

stringi::stri_extract_first_regex(csj$helpful, pattern = "[0-9]"))


Error in stringi::stri_extract_first_regex(helpful, pattern = "[0-9]") : 
  object 'helpful' not found

另外,根据 Thomas K 的评论,我尝试了去掉 as.numeric 的写法,如下所示。

csj %>%
  mutate(col1 = stringi::stri_extract_first_regex(csj$helpful, pattern = "[0-9]"),#extract first number
         col2 = stringi::stri_extract_last_regex(csj$helpful, pattern = "[0-9]"),#extract second
         col3 = ifelse(col2 == 0, 1, col2 ),#change 0s to 1
         help = col1/col3) #divide row1 and 3

然后 col1 和 col2 的每一行得到的都是空字符串 ""。

# Source:   lazy query [?? x 12]
# Database: spark_connection
   `_c0` reviewerID     asin  helpful length_of_review overall unixReviewTime category   col1  col2  col3   help
   <int> <chr>          <chr> <chr>              <dbl> <chr>   <chr>          <chr>      <chr> <chr> <chr> <dbl>
 1     0 A1KLRMWW2FWPL4 31887 [0, 0]               172 5       1297468800     Clothes_s~ ""    ""    NA      NaN
 2     1 A2G5TCU2WDFZ65 31887 [0, 0]               306 5       1358553600     Clothes_s~ ""    ""    NA      NaN
 3     2 A1RLQXYNCMWRWN 31887 [0, 0]               312 5       1357257600     Clothes_s~ ""    ""    NA      NaN
 4     3 A8U3FAMSJVHS5  31887 [0, 0]               405 5       1398556800     Clothes_s~ ""    ""    NA      NaN
 5     4 A3GEOILWLK86XM 31887 [0, 0]               453 5       1394841600     Clothes_s~ ""    ""    NA      NaN
 6     5 A27UF1MSF3DB2  31887 [0, 0]               375 4       1396224000     Clothes_s~ ""    ""    NA      NaN
 7     6 A16GFPNVF4Y816 31887 [0, 0]               334 5       1399075200     Clothes_s~ ""    ""    NA      NaN
 8     7 A2M2APVYIB2U6K 31887 [0, 0]               158 5       1356220800     Clothes_s~ ""    ""    NA      NaN
 9     8 A1NJ71X3YPQNQ9 31887 [0, 0]                96 4       1384041600     Clothes_s~ ""    ""    NA      NaN
10     9 A3EERSWHAI6SO  31887 [7, 8]               532 5       1349568000     Clothes_s~ ""    ""    NA      NaN
# ... with more rows

我在这个问题上卡了很长时间。如果有人知道问题出在哪里,我将非常高兴。

如果能解决这些问题,我会很高兴。非常感谢任何帮助!


> str(csj)
List of 2
 $ src:List of 1
  ..$ con:List of 10
  .. ..$ master       : chr "local[4]"
  .. ..$ method       : chr "shell"
  .. ..$ app_name     : chr "sparklyr"
  .. ..$ config       :List of 4
  .. .. ..$ spark.env.SPARK_LOCAL_IP.local    : chr ""
  .. .. ..$ sparklyr.csv.embedded             : chr "^1.*"
  .. .. ..$ sparklyr.cores.local              : int 4
  .. .. ..$ spark.sql.shuffle.partitions.local: int 4
  .. .. ..- attr(*, "config")= chr "default"
  .. .. ..- attr(*, "file")= chr "C:\\Users\\ms\\Documents\\R\\win-library\\3.5\\sparklyr\\conf\\config-template.yml"
  .. ..$ spark_home   : chr "C:\\spark"
  .. ..$ backend      : 'sockconn' int 4
  .. .. ..- attr(*, "conn_id")=<externalptr> 
  .. ..$ monitor      : 'sockconn' int 3
  .. .. ..- attr(*, "conn_id")=<externalptr> 
  .. ..$ output_file  : chr "C:\\Users\\ms\\AppData\\Local\\Temp\\RtmpygTIca\\file371068ce6a02_spark.log"
  .. ..$ spark_context:Classes 'spark_jobj', 'shell_jobj' <environment: 0x00000000daa77a50> 
  .. ..$ java_context :Classes 'spark_jobj', 'shell_jobj' <environment: 0x00000000daa365b8> 
  .. ..- attr(*, "class")= chr [1:3] "spark_connection" "spark_shell_connection" "DBIConnection"
  ..- attr(*, "class")= chr [1:3] "src_spark" "src_sql" "src"
 $ ops:List of 4
  ..$ name: chr "select"
  ..$ x   :List of 4
  .. ..$ name: chr "mutate"
  .. ..$ x   :List of 2
  .. .. ..$ x   : 'ident' chr "review_csj"
  .. .. ..$ vars: chr [1:7] "_c0" "reviewerID" "asin" "helpful" ...
  .. .. ..- attr(*, "class")= chr [1:3] "op_base_remote" "op_base" "op"
  .. ..$ dots:List of 2
  .. .. ..$ length_of_review: language ~as.numeric(nchar(reviewText))
  .. .. .. ..- attr(*, ".Environment")=<environment: 0x00000000d91366c0> 
  .. .. ..$ category        : language ~"Clothes_shoes_jewelry"
  .. .. .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
  .. ..$ args: list()
  .. ..- attr(*, "class")= chr [1:3] "op_mutate" "op_single" "op"
  ..$ dots:List of 1
  .. ..$ : language ~c("_c0", reviewerID, asin, helpful, length_of_review, overall, unixReviewTime, category)
  .. .. ..- attr(*, ".Environment")=<environment: 0x00000000d904db20> 
  .. ..- attr(*, "class")= chr "quosures"
  ..$ args: list()
  ..- attr(*, "class")= chr [1:3] "op_select" "op_single" "op"
 - attr(*, "class")= chr [1:4] "tbl_spark" "tbl_sql" "tbl_lazy" "tbl"


Session info ----------------------------------------------------------------------------------------------------
 setting  value                       
 version  R version 3.5.0 (2018-04-23)
 system   x86_64, mingw32             
 ui       RStudio (1.1.453)           
 language (EN)                        
 collate  English_United States.1252  
 tz       Europe/Berlin               
 date     2018-05-21                  

