首页 文章

使用R抓取带有图像,文本和空白单元格的Wikipedia HTML表格

提问于
浏览
3

我感兴趣的表格是纽约市米其林星级餐厅的维基百科表格,奖励的星星数量由图片表示 .

Screenshot of wikipedia table

我能够使用两个步骤刮取表格(首先获取“Name”和“Borough”列中的单词,然后获取表格中的alt标签),但我想知道是否可以一步完成 . 我能够使用rvest包刮掉数据 .

由于XML :: readHTMLTable函数无法读取维基百科页面,我尝试了htmltab包没有运气,因为我无法找出bodyFun参数所需的函数 . 说实话,我是网络抓取的新手......和功能 .

我提到的问题可供参考:

Scraping html table with images using XML R package

Scraping html tables into R data frames using the XML package

这是我的代码:

library(stringr)
library(rvest)
library(data.table)

url <- "http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City"

#Scrape the first two columns, restaurant name and borough
name.boro <- url %>% read_html() %>% html_nodes("table") %>% html_table(fill = TRUE)
name.boro <- as.data.table(name.boro[[1]])
name.boro[, 3:length(name.boro) := NULL]
135 * 13 #1,755 cells in first table

#scrape tables for img alt 
#note that because I used the "td" node, entries for all cells in all tables were pulled
stars <- url %>% read_html() %>% html_nodes("td") %>% html_node("img") %>% html_attr("alt")
stars 

#Make vector of numbers to index each column
df <- vector("list", 13)
for (i in 1:13){
  df[[i]] <- seq(i, 1755, 13)
}

#Put everything together
Mich.Guide <- name.boro 
Mich.Guide[, c("X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012", "X2013", "X2014", "X2015", 
               "X2016") := .(stars[unlist(df[3])], stars[unlist(df[4])], stars[unlist(df[5])], 
                             stars[unlist(df[6])], stars[unlist(df[7])], stars[unlist(df[8])], 
                             stars[unlist(df[9])], stars[unlist(df[10])], stars[unlist(df[11])], 
                             stars[unlist(df[12])], stars[unlist(df[13])] )]

谢谢!

2 回答

  • 1

    您可以尝试以下方法

    require(rvest)
    url <- "http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City"
    doc <- read_html(url)
    col_names <- doc %>% html_nodes("#mw-content-text > table > tr:nth-child(1) > th") %>% html_text()
    tbody <- doc %>% html_nodes("#mw-content-text > table > tr:not(:first-child)")
    
    extract_tr <- function(tr){
      scope <- tr %>% html_children()
      c(scope[1:2] %>% html_text(),
        scope[3:length(scope)] %>% html_node("img") %>% html_attr("alt"))
    }
    
    res <- tbody %>% sapply(extract_tr)
    res <- as.data.frame(t(res), stringsAsFactors = FALSE)
    colnames(res) <- col_names
    

    现在你有了原始表 . 我将列的解析保留为整数,并将列名称留给您

  • 3

    略有不同的方法:

    library(rvest)
    library(purrr)
    library(stringi)
    
    pg <- read_html("http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City")
    html_nodes(pg, xpath=".//table[contains(@class, 'wikitable')]/tr[not(th)]") %>% 
      map_df(function(x) {
        r_name <- html_text(html_nodes(x, xpath=".//td[1]"))
        borough <- html_text(html_nodes(x, xpath=".//td[2]"))
        map(3:13, function(y) {
          stars <- html_attr(html_nodes(x, xpath=sprintf(".//td[%d]/a", y)), "href")
          if (length(stars)==0) {
            NA
          } else {
            stri_match_first_regex(stars, "Michelin-([[:digit:]])")[,2] 
          }
        }) -> refs
        refs <- setNames(refs, c(2006:2016))
        as.data.frame(c(r_name=r_name, borough=borough, refs), stringsAsFactors=FALSE)
      }) -> michelin_nyc
    
    dplyr::glimpse(michelin_nyc)
    
    ## Observations: 135
    ## Variables: 13
    ## $ r_name  <chr> "Adour", "Ai Fiori", "Alain Ducasse at the...
    ## $ borough <chr> "Manhattan", "Manhattan", "Manhattan", "Ma...
    ## $ X2006   <chr> NA, NA, "3", NA, NA, NA, NA, "1", NA, NA, ...
    ## $ X2007   <chr> NA, NA, NA, NA, NA, NA, NA, "1", NA, NA, N...
    ## $ X2008   <chr> NA, NA, NA, NA, NA, NA, NA, "1", "1", NA, ...
    ## $ X2009   <chr> "2", NA, NA, NA, "1", "1", NA, "1", "1", N...
    ## $ X2010   <chr> "1", NA, NA, NA, NA, "2", NA, "1", "1", NA...
    ## $ X2011   <chr> "1", NA, NA, "1", NA, "2", NA, "1", "1", N...
    ## $ X2012   <chr> "1", "1", NA, "1", NA, NA, NA, "1", NA, NA...
    ## $ X2013   <chr> "1", "1", NA, "1", NA, NA, NA, "1", NA, "1...
    ## $ X2014   <chr> NA, "1", NA, "1", NA, NA, NA, "1", NA, "1"...
    ## $ X2015   <chr> NA, "1", NA, "1", NA, NA, "1", NA, NA, "2"...
    ## $ X2016   <chr> NA, "1", NA, "1", NA, NA, "1", NA, NA, "2"...
    

    这对于 XML 包也是完全可行的,如下所示:

    library(XML)
    library(RCurl)
    library(stringi)
    
    pg <- getURL("https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City")
    pg <- htmlParse(pg)
    rows <- getNodeSet(pg, "//table[contains(@class, 'wikitable')]/tr[not(th)]")
    do.call(rbind, lapply(rows, function(x) {
      r_name <- xpathSApply(x, ".//td[1]", xmlValue)
      borough <- xpathSApply(x, ".//td[2]", xmlValue)
      lapply(3:13, function(y) {
        stars <- xpathSApply(x, sprintf(".//td[%d]/a", y), xmlGetAttr, "href")
        if (length(stars)==0) {
          NA
        } else {
          stri_match_first_regex(stars, "Michelin-([[:digit:]])")[,2] 
        }
      }) -> refs
      refs <- setNames(refs, c(2006:2016))
      as.data.frame(c(r_name=r_name, borough=borough, refs), stringsAsFactors=FALSE)
    })) -> michelin_nyc
    
    str(michelin_nyc)
    
    ## 'data.frame': 135 obs. of  13 variables:
    ##  $ r_name : chr  "Adour" "Ai Fiori" "Alain Ducasse at the Essex House" "Aldea" ...
    ##  $ borough: chr  "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
    ##  $ X2006  : chr  NA NA "3" NA ...
    ##  $ X2007  : chr  NA NA NA NA ...
    ##  $ X2008  : chr  NA NA NA NA ...
    ##  $ X2009  : chr  "2" NA NA NA ...
    ##  $ X2010  : chr  "1" NA NA NA ...
    ##  $ X2011  : chr  "1" NA NA "1" ...
    ##  $ X2012  : chr  "1" "1" NA "1" ...
    ##  $ X2013  : chr  "1" "1" NA "1" ...
    ##  $ X2014  : chr  NA "1" NA "1" ...
    ##  $ X2015  : chr  NA "1" NA "1" ...
    ##  $ X2016  : chr  NA "1" NA "1" ...
    

相关问题