首页 文章

Web-Scraping - 使用R抓取多个页面

提问于
浏览
1

我需要使用R从Web中抓取HTML表格 . 每页有一个1000行的表,总共有316页 . 第一页的网址链接在这里:"http://sumodb.sumogames.de/Query.aspx?show_form=0&columns=6&rowcount=5&showheya=on&showshusshin=on&showbirthdate=on&showhatsu=on&showintai=on&showheight=on&showweight=on&showhighest=on"

我认为其他页面的网址只是偏移量依次递增(1000,2000,3000……,316000)

到目前为止,这是我的代码,用于一页:

# Load the packages used for HTML parsing and table extraction
library(XML)
library(rvest)

# Download and parse the first results page of the query
page <- read_html("http://sumodb.sumogames.de/Query.aspx?show_form=0&columns=6&rowcount=5&showheya=on&showshusshin=on&showbirthdate=on&showhatsu=on&showintai=on&showheight=on&showweight=on&showhighest=on")

# Pull every node with CSS class ".record" and convert it to a data frame;
# fill = TRUE pads ragged rows so html_table() does not error
records <- html_table(html_nodes(page, ".record"), fill = TRUE)
records

每一页上大表对应的CSS选择器都是".record"

最终目标是将整个表格放在一个CSV文件中 .

1 回答

  • 1

    以下代码应该实现您的目标,但要注意它将花费很长时间,因为基于Web的查询需要为每个页面进行一些密集加载 .

    代码使用下一个,上一个和最后一个按钮循环浏览页面 . 需要注意的是前两页和最后两页有不同的CSS选择器,因此需要手动完成 .

    完成后,.txt文件需要整理 .

    library(XML)
    library(rvest)

    # Prefix prepended to the relative hrefs found in the navigation links
    urlPrefix <- "http://sumodb.sumogames.de/"

    # Extract the results table (CSS class ".record") from a parsed page.
    # The first row of the scraped table holds the real column names, so it
    # is promoted to names() and then dropped from the data.
    extract_record_table <- function(page) {
      tbl <- page %>%
        html_nodes(".record") %>%
        html_table(fill = TRUE)
      tbl <- tbl[[1]]
      names(tbl) <- tbl[1, ]
      tbl[-1, ]
    }

    # Follow the navigation link matched by `selector` on `page` and return
    # the next page, already parsed with read_html().
    follow_link <- function(page, selector) {
      href <- page %>%
        html_nodes(selector) %>%
        html_attr("href")
      read_html(paste0(urlPrefix, href))
    }

    # Write a page's table to table.txt. The very first page writes the
    # header row (col.names = TRUE, append = FALSE); every later page is
    # appended without a header, matching a single combined table.
    append_table <- function(tbl, first = FALSE) {
      write.table(tbl, 'table.txt', row.names = FALSE,
                  col.names = first, append = !first)
    }

    # Starting page URL
    url <- read_html("http://sumodb.sumogames.de/Query.aspx?show_form=0&columns=6&rowcount=5&showheya=on&showshusshin=on&showbirthdate=on&showhatsu=on&showintai=on&showheight=on&showweight=on&showhighest=on")

    # The first/last two pages expose their navigation links under different
    # CSS selectors than the middle pages, so they are handled explicitly.
    # Last page, then second-to-last page (reached via the last page's
    # "previous" link, which shares the 'div+ div a+ a' selector).
    lastURL <- follow_link(url, 'div+ div a+ a')
    penultimateURL <- follow_link(lastURL, 'div+ div a+ a')

    # First page: create table.txt with the header row
    append_table(extract_record_table(url), first = TRUE)

    # Second page: on page 1 the "next" link is the first nav anchor
    nextURL <- follow_link(url, 'div+ div a:nth-child(1)')
    append_table(extract_record_table(nextURL))

    # Third page: on page 2 the "next" link is the second nav anchor
    nextURL <- follow_link(nextURL, 'div+ div a:nth-child(2)')

    # Pages 3 to N-2: the "next" link is the third nav anchor.
    # NOTE(review): comparing full page text is slow on 1000-row pages;
    # comparing the "next" href against the penultimate page's URL would
    # likely be cheaper — confirm before changing the stop condition.
    while (html_text(nextURL) != html_text(penultimateURL)) {
      append_table(extract_record_table(nextURL))
      nextURL <- follow_link(nextURL, 'div+ div a:nth-child(3)')
    }

    # Second-to-last and last pages, fetched earlier
    append_table(extract_record_table(penultimateURL))
    append_table(extract_record_table(lastURL))

    # Sanity check: number of rows in the combined file
    nrow(read.table('table.txt'))
    

    如果您希望代码更快地运行以进行测试,请尝试从第五页到最后一页或类似的东西开始,只需要知道必须为第一页和第二页更改CSS选择器 .

    我希望这有帮助 :)

相关问题