首页 文章

使用XML包将HTML表格抓取到R数据框中

提问于
浏览
139

如何使用XML包抓取HTML表格?

以这个关于 Brazilian soccer team 的维基百科页面为例。我想在R中读取它,并将"list of all matches Brazil have played against FIFA recognised teams"表格作为data.frame获取。我怎样才能做到这一点?

4 回答

  • 129
    library(RCurl)
    library(XML)
    
    # Scrape the "wikitable sortable" table of the Wikipedia page into a data
    # frame by walking the parsed HTML tree by hand.

    # Download page using RCurl
    # You may need to set proxy details, etc., in the call to getURL
    theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
    webpage <- getURL(theurl)
    # Process escape characters
    webpage <- readLines(tc <- textConnection(webpage)); close(tc)

    # Parse the html tree, ignoring errors on the page
    pagetree <- htmlTreeParse(webpage, error = function(...) {})

    # Navigate your way through the tree. It may be possible to do this more
    # efficiently using getNodeSet.
    # NOTE(review): the positional indices below are tied to the layout of the
    # page at the time of writing -- confirm them against the current markup.
    body <- pagetree$children$html$children$body
    divbodyContent <- body$children$div$children[[1]]$children$div$children[[4]]
    tables <- divbodyContent$children[names(divbodyContent) == "table"]

    # In this case, the required table is the only one with class
    # "wikitable sortable"
    tableclasses <- sapply(tables, function(x) x$attributes["class"])
    thetable <- tables[which(tableclasses == "wikitable sortable")]$table

    # Get columns headers (the cells of the table's first child)
    headers <- thetable$children[[1]]$children
    columnnames <- unname(sapply(headers, function(x) x$children$text$value))

    # Get rows from table: every child after the first one is treated as a
    # data row. The rows are built first and bound once, instead of growing
    # the matrix with rbind() inside a loop. Dropping the first child with
    # [-1] also avoids 2:length(), which counts down (2, 1) when the table
    # holds nothing but the header row. unname() keeps the child names from
    # turning into row names.
    datarows <- unname(thetable$children[-1])
    content <- do.call(rbind, lapply(datarows, function(node) {
      tablerow <- node$children
      # The opponent name is read from the second child of the first cell
      opponent <- tablerow[[1]]$children[[2]]$children$text$value
      others <- unname(sapply(tablerow[-1], function(x) x$children$text$value))
      c(opponent, others)
    }))
    # A header-only table yields NULL above; fall back to an empty matrix so
    # the column names can still be attached.
    if (is.null(content)) {
      content <- matrix(character(0), nrow = 0, ncol = length(columnnames))
    }

    # Convert to data frame
    colnames(content) <- columnnames
    as.data.frame(content)
    

    Edited to add:

    样本输出

    Opponent Played Won Drawn Lost Goals for Goals against  % Won
        1               Argentina     94  36    24   34       148           150  38.3%
        2                Paraguay     72  44    17   11       160            61  61.1%
        3                 Uruguay     72  33    19   20       127            93  45.8%
        ...
    
  • 48

    使用Xpath的另一种选择 .

    library(RCurl)
    library(XML)
    
    # Scrape the "wikitable sortable" table of the Wikipedia page into a data
    # frame using XPath queries on the parsed document.
    theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
    webpage <- getURL(theurl)
    webpage <- readLines(tc <- textConnection(webpage)); close(tc)

    # Parse into internal nodes (required for XPath), ignoring page errors
    pagetree <- htmlTreeParse(
      webpage,
      error = function(...) {},
      useInternalNodes = TRUE
    )

    # Extract table header and contents; both queries share the same row path
    rowpath <- "//*/table[@class='wikitable sortable']/tr"
    tablehead <- xpathSApply(pagetree, paste0(rowpath, "/th"), xmlValue)
    results <- xpathSApply(pagetree, paste0(rowpath, "/td"), xmlValue)

    # Convert character vector to dataframe: the cells arrive row by row, and
    # the column count follows the number of header cells instead of being
    # hard-coded.
    content <- as.data.frame(
      matrix(results, ncol = length(tablehead), byrow = TRUE)
    )

    # Clean up the results: trim only leading/trailing whitespace and
    # non-breaking spaces. Removing every space would also collapse
    # multi-word headers such as "Goals for".
    strip_edges <- function(x) {
      gsub("^[[:space:]\u00a0]+|[[:space:]\u00a0]+$", "", x)
    }
    content[, 1] <- strip_edges(content[, 1])
    tablehead <- strip_edges(tablehead)
    names(content) <- tablehead
    

    产生这个结果

    > head(content)
       Opponent Played Won Drawn Lost Goals for Goals against % Won
    1 Argentina     94  36    24   34       148           150 38.3%
    2  Paraguay     72  44    17   11       160            61 61.1%
    3   Uruguay     72  33    19   20       127            93 45.8%
    4     Chile     64  45    12    7       147            53 70.3%
    5      Peru     39  27     9    3        83            27 69.2%
    6    Mexico     36  21     6    9        69            34 58.3%
    
  • 26

    rvest 与 xml2 搭配使用,是另一个用于解析HTML网页的流行软件包。

    library(rvest)
    # Read the page and collect every node matched by the "table" selector
    theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
    page <- read_html(theurl)
    tables <- html_nodes(page, "table")
    # [[4]] extracts the fourth table node itself, so html_table() returns a
    # single data frame rather than a one-element list
    table1 <- html_table(tables[[4]], fill = TRUE)
    

    它的语法比 XML 包更容易使用,并且对于大多数网页,该软件包都提供了所需的全部选项。

  • 18

    ......或者更短的尝试:

    library(XML)
    library(RCurl)
    library(rlist)
    # Fetch the page source. Note that ssl.verifypeer = FALSE switches off
    # certificate verification for this request.
    curl_opts <- list(ssl.verifypeer = FALSE)
    theurl <- getURL(
      "https://en.wikipedia.org/wiki/Brazil_national_football_team",
      .opts = curl_opts
    )
    # Parse every table found in the page source, then drop the entries that
    # came back as NULL
    tables <- list.clean(readHTMLTable(theurl), fun = is.null, recursive = FALSE)
    # Row count of each remaining table
    n.rows <- unlist(lapply(tables, nrow))
    

    挑选的表格是页面上最长的表格

    # Select the table whose row count in n.rows is the largest
    tables[[which.max(n.rows)]]
    

相关问题