我正在尝试调试从包中删除的三个函数集 . 主要功能scrape.game.results应该从ESPN的网站上抓取NCAA篮球统计数据,然后将它们格式化为一个数据框,该数据框可以被包中的其他功能读取 . 但是,该功能是为了只能访问2002年到2017年的数据而构建的,因此我修改了它以允许它从2018年获得今年三月疯狂锦标赛的数据 .

n.scrpgmrslts <- function (year, sex = c("mens", "womens")) 
{
  sex = match.arg(sex)
  `%>%` = dplyr::`%>%`
  if (missing(year)) 
    stop("scrape.game.results: A year must be provided")
  if (!(class(year) %in% c("integer", "numeric"))) 
    stop("scrape.game.results: The year must be numeric")
  if (year < 2002 | year > 2018) 
    stop("The available seasons are 2002 to 2018")
  teams = scrape.teams(sex)
  results = data.frame(game.id = character(0), primary.id = character(0), 
                       primary.score = character(0), other.id = character(0), 
                       other.score = character(0), home = character(0), location = character(0), 
                   ot = character(0))
  for (team.id in teams$id) {
    results = rbind(results, scrape.team.game.results(year, 
                                                  team.id, sex))
  }
  results = results %>% dplyr::mutate(home = ifelse(location %in% 
                                                      c("H", "A"), 
ifelse(location == "H", TRUE, FALSE), ifelse(is.na(other.id) | 

primary.id < other.id, TRUE, FALSE)))
  results = results %>% dplyr::transmute(game.id = game.id, 
                                         home.id = ifelse(home, primary.id, 
other.id), away.id = ifelse(home, 

other.id, primary.id), home.score = ifelse(home, 

primary.score, other.score), away.score = ifelse(home, 

other.score, primary.score), neutral = ifelse(location == 

"N", 1, 0), ot = ot)
  results$home.id = ifelse(is.na(results$home.id), "NA", results$home.id)
  results$away.id = ifelse(is.na(results$away.id), "NA", results$away.id)
  results = results %>% dplyr::filter(home.id %in% results$away.id & 
                                    away.id %in% results$home.id)
  unique(results)
}


scrape.teams <- function (sex) 
{
  `%>%` = dplyr::`%>%`
  url = paste0("http://www.espn.com/", sex, "-college-basketball/teams")
  cells = xml2::read_html(url) %>% rvest::html_nodes(".mod-content > 
  ul.medium-logos > li h5 a")
  name = cells %>% rvest::html_text(trim = TRUE)
  id = cells %>% rvest::html_attr("href") %>% strsplit("/") %>% 
    sapply(identity) %>% 8[]
  data.frame(name = name, id = id, stringsAsFactors = FALSE)
}


scrape.team.game.results <- function (year, team.id, sex) 
{
  `%>%` = dplyr::`%>%`
  year = as.character(year)
  team.id = as.character(team.id)
  url = paste0("http://www.espn.com/", sex, "-college-basketball/", 
               "team/schedule/_/id/", team.id, "/year/", year)
  rows = xml2::read_html(url) %>% rvest::html_nodes(".mod-content table 
tr:not(.colhead)")
  tourney = rows %>% rvest::html_text(trim = TRUE) %>% startsWith(c("MEN'S 
BASKETBALL CHAMPIONSHIP", 
                                                                    "NCAA 
 WOMEN'S CHAMPIONSHIP")) %>% which
  if (length(tourney) > 0) {
    rows = rows[1:(min(tourney) - 1)]
  }
  opponent.cells = rows %>% rvest::html_nodes("td:nth-child(2)")
  result.cells = rows %>% rvest::html_nodes("td:nth-child(3)")
  skip = result.cells %>% rvest::html_text(trim = TRUE) %in% 
    c("Postponed", "Canceled") %>% which
  skip = result.cells %>% rvest::html_node("a") %>% rvest::html_attr("href") 
%>% 
    strsplit("/") %>% sapply(function(row) row[5] %in% c("preview", 
                                                         "onair")) %>% which %>% c(skip)
  skip = result.cells %>% rvest::html_node("li.score") %>% 
    rvest::html_text(trim = TRUE) %>% is.na %>% which %>% 
    c(skip)
  if (length(skip) > 0) {
    opponent.cells = opponent.cells[-skip]
    result.cells = result.cells[-skip]
  }
  won = result.cells %>% rvest::html_node("li.game-status") %>% 
    rvest::html_text(trim = TRUE) == "W"
  score = result.cells %>% rvest::html_node("li.score") %>% 
    rvest::html_text(trim = TRUE) %>% strsplit(" ") %>% sapply(function(row) row[1]) %>% 
strsplit("-") %>% sapply(identity) %>% t
  other = opponent.cells %>% rvest::html_node("li.team-name a") %>% 
rvest::html_attr("href") %>% strsplit("/") %>% sapply(function(row) row[8])
  neutral = opponent.cells %>% rvest::html_node("li.team-name") %>% 
rvest::html_text(trim = TRUE) %>% endsWith("*")
  at.or.vs = opponent.cells %>% rvest::html_node("li.game-status") %>% 
rvest::html_text(trim = TRUE)
  location = ifelse(neutral, "N", ifelse(at.or.vs == "vs", 
                                     "H", "A"))
  ot = result.cells %>% rvest::html_node("li.score") %>% rvest::html_text(trim = TRUE) %>% 
strsplit(" ") %>% sapply(function(row) row[2]) %>% ifelse(is.na(.), 
                                                          "", .)
  game.id = result.cells %>% rvest::html_node("li.score a") %>% 
rvest::html_attr("href") %>% strsplit("/") %>% sapply(function(row) row[8])
  data.frame(game.id = game.id, primary.id = team.id, primary.score = score[matrix(c(1:nrow(score), 
                                                                                 ifelse(won, 1, 2)), ncol = 2, byrow = FALSE)], other.id = other, 
         other.score = score[matrix(c(1:nrow(score), ifelse(won, 
                                                            2, 1)), ncol = 2, byrow = FALSE)], location = location, 
         ot = ot, stringsAsFactors = FALSE)
}

当我通过键入调用该函数

n.scrpgmrslts(2018, 'mens')

进入控制台 .

我收到以下错误

得分误差[矩阵(c(1:nrow(得分),ifelse(赢,1,2)),ncol = 2,byrow = FALSE)]:下标超出范围来自:data.frame(game.id) = game.id,primary.id = team.id,primary.score = score [matrix(c(1:nrow(score),ifelse(won,1,2)),ncol = 2,byrow = FALSE)], other.id = other,other.score = score [matrix(c(1:nrow(score),ifelse(won,2,1)),ncol = 2,byrow = FALSE)],location = location,ot = ot ,stringsAsFactors = FALSE)

错误似乎发生在代码的最后几行,所以我想知道是否可能有某种解决方法,可能是通过覆盖错误或将数据强制转换为不同的格式,或者其他东西(我是一个新手当涉及到R.)

谢谢你的时间!