我有以下XML tree
library("XML")
library("RCurl")
url <- "https://doc-0s-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rk8a2gr7rl8e8s8j0luiak0cahtcjnak/1459080000000/07495711428163271540/*/0BzmnaOABaMIgTEl6SnRUdU9Eb2M?e=download"
bin <- getURL(url)
con <- file("reference.xml", open = "wb")
writeBin(bin, con)
close(con)
OperationList <- xmlTreeParse("reference.xml", useInternal = TRUE)
我能够为计划名称获取一个数据帧,为操作名称获取一个数据帧 .
planname <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/plans/PlanHeader/name"], xmlValue))
operationanme <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/name"], xmlValue))
但是将它们放在一个df中(即展平xml树)不起作用 .
我经历了多种方法(参见下面的尝试和我得到的错误信息)但到目前为止没有任何工作 . 感谢指出我犯的错误 .
xmlToDataFrame函数
Operation.df1 <- xmlToDataFrame(OperationList)
列的重复下标
xmlToDF函数
根据https://hopstat.wordpress.com/2014/01/14/faster-xml-conversion-to-data-frames/
require(XML)
xmlToDF = function(doc, xpath, isXML = TRUE, usewhich = TRUE, verbose = TRUE) {
if (!isXML)
doc = xmlParse(doc)
#### get the records for that form
nodeset <- getNodeSet(doc, xpath)
## get the field names
var.names <- lapply(nodeset, names)
## get the total fields that are in any record
fields = unique(unlist(var.names))
## extract the values from all fields
dl = lapply(fields, function(x) {
if (verbose)
print(paste0(" ", x))
xpathSApply(proc, paste0(xpath, "/", x), xmlValue)
})
## make logical matrix whether each record had that field
name.mat = t(sapply(var.names, function(x) fields %in% x))
df = data.frame(matrix(NA, nrow = nrow(name.mat), ncol = ncol(name.mat)))
names(df) = fields
## fill in that data.frame
for (icol in 1:ncol(name.mat)) {
rep.rows = name.mat[, icol]
if (usewhich)
rep.rows = which(rep.rows)
df[rep.rows, icol] = dl[[icol]]
}
return(df)
}
Operation.df2 <- xmlToDF(OperationList,
xpath = "/subgroups/OperationGroup/subgroups/OperationGroup/name")
name.mat [,icol]中的错误:下标超出范围
rbind&xpathApply
require(XML)
Operation.df3 <- xpathApply(OperationList,
"/subgroups/OperationGroup/subgroups/OperationGroup/name",
function(node) {
region <- xmlValue(node[["name"]])
xp <- "./operations/OperationHeader/name"
operation <- xpathSApply(node, xp, xmlValue)
if (is.null(operation)) operation <- NA
data.frame(region, operation, stringsAsFactors = FALSE)
})
do.call(rbind, Operation.df3 )
给出一个NULL
xmlToList和plyr
require(XML)require(plyr)OperationList2 < - xmlToList(OperationList)Operation.df4 < - ldply(OperationList2,data.frame)
*给我参数意味着不同的行数:1,0
xmlToList,plyr和data.table
require(data.table)
Operation.df41 <- data.frame(rbindlist(OperationList2))
列表输入的第1项不是data.frame,data.table或list
Operation.df42 <- rbindlist(OperationList2)
列表输入的第1项不是data.frame,data.table或list
Operation.df43 <- data.frame(matrix(unlist(OperationList2),
byrow=T),stringsAsFactors=FALSE)
只有一列
Operation.df44 <- lapply(OperationList2, data.frame,
stringsAsFactors = FALSE)
参数意味着不同的行数:1,0
Operation.df45 <- rbind.fill(Operation.df44)
在循环中使用函数
Convert (possibly malformed) xml into Data Frame in R
xp <- function (OperationList, tag){
n <- xpathSApply(OperationList, tag, xmlValue)
if (length(n) > 0)
# paste multiple values?
paste0(n, collapse="; ")
else NA
}
z <- getNodeSet(OperationList, "//subgroups/OperationGroup/subgroups/OperationGroup")
n <-length(z)
notices <-vector("list",n)
for(i in 1:n)
{
Operation.df5<-xmlDoc(z[[i]])
Operation.df5[[i]] <- data.frame(
region = xp(z2, "//name"),
operation = xp(z2, "//operations/OperationHeader/name"),
stringsAsFactors=FALSE)
free(Operation.df5)
}
do.call("rbind", Operation.df5)
'externalptr'类型的对象不是子集表
设置了getNodeSet
for (i in 1:length(getNodeSet(OperationList, "//subgroups/OperationGroup")))
{
if (i==1) {
foo<-xmlSApply(OperationList[[i]], xmlValue)
Operation.df6 <-data.frame(t(foo), stringsAsFactors=FALSE)
}
else {
foo<-xmlSApply(OperationList[[i]], xmlValue)
tmp<-data.frame(t(foo), stringsAsFactors=FALSE)
Operation.df6 <-rbind(Operation.df6, tmp)
}
}
没有用于使用整数对XMLInternalDocument进行子集化的方法
1 回答
为每个
OperationHeader
获取其名称(opName)和所有计划的名称(计划),为每个OperationHeader创建一个组件的列表 . 最后rbind
组件在一起:赠送: