Я немного озадачен тем, почему этот вопрос заминусовали... ответ до сих пор не кажется мне очевидным! Но для тех, кому нужно решение: я с ним разобрался (не без помощи @duncantl). (Полный анализ также доступен здесь: https://github.com/hardin47/prediction2016 )
require(XML)
require(dplyr)
require(tidyr)
require(readr)
require(mosaic)
require(RCurl)
require(ggplot2)
require(lubridate)
require(RJSONIO)
# Fetch the poll data ----
# Download the FiveThirtyEight national-polls page and pull out the JSON text
# that an inline <script> assigns to `race.stateData`.
url <- "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/"
doc <- htmlParse(url, useInternalNodes = TRUE)

# Keep only the <script> nodes whose text mentions 'race.model'; for each one
# return its text content (plus its "href" attribute, if it has any).
sc <- xpathSApply(doc, "//script[contains(., 'race.model')]",
                  function(x) c(xmlValue(x), xmlAttrs(x)[["href"]]))
if (length(sc) == 0L) {
  stop("No <script> containing 'race.model' was found at ", url, call. = FALSE)
}

# The replacement must be the backreference "\\1" (the captured JSON text).
# A bare "\1" is parsed by R as the control character U+0001, which would
# replace the whole script with that single character and break fromJSON().
jsobj <- gsub(".*race.stateData = (.*);race.pathPrefix.*", "\\1", sc)
data <- fromJSON(jsobj)
allpolls <- data$polls
# Flatten the list of polls ----
# Every poll is padded to the length of the longest poll so that the polls can
# be stacked row by row into a single data frame.
indx <- sapply(allpolls, length)
pollsdf <- as.data.frame(
  do.call(
    rbind,
    lapply(allpolls, function(poll) {
      length(poll) <- max(indx)
      poll
    })
  )
)
# Flatten the weights ----
# Convert each poll's `weight` entry to a data frame, bind them side by side,
# then transpose so that each poll ends up as one row.
pollswt <- lapply(pollsdf$weight, data.frame, stringsAsFactors = FALSE)
pollswt <- as.data.frame(do.call(cbind, pollswt))
pollswt <- as.data.frame(t(pollswt))

# Name the three weight columns and discard the row names left by t().
names(pollswt) <- c("wtpolls", "wtplus", "wtnow")
row.names(pollswt) <- NULL

# Append the weight columns to the poll table.
pollsdf <- cbind(pollsdf, pollswt)
# Flatten the voting answers ----
# Pad every poll's answer list to the same length and stack the polls as rows,
# which yields one list-column (V1, V2, ...) per answer position.
indxv <- sapply(pollsdf$votingAnswers, length)
pollsvot <- as.data.frame(do.call(rbind, lapply(pollsdf$votingAnswers,
                                                `length<-`, max(indxv))))

# Expand one answer column (a list with one entry per poll) into a data frame
# and label every row from the row names produced by rbind():
#   polltype  - the raw row name
#   polltypeA - the row name with all digits removed
#   polltype1 - 1 when the row name has no number, otherwise that number + 1
.expand_votes <- function(answers) {
  votes <- as.data.frame(do.call(rbind, lapply(answers, data.frame,
                                               stringsAsFactors = FALSE)))
  labels <- rownames(votes)
  # Same expression the deprecated tidyr::extract_numeric() evaluates, minus
  # its deprecation message: strip everything except digits, "." and "-".
  label_num <- as.numeric(gsub("[^0-9.-]+", "", labels))
  cbind(polltype = labels, votes,
        polltypeA = gsub("[0-9]+", "", labels),
        polltype1 = ifelse(is.na(label_num), 1, label_num + 1))
}

pollsvot1 <- .expand_votes(pollsvot$V1)
pollsvot2 <- .expand_votes(pollsvot$V2)
# Convert the list-columns that are kept into plain vectors ----
# NOTE(review): unlist() drops NULL entries, so this presumably relies on
# every poll having a value for each of these fields - confirm.
pollsdf <- mutate(
  pollsdf,
  population = unlist(population),
  sampleSize = as.numeric(unlist(sampleSize)),
  pollster = unlist(pollster),
  startDate = ymd(unlist(startDate)),
  endDate = ymd(unlist(endDate)),
  pollsterRating = unlist(pollsterRating)
)

# Keep only the poll descriptors and the three weight columns, in this order.
pollsdf <- select(
  pollsdf,
  population, sampleSize, pollster, startDate, endDate, pollsterRating,
  wtpolls, wtplus, wtnow
)
# Combine poll descriptors with the voting answers ----
# Each poll row is repeated three times and the tripled table is stacked twice
# (once per answer table), then bound column-wise to the stacked answers.
allpolldata <- local({
  row_idx <- rep(seq_len(nrow(pollsdf)), each = 3)
  cbind(
    rbind(pollsdf[row_idx, ], pollsdf[row_idx, ]),
    rbind(pollsvot1, pollsvot2)
  )
})

# Order the rows by poll number, then by choice.
allpolldata <- arrange(allpolldata, polltype1, choice)