J James Reade
17/03/2015
#need forecast library!
library(forecast)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: timeDate
## This is forecast 5.8
#first, need election outcomes in terms of seats and other bits of information
locn <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/" #location of file
elec.outcomes.0 <- read.csv(paste(locn,"overall_results_1983_2010.csv",sep=""),stringsAsFactors=F)#open file
#this file is in a difficult format for using for analysis - need to essentially pivot it from rows into columns
years <- c(gsub("X(\\d+)","\\1",colnames(elec.outcomes.0)[grep("X\\d+",colnames(elec.outcomes.0))]))
years <- gsub("[.]","-",years)
elec.outcomes <- data.frame("election"=years,
"CON.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"CON.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),
"LAB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"LAB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),
"LIB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"LIB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),stringsAsFactors=F)
#next loading up all the opinion poll data from UK Polling Report
#first list all the files (could use dir for this purpose)
files <- c("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1970_1974.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1974_1979.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1979_1983.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1983_1987.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1987_1992.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1992_1997.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1997_2001.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2001_2005.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2005_2010.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2010_2015.csv")
#manually adding in some information about elections covered by this data
election.dates <- data.frame("election"=c("1974-Feb","1974-Oct","1979","1983","1987",
"1992","1997","2001","2005","2010","2015"),
"election.date"=as.Date(c("1974-02-28","1974-10-10","1979-05-03",
"1983-06-09","1987-06-11","1992-04-09",
"1997-05-01","2001-06-07","2005-05-05",
"2010-05-06","2015-05-07")),
"election.incumbent"=c("CON","LAB","LAB","CON","CON","CON",
"CON","LAB","LAB","LAB","CON"),
"maj.req" = c(318,318,318,326,326,326,330,330,326,326,326),
stringsAsFactors=F)
ukpolls <- data.frame(stringsAsFactors=F) #empty data frame in which to combine all polling data
for (i in 1:NROW(files)) {
temp <- read.csv(files[i],stringsAsFactors=F) #load data
this.election <- gsub("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_\\d+_(\\d+).csv","\\1",files[i]) #extract from the file name what the election in question is
if(this.election=="1974") { #if it's 1974 we need to be careful as two elections in 1974
temp$election[temp$Survey.End.Date<as.Date("1974-02-28")] <- "1974-Feb"
temp$election[temp$Survey.End.Date>=as.Date("1974-02-28")] <- "1974-Oct"
} else {
temp$election <- this.election
}
if(i>1){ #this section within the if statement makes sure all datasets have right variable names (colnames)
colnames(temp)[colnames(temp)=="LD...."] <- "LDEM...." #change the various incarnations of libdems into one
extra.vars.0 <- setdiff(colnames(temp),colnames(ukpolls))
if (NROW(extra.vars.0)>0){
for(j in 1:NROW(extra.vars.0)){
ukpolls[[extra.vars.0[j]]] <- NA
}
}
extra.vars.1 <- setdiff(colnames(ukpolls),colnames(temp))
if (NROW(extra.vars.1)>0){
for(j in 1:NROW(extra.vars.1)){
temp[[extra.vars.1[j]]] <- NA
}
}
}
ukpolls <- rbind(ukpolls,temp) #appending data together
}
#with final dataset, merge in the election information from earlier on
ukpolls <- merge(ukpolls,election.dates[grep("election",colnames(election.dates))],by=c("election"),all.x=T)
ukpolls <- ukpolls[is.na(ukpolls$election)==F,] #get rid of rows with no information
#next few lines correct some problems with how the data is reported - format of date
ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"] <- gsub("(\\d+)/(\\d+)/(\\d+)","20\\3-\\2-\\1",ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"])
ukpolls$Survey.End.Date <- gsub("2099-","1999-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2098-","1998-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2097-","1997-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- as.Date(ukpolls$Survey.End.Date) #format date for use in plots etc
#create smoothed poll average using ets method
con.ets <- ets(ukpolls$CON....[is.na(ukpolls$CON....)==F],model="ANN",alpha=0.1)
ukpolls$con.ma[is.na(ukpolls$CON....)==F] <- con.ets$fitted
lab.ets <- ets(ukpolls$LAB....[is.na(ukpolls$LAB....)==F],model="ANN",alpha=0.1)
ukpolls$lab.ma[is.na(ukpolls$LAB....)==F] <- lab.ets$fitted
#plotting. first command, jpeg(), which is commented out, is how to open a file to save graph to
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_2010_2015.jpg",height=7,width=10,units="in",res=600)
plot(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=1,col="blue",
ylim=range(c(ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")]),na.rm=T),
main="Opinion Poll Shares, Labour and Tories, 2010-2015",ylab="Poll Vote Share",xlab="Date")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=0,col="red",type="p")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$con.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkblue")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$lab.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkred")
#dev.off() #if saving to jpeg (or pdf), need dev.off() at the end to close the file
#merging polling data with outcomes data
full.poll.data <- merge(ukpolls,elec.outcomes,by=c("election"),all.x=T)
#we want to run single regression for all parties, so need to stack data for lab, con, etc on top of each other
#create lab data
lab.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LAB",colnames(full.poll.data)))]
#need to remove reference to lab in colnames as can't stack data with non-matching colnames
colnames(lab.poll.data) <- gsub("LAB[.][.][.][.]","Poll.Share",colnames(lab.poll.data))
colnames(lab.poll.data) <- gsub("LAB[.]","Outcome.",colnames(lab.poll.data))
#create con data
lab.poll.data$party <- "LAB"
con.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("CON",colnames(full.poll.data)))]
colnames(con.poll.data) <- gsub("CON[.][.][.][.]","Poll.Share",colnames(con.poll.data))
colnames(con.poll.data) <- gsub("CON[.]","Outcome.",colnames(con.poll.data))
con.poll.data$party <- "CON"
ld.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LIB|LD",colnames(full.poll.data)))]
ld.poll.data$Poll.Share <- rowSums(ld.poll.data[,c("LIB....","LIB.SDP....","All.LD....","LDEM....")],na.rm=T)
ld.poll.data <- ld.poll.data[-grep("[.][.][.][.]",colnames(ld.poll.data))]
colnames(ld.poll.data) <- gsub("LIB[.]","Outcome.",colnames(ld.poll.data))
ld.poll.data$party <- "LIB"
#stacking data - no meaningful data on Greens or Ukip from 2010
long.poll.data <- rbind(lab.poll.data,con.poll.data,ld.poll.data)
long.poll.data <- long.poll.data[is.na(long.poll.data$election)==F,]
long.poll.data$party.LAB <- as.numeric(long.poll.data$party=="LAB")
long.poll.data$party.LIB <- as.numeric(long.poll.data$party=="LIB")
#dummy equal to one if party in power before election
long.poll.data$incumbent <- as.numeric(long.poll.data$party==long.poll.data$election.incumbent)
#need days til election variable
long.poll.data$days.until <- as.numeric(long.poll.data$election.date - long.poll.data$Survey.End.Date)
#regression
poll.reg <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,data=long.poll.data,na.action=na.exclude)
library(knitr)
kable(summary(poll.reg)$coef, digits=2)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -114.87 | 10.33 | -11.11 | 0.00 |
| Poll.Share | 10.03 | 0.28 | 36.27 | 0.00 |
| partyLAB | 85.16 | 10.86 | 7.84 | 0.00 |
| partyLIB | 150.80 | 11.29 | 13.36 | 0.00 |
| days.until | 0.13 | 0.01 | 9.80 | 0.00 |
| incumbent | 116.07 | 10.53 | 11.02 | 0.00 |
| Poll.Share:partyLAB | -2.37 | 0.29 | -8.30 | 0.00 |
| Poll.Share:partyLIB | -9.94 | 0.36 | -27.29 | 0.00 |
| Poll.Share:days.until | 0.00 | 0.00 | -8.25 | 0.00 |
| partyLAB:days.until | -0.02 | 0.01 | -1.48 | 0.14 |
| partyLIB:days.until | -0.14 | 0.01 | -10.07 | 0.00 |
| Poll.Share:incumbent | -1.28 | 0.28 | -4.66 | 0.00 |
| days.until:incumbent | -0.09 | 0.01 | -6.28 | 0.00 |
| Poll.Share:partyLAB:days.until | 0.00 | 0.00 | 1.31 | 0.19 |
| Poll.Share:partyLIB:days.until | 0.00 | 0.00 | 8.17 | 0.00 |
| Poll.Share:days.until:incumbent | 0.00 | 0.00 | 3.19 | 0.00 |
long.poll.data$residuals <- residuals(poll.reg)
long.poll.data$fitted <- fitted(poll.reg)
#forecast 2015 - the predict() command words after using lm to create forecasts if data in data.frame
long.poll.data$forecast[long.poll.data$election=="2015"] <- predict(poll.reg,long.poll.data[long.poll.data$election=="2015",])
#plotting data
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_seats.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$fitted,long.poll.data$Poll.Share,col="red",pch=2,type="p",
main="Opinion Poll Shares against Seats",
ylab="Opinion Poll Share",xlab="Seats Won")
lines(long.poll.data$Outcome.seats,long.poll.data$Poll.Share,type="p")
legend("bottomright",bty="n",pch=c(1,2),col=c(1,2),legend=c("Actual","Model Fitted"))
#dev.off()
#test - forecast 2010
#need to first run regression only over data up to 2005, so not including 2005-2010 term
poll.reg.0 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
data=long.poll.data[long.poll.data$election!="2010",])
#now create forecasts, since all explanatory variables needed are already in data.frame
long.poll.data$forecast[long.poll.data$election=="2010"] <- predict(poll.reg.0,long.poll.data[long.poll.data$election=="2010",])
#we could also increase training period to include 2005 election also:
poll.reg.05 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
data=long.poll.data[long.poll.data$election!="2010" & long.poll.data$election!="2005",])
long.poll.data$forecast05[long.poll.data$election=="2005" | long.poll.data$election=="2010"] <- predict(poll.reg.05,long.poll.data[long.poll.data$election=="2005" | long.poll.data$election=="2010",])
long.poll.data$forc.error05 <- long.poll.data$Outcome.seats - long.poll.data$forecast05
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2010.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB"],long.poll.data$forecast05[long.poll.data$party=="LAB"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="red",xaxt="n",
ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),
main="2005 General Election: If polls predicted seats",
ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2001-01-01"),as.Date("2005-01-01"),by="years"),
labels=c(2001:2005))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2005"],
long.poll.data$Outcome.seats[long.poll.data$party=="LAB" & long.poll.data$election=="2005"],type="l",
col="red")
abline(v=as.Date("2005-05-05"),lty=2)
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON"],long.poll.data$forecast05[long.poll.data$party=="CON"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="blue",
ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2005"],long.poll.data$Outcome.seats[long.poll.data$party=="CON" & long.poll.data$election=="2005"],type="l",col="blue")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB"],long.poll.data$forecast05[long.poll.data$party=="LIB"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="orange",
ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2005"],long.poll.data$Outcome.seats[long.poll.data$party=="LIB" & long.poll.data$election=="2005"],type="l",col="orange")
#dev.off()
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2010.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB"],long.poll.data$forecast[long.poll.data$party=="LAB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="red",xaxt="n",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),
main="2010 General Election: If polls predicted seats",
ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2006-01-01"),as.Date("2010-01-01"),by="years"),
labels=c(2006:2010))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],
long.poll.data$Outcome.seats[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],type="l",
col="red")
abline(v=as.Date("2010-05-07"),lty=2)
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON"],long.poll.data$forecast[long.poll.data$party=="CON"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="blue",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="CON" & long.poll.data$election=="2010"],type="l",col="blue")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB"],long.poll.data$forecast[long.poll.data$party=="LIB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="orange",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],type="l",col="orange")
#dev.off()
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2015.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],col="red",xaxt="n",
ylim=c(0,max(long.poll.data$forecast[long.poll.data$election=="2015"],na.rm=T)),
xlim=range(as.Date("2010-05-08"),as.Date("2015-05-07")),
main="2015 General Election: If polls predicted seats",
ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2011-01-01"),as.Date("2015-01-01"),by="years"),
labels=c(2011:2015))
abline(v=as.Date("2010-05-08"),lty=2)
abline(v=as.Date("2015-05-07"),lty=2)
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
col="blue",type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],col="orange",
type="p")
#dev.off()