Data and Method

I will use nationwide polls:
- Thousands exist over many elections (back to 1945).
- Investigate relationship between opinion poll vote share and seats won.

#need forecast library!
library(forecast)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 5.8

#first, need election outcomes in terms of seats and other bits of information
locn <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/" #location of file
elec.outcomes.0 <- read.csv(paste(locn,"overall_results_1983_2010.csv",sep=""),stringsAsFactors=F)#open file
#this file is in a difficult format for using for analysis - need to essentially pivot it from rows into columns
years <- c(gsub("X(\\d+)","\\1",colnames(elec.outcomes.0)[grep("X\\d+",colnames(elec.outcomes.0))]))
years <- gsub("[.]","-",years)
elec.outcomes <- data.frame("election"=years,
                            "CON.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "CON.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),
                            "LAB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "LAB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),
                            "LIB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "LIB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),stringsAsFactors=F)

#next loading up all the opinion poll data from UK Polling Report
#first list all the files (could use dir for this purpose)
files <- c("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1970_1974.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1974_1979.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1979_1983.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1983_1987.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1987_1992.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1992_1997.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1997_2001.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2001_2005.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2005_2010.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2010_2015.csv")

#manually adding in some information about elections covered by this data
election.dates <- data.frame("election"=c("1974-Feb","1974-Oct","1979","1983","1987",
                                          "1992","1997","2001","2005","2010","2015"),
                             "election.date"=as.Date(c("1974-02-28","1974-10-10","1979-05-03",
                                                       "1983-06-09","1987-06-11","1992-04-09",
                                                       "1997-05-01","2001-06-07","2005-05-05",
                                                       "2010-05-06","2015-05-07")),
                             "election.incumbent"=c("CON","LAB","LAB","CON","CON","CON",
                                                    "CON","LAB","LAB","LAB","CON"),
                             "maj.req" = c(318,318,318,326,326,326,330,330,326,326,326),
                             stringsAsFactors=F)

ukpolls <- data.frame(stringsAsFactors=F) #empty data frame in which to combine all polling data
for (i in 1:NROW(files)) {
  temp <- read.csv(files[i],stringsAsFactors=F) #load data
  this.election <- gsub("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_\\d+_(\\d+).csv","\\1",files[i]) #extract from the file name what the election in question is
  
  if(this.election=="1974") { #if it's 1974 we need to be careful as two elections in 1974
    temp$election[temp$Survey.End.Date<as.Date("1974-02-28")] <- "1974-Feb"
    temp$election[temp$Survey.End.Date>=as.Date("1974-02-28")] <- "1974-Oct"
  } else {
    temp$election <- this.election
  }
  
  if(i>1){ #this section within the if statement makes sure all datasets have right variable names (colnames)
    colnames(temp)[colnames(temp)=="LD...."] <- "LDEM...." #change the various incarnations of libdems into one
    extra.vars.0 <- setdiff(colnames(temp),colnames(ukpolls))
    if (NROW(extra.vars.0)>0){
      for(j in 1:NROW(extra.vars.0)){
        ukpolls[[extra.vars.0[j]]] <- NA
      }
    }    
    extra.vars.1 <- setdiff(colnames(ukpolls),colnames(temp))
    if (NROW(extra.vars.1)>0){
      for(j in 1:NROW(extra.vars.1)){
        temp[[extra.vars.1[j]]] <- NA
      }
    }    
  }
  ukpolls <- rbind(ukpolls,temp) #appending data together
}
#with final dataset, merge in the election information from earlier on
ukpolls <- merge(ukpolls,election.dates[grep("election",colnames(election.dates))],by=c("election"),all.x=T)
ukpolls <- ukpolls[is.na(ukpolls$election)==F,] #get rid of rows with no information

#next few lines correct some problems with how the data is reported - format of date
ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"] <- gsub("(\\d+)/(\\d+)/(\\d+)","20\\3-\\2-\\1",ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"])
ukpolls$Survey.End.Date <- gsub("2099-","1999-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2098-","1998-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2097-","1997-",ukpolls$Survey.End.Date)

ukpolls$Survey.End.Date <- as.Date(ukpolls$Survey.End.Date) #format date for use in plots etc

#create smoothed poll average using ets method
con.ets <- ets(ukpolls$CON....[is.na(ukpolls$CON....)==F],model="ANN",alpha=0.1)
ukpolls$con.ma[is.na(ukpolls$CON....)==F] <- con.ets$fitted
lab.ets <- ets(ukpolls$LAB....[is.na(ukpolls$LAB....)==F],model="ANN",alpha=0.1)
ukpolls$lab.ma[is.na(ukpolls$LAB....)==F] <- lab.ets$fitted

#plotting. first command, jpeg(), which is commented out, is how to open a file to save graph to

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_2010_2015.jpg",height=7,width=10,units="in",res=600)
plot(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=1,col="blue",
     ylim=range(c(ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
                  ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")]),na.rm=T),
     main="Opinion Poll Shares, Labour and Tories, 2010-2015",ylab="Poll Vote Share",xlab="Date")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
      ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=0,col="red",type="p")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$con.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkblue")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$lab.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkred")

#dev.off() #if saving to jpeg (or pdf), need dev.off() at the end to close the file

We can create a linear regression model on historical data: \[ seats_{it} = \beta_0 + \beta_1 X_{it} + e_{it}. \]
Here:
- \(seats_{it}\) is seats for party \(i\) at election \(t\).
- \(X_{it}\) is a set of explanatory variables for party \(i\) at election \(t\). Includes:
  - Opinion poll vote share.
  - Days between poll and election.

Results

#merging polling data with outcomes data
full.poll.data <- merge(ukpolls,elec.outcomes,by=c("election"),all.x=T)
#we want to run single regression for all parties, so need to stack data for lab, con, etc on top of each other
#create lab data
lab.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LAB",colnames(full.poll.data)))]
#need to remove reference to lab in colnames as can't stack data with non-matching colnames
colnames(lab.poll.data) <- gsub("LAB[.][.][.][.]","Poll.Share",colnames(lab.poll.data))
colnames(lab.poll.data) <- gsub("LAB[.]","Outcome.",colnames(lab.poll.data))
#create con data
lab.poll.data$party <- "LAB"
con.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("CON",colnames(full.poll.data)))]
colnames(con.poll.data) <- gsub("CON[.][.][.][.]","Poll.Share",colnames(con.poll.data))
colnames(con.poll.data) <- gsub("CON[.]","Outcome.",colnames(con.poll.data))
con.poll.data$party <- "CON"
ld.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LIB|LD",colnames(full.poll.data)))]
ld.poll.data$Poll.Share <- rowSums(ld.poll.data[,c("LIB....","LIB.SDP....","All.LD....","LDEM....")],na.rm=T)
ld.poll.data <- ld.poll.data[-grep("[.][.][.][.]",colnames(ld.poll.data))]
colnames(ld.poll.data) <- gsub("LIB[.]","Outcome.",colnames(ld.poll.data))
ld.poll.data$party <- "LIB"

#stacking data - no meaningful data on Greens or Ukip from 2010
long.poll.data <- rbind(lab.poll.data,con.poll.data,ld.poll.data)
long.poll.data <- long.poll.data[is.na(long.poll.data$election)==F,]
long.poll.data$party.LAB <- as.numeric(long.poll.data$party=="LAB")
long.poll.data$party.LIB <- as.numeric(long.poll.data$party=="LIB")
#dummy equal to one if party in power before election
long.poll.data$incumbent <- as.numeric(long.poll.data$party==long.poll.data$election.incumbent)

#need days til election variable
long.poll.data$days.until <- as.numeric(long.poll.data$election.date - long.poll.data$Survey.End.Date)

#regression
poll.reg <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,data=long.poll.data,na.action=na.exclude)
library(knitr)
kable(summary(poll.reg)$coef, digits=2)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-114.87	10.33	-11.11	0.00
Poll.Share	10.03	0.28	36.27	0.00
partyLAB	85.16	10.86	7.84	0.00
partyLIB	150.80	11.29	13.36	0.00
days.until	0.13	0.01	9.80	0.00
incumbent	116.07	10.53	11.02	0.00
Poll.Share:partyLAB	-2.37	0.29	-8.30	0.00
Poll.Share:partyLIB	-9.94	0.36	-27.29	0.00
Poll.Share:days.until	0.00	0.00	-8.25	0.00
partyLAB:days.until	-0.02	0.01	-1.48	0.14
partyLIB:days.until	-0.14	0.01	-10.07	0.00
Poll.Share:incumbent	-1.28	0.28	-4.66	0.00
days.until:incumbent	-0.09	0.01	-6.28	0.00
Poll.Share:partyLAB:days.until	0.00	0.00	1.31	0.19
Poll.Share:partyLIB:days.until	0.00	0.00	8.17	0.00
Poll.Share:days.until:incumbent	0.00	0.00	3.19	0.00

long.poll.data$residuals <- residuals(poll.reg)
long.poll.data$fitted <- fitted(poll.reg)
#forecast 2015 - the predict() command words after using lm to create forecasts if data in data.frame
long.poll.data$forecast[long.poll.data$election=="2015"] <- predict(poll.reg,long.poll.data[long.poll.data$election=="2015",])

#plotting data

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_seats.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$fitted,long.poll.data$Poll.Share,col="red",pch=2,type="p",
     main="Opinion Poll Shares against Seats",
     ylab="Opinion Poll Share",xlab="Seats Won")
lines(long.poll.data$Outcome.seats,long.poll.data$Poll.Share,type="p")
legend("bottomright",bty="n",pch=c(1,2),col=c(1,2),legend=c("Actual","Model Fitted"))

#dev.off()

Plot is actual seats against polled vote shares, and predicted seats against polled vote shares.

#test - forecast 2010
#need to first run regression only over data up to 2005, so not including 2005-2010 term
poll.reg.0 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
                 data=long.poll.data[long.poll.data$election!="2010",])
#now create forecasts, since all explanatory variables needed are already in data.frame
long.poll.data$forecast[long.poll.data$election=="2010"] <- predict(poll.reg.0,long.poll.data[long.poll.data$election=="2010",])

#we could also increase training period to include 2005 election also:
poll.reg.05 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
                 data=long.poll.data[long.poll.data$election!="2010" & long.poll.data$election!="2005",])
long.poll.data$forecast05[long.poll.data$election=="2005" | long.poll.data$election=="2010"] <- predict(poll.reg.05,long.poll.data[long.poll.data$election=="2005" | long.poll.data$election=="2010",])
long.poll.data$forc.error05 <- long.poll.data$Outcome.seats - long.poll.data$forecast05

Now we can look at how we’d have forecast 2005 and 2010:

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2010.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB"],long.poll.data$forecast05[long.poll.data$party=="LAB"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="red",xaxt="n",
     ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),
     main="2005 General Election: If polls predicted seats",
     ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2001-01-01"),as.Date("2005-01-01"),by="years"),
     labels=c(2001:2005))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2005"],
      long.poll.data$Outcome.seats[long.poll.data$party=="LAB" & long.poll.data$election=="2005"],type="l",
      col="red")
abline(v=as.Date("2005-05-05"),lty=2)
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON"],long.poll.data$forecast05[long.poll.data$party=="CON"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="blue",
     ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2005"],long.poll.data$Outcome.seats[long.poll.data$party=="CON" & long.poll.data$election=="2005"],type="l",col="blue")

lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB"],long.poll.data$forecast05[long.poll.data$party=="LIB"],xlim=range(as.Date("2001-06-07"),as.Date("2005-05-05")),col="orange",
     ylim=c(0,max(long.poll.data$forecast05,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2005"],long.poll.data$Outcome.seats[long.poll.data$party=="LIB" & long.poll.data$election=="2005"],type="l",col="orange")

#dev.off()

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2010.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB"],long.poll.data$forecast[long.poll.data$party=="LAB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="red",xaxt="n",
     ylim=c(0,max(long.poll.data$forecast,na.rm=T)),
     main="2010 General Election: If polls predicted seats",
     ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2006-01-01"),as.Date("2010-01-01"),by="years"),
     labels=c(2006:2010))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],
      long.poll.data$Outcome.seats[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],type="l",
      col="red")
abline(v=as.Date("2010-05-07"),lty=2)

lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON"],long.poll.data$forecast[long.poll.data$party=="CON"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="blue",
     ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="CON" & long.poll.data$election=="2010"],type="l",col="blue")

lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB"],long.poll.data$forecast[long.poll.data$party=="LIB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="orange",
     ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],type="l",col="orange")

#dev.off()

Now we can look at 2015’s forecasts:

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2015.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],
     long.poll.data$forecast[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],col="red",xaxt="n",
     ylim=c(0,max(long.poll.data$forecast[long.poll.data$election=="2015"],na.rm=T)),
     xlim=range(as.Date("2010-05-08"),as.Date("2015-05-07")),
     main="2015 General Election: If polls predicted seats",
     ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2011-01-01"),as.Date("2015-01-01"),by="years"),
     labels=c(2011:2015))
abline(v=as.Date("2010-05-08"),lty=2)
abline(v=as.Date("2015-05-07"),lty=2)

lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
      long.poll.data$forecast[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
      col="blue",type="p")

lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],
      long.poll.data$forecast[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],col="orange",
      type="p")

#dev.off()

EC313 Forecasting Project: General Election Outcome

Introduction

Literature/Context

Data and Method

Results

Conclusions