J James Reade
10/03/2015
clean.data is a function, as in it’s a collection of commands.load.my.data which takes as an argument where your data is, and adds the command stringsAsFactors=F line for you each time. This would look like:load.my.data <- function(loc) {
data <- read.csv(loc,stringsAsFactors=F)
return(data)
}
loc0 <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/"
loc <- paste(loc0,"seats_1945_2010.csv",sep="")
print(loc)
## [1] "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/seats_1945_2010.csv"
all.outcomes <- load.my.data(loc)
clean.data <- function(data) {
require(zoo)
colnames(data)[1] <- "Date.Time"
colnames(data)[-1] <- gsub("^.*?\\d+_(\\w+)[.]href.*?$","\\1",colnames(data)[-1])
data <- data[,-NCOL(data)]
data$Date.Time <- as.Date(substring(data$Date.Time,2),"%Y-%m-%d")
data <- data[is.na(data$Date.Time)==F,]
data <- data[order(data$Date.Time),]
for(i in colnames(data)[nchar(colnames(data))==2]) {
data[,i] <- gsub("<div class=.*?>(.*?)</div>","\\1",data[,i])
data[,i] <- gsub("<div class=.*?>(.*?)\\s+$","\\1",data[,i])
data[,i] <- gsub(">","",data[,i])
data[,i] <- gsub("^\\s+|\\s+$","",data[,i])
#need to turn fractional odds into decimal odds
numerator <- gsub("(\\d+)/(\\d+)$","\\1",data[,i])
denominator <- gsub("(\\d+)/(\\d+)$","\\2",data[,i])
data[,i] <- as.numeric(numerator)+1
data[numerator!=denominator,i] <- as.numeric(numerator[numerator!=denominator])/as.numeric(denominator[numerator!=denominator])+1
data[,i] <- na.locf(data[,i],na.rm=F)
}
return(data)
}
loc <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/betting/2015-03-09/"
markets <- dir(loc) #this command lists all files/folders in a location
loc3 <- paste(loc,markets[3],sep="") #here I'm taking just the third market, overall-majority
maj.types <- dir(loc3,pattern="*.csv") #getting all files that end in .csv
maj.type <- data.frame() #setting up empty data frame to fill up with the contents of the files we're
#going to open; idea is we want all data on different party majorities in same file
for(i in maj.types) { #look on the different files
temp <- read.csv(paste(loc3,i,sep="/"),stringsAsFactors=F) #open the data - could use load.my.data() here
temp <- clean.data(temp) #clean up the data using the function above
temp$maj.typ <- gsub("-majority[.]csv","",i) #create a variable that says what kind of majority this file is for
maj.type <- rbind(maj.type,temp) #append data on the bottom of the existing dataset
}
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
summary(maj.type) will give information, or perhaps better str(maj.type).maj.type$mean <- rowMeans(maj.type[,nchar(colnames(maj.type))==2],na.rm=T)
plot(maj.type$Date.Time[maj.type$maj.typ=="no-overall"],1/maj.type$mean[maj.type$maj.typ=="no-overall"],
ylim=range(0,1),xlim=range(maj.type$Date.Time),type="l",ylab="Implied Probability",xlab="Date",
main="Implied Bookmaker Probabilities for Overall Majority")
lines(maj.type$Date.Time[maj.type$maj.typ=="labour"],1/maj.type$mean[maj.type$maj.typ=="labour"],col="red")
lines(maj.type$Date.Time[maj.type$maj.typ=="conservative"],1/maj.type$mean[maj.type$maj.typ=="conservative"],
col="blue")
lines(maj.type$Date.Time[maj.type$maj.typ=="ukip"],1/maj.type$mean[maj.type$maj.typ=="ukip"],col="purple")
lines(maj.type$Date.Time[maj.type$maj.typ=="liberal-democrat"],
1/maj.type$mean[maj.type$maj.typ=="liberal-democrat"],col="orange")
legend("topleft",ncol=3,lty=1,col=c("black","red","blue","purple","orange"),legend=c("No overall","Labour","Tory","Ukip","Lib Dem"))
loc <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/"
loc0 <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/"
all.outcomes <- read.csv(paste(loc0,"seats_1945_2010.csv",sep=""),stringsAsFactors=F)
plot(all.outcomes$X.1,all.outcomes$Lab,col="red",type="o",main="Seats Won Since WW2",
ylab="Seats",xlab="Year",
ylim=range(all.outcomes[,c("Con","Lab","Lib","Others")]))
lines(all.outcomes$X.1,all.outcomes$Con,col="blue",type="o")
lines(all.outcomes$X.1,all.outcomes$Lib,col="orange",type="o")
lines(all.outcomes$X.1,all.outcomes$Others,col="grey",type="o")
legend("bottomleft",lty=1,col=c("red","blue","orange","grey"),legend=c("Labour","Tory","LibDem","Other"),
bty="n")
loc <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/betting/2015-03-09/"
markets <- dir(loc)
loc2 <- paste(loc,markets[grep("most-seats",markets)],sep="")
parties <- dir(loc2,pattern="*.csv")
con.most <- read.csv(paste(loc2,parties[grep("conservatives",parties)],sep="/"),stringsAsFactors=F)
con.most <- clean.data(con.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
lab.most <- read.csv(paste(loc2,parties[grep("labour",parties)],sep="/"),stringsAsFactors=F)
lab.most <- clean.data(lab.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
ld.most <- read.csv(paste(loc2,parties[grep("liberal-democrats",parties)],sep="/"),stringsAsFactors=F)
ld.most <- clean.data(ld.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
ukip.most <- read.csv(paste(loc2,parties[grep("ukip",parties)],sep="/"),stringsAsFactors=F)
ukip.most <- clean.data(ukip.most)
## Warning: NAs introduced by coercion
lab.most$mean <- rowMeans(lab.most[,nchar(colnames(lab.most))==2],na.rm=T)
con.most$mean <- rowMeans(con.most[,nchar(colnames(con.most))==2],na.rm=T)
plot(con.most$Date.Time,con.most$B3,type="l",
ylim=range(0,1),xlim=range(con.most$Date.Time,na.rm=T),col="white",
main="Most Seats at GE2015",
ylab="Implied Probability by bookmaker",xlab="Date")
j=0
blues <- colours()[grep("blue",colours())]
for(i in colnames(con.most)[nchar(colnames(con.most))==2]) {
j=j+1
lines(con.most$Date.Time,1/con.most[,i],type="l",col=blues[j])
#col=rgb(runif(1),runif(1),runif(1))
}
reds <- colours()[grep("red",colours())]
j=0
for(i in colnames(lab.most)[nchar(colnames(lab.most))==2]) {
j=j+1
lines(lab.most$Date.Time,1/lab.most[,i],type="l",col=reds[j])
#col=rgb(runif(1),runif(1),runif(1))
}
lines(lab.most$Date.Time,1/lab.most$mean,col="red",lwd=3,type="l")
lines(con.most$Date.Time,1/con.most$mean,col="blue",lwd=3,type="l")
#need forecast library!
library(forecast)
## Loading required package: timeDate
## This is forecast 5.8
#first, need election outcomes in terms of seats and other bits of information
locn <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/" #location of file
elec.outcomes.0 <- read.csv(paste(locn,"overall_results_1983_2010.csv",sep=""),stringsAsFactors=F)#open file
#this file is in a difficult format for using for analysis - need to essentially pivot it from rows into columns
years <- c(gsub("X(\\d+)","\\1",colnames(elec.outcomes.0)[grep("X\\d+",colnames(elec.outcomes.0))]))
years <- gsub("[.]","-",years)
elec.outcomes <- data.frame("election"=years,
"CON.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"CON.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),
"LAB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"LAB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),
"LIB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB"
& elec.outcomes.0$Data=="Sum of Vote",
3:13]),
"LIB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB"
& elec.outcomes.0$Data=="Sum of Seats",
3:13]),stringsAsFactors=F)
#next loading up all the opinion poll data from UK Polling Report
#first list all the files (could use dir for this purpose)
files <- c("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1970_1974.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1974_1979.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1979_1983.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1983_1987.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1987_1992.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1992_1997.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1997_2001.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2001_2005.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2005_2010.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2010_2015.csv")
#manually adding in some information about elections covered by this data
election.dates <- data.frame("election"=c("1974-Feb","1974-Oct","1979","1983","1987",
"1992","1997","2001","2005","2010","2015"),
"election.date"=as.Date(c("1974-02-28","1974-10-10","1979-05-03",
"1983-06-09","1987-06-11","1992-04-09",
"1997-05-01","2001-06-07","2005-05-05",
"2010-05-06","2015-05-07")),
"election.incumbent"=c("CON","LAB","LAB","CON","CON","CON",
"CON","LAB","LAB","LAB","CON"),
"maj.req" = c(318,318,318,326,326,326,330,330,326,326,326),
stringsAsFactors=F)
ukpolls <- data.frame(stringsAsFactors=F) #empty data frame in which to combine all polling data
for (i in 1:NROW(files)) {
temp <- read.csv(files[i],stringsAsFactors=F) #load data
this.election <- gsub("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_\\d+_(\\d+).csv","\\1",files[i]) #extract from the file name what the election in question is
if(this.election=="1974") { #if it's 1974 we need to be careful as two elections in 1974
temp$election[temp$Survey.End.Date<as.Date("1974-02-28")] <- "1974-Feb"
temp$election[temp$Survey.End.Date>=as.Date("1974-02-28")] <- "1974-Oct"
} else {
temp$election <- this.election
}
if(i>1){ #this section within the if statement makes sure all datasets have right variable names (colnames)
colnames(temp)[colnames(temp)=="LD...."] <- "LDEM...." #change the various incarnations of libdems into one
extra.vars.0 <- setdiff(colnames(temp),colnames(ukpolls))
if (NROW(extra.vars.0)>0){
for(j in 1:NROW(extra.vars.0)){
ukpolls[[extra.vars.0[j]]] <- NA
}
}
extra.vars.1 <- setdiff(colnames(ukpolls),colnames(temp))
if (NROW(extra.vars.1)>0){
for(j in 1:NROW(extra.vars.1)){
temp[[extra.vars.1[j]]] <- NA
}
}
}
ukpolls <- rbind(ukpolls,temp) #appending data together
}
#with final dataset, merge in the election information from earlier on
ukpolls <- merge(ukpolls,election.dates[grep("election",colnames(election.dates))],by=c("election"),all.x=T)
ukpolls <- ukpolls[is.na(ukpolls$election)==F,] #get rid of rows with no information
#next few lines correct some problems with how the data is reported - format of date
ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"] <- gsub("(\\d+)/(\\d+)/(\\d+)","20\\3-\\2-\\1",ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"])
ukpolls$Survey.End.Date <- gsub("2099-","1999-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2098-","1998-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2097-","1997-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- as.Date(ukpolls$Survey.End.Date) #format date for use in plots etc
#create smoothed poll average using ets method
con.ets <- ets(ukpolls$CON....[is.na(ukpolls$CON....)==F],model="ANN",alpha=0.1)
ukpolls$con.ma[is.na(ukpolls$CON....)==F] <- con.ets$fitted
lab.ets <- ets(ukpolls$LAB....[is.na(ukpolls$LAB....)==F],model="ANN",alpha=0.1)
ukpolls$lab.ma[is.na(ukpolls$LAB....)==F] <- lab.ets$fitted
#plotting. first command, jpeg(), which is commented out, is how to open a file to save graph to
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_2010_2015.jpg",height=7,width=10,units="in",res=600)
plot(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=1,col="blue",
ylim=range(c(ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")]),na.rm=T),
main="Opinion Poll Shares, Labour and Tories, 2010-2015",ylab="Poll Vote Share",xlab="Date")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=0,col="red",type="p")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$con.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkblue")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
ukpolls$lab.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkred")
#dev.off() #if saving to jpeg (or pdf), need dev.off() at the end to close the file
#merging polling data with outcomes data
full.poll.data <- merge(ukpolls,elec.outcomes,by=c("election"),all.x=T)
#we want to run single regression for all parties, so need to stack data for lab, con, etc on top of each other
#create lab data
lab.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LAB",colnames(full.poll.data)))]
#need to remove reference to lab in colnames as can't stack data with non-matching colnames
colnames(lab.poll.data) <- gsub("LAB[.][.][.][.]","Poll.Share",colnames(lab.poll.data))
colnames(lab.poll.data) <- gsub("LAB[.]","Outcome.",colnames(lab.poll.data))
#create con data
lab.poll.data$party <- "LAB"
con.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("CON",colnames(full.poll.data)))]
colnames(con.poll.data) <- gsub("CON[.][.][.][.]","Poll.Share",colnames(con.poll.data))
colnames(con.poll.data) <- gsub("CON[.]","Outcome.",colnames(con.poll.data))
con.poll.data$party <- "CON"
ld.poll.data <- full.poll.data[,c(1,2,3,14,15,grep("LIB|LD",colnames(full.poll.data)))]
ld.poll.data$Poll.Share <- rowSums(ld.poll.data[,c("LIB....","LIB.SDP....","All.LD....","LDEM....")],na.rm=T)
ld.poll.data <- ld.poll.data[-grep("[.][.][.][.]",colnames(ld.poll.data))]
colnames(ld.poll.data) <- gsub("LIB[.]","Outcome.",colnames(ld.poll.data))
ld.poll.data$party <- "LIB"
#stacking data - no meaningful data on Greens or Ukip from 2010
long.poll.data <- rbind(lab.poll.data,con.poll.data,ld.poll.data)
long.poll.data <- long.poll.data[is.na(long.poll.data$election)==F,]
long.poll.data$party.LAB <- as.numeric(long.poll.data$party=="LAB")
long.poll.data$party.LIB <- as.numeric(long.poll.data$party=="LIB")
#dummy equal to one if party in power before election
long.poll.data$incumbent <- as.numeric(long.poll.data$party==long.poll.data$election.incumbent)
#need days til election variable
long.poll.data$days.until <- as.numeric(long.poll.data$election.date - long.poll.data$Survey.End.Date)
#regression
poll.reg <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,data=long.poll.data,na.action=na.exclude)
summary(poll.reg)
##
## Call:
## lm(formula = Outcome.seats ~ Poll.Share * party * days.until +
## incumbent * Poll.Share * days.until, data = long.poll.data,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -165.741 -23.731 -0.606 20.547 313.736
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.149e+02 1.033e+01 -11.115 < 2e-16 ***
## Poll.Share 1.003e+01 2.765e-01 36.272 < 2e-16 ***
## partyLAB 8.516e+01 1.086e+01 7.839 5.09e-15 ***
## partyLIB 1.508e+02 1.129e+01 13.358 < 2e-16 ***
## days.until 1.266e-01 1.292e-02 9.798 < 2e-16 ***
## incumbent 1.161e+02 1.053e+01 11.021 < 2e-16 ***
## Poll.Share:partyLAB -2.369e+00 2.855e-01 -8.298 < 2e-16 ***
## Poll.Share:partyLIB -9.941e+00 3.643e-01 -27.286 < 2e-16 ***
## Poll.Share:days.until -2.880e-03 3.489e-04 -8.254 < 2e-16 ***
## partyLAB:days.until -2.117e-02 1.429e-02 -1.482 0.13843
## partyLIB:days.until -1.425e-01 1.415e-02 -10.067 < 2e-16 ***
## Poll.Share:incumbent -1.284e+00 2.755e-01 -4.660 3.22e-06 ***
## days.until:incumbent -8.827e-02 1.405e-02 -6.284 3.47e-10 ***
## Poll.Share:partyLAB:days.until 4.840e-04 3.689e-04 1.312 0.18952
## Poll.Share:partyLIB:days.until 3.874e-03 4.745e-04 8.166 3.67e-16 ***
## Poll.Share:days.until:incumbent 1.158e-03 3.628e-04 3.192 0.00142 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.47 on 8210 degrees of freedom
## (5652 observations deleted due to missingness)
## Multiple R-squared: 0.8921, Adjusted R-squared: 0.8919
## F-statistic: 4525 on 15 and 8210 DF, p-value: < 2.2e-16
long.poll.data$residuals <- residuals(poll.reg)
long.poll.data$fitted <- fitted(poll.reg)
#forecast 2015 - the predict() command words after using lm to create forecasts if data in data.frame
long.poll.data$forecast[long.poll.data$election=="2015"] <- predict(poll.reg,long.poll.data[long.poll.data$election=="2015",])
#plotting data
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_seats.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$fitted,long.poll.data$Poll.Share,col="red",pch=2,type="p",
main="Opinion Poll Shares against Seats",
ylab="Opinion Poll Share",xlab="Seats Won")
lines(long.poll.data$Outcome.seats,long.poll.data$Poll.Share,type="p")
legend("bottomright",bty="n",pch=c(1,2),col=c(1,2),legend=c("Actual","Model Fitted"))
#dev.off()
#test - forecast 2010
#need to first run regression only over data up to 2005, so not including 2005-2010 term
poll.reg.0 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
data=long.poll.data[long.poll.data$election!="2010",])
#now create forecasts, since all explanatory variables needed are already in data.frame
long.poll.data$forecast[long.poll.data$election=="2010"] <- predict(poll.reg.0,long.poll.data[long.poll.data$election=="2010",])
#we could also increase training period to include 2005 election also:
poll.reg.05 <- lm(Outcome.seats ~ Poll.Share*party*days.until + incumbent*Poll.Share*days.until,
data=long.poll.data[long.poll.data$election!="2010" & long.poll.data$election!="2005",])
long.poll.data$forecast05[long.poll.data$election=="2005" | long.poll.data$election=="2010"] <- predict(poll.reg.05,long.poll.data[long.poll.data$election=="2005" | long.poll.data$election=="2010",])
long.poll.data$forc.error05 <- long.poll.data$Outcome.seats - long.poll.data$forecast05
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2010.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB"],long.poll.data$forecast[long.poll.data$party=="LAB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="red",xaxt="n",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),
main="2010 General Election: If polls predicted seats",
ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2006-01-01"),as.Date("2010-01-01"),by="years"),
labels=c(2006:2010))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],
long.poll.data$Outcome.seats[long.poll.data$party=="LAB" & long.poll.data$election=="2010"],type="l",
col="red")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON"],long.poll.data$forecast[long.poll.data$party=="CON"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="blue",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="CON" & long.poll.data$election=="2010"],type="l",col="blue")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB"],long.poll.data$forecast[long.poll.data$party=="LIB"],xlim=range(as.Date("2005-05-06"),as.Date("2010-05-07")),col="orange",
ylim=c(0,max(long.poll.data$forecast,na.rm=T)),type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],long.poll.data$Outcome.seats[long.poll.data$party=="LIB" & long.poll.data$election=="2010"],type="l",col="orange")
#dev.off()
#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/model_forecast_2015.jpg",height=7,width=10,units="in",res=600)
plot(long.poll.data$Survey.End.Date[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="LAB" & long.poll.data$election=="2015"],col="red",xaxt="n",
ylim=c(0,max(long.poll.data$forecast[long.poll.data$election=="2015"],na.rm=T)),
main="2015 General Election: If polls predicted seats",
ylab="Number of Seats",xlab="Date")
axis(1,at=seq(as.Date("2011-01-01"),as.Date("2015-01-01"),by="years"),
labels=c(2011:2015))
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="CON" & long.poll.data$election=="2015"],
col="blue",type="p")
lines(long.poll.data$Survey.End.Date[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],
long.poll.data$forecast[long.poll.data$party=="LIB" & long.poll.data$election=="2015"],col="orange",
type="p")
#dev.off()