EC313 Lecture 12

J James Reade

10/03/2015

Introduction

Midterm II Review

Project: Details

Project: Steps

  1. Determine your idea.
    • Make it something you’re really interested in.
    • Talk to me about it, particularly if speculative.
  2. Research your idea:
    • Can you collect data?
    • Does your variable satisfy properties set out in lecture 1?
  3. Start constructing forecast models.
    • What are the properties of your data?
    • What particular features of your data do you need to account for? How will you do it?
    • What forecast models are most appropriate/inappropriate?

Data Sources

What about Lectures and Classes?

Forecasting the General Election

load.my.data <- function(loc) {
  data <- read.csv(loc,stringsAsFactors=F)
  return(data)
}
loc0 <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/"
loc <- paste(loc0,"seats_1945_2010.csv",sep="")
print(loc)
## [1] "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/seats_1945_2010.csv"
all.outcomes <- load.my.data(loc)
clean.data <- function(data) {
  require(zoo)
  colnames(data)[1] <- "Date.Time"
  colnames(data)[-1] <- gsub("^.*?\\d+_(\\w+)[.]href.*?$","\\1",colnames(data)[-1])
  data <- data[,-NCOL(data)]
  data$Date.Time <- as.Date(substring(data$Date.Time,2),"%Y-%m-%d")
  data <- data[is.na(data$Date.Time)==F,]
  data <- data[order(data$Date.Time),]
  for(i in colnames(data)[nchar(colnames(data))==2]) {
    data[,i] <- gsub("<div class=.*?>(.*?)</div>","\\1",data[,i])
    data[,i] <- gsub("<div class=.*?>(.*?)\\s+$","\\1",data[,i])
    data[,i] <- gsub(">","",data[,i])
    data[,i] <- gsub("^\\s+|\\s+$","",data[,i])
    #need to turn fractional odds into decimal odds
    numerator <- gsub("(\\d+)/(\\d+)$","\\1",data[,i])
    denominator <- gsub("(\\d+)/(\\d+)$","\\2",data[,i])
    data[,i] <- as.numeric(numerator)+1
    data[numerator!=denominator,i] <- as.numeric(numerator[numerator!=denominator])/as.numeric(denominator[numerator!=denominator])+1
    data[,i] <- na.locf(data[,i],na.rm=F)
  }
  return(data)
}
loc <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/betting/2015-03-09/"
markets <- dir(loc) #this command lists all files/folders in a location

loc3 <- paste(loc,markets[3],sep="") #here I'm taking just the third market, overall-majority
maj.types <- dir(loc3,pattern="*.csv") #getting all files that end in .csv
maj.type <- data.frame() #setting up empty data frame to fill up with the contents of the files we're 
#going to open; idea is we want all data on different party majorities in same file
for(i in maj.types) { #look on the different files
  temp <- read.csv(paste(loc3,i,sep="/"),stringsAsFactors=F) #open the data - could use load.my.data() here
  temp <- clean.data(temp) #clean up the data using the function above
  temp$maj.typ <- gsub("-majority[.]csv","",i) #create a variable that says what kind of majority this file is for
  maj.type <- rbind(maj.type,temp) #append data on the bottom of the existing dataset
}
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
maj.type$mean <- rowMeans(maj.type[,nchar(colnames(maj.type))==2],na.rm=T)
plot(maj.type$Date.Time[maj.type$maj.typ=="no-overall"],1/maj.type$mean[maj.type$maj.typ=="no-overall"],
     ylim=range(0,1),xlim=range(maj.type$Date.Time),type="l",ylab="Implied Probability",xlab="Date",
     main="Implied Bookmaker Probabilities for Overall Majority")
lines(maj.type$Date.Time[maj.type$maj.typ=="labour"],1/maj.type$mean[maj.type$maj.typ=="labour"],col="red")
lines(maj.type$Date.Time[maj.type$maj.typ=="conservative"],1/maj.type$mean[maj.type$maj.typ=="conservative"],
      col="blue")
lines(maj.type$Date.Time[maj.type$maj.typ=="ukip"],1/maj.type$mean[maj.type$maj.typ=="ukip"],col="purple")
lines(maj.type$Date.Time[maj.type$maj.typ=="liberal-democrat"],
      1/maj.type$mean[maj.type$maj.typ=="liberal-democrat"],col="orange")
legend("topleft",ncol=3,lty=1,col=c("black","red","blue","purple","orange"),legend=c("No overall","Labour","Tory","Ukip","Lib Dem"))

loc <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/"
loc0 <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/"
all.outcomes <- read.csv(paste(loc0,"seats_1945_2010.csv",sep=""),stringsAsFactors=F)
plot(all.outcomes$X.1,all.outcomes$Lab,col="red",type="o",main="Seats Won Since WW2",
     ylab="Seats",xlab="Year",
     ylim=range(all.outcomes[,c("Con","Lab","Lib","Others")]))
lines(all.outcomes$X.1,all.outcomes$Con,col="blue",type="o")
lines(all.outcomes$X.1,all.outcomes$Lib,col="orange",type="o")
lines(all.outcomes$X.1,all.outcomes$Others,col="grey",type="o")
legend("bottomleft",lty=1,col=c("red","blue","orange","grey"),legend=c("Labour","Tory","LibDem","Other"),
       bty="n")

loc <- "/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/betting/2015-03-09/"
markets <- dir(loc)
loc2 <- paste(loc,markets[grep("most-seats",markets)],sep="")
parties <- dir(loc2,pattern="*.csv")

con.most <- read.csv(paste(loc2,parties[grep("conservatives",parties)],sep="/"),stringsAsFactors=F)
con.most <- clean.data(con.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
lab.most <- read.csv(paste(loc2,parties[grep("labour",parties)],sep="/"),stringsAsFactors=F)
lab.most <- clean.data(lab.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
ld.most <- read.csv(paste(loc2,parties[grep("liberal-democrats",parties)],sep="/"),stringsAsFactors=F)
ld.most <- clean.data(ld.most)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
ukip.most <- read.csv(paste(loc2,parties[grep("ukip",parties)],sep="/"),stringsAsFactors=F)
ukip.most <- clean.data(ukip.most)
## Warning: NAs introduced by coercion
lab.most$mean <- rowMeans(lab.most[,nchar(colnames(lab.most))==2],na.rm=T)
con.most$mean <- rowMeans(con.most[,nchar(colnames(con.most))==2],na.rm=T)

plot(con.most$Date.Time,con.most$B3,type="l",
     ylim=range(0,1),xlim=range(con.most$Date.Time,na.rm=T),col="white",
     main="Most Seats at GE2015",
     ylab="Implied Probability by bookmaker",xlab="Date")
j=0
blues <- colours()[grep("blue",colours())]
for(i in colnames(con.most)[nchar(colnames(con.most))==2]) {
  j=j+1
  lines(con.most$Date.Time,1/con.most[,i],type="l",col=blues[j])
  #col=rgb(runif(1),runif(1),runif(1))
}
reds <- colours()[grep("red",colours())]
j=0
for(i in colnames(lab.most)[nchar(colnames(lab.most))==2]) {
  j=j+1
  lines(lab.most$Date.Time,1/lab.most[,i],type="l",col=reds[j])
  #col=rgb(runif(1),runif(1),runif(1))
}
lines(lab.most$Date.Time,1/lab.most$mean,col="red",lwd=3,type="l")
lines(con.most$Date.Time,1/con.most$mean,col="blue",lwd=3,type="l")

General Election

National Opinion Polls to Seats?

#need forecast library!
library(forecast)
## Loading required package: timeDate
## This is forecast 5.8
#first, need election outcomes in terms of seats and other bits of information
locn <- "/home/readejj/Dropbox/Research/Big Data/twitter elections/" #location of file
elec.outcomes.0 <- read.csv(paste(locn,"overall_results_1983_2010.csv",sep=""),stringsAsFactors=F)#open file
#this file is in a difficult format for using for analysis - need to essentially pivot it from rows into columns
years <- c(gsub("X(\\d+)","\\1",colnames(elec.outcomes.0)[grep("X\\d+",colnames(elec.outcomes.0))]))
years <- gsub("[.]","-",years)
elec.outcomes <- data.frame("election"=years,
                            "CON.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "CON.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="CON" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),
                            "LAB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "LAB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LAB" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),
                            "LIB.vote"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB" 
                                                                  & elec.outcomes.0$Data=="Sum of Vote",
                                                                  3:13]),
                            "LIB.seats"=as.numeric(elec.outcomes.0[elec.outcomes.0$Party=="LIB" 
                                                                   & elec.outcomes.0$Data=="Sum of Seats",
                                                                   3:13]),stringsAsFactors=F)

#next loading up all the opinion poll data from UK Polling Report
#first list all the files (could use dir for this purpose)
files <- c("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1970_1974.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1974_1979.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1979_1983.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1983_1987.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1987_1992.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1992_1997.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_1997_2001.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2001_2005.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2005_2010.csv",
"/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_2010_2015.csv")

#manually adding in some information about elections covered by this data
election.dates <- data.frame("election"=c("1974-Feb","1974-Oct","1979","1983","1987",
                                          "1992","1997","2001","2005","2010","2015"),
                             "election.date"=as.Date(c("1974-02-28","1974-10-10","1979-05-03",
                                                       "1983-06-09","1987-06-11","1992-04-09",
                                                       "1997-05-01","2001-06-07","2005-05-05",
                                                       "2010-05-06","2015-05-07")),
                             "election.incumbent"=c("CON","LAB","LAB","CON","CON","CON",
                                                    "CON","LAB","LAB","LAB","CON"),
                             "maj.req" = c(318,318,318,326,326,326,330,330,326,326,326),
                             stringsAsFactors=F)

ukpolls <- data.frame(stringsAsFactors=F) #empty data frame in which to combine all polling data
for (i in 1:NROW(files)) {
  temp <- read.csv(files[i],stringsAsFactors=F) #load data
  this.election <- gsub("/home/readejj/Dropbox/Research/Misc/LVW Projects/UK Elections/data/UK_polls_\\d+_(\\d+).csv","\\1",files[i]) #extract from the file name what the election in question is
  
  if(this.election=="1974") { #if it's 1974 we need to be careful as two elections in 1974
    temp$election[temp$Survey.End.Date<as.Date("1974-02-28")] <- "1974-Feb"
    temp$election[temp$Survey.End.Date>=as.Date("1974-02-28")] <- "1974-Oct"
  } else {
    temp$election <- this.election
  }
  
  if(i>1){ #this section within the if statement makes sure all datasets have right variable names (colnames)
    colnames(temp)[colnames(temp)=="LD...."] <- "LDEM...." #change the various incarnations of libdems into one
    extra.vars.0 <- setdiff(colnames(temp),colnames(ukpolls))
    if (NROW(extra.vars.0)>0){
      for(j in 1:NROW(extra.vars.0)){
        ukpolls[[extra.vars.0[j]]] <- NA
      }
    }    
    extra.vars.1 <- setdiff(colnames(ukpolls),colnames(temp))
    if (NROW(extra.vars.1)>0){
      for(j in 1:NROW(extra.vars.1)){
        temp[[extra.vars.1[j]]] <- NA
      }
    }    
  }
  ukpolls <- rbind(ukpolls,temp) #appending data together
}
#with final dataset, merge in the election information from earlier on
ukpolls <- merge(ukpolls,election.dates[grep("election",colnames(election.dates))],by=c("election"),all.x=T)
ukpolls <- ukpolls[is.na(ukpolls$election)==F,] #get rid of rows with no information

#next few lines correct some problems with how the data is reported - format of date
ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"] <- gsub("(\\d+)/(\\d+)/(\\d+)","20\\3-\\2-\\1",ukpolls$Survey.End.Date[ukpolls$election=="2005" | ukpolls$election=="2001"])
ukpolls$Survey.End.Date <- gsub("2099-","1999-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2098-","1998-",ukpolls$Survey.End.Date)
ukpolls$Survey.End.Date <- gsub("2097-","1997-",ukpolls$Survey.End.Date)

ukpolls$Survey.End.Date <- as.Date(ukpolls$Survey.End.Date) #format date for use in plots etc

#create smoothed poll average using ets method
con.ets <- ets(ukpolls$CON....[is.na(ukpolls$CON....)==F],model="ANN",alpha=0.1)
ukpolls$con.ma[is.na(ukpolls$CON....)==F] <- con.ets$fitted
lab.ets <- ets(ukpolls$LAB....[is.na(ukpolls$LAB....)==F],model="ANN",alpha=0.1)
ukpolls$lab.ma[is.na(ukpolls$LAB....)==F] <- lab.ets$fitted

#plotting. first command, jpeg(), which is commented out, is how to open a file to save graph to

#jpeg("/home/readejj/Dropbox/Research/Big Data/twitter elections/blog-materials/poll_2010_2015.jpg",height=7,width=10,units="in",res=600)
plot(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=1,col="blue",
     ylim=range(c(ukpolls$CON....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
                  ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")]),na.rm=T),
     main="Opinion Poll Shares, Labour and Tories, 2010-2015",ylab="Poll Vote Share",xlab="Date")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
      ukpolls$LAB....[ukpolls$Survey.End.Date>as.Date("2010-05-06")],pch=0,col="red",type="p")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$con.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkblue")
lines(ukpolls$Survey.End.Date[ukpolls$Survey.End.Date>as.Date("2010-05-06")],
     ukpolls$lab.ma[ukpolls$Survey.End.Date>as.Date("2010-05-06")],type="l",lwd=3,col="darkred")