> library(car)
> library(caret)
> library(ggplot2)
> library(reshape2)
> library(gridExtra)
> library(choroplethr)
> library(choroplethrMaps)
> data <- read.csv("final_data.csv")
> df = data.frame(region=tolower(data$state), value=data$winner_2008)
> map_2008 <- state_choropleth(df)
> map_2008 <- map_2008 + scale_fill_manual(name=" ", values=c("red", "blue"))
> map_2008 <- map_2008 + ggtitle("Actual results for the 2008 election")
>
> df = data.frame(region=tolower(data$state), value=data$winner_2012)
> map_2012 <- state_choropleth(df)
> map_2012 <- map_2012 + scale_fill_manual(name=" ", values=c("red", "blue"))
> map_2012 <- map_2012 + ggtitle("Actual results for the 2012 election")
A function to simulate n elections. Input a list of state win probabilities probabilities and number of simulations n. The result is a binomial random variable with the given probability of 1 i.e democrat win.
> simulate_election <- function(probabilities, n){
+ results <- numeric(n)
+ for(i in 1:n){
+ votes = 0
+ for(j in 1:nrow(data)){
+ state <- data$state[j]
+ if(rbinom(1, 1, probabilities[j]) == 1){
+ votes <- votes + data[data$state==state, ]$electoral_votes
+ }
+ }
+ results[i] <- votes
+ }
+ results
+ }
Use an alternate Bayesian approach to predicting the 2012 election results, post-facto, as a way to develop a model for predicting future election results. Start with a flat prior distribution for each state.
> states <- data$state
> alpha0 <- rep(1, 51)
> beta0 <- rep(1, 51)
Update using 2004 data to obtain posterior distribution. We use percentage of democrats in each state.
> alpha1 <- alpha0 + data$democrat_04
> beta1 <- beta0 + data$republican_04
Calculate the state win probabilities for democrats in 2008 prob_win_08
> n <- nrow(data)
> prob_win_08 <- numeric(n)
> for (i in 1:n) {
+ prob_win_08[i] <- round(1 - pbeta(0.5, alpha1[i], beta1[i]), digits=3)
+ }
Predict a democratic win in 2008 for a state if the computed probability of a win is >= 0.50
> states <- data$state
> pred_democ_win_08 <- states[prob_win_08 >= 0.50]
> actual_democ_win_08 <- data[data$winner_2008 == 1,]$state
> setdiff(actual_democ_win_08, pred_democ_win_08)
## [1] "Colorado" "Florida" "Indiana" "Iowa"
## [5] "Nevada" "New Mexico" "North Carolina" "Ohio"
## [9] "Virginia"
The prediction misses the following 9 states: Colorado, Florida, Indiana, Iowa, Nevada, New Mexico, North Carolina, Ohio and Virginia.
Compare posterior probabilities of a democrat winning the 2008 election with the actual results of the 2008 election.
> df = data.frame(region=tolower(data$state), value=prob_win_08)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Posterior probabilities of a democrat winning the 2008 election")
>
> grid.arrange(map, map_2008, ncol=2)
Update using 2008 data to obtain posterior distribution. We use percentage of democrats in each state.
> alpha2 <- alpha1 + data$democrat_08
> beta2 <- beta1 + data$republican_08
Calculate the state win probabilities for democrats in 2012 prob_win_12
> prob_win_12 <- numeric(n)
> for (i in 1:n) {
+ prob_win_12[i] <- round(1 - pbeta(0.5, alpha2[i], beta2[i]), digits=3)
+ }
Predict a democratic win in 2012 for a state if the computed probability of a win is >= 0.50
> pred_democ_win_12 <- states[prob_win_12 >= 0.5]
> actual_democ_win_12 <- data[data$winner_2012 == 1,]$state
> setdiff(actual_democ_win_12, pred_democ_win_12)
## [1] "Florida" "Virginia"
The difference between predicted wins and actual is far narrower this time with misses on the following states: Florida and Virginia.
We compare posterior probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.
> df = data.frame(region=tolower(data$state), value=prob_win_12)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Posterior probabilities of a democrat winning the 2012 election")
>
> grid.arrange(map, map_2012, ncol=2)
Update using gallup 2012 data to obtain posterior distribution. We use gallup sampling results before the 2012 elections in each state.
> alpha3 <- alpha2 + data$gallup_democrat_2012
> beta3 <- beta2 + data$gallup_republican_2012
Calculate the probability of a democratic win in 2012 based on this posterior distribution for each state.
> rev_prob_win_12 <- numeric(n)
> for (i in 1:n) {
+ rev_prob_win_12[i] <- round(1 - pbeta(0.5, alpha3[i], beta3[i]), digits=3)
+ }
Predict a democratic win in 2012 for a state if the computed revised probability of a win is >= 0.50
> rev_pred_democ_win_12 <- states[rev_prob_win_12 >= 0.5]
> actual_democ_win_12 <- data[data$winner_2012 == 1,]$state
> setdiff(actual_democ_win_12, rev_pred_democ_win_12)
## [1] "Colorado" "Florida" "Virginia"
The Gallup data based revision includes Colorado among the states where prediction differs from the actual outcome.
Compare posterior probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.
> df = data.frame(region=tolower(data$state), value=rev_prob_win_12)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Revised posterior probabilities of a democrat winning the 2012 election")
>
> grid.arrange(map, map_2012, ncol=2)
Simulate 10^3 elections using the state probabilities from our Bayesian Analysis above.
> N = 10^3
> results <- simulate_election(prob_win_12, N)
> probability_obama <- sum(results >= 269)/N; probability_obama
## [1] 0.824
> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.25)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "yellow")); plot
On November 7th, 2012, Predictwise published a prediction that accurately predicted the number of college votes Obama would get in the 2012 election: http://www.predictwise.com/results/2012/president
> N = 10^3
> predictwise <- data$predictwise_democrat
> results <- simulate_election(predictwise, N)
> probability_obama <- sum(results >= 269)/N; probability_obama
## [1] 0.992
> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.25)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "yellow")); plot
Compare predictwise probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.
> df = data.frame(region=tolower(data$state), value=data$predictwise_democrat)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Predictwise probabilities of a democrat winning the 2012 election")
>
> grid.arrange(map, map_2012, ncol=2)
Conclusion: The Bayesian analysis produces good results; however, the Predictwise data, which combines data from multiple sources to arrive at the probabilities, seems to do better.
Try to predict the 2012 election using Logistic Regression by regressing the winner_2008 on gallup_democrat_advantage_08.
> model <- glm(winner_2008 ~ gallup_democrat_advantage_08, data=data, family=binomial)
>
> x <- seq(min(data$gallup_democrat_advantage_2012), max(data$gallup_democrat_advantage_2012))
> y <- exp(model$coef[1] + model$coef[2]*x)/(1 + exp(model$coef[1] + model$coef[2]*x))
>
> new_df <- data.frame(gallup_democrat_advantage_08=data$gallup_democrat_advantage_2012)
> pred <- as.numeric(predict(model, newdata=new_df, type="response"))
>
> plot <- ggplot(data=NULL, aes(x=x, y=y))
> plot <- plot + geom_line()
> plot <- plot + geom_point(data=NULL,
+ aes(x=data$gallup_democrat_advantage_2012, y=pred),
+ col="blue")
> plot <- plot + ggtitle("Logistic regresion curve for 2012 predicted values")
> plot <- plot + xlab("gallup_democrat_advantage")
> plot <- plot + ylab("probability"); plot
> N = 10^3
> results <- simulate_election(pred, N)
> probability_obama <- sum(results >= 269)/N; probability_obama
## [1] 1
> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.75)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "blue")); plot
This model does not give us good results.
Compare the logistic regresion probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.
> df = data.frame(region=tolower(data$state), value=pred)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Logistic regresion probabilities of a democrat winning the 2012 election")
>
> grid.arrange(map, map_2012, ncol=2)