> library(car)
> library(caret)
> library(ggplot2)
> library(reshape2)
> library(gridExtra)
> library(choroplethr)
> library(choroplethrMaps)

Bayesian approach to predicting the US elections.

> data <- read.csv("final_data.csv")

> df = data.frame(region=tolower(data$state), value=data$winner_2008)
> map_2008 <- state_choropleth(df)
> map_2008 <- map_2008 + scale_fill_manual(name=" ", values=c("red", "blue"))
> map_2008 <- map_2008 + ggtitle("Actual results for the 2008 election")
> 
> df = data.frame(region=tolower(data$state), value=data$winner_2012)
> map_2012 <- state_choropleth(df)
> map_2012 <- map_2012 + scale_fill_manual(name=" ", values=c("red", "blue"))
> map_2012 <- map_2012 + ggtitle("Actual results for the 2012 election")

A function to simulate n elections. Input a list of state win probabilities probabilities and number of simulations n. The result is a binomial random variable with the given probability of 1 i.e democrat win.

> simulate_election <- function(probabilities, n){
+     results <- numeric(n)
+     for(i in 1:n){
+         votes = 0
+         for(j in 1:nrow(data)){
+             state <- data$state[j]
+             if(rbinom(1, 1, probabilities[j]) == 1){
+                 votes <- votes + data[data$state==state, ]$electoral_votes
+             }
+         }
+         results[i] <- votes
+     }
+     results
+ }

Use an alternate Bayesian approach to predicting the 2012 election results, post-facto, as a way to develop a model for predicting future election results. Start with a flat prior distribution for each state.

> states <- data$state
> alpha0 <- rep(1, 51)
> beta0 <- rep(1, 51)

Update using 2004 data to obtain posterior distribution. We use percentage of democrats in each state.

> alpha1 <- alpha0 + data$democrat_04 
> beta1  <- beta0 + data$republican_04

Calculate the state win probabilities for democrats in 2008 prob_win_08

> n <- nrow(data)
> prob_win_08 <- numeric(n)
> for (i in 1:n) {
+     prob_win_08[i] <- round(1 - pbeta(0.5, alpha1[i], beta1[i]), digits=3)
+ }

Predict a democratic win in 2008 for a state if the computed probability of a win is >= 0.50

> states <- data$state
> pred_democ_win_08 <- states[prob_win_08 >= 0.50]
> actual_democ_win_08 <- data[data$winner_2008 == 1,]$state
> setdiff(actual_democ_win_08, pred_democ_win_08)

## [1] "Colorado"       "Florida"        "Indiana"        "Iowa"          
## [5] "Nevada"         "New Mexico"     "North Carolina" "Ohio"          
## [9] "Virginia"

The prediction misses the following 9 states: Colorado, Florida, Indiana, Iowa, Nevada, New Mexico, North Carolina, Ohio and Virginia.

Compare posterior probabilities of a democrat winning the 2008 election with the actual results of the 2008 election.

> df = data.frame(region=tolower(data$state), value=prob_win_08)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Posterior probabilities of a democrat winning the 2008 election")
> 
> grid.arrange(map, map_2008, ncol=2)

Update using 2008 data to obtain posterior distribution. We use percentage of democrats in each state.

> alpha2 <- alpha1 + data$democrat_08
> beta2  <- beta1 + data$republican_08

Calculate the state win probabilities for democrats in 2012 prob_win_12

> prob_win_12 <- numeric(n)
> for (i in 1:n) {
+     prob_win_12[i] <- round(1 - pbeta(0.5, alpha2[i], beta2[i]), digits=3)
+ }

Predict a democratic win in 2012 for a state if the computed probability of a win is >= 0.50

> pred_democ_win_12 <- states[prob_win_12 >= 0.5]
> actual_democ_win_12 <- data[data$winner_2012 == 1,]$state
> setdiff(actual_democ_win_12, pred_democ_win_12)

## [1] "Florida"  "Virginia"

The difference between predicted wins and actual is far narrower this time with misses on the following states: Florida and Virginia.

We compare posterior probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.

> df = data.frame(region=tolower(data$state), value=prob_win_12)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Posterior probabilities of a democrat winning the 2012 election")
> 
> grid.arrange(map, map_2012, ncol=2)

Update using gallup 2012 data to obtain posterior distribution. We use gallup sampling results before the 2012 elections in each state.

> alpha3 <- alpha2 + data$gallup_democrat_2012
> beta3  <- beta2 + data$gallup_republican_2012

Calculate the probability of a democratic win in 2012 based on this posterior distribution for each state.

> rev_prob_win_12 <- numeric(n)
> for (i in 1:n) {
+     rev_prob_win_12[i] <- round(1 - pbeta(0.5, alpha3[i], beta3[i]), digits=3)
+ }

Predict a democratic win in 2012 for a state if the computed revised probability of a win is >= 0.50

> rev_pred_democ_win_12 <- states[rev_prob_win_12 >= 0.5]
> actual_democ_win_12 <- data[data$winner_2012 == 1,]$state
> setdiff(actual_democ_win_12, rev_pred_democ_win_12)

## [1] "Colorado" "Florida"  "Virginia"

The Gallup data based revision includes Colorado among the states where prediction differs from the actual outcome.

Compare posterior probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.

> df = data.frame(region=tolower(data$state), value=rev_prob_win_12)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Revised posterior probabilities of a democrat winning the 2012 election")
> 
> grid.arrange(map, map_2012, ncol=2)

Simulate 10^3 elections using the state probabilities from our Bayesian Analysis above.

> N = 10^3
> results <- simulate_election(prob_win_12, N)
> probability_obama <- sum(results >= 269)/N; probability_obama

## [1] 0.824

> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.25)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "yellow")); plot

On November 7th, 2012, Predictwise published a prediction that accurately predicted the number of college votes Obama would get in the 2012 election: http://www.predictwise.com/results/2012/president

> N = 10^3
> predictwise <- data$predictwise_democrat
> results <- simulate_election(predictwise, N)
> probability_obama <- sum(results >= 269)/N; probability_obama

## [1] 0.992

> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.25)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "yellow")); plot

Compare predictwise probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.

> df = data.frame(region=tolower(data$state), value=data$predictwise_democrat)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Predictwise probabilities of a democrat winning the 2012 election")
> 
> grid.arrange(map, map_2012, ncol=2)

Conclusion: The Bayesian analysis produces good results; however, the Predictwise data, which combines data from multiple sources to arrive at the probabilities, seems to do better.

Logistic Regression: Democratic Advantage vs. Election Outcome

Try to predict the 2012 election using Logistic Regression by regressing the winner_2008 on gallup_democrat_advantage_08.

> model <- glm(winner_2008 ~ gallup_democrat_advantage_08, data=data, family=binomial)
> 
> x <- seq(min(data$gallup_democrat_advantage_2012), max(data$gallup_democrat_advantage_2012))
> y <- exp(model$coef[1] + model$coef[2]*x)/(1 + exp(model$coef[1] + model$coef[2]*x))
> 
> new_df <- data.frame(gallup_democrat_advantage_08=data$gallup_democrat_advantage_2012)
> pred <- as.numeric(predict(model, newdata=new_df, type="response"))
> 
> plot <- ggplot(data=NULL, aes(x=x, y=y))
> plot <- plot + geom_line()
> plot <- plot + geom_point(data=NULL, 
+                           aes(x=data$gallup_democrat_advantage_2012, y=pred), 
+                           col="blue")
> plot <- plot + ggtitle("Logistic regresion curve for 2012 predicted values")
> plot <- plot + xlab("gallup_democrat_advantage")
> plot <- plot + ylab("probability"); plot

> N = 10^3
> results <- simulate_election(pred, N)
> probability_obama <- sum(results >= 269)/N; probability_obama

## [1] 1

> plot <- ggplot(data=NULL, aes(x=results))
> plot <- plot + geom_histogram(col="white", fill="blue", binwidth=1.75)
> plot <- plot + geom_vline(x=c(269, 332, mean(results)), col=c("black", "red", "blue")); plot

This model does not give us good results.

Compare the logistic regresion probabilities of a democrat winning the 2012 election with the actual results of the 2012 election.

> df = data.frame(region=tolower(data$state), value=pred)
> map <- state_choropleth(df, num_colors=1)
> map <- map + scale_fill_continuous(name="", low="red", high="blue")
> map <- map + ggtitle("Logistic regresion probabilities of a democrat winning the 2012 election")
> 
> grid.arrange(map, map_2012, ncol=2)