Week_3_Recitation_Election

Data Sources

https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/PollingData.csv http://www.realclearpolitics.com/ https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/Unit3_Recitation.R

IMPORTANT NOTE: On some operating systems, the imputed results will be slightly different even if you set the random seed. This is just due to the randomess involved in the multiple imputation process. We’ve provided the imputed data here: PollingData_Imputed.csv. If your results are not matching after the imputation, you can use this dataset instead.

https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/PollingData_Imputed.csv

setwd("/home/oner/Dersler/MIT_15.071/Week3")
polling <- read.csv("PollingData.csv")
str(polling)

## 'data.frame':    145 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 3 4 4 4 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2012 2004 2008 2012 ...
##  $ Rasmussen : int  11 21 NA 16 5 5 8 7 10 NA ...
##  $ SurveyUSA : int  18 25 NA NA 15 NA NA 5 NA NA ...
##  $ DiffCount : int  5 5 1 6 8 9 4 8 5 2 ...
##  $ PropR     : num  1 1 1 1 1 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 1 1 ...

table(polling$Year)

## 
## 2004 2008 2012 
##   50   50   45

# There are decent number of missing data in SurveyUSA and Rasmussen
summary(polling)

##          State          Year        Rasmussen          SurveyUSA       
##  Arizona    :  3   Min.   :2004   Min.   :-41.0000   Min.   :-33.0000  
##  Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.0000   1st Qu.:-11.7500  
##  California :  3   Median :2008   Median :  1.0000   Median : -2.0000  
##  Colorado   :  3   Mean   :2008   Mean   :  0.0404   Mean   : -0.8243  
##  Connecticut:  3   3rd Qu.:2012   3rd Qu.:  8.5000   3rd Qu.:  8.0000  
##  Florida    :  3   Max.   :2012   Max.   : 39.0000   Max.   : 30.0000  
##  (Other)    :127                  NA's   :46         NA's   :71        
##    DiffCount           PropR          Republican    
##  Min.   :-19.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  1.000   Median :0.6250   Median :1.0000  
##  Mean   : -1.269   Mean   :0.5259   Mean   :0.5103  
##  3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   : 11.000   Max.   :1.0000   Max.   :1.0000  
##

# We'll be using Multiple Imputation by Chained Equations (This is simply be done by loading "mice" package )
#install.packages("mice")

library(mice)

## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10

# we'll create a new data frame with limited variables

simple <- polling[c("Rasmussen","SurveyUSA", "PropR", "DiffCount")]
summary(simple)

##    Rasmussen          SurveyUSA            PropR          DiffCount      
##  Min.   :-41.0000   Min.   :-33.0000   Min.   :0.0000   Min.   :-19.000  
##  1st Qu.: -8.0000   1st Qu.:-11.7500   1st Qu.:0.0000   1st Qu.: -6.000  
##  Median :  1.0000   Median : -2.0000   Median :0.6250   Median :  1.000  
##  Mean   :  0.0404   Mean   : -0.8243   Mean   :0.5259   Mean   : -1.269  
##  3rd Qu.:  8.5000   3rd Qu.:  8.0000   3rd Qu.:1.0000   3rd Qu.:  4.000  
##  Max.   : 39.0000   Max.   : 30.0000   Max.   :1.0000   Max.   : 11.000  
##  NA's   :46         NA's   :71

set.seed(144)


imputed <- complete(mice(simple))

## 
##  iter imp variable
##   1   1  Rasmussen  SurveyUSA
##   1   2  Rasmussen  SurveyUSA
##   1   3  Rasmussen  SurveyUSA
##   1   4  Rasmussen  SurveyUSA
##   1   5  Rasmussen  SurveyUSA
##   2   1  Rasmussen  SurveyUSA
##   2   2  Rasmussen  SurveyUSA
##   2   3  Rasmussen  SurveyUSA
##   2   4  Rasmussen  SurveyUSA
##   2   5  Rasmussen  SurveyUSA
##   3   1  Rasmussen  SurveyUSA
##   3   2  Rasmussen  SurveyUSA
##   3   3  Rasmussen  SurveyUSA
##   3   4  Rasmussen  SurveyUSA
##   3   5  Rasmussen  SurveyUSA
##   4   1  Rasmussen  SurveyUSA
##   4   2  Rasmussen  SurveyUSA
##   4   3  Rasmussen  SurveyUSA
##   4   4  Rasmussen  SurveyUSA
##   4   5  Rasmussen  SurveyUSA
##   5   1  Rasmussen  SurveyUSA
##   5   2  Rasmussen  SurveyUSA
##   5   3  Rasmussen  SurveyUSA
##   5   4  Rasmussen  SurveyUSA
##   5   5  Rasmussen  SurveyUSA

# 5 rounds of imputation ran and all NA's were filled in
summary(imputed)

##    Rasmussen         SurveyUSA           PropR          DiffCount      
##  Min.   :-41.000   Min.   :-33.000   Min.   :0.0000   Min.   :-19.000  
##  1st Qu.: -8.000   1st Qu.:-11.000   1st Qu.:0.0000   1st Qu.: -6.000  
##  Median :  3.000   Median :  1.000   Median :0.6250   Median :  1.000  
##  Mean   :  1.731   Mean   :  1.517   Mean   :0.5259   Mean   : -1.269  
##  3rd Qu.: 11.000   3rd Qu.: 18.000   3rd Qu.:1.0000   3rd Qu.:  4.000  
##  Max.   : 39.000   Max.   : 30.000   Max.   :1.0000   Max.   : 11.000

# Copy values back to original data
polling$Rasmussen <- imputed$Rasmussen
polling$SurveyUSA <-imputed$SurveyUSA

summary(polling)

##          State          Year        Rasmussen         SurveyUSA      
##  Arizona    :  3   Min.   :2004   Min.   :-41.000   Min.   :-33.000  
##  Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.000   1st Qu.:-11.000  
##  California :  3   Median :2008   Median :  3.000   Median :  1.000  
##  Colorado   :  3   Mean   :2008   Mean   :  1.731   Mean   :  1.517  
##  Connecticut:  3   3rd Qu.:2012   3rd Qu.: 11.000   3rd Qu.: 18.000  
##  Florida    :  3   Max.   :2012   Max.   : 39.000   Max.   : 30.000  
##  (Other)    :127                                                     
##    DiffCount           PropR          Republican    
##  Min.   :-19.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  1.000   Median :0.6250   Median :1.0000  
##  Mean   : -1.269   Mean   :0.5259   Mean   :0.5103  
##  3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   : 11.000   Max.   :1.0000   Max.   :1.0000  
##

# Here my results were not the same as the results the instructer has so I load teh data from already imputed 
polling <- read.csv("PollingData_Imputed.csv")

A Sophisticated Baseline method

#We'll train the data from 2004 and 2008 elections and test on 2012 elections
Train <- subset(polling, Year == 2004 | Year == 2008)
Test <- subset(polling, Year == 2012)

str(Train)

## 'data.frame':    100 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2004 2008 2004 2008 ...
##  $ Rasmussen : int  11 21 19 16 5 5 7 10 -11 -27 ...
##  $ SurveyUSA : int  18 25 21 18 15 3 5 7 -11 -24 ...
##  $ DiffCount : int  5 5 1 6 8 9 8 5 -8 -5 ...
##  $ PropR     : num  1 1 1 1 1 1 1 1 0 0 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 0 0 ...

str(Test)

## 'data.frame':    45 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 3 4 5 6 7 9 10 11 12 13 ...
##  $ Year      : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ Rasmussen : int  8 13 -12 3 -7 2 5 -22 31 -22 ...
##  $ SurveyUSA : int  5 21 -14 -2 -13 0 8 -24 24 -16 ...
##  $ DiffCount : int  4 2 -6 -5 -8 6 4 -2 1 -5 ...
##  $ PropR     : num  0.833 1 0 0.308 0 ...
##  $ Republican: int  1 1 0 0 0 0 1 0 1 0 ...

# Among 100 observations 53 republicans won in Train data set. Therefore our simple baseline method will always predict Republicans win (with 0.53 accuracy)
table(Train$Republican)

## 
##  0  1 
## 47 53

# A more complicated baseline would be to choose one poll (Rasmussen for example) make a prediction based on that. 
# Sign function returns 1 if value is positrive returns -1 if value is negative or 0 if value is 0

table(sign(Train$Rasmussen))

## 
## -1  0  1 
## 42  2 56

# This baseline calculated that in 42 instances democrats win and 56 instances Republican wins and in 2 instances it is inconclusive



# Below rows are the true outcome (0 for democrat and 1 for republican),  columns are predictions of rasmussen(-1 for democrat 0 for inconclusive and 1 for republican)
table(Train$Republican, sign(Train$Rasmussen))

##    
##     -1  0  1
##   0 42  1  4
##   1  0  1 52

# This table makes 4 mistakes (predicted Republican wins but indeed Democrats win), much better than the naive baseline method

Logistic Regression Models

# We can suspect there could be multi co linearity, this makes sense since basicly they are measuring the same thing which is how strong a republican candidate performing in the particular state


# cor(Train) doesn't works since state is not numeric

str(Train)

## 'data.frame':    100 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2004 2008 2004 2008 ...
##  $ Rasmussen : int  11 21 19 16 5 5 7 10 -11 -27 ...
##  $ SurveyUSA : int  18 25 21 18 15 3 5 7 -11 -24 ...
##  $ DiffCount : int  5 5 1 6 8 9 8 5 -8 -5 ...
##  $ PropR     : num  1 1 1 1 1 1 1 1 0 0 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 0 0 ...

cor(Train[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount", "Republican")])

##            Rasmussen SurveyUSA     PropR DiffCount Republican
## Rasmussen  1.0000000 0.9365837 0.8431180 0.5109169  0.7929252
## SurveyUSA  0.9365837 1.0000000 0.8616478 0.5222585  0.8101645
## PropR      0.8431180 0.8616478 1.0000000 0.8273785  0.9484204
## DiffCount  0.5109169 0.5222585 0.8273785 1.0000000  0.8092777
## Republican 0.7929252 0.8101645 0.9484204 0.8092777  1.0000000

# Looks like Republican is mostly correlated with PropR, and let's base the model using PropR

mod1 <- glm(Republican ~ PropR, data = Train, family = "binomial")
summary(mod1)

## 
## Call:
## glm(formula = Republican ~ PropR, family = "binomial", data = Train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.22880  -0.06541   0.10260   0.10260   1.37392  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -6.146      1.977  -3.108 0.001882 ** 
## PropR         11.390      3.153   3.613 0.000303 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.269  on 99  degrees of freedom
## Residual deviance:  15.772  on 98  degrees of freedom
## AIC: 19.772
## 
## Number of Fisher Scoring iterations: 8

#let's do the prediction using the model on training set
pred1 <- predict(mod1, type="response")
table(Train$Republican, pred1>=0.5)

##    
##     FALSE TRUE
##   0    45    2
##   1     2   51

# it makes 4 mistakes just as like the smart baseline model did in previous example 

# Let's add another variable to our model and see if we can get a better prediction
cor(Train[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount", "Republican")])

##            Rasmussen SurveyUSA     PropR DiffCount Republican
## Rasmussen  1.0000000 0.9365837 0.8431180 0.5109169  0.7929252
## SurveyUSA  0.9365837 1.0000000 0.8616478 0.5222585  0.8101645
## PropR      0.8431180 0.8616478 1.0000000 0.8273785  0.9484204
## DiffCount  0.5109169 0.5222585 0.8273785 1.0000000  0.8092777
## Republican 0.7929252 0.8101645 0.9484204 0.8092777  1.0000000

# We better look and find if there are two variables with less correlation so it will add value to our model, for example it doesn't make sense to add Rasmussen and SurveyUSA together since they are correlated 0.936 with each other. 

# In our second model we pick SurveyUSA and Diffcount (correlation of 0.52)

mod2 <- glm(Republican ~ SurveyUSA + DiffCount, data = Train, family = "binomial")
pred2 <- predict(mod2,type="response")
table(Train$Republican, pred2 >=0.5)

##    
##     FALSE TRUE
##   0    45    2
##   1     1   52

# this time we made 1 less mistake, not too impresive but slighly better


summary(mod2)

## 
## Call:
## glm(formula = Republican ~ SurveyUSA + DiffCount, family = "binomial", 
##     data = Train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.04741  -0.00977   0.00561   0.03751   1.32999  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.6827     1.0468  -0.652   0.5143  
## SurveyUSA     0.3309     0.2226   1.487   0.1371  
## DiffCount     0.6619     0.3663   1.807   0.0708 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.269  on 99  degrees of freedom
## Residual deviance:  11.154  on 97  degrees of freedom
## AIC: 17.154
## 
## Number of Fisher Scoring iterations: 9

Test Set predictions

# Let's use our smart baseline (Use Rasmussen to predict outcome)
table(Test$Republican, sign(Test$Rasmussen))

##    
##     -1  0  1
##   0 18  2  4
##   1  0  0 21

# 4 mistakes and 2 inconclusive results. This is our baseline against our model

TestPrediction <- predict(mod2, newdata=Test,type="response")

table(Test$Republican, TestPrediction>=0.5)

##    
##     FALSE TRUE
##   0    23    1
##   1     0   21

#Let's see error we made, which state predicted as Republican but indeed elected as Democrat 
subset(Test, TestPrediction>=0.5 & Republican ==0)

##      State Year Rasmussen SurveyUSA DiffCount     PropR Republican
## 24 Florida 2012         2         0         6 0.6666667          0

# No suprise! Florida. Rasmussen give 2 percent more for Republican and SurveyUSA predicted as equal, DiffCount predicted 6 more polls republicans than democrats, 2/3 of the votes predicted republicans would win, but Republicans lost at the end

Week_3_Recitation_Election_Forecasting

Data Sources