https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/PollingData.csv http://www.realclearpolitics.com/ https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/Unit3_Recitation.R
IMPORTANT NOTE: On some operating systems, the imputed results will be slightly different even if you set the random seed. This is just due to the randomess involved in the multiple imputation process. We’ve provided the imputed data here: PollingData_Imputed.csv. If your results are not matching after the imputation, you can use this dataset instead.
https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/PollingData_Imputed.csv
setwd("/home/oner/Dersler/MIT_15.071/Week3")
polling <- read.csv("PollingData.csv")
str(polling)
## 'data.frame': 145 obs. of 7 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 3 4 4 4 ...
## $ Year : int 2004 2008 2004 2008 2004 2008 2012 2004 2008 2012 ...
## $ Rasmussen : int 11 21 NA 16 5 5 8 7 10 NA ...
## $ SurveyUSA : int 18 25 NA NA 15 NA NA 5 NA NA ...
## $ DiffCount : int 5 5 1 6 8 9 4 8 5 2 ...
## $ PropR : num 1 1 1 1 1 ...
## $ Republican: int 1 1 1 1 1 1 1 1 1 1 ...
table(polling$Year)
##
## 2004 2008 2012
## 50 50 45
# There are decent number of missing data in SurveyUSA and Rasmussen
summary(polling)
## State Year Rasmussen SurveyUSA
## Arizona : 3 Min. :2004 Min. :-41.0000 Min. :-33.0000
## Arkansas : 3 1st Qu.:2004 1st Qu.: -8.0000 1st Qu.:-11.7500
## California : 3 Median :2008 Median : 1.0000 Median : -2.0000
## Colorado : 3 Mean :2008 Mean : 0.0404 Mean : -0.8243
## Connecticut: 3 3rd Qu.:2012 3rd Qu.: 8.5000 3rd Qu.: 8.0000
## Florida : 3 Max. :2012 Max. : 39.0000 Max. : 30.0000
## (Other) :127 NA's :46 NA's :71
## DiffCount PropR Republican
## Min. :-19.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: -6.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 1.000 Median :0.6250 Median :1.0000
## Mean : -1.269 Mean :0.5259 Mean :0.5103
## 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. : 11.000 Max. :1.0000 Max. :1.0000
##
# We'll be using Multiple Imputation by Chained Equations (This is simply be done by loading "mice" package )
#install.packages("mice")
library(mice)
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
# we'll create a new data frame with limited variables
simple <- polling[c("Rasmussen","SurveyUSA", "PropR", "DiffCount")]
summary(simple)
## Rasmussen SurveyUSA PropR DiffCount
## Min. :-41.0000 Min. :-33.0000 Min. :0.0000 Min. :-19.000
## 1st Qu.: -8.0000 1st Qu.:-11.7500 1st Qu.:0.0000 1st Qu.: -6.000
## Median : 1.0000 Median : -2.0000 Median :0.6250 Median : 1.000
## Mean : 0.0404 Mean : -0.8243 Mean :0.5259 Mean : -1.269
## 3rd Qu.: 8.5000 3rd Qu.: 8.0000 3rd Qu.:1.0000 3rd Qu.: 4.000
## Max. : 39.0000 Max. : 30.0000 Max. :1.0000 Max. : 11.000
## NA's :46 NA's :71
set.seed(144)
imputed <- complete(mice(simple))
##
## iter imp variable
## 1 1 Rasmussen SurveyUSA
## 1 2 Rasmussen SurveyUSA
## 1 3 Rasmussen SurveyUSA
## 1 4 Rasmussen SurveyUSA
## 1 5 Rasmussen SurveyUSA
## 2 1 Rasmussen SurveyUSA
## 2 2 Rasmussen SurveyUSA
## 2 3 Rasmussen SurveyUSA
## 2 4 Rasmussen SurveyUSA
## 2 5 Rasmussen SurveyUSA
## 3 1 Rasmussen SurveyUSA
## 3 2 Rasmussen SurveyUSA
## 3 3 Rasmussen SurveyUSA
## 3 4 Rasmussen SurveyUSA
## 3 5 Rasmussen SurveyUSA
## 4 1 Rasmussen SurveyUSA
## 4 2 Rasmussen SurveyUSA
## 4 3 Rasmussen SurveyUSA
## 4 4 Rasmussen SurveyUSA
## 4 5 Rasmussen SurveyUSA
## 5 1 Rasmussen SurveyUSA
## 5 2 Rasmussen SurveyUSA
## 5 3 Rasmussen SurveyUSA
## 5 4 Rasmussen SurveyUSA
## 5 5 Rasmussen SurveyUSA
# 5 rounds of imputation ran and all NA's were filled in
summary(imputed)
## Rasmussen SurveyUSA PropR DiffCount
## Min. :-41.000 Min. :-33.000 Min. :0.0000 Min. :-19.000
## 1st Qu.: -8.000 1st Qu.:-11.000 1st Qu.:0.0000 1st Qu.: -6.000
## Median : 3.000 Median : 1.000 Median :0.6250 Median : 1.000
## Mean : 1.731 Mean : 1.517 Mean :0.5259 Mean : -1.269
## 3rd Qu.: 11.000 3rd Qu.: 18.000 3rd Qu.:1.0000 3rd Qu.: 4.000
## Max. : 39.000 Max. : 30.000 Max. :1.0000 Max. : 11.000
# Copy values back to original data
polling$Rasmussen <- imputed$Rasmussen
polling$SurveyUSA <-imputed$SurveyUSA
summary(polling)
## State Year Rasmussen SurveyUSA
## Arizona : 3 Min. :2004 Min. :-41.000 Min. :-33.000
## Arkansas : 3 1st Qu.:2004 1st Qu.: -8.000 1st Qu.:-11.000
## California : 3 Median :2008 Median : 3.000 Median : 1.000
## Colorado : 3 Mean :2008 Mean : 1.731 Mean : 1.517
## Connecticut: 3 3rd Qu.:2012 3rd Qu.: 11.000 3rd Qu.: 18.000
## Florida : 3 Max. :2012 Max. : 39.000 Max. : 30.000
## (Other) :127
## DiffCount PropR Republican
## Min. :-19.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: -6.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 1.000 Median :0.6250 Median :1.0000
## Mean : -1.269 Mean :0.5259 Mean :0.5103
## 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. : 11.000 Max. :1.0000 Max. :1.0000
##
# Here my results were not the same as the results the instructer has so I load teh data from already imputed
polling <- read.csv("PollingData_Imputed.csv")
A Sophisticated Baseline method
#We'll train the data from 2004 and 2008 elections and test on 2012 elections
Train <- subset(polling, Year == 2004 | Year == 2008)
Test <- subset(polling, Year == 2012)
str(Train)
## 'data.frame': 100 obs. of 7 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Year : int 2004 2008 2004 2008 2004 2008 2004 2008 2004 2008 ...
## $ Rasmussen : int 11 21 19 16 5 5 7 10 -11 -27 ...
## $ SurveyUSA : int 18 25 21 18 15 3 5 7 -11 -24 ...
## $ DiffCount : int 5 5 1 6 8 9 8 5 -8 -5 ...
## $ PropR : num 1 1 1 1 1 1 1 1 0 0 ...
## $ Republican: int 1 1 1 1 1 1 1 1 0 0 ...
str(Test)
## 'data.frame': 45 obs. of 7 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 3 4 5 6 7 9 10 11 12 13 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ Rasmussen : int 8 13 -12 3 -7 2 5 -22 31 -22 ...
## $ SurveyUSA : int 5 21 -14 -2 -13 0 8 -24 24 -16 ...
## $ DiffCount : int 4 2 -6 -5 -8 6 4 -2 1 -5 ...
## $ PropR : num 0.833 1 0 0.308 0 ...
## $ Republican: int 1 1 0 0 0 0 1 0 1 0 ...
# Among 100 observations 53 republicans won in Train data set. Therefore our simple baseline method will always predict Republicans win (with 0.53 accuracy)
table(Train$Republican)
##
## 0 1
## 47 53
# A more complicated baseline would be to choose one poll (Rasmussen for example) make a prediction based on that.
# Sign function returns 1 if value is positrive returns -1 if value is negative or 0 if value is 0
table(sign(Train$Rasmussen))
##
## -1 0 1
## 42 2 56
# This baseline calculated that in 42 instances democrats win and 56 instances Republican wins and in 2 instances it is inconclusive
# Below rows are the true outcome (0 for democrat and 1 for republican), columns are predictions of rasmussen(-1 for democrat 0 for inconclusive and 1 for republican)
table(Train$Republican, sign(Train$Rasmussen))
##
## -1 0 1
## 0 42 1 4
## 1 0 1 52
# This table makes 4 mistakes (predicted Republican wins but indeed Democrats win), much better than the naive baseline method
Logistic Regression Models
# We can suspect there could be multi co linearity, this makes sense since basicly they are measuring the same thing which is how strong a republican candidate performing in the particular state
# cor(Train) doesn't works since state is not numeric
str(Train)
## 'data.frame': 100 obs. of 7 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Year : int 2004 2008 2004 2008 2004 2008 2004 2008 2004 2008 ...
## $ Rasmussen : int 11 21 19 16 5 5 7 10 -11 -27 ...
## $ SurveyUSA : int 18 25 21 18 15 3 5 7 -11 -24 ...
## $ DiffCount : int 5 5 1 6 8 9 8 5 -8 -5 ...
## $ PropR : num 1 1 1 1 1 1 1 1 0 0 ...
## $ Republican: int 1 1 1 1 1 1 1 1 0 0 ...
cor(Train[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount", "Republican")])
## Rasmussen SurveyUSA PropR DiffCount Republican
## Rasmussen 1.0000000 0.9365837 0.8431180 0.5109169 0.7929252
## SurveyUSA 0.9365837 1.0000000 0.8616478 0.5222585 0.8101645
## PropR 0.8431180 0.8616478 1.0000000 0.8273785 0.9484204
## DiffCount 0.5109169 0.5222585 0.8273785 1.0000000 0.8092777
## Republican 0.7929252 0.8101645 0.9484204 0.8092777 1.0000000
# Looks like Republican is mostly correlated with PropR, and let's base the model using PropR
mod1 <- glm(Republican ~ PropR, data = Train, family = "binomial")
summary(mod1)
##
## Call:
## glm(formula = Republican ~ PropR, family = "binomial", data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.22880 -0.06541 0.10260 0.10260 1.37392
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.146 1.977 -3.108 0.001882 **
## PropR 11.390 3.153 3.613 0.000303 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.269 on 99 degrees of freedom
## Residual deviance: 15.772 on 98 degrees of freedom
## AIC: 19.772
##
## Number of Fisher Scoring iterations: 8
#let's do the prediction using the model on training set
pred1 <- predict(mod1, type="response")
table(Train$Republican, pred1>=0.5)
##
## FALSE TRUE
## 0 45 2
## 1 2 51
# it makes 4 mistakes just as like the smart baseline model did in previous example
# Let's add another variable to our model and see if we can get a better prediction
cor(Train[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount", "Republican")])
## Rasmussen SurveyUSA PropR DiffCount Republican
## Rasmussen 1.0000000 0.9365837 0.8431180 0.5109169 0.7929252
## SurveyUSA 0.9365837 1.0000000 0.8616478 0.5222585 0.8101645
## PropR 0.8431180 0.8616478 1.0000000 0.8273785 0.9484204
## DiffCount 0.5109169 0.5222585 0.8273785 1.0000000 0.8092777
## Republican 0.7929252 0.8101645 0.9484204 0.8092777 1.0000000
# We better look and find if there are two variables with less correlation so it will add value to our model, for example it doesn't make sense to add Rasmussen and SurveyUSA together since they are correlated 0.936 with each other.
# In our second model we pick SurveyUSA and Diffcount (correlation of 0.52)
mod2 <- glm(Republican ~ SurveyUSA + DiffCount, data = Train, family = "binomial")
pred2 <- predict(mod2,type="response")
table(Train$Republican, pred2 >=0.5)
##
## FALSE TRUE
## 0 45 2
## 1 1 52
# this time we made 1 less mistake, not too impresive but slighly better
summary(mod2)
##
## Call:
## glm(formula = Republican ~ SurveyUSA + DiffCount, family = "binomial",
## data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.04741 -0.00977 0.00561 0.03751 1.32999
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6827 1.0468 -0.652 0.5143
## SurveyUSA 0.3309 0.2226 1.487 0.1371
## DiffCount 0.6619 0.3663 1.807 0.0708 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.269 on 99 degrees of freedom
## Residual deviance: 11.154 on 97 degrees of freedom
## AIC: 17.154
##
## Number of Fisher Scoring iterations: 9
Test Set predictions
# Let's use our smart baseline (Use Rasmussen to predict outcome)
table(Test$Republican, sign(Test$Rasmussen))
##
## -1 0 1
## 0 18 2 4
## 1 0 0 21
# 4 mistakes and 2 inconclusive results. This is our baseline against our model
TestPrediction <- predict(mod2, newdata=Test,type="response")
table(Test$Republican, TestPrediction>=0.5)
##
## FALSE TRUE
## 0 23 1
## 1 0 21
#Let's see error we made, which state predicted as Republican but indeed elected as Democrat
subset(Test, TestPrediction>=0.5 & Republican ==0)
## State Year Rasmussen SurveyUSA DiffCount PropR Republican
## 24 Florida 2012 2 0 6 0.6666667 0
# No suprise! Florida. Rasmussen give 2 percent more for Republican and SurveyUSA predicted as equal, DiffCount predicted 6 more polls republicans than democrats, 2/3 of the votes predicted republicans would win, but Republicans lost at the end