Unit 3, Recitation

Video 2

setwd('C:/Users/daria.alekseeva/Documents/Edx/Logistic reg')

# Read in data
polling = read.csv("PollingData.csv")
str(polling)
## 'data.frame':    145 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 3 4 4 4 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2012 2004 2008 2012 ...
##  $ Rasmussen : int  11 21 NA 16 5 5 8 7 10 NA ...
##  $ SurveyUSA : int  18 25 NA NA 15 NA NA 5 NA NA ...
##  $ DiffCount : int  5 5 1 6 8 9 4 8 5 2 ...
##  $ PropR     : num  1 1 1 1 1 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 1 1 ...
table(polling$Year)
## 
## 2004 2008 2012 
##   50   50   45
summary(polling)
##          State          Year        Rasmussen          SurveyUSA       
##  Arizona    :  3   Min.   :2004   Min.   :-41.0000   Min.   :-33.0000  
##  Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.0000   1st Qu.:-11.7500  
##  California :  3   Median :2008   Median :  1.0000   Median : -2.0000  
##  Colorado   :  3   Mean   :2008   Mean   :  0.0404   Mean   : -0.8243  
##  Connecticut:  3   3rd Qu.:2012   3rd Qu.:  8.5000   3rd Qu.:  8.0000  
##  Florida    :  3   Max.   :2012   Max.   : 39.0000   Max.   : 30.0000  
##  (Other)    :127                  NA's   :46         NA's   :71        
##    DiffCount           PropR          Republican    
##  Min.   :-19.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  1.000   Median :0.6250   Median :1.0000  
##  Mean   : -1.269   Mean   :0.5259   Mean   :0.5103  
##  3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   : 11.000   Max.   :1.0000   Max.   :1.0000  
## 
# Install and load mice package
#install.packages("mice")
library(mice)
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
# Multiple imputation
simple = polling[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount")]
summary(simple)
##    Rasmussen          SurveyUSA            PropR          DiffCount      
##  Min.   :-41.0000   Min.   :-33.0000   Min.   :0.0000   Min.   :-19.000  
##  1st Qu.: -8.0000   1st Qu.:-11.7500   1st Qu.:0.0000   1st Qu.: -6.000  
##  Median :  1.0000   Median : -2.0000   Median :0.6250   Median :  1.000  
##  Mean   :  0.0404   Mean   : -0.8243   Mean   :0.5259   Mean   : -1.269  
##  3rd Qu.:  8.5000   3rd Qu.:  8.0000   3rd Qu.:1.0000   3rd Qu.:  4.000  
##  Max.   : 39.0000   Max.   : 30.0000   Max.   :1.0000   Max.   : 11.000  
##  NA's   :46         NA's   :71
set.seed(144)
imputed = complete(mice(simple))
## 
##  iter imp variable
##   1   1  Rasmussen  SurveyUSA
##   1   2  Rasmussen  SurveyUSA
##   1   3  Rasmussen  SurveyUSA
##   1   4  Rasmussen  SurveyUSA
##   1   5  Rasmussen  SurveyUSA
##   2   1  Rasmussen  SurveyUSA
##   2   2  Rasmussen  SurveyUSA
##   2   3  Rasmussen  SurveyUSA
##   2   4  Rasmussen  SurveyUSA
##   2   5  Rasmussen  SurveyUSA
##   3   1  Rasmussen  SurveyUSA
##   3   2  Rasmussen  SurveyUSA
##   3   3  Rasmussen  SurveyUSA
##   3   4  Rasmussen  SurveyUSA
##   3   5  Rasmussen  SurveyUSA
##   4   1  Rasmussen  SurveyUSA
##   4   2  Rasmussen  SurveyUSA
##   4   3  Rasmussen  SurveyUSA
##   4   4  Rasmussen  SurveyUSA
##   4   5  Rasmussen  SurveyUSA
##   5   1  Rasmussen  SurveyUSA
##   5   2  Rasmussen  SurveyUSA
##   5   3  Rasmussen  SurveyUSA
##   5   4  Rasmussen  SurveyUSA
##   5   5  Rasmussen  SurveyUSA
summary(imputed)
##    Rasmussen         SurveyUSA           PropR          DiffCount      
##  Min.   :-41.000   Min.   :-33.000   Min.   :0.0000   Min.   :-19.000  
##  1st Qu.: -8.000   1st Qu.:-11.000   1st Qu.:0.0000   1st Qu.: -6.000  
##  Median :  3.000   Median :  1.000   Median :0.6250   Median :  1.000  
##  Mean   :  1.731   Mean   :  1.517   Mean   :0.5259   Mean   : -1.269  
##  3rd Qu.: 11.000   3rd Qu.: 18.000   3rd Qu.:1.0000   3rd Qu.:  4.000  
##  Max.   : 39.000   Max.   : 30.000   Max.   :1.0000   Max.   : 11.000
polling$Rasmussen = imputed$Rasmussen
polling$SurveyUSA = imputed$SurveyUSA
summary(polling)
##          State          Year        Rasmussen         SurveyUSA      
##  Arizona    :  3   Min.   :2004   Min.   :-41.000   Min.   :-33.000  
##  Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.000   1st Qu.:-11.000  
##  California :  3   Median :2008   Median :  3.000   Median :  1.000  
##  Colorado   :  3   Mean   :2008   Mean   :  1.731   Mean   :  1.517  
##  Connecticut:  3   3rd Qu.:2012   3rd Qu.: 11.000   3rd Qu.: 18.000  
##  Florida    :  3   Max.   :2012   Max.   : 39.000   Max.   : 30.000  
##  (Other)    :127                                                     
##    DiffCount           PropR          Republican    
##  Min.   :-19.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  1.000   Median :0.6250   Median :1.0000  
##  Mean   : -1.269   Mean   :0.5259   Mean   :0.5103  
##  3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   : 11.000   Max.   :1.0000   Max.   :1.0000  
## 
# Video 3

# Subset data into training set and test set
Train = subset(polling, Year == 2004 | Year == 2008)
Test = subset(polling, Year == 2012)

# Smart Baseline
table(Train$Republican)
## 
##  0  1 
## 47 53
sign(20)
## [1] 1
sign(-10)
## [1] -1
sign(0)
## [1] 0
table(sign(Train$Rasmussen))
## 
## -1  0  1 
## 42  3 55
table(Train$Republican, sign(Train$Rasmussen))
##    
##     -1  0  1
##   0 42  2  3
##   1  0  1 52
# Video 4

# Multicollinearity
#cor(Train)
str(Train)
## 'data.frame':    100 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2004 2008 2004 2008 ...
##  $ Rasmussen : int  11 21 16 16 5 5 7 10 -11 -27 ...
##  $ SurveyUSA : int  18 25 21 21 15 8 5 9 -11 -24 ...
##  $ DiffCount : int  5 5 1 6 8 9 8 5 -8 -5 ...
##  $ PropR     : num  1 1 1 1 1 1 1 1 0 0 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 0 0 ...
cor(Train[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount", "Republican")])
##            Rasmussen SurveyUSA     PropR DiffCount Republican
## Rasmussen  1.0000000 0.9194508 0.8404803 0.5124098  0.8021191
## SurveyUSA  0.9194508 1.0000000 0.8756581 0.5541816  0.8205806
## PropR      0.8404803 0.8756581 1.0000000 0.8273785  0.9484204
## DiffCount  0.5124098 0.5541816 0.8273785 1.0000000  0.8092777
## Republican 0.8021191 0.8205806 0.9484204 0.8092777  1.0000000
# Logistic Regression Model
mod1 = glm(Republican~PropR, data=Train, family="binomial")
summary(mod1)
## 
## Call:
## glm(formula = Republican ~ PropR, family = "binomial", data = Train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.22880  -0.06541   0.10260   0.10260   1.37392  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -6.146      1.977  -3.108 0.001882 ** 
## PropR         11.390      3.153   3.613 0.000303 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.269  on 99  degrees of freedom
## Residual deviance:  15.772  on 98  degrees of freedom
## AIC: 19.772
## 
## Number of Fisher Scoring iterations: 8
# Training set predictions
pred1 = predict(mod1, type="response")
table(Train$Republican, pred1 >= 0.5)
##    
##     FALSE TRUE
##   0    45    2
##   1     2   51
# Two-variable model
mod2 = glm(Republican~SurveyUSA+DiffCount, data=Train, family="binomial")
pred2 = predict(mod2, type="response")
table(Train$Republican, pred2 >= 0.5)
##    
##     FALSE TRUE
##   0    45    2
##   1     1   52
summary(mod2)
## 
## Call:
## glm(formula = Republican ~ SurveyUSA + DiffCount, family = "binomial", 
##     data = Train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01196  -0.00698   0.01005   0.05074   1.54975  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.1405     1.2456  -0.916   0.3599  
## SurveyUSA     0.2976     0.1949   1.527   0.1267  
## DiffCount     0.7673     0.4188   1.832   0.0669 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.269  on 99  degrees of freedom
## Residual deviance:  12.439  on 97  degrees of freedom
## AIC: 18.439
## 
## Number of Fisher Scoring iterations: 10
# Video 5

# Smart baseline accuracy
table(Test$Republican, sign(Test$Rasmussen))
##    
##     -1  0  1
##   0 18  2  4
##   1  0  0 21
# Test set predictions
TestPrediction = predict(mod2, newdata=Test, type="response")
table(Test$Republican, TestPrediction >= 0.5)
##    
##     FALSE TRUE
##   0    23    1
##   1     0   21
# Analyze mistake
subset(Test, TestPrediction >= 0.5 & Republican == 0)
##      State Year Rasmussen SurveyUSA DiffCount     PropR Republican
## 24 Florida 2012         2         0         6 0.6666667          0