Project: Predicting Election Results

library(dplyr)    # data manipulation: inner_join, group_by, summarise, mutate
library(caTools)  # sample.split() for stratified train/test splitting

Loading the datasets

primary_data = read.csv(file=
        "https://raw.githubusercontent.com/peterkoebel/ProjectElectionResults/master/primary_winner.csv")
county_data = read.csv(file=
        "https://raw.githubusercontent.com/peterkoebel/ProjectElectionResults/master/county_data.csv")

Using an inner join to merge the primary dataset and the county dataset on the fips code

data = primary_data %>% inner_join(county_data, by = 'fips')
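
An inner join keeps only the counties whose fips code appears in both files, so it's worth checking how many rows survive. A quick sanity check:

# rows before and after the join; counties missing from either file are dropped
nrow(primary_data)
nrow(county_data)
nrow(data)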

Counting the number of counties won by each candidate

primary_data %>% group_by(winner) %>% summarise(number_wins = n())
## # A tibble: 2 x 2
##   winner          number_wins
##   <fct>                 <int>
## 1 Bernie Sanders         1540
## 2 Hillary Clinton        1199
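
The same counts can also be expressed as shares of all counties; a quick sketch using dplyr's count():

# county wins as a share of the total; Sanders won roughly 56% of counties
primary_data %>% count(winner) %>% mutate(share = n / sum(n))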

Splitting the data into Training and Testing datasets. 80% of the data will go into the Training dataset.

We split on the column ‘winner’ so that each candidate’s share of wins is the same in the Training and Testing datasets.

set.seed(486)
split = sample.split(data$winner, SplitRatio = 0.8)
training_data = data[split,]
testing_data = data[!split,]
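
A quick check that the split came out close to 80/20:

# fraction of rows that landed in the Training dataset; should be near 0.8
nrow(training_data) / nrow(data)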

Checking that the proportions are the same

training_data %>% group_by(winner) %>% 
  summarise(percent_won = n()/nrow(training_data))
## # A tibble: 2 x 2
##   winner          percent_won
##   <fct>                 <dbl>
## 1 Bernie Sanders        0.388
## 2 Hillary Clinton       0.612
testing_data %>% group_by(winner) %>% 
  summarise(percent_won = n()/nrow(testing_data))
## # A tibble: 2 x 2
##   winner          percent_won
##   <fct>                 <dbl>
## 1 Bernie Sanders        0.388
## 2 Hillary Clinton       0.612

Yes, the proportions are the same.

Observing the levels of the factor winner

levels(data$winner)
## [1] "Bernie Sanders"  "Hillary Clinton"

Bernie Sanders is the base level, with Hillary Clinton as our other level
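
That means the logistic regression below models the probability that the winner is Hillary Clinton. We can confirm the 0/1 coding R will use:

# dummy coding for the two-level factor: base level -> 0, other level -> 1
contrasts(data$winner)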

Building a logistic regression model to predict the winner. (Note that the model below is fit on the full merged dataset rather than on training_data alone, so the test-set accuracy reported later is somewhat optimistic.)

logistic_model = glm(winner ~ age_over_65 + female_perc + foreign_born_perc + 
                     bachelors_perc + household_income,
                     data = data,
                     family = binomial)

Reviewing the summary of the model

summary(logistic_model)
## 
## Call:
## glm(formula = winner ~ age_over_65 + female_perc + foreign_born_perc + 
##     bachelors_perc + household_income, family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8503  -1.2024   0.7336   0.9499   2.2646  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -6.336e+00  1.242e+00  -5.099 3.41e-07 ***
## age_over_65       -2.114e-02  1.270e-02  -1.665    0.096 .  
## female_perc        1.594e-01  2.422e-02   6.581 4.66e-11 ***
## foreign_born_perc  1.114e-01  1.392e-02   8.005 1.20e-15 ***
## bachelors_perc    -7.761e-02  9.257e-03  -8.384  < 2e-16 ***
## household_income   4.831e-06  6.397e-06   0.755    0.450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2111.3  on 1580  degrees of freedom
## Residual deviance: 1933.5  on 1575  degrees of freedom
## AIC: 1945.5
## 
## Number of Fisher Scoring iterations: 4

Notice that the household_income column isn’t significant.

So we’ll remove it from our model.
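
As a cross-check before refitting, base R's drop1() runs a likelihood-ratio test for deleting each term; household_income should come out as the weakest:

# single-term deletion tests; a large p-value supports dropping the term
drop1(logistic_model, test = 'Chisq')

Refitting without household_income: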

logistic_model = glm(winner ~ age_over_65 + female_perc + 
                       foreign_born_perc + bachelors_perc,
                     data = data,
                     family = binomial)

summary(logistic_model)
## 
## Call:
## glm(formula = winner ~ age_over_65 + female_perc + foreign_born_perc + 
##     bachelors_perc, family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8731  -1.2000   0.7322   0.9508   2.2349  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -6.069616   1.188619  -5.106 3.28e-07 ***
## age_over_65       -0.022483   0.012573  -1.788   0.0737 .  
## female_perc        0.157212   0.024005   6.549 5.78e-11 ***
## foreign_born_perc  0.112498   0.013846   8.125 4.47e-16 ***
## bachelors_perc    -0.073474   0.007419  -9.904  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2111.3  on 1580  degrees of freedom
## Residual deviance: 1934.1  on 1576  degrees of freedom
## AIC: 1944.1
## 
## Number of Fisher Scoring iterations: 4

That looks better. All remaining variables are now significant at the 10% level or better.
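
The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret:

# odds ratios: e.g. each extra percentage point of bachelors_perc multiplies
# the odds of a Clinton win by exp(-0.073) ~ 0.93
exp(coef(logistic_model))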

Predicting the probability of a Clinton win for the Training dataset.

training_data$winner_prediction_perc = predict(logistic_model, training_data, type = 'response')
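
With type = 'response', predict() returns fitted probabilities (of a Clinton win) rather than the default log-odds. Equivalently, as a sketch:

# the same probabilities computed by hand: inverse-logit of the linear predictor
log_odds = predict(logistic_model, training_data)  # type = 'link' is the default
head(plogis(log_odds))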

Trying out different values as the classification threshold,

then choosing the one with the best prediction rate.

# classify with cutoff i, then report the resulting training accuracy
for(i in c(0.4, 0.5, 0.6, 0.7)){
  training_data = training_data %>% 
    mutate(winner_prediction = ifelse(winner_prediction_perc > i, 'Hillary Clinton', 'Bernie Sanders'))
  print(sum(as.character(training_data$winner) == training_data$winner_prediction)/
          nrow(training_data))
}
## [1] 0.647943
## [1] 0.6495253
## [1] 0.6416139
## [1] 0.5704114

The threshold 0.5 has the best prediction rate, at 64.95%, so we will use 0.5.
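
Accuracy at a single cutoff is only one view of performance. Since caTools is already loaded, its colAUC() function reports the area under the ROC curve, which summarises the model across all thresholds:

# AUC: probability the model ranks a random Clinton county above a random Sanders county
colAUC(training_data$winner_prediction_perc, training_data$winner)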

Creating a confusion matrix to see where the predictions are correct

and where they are wrong.

training_data = training_data %>% 
  mutate(winner_prediction = ifelse(winner_prediction_perc > 0.5, 'Hillary Clinton', 'Bernie Sanders'))
table(training_data$winner, training_data$winner_prediction)
##                  
##                   Bernie Sanders Hillary Clinton
##   Bernie Sanders             152             338
##   Hillary Clinton            105             669

The table shows 152 instances where we predicted Bernie Sanders and Bernie Sanders was the winner.

There are 338 instances where we predicted Bernie Sanders and Hillary Clinton was the winner.

In 105 instances we predicted Hillary Clinton and Bernie Sanders was the winner.

Lastly, we have 669 instances where we predicted Hillary Clinton and Hillary Clinton was the winner.
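
The overall training accuracy and each candidate's hit rate can be read straight off this table:

train_confusion = table(training_data$winner, training_data$winner_prediction)
sum(diag(train_confusion)) / sum(train_confusion)  # (152 + 669) / 1264, the 0.6495 from above
diag(train_confusion) / rowSums(train_confusion)   # share of each candidate's wins predicted correctly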

Predicting the winner with the Testing dataset and creating a confusion matrix.

testing_data$winner_prediction_perc = predict(logistic_model, testing_data, type = 'response')
testing_data = testing_data %>% 
  mutate(winner_prediction = ifelse(winner_prediction_perc > 0.5, 'Hillary Clinton', 'Bernie Sanders'))
test_confusion = table(testing_data$winner, testing_data$winner_prediction)
test_confusion
##                  
##                   Bernie Sanders Hillary Clinton
##   Bernie Sanders              43              80
##   Hillary Clinton             26             168

For the Testing dataset, we correctly predicted Bernie Sanders in 43 instances

and correctly predicted Hillary Clinton in 168 instances.

Now we’ll calculate the correct prediction percentage

sum(diag(test_confusion))/sum(test_confusion)
## [1] 0.6656151

This shows that 66.56% of the test instances were correctly predicted.
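
Breaking that down by candidate shows the model is far better at identifying Clinton wins (168 of 194, about 87%) than Sanders wins (43 of 123, about 35%):

# share of each candidate's actual wins predicted correctly on the Testing dataset
diag(test_confusion) / rowSums(test_confusion)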