library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Survey microdata (2016 and 2018 files combined), read from a pinned commit
# of the fallChallenge2020 repository so the input cannot change underneath us.
data1618 <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_2016_2018.csv",
  header = TRUE
)

# Data dictionary that accompanies the microdata. Text columns are kept as
# character (not factor) so they can be filtered and relabelled as strings.
dd <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_dictionary.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Lookup tables built from the data dictionary ----
#
# Every coded variable is handled the same way (select its dictionary rows,
# drop the first one, keep columns 2:3, rename), so the shared steps live in a
# single helper instead of being repeated once per variable.

#' Build a two-column lookup table for one data-dictionary variable.
#'
#' @param dict Data dictionary as a data frame. Must have a `variable` column;
#'   columns 2 and 3 are the ones kept (presumably code and label -- TODO
#'   confirm against data_dictionary.csv).
#' @param var_code Value of `dict$variable` whose rows are selected. It is also
#'   used as the name of the first output column, so the table can be joined
#'   to the survey data on that column.
#' @param label_col Name given to the second output column.
#' @return A data frame with columns `var_code` and `label_col`, holding the
#'   selected rows minus the first one.
make_lookup <- function(dict, var_code, label_col) {
  # which() skips NA comparisons, the same way dplyr::filter() does.
  matches <- dict[which(dict[["variable"]] == var_code), , drop = FALSE]
  # Renumber rows 1..n, as filter() would, before dropping the first row.
  rownames(matches) <- NULL
  # The first row of each variable is not a code/label pair (presumably the
  # variable's description row -- TODO confirm), so it is discarded.
  lookup <- matches[-1, 2:3]
  colnames(lookup) <- c(var_code, label_col)
  lookup
}

## Geography
STATEFIP <- make_lookup(dd, "STATEFIP", "State")
METRO <- make_lookup(dd, "METRO", "Metro")

## Race (full coding and simplified coding)
RACE <- make_lookup(dd, "RACE", "Race")
RACESIMP <- make_lookup(dd, "RACESIMPLE", "RaceSimp")

## Marital status (MARST), full coding and simplified coding.
## NOTE(review): the label columns are spelled "Martial"/"MartialSimp"; the
## names are kept as-is because later code may refer to them.
MARST <- make_lookup(dd, "MARST", "Martial")
MARRSIMP <- make_lookup(dd, "MARRSIMPLE", "MartialSimp")

## Veteran status and citizenship
VETSTAT <- make_lookup(dd, "VETSTAT", "Vet")
CITIZEN <- make_lookup(dd, "CITIZEN", "Citizen")

## Hispanic origin (full coding and simplified coding)
HISPAN <- make_lookup(dd, "HISPAN", "Hispanic")
HISPSIMP <- make_lookup(dd, "HISPSIMPLE", "HispanSimp")

## Labor force status
LABFORCE <- make_lookup(dd, "LABFORCE", "Labor")

## Education: attainment, years of college credit, high school or GED,
## highest grade before GED, and the simplified education coding
EDUC99 <- make_lookup(dd, "EDUC99", "Edu1990")
EDCYC <- make_lookup(dd, "EDCYC", "College")
EDDIPGED <- make_lookup(dd, "EDDIPGED", "HighGED")
EDHGCGED <- make_lookup(dd, "EDHGCGED", "HighestGrade")
EDUSIMPLE <- make_lookup(dd, "EDUSIMPLE", "EduSimp")

## School or college attendance
SCHLCOLL <- make_lookup(dd, "SCHLCOLL", "SchoolAttend")

## Reason why eligible voter did not vote
VOWHYNOT <- make_lookup(dd, "VOWHYNOT", "WhyNotVote")
# Overwrite the label in the 8th row with a shorter one (positional index).
VOWHYNOT[8, 2] <- "Registration Problems"

## Reason why eligible voter did not register to vote
VOYNOTREG <- make_lookup(dd, "VOYNOTREG", "WhyNotReg")

## Method of voting in the most recent November election
VOTEHOW <- make_lookup(dd, "VOTEHOW", "MethodVote")

## Voted on or before election day
VOTEWHEN <- make_lookup(dd, "VOTEWHEN", "VoteWhen")

## Method of registering to vote
VOREGHOW <- make_lookup(dd, "VOREGHOW", "MethodReg")

## Voted in / registered for the most recent November election
VOTED <- make_lookup(dd, "VOTED", "Voted")
VOREG <- make_lookup(dd, "VOREG", "Registered")
# Keep only the analysis columns, then fold the lookup tables onto them one at
# a time. Each left_join() is a natural join on the single column the two
# tables share (the coded variable), which adds that variable's label column.
trim1618 <- Reduce(
  f = left_join,
  x = list(
    STATEFIP, METRO, RACESIMP, MARRSIMP, VETSTAT, CITIZEN,
    HISPSIMP, LABFORCE, EDUSIMPLE, SCHLCOLL, VOWHYNOT, VOYNOTREG,
    VOTEHOW, VOTEWHEN, VOREGHOW, VOTED, VOREG
  ),
  init = select(
    data1618,
    YEAR, STATEFIP, METRO, AGE, SEX,
    RACESIMPLE, MARRSIMPLE, VETSTAT, CITIZEN,
    HISPSIMPLE, LABFORCE, EDUSIMPLE, SCHLCOLL,
    VOWHYNOT, VOYNOTREG, VOTEHOW, VOTEWHEN,
    VOREGHOW, VOTED, VOREG, VOSUPPWT
  )
)
## Joining, by = "STATEFIP"
## Joining, by = "METRO"
## Joining, by = "RACESIMPLE"
## Joining, by = "MARRSIMPLE"
## Joining, by = "VETSTAT"
## Joining, by = "CITIZEN"
## Joining, by = "HISPSIMPLE"
## Joining, by = "LABFORCE"
## Joining, by = "EDUSIMPLE"
## Joining, by = "SCHLCOLL"
## Joining, by = "VOWHYNOT"
## Joining, by = "VOYNOTREG"
## Joining, by = "VOTEHOW"
## Joining, by = "VOTEWHEN"
## Joining, by = "VOREGHOW"
## Joining, by = "VOTED"
## Joining, by = "VOREG"
# Drop every row that has a missing value in any retained column.
# NOTE(review): this also drops rows whose only NA is in a column the models
# below never use -- confirm that is intended.
trim1618 <- trim1618 %>%
  na.omit()

Introduction

Voter suppression is a problem in many parts of our country, so it is important to understand why people do not vote. We are working with the Voter Turnout data set from 2016 to 2018. The data were obtained from the Census Bureau and the Bureau of Labor Statistics, which survey a sample of Americans every two years immediately following the November election. This data set is important because it provides details about why people did or did not vote in the election. At first, we looked at the big picture: why don’t people vote? From this we concluded that the main reason people don’t vote is that they are too busy. Luckily, in the 2020 election, more people were able to use mail-in ballots.

By understanding the factors that impact someone’s decision to vote or not, we can encourage certain demographics to vote. We have decided to focus on what affects the voter turnout, which we found to be age and education.

Data/Application

For this study, there were many variables involved, and we found that many of them affected voter turnout. We found that the top three reasons why people do not vote were that they were too busy or had another conflict, they were not interested, or they did not like the candidates. We decided to dive deeper and explore which groups of people don’t vote, which we found to be the younger generations and people with a lower education status. We speculated that people with higher education go to school longer and are thus older. Also, people with higher education are more committed: for example, people who attend college are more likely to sign petitions and are more politically active in comparison to someone who has only a high school degree. Our right to vote is a freedom that some other countries do not have. As we do this study, we hope we can encourage others to get out and vote.

Graphics
# Box plot of respondent age, with one box per voting status.
ggplot(data = trim1618, mapping = aes(x = AGE, fill = Voted)) +
  geom_boxplot()

From the box plot above, it is clear that age is a big factor in whether people vote or not. The median age of those who did not vote is around 41, and the median age of those who voted is around 54 (the center line of a box plot marks the median, not the mean). This age gap is fairly large: the two medians are a little more than 10 years apart. We also see that it is not a jump from young adults to adults; both groups are centered on adult ages. Also, since the plot shows no outliers, there are no extreme ages distorting this comparison.

# Share of each voting status per survey year (bars are scaled to the same
# height by position = "fill"), with one panel per simplified education level.
ggplot(data = trim1618, mapping = aes(x = YEAR, fill = Voted)) +
  facet_grid(cols = vars(EduSimp)) +
  geom_bar(position = "fill")

Now, as we look at the bar graph, we see that higher education correlates with voting. This could be because people with higher education are more educated about how powerful voting can be. Also, higher education could mean better jobs (this is just a thought, as it might not be true), and these jobs could let their employees take time off to vote, or provide more resources. Also, people with lower education could be working more, in jobs where they cannot afford to take time off (this is also just a thought). This could directly correlate with the first finding about why people don’t vote: they could be too busy with work, and their job does not provide resources to support voting.

Methods

The method we used was a generalized linear model (GLM). We chose a GLM with a binomial family (logistic regression) because our Voted variable is binary (voted or did not vote), so this model best fits our data. We created GLMs for the variables age and education.

Generalized linear model of Voted as a function of age.
# Logistic regression of voting status on age alone.
mod1 <- glm(
  formula = as.factor(Voted) ~ AGE,
  family = "binomial",
  data = trim1618
)
summary(mod1)
## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE, family = "binomial", data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9493  -1.2968   0.7173   0.8747   1.1328  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.3325070  0.0161454  -20.59   <2e-16 ***
## AGE          0.0243576  0.0003231   75.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186890  on 152823  degrees of freedom
## Residual deviance: 180929  on 152822  degrees of freedom
## AIC: 180933
## 
## Number of Fisher Scoring iterations: 4
Generalized linear model of Voted as a function of education.
# Logistic regression of voting status on simplified education level alone.
mod2 <- glm(
  formula = as.factor(Voted) ~ EduSimp,
  family = "binomial",
  data = trim1618
)
summary(mod2)
## 
## Call:
## glm(formula = as.factor(Voted) ~ EduSimp, family = "binomial", 
##     data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1186  -1.3322   0.6095   0.8508   1.5814  
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                             1.04062    0.01791  58.098   <2e-16 ***
## EduSimpBachelors degree                 0.54845    0.02319  23.646   <2e-16 ***
## EduSimpHigh school graduate or GED     -0.68370    0.02036 -33.581   <2e-16 ***
## EduSimpMasters degree                   0.98632    0.03198  30.846   <2e-16 ***
## EduSimpNo school                       -1.95351    0.15515 -12.591   <2e-16 ***
## EduSimpProfessional or Doctoral degree  1.09146    0.04951  22.045   <2e-16 ***
## EduSimpSome college but no degree      -0.21076    0.02199  -9.586   <2e-16 ***
## EduSimpSome school but no diploma      -1.39602    0.02590 -53.901   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186890  on 152823  degrees of freedom
## Residual deviance: 173289  on 152816  degrees of freedom
## AIC: 173305
## 
## Number of Fisher Scoring iterations: 4
Generalized linear model of Voted as a function of age + education.
# Logistic regression of voting status on age and education together
# (additive effects, no interaction).
mod3 <- glm(
  formula = as.factor(Voted) ~ AGE + EduSimp,
  family = "binomial",
  data = trim1618
)
summary(mod3)
## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp, family = "binomial", 
##     data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5104  -1.0388   0.5802   0.8159   2.0564  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -0.3106201  0.0240839 -12.897  < 2e-16
## AGE                                     0.0284254  0.0003427  82.954  < 2e-16
## EduSimpBachelors degree                 0.5950674  0.0236153  25.198  < 2e-16
## EduSimpHigh school graduate or GED     -0.7636823  0.0208712 -36.590  < 2e-16
## EduSimpMasters degree                   0.9290185  0.0323862  28.686  < 2e-16
## EduSimpNo school                       -2.3858487  0.1592024 -14.986  < 2e-16
## EduSimpProfessional or Doctoral degree  1.0017688  0.0500309  20.023  < 2e-16
## EduSimpSome college but no degree      -0.1248188  0.0225258  -5.541 3.01e-08
## EduSimpSome school but no diploma      -1.6043367  0.0269143 -59.609  < 2e-16
##                                           
## (Intercept)                            ***
## AGE                                    ***
## EduSimpBachelors degree                ***
## EduSimpHigh school graduate or GED     ***
## EduSimpMasters degree                  ***
## EduSimpNo school                       ***
## EduSimpProfessional or Doctoral degree ***
## EduSimpSome college but no degree      ***
## EduSimpSome school but no diploma      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186890  on 152823  degrees of freedom
## Residual deviance: 165908  on 152815  degrees of freedom
## AIC: 165926
## 
## Number of Fisher Scoring iterations: 4

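From this model we have found that the coefficients for age, having a bachelors degree, having a masters degree, and having a professional or doctoral degree are all positive. This tells us that the odds of voting increase with age, and that people in these education categories have higher odds of voting than people in the reference education level (the category omitted from the coefficient table).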

Effect Size
# Odds ratio for one additional year of age, read from the fitted model
# instead of a hand-copied coefficient so it cannot drift from mod3.
# `[[` drops the coefficient name, so the result prints as a bare number.
exp(coef(mod3)[["AGE"]])
## [1] 1.028833

Holding education constant, each additional year of age multiplies the odds of voting by about 1.03 (roughly a 2.9% increase in the odds per year).

# Odds ratio for a bachelors degree versus the reference education level,
# read from the fitted model instead of a hand-copied coefficient.
exp(coef(mod3)[["EduSimpBachelors degree"]])
## [1] 1.813153

Holding age constant, someone with a bachelors degree has about 1.81 times the odds of voting compared with someone in the reference education level.

# Odds ratio for a masters degree versus the reference education level,
# read from the fitted model instead of a hand-copied coefficient.
exp(coef(mod3)[["EduSimpMasters degree"]])
## [1] 2.532023

Holding age constant, someone with a masters degree has about 2.53 times the odds of voting compared with someone in the reference education level.

# Odds ratio for a professional or doctoral degree versus the reference
# education level, read from the fitted model instead of a hand-copied
# coefficient.
exp(coef(mod3)[["EduSimpProfessional or Doctoral degree"]])
## [1] 2.723094

Holding age constant, someone with a professional/doctoral degree has about 2.72 times the odds of voting compared with someone in the reference education level.

Generalized linear model of Voted as a function of age + education and their interaction.
# Logistic regression with age, education and their interaction.
# `AGE * EduSimp` already expands to both main effects plus the interaction,
# so the explicit `AGE + EduSimp` terms are redundant but harmless; the
# formula is left as written so the printed Call stays the same.
mod4 <- glm(
  formula = as.factor(Voted) ~ AGE+EduSimp+ AGE*EduSimp,
  family = "binomial",
  data = trim1618
)
summary(mod4)
## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp + AGE * EduSimp, 
##     family = "binomial", data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5094  -1.0348   0.5715   0.8151   2.1352  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                -0.579320   0.056794 -10.200
## AGE                                         0.034336   0.001191  28.840
## EduSimpBachelors degree                     0.853742   0.072454  11.783
## EduSimpHigh school graduate or GED         -0.515522   0.063734  -8.089
## EduSimpMasters degree                       1.354424   0.106169  12.757
## EduSimpNo school                           -2.418049   0.651267  -3.713
## EduSimpProfessional or Doctoral degree      1.273859   0.167144   7.621
## EduSimpSome college but no degree           0.066168   0.065864   1.005
## EduSimpSome school but no diploma          -0.894600   0.079703 -11.224
## AGE:EduSimpBachelors degree                -0.005681   0.001540  -3.690
## AGE:EduSimpHigh school graduate or GED     -0.005497   0.001310  -4.195
## AGE:EduSimpMasters degree                  -0.009221   0.002157  -4.276
## AGE:EduSimpNo school                       -0.001304   0.009655  -0.135
## AGE:EduSimpProfessional or Doctoral degree -0.005982   0.003342  -1.790
## AGE:EduSimpSome college but no degree      -0.004077   0.001394  -2.925
## AGE:EduSimpSome school but no diploma      -0.013819   0.001522  -9.078
##                                            Pr(>|z|)    
## (Intercept)                                 < 2e-16 ***
## AGE                                         < 2e-16 ***
## EduSimpBachelors degree                     < 2e-16 ***
## EduSimpHigh school graduate or GED         6.04e-16 ***
## EduSimpMasters degree                       < 2e-16 ***
## EduSimpNo school                           0.000205 ***
## EduSimpProfessional or Doctoral degree     2.51e-14 ***
## EduSimpSome college but no degree          0.315079    
## EduSimpSome school but no diploma           < 2e-16 ***
## AGE:EduSimpBachelors degree                0.000224 ***
## AGE:EduSimpHigh school graduate or GED     2.73e-05 ***
## AGE:EduSimpMasters degree                  1.91e-05 ***
## AGE:EduSimpNo school                       0.892601    
## AGE:EduSimpProfessional or Doctoral degree 0.073489 .  
## AGE:EduSimpSome college but no degree      0.003449 ** 
## AGE:EduSimpSome school but no diploma       < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186890  on 152823  degrees of freedom
## Residual deviance: 165804  on 152808  degrees of freedom
## AIC: 165836
## 
## Number of Fisher Scoring iterations: 5
Prediction models
# Fitted probabilities from mod3 for every row of the modelling data
# (type = "response" returns the probability scale rather than log-odds).
pred <- predict(
  object = mod3,
  newdata = trim1618,
  type = "response"
)
# Show the first few predicted probabilities.
head(x = pred)
##         1         2         3         4         5         6 
## 0.8427932 0.7950050 0.7758530 0.5819179 0.4268517 0.9281412

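From this prediction model, we can see the predicted probabilities of voting for the first six respondents in the data, which range from about 42.7% to 92.8%. The highest values (84.3–92.8%) presumably belong to respondents who are older and have a higher education, since those are the characteristics the model associates with voting.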

# Confusion matrix for mod3: actual voting status against whether the
# predicted probability of voting exceeds 0.5, with a row count per pair.
conf_mat1 <- data.frame(
  vote = trim1618[["Voted"]],
  predVote = pred > 0.5
) %>%
  group_by(vote, predVote) %>%
  summarise(n = n())
## `summarise()` regrouping output by 'vote' (override with `.groups` argument)
print(conf_mat1)
## # A tibble: 4 x 3
## # Groups:   vote [2]
##   vote         predVote     n
##   <chr>        <lgl>    <int>
## 1 Did not vote FALSE    12757
## 2 Did not vote TRUE     33197
## 3 Voted        FALSE     8015
## 4 Voted        TRUE     98855

Analysis/Results

Looking at the methods above, we can see that all of the model outputs tell the same story: people with higher education and people who are older are more likely to vote. Out of all the models, there is no model that tells us otherwise. The model that we think provided the clearest results was mod3. This is because it makes clear to the reader which variables affect voter turnout. Also, in mod3, all of the variables were significant because all of their p-values were small.

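From mod3 we made a prediction model to help us analyze how likely people are to vote. For the first six respondents, the predicted probability of voting ranges from about 43% to 93%, with the highest values (84–93%) presumably belonging to people who are older and have a higher education. Using a 0.5 cutoff, the model classifies about 73% of respondents correctly (111,612 of 152,824), although it predicts that most non-voters (33,197 of 45,954) would vote. This prediction model is useful because it can help us predict voter turnout in the future.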

Conclusion

Our findings were not surprising and will help us educate the uneducated when it comes to voting. If we go back to the big picture of why people don’t vote, the main reason was that they were too busy. This could directly correlate with age, as the younger generations have conflicts with school. To help fix this, we should set up voter registration at universities and provide more ballot drop boxes. Also, as we have seen, people with lower education tend not to vote as well. This could directly correlate with them not being interested, because they are not educated or motivated enough. We, as a country, need to educate the uneducated and provide more resources to the people who are not able to vote. From this past election, we have seen the impact that voting has.