library(tidyverse)
## ── Attaching packages ────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(geofacet)
library(usmap)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
data1618 <- read.csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_2016_2018.csv",
header=TRUE)
dd <- read.csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_dictionary.csv",
header=TRUE,
stringsAsFactors = FALSE)
STATEFIP<-dd%>%
filter(variable=="STATEFIP")
STATEFIP<-STATEFIP[-1, 2:3]
colnames(STATEFIP)<-c("STATEFIP", "State")
METRO<-dd%>%
filter(variable=="METRO")
METRO<-METRO[-1, 2:3]
colnames(METRO)<-c("METRO", "Metro")
RACE<-dd%>%
filter(variable=="RACE")
RACE<-RACE[-1, 2:3]
colnames(RACE)<-c("RACE", "Race")
#### We might prefer to use the simplified RACE variable
RACESIMP<-dd%>%
filter(variable=="RACESIMPLE")
RACESIMP<-RACESIMP[-1, 2:3]
colnames(RACESIMP)<-c("RACESIMPLE", "RaceSimp")
## Martial Status (MARST) CODES
MARST<-dd%>%
filter(variable=="MARST")
MARST<-MARST[-1, 2:3]
colnames(MARST)<-c("MARST", "Martial")
#### We might prefer to use the simplified MARST variable
MARRSIMP<-dd%>%
filter(variable=="MARRSIMPLE")
MARRSIMP<-MARRSIMP[-1, 2:3]
colnames(MARRSIMP)<-c("MARRSIMPLE", "MartialSimp")
## VETSTAT CODES
VETSTAT<-dd%>%
filter(variable=="VETSTAT")
VETSTAT<-VETSTAT[-1, 2:3]
colnames(VETSTAT)<-c("VETSTAT", "Vet")
## CITIZEN CODES
CITIZEN<-dd%>%
filter(variable=="CITIZEN")
CITIZEN<-CITIZEN[-1, 2:3]
colnames(CITIZEN)<-c("CITIZEN", "Citizen")
## HISPAN CODES
HISPAN<-dd%>%
filter(variable=="HISPAN")
HISPAN<-HISPAN[-1, 2:3]
colnames(HISPAN)<-c("HISPAN", "Hispanic")
#### We might prefer to use the simplified HISPAN variable
HISPSIMP<-dd%>%
filter(variable=="HISPSIMPLE")
HISPSIMP<-HISPSIMP[-1, 2:3]
colnames(HISPSIMP)<-c("HISPSIMPLE", "HispanSimp")
## LABFORCE CODES
LABFORCE<-dd%>%
filter(variable=="LABFORCE")
LABFORCE<-LABFORCE[-1, 2:3]
colnames(LABFORCE)<-c("LABFORCE", "Labor")
## EDUC99 CODES (Education Attainment)
EDUC99<-dd%>%
filter(variable=="EDUC99")
EDUC99<-EDUC99[-1, 2:3]
colnames(EDUC99)<-c("EDUC99", "Edu1990")
## EDCYC CODES (Years of college credit)
EDCYC<-dd%>%
filter(variable=="EDCYC")
EDCYC<-EDCYC[-1, 2:3]
colnames(EDCYC)<-c("EDCYC", "College")
## EDDIPGED CODES (Highschool or GED)
EDDIPGED<-dd%>%
filter(variable=="EDDIPGED")
EDDIPGED<-EDDIPGED[-1, 2:3]
colnames(EDDIPGED)<-c("EDDIPGED", "HighGED")
## EDHGCGED CODES (Highest grade before GED)
EDHGCGED<-dd%>%
filter(variable=="EDHGCGED")
EDHGCGED<-EDHGCGED[-1, 2:3]
colnames(EDHGCGED)<-c("EDHGCGED", "HighestGrade")
#### We might prefer to use the simplified EDU variable
EDUSIMPLE<-dd%>%
filter(variable=="EDUSIMPLE")
EDUSIMPLE<-EDUSIMPLE[-1, 2:3]
colnames(EDUSIMPLE)<-c("EDUSIMPLE", "EduSimp")
## SCHLCOLL CODES (School or college attendance)
SCHLCOLL<-dd%>%
filter(variable=="SCHLCOLL")
SCHLCOLL<-SCHLCOLL[-1, 2:3]
colnames(SCHLCOLL)<-c("SCHLCOLL", "SchoolAttend")
## Reason why eligible voter did not vote
VOWHYNOT<-dd%>%
filter(variable=="VOWHYNOT")
VOWHYNOT<-VOWHYNOT[-1, 2:3]
colnames(VOWHYNOT)<-c("VOWHYNOT", "WhyNotVote")
VOWHYNOT[8,2]<-"Registration Problems"
## Reason why eligible voter did not register to vote
VOYNOTREG<-dd%>%
filter(variable=="VOYNOTREG")
VOYNOTREG<-VOYNOTREG[-1, 2:3]
colnames(VOYNOTREG)<-c("VOYNOTREG", "WhyNotReg")
## Method of voting in the most recent November election
VOTEHOW<-dd%>%
filter(variable=="VOTEHOW")
VOTEHOW<-VOTEHOW[-1, 2:3]
colnames(VOTEHOW)<-c("VOTEHOW", "MethodVote")
## Voted on or before election day
VOTEWHEN<-dd%>%
filter(variable=="VOTEWHEN")
VOTEWHEN<-VOTEWHEN[-1, 2:3]
colnames(VOTEWHEN)<-c("VOTEWHEN", "VoteWhen")
## Method of registering to vote
VOREGHOW<-dd%>%
filter(variable=="VOREGHOW")
VOREGHOW<-VOREGHOW[-1, 2:3]
colnames(VOREGHOW)<-c("VOREGHOW", "MethodReg")
## Voted for the most recent November election
VOTED<-dd%>%
filter(variable=="VOTED")
VOTED<-VOTED[-1, 2:3]
colnames(VOTED)<-c("VOTED", "Voted")
## Registered for the most recent November election
VOREG<-dd%>%
filter(variable=="VOREG")
VOREG<-VOREG[-1, 2:3]
colnames(VOREG)<-c("VOREG", "Registered")
trim1618<-data1618%>%
select(YEAR, STATEFIP, METRO, AGE, SEX,
RACESIMPLE, MARRSIMPLE, VETSTAT, CITIZEN,
HISPSIMPLE, LABFORCE, EDUSIMPLE, SCHLCOLL,
VOWHYNOT, VOYNOTREG, VOTEHOW, VOTEWHEN,
VOREGHOW, VOTED, VOREG, VOSUPPWT)%>%
left_join(STATEFIP)%>%
left_join(METRO)%>%
left_join(RACESIMP)%>%
left_join(MARRSIMP)%>%
left_join(VETSTAT)%>%
left_join(CITIZEN)%>%
left_join(HISPSIMP)%>%
left_join(LABFORCE)%>%
left_join(EDUSIMPLE)%>%
left_join(SCHLCOLL)%>%
left_join(VOWHYNOT)%>%
left_join(VOYNOTREG)%>%
left_join(VOTEHOW)%>%
left_join(VOTEWHEN)%>%
left_join(VOREGHOW)%>%
left_join(VOTED)%>%
left_join(VOREG)
## Joining, by = "STATEFIP"
## Joining, by = "METRO"
## Joining, by = "RACESIMPLE"
## Joining, by = "MARRSIMPLE"
## Joining, by = "VETSTAT"
## Joining, by = "CITIZEN"
## Joining, by = "HISPSIMPLE"
## Joining, by = "LABFORCE"
## Joining, by = "EDUSIMPLE"
## Joining, by = "SCHLCOLL"
## Joining, by = "VOWHYNOT"
## Joining, by = "VOYNOTREG"
## Joining, by = "VOTEHOW"
## Joining, by = "VOTEWHEN"
## Joining, by = "VOREGHOW"
## Joining, by = "VOTED"
## Joining, by = "VOREG"
trim1618 <- na.omit(trim1618)
We are working with the Voter Turnout Data set from 2016 to 2018. We obtained the data from the Census Bureau and Department of Labor Statistics which was collected on a sample of Americans every two years immediately following the November election. This Data set is important because it provides details in to why people did or did not vote in the election. Over the past four years we have seen the power that comes from voting. By understanding the factors that impact someones decision to vote or not, we can encourage certain demographics to vote.
sumWhy<-trim1618%>%
filter(WhyNotVote!="NIU")%>%
group_by(YEAR, WhyNotVote)%>%
summarise(n=n())
## `summarise()` regrouping output by 'YEAR' (override with `.groups` argument)
ggplot(sumWhy, aes(x=reorder(WhyNotVote, n), y=n, fill=as.factor(YEAR)))+
geom_bar(stat="identity", position="dodge2")+
#facet_grid(.~YEAR)+
coord_flip()+
#theme(legend.position = "none")+
theme(axis.title.y=element_blank(),
axis.ticks.y=element_blank())+
ggtitle("Why don't people vote?")
ggplot(trim1618, aes(x=AGE, fill=Voted))+
geom_boxplot()
trim1618$EduSimp<-factor(trim1618$EduSimp, levels =c("No school",
"Some school but no diploma","High school graduate or GED","Some college but no degree","Associate degree","Bachelors degree", "Masters degree", "Professional or Doctoral degree"))
ggplot(trim1618, aes(Voted, fill=EduSimp))+ geom_bar()+ facet_grid(.~YEAR)
ggplot(trim1618, aes(x = YEAR, fill = Voted)) +
geom_bar(position = "fill") +
facet_grid(.~EduSimp)
method<-trim1618%>%
filter(!MethodVote %in% c("Don't know", "NIU", "Refused", "No Response"))%>%
group_by(YEAR, State, MethodVote)%>%
summarise(nVoteM=n(),
nWgtVoteM=sum(VOSUPPWT, na.rm=TRUE))%>%
mutate(state=State)
## `summarise()` regrouping output by 'YEAR', 'State' (override with `.groups` argument)
method%>%
filter(YEAR==2016)%>%
ggplot(aes(x=1, y=nWgtVoteM, fill = MethodVote)) +
geom_col(position="fill") +
#coord_flip() +
facet_geo(~ state) +
theme_bw()+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())+
labs(x="", y = "",
caption = "Voting by mail is more popular in the West Coast (Based on data from IPUMS)",
fill = 'Method of Voting',
title=paste("Most Popular Method of Voting in 2016"))
Response: Voted
Explanatory: Age and Education
m1 <- glm(as.factor(Voted) ~ AGE, data = trim1618, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = as.factor(Voted) ~ AGE, family = "binomial", data = trim1618)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9493 -1.2968 0.7173 0.8747 1.1328
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3325070 0.0161454 -20.59 <2e-16 ***
## AGE 0.0243576 0.0003231 75.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 186890 on 152823 degrees of freedom
## Residual deviance: 180929 on 152822 degrees of freedom
## AIC: 180933
##
## Number of Fisher Scoring iterations: 4
exp(0.0243576)
## [1] 1.024657
m4 <- glm(as.factor(Voted) ~ EduSimp, data = trim1618, family = "binomial")
summary(m4)
##
## Call:
## glm(formula = as.factor(Voted) ~ EduSimp, family = "binomial",
## data = trim1618)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1186 -1.3322 0.6095 0.8508 1.5814
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9129 0.1541 -5.923 3.15e-09 ***
## EduSimpHigh school graduate or GED 1.2698 0.1544 8.223 < 2e-16 ***
## EduSimpSome college but no degree 1.7428 0.1546 11.270 < 2e-16 ***
## EduSimpAssociate degree 1.9535 0.1552 12.591 < 2e-16 ***
## EduSimpBachelors degree 2.5020 0.1548 16.161 < 2e-16 ***
## EduSimpMasters degree 2.9398 0.1564 18.800 < 2e-16 ***
## EduSimpProfessional or Doctoral degree 3.0450 0.1609 18.927 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 166356 on 141029 degrees of freedom
## Residual deviance: 157305 on 141023 degrees of freedom
## (11794 observations deleted due to missingness)
## AIC: 157319
##
## Number of Fisher Scoring iterations: 4
m2 <- glm(as.factor(Voted) ~ AGE+EduSimp, data = trim1618, family = "binomial")
summary(m2)
##
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp, family = "binomial",
## data = trim1618)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5264 -1.0281 0.5854 0.8024 2.0758
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7702372 0.1600943 -17.30 <2e-16
## AGE 0.0295594 0.0003672 80.50 <2e-16
## EduSimpHigh school graduate or GED 1.6397392 0.1587188 10.33 <2e-16
## EduSimpSome college but no degree 2.2866907 0.1590638 14.38 <2e-16
## EduSimpAssociate degree 2.4077578 0.1595119 15.10 <2e-16
## EduSimpBachelors degree 3.0054442 0.1592043 18.88 <2e-16
## EduSimpMasters degree 3.3353605 0.1606828 20.76 <2e-16
## EduSimpProfessional or Doctoral degree 3.4069897 0.1651363 20.63 <2e-16
##
## (Intercept) ***
## AGE ***
## EduSimpHigh school graduate or GED ***
## EduSimpSome college but no degree ***
## EduSimpAssociate degree ***
## EduSimpBachelors degree ***
## EduSimpMasters degree ***
## EduSimpProfessional or Doctoral degree ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 166356 on 141029 degrees of freedom
## Residual deviance: 150338 on 141022 degrees of freedom
## (11794 observations deleted due to missingness)
## AIC: 150354
##
## Number of Fisher Scoring iterations: 4
m3 <- glm(as.factor(Voted) ~ AGE+EduSimp+AGE*EduSimp, data = trim1618, family = "binomial")
summary(m3)
##
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp + AGE * EduSimp,
## family = "binomial", data = trim1618)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5094 -1.0348 0.5774 0.8032 2.1352
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.997368 0.648786 -4.620
## AGE 0.033033 0.009581 3.448
## EduSimpHigh school graduate or GED 1.902527 0.649431 2.930
## EduSimpSome college but no degree 2.484217 0.649643 3.824
## EduSimpAssociate degree 2.418049 0.651267 3.713
## EduSimpBachelors degree 3.271791 0.650344 5.031
## EduSimpMasters degree 3.772473 0.654958 5.760
## EduSimpProfessional or Doctoral degree 3.691908 0.667559 5.530
## AGE:EduSimpHigh school graduate or GED -0.004193 0.009597 -0.437
## AGE:EduSimpSome college but no degree -0.002774 0.009609 -0.289
## AGE:EduSimpAssociate degree 0.001304 0.009655 0.135
## AGE:EduSimpBachelors degree -0.004377 0.009631 -0.454
## AGE:EduSimpMasters degree -0.007917 0.009749 -0.812
## AGE:EduSimpProfessional or Doctoral degree -0.004678 0.010077 -0.464
## Pr(>|z|)
## (Intercept) 3.84e-06 ***
## AGE 0.000566 ***
## EduSimpHigh school graduate or GED 0.003395 **
## EduSimpSome college but no degree 0.000131 ***
## EduSimpAssociate degree 0.000205 ***
## EduSimpBachelors degree 4.88e-07 ***
## EduSimpMasters degree 8.42e-09 ***
## EduSimpProfessional or Doctoral degree 3.19e-08 ***
## AGE:EduSimpHigh school graduate or GED 0.662165
## AGE:EduSimpSome college but no degree 0.772849
## AGE:EduSimpAssociate degree 0.892601
## AGE:EduSimpBachelors degree 0.649488
## AGE:EduSimpMasters degree 0.416715
## AGE:EduSimpProfessional or Doctoral degree 0.642497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 166356 on 141029 degrees of freedom
## Residual deviance: 150312 on 141016 degrees of freedom
## (11794 observations deleted due to missingness)
## AIC: 150340
##
## Number of Fisher Scoring iterations: 5
pred <- predict(m1, newdata = trim1618, type = "response")
head(pred)
## 1 2 3 4 5 6
## 0.7977882 0.7688846 0.7511198 0.8308616 0.7419020 0.8342571
conf_mat1 <- data.frame(vote = trim1618$Voted,
predVote = pred > .5) %>%
group_by(vote, predVote) %>%
summarise(n=n())
## `summarise()` regrouping output by 'vote' (override with `.groups` argument)
conf_mat1
## # A tibble: 2 x 3
## # Groups: vote [2]
## vote predVote n
## <chr> <lgl> <int>
## 1 Did not vote TRUE 45954
## 2 Voted TRUE 106870
pred2 <- predict(m4, newdata = trim1618, type = "response")
head(pred2)
## 1 2 3 4 5 6
## 0.7389690 0.6963248 0.6963248 NA NA 0.8304852
conf_mat2 <- data.frame(vote = trim1618$Voted,
predVote = pred2 > .5) %>%
group_by(vote, predVote) %>%
summarise(n=n())
## `summarise()` regrouping output by 'vote' (override with `.groups` argument)
conf_mat2
## # A tibble: 6 x 3
## # Groups: vote [2]
## vote predVote n
## <chr> <lgl> <int>
## 1 Did not vote FALSE 147
## 2 Did not vote TRUE 38873
## 3 Did not vote NA 6934
## 4 Voted FALSE 59
## 5 Voted TRUE 101951
## 6 Voted NA 4860