Final

library(tidyverse)

## ── Attaching packages ────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(geofacet)
library(usmap)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(viridis)

## Loading required package: viridisLite

data1618 <- read.csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_2016_2018.csv",
                   header=TRUE)

dd <- read.csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2020/ff4d4795566a553cade80ca6e6fe15ea69ee6e1b/data/data_dictionary.csv",
             header=TRUE, 
             stringsAsFactors = FALSE)

STATEFIP<-dd%>%
  filter(variable=="STATEFIP")

STATEFIP<-STATEFIP[-1, 2:3]
colnames(STATEFIP)<-c("STATEFIP", "State")

METRO<-dd%>%
  filter(variable=="METRO")

METRO<-METRO[-1, 2:3]
colnames(METRO)<-c("METRO", "Metro")

RACE<-dd%>%
  filter(variable=="RACE")

RACE<-RACE[-1, 2:3]
colnames(RACE)<-c("RACE", "Race")

#### We might prefer to use the simplified RACE variable 
RACESIMP<-dd%>%
  filter(variable=="RACESIMPLE")

RACESIMP<-RACESIMP[-1, 2:3]
colnames(RACESIMP)<-c("RACESIMPLE", "RaceSimp")

## Martial Status (MARST) CODES
MARST<-dd%>%
  filter(variable=="MARST")

MARST<-MARST[-1, 2:3]
colnames(MARST)<-c("MARST", "Martial")

#### We might prefer to use the simplified MARST variable 
MARRSIMP<-dd%>%
  filter(variable=="MARRSIMPLE")

MARRSIMP<-MARRSIMP[-1, 2:3]
colnames(MARRSIMP)<-c("MARRSIMPLE", "MartialSimp")

## VETSTAT CODES
VETSTAT<-dd%>%
  filter(variable=="VETSTAT")

VETSTAT<-VETSTAT[-1, 2:3]
colnames(VETSTAT)<-c("VETSTAT", "Vet")

## CITIZEN CODES
CITIZEN<-dd%>%
  filter(variable=="CITIZEN")

CITIZEN<-CITIZEN[-1, 2:3]
colnames(CITIZEN)<-c("CITIZEN", "Citizen")

## HISPAN CODES
HISPAN<-dd%>%
  filter(variable=="HISPAN")

HISPAN<-HISPAN[-1, 2:3]
colnames(HISPAN)<-c("HISPAN", "Hispanic")

#### We might prefer to use the simplified HISPAN variable 
HISPSIMP<-dd%>%
  filter(variable=="HISPSIMPLE")

HISPSIMP<-HISPSIMP[-1, 2:3]
colnames(HISPSIMP)<-c("HISPSIMPLE", "HispanSimp")

## LABFORCE CODES
LABFORCE<-dd%>%
  filter(variable=="LABFORCE")

LABFORCE<-LABFORCE[-1, 2:3]
colnames(LABFORCE)<-c("LABFORCE", "Labor")

## EDUC99 CODES (Education Attainment)
EDUC99<-dd%>%
  filter(variable=="EDUC99")

EDUC99<-EDUC99[-1, 2:3]
colnames(EDUC99)<-c("EDUC99", "Edu1990")

## EDCYC CODES (Years of college credit)
EDCYC<-dd%>%
  filter(variable=="EDCYC")

EDCYC<-EDCYC[-1, 2:3]
colnames(EDCYC)<-c("EDCYC", "College")

## EDDIPGED CODES (Highschool or GED)
EDDIPGED<-dd%>%
  filter(variable=="EDDIPGED")

EDDIPGED<-EDDIPGED[-1, 2:3]
colnames(EDDIPGED)<-c("EDDIPGED", "HighGED")

## EDHGCGED CODES (Highest grade before GED)
EDHGCGED<-dd%>%
  filter(variable=="EDHGCGED")

EDHGCGED<-EDHGCGED[-1, 2:3]
colnames(EDHGCGED)<-c("EDHGCGED", "HighestGrade")

#### We might prefer to use the simplified EDU variable 
EDUSIMPLE<-dd%>%
  filter(variable=="EDUSIMPLE")

EDUSIMPLE<-EDUSIMPLE[-1, 2:3]
colnames(EDUSIMPLE)<-c("EDUSIMPLE", "EduSimp")

## SCHLCOLL CODES (School or college attendance)
SCHLCOLL<-dd%>%
  filter(variable=="SCHLCOLL")

SCHLCOLL<-SCHLCOLL[-1, 2:3]
colnames(SCHLCOLL)<-c("SCHLCOLL", "SchoolAttend")

## Reason why eligible voter did not vote
VOWHYNOT<-dd%>%
  filter(variable=="VOWHYNOT")

VOWHYNOT<-VOWHYNOT[-1, 2:3]
colnames(VOWHYNOT)<-c("VOWHYNOT", "WhyNotVote")
VOWHYNOT[8,2]<-"Registration Problems"

## Reason why eligible voter did not register to vote
VOYNOTREG<-dd%>%
  filter(variable=="VOYNOTREG")

VOYNOTREG<-VOYNOTREG[-1, 2:3]
colnames(VOYNOTREG)<-c("VOYNOTREG", "WhyNotReg")

## Method of voting in the most recent November election
VOTEHOW<-dd%>%
  filter(variable=="VOTEHOW")

VOTEHOW<-VOTEHOW[-1, 2:3]
colnames(VOTEHOW)<-c("VOTEHOW", "MethodVote")

## Voted on or before election day
VOTEWHEN<-dd%>%
  filter(variable=="VOTEWHEN")

VOTEWHEN<-VOTEWHEN[-1, 2:3]
colnames(VOTEWHEN)<-c("VOTEWHEN", "VoteWhen")

## Method of registering to vote
VOREGHOW<-dd%>%
  filter(variable=="VOREGHOW")

VOREGHOW<-VOREGHOW[-1, 2:3]
colnames(VOREGHOW)<-c("VOREGHOW", "MethodReg")

## Voted for the most recent November election
VOTED<-dd%>%
  filter(variable=="VOTED")

VOTED<-VOTED[-1, 2:3]
colnames(VOTED)<-c("VOTED", "Voted")

## Registered for the most recent November election
VOREG<-dd%>%
  filter(variable=="VOREG")

VOREG<-VOREG[-1, 2:3]
colnames(VOREG)<-c("VOREG", "Registered")

trim1618<-data1618%>%
  select(YEAR, STATEFIP, METRO, AGE, SEX, 
         RACESIMPLE, MARRSIMPLE, VETSTAT, CITIZEN, 
         HISPSIMPLE, LABFORCE, EDUSIMPLE, SCHLCOLL, 
         VOWHYNOT, VOYNOTREG, VOTEHOW, VOTEWHEN, 
         VOREGHOW, VOTED, VOREG, VOSUPPWT)%>%
  left_join(STATEFIP)%>%
  left_join(METRO)%>%
  left_join(RACESIMP)%>%
  left_join(MARRSIMP)%>%
  left_join(VETSTAT)%>%
  left_join(CITIZEN)%>%
  left_join(HISPSIMP)%>%
  left_join(LABFORCE)%>%
  left_join(EDUSIMPLE)%>%
  left_join(SCHLCOLL)%>%
  left_join(VOWHYNOT)%>%
  left_join(VOYNOTREG)%>%
  left_join(VOTEHOW)%>%
  left_join(VOTEWHEN)%>%
  left_join(VOREGHOW)%>%
  left_join(VOTED)%>%
  left_join(VOREG)

## Joining, by = "STATEFIP"

## Joining, by = "METRO"

## Joining, by = "RACESIMPLE"

## Joining, by = "MARRSIMPLE"

## Joining, by = "VETSTAT"

## Joining, by = "CITIZEN"

## Joining, by = "HISPSIMPLE"

## Joining, by = "LABFORCE"

## Joining, by = "EDUSIMPLE"

## Joining, by = "SCHLCOLL"

## Joining, by = "VOWHYNOT"

## Joining, by = "VOYNOTREG"

## Joining, by = "VOTEHOW"

## Joining, by = "VOTEWHEN"

## Joining, by = "VOREGHOW"

## Joining, by = "VOTED"

## Joining, by = "VOREG"

trim1618 <- na.omit(trim1618)

HW 8

Introduction

We are working with the Voter Turnout Data set from 2016 to 2018. We obtained the data from the Census Bureau and Department of Labor Statistics which was collected on a sample of Americans every two years immediately following the November election. This Data set is important because it provides details in to why people did or did not vote in the election. Over the past four years we have seen the power that comes from voting. By understanding the factors that impact someones decision to vote or not, we can encourage certain demographics to vote.

sumWhy<-trim1618%>%
  filter(WhyNotVote!="NIU")%>%
  group_by(YEAR, WhyNotVote)%>%
  summarise(n=n())

## `summarise()` regrouping output by 'YEAR' (override with `.groups` argument)

ggplot(sumWhy, aes(x=reorder(WhyNotVote, n), y=n, fill=as.factor(YEAR)))+
  geom_bar(stat="identity", position="dodge2")+
  #facet_grid(.~YEAR)+
  coord_flip()+ 
  #theme(legend.position = "none")+
  theme(axis.title.y=element_blank(),
        axis.ticks.y=element_blank())+
  ggtitle("Why don't people vote?")

Age and Voter Turn Out

ggplot(trim1618, aes(x=AGE, fill=Voted))+
  geom_boxplot()

Education and Voter Turn Out

trim1618$EduSimp<-factor(trim1618$EduSimp, levels =c("No school",
                                                     "Some school but no  diploma","High school graduate or GED","Some college but no degree","Associate degree","Bachelors degree", "Masters degree", "Professional or Doctoral degree"))

ggplot(trim1618, aes(Voted, fill=EduSimp))+ geom_bar()+  facet_grid(.~YEAR)

ggplot(trim1618, aes(x = YEAR, fill = Voted)) + 
  geom_bar(position = "fill") +
  facet_grid(.~EduSimp)

Method of Voting by State

method<-trim1618%>%
  filter(!MethodVote %in% c("Don't know", "NIU", "Refused", "No Response"))%>%
  group_by(YEAR, State, MethodVote)%>%
  summarise(nVoteM=n(), 
            nWgtVoteM=sum(VOSUPPWT, na.rm=TRUE))%>%
  mutate(state=State)

## `summarise()` regrouping output by 'YEAR', 'State' (override with `.groups` argument)

method%>%
  filter(YEAR==2016)%>%
  ggplot(aes(x=1, y=nWgtVoteM, fill = MethodVote)) +
  geom_col(position="fill") +
  #coord_flip() +
  facet_geo(~ state) +
  theme_bw()+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())+
  labs(x="", y = "", 
       caption = "Voting by mail is more popular in the West Coast (Based on data from IPUMS)", 
       fill = 'Method of Voting',
       title=paste("Most Popular Method of Voting in 2016"))

HW 9

Response: Voted

Explanatory: Age and Education

m1 <- glm(as.factor(Voted) ~ AGE, data = trim1618, family = "binomial")
summary(m1)

## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE, family = "binomial", data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9493  -1.2968   0.7173   0.8747   1.1328  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.3325070  0.0161454  -20.59   <2e-16 ***
## AGE          0.0243576  0.0003231   75.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186890  on 152823  degrees of freedom
## Residual deviance: 180929  on 152822  degrees of freedom
## AIC: 180933
## 
## Number of Fisher Scoring iterations: 4

exp(0.0243576)

## [1] 1.024657

m4 <- glm(as.factor(Voted) ~ EduSimp, data = trim1618, family = "binomial")
summary(m4)

## 
## Call:
## glm(formula = as.factor(Voted) ~ EduSimp, family = "binomial", 
##     data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1186  -1.3322   0.6095   0.8508   1.5814  
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                             -0.9129     0.1541  -5.923 3.15e-09 ***
## EduSimpHigh school graduate or GED       1.2698     0.1544   8.223  < 2e-16 ***
## EduSimpSome college but no degree        1.7428     0.1546  11.270  < 2e-16 ***
## EduSimpAssociate degree                  1.9535     0.1552  12.591  < 2e-16 ***
## EduSimpBachelors degree                  2.5020     0.1548  16.161  < 2e-16 ***
## EduSimpMasters degree                    2.9398     0.1564  18.800  < 2e-16 ***
## EduSimpProfessional or Doctoral degree   3.0450     0.1609  18.927  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 166356  on 141029  degrees of freedom
## Residual deviance: 157305  on 141023  degrees of freedom
##   (11794 observations deleted due to missingness)
## AIC: 157319
## 
## Number of Fisher Scoring iterations: 4

m2 <- glm(as.factor(Voted) ~ AGE+EduSimp, data = trim1618, family = "binomial")
summary(m2)

## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp, family = "binomial", 
##     data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5264  -1.0281   0.5854   0.8024   2.0758  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -2.7702372  0.1600943  -17.30   <2e-16
## AGE                                     0.0295594  0.0003672   80.50   <2e-16
## EduSimpHigh school graduate or GED      1.6397392  0.1587188   10.33   <2e-16
## EduSimpSome college but no degree       2.2866907  0.1590638   14.38   <2e-16
## EduSimpAssociate degree                 2.4077578  0.1595119   15.10   <2e-16
## EduSimpBachelors degree                 3.0054442  0.1592043   18.88   <2e-16
## EduSimpMasters degree                   3.3353605  0.1606828   20.76   <2e-16
## EduSimpProfessional or Doctoral degree  3.4069897  0.1651363   20.63   <2e-16
##                                           
## (Intercept)                            ***
## AGE                                    ***
## EduSimpHigh school graduate or GED     ***
## EduSimpSome college but no degree      ***
## EduSimpAssociate degree                ***
## EduSimpBachelors degree                ***
## EduSimpMasters degree                  ***
## EduSimpProfessional or Doctoral degree ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 166356  on 141029  degrees of freedom
## Residual deviance: 150338  on 141022  degrees of freedom
##   (11794 observations deleted due to missingness)
## AIC: 150354
## 
## Number of Fisher Scoring iterations: 4

m3 <- glm(as.factor(Voted) ~ AGE+EduSimp+AGE*EduSimp, data = trim1618, family = "binomial")
summary(m3)

## 
## Call:
## glm(formula = as.factor(Voted) ~ AGE + EduSimp + AGE * EduSimp, 
##     family = "binomial", data = trim1618)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5094  -1.0348   0.5774   0.8032   2.1352  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                -2.997368   0.648786  -4.620
## AGE                                         0.033033   0.009581   3.448
## EduSimpHigh school graduate or GED          1.902527   0.649431   2.930
## EduSimpSome college but no degree           2.484217   0.649643   3.824
## EduSimpAssociate degree                     2.418049   0.651267   3.713
## EduSimpBachelors degree                     3.271791   0.650344   5.031
## EduSimpMasters degree                       3.772473   0.654958   5.760
## EduSimpProfessional or Doctoral degree      3.691908   0.667559   5.530
## AGE:EduSimpHigh school graduate or GED     -0.004193   0.009597  -0.437
## AGE:EduSimpSome college but no degree      -0.002774   0.009609  -0.289
## AGE:EduSimpAssociate degree                 0.001304   0.009655   0.135
## AGE:EduSimpBachelors degree                -0.004377   0.009631  -0.454
## AGE:EduSimpMasters degree                  -0.007917   0.009749  -0.812
## AGE:EduSimpProfessional or Doctoral degree -0.004678   0.010077  -0.464
##                                            Pr(>|z|)    
## (Intercept)                                3.84e-06 ***
## AGE                                        0.000566 ***
## EduSimpHigh school graduate or GED         0.003395 ** 
## EduSimpSome college but no degree          0.000131 ***
## EduSimpAssociate degree                    0.000205 ***
## EduSimpBachelors degree                    4.88e-07 ***
## EduSimpMasters degree                      8.42e-09 ***
## EduSimpProfessional or Doctoral degree     3.19e-08 ***
## AGE:EduSimpHigh school graduate or GED     0.662165    
## AGE:EduSimpSome college but no degree      0.772849    
## AGE:EduSimpAssociate degree                0.892601    
## AGE:EduSimpBachelors degree                0.649488    
## AGE:EduSimpMasters degree                  0.416715    
## AGE:EduSimpProfessional or Doctoral degree 0.642497    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 166356  on 141029  degrees of freedom
## Residual deviance: 150312  on 141016  degrees of freedom
##   (11794 observations deleted due to missingness)
## AIC: 150340
## 
## Number of Fisher Scoring iterations: 5

Confusion Matrix for Age

pred <- predict(m1, newdata = trim1618, type = "response")
head(pred)

##         1         2         3         4         5         6 
## 0.7977882 0.7688846 0.7511198 0.8308616 0.7419020 0.8342571

conf_mat1 <- data.frame(vote = trim1618$Voted, 
                        predVote = pred > .5) %>% 
  group_by(vote, predVote) %>%
  summarise(n=n())

## `summarise()` regrouping output by 'vote' (override with `.groups` argument)

conf_mat1

## # A tibble: 2 x 3
## # Groups:   vote [2]
##   vote         predVote      n
##   <chr>        <lgl>     <int>
## 1 Did not vote TRUE      45954
## 2 Voted        TRUE     106870

Confusion Matrix for Education

pred2 <- predict(m4, newdata = trim1618, type = "response")
head(pred2)

##         1         2         3         4         5         6 
## 0.7389690 0.6963248 0.6963248        NA        NA 0.8304852

conf_mat2 <- data.frame(vote = trim1618$Voted, 
                        predVote = pred2 > .5) %>% 
  group_by(vote, predVote) %>%
  summarise(n=n())

## `summarise()` regrouping output by 'vote' (override with `.groups` argument)

conf_mat2

## # A tibble: 6 x 3
## # Groups:   vote [2]
##   vote         predVote      n
##   <chr>        <lgl>     <int>
## 1 Did not vote FALSE       147
## 2 Did not vote TRUE      38873
## 3 Did not vote NA         6934
## 4 Voted        FALSE        59
## 5 Voted        TRUE     101951
## 6 Voted        NA         4860