SOC202 Final Project - Adam Chmelauskas

Introduction

My Research Question

How has millennial’s perception of their vote having an influence on politics effected their voter turnout in the United Kingdom

Prelininary Analysis of Data

Loading ESS Dataset and Various Packages

# Installing and loading Various Packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "broom", "remotes") 

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'remotes' was built under R version 4.3.2

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "broom"        "modelsummary" "fst"          "infer"        "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[6]]
##  [1] "remotes"      "broom"        "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"

Loading Data

#Loading ESS Dataset
setwd("C:/Users/2expl/Desktop/UToronto/Year 2/SOC202")
ess <- read_fst("All-ESS-Data.fst")

Subsetting and Cleaning Required Data

GB_subset <- ess %>% 
  filter(cntry == "GB") %>%
  mutate(
    psppipla = case_when(
      psppipla %in% c(7, 8, 9) ~ NA_character_,
      psppipla == 1 ~ "Not At All",
      psppipla == 2 ~ "Very Little",
      psppipla == 3 ~ "Some",
      psppipla == 4 ~ "A Lot",
      psppipla == 5 ~ "A Great Deal",
      TRUE ~ as.character(psppipla)
    ),
    
    vote = case_when(
      vote %in% c(3, 7, 8, 9) ~ NA_character_,
      vote == 1 ~ "Yes",
      vote == 2 ~ "No",
      
      TRUE ~ as.character(vote)
      ),
    
    age = ifelse(agea == 999, NA, agea),
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  
    ),
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  ) %>%
  
  # Selecting only chosen variables 
  select(psppipla, gen, vote) %>%

  # Filtering NAs
  filter(!is.na(psppipla)) %>%
  filter(!is.na(vote)) %>%
  filter(!is.na(gen)) %>%
  
  # Filtering for only Millenials
  filter(gen == "Millennials")

Creating a Data Table

# Replicating the GB_subset dataset but keeping all values as numeric in order to meet the confines of the datasummary_skim output.

GB_df <- ess %>% 
  filter(cntry == "GB") %>%
  mutate(
    psppipla = case_when(
      psppipla %in% c(7, 8, 9) ~ NA_real_,
      TRUE ~ as.numeric(psppipla)
    ),
    
    vote = case_when(
      vote %in% c(3, 7, 8, 9) ~ NA_real_,
      TRUE ~ vote
    ),
    
    # Cohorting and mutating respondent age to isolate selected age group.  
    age = ifelse(agea == 999, NA, agea),
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
    gen = case_when(
      yrbrn %in% 1900:1945 ~ 1,
      yrbrn %in% 1946:1964 ~ 2,
      yrbrn %in% 1965:1979 ~ 3,
      yrbrn %in% 1980:2000 ~ 4,
      TRUE ~ as.numeric(cohort) 
    )
  ) %>%

  # Filtering NAs
  filter(!is.na(psppipla)) %>%
  filter(!is.na(vote)) %>%
  filter(!is.na(gen)) %>%
  
  filter(gen == 4)

# Creating the flextable
GB_table <- datasummary_skim(GB_df  %>% dplyr::select(psppipla, vote, gen), output = "flextable")

## Warning: The histogram argument is only supported for (a) output types "default",
##   "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
##   ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
##   compiled to PDF (via kableExtra)  or HTML (via kableExtra or gt). Use
##   `histogram=FALSE` to silence this warning.

GB_table

	Unique (#)	Mean	SD	Min	Median	Max
psppipla	5	2.3	0.8	1.0	2.0	5.0
vote	2	1.4	0.5	1.0	1.0	2.0
gen	1	4.0	0.0	4.0	4.0	4.0

# renaming values to aid in readability
GB_df_named <- GB_df %>%
  rename(
    `Political System Allows Influence` = psppipla,
    `Generation (Millennials)` = gen,
    `Did Respondant Vote?` = vote,
  )
  
  GB_tableV2 <- datasummary_skim(GB_df_named %>% dplyr::select(`Political System Allows Influence`, `Generation (Millennials)`, `Did Respondant Vote?`), title = "Table 1. Descriptive statistics of main variables", output = "flextable")

## Warning: The histogram argument is only supported for (a) output types "default",
##   "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
##   ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
##   compiled to PDF (via kableExtra)  or HTML (via kableExtra or gt). Use
##   `histogram=FALSE` to silence this warning.

GB_tableV2

Table 1. Descriptive statistics of main variables
	Unique (#)	Mean	SD	Min	Median	Max
Political System Allows Influence	5	2.3	0.8	1.0	2.0	5.0
Generation (Millennials)	1	4.0	0.0	4.0	4.0	4.0
Did Respondant Vote?	2	1.4	0.5	1.0	1.0	2.0

Hypothisis Testing

Getting Null Distruibution

# converting psppipla to a factor 
GB_subset$psppipla <- factor(GB_subset$psppipla)

# Test Stat
test_stat <- GB_subset %>%
  specify(explanatory = psppipla, 
          response = vote) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq") 
print(test_stat$stat)

## X-squared 
##  29.37997

# Null Distribution
null_dist <- GB_subset %>%
   specify(vote ~ psppipla) %>%
   hypothesize(null = "independence") %>%
   generate(reps = 1000, type = "permute") %>%
   calculate(stat = "Chisq")

p_val <- null_dist %>% # 
  get_pvalue(obs_stat = test_stat, direction = "greater")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.

# Simulating Null Distribution
conf_int <- null_dist%>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_dist %>%
  visualize(data, bins = 10, method = "simulation", dens_color = "black") +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)

## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf

Generating Regression

# filtering additional pridictors to use in analysis
  GB_df <- GB_df %>%
  mutate(
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),
    
# More Predictors... 
GB_df <- GB_df %>%
  # Clean Left-right scale
  mutate(lrscale = ifelse(lrscale %in% c(77, 88, 99), NA, lrscale)) %>%
  # Recode Political Interest
  mutate(polintr = case_when(
    polintr %in% c(1, 2) ~ "Interested",
    polintr %in% c(3, 4) ~ "Not Interested",
    polintr %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(polintr)
  )) %>%
  # Recode work for party
  mutate(wrkprty = case_when(
    wrkprty == 1 ~ "Yes",
    wrkprty == 2 ~ "No",
    wrkprty %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(wrkprty)
  )) %>%
  # Recode worked for civil org/association
  mutate(workcivil = case_when(
    wrkorg == 1 ~ "Yes",
    wrkorg == 2 ~ "No",
    wrkorg %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(wrkorg)
  )) %>%
  # Recode participation in protest
  mutate(protest = case_when(
    pbldmn == 1 ~ "Yes",
    pbldmn == 2 ~ "No",
    pbldmn %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(pbldmn)
  ))
)

GB_df$weight <- GB_df$dweight * GB_df$pweight
model1 <- lm(vote ~ psppipla, data = GB_df, weights = weight)
model2 <- lm(vote ~ psppipla + polintr, data = GB_df, weights = weight)
model3 <- lm(vote ~ psppipla + polintr * pbldmn, data = GB_df, weights = weight)

# Displaying Models 
modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")

	(1)	(2)	(3)
psppipla	−0.1 (0.0)***	−0.1 (0.0)***	−0.1 (0.0)***
polintrNot Interested		0.3 (0.0)***	1.3 (0.3)***
pbldmn			0.1 (0.1)+
polintrNot Interested × pbldmn			−0.5 (0.2)**
Num.Obs.	1173	1173	929
R2	0.025	0.145	0.154
R2 Adj.	0.024	0.143	0.150
AIC	1734.9	1583.5	1271.8
BIC	1750.1	1603.7	1300.9
Log.Lik.	−864.432	−787.735	−629.924
RMSE	0.47	0.45	0.45

Regression Equations

equatiomatic::extract_eq(model1, use_coefs = TRUE)

\[ \operatorname{\widehat{vote}} = 1.56 - 0.09(\operatorname{psppipla}) \]

equatiomatic::extract_eq(model2, use_coefs = TRUE)

\[ \operatorname{\widehat{vote}} = 1.32 - 0.06(\operatorname{psppipla}) + 0.33(\operatorname{polintr}_{\operatorname{Not\ Interested}}) \]

equatiomatic::extract_eq(model3, use_coefs = TRUE)

\[ \operatorname{\widehat{vote}} = 1.15 - 0.06(\operatorname{psppipla}) + 1.32(\operatorname{polintr}_{\operatorname{Not\ Interested}}) + 0.11(\operatorname{pbldmn}) - 0.5(\operatorname{polintr}_{\operatorname{Not\ Interested}} \times \operatorname{pbldmn}) \]

Modeling

ggplot(GB_subset, aes(x = factor(psppipla), fill = vote, group = vote)) +
  geom_bar(position = "dodge", stat = "count") +
  labs(
    title = "Millenial's Perception of Influence on Politics and How That Affects Voting Patterns",
    x = "Political System Allows People to Have an Influence on Politics",
    y = "Count",
    fill = "Voted in Last Election (Yes/No)"
  ) +
  scale_fill_manual(values = c("Yes" = "#0dc813", "No" = "#c82e0d"))