Setting up environment

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
ess <- read_fst("All-ESS-Data.fst")

#Ensuring data is from 2002 to 2022
ess$year <- NA 
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022) 
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

ess$weight <- ess$dweight * ess$pweight
survey_design <- svydesign(ids = ~1, data = ess, weights = ~weight)

Data preparation

Italy has its own division of level of education so I recoded its 21 levels of education with the help of translator and the below websites: https://www.internations.org/italy-expats/guide/education#:~:text=The%20education%20system%20in%20Italy%20is%20divided%20into%20five%20main,your%20children%20or%20for%20yourself. https://eurydice.eacea.ec.europa.eu/national-education-systems/italy/overview#:~:text=secondary%20education’).-,Stages%20of%20the%20education%20system,secondary%2C%20tertiary%20and%20adult%20education.

# Recoding for Italy 
italy_data <- ess %>%
  filter(cntry == "IT") %>%
  mutate(
    # Recode education levels
    edlveit = case_when(
      edlveit %in% c(5555, 7777, 8888, 9999) ~ NA_real_,
      TRUE ~ edlveit
    ),
    edu_level = case_when(
      edlveit %in% c(1, 2) ~ "Under Secondary",
      edlveit %in% c(3, 4, 5, 6, 7, 8, 9) ~ "Secondary",
      edlveit %in% c(10, 11, 12, 13, 14, 15) ~ "Postsecondary",
      edlveit %in% c(16, 17, 18, 19, 20, 21) ~ "Postgrad",
      TRUE ~ NA_character_
    ),
    # Recode voting variable
    vote_level = recode(vote,
                  `1` = "Yes",
                  `2` = "No",
                  `3` = NA_character_,
                  `7` = NA_character_,
                  `8` = NA_character_,
                  `9` = NA_character_),
    
    # Recode income variable
    household_income = case_when(
      hinctnta %in% c("J") ~ 1,
      hinctnta %in% c("R") ~ 2,
      hinctnta %in% c("C") ~ 3,
      hinctnta %in% c("M") ~ 4,
      hinctnta %in% c("F") ~ 5,
      hinctnta %in% c("S") ~ 6,
      hinctnta %in% c("K") ~ 7,
      hinctnta %in% c("P") ~ 8,
      hinctnta %in% c("D") ~ 9,
      hinctnta %in% c("H") ~ 10,
      hinctnta %in% c("77", "88", "99") ~ NA_real_,
      TRUE ~ as.numeric(hinctnta)
    )
    
  ) %>%
  # Remove rows with NA
  filter(!is.na(edu_level))%>%
  filter(!is.na(vote_level))%>%
  filter(!is.na(household_income))

# View the distribution
table(italy_data$edu_level)
## 
##        Postgrad   Postsecondary       Secondary Under Secondary 
##             318             272            1952             367
table(italy_data$vote_level)
## 
##   No  Yes 
##  604 2305
table(italy_data$household_income)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 181 390 447 396 326 325 379 215 166  84
rm(ess)

Table for Context section

# Define the levels in order from lowest to highest education
edu_levels_ordered <- c("Under Secondary", "Secondary", "Postsecondary", "Postgrad")

# Convert 'edu_level' to a factor with ordered levels and then to numeric
italy_data$edu_level <- factor(italy_data$edu_level, levels = edu_levels_ordered)
italy_data$edu_level_numeric <- as.numeric(italy_data$edu_level)

# Convert 'vote_level' to numeric: 'Yes' = 1, 'No' = 0
italy_data$vote_level_numeric <- ifelse(italy_data$vote_level == "Yes", 1, 0)

# Generate the table of descriptive statistics
table1 <- datasummary_skim(italy_data %>% select(vote_level_numeric, edu_level_numeric, household_income), output = "flextable")
## Warning: The histogram argument is only supported for (a) output types "default",
##   "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
##   ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
##   compiled to PDF (via kableExtra)  or HTML (via kableExtra or gt). Use
##   `histogram=FALSE` to silence this warning.
# Print the table
print(table1)
## a flextable object.
## col_keys: ` `, `Unique (#)`, `Missing (%)`, `Mean`, `SD`, `Min`, `Median`, `Max` 
## header has 1 row(s) 
## body has 3 row(s) 
## original dataset sample: 
##                      Unique (#) Missing (%) Mean  SD Min Median  Max
## 1 vote_level_numeric          2           0  0.8 0.4 0.0    1.0  1.0
## 2  edu_level_numeric          4           0  2.2 0.8 1.0    2.0  4.0
## 3   household_income         10           0  4.9 2.4 1.0    5.0 10.0

Findings

Hypothesis testing using infer

Step 1:

test_stat <- italy_data %>%
  specify(explanatory = edu_level, # change variable name for explanatory variable
          response = vote_level) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic

print(test_stat$stat)
## X-squared 
##  89.18693

Step 2

null_distribution <- italy_data %>%
  specify(explanatory = edu_level,
          response = vote_level) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
  calculate(stat = "chisq")

Step 3

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

step 4

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: vote_level (factor)
## Explanatory: edu_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 0.193 
##  2         2 1.14  
##  3         3 0.0690
##  4         4 1.60  
##  5         5 3.25  
##  6         6 0.454 
##  7         7 2.16  
##  8         8 3.89  
##  9         9 0.442 
## 10        10 4.57  
## # ℹ 990 more rows

Test Chi-square using another method:

# Perform the Chi-Square
chisq_results <- chisq.test(italy_data$edu_level, italy_data$vote_level)

chisq_results
## 
##  Pearson's Chi-squared test
## 
## data:  italy_data$edu_level and italy_data$vote_level
## X-squared = 89.187, df = 3, p-value < 2.2e-16

Modelling

#First Model
model1 <- glm(vote_level_numeric ~ edu_level, family = binomial, data = italy_data)

#Coefficients and intercept
coefficients <- coef(model1)
print(coefficients)
##            (Intercept)     edu_levelSecondary edu_levelPostsecondary 
##              0.5886345              0.7298799              1.4621752 
##      edu_levelPostgrad 
##              1.6731286
# Second Model: Two predictor model (level of education and income level predicting vote)
model2 <- glm(vote_level_numeric ~ edu_level + household_income, family = binomial, data = italy_data)

# Third model with interaction term (education level and household income)
model3 <- glm(vote_level_numeric ~ edu_level * household_income, family = binomial, data = italy_data)

# Get the summary of each model
modelsummary(
  list(model1, model2, model3),
  title = "Regression Model Summary: Effects of Education and Income on Voting",
  fmt = 1,
  estimate = c("{estimate} ({std.error}){stars}"),
  statistic = NULL,
)
Regression Model Summary: Effects of Education and Income on Voting
 (1)   (2)   (3)
(Intercept) 0.6 (0.1)*** 0.2 (0.1) 0.1 (0.2)
edu_levelSecondary 0.7 (0.1)*** 0.6 (0.1)*** 0.7 (0.2)**
edu_levelPostsecondary 1.5 (0.2)*** 1.2 (0.2)*** 0.8 (0.5)
edu_levelPostgrad 1.7 (0.2)*** 1.3 (0.2)*** 1.8 (0.6)**
household_income 0.1 (0.0)*** 0.1 (0.1)*
edu_levelSecondary × household_income 0.0 (0.1)
edu_levelPostsecondary × household_income 0.1 (0.1)
edu_levelPostgrad × household_income −0.1 (0.1)
Num.Obs. 2909 2909 2909
AIC 2890.0 2860.6 2863.8
BIC 2913.9 2890.5 2911.7
Log.Lik. −1440.998 −1425.292 −1423.923
RMSE 0.40 0.40 0.40

visual for the interaction model

# Assuming you have a logistic regression model with interaction between edu_level and income
model <- glm(vote_level_numeric ~ edu_level * household_income, family = "binomial", data = italy_data)

interaction_plot <- effect("edu_level*household_income", model, na.rm=TRUE)

plot(interaction_plot,
     main="Interaction Effect of Education Level and Household Income on Voting",
     xlab="Household Income Decile (1=1st Decile, ..., 10=10th Decile)",
     ylab="Predicted Probability of Voting (0=no vote, 1=voted)")

A visual that didn’t work for some reason

model <- glm(vote_level_numeric ~ edu_level * household_income, family = “binomial”, data = italy_data)

#Ensure that edu_level is a factor italy_data\(edu_level <- factor(italy_data\)edu_level, levels = c(“Under Secondary”, “Secondary”, “Postsecondary”, “Postgrad”))

#Generate interaction plot interaction_plot <- interact_plot(model, pred = edu_level, modx = household_income, x.label = “Household Income Decile”, legend.title = “Education Level”)

#Plot the interaction effect ggplot(interaction_plot, aes(x = modx, y = predicted, color = factor(pred), group = factor(pred))) + geom_line() + geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) + labs(title = “Interaction Effect of Education Level and Household Income on Voting”, x = “Household Income Decile”, y = “Predicted Probability of Voting”) + theme_minimal()