# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
ess <- read_fst("All-ESS-Data.fst")
#Ensuring data is from 2002 to 2022
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
ess$weight <- ess$dweight * ess$pweight
survey_design <- svydesign(ids = ~1, data = ess, weights = ~weight)
Italy has its own division of level of education so I recoded its 21 levels of education with the help of translator and the below websites: https://www.internations.org/italy-expats/guide/education#:~:text=The%20education%20system%20in%20Italy%20is%20divided%20into%20five%20main,your%20children%20or%20for%20yourself. https://eurydice.eacea.ec.europa.eu/national-education-systems/italy/overview#:~:text=secondary%20education’).-,Stages%20of%20the%20education%20system,secondary%2C%20tertiary%20and%20adult%20education.
# Recoding for Italy
italy_data <- ess %>%
filter(cntry == "IT") %>%
mutate(
# Recode education levels
edlveit = case_when(
edlveit %in% c(5555, 7777, 8888, 9999) ~ NA_real_,
TRUE ~ edlveit
),
edu_level = case_when(
edlveit %in% c(1, 2) ~ "Under Secondary",
edlveit %in% c(3, 4, 5, 6, 7, 8, 9) ~ "Secondary",
edlveit %in% c(10, 11, 12, 13, 14, 15) ~ "Postsecondary",
edlveit %in% c(16, 17, 18, 19, 20, 21) ~ "Postgrad",
TRUE ~ NA_character_
),
# Recode voting variable
vote_level = recode(vote,
`1` = "Yes",
`2` = "No",
`3` = NA_character_,
`7` = NA_character_,
`8` = NA_character_,
`9` = NA_character_),
# Recode income variable
household_income = case_when(
hinctnta %in% c("J") ~ 1,
hinctnta %in% c("R") ~ 2,
hinctnta %in% c("C") ~ 3,
hinctnta %in% c("M") ~ 4,
hinctnta %in% c("F") ~ 5,
hinctnta %in% c("S") ~ 6,
hinctnta %in% c("K") ~ 7,
hinctnta %in% c("P") ~ 8,
hinctnta %in% c("D") ~ 9,
hinctnta %in% c("H") ~ 10,
hinctnta %in% c("77", "88", "99") ~ NA_real_,
TRUE ~ as.numeric(hinctnta)
)
) %>%
# Remove rows with NA
filter(!is.na(edu_level))%>%
filter(!is.na(vote_level))%>%
filter(!is.na(household_income))
# View the distribution
table(italy_data$edu_level)
##
## Postgrad Postsecondary Secondary Under Secondary
## 318 272 1952 367
table(italy_data$vote_level)
##
## No Yes
## 604 2305
table(italy_data$household_income)
##
## 1 2 3 4 5 6 7 8 9 10
## 181 390 447 396 326 325 379 215 166 84
rm(ess)
# Define the levels in order from lowest to highest education
edu_levels_ordered <- c("Under Secondary", "Secondary", "Postsecondary", "Postgrad")
# Convert 'edu_level' to a factor with ordered levels and then to numeric
italy_data$edu_level <- factor(italy_data$edu_level, levels = edu_levels_ordered)
italy_data$edu_level_numeric <- as.numeric(italy_data$edu_level)
# Convert 'vote_level' to numeric: 'Yes' = 1, 'No' = 0
italy_data$vote_level_numeric <- ifelse(italy_data$vote_level == "Yes", 1, 0)
# Generate the table of descriptive statistics
table1 <- datasummary_skim(italy_data %>% select(vote_level_numeric, edu_level_numeric, household_income), output = "flextable")
## Warning: The histogram argument is only supported for (a) output types "default",
## "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
## ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
## compiled to PDF (via kableExtra) or HTML (via kableExtra or gt). Use
## `histogram=FALSE` to silence this warning.
# Print the table
print(table1)
## a flextable object.
## col_keys: ` `, `Unique (#)`, `Missing (%)`, `Mean`, `SD`, `Min`, `Median`, `Max`
## header has 1 row(s)
## body has 3 row(s)
## original dataset sample:
## Unique (#) Missing (%) Mean SD Min Median Max
## 1 vote_level_numeric 2 0 0.8 0.4 0.0 1.0 1.0
## 2 edu_level_numeric 4 0 2.2 0.8 1.0 2.0 4.0
## 3 household_income 10 0 4.9 2.4 1.0 5.0 10.0
Step 1:
test_stat <- italy_data %>%
specify(explanatory = edu_level, # change variable name for explanatory variable
response = vote_level) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic
print(test_stat$stat)
## X-squared
## 89.18693
Step 2
null_distribution <- italy_data %>%
specify(explanatory = edu_level,
response = vote_level) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
calculate(stat = "chisq")
Step 3
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
step 4
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.
null_distribution
## Response: vote_level (factor)
## Explanatory: edu_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 0.193
## 2 2 1.14
## 3 3 0.0690
## 4 4 1.60
## 5 5 3.25
## 6 6 0.454
## 7 7 2.16
## 8 8 3.89
## 9 9 0.442
## 10 10 4.57
## # ℹ 990 more rows
Test Chi-square using another method:
# Perform the Chi-Square
chisq_results <- chisq.test(italy_data$edu_level, italy_data$vote_level)
chisq_results
##
## Pearson's Chi-squared test
##
## data: italy_data$edu_level and italy_data$vote_level
## X-squared = 89.187, df = 3, p-value < 2.2e-16
#First Model
model1 <- glm(vote_level_numeric ~ edu_level, family = binomial, data = italy_data)
#Coefficients and intercept
coefficients <- coef(model1)
print(coefficients)
## (Intercept) edu_levelSecondary edu_levelPostsecondary
## 0.5886345 0.7298799 1.4621752
## edu_levelPostgrad
## 1.6731286
# Second Model: Two predictor model (level of education and income level predicting vote)
model2 <- glm(vote_level_numeric ~ edu_level + household_income, family = binomial, data = italy_data)
# Third model with interaction term (education level and household income)
model3 <- glm(vote_level_numeric ~ edu_level * household_income, family = binomial, data = italy_data)
# Get the summary of each model
modelsummary(
list(model1, model2, model3),
title = "Regression Model Summary: Effects of Education and Income on Voting",
fmt = 1,
estimate = c("{estimate} ({std.error}){stars}"),
statistic = NULL,
)
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | 0.6 (0.1)*** | 0.2 (0.1) | 0.1 (0.2) |
| edu_levelSecondary | 0.7 (0.1)*** | 0.6 (0.1)*** | 0.7 (0.2)** |
| edu_levelPostsecondary | 1.5 (0.2)*** | 1.2 (0.2)*** | 0.8 (0.5) |
| edu_levelPostgrad | 1.7 (0.2)*** | 1.3 (0.2)*** | 1.8 (0.6)** |
| household_income | 0.1 (0.0)*** | 0.1 (0.1)* | |
| edu_levelSecondary × household_income | 0.0 (0.1) | ||
| edu_levelPostsecondary × household_income | 0.1 (0.1) | ||
| edu_levelPostgrad × household_income | −0.1 (0.1) | ||
| Num.Obs. | 2909 | 2909 | 2909 |
| AIC | 2890.0 | 2860.6 | 2863.8 |
| BIC | 2913.9 | 2890.5 | 2911.7 |
| Log.Lik. | −1440.998 | −1425.292 | −1423.923 |
| RMSE | 0.40 | 0.40 | 0.40 |
# Assuming you have a logistic regression model with interaction between edu_level and income
model <- glm(vote_level_numeric ~ edu_level * household_income, family = "binomial", data = italy_data)
interaction_plot <- effect("edu_level*household_income", model, na.rm=TRUE)
plot(interaction_plot,
main="Interaction Effect of Education Level and Household Income on Voting",
xlab="Household Income Decile (1=1st Decile, ..., 10=10th Decile)",
ylab="Predicted Probability of Voting (0=no vote, 1=voted)")
model <- glm(vote_level_numeric ~ edu_level * household_income, family = “binomial”, data = italy_data)
#Ensure that edu_level is a factor italy_data\(edu_level <- factor(italy_data\)edu_level, levels = c(“Under Secondary”, “Secondary”, “Postsecondary”, “Postgrad”))
#Generate interaction plot interaction_plot <- interact_plot(model, pred = edu_level, modx = household_income, x.label = “Household Income Decile”, legend.title = “Education Level”)
#Plot the interaction effect ggplot(interaction_plot, aes(x = modx, y = predicted, color = factor(pred), group = factor(pred))) + geom_line() + geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) + labs(title = “Interaction Effect of Education Level and Household Income on Voting”, x = “Household Income Decile”, y = “Predicted Probability of Voting”) + theme_minimal()