# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'infer' was built under R version 4.3.3
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
italy_data <- read.fst("italy_data.fst")

Recoding

Using Italy ESS Data, to interpret impact education has on choosing a high paying job

italy_task1 <- italy_data %>%
  mutate(
    ID = case_when(
      ipjbhin %in% 0:4 ~ "high income",                     
      ipjbhin %in% 6:10 ~ "low income",                  
      ipjbhin %in% c(77, 88) ~ NA_character_,    
      TRUE ~ NA_character_                           
    ),
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))  
  ) %>%
  filter(!is.na(ID), !is.na(educ.ba))  
table(italy_task1$educ.ba)
## 
##      No BA BA or more 
##          0          0
# Now, proceed with the Chi-Square test for italy_data with the recoded variables
mytab <- table(italy_task1$ID, italy_task1$educ.ba)
mytab
## < table of extent 0 x 2 >
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
expected <- outer(rsums, csums) / N
expected
##      No BA BA or more
alpha <- 0.05
df <- (nrow(mytab) - 1) * (ncol(mytab) - 1)
critical_value <- qchisq(1 - alpha, df)
## Warning in qchisq(1 - alpha, df): NaNs produced
observed <- mytab
chi_squared <- sum((observed - expected)^2 / expected)

p_value <- pchisq(chi_squared, df, lower.tail = FALSE)
## Warning in pchisq(chi_squared, df, lower.tail = FALSE): NaNs produced
cat("Critical Value:", critical_value, "\n")
## Critical Value: NaN
cat("Chi-Squared Statistic:", chi_squared, "\n")
## Chi-Squared Statistic: 0
cat("P-Value:", p_value, "\n")
## P-Value: NaN

##Interpreting output

italy_task2 <- italy_data %>%
  mutate(
    geo = recode(as.character(domicil), 
                 '1' = "High Income", 
                 '2' = "Lower Income",
                 '3' = NA_character_,
                 '4' = NA_character_,
                 '5' = NA_character_,
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_),
    socben = case_when(
      sbbsntx == 1 ~ "Agree Strongly",  
      sbbsntx == 2 ~ "Agree",
      sbbsntx == 3 ~ "Neither agree nor disagree",
      sbbsntx == 4 ~ "Disagree",
      sbbsntx == 5 ~ "Disagree strongly",
      sbbsntx %in% c(7, 8, 9) ~ NA_character_, 
      TRUE ~ as.character(sbbsntx)
    )
  ) %>%
  filter(!is.na(geo), !is.na(socben))  # Ensuring to remove NA values for cleaner analysis
table(italy_task2$geo)
## 
##  High Income Lower Income 
##          245          135
table(italy_task2$socben)
## 
##                      Agree             Agree Strongly 
##                        110                         37 
##                   Disagree          Disagree strongly 
##                         63                         22 
## Neither agree nor disagree 
##                        148
# Step 1: Calculate the test statistic
test_stat <- italy_task2 %>%
  specify(response = socben, explanatory = geo) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")
print(test_stat$stat)
## X-squared 
##  12.72196
# Steps 2 & 3: Simulate the null distribution and calculate the p-value
null_distribution <- italy_task2 %>%
  specify(response = socben, explanatory = geo) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# p-value
p_value <- null_distribution %>%
  get_pvalue(obs_stat = test_stat, direction = "greater")
p_value
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.004

##Visualizing Null Distribution

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)