# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'infer' was built under R version 4.3.3
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
italy_data <- read.fst("italy_data.fst")
Using Italy ESS Data, to interpret impact education has on choosing a high paying job
italy_task1 <- italy_data %>%
mutate(
ID = case_when(
ipjbhin %in% 0:4 ~ "high income",
ipjbhin %in% 6:10 ~ "low income",
ipjbhin %in% c(77, 88) ~ NA_character_,
TRUE ~ NA_character_
),
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
) %>%
filter(!is.na(ID), !is.na(educ.ba))
table(italy_task1$educ.ba)
##
## No BA BA or more
## 0 0
# Now, proceed with the Chi-Square test for italy_data with the recoded variables
mytab <- table(italy_task1$ID, italy_task1$educ.ba)
mytab
## < table of extent 0 x 2 >
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
expected <- outer(rsums, csums) / N
expected
## No BA BA or more
alpha <- 0.05
df <- (nrow(mytab) - 1) * (ncol(mytab) - 1)
critical_value <- qchisq(1 - alpha, df)
## Warning in qchisq(1 - alpha, df): NaNs produced
observed <- mytab
chi_squared <- sum((observed - expected)^2 / expected)
p_value <- pchisq(chi_squared, df, lower.tail = FALSE)
## Warning in pchisq(chi_squared, df, lower.tail = FALSE): NaNs produced
cat("Critical Value:", critical_value, "\n")
## Critical Value: NaN
cat("Chi-Squared Statistic:", chi_squared, "\n")
## Chi-Squared Statistic: 0
cat("P-Value:", p_value, "\n")
## P-Value: NaN
##Interpreting output
italy_task2 <- italy_data %>%
mutate(
geo = recode(as.character(domicil),
'1' = "High Income",
'2' = "Lower Income",
'3' = NA_character_,
'4' = NA_character_,
'5' = NA_character_,
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_),
socben = case_when(
sbbsntx == 1 ~ "Agree Strongly",
sbbsntx == 2 ~ "Agree",
sbbsntx == 3 ~ "Neither agree nor disagree",
sbbsntx == 4 ~ "Disagree",
sbbsntx == 5 ~ "Disagree strongly",
sbbsntx %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(sbbsntx)
)
) %>%
filter(!is.na(geo), !is.na(socben)) # Ensuring to remove NA values for cleaner analysis
table(italy_task2$geo)
##
## High Income Lower Income
## 245 135
table(italy_task2$socben)
##
## Agree Agree Strongly
## 110 37
## Disagree Disagree strongly
## 63 22
## Neither agree nor disagree
## 148
# Step 1: Calculate the test statistic
test_stat <- italy_task2 %>%
specify(response = socben, explanatory = geo) %>%
hypothesize(null = "independence") %>%
calculate(stat = "Chisq")
print(test_stat$stat)
## X-squared
## 12.72196
# Steps 2 & 3: Simulate the null distribution and calculate the p-value
null_distribution <- italy_task2 %>%
specify(response = socben, explanatory = geo) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
# p-value
p_value <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "greater")
p_value
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.004
##Visualizing Null Distribution
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)