Setting up environment

# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
ess <- read_fst("All-ESS-Data.fst")

Homework 7 (2.5%) due Nov. 13 Submit on Quercus using assignment tab – not on the discussion board. Make sure to provide both your markdown file and R pubs link. If you do not submit both, you will be penalized 1 pt. out of the 2.5 total.

Task 1 Set up the calculations for an exploration of educ_level (as BA, No BA) and brncntr (recoded as yes/no *shown in tutorial 4) for Switzerland. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

Preparing the dataset

sw_data <- ess %>%
  filter(cntry == "CH") %>%
  mutate(
    brncntr = case_when(
      brncntr == 1 ~ "Yes",  # Recode 1 to "Yes"
      brncntr == 2 ~ "No",   # Recode 2 to "No"
      brncntr %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(brncntr)  # Keep other values as-is but ensure they are characters
    ),
  
    # Recoding education for rounds 1-4
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    
    # Re-coding education for rounds 5 and later
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    
    # Create educ_level column
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

Table of Expected Proportions

## Not necessary, but always good to rename when doing operations so you can backtrack to clean dataset
tab_dat <- sw_data

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$clsprty, tab_dat$educ_level)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##              [,1]         [,2]
## [1,] 0.1203823734 0.4080075823
## [2,] 0.1030310506 0.3491993777
## [3,] 0.0001346107 0.0004562312
## [4,] 0.0042806211 0.0145081529
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##             [,1]        [,2]
## [1,] 2037.471669 6905.528331
## [2,] 1743.800532 5910.199468
## [3,]    2.278287    7.721713
## [4,]   72.449513  245.550487
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

Calculating and interpreting Pearson’s chi-squared statistic

# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 275.2525
# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
# Using chisq.test for validation:
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = F))
## Warning in chisq.test(mytab, correct = F): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 275.25, df = 3, p-value < 2.2e-16

Task 2 Using the infer package and following the 4 steps outlined in the tutorial and during the lecture, conduct and visualize the hypothesis testing for domicil (recoded as urban/rural – see tutorial 4) and gincdif (Government should reduce differences in income levels) for the country of Netherlands. For gincdif, you must recode for the five categories (i.e, cannot keep as numeric).

See variable info here:

https://ess-search.nsd.no/en/variable/1ad9f42e-0c02-4b13-9a67-ee83d82ac576

Make decision over appropriate test given table provided during lecture. After conducting the 4 steps, interpret all the output and discuss your takeaways.

Preparing place of residence:

  nl_data <- ess %>%
  filter(cntry == "NL") %>%
 mutate(geo = recode(as.character(domicil), 
                      '1' = "Urban", 
                      '2' = "Urban", 
                      '3' = "Rural", 
                      '4' = "Rural", 
                      '5' = "Rural",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_))



# check
table(nl_data$geo)
## 
## Rural Urban 
## 12681  5623

Preparing income inequality:

nl_data <- nl_data %>%
  mutate(income = recode(as.character(gincdif), 
                      '1' = "Agree strongly", 
                      '2' = "Agree", 
                      '3' = "Neither agree nor disagree", 
                      '4' = "Disagree", 
                      '5' = "Disagree strongly",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_))   

# View the table of gincdif
table(nl_data$income)
## 
##                      Agree             Agree strongly 
##                       7894                       2896 
##                   Disagree          Disagree strongly 
##                       3681                        543 
## Neither agree nor disagree 
##                       3142

Step 1:

test_stat <- nl_data %>%
  specify(explanatory = geo, # change variable name for explanatory variable
          response = income) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic
## Warning: Removed 198 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  8.176048

Step 2

null_distribution <- nl_data %>%
  specify(explanatory = geo,
          response = income) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
  calculate(stat = "chisq")
## Warning: Removed 198 rows containing missing values.

Step 3

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1

p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1    0.17

step 4

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: income (factor)
## Explanatory: geo (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.398
##  2         2 3.65 
##  3         3 8.54 
##  4         4 0.417
##  5         5 0.152
##  6         6 6.63 
##  7         7 3.70 
##  8         8 5.82 
##  9         9 5.69 
## 10        10 4.90 
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

Task 3 First, talk about a research question you would like to consider, which identifies a main relationship between an explanatory variable and an outcome variable. Using the infer package and following the 4 steps outlined in the tutorial and during the lecture, conduct and visualize the hypothesis testing for your variables based on your research question. After conducting the 4 steps, interpret all the output and discuss your takeaways.

Answer: Research question: How does place of residence relate to level of education in Netherlands?

Preparing place of residence:

  nl_data <- ess %>%
  filter(cntry == "NL") %>%
 mutate(geo = recode(as.character(domicil), 
                      '1' = "Urban", 
                      '2' = "Peri-Rural", 
                      '3' = "Urban", 
                      '4' = "Rural", 
                      '5' = "Rural",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_))



# check
table(nl_data$geo)
## 
## Peri-Rural      Rural      Urban 
##       1891       7999       8414

Preparing level of education:

# Re-coding education
nl_data <- nl_data %>%
  mutate(
    # Set 'other' to NA for rounds 1-4
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    # Set 'other' to NA for rounds 5 and later
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    # Create educ_level column
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

# View the table of educ_level
table(nl_data$educ_level)
## 
##    BA No BA 
##  5054 13275

Step 1:

test_stat <- nl_data %>%
  specify(explanatory = geo, # change variable name for explanatory variable
          response = educ_level) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic
## Warning: Removed 25 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  191.2812

Step 2

null_distribution <- nl_data %>%
  specify(explanatory = geo,
          response = educ_level) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
  calculate(stat = "chisq")
## Warning: Removed 25 rows containing missing values.

Step 3

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

step 4

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: educ_level (factor)
## Explanatory: geo (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 3.38 
##  2         2 0.432
##  3         3 3.28 
##  4         4 3.21 
##  5         5 2.38 
##  6         6 0.154
##  7         7 1.63 
##  8         8 1.85 
##  9         9 1.97 
## 10        10 0.328
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

Both the p value and the graphs rejects the null hypothesize of independence and show that there is statistically significant association between the place of residence (rural/peri-rural/city) and level of education. The significance of the test implies that geographic location (urban vs. rural) may have an effect on the level of education attained by individuals in the Netherlands. This could reflect differences in access to educational resources, socio-economic factors, or other regional disparities. However, as correlation does not necessarily mean causation, further exploration is needed (proportions of individuals at each education level within urban and rural areas, controlled groups that eliminate other possible factors influencing such a difference, etc.).