Homework Assignment 3

Setting up the environment:

## To ensure the All-ESS-Data file will run, clear the workspace first
rm(list=ls()); gc() 
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526944 28.2    1170850 62.6         NA   669408 35.8
## Vcells 970728  7.5    8388608 64.0      16384  1851777 14.2
# List of packages
packages <- c("tidyverse", "infer", "fst") 

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
ess <- read_fst("All-ESS-Data.fst")
france_data <- read_fst("france_data.fst")

Task 1

Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

france_data <- france_data %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  
      TRUE ~ as.character(clsprty) 
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),
 
    # Recoding education with case_when for consistency
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
  # Ensure all NAs are handled uniformly
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba)  # Force educ.ba to be treated as character
  )
france_data <- france_data %>%
  mutate(
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",                  # 0-3 recoded as "Left"
      lrscale %in% 7:10 ~ "Right",                # 7-10 recoded as "Right"
      lrscale %in% c(77, 88, 99) ~ NA_character_  # missing codes to NA; 4-6 ("moderates") also fall through to NA
    ),
    Gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",                   
      yrbrn %in% 1946:1964 ~ "2",                   
      yrbrn %in% 1965:1979 ~ "3",                   
      yrbrn %in% 1980:1996 ~ "4",                   
      TRUE ~ as.character(yrbrn)                    
    ),

    Gen = factor(Gen, levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
# Rename the prepared data frame for the tabulation steps
tab_dat <- france_data

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$clsprty, tab_dat$educ.ba)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##           [,1]      [,2]
## [1,] 0.1101248 0.3815066
## [2,] 0.1138739 0.3944946
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##          [,1]     [,2]
## [1,] 2059.444 7134.556
## [2,] 2129.556 7377.444
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

If the chi-squared statistic exceeds the critical value of 3.841 at the 0.05 significance level, we reject the null hypothesis of independence. That would mean that feeling close to a party and education level are associated rather than independent.

# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 166.1849

The critical value is 3.841 and the observed statistic of 166.185 is far higher. The deviation of the observed frequencies from those expected under independence is therefore statistically significant, and we reject the null hypothesis: feeling close to a particular party is associated with educational attainment. In other words, the share of respondents who feel close to a party differs between those with and without a BA.
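
As a quick check (a supplementary sketch reusing the mytab, ftab, test_stat, and c_val objects created above), we can confirm the decision rule and see which cells drive the statistic:

# Decision rule: reject the null of independence if the observed
# statistic exceeds the critical value
test_stat > c_val  # TRUE here, since 166.185 > 3.841

# Per-cell contributions to Pearson's X^2; the largest values show which
# combinations of party closeness and education deviate most from independence
round((mytab - ftab)^2 / ftab, 2)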

# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
# Using chisq.test for validation:
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 166.18, df = 1, p-value < 2.2e-16

p-value < 2.2e-16: the p-value is the probability of observing data at least as extreme as ours if the null hypothesis of independence were true. A small p-value indicates strong evidence against the null hypothesis. The chi-squared test tells us whether the variables are independent or associated; in this case, there is a strong association between party closeness and education.
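
Significance alone says little about the strength of the association. One common effect-size measure is Cramér's V, which for a 2x2 table (df = 1) reduces to sqrt(X^2 / N); a minimal sketch using the objects computed above:

# Cramér's V = sqrt(X^2 / (N * min(r - 1, c - 1))); the min term is 1 for a 2x2 table
sqrt(test_stat / N)  # roughly 0.09 given the values above: significant but weak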

Task 2

Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.

See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22]

Provide interpretations of the output (consider the variable info).

france_data <- read.fst("france_data.fst")
table(france_data$domicil)
## 
##    1    2    3    4    5    7    8 
## 3584 2361 6257 5668 1162    3    3
table(france_data$sbbsntx)
## 
##    1    2    3    4    5    7    8 
##  757 1538  804  640  301    3  100
france_data <- france_data %>%
  mutate(
    sbbsntx = recode(as.character(sbbsntx), 
                 '1' = "Agree Strongly", 
                 '2' = "Agree",
                 '3' = "Neither agree nor disagree", 
                 '4' = "Disagree", 
                 '5' = "Disagree strongly",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(sbbsntx))
france_data <- france_data %>%
  mutate(
    domicil = recode(as.character(domicil), 
                 '1' = "Urban", 
                 '2' = "Urban",
                 '3' = "Rural", 
                 '4' = "Rural", 
                 '5' = "Rural",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(domicil))
test_stat <- france_data %>%
  specify(explanatory = domicil, 
          response = sbbsntx) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") 

print(test_stat$stat)
## X-squared 
##  19.97187

With a chi-squared statistic of 19.97, there is evidence of an association between urban/rural residence and the level of agreement that social benefits and services place too great a strain on businesses in taxes and charges (see the critical-value check below).
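
To put 19.97 in context: with 5 response categories and 2 explanatory categories, the statistic has (5 - 1) * (2 - 1) = 4 degrees of freedom, so the theoretical critical value at the 0.05 level is:

# Critical value for a 5 x 2 table: df = (5 - 1) * (2 - 1) = 4
qchisq(0.05, df = 4, lower.tail = FALSE)  # about 9.49; 19.97 clearly exceeds it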

null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # the only new line; reps = 1000 is a standard default, lower it if too computationally demanding
  calculate(stat = "chisq")
p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

The approximate p-value is 0: none of the 1,000 permuted statistics was as large as the observed 19.97. As the warning notes, this is an approximation that depends on the number of reps, but it still provides strong evidence to reject the null hypothesis of independence.
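
For intuition, the permutation p-value is simply the share of permuted statistics at least as extreme as the observed one. A minimal right-tailed sketch (chi-squared tests are conventionally right-tailed), using the objects above:

# Share of the 1,000 permuted chi-squared statistics at least as large as
# the observed 19.97; this is the right-tailed permutation p-value
mean(null_distribution$stat >= test_stat$stat)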

Task 3

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: sbbsntx (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 9.79 
##  2         2 4.89 
##  3         3 9.24 
##  4         4 0.498
##  5         5 4.37 
##  6         6 2.46 
##  7         7 1.29 
##  8         8 0.818
##  9         9 5.89 
## 10        10 2.47 
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf

The null distribution is right-skewed, as expected for a chi-squared statistic. Because the red line marking the observed statistic falls far to the right of the bulk of the distribution, outside the 95% confidence interval, we reject the null hypothesis of independence.
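
The same conclusion can be checked numerically (a sketch assuming infer's percentile-interval columns lower_ci and upper_ci):

# The observed statistic lies above the upper bound of the 95% percentile
# interval of the null distribution, consistent with rejecting independence
test_stat$stat > conf_int$upper_ci  # should be TRUE here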

Task 4

Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.

Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]

Provide interpretations of the output (consider the variable info).

france_data <- france_data %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding 'lrscale'
  mutate(
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",                    # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",                  # Values 7 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% 4:6 ~ "Moderate",                # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    # Set the missing-value codes 66, 77, 88, 99 in 'ccrdprs' to NA
    ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA, ccrdprs)
  )
test_stat <- france_data %>%
  specify(explanatory = lrscale, 
          response = ccrdprs) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "F") 
## Warning: Removed 2200 rows containing missing values.
print(test_stat$stat)
## [1] 12.48013

There is an association between individuals' placement on the left-right scale and how much personal responsibility they feel to reduce climate change: an F statistic of 12.48 indicates that mean ccrdprs differs across the left-right groups more than would be expected under independence.
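
As a supplementary sketch (not required by the task), the group means show where the differences behind the F statistic come from:

# Mean felt responsibility to reduce climate change (0-10) by left-right group
france_data %>%
  group_by(lrscale) %>%
  summarise(mean_ccrdprs = mean(ccrdprs, na.rm = TRUE),
            n = sum(!is.na(ccrdprs)))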

null_distribution <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # the only new line; reps = 1000 is a standard default, lower it if too computationally demanding
  calculate(stat = "F")
## Warning: Removed 2200 rows containing missing values.
p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "greater") 
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

The approximate p-value is 0: none of the 1,000 permuted F statistics exceeded the observed 12.48, which is strong evidence to reject the null hypothesis of independence.
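
As a reference point, the theoretical F critical value at the 0.05 level can be computed; a sketch assuming the three lrscale groups retained by the recode above (Left, Moderate, Right), so df1 = k - 1 = 2 and df2 = n - k:

k <- 3  # number of lrscale groups after the recode
n <- sum(complete.cases(france_data[, c("lrscale", "ccrdprs")]))
qf(0.05, df1 = k - 1, df2 = n - k, lower.tail = FALSE)  # about 3.0 for large n; 12.48 exceeds it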

Task 5

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: F usually corresponds to right-tailed tests. Proceed with caution.

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 0.547 
##  2         2 0.312 
##  3         3 0.967 
##  4         4 0.0678
##  5         5 0.214 
##  6         6 0.314 
##  7         7 0.0728
##  8         8 0.204 
##  9         9 2.75  
## 10        10 0.0586
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf

The null-distribution values mainly fall between 0 and 6, making the graph right-skewed. The red line marking the observed F statistic falls outside this range and above the 95% confidence interval, so we again reject the null hypothesis of independence.