# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"

Task 1

Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

ess <- read_fst("All-ESS-Data.fst")
france_data <- read.fst("france_data.fst")
france_data <- ess %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(clsprty)  # Keep other values as-is but ensure they are characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    # Recoding education with case_when for consistency
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
  # Ensure all NAs are handled uniformly
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba)  # Force educ.ba to be treated as character
  )
unique(france_data$gndr)
## [1] "Female" "Male"
unique(france_data$clsprty)
## [1] "Yes" "No"  NA
france_data <- france_data %>% filter(!is.na(clsprty))
france_data <- france_data %>% filter(!is.na(trstplt))
unique(france_data$clsprty)
## [1] "Yes" "No"
table(france_data$clsprty)
## 
##   No  Yes 
## 9098 9480
## Not necessary, but always good to rename when doing operations so you can backtrack to clean dataset
tab_dat <- france_data

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$clsprty, tab_dat$educ.ba)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##           [,1]      [,2]
## [1,] 0.1099746 0.3797444
## [2,] 0.1145921 0.3956889
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##          [,1]     [,2]
## [1,] 2043.108 7054.892
## [2,] 2128.892 7351.108
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 160.4057
# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
# Using chisq.test for validation:
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = F))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 160.41, df = 1, p-value < 2.2e-16

Conclusions: When interpreting the critical value of independence when we generate the chi squared test statistic that is computed if it is higher than 3.841 than we will reject the null hypothesis at the 0.05 signifiance level and conclude that there is a relationship between both categorical variables. That said after computing the x square value it is larger than 3.841 and we can say that there is a relationship between individuals who feel close to a party and their educational level and thus there will be differences amoungst these individuals. Next, the person’s p-value is less than 0.05 and very small in number which is the significance level meaning we can reject the null hypothesis of independence and there is a significant relationship between the variables. That said, the chi square number of 160.41 also supports this.

Task 2

Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.

See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22])

Provide interpretations of the output (consider the variable info).

test_stat <- france_data %>%
  specify(explanatory = domicil, # change variable name for explanatory variable
          response = sbbsntx) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # replace in between quotation marks appropriate test statistic
## Warning: Removed 14528 rows containing missing values.
## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.
print(test_stat$stat)
##    domicil 
## -0.0768133

This test statistic shows that there is no clear relationship between people living in urban and rural areas and how taxes affect their businesses.

null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
  calculate(stat = "slope")
## Warning: Removed 14528 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Therefore, we reject the null hypothesis because we have a P-value of 0.

Task 3

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution
## Response: sbbsntx (numeric)
## Explanatory: domicil (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0170 
##  2         2  0.0286 
##  3         3  0.00170
##  4         4  0.00151
##  5         5  0.0102 
##  6         6 -0.0287 
##  7         7  0.0149 
##  8         8 -0.0215 
##  9         9  0.00708
## 10        10 -0.00498
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_distribution
## Response: sbbsntx (numeric)
## Explanatory: domicil (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0170 
##  2         2  0.0286 
##  3         3  0.00170
##  4         4  0.00151
##  5         5  0.0102 
##  6         6 -0.0287 
##  7         7  0.0149 
##  8         8 -0.0215 
##  9         9  0.00708
## 10        10 -0.00498
## # ℹ 990 more rows

Therefore, the test statistic falls out from the confidence interval meaning we would reject the hypothesis because these is no strong correlation and it does not fall within the distribution as well.

Task 4

Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.

Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]

Provide interpretations of the output (consider the variable info).

test_stat <- france_data %>%
  specify(explanatory = lrscale, # change variable name for explanatory variable
          response = ccrdprs) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # replace in between quotation marks appropriate test statistic
## Warning: Removed 14645 rows containing missing values.
## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.
print(test_stat$stat)
##     lrscale 
## 0.007179428
null_distribution <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "slope")
## Warning: Removed 14645 rows containing missing values.
p_val <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "two-sided")
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.188

Therefore, the p-value is 0.212 which is greater than 0.05 which means we support the null hypothesis. And there is a correlation between feeling responsible for climate change based on where they live which means they lean more right in feeling this sense of responsibility (7 - 10).

Task 5

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate      stat
##        <int>     <dbl>
##  1         1  0.000141
##  2         2  0.00535 
##  3         3  0.00437 
##  4         4  0.00513 
##  5         5 -0.00893 
##  6         6  0.00227 
##  7         7 -0.00356 
##  8         8 -0.00415 
##  9         9 -0.0100  
## 10        10 -0.000183
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate      stat
##        <int>     <dbl>
##  1         1  0.000141
##  2         2  0.00535 
##  3         3  0.00437 
##  4         4  0.00513 
##  5         5 -0.00893 
##  6         6  0.00227 
##  7         7 -0.00356 
##  8         8 -0.00415 
##  9         9 -0.0100  
## 10        10 -0.000183
## # ℹ 990 more rows

Therefore, the red line represents the t-statistic which is between the shaded area of the confidence variable or the distributions which means the hypothesis is true.

End.