HOMEWORK 3

# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
france_data <- read.fst("france_data.fst")

Task 1

Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

france_data <- france_data %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Clean the education source variables first, so that missing codes
  # (e.g., edulvlb = 5555/7777/8888, all > 600) are not misread as "BA or more" below
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb)
  ) %>%
  
  # Recode clsprty and clean trstplt (used later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",                     # Recode 1 to "Yes"
      clsprty == 2 ~ "No",                      # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Set the missing codes to NA
      TRUE ~ as.character(clsprty)              # Keep other values as characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),
 
    # Recoding education; case_when already returns character, so no extra
    # coercion is needed (rows with missing education fall into the "No BA" catch-all)
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  )
france_data <- france_data %>%
  mutate(
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",
      lrscale %in% 7:10 ~ "Right",
      lrscale %in% c(77, 88, 99) ~ NA_character_
      # moderates (4-6) match no branch and fall through to NA, as the task requires
    ),
    Gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",                   
      yrbrn %in% 1946:1964 ~ "2",                   
      yrbrn %in% 1965:1979 ~ "3",                   
      yrbrn %in% 1980:1996 ~ "4",                   
      TRUE ~ as.character(yrbrn)                    
    ),

    Gen = factor(Gen, levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
# Not strictly necessary, but renaming before table operations makes it easy
# to backtrack to the clean dataset
tab_dat <- france_data

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$clsprty, tab_dat$educ.ba)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Table of expected proportions under independence:
# outer product of the row and column marginal proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##           [,1]      [,2]
## [1,] 0.1101248 0.3815066
## [2,] 0.1138739 0.3944946
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##          [,1]     [,2]
## [1,] 2059.444 7134.556
## [2,] 2129.556 7377.444
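As an optional cross-check (a small sketch using mytab from above), chisq.test() exposes the same expected frequencies, which should match ftab:

# Optional cross-check: the expected counts from chisq.test should equal ftab
chisq.test(mytab, correct = FALSE)$expected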
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE) # df = (rows - 1) * (cols - 1) = 1 for a 2x2 table
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

Interpreting the Critical Value: If the chi-squared test statistic exceeds 3.841, we reject the null hypothesis of independence at the 0.05 significance level. That would mean there is a statistically significant association between feeling close to a particular party and education level in the data.

# Pearson's chi-squared statistic: sum over cells of (observed - expected)^2 / expected
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 166.1849

Comparison to the critical value: the observed statistic (166.18) far exceeds the critical value of 3.841. Interpretation: the difference between the observed data and what we would expect under independence is statistically significant. There is a statistically significant association between feeling close to a particular party and a person's educational level; in other words, the probability of feeling close to a party is not the same across education levels.
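Written out programmatically (a minimal sketch using the objects defined above; the first line just generalizes the hard-coded df = 1):

# Decision rule: reject H0 when the observed statistic exceeds the critical
# value for df = (rows - 1) * (cols - 1)
df_tab <- (nrow(mytab) - 1) * (ncol(mytab) - 1)  # = 1 for this 2x2 table
test_stat > qchisq(alpha, df = df_tab, lower.tail = FALSE)  # TRUE -> reject H0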

# p-value: upper-tail probability of the chi-squared distribution at the observed statistic
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
# Using chisq.test for validation (correct = FALSE disables the continuity
# correction, so the result matches the manual calculation above):
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 166.18, df = 1, p-value < 2.2e-16

p-value < 2.2e-16: This is the same p-value in scientific notation; 2.2e-16 is double-precision machine epsilon (.Machine$double.eps), the smallest value R's test printout distinguishes, so anything smaller is reported as "< 2.2e-16". The chi-squared test tells us whether two categorical variables are independent or associated. The extremely small p-value (essentially 0) means we reject the null hypothesis that the two variables are independent: there is strong evidence of an association between the two variables in mytab.
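Where that threshold comes from can be checked directly (a small sketch; p_val is the object computed above):

# "2.2e-16" is double-precision machine epsilon; R reports any smaller
# p-value as "< 2.2e-16" rather than printing the exact number
.Machine$double.eps
format(p_val, scientific = TRUE)  # the unrounded manual p-value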

Task 2

Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.

See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22]

Provide interpretations of the output (consider the variable info).

france_data <- read.fst("france_data.fst")
table(france_data$domicil)
## 
##    1    2    3    4    5    7    8 
## 3584 2361 6257 5668 1162    3    3
table(france_data$sbbsntx)
## 
##    1    2    3    4    5    7    8 
##  757 1538  804  640  301    3  100
france_data <- france_data %>%
  mutate(
    sbbsntx = recode(as.character(sbbsntx), 
                 '1' = "Agree Strongly", 
                 '2' = "Agree",
                 '3' = "Neither agree nor disagree", 
                 '4' = "Disagree", 
                 '5' = "Disagree strongly",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(sbbsntx))
france_data <- france_data %>%
  mutate(
    domicil = recode(as.character(domicil), 
                 '1' = "Urban", 
                 '2' = "Urban",
                 '3' = "Rural", 
                 '4' = "Rural", 
                 '5' = "Rural",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(domicil))
test_stat <- france_data %>%
  specify(explanatory = domicil, # change variable name for explanatory variable
          response = sbbsntx) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") # replace the string in quotation marks with the appropriate test statistic

print(test_stat$stat)
## X-squared 
##  19.97187

The observed chi-squared statistic is 19.97. On its own, this measures how far the observed crosstable of urban/rural residence by agreement with "Social benefits/services cost businesses too much in taxes/charges" departs from what independence would predict; whether that departure is statistically significant is assessed against the null distribution in the next step.
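As an optional sanity check (a sketch using the recoded france_data from above), base R's chisq.test() on the same crosstable should reproduce infer's statistic:

# Cross-check: Pearson's chi-squared on the same crosstable, without the
# continuity correction, matches infer's calculate(stat = "chisq")
chisq.test(table(france_data$domicil, france_data$sbbsntx), correct = FALSE)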

null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # the only new line: reps = 1000 is standard; reduce if too computationally demanding
  calculate(stat = "chisq")
p_val <- null_distribution %>% # replace the name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # note: chi-squared tests are conventionally right-tailed; direction = "greater" would avoid the warning in Task 3

p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.002

The permutation p-value is 0.002: if domicil and sbbsntx were truly independent, only about 2 in 1,000 permuted samples would produce a chi-squared statistic as extreme as the observed 19.97. This is strong evidence against the null hypothesis of independence.
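The same quantity can be computed by hand from the permutation draws (a sketch for the right-tailed version, using the objects defined above; infer's "two-sided" direction roughly doubles the smaller tail):

# Right-tailed permutation p-value by hand: the share of permuted statistics
# at least as large as the observed one
mean(null_distribution$stat >= test_stat$stat)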

Task 3

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: sbbsntx (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.84 
##  2         2 0.700
##  3         3 3.49 
##  4         4 4.20 
##  5         5 1.40 
##  6         6 4.72 
##  7         7 5.39 
##  8         8 4.26 
##  9         9 1.21 
## 10        10 2.80 
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")
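Before adding the interval to the plot, it can help to inspect the null draws and the interval endpoints numerically (a quick sketch using the objects defined above):

# Numeric check: spread of the permutation draws and the 95% percentile
# interval endpoints computed above
summary(null_distribution$stat)
conf_int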


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)


The null distribution is right-skewed, as expected for a chi-squared statistic: the bulk of the permuted statistics falls well below 10. The observed statistic (the red line, at about 20) lies far in the right tail, outside nearly all of the permuted values, which is strong evidence against the null hypothesis of independence.

Task 4

Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.

Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]

Provide interpretations of the output (consider the variable info).

france_data <- france_data %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding 'lrscale', omitting moderates as in Task 1
  mutate(
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",                  # Values 0 to 3 are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",                # Values 7 to 10 are categorized as "Right"
      lrscale %in% c(77, 88, 99) ~ NA_character_  # Missing codes 77, 88, 99 are set to NA
      # Values 4 to 6 ("moderates") match no branch and fall through to NA
    ),
    ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA, ccrdprs)  # Set missing codes in 'ccrdprs' to NA
  )
test_stat <- france_data %>%
  specify(explanatory = lrscale, # change variable name for explanatory variable
          response = ccrdprs) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "F") 
## Warning: Removed 2200 rows containing missing values.
print(test_stat$stat)
## [1] 12.48013

The observed F-statistic is 12.48. This suggests that average feelings of personal responsibility to reduce climate change differ between respondents who place themselves on the left and those on the right; whether the difference is statistically significant is assessed against the null distribution below. Note that this is an association, not evidence that left-right placement causes these feelings.
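As an optional cross-check (a sketch assuming the recoded france_data above), infer's F statistic for a numeric response and a two-level factor is the one-way ANOVA F, so base R should agree:

# Cross-check: one-way ANOVA F should match infer's calculate(stat = "F");
# aov() drops rows with missing values automatically
summary(aov(ccrdprs ~ lrscale, data = france_data))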

null_distribution <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # the only new line: reps = 1000 is standard; reduce if too computationally demanding
  calculate(stat = "F")
## Warning: Removed 2200 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "greater") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

The reported p-value of 0 means that none of the 1,000 permuted F-statistics reached the observed 12.48. As the warning notes, this is an approximation limited by the number of reps, so it is better reported as p < 0.001; either way, there is strong evidence against the null hypothesis.
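The zero is a resolution limit of the simulation, which a quick sketch makes explicit (objects as defined above):

# With 1,000 permutations, the smallest nonzero p-value we could observe is
# 1/1000; a proportion of 0 therefore means "below the simulation's resolution"
mean(null_distribution$stat >= test_stat$stat)  # proportion as extreme (0 here)
1 / 1000                                        # effective reporting floor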

Task 5

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: F usually corresponds to right-tailed tests. Proceed with caution.

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.95 
##  2         2 0.172
##  3         3 1.53 
##  4         4 0.973
##  5         5 1.91 
##  6         6 0.170
##  7         7 0.376
##  8         8 0.542
##  9         9 1.52 
## 10        10 0.343
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf


The visualization again shows a right-skewed null distribution. The shaded 95% interval (roughly 0 to 5) covers the central 95% of the permuted F-statistics, i.e., the values we would expect if the null hypothesis were true; it is not a confidence interval for the true effect. The observed F-statistic (the red line, at 12.48) falls far outside this interval and beyond the bulk of the null distribution, which, together with the near-zero p-value, is strong evidence against the null hypothesis of independence.

End.