rm(list=ls()); gc()
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  526414 28.2    1169317 62.5         NA   669445 35.8
## Vcells  969879  7.4    8388608 64.0      16384  1851710 14.2
# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
ess <- read_fst("All-ESS-Data.fst")
Based on the recoding of lrscale into “Left” and “Right”, omitting “Moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long-form coding for the chi-squared test. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Then determine and interpret the critical value for independence. Finally, determine and interpret both Pearson’s chi-squared statistic and the p-value, and discuss the main takeaways.
france_data <- ess %>%
# Filter for France
filter(cntry == "FR") %>%
# Recoding clsprty and cleaning trstplt (will use later)
mutate(
clsprty = case_when(
clsprty == 1 ~ "Yes", # Recode 1 to "Yes"
clsprty == 2 ~ "No", # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_, # Missing-value codes set to NA
TRUE ~ as.character(clsprty) # Keep other values as-is but ensure they are characters
),
trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
# Recoding gender
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
),
# Recoding 'lrscale' into a new variable 'ID'
ID = case_when(
lrscale %in% 0:3 ~ "Left", # Values 0 to 3 in 'lrscale' are categorized as "Left"
lrscale %in% 7:10 ~ "Right", # Values 7 to 10 in 'lrscale' are categorized as "Right"
lrscale %in% 4:6 ~ "Moderate", # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
lrscale %in% c(77, 88, 99) ~ NA_character_ # Values 77, 88, 99 in 'lrscale' are set as NA
),
    # Recoding education with case_when for consistency
    # NB: the missing-value codes in edulvla/edulvlb are only set to NA further
    # below, so here codes like edulvlb 7777/8888 satisfy edulvlb > 600 and fall
    # into "BA or more"; recode them to NA first if you want them excluded
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
) %>%
  # Ensure all NAs are handled uniformly (note: educ.ba above is not recomputed)
mutate(
edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = as.character(educ.ba) # Force educ.ba to be treated as character
)
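Before building tables, it is worth sanity-checking the recodes. A minimal check (not one of the tutorial steps) on the new variables:
# Quick sanity check on the recodes before tabulating
table(france_data$ID, useNA = "ifany")      # Left / Moderate / Right / NA
table(france_data$educ.ba, useNA = "ifany") # BA or more / No BA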
## Not strictly necessary, but it is good practice to work on a renamed copy so you can always backtrack to the clean dataset
tab_dat <- france_data
# Create a crosstable and save it into the object mytab
# NB: this tabulates the raw lrscale (any missing-value codes still present,
# e.g. 77/88, appear as extra rows), not the binary "Left"/"Right" recode in ID,
# so the table has 13 rows and the test below has df = 12
mytab <- table(tab_dat$lrscale, tab_dat$educ.ba)
# Calculate row-wise sums of counts in table
rsums <- rowSums(mytab)
# Calculate column-wise sums of counts in table
csums <- colSums(mytab)
# Get the total count of the table (N)
N <- sum(mytab)
# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.011086193 0.03855137
## [2,] 0.005853979 0.02035676
## [3,] 0.014887173 0.05176899
## [4,] 0.023415916 0.08142703
## [5,] 0.021562351 0.07498140
## [6,] 0.061871513 0.21515338
## [7,] 0.018699885 0.06502740
## [8,] 0.020518255 0.07135064
## [9,] 0.017104411 0.05947926
## [10,] 0.004798151 0.01668520
## [11,] 0.008047755 0.02798544
## [12,] 0.004469671 0.01554294
## [13,] 0.011027535 0.03834740
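As a check on the tcrossprod() shortcut: under independence, the expected proportion for cell (i, j) is the product of its marginal proportions, which outer() builds the same way.
# Equivalent construction: p_ij = (row_i / N) * (col_j / N)
ptab_check <- outer(rsums / N, csums / N)
all.equal(ptab_check, ptab, check.attributes = FALSE) # should be TRUE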
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 211.0589 733.9411
## [2,] 111.4481 387.5519
## [3,] 283.4220 985.5780
## [4,] 445.7922 1550.2078
## [5,] 410.5040 1427.4960
## [6,] 1177.9099 4096.0901
## [7,] 356.0084 1237.9916
## [8,] 390.6265 1358.3735
## [9,] 325.6338 1132.3662
## [10,] 91.3472 317.6528
## [11,] 153.2132 532.7868
## [12,] 85.0936 295.9064
## [13,] 209.9422 730.0578
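These expected counts can also be cross-checked against the ones R's built-in test computes internally:
# Cross-check: chisq.test() stores the same expected frequencies
expected_check <- chisq.test(mytab, correct = FALSE)$expected
all.equal(unname(expected_check), unname(ftab)) # should be TRUE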
# Critical Value for Independence:
alpha <- 0.05 # significance level
df_tab <- (nrow(mytab) - 1) * (ncol(mytab) - 1) # (13 - 1) * (2 - 1) = 12, matching chisq.test below
c_val <- qchisq(alpha, df = df_tab, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 21.026
Interpreting the critical value: with df = 12, a chi-squared test statistic above 21.026 leads us to reject the null hypothesis at the 0.05 significance level. That would mean there is a statistically significant association between placement on the left-right scale and education level in the dataset.
# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 510.8805
Comparison to the critical value: the critical value is 21.026, and the observed X-squared value of 510.88 is far larger. Interpretation: the difference between the observed data and the expectation under the assumption of independence is statistically significant. Thus, there is a statistically significant association between placement on the left-right scale and educational level: the distribution of left-right placement is not the same across education levels.
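The same decision rule in code, using the objects computed above:
# Reject H0 at alpha = 0.05 if the statistic exceeds the critical value
test_stat > c_val # TRUE: 510.88 > 21.026, so reject independence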
# p-value:
p_val <- pchisq(test_stat, df = df_tab, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
# Using chisq.test for validation:
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
##
## Pearson's Chi-squared test
##
## data: mytab
## X-squared = 510.88, df = 12, p-value < 2.2e-16
p-value < 2.2e-16: R caps reported p-values at machine precision (about 2.2e-16), so this means the p-value is effectively zero. The chi-squared test asks whether two categorical variables are independent or associated in some manner. Here, the extremely small p-value means we reject the null hypothesis that the two variables are independent of each other. In other words, there is strong evidence of a significant association between the two variables in mytab. The large chi-squared statistic of 510.88 reinforces this point: if the two variables were independent, we would expect a statistic near its degrees of freedom (12 here), not in the hundreds.
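To see which cells drive such a large statistic, the standardized residuals stored by chisq.test() are useful; this is an optional diagnostic, not one of the tutorial steps:
# Standardized residuals: cells with |value| > ~2 deviate most from independence
res <- chisq.test(mytab, correct = FALSE)
round(res$stdres, 2)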
Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.
See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22])
Provide interpretations of the output (consider the variable info).
france_data <- ess %>%
# Filter for France
filter(cntry == "FR") %>%
#Recoding to the corresponding 5 categories
mutate(
sbbsntx = case_when(
sbbsntx == 1 ~ "Agree strongly",
sbbsntx == 2 ~ "Agree",
sbbsntx == 3 ~ "Neither agree nor disagree",
sbbsntx == 4 ~ "Disagree",
sbbsntx == 5 ~ "Disagree strongly",
      sbbsntx %in% c(7, 8, 9) ~ NA_character_, # Missing-value codes set to NA
TRUE ~ as.character(sbbsntx) # Keep other values as-is but ensure they are characters
),
    # Recoding to urban and rural (2, suburbs/outskirts, counted as Urban per the task rather than peri-urban)
    domicil = case_when(
      domicil %in% c(1, 2) ~ "Urban",
      domicil %in% c(3, 4, 5) ~ "Rural",
      domicil %in% c(7, 8, 9) ~ NA_character_, # Missing-value codes set to NA
      TRUE ~ as.character(domicil) # Keep other values as-is but ensure they are characters
    )
)
test_stat <- france_data %>%
specify(explanatory = domicil, # change variable name for explanatory variable
response = sbbsntx) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
calculate(stat = "Chisq") # replace in between quotation marks appropriate test statistic
## Warning: Removed 14999 rows containing missing values.
print(test_stat$stat)
## X-squared
## 19.97187
Interpretation: a chi-squared statistic of 19.97 does not establish significance on its own; it has to be judged against its reference distribution. With 5 response categories and 2 explanatory categories, df = (5 − 1)(2 − 1) = 4, for which the 0.05 critical value is 9.488, so a statistic of 19.97 already points to an association between urban/rural residence and agreement that social benefits and services cost businesses too much in taxes and charges. The permutation test below makes this comparison explicit and provides the evidence for rejecting the null hypothesis of independence.
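For reference, the theoretical chi-squared p-value implied by this statistic and df, as a benchmark for the permutation p-value computed below:
# Theoretical benchmark: chi-squared with df = (5 - 1) * (2 - 1) = 4
pchisq(19.97187, df = 4, lower.tail = FALSE) # approx. 0.0005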
null_distribution <- france_data %>%
specify(explanatory = domicil,
response = sbbsntx) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
calculate(stat = "Chisq")
## Warning: Removed 14999 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_p_value(obs_stat = test_stat, direction = "greater") # would only replace test_stat if assigned another name in Step 1
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.001
The permutation p-value is 0.001: only 1 of the 1,000 permuted statistics was at least as large as the observed one. Under the assumptions of frequentist hypothesis testing, this is strong evidence to reject the null hypothesis of independence.
Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
null_distribution
## Response: sbbsntx (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 4.40
## 2 2 6.95
## 3 3 2.24
## 4 4 0.675
## 5 5 5.37
## 6 6 1.72
## 7 7 6.55
## 8 8 1.33
## 9 9 1.51
## 10 10 1.38
## # ℹ 990 more rows
Interpretation: the null distribution is right-skewed, as expected for a chi-squared statistic (its mean equals the degrees of freedom, 4 here, so most permuted values fall well below 10). The shaded 95% interval runs from about 0 to 10 and covers the statistics we would expect to see if the null hypothesis of independence were true; it describes the null distribution, not a “true value”. The observed statistic (the red line, at about 20) lies far outside this interval, in the extreme right tail, which is evidence against the null hypothesis.
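Two quick numerical checks on this reading of the plot: under independence the chi-squared statistic averages its df (4 here), and the shaded interval is simply the middle 95% of the permuted statistics.
# Mean of the permuted statistics should sit near the df (4)
mean(null_distribution$stat)
# The percentile interval underlying the shaded region
quantile(null_distribution$stat, c(0.025, 0.975))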
Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.
Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]
Provide interpretations of the output (consider the variable info).
france_data <- ess %>%
# Filter for France
filter(cntry == "FR") %>%
  # Recoding 'lrscale' (note: "Moderate" is retained as a third category here,
  # so the F test below compares Left, Moderate, and Right)
mutate(
lrscale = case_when(
lrscale %in% 0:3 ~ "Left", # Values 0 to 3 in 'lrscale' are categorized as "Left"
lrscale %in% 7:10 ~ "Right", # Values 7 to 10 in 'lrscale' are categorized as "Right"
lrscale %in% 4:6 ~ "Moderate", # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
lrscale %in% c(77, 88, 99) ~ NA_character_ # Values 77, 88, 99 in 'lrscale' are set as NA
),
    ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA, ccrdprs) # Set values 66, 77, 88, 99 in 'ccrdprs' to NA
  )
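Before the F test, group means give a feel for the differences being tested (a quick look, not one of the infer steps):
# Mean personal responsibility to reduce climate change by lrscale group
france_data %>%
  filter(!is.na(lrscale), !is.na(ccrdprs)) %>%
  group_by(lrscale) %>%
  summarise(mean_ccrdprs = mean(ccrdprs), n = n())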
test_stat <- france_data %>%
specify(explanatory = lrscale, # change variable name for explanatory variable
response = ccrdprs) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
calculate(stat = "F")
## Warning: Removed 15413 rows containing missing values.
print(test_stat$stat)
## [1] 15.41871
Interpretation: the F statistic of 15.42 says that the variation in ccrdprs between the lrscale groups is large relative to the variation within them. Substantively, how much personal responsibility people feel to reduce climate change differs systematically by left-right placement (given the recode above, the comparison is across Left, Moderate, and Right). Whether this is statistically significant is assessed against the permutation null distribution below.
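A classical one-way ANOVA on the same data should reproduce this F statistic and serves as a cross-check on the infer result:
# Cross-check: base R one-way ANOVA; F should match calculate(stat = "F")
summary(aov(ccrdprs ~ lrscale, data = france_data))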
null_distribution <- france_data %>%
specify(explanatory = lrscale,
response = ccrdprs) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
calculate(stat = "F")
## Warning: Removed 15413 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_p_value(obs_stat = test_stat, direction = "greater") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
The permutation p-value is reported as 0, which (as the warning notes) should be read as p < 1/1,000, the resolution of 1,000 replicates: none of the permuted F statistics reached the observed value. Under the assumptions of frequentist hypothesis testing, this is strong evidence to reject the null hypothesis.
Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 0.386
## 2 2 1.39
## 3 3 3.48
## 4 4 2.03
## 5 5 1.46
## 6 6 1.47
## 7 7 3.51
## 8 8 0.588
## 9 9 0.184
## 10 10 0.103
## # ℹ 990 more rows
Interpretation: the null distribution of the F statistic is right-skewed, with most permuted values below 5. The shaded 95% interval (roughly 0 to 5) covers the statistics we would expect if the null hypothesis of independence were true; again, it describes the null distribution rather than a “true value”. The observed F statistic (the red line, at about 15.4) lies far beyond this interval, providing strong evidence against the null hypothesis.
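As a final check, the permutation p-value is simply the share of permuted statistics at least as large as the observed one, which can be computed by hand:
# Manual equivalent of get_p_value(direction = "greater")
mean(null_distribution$stat >= test_stat$stat) # 0 here: no permuted F reached 15.42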