# List of packages
packages <- c("tidyverse", "infer", "fst") # add any we need here
# Install any packages that are not yet installed
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'infer' was built under R version 4.3.3
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
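The long list printed above is just lapply()'s return value. As an optional tweak, the call can be wrapped in invisible() to suppress it:
# Optional: load the packages without printing lapply()'s return value
invisible(lapply(packages, library, character.only = TRUE))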
# Read the French data file used in this HW
france_data <- read.fst("france_data.fst")
Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.
Recoding and cleaning the variables of interest (the file already contains only the French data):
france_data <- france_data %>%
  # Recoding lrscale into a new variable ID
  mutate(
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",                  # values 0 to 3 are "Left"
      lrscale %in% 7:10 ~ "Right",                # values 7 to 10 are "Right"
      lrscale %in% 4:6 ~ NA_character_,           # values 4 to 6 are "Moderate"; we omit them, so set to NA
      lrscale %in% c(77, 88, 99) ~ NA_character_, # missing-value codes set to NA
      TRUE ~ as.character(lrscale)
    ),
    # Recoding clsprty and cleaning trstplt
    clsprty = case_when(
      clsprty == 1 ~ "Yes",                    # recode 1 to "Yes"
      clsprty == 2 ~ "No",                     # recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_, # missing-value codes set to NA
      TRUE ~ as.character(clsprty)             # keep other values, but as characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),
    # Clean the missing-value codes in the education variables *before*
    # deriving educ.ba: codes such as 5555, 7777 and 8888 are > 600 and
    # would otherwise be misclassified as "BA or more"
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    # Recoding education with case_when for consistency
    # (case_when already returns character, so no further coercion is needed)
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  )
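A quick frequency check of the derived variable can also help (optional; useNA = "ifany" makes any remaining NAs visible):
# Optional check: distribution of the new education variable, including NAs
table(france_data$educ.ba, useNA = "ifany")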
unique(france_data$ID)
## [1] "Left" NA "Right"
# Removing NAs
france_data <- france_data %>% filter(!is.na(ID))
# Checking
unique(france_data$ID)
## [1] "Left" "Right"
To “do the long form coding for Chi-Square”:
# Not necessary, but it is good practice to assign to a new object before
# table operations so you can always backtrack to the clean dataset
tab_dat <- france_data
# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$ID, tab_dat$educ.ba)
# Calculate row-wise sums of counts in table
rsums <- rowSums(mytab)
# Calculate column-wise sums of counts in table
csums <- colSums(mytab)
# Get the total count of the table (N)
N <- sum(mytab)
# Generate table of expected proportions: the outer product of the row
# and column marginal proportions (tcrossprod(x, y) computes x %*% t(y))
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.1311243 0.3914592
## [2,] 0.1197912 0.3576253
# Generate table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 1181.561 3527.439
## [2,] 1079.439 3222.561
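As an optional sanity check (not part of the long-form steps), chisq.test() computes the same expected counts internally and exposes them in its return value:
# Optional: the expected counts from chisq.test() should match ftab
chisq.test(mytab, correct = FALSE)$expected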
To “determine and interpret the critical value for independence”:
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE) # df = (2 - 1) * (2 - 1) = 1 for a 2x2 table
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
That is, if the chi-squared test statistic we calculate in the next step is larger than 3.841, we reject the null hypothesis at the 5% significance level. This would mean there is a statistically significant association between ideological self-placement on the left-right scale and education level in our dataset.
# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 100.8551
# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
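Putting the pieces together, the decision rule can be written out explicitly (an optional check using the objects computed above):
# Reject H0 if the statistic exceeds the critical value...
test_stat > c_val # TRUE here: 100.8551 > 3.841, so reject independence
# ...or, equivalently, if the p-value falls below alpha
p_val < alpha # TRUE: the p-value is far below 0.05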
# Using chisq.test for validation:
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
##
## Pearson's Chi-squared test
##
## data: mytab
## X-squared = 100.86, df = 1, p-value < 2.2e-16
In comparison to the critical value of 3.841 that we calculated, Pearson's chi-squared statistic is 100.8551. Our observed statistic is much larger than the critical value, which tells us that the difference between our observed data and what we'd expect under the assumption of independence is statistically significant. There is a statistically significant association between one's ideological placement on the left-right scale and one's education level; that is, the distribution of ideological placement is not the same across education levels. If the two variables were independent, we would expect a chi-squared statistic close to 0.
The p-value is less than 2.2e-16, or very close to 0 (the rounded value of 0 printed above reflects this). This extremely small p-value leads us to reject the null hypothesis that the two variables are independent of each other. There is strong evidence of a significant association between the two variables of interest.
Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.
See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22])
Provide interpretations of the output (consider the variable info).
france_data <- france_data %>%
  # Recoding domicil into urban/rural (2, the peri-urban category, is set to urban per the task)
  mutate(geo = recode(as.character(domicil),
                      '1' = "Urban",
                      '2' = "Urban",
                      '3' = "Rural",
                      '4' = "Rural",
                      '5' = "Rural",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_),
         # Recoding sbbsntx into its five response categories
         sbbsntx = case_when(
           sbbsntx == 1 ~ "Agree strongly",
           sbbsntx == 2 ~ "Agree",
           sbbsntx == 3 ~ "Neither agree nor disagree",
           sbbsntx == 4 ~ "Disagree",
           sbbsntx == 5 ~ "Disagree strongly",
           sbbsntx %in% c(7, 8) ~ NA_character_, # missing-value codes set to NA
           TRUE ~ as.character(sbbsntx)          # keep other values, but as characters
         ))
# Check the recode (note: table() drops NAs by default), then remove NAs
table(france_data$geo)
##
## Rural Urban
## 6059 2951
france_data <- france_data %>% filter(!is.na(geo))
unique(france_data$geo)
## [1] "Rural" "Urban"
# compare
table(france_data$domicil)
##
## 1 2 3 4 5
## 1815 1136 2896 2616 547
# Check the recode of sbbsntx, then remove NAs
table(france_data$sbbsntx)
##
## Agree Agree strongly
## 681 389
## Disagree Disagree strongly
## 370 202
## Neither agree nor disagree
## 350
france_data <- france_data %>% filter(!is.na(sbbsntx))
unique(france_data$sbbsntx)
## [1] "Disagree strongly" "Agree"
## [3] "Agree strongly" "Neither agree nor disagree"
## [5] "Disagree"
Doing Steps 1 to 3 using the infer package:
# Step 1: Calculate the test statistic of your sample
# The explanatory variable (geo) is categorical with 2 levels and the
# response variable (sbbsntx) is categorical with 5 levels, so per the
# table in the lecture we use a chi-squared test of independence
test_stat <- france_data %>%
  specify(explanatory = geo,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")
print(test_stat$stat)
## X-squared
## 17.31799
The Pearson's chi-squared statistic is 17.32. It measures the discrepancy between the observed cell counts and the counts we would expect under the assumption of independence. The larger the chi-squared statistic, the greater the evidence against the null hypothesis.
# Step 2: Simulate the null distribution
null_distribution <- france_data %>%
  specify(explanatory = geo,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")
# Step 3: Calculate the p-value of the sample
p_val <- null_distribution %>%
  get_p_value(obs_stat = test_stat, direction = "greater")
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.003
The p-value is 0.003, very close to 0; under the assumptions of frequentist hypothesis testing, this is strong evidence against the null. The chi-squared statistic well away from 0 and the near-zero p-value together lead us to reject the null hypothesis, suggesting a statistically significant association between the area where one lives and the extent to which one agrees that social benefits/services cost businesses too much in taxes/charges.
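Though the task only asks for the infer workflow, the theoretical test on the same crosstable should produce a matching statistic (an optional cross-check):
# Optional: theoretical chi-squared test for comparison with Step 1
chisq.test(table(france_data$geo, france_data$sbbsntx))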
Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.
# Step 4: Visualize
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater")
null_distribution
## Response: sbbsntx (factor)
## Explanatory: geo (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 4.74
## 2 2 2.34
## 3 3 9.99
## 4 4 2.81
## 5 5 4.03
## 6 6 4.78
## 7 7 5.70
## 8 8 2.89
## 9 9 7.57
## 10 10 2.68
## # ℹ 990 more rows
We visualized the null distribution with the observed test statistic (17.32) marked as the red line. The red line lies far to the right of the simulated distribution, which shows that under the assumptions of frequentist hypothesis testing we should reject the null hypothesis. If the observed test statistic fell within the bulk of the null distribution instead, there would be sizable regions of the null distribution as extreme as (or more extreme than) our observed statistic.
# We can also visualize with confidence intervals
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
This view shows the 95% confidence interval of our null distribution (shaded). The observed statistic of 17.32 (red line) falls well outside this interval, consistent with rejecting the null hypothesis.
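For comparison (optional), the simulated null distribution should approximate a chi-squared distribution with df = (2 - 1) * (5 - 1) = 4, whose one-sided 5% critical value can be computed as in Task 1:
# Optional: theoretical critical value for a 2x5 table (df = 4)
qchisq(0.05, df = 4, lower.tail = FALSE)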
Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.
Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]
Provide interpretations of the output (consider the variable info).
france_data <- france_data %>%
  # Recoding lrscale into ID (same scheme as in Task 1)
  mutate(
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",                  # values 0 to 3 are "Left"
      lrscale %in% 7:10 ~ "Right",                # values 7 to 10 are "Right"
      lrscale %in% 4:6 ~ NA_character_,           # values 4 to 6 are "Moderate"; we omit them, so set to NA
      lrscale %in% c(77, 88, 99) ~ NA_character_, # missing-value codes set to NA
      TRUE ~ as.character(lrscale)
    ),
    # Cleaning the missing-value codes in ccrdprs
    ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA, ccrdprs)
  )
france_data <- france_data %>% filter(!is.na(ccrdprs))
unique(france_data$ccrdprs)
## [1] 10 8 7 6 5 9 0 4 2 3 1
unique(france_data$ID)
## [1] "Left" "Right"
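Since the t-test compares the mean of ccrdprs between the two ID groups, a quick look at the group means can make the upcoming result easier to interpret (optional):
# Optional: group means of ccrdprs by ideological ID
france_data %>%
  group_by(ID) %>%
  summarise(mean_ccrdprs = mean(ccrdprs), n = n())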
# Step 1: Calculate the test statistic of your sample
# The explanatory variable (ID) is categorical with 2 levels and the
# response variable (ccrdprs) is numeric, so per the table in the
# lecture we use a two-sample t-test
test_stat <- france_data %>%
  specify(explanatory = ID,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "Left" - "Right", or divided in the order "Left" / "Right" for
## ratio-based statistics. To specify this order yourself, supply `order =
## c("Left", "Right")` to the calculate() function.
print(test_stat$stat)
## t
## 4.855399
This statistic measures the difference between the two group means (Left vs. Right), in units of standard error, relative to what we would expect under the null of no difference. The larger the magnitude of t, the greater the evidence against the null hypothesis. A t-statistic of 4.855 is large: under the null, most of the distribution of t lies within roughly ±2, so this value already points to a significant difference in the extent to which people on the left and right of the political scale feel personally responsible for trying to reduce climate change.
# Step 2: Simulate the null distribution
null_distribution <- france_data %>%
specify(explanatory = ID,
response = ccrdprs) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "Left" - "Right", or divided in the order "Left" / "Right" for
## ratio-based statistics. To specify this order yourself, supply `order =
## c("Left", "Right")` to the calculate() function.
# Step 3: Calculate the p-value of the sample
p_val <- null_distribution %>%
  get_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
The calculated p-value is reported as 0: none of the 1,000 permuted statistics was as extreme as the observed one, so (as the warning notes) the result is an approximation and is better reported as p < 0.001. Under the assumptions of frequentist hypothesis testing, this is strong evidence against the null. The large t-statistic and the near-zero p-value together lead us to reject the null hypothesis, suggesting a statistically significant association between ideological self-placement on the left-right scale and the extent to which one feels personally responsible for trying to reduce climate change.
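As an optional comparison, the classical two-sample t-test gives a theoretical p-value for the same hypothesis; it should agree closely with the permutation result:
# Optional: theoretical two-sample (Welch) t-test for comparison
t.test(ccrdprs ~ ID, data = france_data)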
Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.
# Step 4: Visualize
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
null_distribution
## Response: ccrdprs (numeric)
## Explanatory: ID (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 -0.0435
## 2 2 -0.567
## 3 3 -0.181
## 4 4 0.396
## 5 5 -0.869
## 6 6 0.258
## 7 7 -1.53
## 8 8 0.258
## 9 9 0.340
## 10 10 0.313
## # ℹ 990 more rows
We visualized the null distribution with the observed test statistic (4.855) marked as the red line. The red line lies far to the right of the simulated distribution, which shows that under the assumptions of frequentist hypothesis testing we should reject the null hypothesis. If the observed test statistic fell within the bulk of the null distribution instead, there would be sizable regions of the null distribution as extreme as (or more extreme than) our observed statistic.
# We can also visualize with confidence intervals
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
The plot shows the 95% confidence interval of our null distribution (shaded). The observed t-statistic of 4.855 (red line) falls well outside this interval, again consistent with rejecting the null hypothesis.
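As a final optional check: with samples this large, the null distribution of the t-statistic is approximately standard normal, so about 95% of the simulated values should fall between the bounds below, and the observed 4.855 lies far outside them:
# Optional: large-sample approximation of the null distribution's 95% bounds
qnorm(c(0.025, 0.975)) # roughly -1.96 and 1.96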