Kim_Roy_Homework

packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'infer' was built under R version 4.3.3

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"

ess <- read_fst("All-ESS-Data.fst")

task 1

Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

france_data <- ess %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # this code Recodes clsprty and cleaning trstplt
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(clsprty)  # Keep other values as-is but ensure they are characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # now we Recode for gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
 
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba)  # Force educ.ba to be treated as character
  )

# using unique to identify
unique(france_data$gndr)

## [1] "Female" "Male"

# does not return NA, but the following does
unique(france_data$clsprty)

## [1] "Yes" "No"  NA

# here's how to make sure it's really cleaned
france_data <- france_data %>% filter(!is.na(clsprty))
france_data <- france_data %>% filter(!is.na(trstplt))

# double-checking + quick distribution
unique(france_data$clsprty)

## [1] "Yes" "No"

table(france_data$clsprty)

## 
##   No  Yes 
## 9098 9480

france_clean <- france_data %>% 
  mutate(
    clsprty = ifelse(clsprty == 2, 0, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty)), # If 'clsprty' is 2, set it to 0. If it's 7, 8, or 9, set it to NA.
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem), # For 'stfdem', set values 77, 88, and 99 to NA.
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt), # For 'trstplt', set values 77, 88, and 99 to NA.
  )

france_clean <- france_clean %>% filter(!is.na(clsprty), !is.na(stfdem), !is.na(trstplt))

france_clean <- france_clean %>%
  mutate(
    # Recoding 'lrscale' into a new variable 'ID'
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",                    # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",                  # Values 7 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% 4:6 ~ "Moderate",                # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    # Recoding 'yrbrn' (year born) into a new variable 'Gen'
    Gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",                   # Birth years from 1900 to 1945 categorized as "1"
      yrbrn %in% 1946:1964 ~ "2",                   # Birth years from 1946 to 1964 categorized as "2"
      yrbrn %in% 1965:1979 ~ "3",                   # Birth years from 1965 to 1979 categorized as "3"
      yrbrn %in% 1980:1996 ~ "4",                   # Birth years from 1980 to 1996 categorized as "4"
      TRUE ~ as.character(yrbrn)                    # Other years remain unchanged
    ),
    # Converting 'Gen' to a factor with meaningful labels
    Gen = factor(Gen, levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")) # Assigning labels to the levels in 'Gen'
  )

tab_dat <- france_data


mytab <- table(tab_dat$clsprty, tab_dat$educ.ba)


rsums <- rowSums(mytab)


csums <- colSums(mytab)


N <- sum(mytab)


ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")

## Table of Expected Proportions:

print(ptab)

##           [,1]      [,2]
## [1,] 0.1099746 0.3797444
## [2,] 0.1145921 0.3956889

# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")

## Table of Expected Frequencies:

print(ftab)

##          [,1]     [,2]
## [1,] 2043.108 7054.892
## [2,] 2128.892 7351.108

# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")

## Critical Value: 3.841

# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")

## Pearson's X^2: 160.4057

Here, we see that the observed X-squared value is much larger than the critical value of 3.841. There is a significant connection between feeling close to a specific party and one’s educational level. This means that the probability of feeling close to a specific party is not the same across different education levels

# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")

## p-value: 0

# Using chisq.test for validation:
cat("Chi-squared test result:\n")

## Chi-squared test result:

print(chisq.test(mytab, correct = F))

## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 160.41, df = 1, p-value < 2.2e-16

The “p-value indicates an extremely small value close to zero. This value leads us to reject the null hypothesis, signifying a significant association between the two categorical variables analyzed in the mytab dataset. Additionally, the large chi-squared statistic of 160.41 further supports this conclusion. A higher chi-squared statistic indicates a greater discrepancy between the observed and expected frequencies which reinforcing the evidence against the null hypothesis of independence between the variables.

task 2

Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.

See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22])

Provide interpretations of the output (consider the variable info).

test_stat <- france_data %>%
  specify(explanatory = domicil, # change variable name for explanatory variable
          response = sbbsntx) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # replace in between quotation marks appropriate test statistic

## Warning: Removed 14528 rows containing missing values.

## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.

print(test_stat$stat)

##    domicil 
## -0.0768133

We can see that the output suggests the analysis does not provide meaningful information about the relationship between the variables.Further clarification is needed.

null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
  calculate(stat = "slope")

## Warning: Removed 14528 rows containing missing values.

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

The code concludes that the null should be rejected.

task 3

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution

## Response: sbbsntx (numeric)
## Explanatory: domicil (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1  0.0117 
##  2         2  0.0227 
##  3         3 -0.0375 
##  4         4 -0.0209 
##  5         5 -0.0159 
##  6         6 -0.0189 
##  7         7 -0.0139 
##  8         8 -0.0111 
##  9         9  0.00392
## 10        10 -0.0139 
## # ℹ 990 more rows

The red line indicates that the null should be rejected.

task 4

Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.

Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]

Provide interpretations of the output (consider the variable info).

france_data <- ess %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(clsprty)  # Keep other values as-is but ensure they are characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    # Recoding education with case_when for consistency
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
  # Ensure all NAs are handled uniformly
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba)  # Force educ.ba to be treated as character
  )

# using unique to identify
unique(france_data$gndr)

## [1] "Female" "Male"

# does not return NA, but the following does
unique(france_data$clsprty)

## [1] "Yes" "No"  NA

# here's how to make sure it's really cleaned
france_data <- france_data %>% filter(!is.na(clsprty))
france_data <- france_data %>% filter(!is.na(trstplt))

# double-checking + quick distribution
unique(france_data$clsprty)

## [1] "Yes" "No"

table(france_data$clsprty)

## 
##   No  Yes 
## 9098 9480

france_clean <- france_data %>% 
  mutate(
    clsprty = ifelse(clsprty == 2, 0, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty)), # If 'clsprty' is 2, set it to 0. If it's 7, 8, or 9, set it to NA.
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem), # For 'stfdem', set values 77, 88, and 99 to NA.
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt), # For 'trstplt', set values 77, 88, and 99 to NA.
  )

france_clean <- france_clean %>% filter(!is.na(clsprty), !is.na(stfdem), !is.na(trstplt))

france_clean <- france_clean %>%
  mutate(
    # Recoding 'lrscale' into a new variable 'ID'
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",                    # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",                  # Values 7 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% 4:6 ~ "Moderate",                # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    # Recoding 'yrbrn' (year born) into a new variable 'Gen'
    Gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",                   
      yrbrn %in% 1946:1964 ~ "2",                   
      yrbrn %in% 1965:1979 ~ "3",                   
      yrbrn %in% 1980:1996 ~ "4",                 
      TRUE ~ as.character(yrbrn)                    
    ),
    # Converting 'Gen' to a factor with meaningful labels
    Gen = factor(Gen, levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")) # Assigning labels to the levels in 'Gen'
  )

test_stat <- france_data %>%
  specify(explanatory = lrscale, # change variable name for explanatory variable
          response = ccrdprs) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # replace in between quotation marks appropriate test statistic

## Warning: Removed 14645 rows containing missing values.

## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.

print(test_stat$stat)

##     lrscale 
## 0.007179428

null_distribution <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "slope")

## Warning: Removed 14645 rows containing missing values.

p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.206

The output indicates that the calculated p-value of 0.206 is directly from the observed statistic. The value of the p-value tells us that there is insufficient evidence to reject the null hypothesis at conventional levels of significance, indicating that the observed statistic is not significantly different from what would be expected under the null hypothesis.

task 5

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution

## Response: ccrdprs (numeric)
## Explanatory: lrscale (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate      stat
##        <int>     <dbl>
##  1         1  0.00664 
##  2         2  0.00326 
##  3         3  0.00675 
##  4         4 -0.00912 
##  5         5  0.00327 
##  6         6  0.00217 
##  7         7  0.00670 
##  8         8  0.00474 
##  9         9 -0.000480
## 10        10 -0.00393 
## # ℹ 990 more rows

Since the red line is set based on the test statistic and located within the distributions the null should be rejected.

Kim_Roy_Homework_3

2024-03-12

task 1

task 2

task 3

task 4

task 5