rm(list=ls()); gc() 
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526932 28.2    1170811 62.6         NA   669417 35.8
## Vcells 970899  7.5    8388608 64.0      16384  1851684 14.2
# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
ess <- read_fst("france_data.fst")

Task 1

Based on the recoding of lrscale for “left” and “right” and omitting “moderates” (see Tutorial 4), and educ.ba from this tutorial, do the long form coding for Chi-Square. Using the steps outlined in the tutorial, first generate the tables of expected proportions and frequencies. Determine and interpret the critical value for independence. Finally, determine and interpret both the Pearson’s chi-squared statistic and the p-value. Discuss main takeaways.

france_data <- ess %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(clsprty)  # Keep other values as-is but ensure they are characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    # Recoding education with case_when for consistency
    # (caveat: this runs before the missing codes are cleaned in the next
    # mutate(), so edulvlb missing codes 5555/7777/8888 satisfy "> 600" and
    # land in "BA or more", while edulvla missing codes fall into "No BA")
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
  # Ensure all NAs are handled uniformly
  mutate(
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba)  # Force educ.ba to be treated as character
  )
# using unique to identify
unique(france_data$gndr)
## [1] "Female" "Male"
unique(france_data$clsprty)
## [1] "Yes" "No"  NA
# here's how to make sure it's really cleaned
france_data <- france_data %>% filter(!is.na(clsprty))
france_data <- france_data %>% filter(!is.na(trstplt))

# double-checking + quick distribution
unique(france_data$clsprty)
## [1] "Yes" "No"
table(france_data$clsprty)
## 
##   No  Yes 
## 9098 9480
france_clean <- france_data %>% 
  mutate(
    # Note: clsprty was already recoded to "Yes"/"No" above, so this line no
    # longer changes anything
    clsprty = ifelse(clsprty == 2, 0, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty)),
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem),   # set ESS missing codes 77/88/99 to NA
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt) # same cleaning as above (harmless to repeat)
  )
france_clean <- france_clean %>% filter(!is.na(clsprty), !is.na(stfdem), !is.na(trstplt))
france_clean <- france_clean %>%
  mutate(
    # Recoding 'lrscale' into a new variable 'ID'
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",                    # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",                  # Values 7 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% 4:6 ~ "Moderate",                # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    # Recoding 'yrbrn' (year born) into a new variable 'Gen'
    Gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",                   # Birth years from 1900 to 1945 categorized as "1"
      yrbrn %in% 1946:1964 ~ "2",                   # Birth years from 1946 to 1964 categorized as "2"
      yrbrn %in% 1965:1979 ~ "3",                   # Birth years from 1965 to 1979 categorized as "3"
      yrbrn %in% 1980:1996 ~ "4",                   # Birth years from 1980 to 1996 categorized as "4"
      TRUE ~ as.character(yrbrn)                    # Other years remain unchanged
    ),
    # Converting 'Gen' to a factor with meaningful labels
    Gen = factor(Gen, levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")) # Assigning labels to the levels in 'Gen'
  )
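A quick way to verify the recodes (a hedged suggestion; the exact counts depend on the rounds in the file, so no output is shown here):

# Cross-check the new variables, keeping NAs visible
table(france_clean$ID, useNA = "ifany")   # Left / Moderate / Right / NA
table(france_clean$Gen, useNA = "ifany")  # generation labels / NA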
tab_dat <- france_data # copy of the cleaned France data used for the chi-square tables

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$clsprty, tab_dat$educ.ba)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Generate table of expected proportions: the outer product of the row and
# column marginal proportions, p[i,j] = (rsums[i]/N) * (csums[j]/N)
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##           [,1]      [,2]
## [1,] 0.1099746 0.3797444
## [2,] 0.1145921 0.3956889
# Table of expected frequencies
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##          [,1]     [,2]
## [1,] 2043.108 7054.892
## [2,] 2128.892 7351.108
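One cell can be checked by hand against the output above (a small sanity check):

# Expected frequency for cell (1,1): E = (row 1 total * column 1 total) / N
rsums[1] * csums[1] / N # should reproduce ftab[1,1], approx. 2043.108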
# Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

Interpreting the critical value: at the 0.05 significance level, we reject the null hypothesis of independence if the chi-squared test statistic (computed below, and validated with chisq.test()) exceeds 3.841, the 0.95 quantile of the chi-squared distribution with 1 degree of freedom.

# Pearson's chi-squared statistic:
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 160.4057
# p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n") # rounds to 0; the true value is far smaller (see chisq.test() below)
## p-value: 0
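Comparing the statistic to the critical value directly gives the same decision:

# Decision rule: reject H0 if the test statistic exceeds the critical value
test_stat > c_val # TRUE here (160.41 > 3.841), so we reject independence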
# Using chisq.test() for validation (correct = FALSE turns off the Yates
# continuity correction, so it matches the manual calculation above):
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 160.41, df = 1, p-value < 2.2e-16

Analysis of the chi-squared test results:

The p-value is reported as "p-value < 2.2e-16", meaning it is smaller than 2.2 × 10^(-16), the smallest difference R distinguishes by default; it is effectively zero. (Our manual calculation printed 0 only because we rounded to four decimal places.)

Such a small p-value leads us to reject the null hypothesis of independence: there is strong evidence of an association between feeling close to a particular party (clsprty) and holding a BA or more (educ.ba).

The large chi-squared statistic of 160.41 tells the same story: the larger the statistic, the larger the discrepancy between the observed and expected frequencies, and hence the stronger the evidence against the null hypothesis that the variables are independent.
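To see which cells drive the association, one option (an optional extension, not part of the task) is to inspect the Pearson residuals that chisq.test() returns; cells with large absolute residuals contribute most to the statistic:

# Pearson residuals: (observed - expected) / sqrt(expected), cell by cell
chisq.test(mytab, correct = FALSE)$residuals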

Task 2

Do Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables sbbsntx (recoded to the corresponding 5 categories) as the response and domicil (recoded to urban and rural, setting 2 to urban instead of peri-urban) as the explanatory variable for the country of France.

See variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%22874a86e2-a0f6-40b4-aef7-51c8eed98a7d/1%22])

Provide interpretations of the output (consider the variable info).

test_stat <- france_data %>%
  specify(explanatory = domicil,  # explanatory variable
          response = sbbsntx) %>% # outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # slope, since both variables are left numeric here
## Warning: Removed 14528 rows containing missing values.
## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.
print(test_stat$stat)
##    domicil 
## -0.0768133
null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # add 1000 permutation replicates (reduce reps if too computationally demanding)
  calculate(stat = "slope")
## Warning: Removed 14528 rows containing missing values.
p_val <- null_distribution %>%
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # two-sided permutation p-value for the observed slope
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

With a permutation p-value of approximately 0, far below 0.05, there is strong evidence to reject the null hypothesis of independence: domicil and sbbsntx do appear to be related. The observed slope of about -0.077 says that, on the raw numeric codes, respondents in more rural locations (higher domicil values) give slightly lower sbbsntx answers, which, assuming the standard ESS coding (1 = agree strongly, 5 = disagree strongly), means leaning more toward agreeing that social benefits and services cost businesses too much in taxes and charges. As the warning above notes, the p-value is an approximation based on 1,000 permutations, so it is better reported as p < 0.001 than as exactly 0.
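One caveat: the code above leaves both variables on their raw numeric codes rather than recoding them as the task describes. A hedged sketch of that recode (the category labels and the 7/8/9 missing codes are assumed from the ESS codebook, not verified here):

# Hedged sketch only -- not run above
france_dom <- france_data %>%
  mutate(
    domicil.ur = case_when(
      domicil %in% c(1, 2) ~ "Urban",   # big city + suburbs/outskirts (task: 2 counts as urban)
      domicil %in% c(3, 4, 5) ~ "Rural",
      TRUE ~ NA_character_              # 7/8/9 are ESS missing codes
    ),
    sbbsntx.cat = case_when(
      sbbsntx == 1 ~ "Agree strongly",
      sbbsntx == 2 ~ "Agree",
      sbbsntx == 3 ~ "Neither agree nor disagree",
      sbbsntx == 4 ~ "Disagree",
      sbbsntx == 5 ~ "Disagree strongly",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(domicil.ur), !is.na(sbbsntx.cat))

With a binary explanatory variable and a five-category response, the appropriate infer statistic would then be "Chisq" rather than "slope".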

Task 3

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 2, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution
## Response: sbbsntx (numeric)
## Explanatory: domicil (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0287 
##  2         2  0.00504
##  3         3  0.0249 
##  4         4  0.0271 
##  5         5 -0.00239
##  6         6 -0.0237 
##  7         7  0.00207
##  8         8  0.00522
##  9         9  0.0112 
## 10        10 -0.00851
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_distribution
## Response: sbbsntx (numeric)
## Explanatory: domicil (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0287 
##  2         2  0.00504
##  3         3  0.0249 
##  4         4  0.0271 
##  5         5 -0.00239
##  6         6 -0.0237 
##  7         7  0.00207
##  8         8  0.00522
##  9         9  0.0112 
## 10        10 -0.00851
## # ℹ 990 more rows
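As a quick numeric check on what the plot shows (a hedged sanity check, using the lower_ci/upper_ci columns that get_confidence_interval() returns):

# TRUE means the observed slope lies outside the permutation 95% interval
test_stat$stat < conf_int$lower_ci | test_stat$stat > conf_int$upper_ci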

Under the assumptions of frequentist hypothesis testing, the red line (the observed slope) falls far outside the null distribution and outside the shaded 95% confidence interval, which is strong evidence against, and grounds to reject, the null hypothesis of independence.

Task 4

Conduct Steps 1 to 3 using the infer package (as we did in the tutorial), for the variables ccrdprs (leaving it on the 0-10 numeric scale) as the response (or outcome) variable and lrscale (recoded as left and right, omitting “moderates” as we did in Task 1) as the explanatory variable.

Variable info here: https://ess.sikt.no/en/datafile/ffc43f48-e15a-4a1c-8813-47eda377c355/92?tab=1&elements=[%2283b08cf2-508e-49a0-9fc8-c3cc281290ec/4%22]

Provide interpretations of the output (consider the variable info).

The data preparation here is identical to Task 1, so we reuse france_data (and france_clean) exactly as created above.
test_stat <- france_data %>%
  specify(explanatory = lrscale,  # explanatory variable (left on its raw 0-10 codes here)
          response = ccrdprs) %>% # outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "slope") # slope, since both variables are treated as numeric
## Warning: Removed 14645 rows containing missing values.
## Message: The independence null hypothesis does not inform calculation of the
## observed statistic (a slope) and will be ignored.
print(test_stat$stat)
##     lrscale 
## 0.007179428
null_distribution <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # add 1000 permutation replicates (reduce reps if too computationally demanding)
  calculate(stat = "slope")
## Warning: Removed 14645 rows containing missing values.
p_val <- null_distribution %>%
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # two-sided permutation p-value for the observed slope

p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.212
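Before interpreting, note that the analysis above leaves lrscale on its raw 0-10 codes, so the ESS missing codes 77/88/99 (if present) enter the slope, and "moderates" are not omitted as the task describes. A hedged sketch of the Left/Right version, reusing the ID recode from france_clean (Task 1) and swapping the slope for a difference in means; the ccrdprs missing codes assumed here (66/77/88/99) follow the usual ESS convention but are not verified:

# Hedged sketch only -- not run above
france_lr <- france_clean %>%
  mutate(ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA, ccrdprs)) %>%
  filter(ID %in% c("Left", "Right"), !is.na(ccrdprs))

null_dist_lr <- france_lr %>%
  specify(response = ccrdprs, explanatory = ID) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("Left", "Right")) # mean(Left) - mean(Right)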

The permutation p-value for the observed slope is 0.212. Since this is well above conventional significance levels such as 0.05, the observed slope of about 0.007 is not distinguishable from what the null distribution produces, and we fail to reject the null hypothesis. In other words, with lrscale kept on its raw 0-10 scale, there is no evidence here that left-right self-placement predicts ccrdprs (the extent to which respondents feel a personal responsibility to try to reduce climate change).

Task 5

Now do Step 4 (i.e., the null distribution visualization with confidence intervals) for the same variables and country as Task 4, and interpret the output.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate       stat
##        <int>      <dbl>
##  1         1 -0.00549  
##  2         2 -0.000150 
##  3         3  0.0197   
##  4         4 -0.0000880
##  5         5  0.00513  
##  6         6  0.00481  
##  7         7  0.00459  
##  8         8  0.00216  
##  9         9 -0.00244  
## 10        10  0.00441  
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_distribution
## Response: ccrdprs (numeric)
## Explanatory: lrscale (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate       stat
##        <int>      <dbl>
##  1         1 -0.00549  
##  2         2 -0.000150 
##  3         3  0.0197   
##  4         4 -0.0000880
##  5         5  0.00513  
##  6         6  0.00481  
##  7         7  0.00459  
##  8         8  0.00216  
##  9         9 -0.00244  
## 10        10  0.00441  
## # ℹ 990 more rows
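The same hedged check as in Task 3 can confirm the picture; here it should return FALSE, since the observed slope sits inside the interval:

# FALSE means the observed slope lies inside the permutation 95% interval
test_stat$stat < conf_int$lower_ci | test_stat$stat > conf_int$upper_ci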

Given the usual cutoff points at the tails, we would not reject the null: the red line sits well within the null distribution and inside the shaded 95% confidence interval. The red line marks the observed test statistic (the slope from Step 1) and shows how far it departs from what the null distribution would generate; here it barely departs at all.

End.