JoslinSoto_Nicole_Project

packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
               "fst", "viridis", "knitr", "rmarkdown", "ggridges", "viridis", "questionr", "flextable", "infer")

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]

if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "rmarkdown"    "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "flextable"    "questionr"    "ggridges"     "rmarkdown"    "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[13]]
##  [1] "infer"        "flextable"    "questionr"    "ggridges"     "rmarkdown"   
##  [6] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
## [11] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"

setwd("C:/Users/Erika/Desktop/SOC_202_YAY"
)

library(fst) 

ess <- read_fst("All-ESS-Data.fst")

france_data <- ess %>%
  # Filter for France
  filter(cntry == "FR") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",  # Recode 1 to "Yes"
      clsprty == 2 ~ "No",   # Recode 2 to "No"
      clsprty %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(clsprty)  # Keep other values as-is but ensure they are characters
    ),
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
    
    # Recoding education for rounds 1-4
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    
    # Re-coding education for rounds 5 and later
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    
    # Create educ_level column
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    ),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    )
  )

unique(france_data$gndr)

## [1] "Female" "Male"

unique(france_data$clsprty)

## [1] "Yes" "No"  NA

france_data <- france_data %>% filter(!is.na(clsprty))
france_data <- france_data %>% filter(!is.na(trstplt))
france_data <- france_data %>% filter(!is.na(educ_level))

unique(france_data$clsprty)

## [1] "Yes" "No"

table(france_data$clsprty)

## 
##   No  Yes 
## 9098 9480

tab_dat <- france_data


mytab <- table(tab_dat$clsprty, tab_dat$educ_level)


rsums <- rowSums(mytab)


csums <- colSums(mytab)


N <- sum(mytab)

ptab <- tcrossprod(rsums/N, csums/N)

cat("Table of Expected Proportions:\n")

## Table of Expected Proportions:

print(ptab)

##           [,1]      [,2]
## [1,] 0.1095265 0.3801926
## [2,] 0.1141252 0.3961558

ftab <- N * ptab
cat("Table of Expected Frequencies:\n")

## Table of Expected Frequencies:

print(ftab)

##          [,1]     [,2]
## [1,] 2034.783 7063.217
## [2,] 2120.217 7359.783

alpha <- 0.05 
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")

## Critical Value: 3.841

test_stat <- sum((mytab - ftab)^2 / ftab)

cat("Pearson's X^2:", round(test_stat, 4), "\n")

## Pearson's X^2: 159.6903

p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")

## p-value: 0

cat("Chi-squared test result:\n")

## Chi-squared test result:

print(chisq.test(mytab, correct = F))

## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 159.69, df = 1, p-value < 2.2e-16

test_stat <- france_data %>%
  specify(explanatory = educ_level, 
          response = trstplt) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "t")

## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA" - "No BA", or divided in the order "BA" / "No BA" for ratio-based
## statistics. To specify this order yourself, supply `order = c("BA", "No BA")`
## to the calculate() function.

print(test_stat$stat)

##        t 
## 14.10355

null_distribution <- france_data %>%
  specify(explanatory = educ_level,
          response = trstplt) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "t")

## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA" - "No BA", or divided in the order "BA" / "No BA" for ratio-based
## statistics. To specify this order yourself, supply `order = c("BA", "No BA")`
## to the calculate() function.

p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution

## Response: trstplt (numeric)
## Explanatory: educ_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.119 
##  2         2  0.0192
##  3         3  0.407 
##  4         4  0.509 
##  5         5  1.62  
##  6         6  1.02  
##  7         7 -0.0215
##  8         8 -0.572 
##  9         9 -1.43  
## 10        10  1.45  
## # ℹ 990 more rows

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_distribution

## Response: trstplt (numeric)
## Explanatory: educ_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.119 
##  2         2  0.0192
##  3         3  0.407 
##  4         4  0.509 
##  5         5  1.62  
##  6         6  1.02  
##  7         7 -0.0215
##  8         8 -0.572 
##  9         9 -1.43  
## 10        10  1.45  
## # ℹ 990 more rows

edu_stat <- france_data %>%
  specify(explanatory = gndr, 
          response = educ_level, 
          success = "BA") %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "z")

## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "Female" - "Male", or divided in the order "Female" / "Male" for
## ratio-based statistics. To specify this order yourself, supply `order =
## c("Female", "Male")` to the calculate() function.

print(edu_stat$stat)

## [1] -1.11198

null_edu <- france_data %>%
  specify(explanatory = gndr, 
          response = educ_level, 
          success = "BA") %>% 
  hypothesize(null = "independence") %>%
    generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "z")

## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "Female" - "Male", or divided in the order "Female" / "Male" for
## ratio-based statistics. To specify this order yourself, supply `order =
## c("Female", "Male")` to the calculate() function.

p_val <- null_edu %>% 
  get_pvalue(obs_stat = edu_stat, direction = "two-sided") 

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.286

null_edu %>%
  visualize() +
  shade_p_value(obs_stat = edu_stat, direction = "two-sided")

null_edu

## Response: educ_level (factor)
## Explanatory: gndr (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 1
##       stat
##      <dbl>
##  1  1.43  
##  2 -0.512 
##  3 -1.29  
##  4  0.229 
##  5 -0.0532
##  6 -3.23  
##  7 -0.583 
##  8 -1.15  
##  9  1.01  
## 10 -0.265 
## # ℹ 990 more rows

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_edu %>%
  visualize() +
  shade_p_value(obs_stat = edu_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

null_edu

## Response: educ_level (factor)
## Explanatory: gndr (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 1
##       stat
##      <dbl>
##  1  1.43  
##  2 -0.512 
##  3 -1.29  
##  4  0.229 
##  5 -0.0532
##  6 -3.23  
##  7 -0.583 
##  8 -1.15  
##  9  1.01  
## 10 -0.265 
## # ℹ 990 more rows

JoslinSoto_Nicole_Project_202

2023-11-13