#Installing & Loading Packages
packages <- c("tidyverse", "modelsummary", "flextable", "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "questionr", "infer")
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) install.packages(new_packages)
#invisible() suppresses the list of attached packages that lapply() would print
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Version 2.0.0 of `modelsummary`, to be released soon, will introduce a
## breaking change: The default table-drawing package will be `tinytable`
## instead of `kableExtra`. All currently supported table-drawing packages
## will continue to be supported for the foreseeable future, including
## `kableExtra`, `gt`, `huxtable`, `flextable`, and `DT`.
##
## You can always call the `config_modelsummary()` function to change the
## default table-drawing package in persistent fashion. To try `tinytable`
## now:
##
## config_modelsummary(factory_default = 'tinytable')
##
## To set the default back to `kableExtra`:
##
## config_modelsummary(factory_default = 'kableExtra')
##
##
## Attaching package: 'flextable'
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following objects are masked from 'package:flextable':
##
## as_image, footnote
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
#Import dataset
rm(list = ls()); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1247649 66.7 2278070 121.7 NA 2278070 121.7
## Vcells 2139699 16.4 8388608 64.0 16384 3200680 24.5
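#Optional guard (a sketch, not in the original code): fail early with a clear
#message if the ESS extract is not in the working directory
stopifnot(file.exists("All-ESS-Data.fst"))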
ess <- read.fst("All-ESS-Data.fst")
#Recoding & cleaning education and lrscale
france_data <- ess %>%
  filter(cntry == "FR") %>%
  mutate(
    #Recode the ESS missing-value codes to NA before deriving educ.ba;
    #otherwise codes such as 7777/8888 (which are > 600) end up classified
    #as "BA or more" and 77/88 as "No BA"
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      essround < 5 & is.na(edulvla) ~ NA_character_,
      essround >= 5 & is.na(edulvlb) ~ NA_character_,
      TRUE ~ "No BA")) %>%
  mutate(
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",
      lrscale %in% 7:10 ~ "Right",
      lrscale %in% c(4, 5, 6, 77, 88, 99) ~ NA_character_)) %>%
  filter(!is.na(lrscale) & !is.na(educ.ba))
#Note: 4, 5, 6 set to NA because those values correspond to "Moderate"; the
#ESS missing codes (77/88/99) are also dropped
#Checking variables and lengths before making contingency tables
table(france_data$lrscale)
##
## Left Right
## 4709 4302
table(france_data$educ.ba)
##
## BA or more No BA
## 2261 6750
length(france_data$lrscale)
## [1] 9011
length(france_data$educ.ba)
## [1] 9011
#New dataset & contingency table
tab_dat <- france_data
mytab <- table(tab_dat$lrscale, tab_dat$educ.ba)
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
#Table of expected proportions: under independence, cell (i, j) has
#probability p_i. * p_.j, the outer product of the marginal proportions
ptab <- tcrossprod(rsums / N, csums / N)
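#For labeled output (a sketch, not in the original): the expected tables below
#print with bare [,1]/[,2] indices; one could carry the labels over from the
#observed table, e.g.
# dimnames(ptab) <- dimnames(mytab)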
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.1311243 0.3914592
## [2,] 0.1197912 0.3576253
#Table of expected frequencies: E_ij = N * p_i. * p_.j
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 1181.561 3527.439
## [2,] 1079.439 3222.561
#Critical value for the test of independence, df = (2 - 1) * (2 - 1) = 1
alpha <- 0.05
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
#Pearson's chi-squared statistic: X^2 = sum((O - E)^2 / E)
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 100.8551
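#Decision rule (a sketch supplementing the original): reject H0 at the 5%
#level when the observed statistic exceeds the critical value; here
#100.8551 > 3.841, so independence is rejected
test_stat > c_val
## [1] TRUE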
#Calculating the p-value
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value", round(p_val, 4), "\n")
## p-value 0
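#Note: rounded to 4 digits the p-value prints as 0; the chisq.test() output
#below reports it more informatively as < 2.2e-16.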
cat("Chi-squared test result:\n")
## Chi-squared test result:
#Cross-check with the built-in test; the continuity correction is turned off
#to match the by-hand calculation
print(chisq.test(mytab, correct = FALSE))
##
## Pearson's Chi-squared test
##
## data: mytab
## X-squared = 100.86, df = 1, p-value < 2.2e-16
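#Supplementary sketch (not part of the original analysis): with N = 9011, even
#a modest association reaches significance, so the phi coefficient for a 2x2
#table, sqrt(X^2 / N), is a useful gauge of the association's magnitude
phi <- sqrt(test_stat / N)
cat("Phi coefficient:", round(phi, 3), "\n")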
#Recoding and cleaning sbbsntx & domicil
france_data <- ess %>%
  filter(cntry == "FR") %>%
  mutate(
    sbbsntx = case_when(
      sbbsntx == 1 ~ "Agree strongly",
      sbbsntx == 2 ~ "Agree",
      sbbsntx == 3 ~ "Neither agree nor disagree",
      sbbsntx == 4 ~ "Disagree",
      sbbsntx == 5 ~ "Disagree strongly",
      sbbsntx %in% c(7, 8, 9) ~ NA_character_),
    domicil = case_when(
      domicil %in% c(1, 2) ~ "Urban",
      domicil %in% c(3, 4, 5) ~ "Rural",
      domicil %in% c(7, 8, 9) ~ NA_character_)) %>%
  filter(!is.na(domicil) & !is.na(sbbsntx))
#Checking variables
table(france_data$sbbsntx)
##
## Agree Agree strongly
## 1538 756
## Disagree Disagree strongly
## 640 301
## Neither agree nor disagree
## 804
table(france_data$domicil)
##
## Rural Urban
## 2876 1163
#Step 1: Calculate the observed test statistic
test_stat <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq")
#The chi-squared statistic is used because sbbsntx is categorical with five
#levels and domicil is a binary categorical variable.
print(test_stat$stat)
## X-squared
## 19.97187
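#Sanity check (a sketch, not in the original): the classical test should
#reproduce infer's observed statistic here, since chisq.test() applies no
#continuity correction to tables larger than 2x2
chisq.test(table(france_data$domicil, france_data$sbbsntx))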
#Step 2: Simulate the null distribution
null_distribution <- france_data %>%
  specify(explanatory = domicil,
          response = sbbsntx) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "chisq")
#Step 3: Calculate the p-value
p_val <- null_distribution %>%
  get_p_value(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
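#Note: none of the 1,000 permuted statistics reached the observed 19.97, so
#report p < 0.001 (i.e., less than 1/reps) rather than p = 0, as the warning
#above advises.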
#Step 4: Visualize
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater")
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
null_distribution
## Response: sbbsntx (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 4.86
## 2 2 4.10
## 3 3 5.12
## 4 4 10.4
## 5 5 4.13
## 6 6 0.770
## 7 7 6.07
## 8 8 1.37
## 9 9 0.718
## 10 10 2.48
## # ℹ 990 more rows
#Note: "greater" used because computed a chisq test
#Visualize with confidence intervals
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
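#Note: the shaded interval is the central 95% of the simulated null
#distribution (an acceptance region for the test), not a confidence interval
#for a population parameter; the observed statistic lies far outside it,
#consistent with rejecting independence.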
#Note: "greater" used because computed a chisq test
#Recoding & cleaning ccrdprs and lrscale
france_data <- ess %>%
  filter(cntry == "FR") %>%
  mutate(
    ccrdprs = ifelse(ccrdprs %in% c(66, 77, 88, 99), NA_integer_, ccrdprs),
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",
      lrscale %in% 7:10 ~ "Right",
      lrscale %in% c(4, 5, 6, 77, 88, 99) ~ NA_character_)) %>%
  filter(!is.na(lrscale) & !is.na(ccrdprs))
#Note: 4, 5, 6 set to NA because those values correspond to "Moderate"
#Checking variables
table(france_data$lrscale)
##
## Left Right
## 889 982
table(france_data$ccrdprs)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 33 12 28 38 57 205 173 344 472 174 335
#Step 1: Calculate the observed test statistic cc_stat
cc_stat <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  #order is supplied explicitly so the difference is taken as "Left" - "Right",
  #which is infer's default and what its warning otherwise asks for
  calculate(stat = "t", order = c("Left", "Right"))
#The t-statistic is used because lrscale is a dichotomous categorical
#explanatory variable and ccrdprs is a numeric response variable.
print(cc_stat$stat)
## t
## 5.482311
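#Sanity check (a sketch, not in the original): assuming infer's default
#Welch-type two-sample t, the classical test should reproduce the observed
#statistic, up to the "Left" - "Right" ordering of the groups
t.test(ccrdprs ~ lrscale, data = france_data)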
#Step 2: Simulate the null distribution
null_cc <- france_data %>%
  specify(explanatory = lrscale,
          response = ccrdprs) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("Left", "Right"))
#Step 3: Calculate the p-value
p_val <- null_cc %>%
  get_p_value(obs_stat = cc_stat, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
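#Note: as before, none of the 1,000 permuted t statistics was as extreme as
#the observed 5.48 in absolute value, so report p < 0.001 rather than p = 0.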
#Step 4: Visualize
null_cc %>%
visualize() +
shade_p_value(obs_stat = cc_stat, direction = "two-sided")
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?
null_cc
## Response: ccrdprs (numeric)
## Explanatory: lrscale (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 -0.532
## 2 2 -0.852
## 3 3 1.50
## 4 4 -1.19
## 5 5 -0.511
## 6 6 -0.321
## 7 7 -1.34
## 8 8 0.483
## 9 9 -0.428
## 10 10 0.378
## # ℹ 990 more rows
#Visualize with confidence intervals
#The endpoints must come from the t-statistic null distribution (null_cc),
#not the chi-squared null_distribution from the previous analysis
conf_int <- null_cc %>%
  get_confidence_interval(level = 0.95, type = "percentile")
null_cc %>%
  visualize() +
  shade_p_value(obs_stat = cc_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?