# List of packages
packages <- c("tidyverse", "infer", "fst")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.3 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
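Note that lapply() returns a list of attached-package vectors, which is what autoprints above. If that output is unwanted, wrapping the call in invisible() suppresses it; a minor optional alternative:
invisible(lapply(packages, library, character.only = TRUE)) # attach packages quietly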
setwd("C:/Users/mimi h/Documents/UOFT/YEAR FOUR/SOC202/Homework_7_Project")
ess <- read_fst("All-ESS-Data.fst")
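As an aside, a hard-coded absolute path ties the script to one machine. A more portable sketch, assuming the here package is installed (not part of the original setup) and the data file sits at the project root:
# Portable alternative: resolve the path relative to the project root
ess <- read_fst(here::here("All-ESS-Data.fst"))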
Recoding data:
swiss_data <- ess %>%
  filter(cntry == "CH") %>%
  mutate(
    # Recoding brncntr (born in country): 1 = Yes, 2 = No; 7/8/9 are missing codes
    brncntr = case_when(
      brncntr == 1 ~ "Yes",
      brncntr == 2 ~ "No",
      brncntr %in% c(7, 8, 9) ~ NA_character_,
      TRUE ~ as.character(brncntr)
    ),
    # Education variable used before round 5: treat code 55 as missing
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    # Education variable used from round 5 on: treat code 5555 as missing
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    # BA-level attainment: edulvla == 5 before round 5, edulvlb > 600 afterwards;
    # note the TRUE fallback also assigns "No BA" to rows with missing education codes
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    ))
swiss_data <- swiss_data %>% filter(!is.na(brncntr), !is.na(educ_level))
unique(swiss_data$brncntr)
## [1] "No" "Yes"
Generating tables for proportions and frequencies:
tab_dat <- swiss_data
mytab <- table(tab_dat$brncntr, tab_dat$educ_level)
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
# Expected proportions under independence: outer product of row and column shares
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.05206627 0.1764393
## [2,] 0.17578931 0.5957051
# Expected frequencies: scale the expected proportions back up by N
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 881.1175 2985.882
## [2,] 2974.8825 10081.118
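As a sanity check, these hand-computed expected counts should match the expected counts that chisq.test() derives internally (exposed via its $expected component):
# Cross-check: expected counts under independence, per base R
chisq.test(mytab)$expected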
Generating critical value to assess independence:
alpha <- 0.05
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE) # df = (2-1)*(2-1) = 1 for a 2x2 table
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
If the chi-squared test statistic that I will compute exceeds 3.841, then I will reject the null hypothesis at the 0.05 significance level.
Calculating Pearson’s Chi-squared statistic:
# Pearson's X^2: sum over all cells of (observed - expected)^2 / expected
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 65.1218
The X-squared value is much higher than the critical value of 3.841, meaning that there is a statistically significant association between being born in or outside Switzerland and level of educational attainment. In other words, the probability of being born in or outside of Switzerland is not the same across different levels of educational attainment.
Calculating the p-value:
p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
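Rounding to four decimal places collapses a tiny p-value to 0. Printing it with format() keeps the scientific notation, matching the 7.041e-16 reported by chisq.test() below:
cat("p-value:", format(p_val, digits = 4), "\n") # avoids rounding tiny p-values to zero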
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE)) # no Yates' correction, to match the manual calculation
##
## Pearson's Chi-squared test
##
## data: mytab
## X-squared = 65.122, df = 1, p-value = 7.041e-16
The p-value is extremely close to zero, reinforcing the strong statistical association between the two variables; an extremely small p-value is further evidence against the null hypothesis. The main takeaway from both tests is that there is an association between native-born status and post-secondary educational attainment in Switzerland.
Step 1: Calculating the test statistic for domicil and gincdif in the Netherlands.
Cleaning and recoding data:
nl_data <- ess %>%
filter(cntry == "NL")
nl_data <- nl_data %>%
  # Collapse domicil into Urban (codes 1-2) vs Rural (codes 3-5); 7/8/9 are missing codes
  mutate(geo = recode(as.character(domicil),
                      '1' = "Urban",
                      '2' = "Urban",
                      '3' = "Rural",
                      '4' = "Rural",
                      '5' = "Rural",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_))
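In dplyr 1.1.0 and later, recode() is superseded; case_match() expresses the same mapping more compactly. A sketch of the equivalent recode (note that unmatched codes fall to NA by default):
# Equivalent recode with case_match() (dplyr >= 1.1.0); sketch only
nl_data <- nl_data %>%
  mutate(geo = case_match(domicil,
                          1:2 ~ "Urban",
                          3:5 ~ "Rural",
                          c(7, 8, 9) ~ NA_character_))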
table(nl_data$geo)
##
## Rural Urban
## 12681 5623
nl_data <- nl_data %>% filter(!is.na(geo))
nl_data <- nl_data %>%
  mutate(
    # Label the gincdif response categories; 7/8/9 are missing codes
    gincdif = case_when(
gincdif == 1 ~ "Agree strongly",
gincdif == 2 ~ "Agree",
gincdif == 3 ~ "Neither agree nor disagree",
gincdif == 4 ~ "Disagree",
gincdif == 5 ~ "Disagree strongly",
gincdif %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(gincdif)
))
nl_data <- nl_data %>% filter(!is.na(gincdif))
table(nl_data$gincdif)
##
## Agree Agree strongly
## 7886 2889
## Disagree Disagree strongly
## 3676 543
## Neither agree nor disagree
## 3137
Calculating the critical value:
tab_dat <- nl_data
mytab <- table(tab_dat$gincdif, tab_dat$geo)
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.30163846 0.133307214
## [2,] 0.11050387 0.048836488
## [3,] 0.14060651 0.062140162
## [4,] 0.02076968 0.009179028
## [5,] 0.11998984 0.053028751
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 5469.007 2416.9931
## [2,] 2003.546 885.4544
## [3,] 2549.337 1126.6633
## [4,] 376.575 166.4250
## [5,] 2175.536 961.4643
alpha <- 0.05
df <- (nrow(mytab) - 1) * (ncol(mytab) - 1) # (5-1)*(2-1) = 4 for this 5x2 table
c_val <- qchisq(alpha, df = df, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 9.488
Calculating the test statistic:
test_stat <- nl_data %>%
specify(explanatory = gincdif,
response = geo) %>%
hypothesize(null = "independence") %>%
calculate(stat = "Chisq")
print(test_stat$stat)
## X-squared
## 8.176048
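As a cross-check on infer's calculation, the same statistic is available from base R's chisq.test() on the contingency table built earlier; it should reproduce the 8.176 above:
# Cross-check against base R (the continuity correction only applies to 2x2 tables)
chisq.test(mytab)$statistic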
Simulating the distribution:
null_distribution <- nl_data %>%
specify(explanatory = gincdif,
response = geo) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
Calculating the p-value for the sample:
p_val <- null_distribution %>%
  get_p_value(obs_stat = test_stat, direction = "greater")
p_val
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.081
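For a table this large, the permutation p-value should agree closely with the theoretical chi-squared tail probability at df = (5 - 1) * (2 - 1) = 4, which comes out to roughly 0.085:
# Theoretical p-value from the chi-squared distribution with df = 4
pchisq(test_stat$stat, df = 4, lower.tail = FALSE)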
Visualizing the null distribution:
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater")
null_distribution
## Response: geo (factor)
## Explanatory: gincdif (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 3.13
## 2 2 1.42
## 3 3 6.33
## 4 4 2.61
## 5 5 6.83
## 6 6 1.85
## 7 7 3.01
## 8 8 6.18
## 9 9 5.09
## 10 10 3.71
## # i 990 more rows
Simulating the null distribution with confidence interval:
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
Interpretation:
Based on the evidence from all of these outputs, I fail to reject the null hypothesis. The chi-squared test statistic for the sample was 8.18, while the simulated p-value was 0.081. The observed statistic falls below the critical value of 9.488 (df = 4), meaning the null hypothesis cannot be rejected at the 0.05 significance level, and the p-value, while fairly small, is above 0.05.
Lastly, the simulations show that the observed statistic sits in the upper tail of the null distribution but still inside the 95% interval of simulated statistics. Overall, the takeaway from these tests is that there is not sufficient evidence to reject the null hypothesis; whatever association exists between these variables is weak at best.
Research question: Is there an association between the amount of time spent listening to radio news/politics and trust in politicians in Hungary?
Cleaning and recoding data:
hungary_data <- ess %>%
  filter(cntry == "HU") %>%
  mutate(
    # Collapse rdpol (time listening to radio news/politics) into broader bands;
    # 66/77/88/99 are missing codes
    rdpol = case_when(
      rdpol == 0 ~ "No time at all",
      rdpol %in% c(1, 2) ~ "Between 0 to 1 hours a day",
      rdpol %in% c(3, 4) ~ "Between 1 to 2 hours a day",
      rdpol %in% c(5, 6) ~ "Between 2 to 3 hours a day",
      rdpol == 7 ~ "Over 3 hours a day",
      rdpol %in% c(66, 77, 88, 99) ~ NA_character_
    ))
table(hungary_data$rdpol)
##
## Between 0 to 1 hours a day Between 1 to 2 hours a day
## 4304 538
## Between 2 to 3 hours a day No time at all
## 140 755
## Over 3 hours a day
## 184
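Because table() sorts character values alphabetically, "No time at all" prints out of order above. An optional sketch that converts rdpol to a factor with the levels in their natural order:
# Optional: order the listening-time categories from least to most
hungary_data <- hungary_data %>%
  mutate(rdpol = factor(rdpol, levels = c("No time at all",
                                          "Between 0 to 1 hours a day",
                                          "Between 1 to 2 hours a day",
                                          "Between 2 to 3 hours a day",
                                          "Over 3 hours a day")))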
# Recode the trstplt missing codes directly in hungary_data so that the
# test below runs on the cleaned variable
hungary_data <- hungary_data %>%
  mutate(trstplt = replace(trstplt, trstplt %in% c(77, 88, 99), NA))
table(hungary_data$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 3069 1620 2214 2143 1664 2538 1146 889 561 172 177
Step 1: Calculating the test statistic
test_stat1 <- hungary_data %>%
  specify(explanatory = rdpol,
          response = trstplt) %>%
  hypothesize(null = "independence") %>%
  # Using the F statistic (one-way ANOVA) because of my variable types:
  # one numeric response and one categorical explanatory variable with
  # three or more categories
  calculate(stat = "F")
## Warning: Removed 10721 rows containing missing values.
print(test_stat1$stat)
## [1] 3.307193
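As a cross-check on infer's F statistic, a one-way ANOVA fit with base R's aov() should report the same F value as the infer pipeline, assuming the trstplt missing codes have been recoded to NA as above:
# Cross-check: one-way ANOVA F statistic from base R
summary(aov(trstplt ~ rdpol, data = hungary_data))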
Step 2: Simulate the null distribution
null_distribution2 <- hungary_data %>%
  specify(explanatory = rdpol,
          response = trstplt) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # 1000 permuted replicates under the null
  calculate(stat = "F")
## Warning: Removed 10721 rows containing missing values.
Step 3: Calculate the p-value
p_val <- null_distribution2 %>%
  get_p_value(obs_stat = test_stat1, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
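As the warning notes, a simulated p-value of 0 only means that none of the 1,000 permuted statistics exceeded the observed one; the smallest p-value the simulation can resolve is 1/reps. Increasing reps tightens that floor (a sketch):
# More permutations lower the resolution floor of the simulated p-value (1 / reps)
null_distribution2_5k <- hungary_data %>%
  specify(explanatory = rdpol,
          response = trstplt) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 5000, type = "permute") %>%
  calculate(stat = "F")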
Visualize the null distribution with confidence intervals:
null_distribution2 %>%
visualize() +
shade_p_value(obs_stat = test_stat1, direction = "greater")
conf_int <- null_distribution2 %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution2 %>%
visualize() +
shade_p_value(obs_stat = test_stat1, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
The positive test statistic and a p-value close to 0 indicate that there is evidence to reject the null hypothesis; in other words, there may be a statistical association between time spent listening to political content on the radio and trust in politicians in Hungary. However, the visualizations of the null distribution suggest this association may be relatively weak: the observed test statistic falls beyond the confidence interval, but only at the tail end of the simulated distribution.
Overall, there is evidence to reject the null hypothesis of independence between these two variables (rdpol and trstplt): a statistically significant association exists, though it may not be especially strong.