Task 1.

# List of packages
packages <- c("tidyverse", "infer", "fst")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.2     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
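
A quieter alternative (optional, just a sketch): wrapping the lapply() call in invisible() would suppress the list of attached packages printed above, since that list is only lapply()'s return value.

# Optional: load the packages without printing lapply()'s return value
invisible(lapply(packages, library, character.only = TRUE))
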
setwd("C:/Users/mimi h/Documents/UOFT/YEAR FOUR/SOC202/Homework_7_Project")
ess <- read_fst("All-ESS-Data.fst")

Recoding data:

swiss_data <- ess %>%
  filter(cntry == "CH") %>%
  mutate(
    # Recoding brncntr (born in country); 7, 8, 9 are ESS missing-value codes
    brncntr = case_when(
      brncntr == 1 ~ "Yes",
      brncntr == 2 ~ "No",
      brncntr %in% c(7, 8, 9) ~ NA_character_,
      TRUE ~ as.character(brncntr)
    ),

    # Education is recorded in edulvla before round 5 and in edulvlb from
    # round 5 on; 55 and 5555 are the "other" codes, treated as missing here
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),

    # BA-level education or not; rows with missing education stay NA so the
    # filter on educ_level below actually removes them
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      essround < 5 & is.na(edulvla) ~ NA_character_,
      essround >= 5 & is.na(edulvlb) ~ NA_character_,
      TRUE ~ "No BA"
    )
  )

swiss_data <- swiss_data %>%
  filter(!is.na(brncntr), !is.na(educ_level))
unique(swiss_data$brncntr)
## [1] "No"  "Yes"

Generating tables for proportions and frequencies:

tab_dat <- swiss_data

# Observed 2x2 table: born in country (rows) by education level (columns)
mytab <- table(tab_dat$brncntr, tab_dat$educ_level)

rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)

# Expected proportions under independence: outer product of the row and
# column marginal proportions
ptab <- tcrossprod(rsums / N, csums / N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##            [,1]      [,2]
## [1,] 0.05206627 0.1764393
## [2,] 0.17578931 0.5957051
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##           [,1]      [,2]
## [1,]  881.1175  2985.882
## [2,] 2974.8825 10081.118
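
For reference (my own addition), the same expected counts follow from the textbook formula E[i, j] = (row total i) x (column total j) / N, which outer() makes explicit:

# Equivalent construction of the expected counts via the outer product
ftab_check <- outer(rsums, csums) / N
all.equal(as.vector(ftab_check), as.vector(ftab))  # should be TRUE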

Computing the critical value to assess independence:

alpha <- 0.05
# df = (rows - 1) * (cols - 1) = 1 for this 2x2 table
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

If the Chi-squared test statistic that I will compute exceeds 3.841, then I will reject the null hypothesis at the 0.05 significance level.

Calculating Pearson’s Chi-squared statistic:

# Pearson's X^2: sum over all cells of (observed - expected)^2 / expected
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 65.1218

The X-squared value (65.12) is much higher than our calculated critical value of 3.841, meaning that there is a statistically significant association between being born in or outside Switzerland and the level of educational attainment. Therefore, the probability of being born in or outside of Switzerland is not the same across different levels of educational attainment.

Calculating the p-value:

p_val <- pchisq(test_stat, df = 1, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = FALSE))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 65.122, df = 1, p-value = 7.041e-16

The p-value is extremely close to zero (7.041e-16 from chisq.test(); it only prints as 0 above because of rounding), again reinforcing the association between the two variables, as an extremely small p-value is further evidence against the null hypothesis. The main takeaway from both of these tests is that native-born status in Switzerland is associated with educational attainment at the post-secondary level.
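
With a sample this large, statistical significance alone says little about the strength of the association, so as an optional supplement (a sketch, not required by the task) the effect size can be summarized with Cramer's V:

# Cramer's V: sqrt(X^2 / (N * (k - 1))), where k = min(rows, cols);
# for a 2x2 table this equals the phi coefficient
cramers_v <- sqrt(test_stat / (N * (min(dim(mytab)) - 1)))
cramers_v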

Task 2.

Step 1: Calculating the test statistic for domicil and gincdif in the Netherlands.

Cleaning and recoding data:

nl_data <- ess %>%
  filter(cntry == "NL")

nl_data <- nl_data %>%
  # domicil: 1 = big city, 2 = suburbs -> Urban; 3 = town, 4 = village,
  # 5 = farm/countryside -> Rural; 7, 8, 9 are ESS missing-value codes
  mutate(geo = recode(as.character(domicil),
                      '1' = "Urban",
                      '2' = "Urban",
                      '3' = "Rural",
                      '4' = "Rural",
                      '5' = "Rural",
                      '7' = NA_character_,
                      '8' = NA_character_,
                      '9' = NA_character_))

table(nl_data$geo)
## 
## Rural Urban 
## 12681  5623
nl_data <- nl_data %>% filter(!is.na(geo))
nl_data <- nl_data %>%
  mutate(
    # gincdif: agreement that government should reduce income differences;
    # 7, 8, 9 are ESS missing-value codes
    gincdif = case_when(
      gincdif == 1 ~ "Agree strongly",
      gincdif == 2 ~ "Agree",
      gincdif == 3 ~ "Neither agree nor disagree",
      gincdif == 4 ~ "Disagree",
      gincdif == 5 ~ "Disagree strongly",
      gincdif %in% c(7, 8, 9) ~ NA_character_,
      TRUE ~ as.character(gincdif)
    ))
nl_data <- nl_data %>% filter(!is.na(gincdif))
table(nl_data$gincdif)
## 
##                      Agree             Agree strongly 
##                       7886                       2889 
##                   Disagree          Disagree strongly 
##                       3676                        543 
## Neither agree nor disagree 
##                       3137
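
One optional display refinement (a sketch; it does not change any result below): printing the table with the Likert levels in their substantive order rather than alphabetically:

# Print the Likert categories in substantive order (no reassignment, so the
# tables and tests below are unaffected)
likert_levels <- c("Agree strongly", "Agree", "Neither agree nor disagree",
                   "Disagree", "Disagree strongly")
table(factor(nl_data$gincdif, levels = likert_levels))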

Calculating the critical value:

tab_dat <- nl_data
# Observed 5x2 table: attitude toward reducing income differences (rows)
# by urban/rural residence (columns)
mytab <- table(tab_dat$gincdif, tab_dat$geo)
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)

# Expected proportions under independence: outer product of the marginals
ptab <- tcrossprod(rsums / N, csums / N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##            [,1]        [,2]
## [1,] 0.30163846 0.133307214
## [2,] 0.11050387 0.048836488
## [3,] 0.14060651 0.062140162
## [4,] 0.02076968 0.009179028
## [5,] 0.11998984 0.053028751
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##          [,1]      [,2]
## [1,] 5469.007 2416.9931
## [2,] 2003.546  885.4544
## [3,] 2549.337 1126.6633
## [4,]  376.575  166.4250
## [5,] 2175.536  961.4643
alpha <- 0.05
# df = (rows - 1) * (cols - 1) = (5 - 1) * (2 - 1) = 4 for this 5x2 table
c_val <- qchisq(alpha, df = 4, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 9.488

Calculating the test statistic:

test_stat <- nl_data %>%
  specify(explanatory = gincdif, 
          response = geo) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")

print(test_stat$stat)
## X-squared 
##  8.176048
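
As a cross-check (my own addition), base R's chisq.test() without the continuity correction, as used in Task 1, should reproduce the same statistic:

# Base-R counterpart of infer's Chisq statistic
chisq.test(table(nl_data$gincdif, nl_data$geo), correct = FALSE)$statistic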

Simulating the null distribution:

null_distribution <- nl_data %>%
  specify(explanatory = gincdif,
          response = geo) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

Calculating the p-value of the sample:

p_val <- null_distribution %>%
  get_p_value(obs_stat = test_stat, direction = "greater")

p_val
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.081
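
For intuition (my own note), with direction = "greater" the simulated p-value is simply the share of permuted statistics at least as large as the observed one:

# Hand-computed version of the simulated p-value
mean(null_distribution$stat >= test_stat$stat)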

Visualizing the null distribution:

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater")

null_distribution
## Response: geo (factor)
## Explanatory: gincdif (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  3.13
##  2         2  1.42
##  3         3  6.33
##  4         4  2.61
##  5         5  6.83
##  6         6  1.85
##  7         7  3.01
##  8         8  6.18
##  9         9  5.09
## 10        10  3.71
## # i 990 more rows

Visualizing the null distribution with a confidence interval:

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)

Interpretation:

Based on evidence from all of our outputs, I fail to reject the null hypothesis. The chi-squared test statistic for the sample was X-squared = 8.176, while the simulated p-value was 0.081. The observed statistic falls below the critical value of 9.488 (df = 4), and the p-value exceeds the 0.05 significance level, so the data do not provide sufficient evidence of an association between urban versus rural residence and attitudes toward reducing income differences.

Lastly, the simulations of the null distribution tell the same story: the observed statistic falls within the body of the null distribution and inside the 95% percentile interval. Overall, the takeaway from these tests is that there is not sufficient evidence to reject the null hypothesis; whatever association exists between these variables is weak at best.

Task 3.

Research question: Is there an association between the amount of time spent listening to news and politics on the radio and trust in politicians in Hungary?

Cleaning and recoding data:

hungary_data <- ess %>%
  filter(cntry == "HU") %>%
  mutate(
    # rdpol: time spent listening to news/politics on the radio, collapsed
    # into broader bands; 66, 77, 88, 99 are ESS missing-value codes
    rdpol = case_when(
      rdpol == 0 ~ "No time at all",
      rdpol %in% c(1, 2) ~ "Between 0 to 1 hours a day",
      rdpol %in% c(3, 4) ~ "Between 1 to 2 hours a day",
      rdpol %in% c(5, 6) ~ "Between 2 to 3 hours a day",
      rdpol == 7 ~ "Over 3 hours a day",
      rdpol %in% c(66, 77, 88, 99) ~ NA_character_
    ))
      
table(hungary_data$rdpol)
## 
## Between 0 to 1 hours a day Between 1 to 2 hours a day 
##                       4304                        538 
## Between 2 to 3 hours a day             No time at all 
##                        140                        755 
##         Over 3 hours a day 
##                        184
# Recoding trstplt within the same data frame, so that the cleaned variable
# is the one actually used in the test below; 77, 88, 99 are ESS
# missing-value codes
hungary_data <- hungary_data %>%
  mutate(trstplt = replace(trstplt, trstplt %in% c(77, 88, 99), NA))

table(hungary_data$trstplt)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 3069 1620 2214 2143 1664 2538 1146  889  561  172  177
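
A quick check (my own addition) that no ESS missing codes survive the recode:

# After the recode, valid trstplt scores should span 0 to 10 only
range(hungary_data$trstplt, na.rm = TRUE)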

Step 1: Calculating the test statistic

test_stat1 <- hungary_data %>%
  specify(explanatory = rdpol,
          response = trstplt) %>% 
  hypothesize(null = "independence") %>%
  # ANOVA F statistic, because of my variable types: a numeric response
  # (trstplt) and a categorical explanatory variable (rdpol) with three or
  # more categories. (direction is a get_p_value() argument, not one of
  # calculate()'s, so it is not passed here.)
  calculate(stat = "F")
## Warning: Removed 10721 rows containing missing values.
print(test_stat1$stat)
## [1] 3.307193
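
As a cross-check (my own addition), a classical one-way ANOVA should reproduce the same F statistic along with its theoretical degrees of freedom:

# Theory-based counterpart of infer's F statistic
summary(aov(trstplt ~ rdpol, data = hungary_data))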

Step 2: Simulate the null distribution

null_distribution2 <- hungary_data %>%
  specify(explanatory = rdpol,
          response = trstplt) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # permute the response 1000 times to build the null distribution
  calculate(stat = "F")
## Warning: Removed 10721 rows containing missing values.

Step 3: Calculate the p-value

p_val <- null_distribution2 %>%
  get_p_value(obs_stat = test_stat1, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0
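
Since a simulated p-value of exactly 0 only means the observed statistic exceeded all 1,000 permuted values, the theoretical F distribution offers a rough benchmark (my own sketch; it assumes the usual ANOVA degrees of freedom, df1 = k - 1 groups and df2 = n - k complete observations):

# Theory-based p-value as a benchmark for the permutation result
complete <- !is.na(hungary_data$rdpol) & !is.na(hungary_data$trstplt)
k <- n_distinct(hungary_data$rdpol[complete])
pf(test_stat1$stat, df1 = k - 1, df2 = sum(complete) - k, lower.tail = FALSE)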

Visualizing the null distribution with a confidence interval:

null_distribution2 %>%
  visualize() +
  shade_p_value(obs_stat = test_stat1, direction = "greater")

conf_int <- null_distribution2 %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution2 %>%
  visualize() +
  shade_p_value(obs_stat = test_stat1, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)

Interpretation:

The test statistic (F = 3.31) together with a p-value at or near zero provides evidence to reject the null hypothesis; in other words, there appears to be a statistical association between time spent listening to political content on the radio and trust in politicians in Hungary. The visualizations temper this somewhat: the observed statistic falls beyond the 95% percentile interval of the null distribution, but only just past the tail of the simulated distribution, so the departure from independence, while significant, is not dramatic.

Overall, there is evidence to reject the null hypothesis of independence between these two variables (rdpol and trstplt): a statistically significant association exists, though it may not be especially strong.