R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

packages <- c("tidyverse", "infer", "fst") # add any you need here

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
ess <- read_fst("All-ESS-Data7.fst")

Task 1

switzerland_data <- ess %>%

  filter(cntry == "CH") %>%
  
  mutate(
    brncntr = case_when(
      brncntr == 1 ~ "Yes",  
      brncntr == 2 ~ "No",  
      brncntr %in% c(7, 8, 9) ~ NA_character_,  
      TRUE ~ as.character(brncntr)  
    ),
    
    
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    
    
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    
   
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )
tab_dat <- switzerland_data

mytab <- table(tab_dat$brncntr, tab_dat$educ_level)

rsums <- rowSums(mytab)

csums <- colSums(mytab)

N <- sum(mytab)

ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##            [,1]      [,2]
## [1,] 0.05206627 0.1764393
## [2,] 0.17578931 0.5957051
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##           [,1]      [,2]
## [1,]  881.1175  2985.882
## [2,] 2974.8825 10081.118
alpha <- 0.05
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

Interpreting the Critical Value: If the Chi-squared test statistic that you compute (from the chisq.test function) exceeds 3.841, then we reject the null hypothesis at the 0.05 significance level. This would mean that there is a statistically significant association between feeling close to a party and education level in your dataset.

test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 65.1218

Interpretation: The difference between our observed data and what we’d expect under the assumption of independence is statistically significant. Thus, there is a statistically significant association between born in country and one’s educational level. This means that the probability of born in country is not the same across different education levels.

p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = F))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 65.122, df = 1, p-value = 7.041e-16

Task 2

netherlands_data <- ess %>%
  filter(cntry == "NL") %>%
  
  mutate(
    gincdif = case_when(
      gincdif == 1 ~ "Agree strongly",  
      gincdif == 2 ~ "Agree",  
      gincdif == 3 ~ "Neither agree nor disagree",  
      gincdif == 4 ~ "Disagree",
      gincdif == 5 ~ "Disagree strongly",
      gincdif %in% c(7, 8, 9) ~ NA_character_,  
      TRUE ~ as.character(gincdif)  
    ),
    
    
    domicila = case_when(
      essround < 5 & domicil == 55 ~ NA_real_,
      TRUE ~ domicil
    ),
    
    
    domicilb = case_when(
      essround >= 5 & domicil == 5555 ~ NA_real_,
      TRUE ~ domicil
    ),
    
   
    domicil = case_when(
      essround < 5 & domicila == 5 ~ "BA",
      essround >= 5 & domicilb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )
test_stat <- netherlands_data %>%
  specify(explanatory = domicil, 
          response = gincdif) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") 
## Warning: Removed 173 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  4.256857
null_distribution <- netherlands_data %>%
  specify(explanatory = domicil,
          response = gincdif) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "chisq")
## Warning: Removed 173 rows containing missing values.
p_value <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 

p_value
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.722
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  4.37
##  2         2  5.72
##  3         3  2.59
##  4         4  2.66
##  5         5  7.58
##  6         6  3.96
##  7         7  1.42
##  8         8  5.57
##  9         9  4.99
## 10        10  4.16
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  4.37
##  2         2  5.72
##  3         3  2.59
##  4         4  2.66
##  5         5  7.58
##  6         6  3.96
##  7         7  1.42
##  8         8  5.57
##  9         9  4.99
## 10        10  4.16
## # ℹ 990 more rows

Task 3

germany_data <- ess %>%
  filter(cntry == "DE") %>%
  
  mutate(
    gincdif = case_when(
      gincdif == 1 ~ "Agree strongly",  
      gincdif == 2 ~ "Agree",  
      gincdif == 3 ~ "Neither agree nor disagree",  
      gincdif == 4 ~ "Disagree",
      gincdif == 5 ~ "Disagree strongly",
      gincdif %in% c(7, 8, 9) ~ NA_character_,  
      TRUE ~ as.character(gincdif)  
    ),
    
    
    domicila = case_when(
      essround < 5 & domicil == 55 ~ NA_real_,
      TRUE ~ domicil
    ),
    
    
    domicilb = case_when(
      essround >= 5 & domicil == 5555 ~ NA_real_,
      TRUE ~ domicil
    ),
    
   
    domicil = case_when(
      essround < 5 & domicila == 5 ~ "BA",
      essround >= 5 & domicilb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )
test_stat <- germany_data %>%
  specify(explanatory = domicil, 
          response = gincdif) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") 
## Warning: Removed 534 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  24.43407
null_distribution <- germany_data %>%
  specify(explanatory = domicil,
          response = gincdif) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "chisq")
## Warning: Removed 534 rows containing missing values.
p_value <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_value
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  4.52
##  2         2  4.43
##  3         3  6.65
##  4         4  1.90
##  5         5 10.3 
##  6         6  1.12
##  7         7  2.44
##  8         8  2.66
##  9         9  3.02
## 10        10  1.98
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  4.52
##  2         2  4.43
##  3         3  6.65
##  4         4  1.90
##  5         5 10.3 
##  6         6  1.12
##  7         7  2.44
##  8         8  2.66
##  9         9  3.02
## 10        10  1.98
## # ℹ 990 more rows