R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Task 1

# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
setwd("~/Desktop/SOC_202_YAY/")

getwd()  
## [1] "/Users/apple/Desktop/SOC_202_YAY"
ess <- read_fst("All-ESS-Data.fst")
table(ess$cntry)
## 
##    AL    AT    BE    BG    CH    CY    CZ    DE    DK    EE    ES    FI    FR 
##  1201 15225 17451 13240 16925  6065 20090 34425 12408 16856 19452 19532 19038 
##    GB    GR    HR    HU    IE    IL    IS    IT    LT    LU    LV    ME    MK 
## 20979 12558  6535 16642 22233 16218  3975 10178 11652  3187  3921  2478  1429 
##    NL    NO    PL    PT    RO    RS    RU    SE    SI    SK    TR    UA    XK 
## 18329 16065 17689 17881  2146  3548 12458 18216 13484 11292  4272  9987  1295
switzerland_data <- ess %>%
  # Filter for France
  filter(cntry == "CH") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
   brncntr = case_when(
      brncntr == 1 ~ "Yes",  # Recode 1 to "Yes"
      brncntr == 2 ~ "No",   # Recode 2 to "No"
      brncntr %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(brncntr)  # Keep other values as-is but ensure they are characters
    ),
    brncntr = ifelse(brncntr %in% c(77, 88, 99), NA, brncntr),
    
    # Recoding education for rounds 1-4
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    
    # Re-coding education for rounds 5 and later
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    
    # Create educ_level column
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    ),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    )
  )
unique(switzerland_data$gndr)
## [1] "Female" "Male"   NA
unique(switzerland_data$brncntr)
## [1] "No"  "Yes" NA
# here's how to make sure it's really cleaned

switzerland_data <- switzerland_data %>% filter(!is.na(brncntr))
switzerland_data <- switzerland_data %>% filter(!is.na(educ_level))

# double-checking + quick distribution
unique(switzerland_data$brncntr)
## [1] "No"  "Yes"
table(switzerland_data$brncntr)
## 
##    No   Yes 
##  3867 13056
## Not necessary, but always good to rename when doing operations so you can backtrack to clean dataset
tab_dat <- switzerland_data

# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$brncntr, tab_dat$educ_level)

# Calculate row-wise sums of counts in table 
rsums <- rowSums(mytab)

# Calculate column-wise sums of counts in table
csums <- colSums(mytab)

# Get the total count of the table (N)
N <- sum(mytab)

# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##            [,1]      [,2]
## [1,] 0.05206627 0.1764393
## [2,] 0.17578931 0.5957051
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##           [,1]      [,2]
## [1,]  881.1175  2985.882
## [2,] 2974.8825 10081.118
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841

#Critical Value: If the Chi-squared test statistic that we cpmpute exceeds 3.841, then we will reject the null hypothesis at the 0.05 significance level.

test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 65.1218

#Interpretation: The difference between our observed data and what we’d expect under the assumption of independence is statistically significant. Thus, there is a statistically significant association between feeling close to a party and one’s educational level. This means that the probability of feeling close to a party is not the same across different education levels.

p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = F))
## 
##  Pearson's Chi-squared test
## 
## data:  mytab
## X-squared = 65.122, df = 1, p-value = 7.041e-16

p-value < 7.041e-16: This is another representation of the p-value in scientific notation because it’s extremely close to 0.

The chi-squared test tells us if two categorical variables are independent or if they are associated in some manner. In our test:

The extremely small p-value (essentially 0) means that we reject the null hypothesis that the two variables are independent of each other. In other words, there’s strong evidence to suggest that there’s a significant association between the two variables in mytab.

The large chi-squared statistic of 165.44 further reinforces this point. If the two variables were independent, we’d expect a chi-squared statistic closer to 0.

Task 2

netherlands_data <- ess %>%
 
  filter(cntry == "NL") %>%
  
  # Recoding clsprty and cleaning trstplt (will use later)
  mutate(
   domicil = case_when(
      domicil == 1 ~ "Urban",
      domicil == 2 ~ "Rural",  
      domicil %in% c(7, 8, 9) ~ NA_character_,  # Handle other specific cases where you want to set it as NA
      TRUE ~ as.character(domicil)  # Keep other values as-is but ensure they are characters
    ),
    domicil = ifelse(domicil %in% c(77, 88, 99), NA, domicil),
    
   
    
  gincdif = case_when(
  gincdif == 1 ~ "Agree strongly",
  gincdif == 2 ~ "Agree",
  gincdif == 3 ~ "Neither agree nor disagree",
  gincdif == 4 ~ "Disagree",
  gincdif == 5 ~ "Disagree strongly",
  gincdif %in% c(7, 8, 9) ~ NA_character_
)
    )
test_stat <- netherlands_data %>%
  specify(explanatory = domicil, # change variable name for explanatory variable
          response = gincdif) %>% # change variable name for outcome of interest
 

  hypothesize(null = "independence") %>%
  calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic
## Warning: Removed 198 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  52.61113

Interpretation: The t-statistic measures the difference between the observed sample mean and the population mean (or another sample mean, in the case of two-sample tests) in units of standard error. The larger the magnitude of t (regardless of the sign), the greater the evidence against the null hypothesis.

A t-statistic of 52.61113 is quite large, indicating a significant difference in trust in politicians between those with a BA and those without a BA. It suggests that the mean trust level in politicians is different for the two groups.

null_distribution <- netherlands_data%>%
  specify(explanatory = domicil,
          response = gincdif) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
  calculate(stat = "chisq")
## Warning: Removed 198 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 13.6 
##  2         2 22.9 
##  3         3 13.5 
##  4         4 25.6 
##  5         5 11.5 
##  6         6 20.5 
##  7         7 21.7 
##  8         8  7.76
##  9         9 23.6 
## 10        10 16.5 
## # ℹ 990 more rows

Visualizes the null distribution alongside the observsed test statistic (the red line), indicating here clear evidence, under assumptions of frequentist hypothesis testing, to reject the null. If the observed statistic would fall within the null distribution, you would see shaded red regions of the null distribution that are as (or more) extreme than our observed statistic.

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

# Task 3

netherlands_data <- netherlands_data %>%
  mutate(
    pdwrk_recode = case_when(
      pdwrk == 1 ~ 'yes',
      pdwrk == 0 ~ 'no', 
      pdwrk %in% c(7, 8, 9) ~ NA_character_,
      TRUE ~ as.character(pdwrk) 
    ),
    happy_recode = case_when(
      happy == 0 ~ "Extremely unhappy",
      happy == 1 ~ "1",
      happy == 2 ~ "2",
      happy == 3 ~ "3",
      happy == 4 ~ "4",
      happy == 5 ~ "5",
      happy == 6 ~ "6",
      happy == 7 ~ "7",
      happy == 8 ~ "8",
      happy == 9 ~ "9",
      happy == 10 ~ "Extremely happy",
      happy %in% c(77, 88, 99) ~ NA_character_
    )
  ) %>%
  mutate(
    pdwrk_recode = factor(pdwrk_recode),
    happy_recode = factor(happy_recode)
  )
test_stat <- netherlands_data %>%
  specify(explanatory = pdwrk, # change variable name for explanatory variable
          response = happy) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  

print(test_stat$stat)
## Response: happy (numeric)
## Explanatory: pdwrk (numeric)
## Null Hypothesis: independence
## # A tibble: 18,329 × 2
##    happy pdwrk
##    <dbl> <dbl>
##  1    10     0
##  2    10     1
##  3     8     1
##  4     7     1
##  5     6     1
##  6     8     1
##  7     7     1
##  8     8     1
##  9     8     0
## 10    10     1
## # ℹ 18,319 more rows

Interpretation: The t-statistic measures the difference between the observed sample mean and the population mean (or another sample mean, in the case of two-sample tests) in units of standard error. The larger the magnitude of t (regardless of the sign), the greater the evidence against the null hypothesis.

A t-statistic of 52.61113 is quite large, indicating a significant difference in trust in politicians between those with a BA and those without a BA. It suggests that the mean trust level in politicians is different for the two groups.

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.272
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 13.6 
##  2         2 22.9 
##  3         3 13.5 
##  4         4 25.6 
##  5         5 11.5 
##  6         6 20.5 
##  7         7 21.7 
##  8         8  7.76
##  9         9 23.6 
## 10        10 16.5 
## # ℹ 990 more rows

Visualizes the null distribution alongside the observsed test statistic (the red line), indicating here clear evidence, under assumptions of frequentist hypothesis testing, to reject the null. If the observed statistic would fall within the null distribution, you would see shaded red regions of the null distribution that are as (or more) extreme than our observed statistic.

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.