This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# List of packages
packages <- c("tidyverse", "infer", "fst") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
setwd("~/Desktop/SOC_202_YAY/")
getwd()
## [1] "/Users/apple/Desktop/SOC_202_YAY"
ess <- read_fst("All-ESS-Data.fst")
table(ess$cntry)
##
## AL AT BE BG CH CY CZ DE DK EE ES FI FR
## 1201 15225 17451 13240 16925 6065 20090 34425 12408 16856 19452 19532 19038
## GB GR HR HU IE IL IS IT LT LU LV ME MK
## 20979 12558 6535 16642 22233 16218 3975 10178 11652 3187 3921 2478 1429
## NL NO PL PT RO RS RU SE SI SK TR UA XK
## 18329 16065 17689 17881 2146 3548 12458 18216 13484 11292 4272 9987 1295
switzerland_data <- ess %>%
# Filter for France
filter(cntry == "CH") %>%
# Recoding clsprty and cleaning trstplt (will use later)
mutate(
brncntr = case_when(
brncntr == 1 ~ "Yes", # Recode 1 to "Yes"
brncntr == 2 ~ "No", # Recode 2 to "No"
brncntr %in% c(7, 8, 9) ~ NA_character_, # Handle other specific cases where you want to set it as NA
TRUE ~ as.character(brncntr) # Keep other values as-is but ensure they are characters
),
brncntr = ifelse(brncntr %in% c(77, 88, 99), NA, brncntr),
# Recoding education for rounds 1-4
edulvla = case_when(
essround < 5 & edulvla == 55 ~ NA_real_,
TRUE ~ edulvla
),
# Re-coding education for rounds 5 and later
edulvlb = case_when(
essround >= 5 & edulvlb == 5555 ~ NA_real_,
TRUE ~ edulvlb
),
# Create educ_level column
educ_level = case_when(
essround < 5 & edulvla == 5 ~ "BA",
essround >= 5 & edulvlb > 600 ~ "BA",
TRUE ~ "No BA"
),
# Recoding gender
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
)
)
unique(switzerland_data$gndr)
## [1] "Female" "Male" NA
unique(switzerland_data$brncntr)
## [1] "No" "Yes" NA
# here's how to make sure it's really cleaned
switzerland_data <- switzerland_data %>% filter(!is.na(brncntr))
switzerland_data <- switzerland_data %>% filter(!is.na(educ_level))
# double-checking + quick distribution
unique(switzerland_data$brncntr)
## [1] "No" "Yes"
table(switzerland_data$brncntr)
##
## No Yes
## 3867 13056
## Not necessary, but always good to rename when doing operations so you can backtrack to clean dataset
tab_dat <- switzerland_data
# Create a crosstable and save it into the object mytab
mytab <- table(tab_dat$brncntr, tab_dat$educ_level)
# Calculate row-wise sums of counts in table
rsums <- rowSums(mytab)
# Calculate column-wise sums of counts in table
csums <- colSums(mytab)
# Get the total count of the table (N)
N <- sum(mytab)
# Generate table of expected proportions
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2]
## [1,] 0.05206627 0.1764393
## [2,] 0.17578931 0.5957051
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2]
## [1,] 881.1175 2985.882
## [2,] 2974.8825 10081.118
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
#Critical Value: If the Chi-squared test statistic that we cpmpute exceeds 3.841, then we will reject the null hypothesis at the 0.05 significance level.
test_stat <- sum((mytab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 65.1218
#Interpretation: The difference between our observed data and what we’d expect under the assumption of independence is statistically significant. Thus, there is a statistically significant association between feeling close to a party and one’s educational level. This means that the probability of feeling close to a party is not the same across different education levels.
p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(mytab, correct = F))
##
## Pearson's Chi-squared test
##
## data: mytab
## X-squared = 65.122, df = 1, p-value = 7.041e-16
p-value < 7.041e-16: This is another representation of the p-value in scientific notation because it’s extremely close to 0.
The chi-squared test tells us if two categorical variables are independent or if they are associated in some manner. In our test:
The extremely small p-value (essentially 0) means that we reject the null hypothesis that the two variables are independent of each other. In other words, there’s strong evidence to suggest that there’s a significant association between the two variables in mytab.
The large chi-squared statistic of 165.44 further reinforces this point. If the two variables were independent, we’d expect a chi-squared statistic closer to 0.
netherlands_data <- ess %>%
filter(cntry == "NL") %>%
# Recoding clsprty and cleaning trstplt (will use later)
mutate(
domicil = case_when(
domicil == 1 ~ "Urban",
domicil == 2 ~ "Rural",
domicil %in% c(7, 8, 9) ~ NA_character_, # Handle other specific cases where you want to set it as NA
TRUE ~ as.character(domicil) # Keep other values as-is but ensure they are characters
),
domicil = ifelse(domicil %in% c(77, 88, 99), NA, domicil),
gincdif = case_when(
gincdif == 1 ~ "Agree strongly",
gincdif == 2 ~ "Agree",
gincdif == 3 ~ "Neither agree nor disagree",
gincdif == 4 ~ "Disagree",
gincdif == 5 ~ "Disagree strongly",
gincdif %in% c(7, 8, 9) ~ NA_character_
)
)
test_stat <- netherlands_data %>%
specify(explanatory = domicil, # change variable name for explanatory variable
response = gincdif) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
calculate(stat = "chisq") # replace in between quotation marks appropriate test statistic
## Warning: Removed 198 rows containing missing values.
print(test_stat$stat)
## X-squared
## 52.61113
Interpretation: The t-statistic measures the difference between the observed sample mean and the population mean (or another sample mean, in the case of two-sample tests) in units of standard error. The larger the magnitude of t (regardless of the sign), the greater the evidence against the null hypothesis.
A t-statistic of 52.61113 is quite large, indicating a significant difference in trust in politicians between those with a BA and those without a BA. It suggests that the mean trust level in politicians is different for the two groups.
null_distribution <- netherlands_data%>%
specify(explanatory = domicil,
response = gincdif) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
calculate(stat = "chisq")
## Warning: Removed 198 rows containing missing values.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.
null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 13.6
## 2 2 22.9
## 3 3 13.5
## 4 4 25.6
## 5 5 11.5
## 6 6 20.5
## 7 7 21.7
## 8 8 7.76
## 9 9 23.6
## 10 10 16.5
## # ℹ 990 more rows
Visualizes the null distribution alongside the observsed test statistic (the red line), indicating here clear evidence, under assumptions of frequentist hypothesis testing, to reject the null. If the observed statistic would fall within the null distribution, you would see shaded red regions of the null distribution that are as (or more) extreme than our observed statistic.
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.
# Task 3
netherlands_data <- netherlands_data %>%
mutate(
pdwrk_recode = case_when(
pdwrk == 1 ~ 'yes',
pdwrk == 0 ~ 'no',
pdwrk %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(pdwrk)
),
happy_recode = case_when(
happy == 0 ~ "Extremely unhappy",
happy == 1 ~ "1",
happy == 2 ~ "2",
happy == 3 ~ "3",
happy == 4 ~ "4",
happy == 5 ~ "5",
happy == 6 ~ "6",
happy == 7 ~ "7",
happy == 8 ~ "8",
happy == 9 ~ "9",
happy == 10 ~ "Extremely happy",
happy %in% c(77, 88, 99) ~ NA_character_
)
) %>%
mutate(
pdwrk_recode = factor(pdwrk_recode),
happy_recode = factor(happy_recode)
)
test_stat <- netherlands_data %>%
specify(explanatory = pdwrk, # change variable name for explanatory variable
response = happy) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
print(test_stat$stat)
## Response: happy (numeric)
## Explanatory: pdwrk (numeric)
## Null Hypothesis: independence
## # A tibble: 18,329 × 2
## happy pdwrk
## <dbl> <dbl>
## 1 10 0
## 2 10 1
## 3 8 1
## 4 7 1
## 5 6 1
## 6 8 1
## 7 7 1
## 8 8 1
## 9 8 0
## 10 10 1
## # ℹ 18,319 more rows
Interpretation: The t-statistic measures the difference between the observed sample mean and the population mean (or another sample mean, in the case of two-sample tests) in units of standard error. The larger the magnitude of t (regardless of the sign), the greater the evidence against the null hypothesis.
A t-statistic of 52.61113 is quite large, indicating a significant difference in trust in politicians between those with a BA and those without a BA. It suggests that the mean trust level in politicians is different for the two groups.
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.272
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.
null_distribution
## Response: gincdif (factor)
## Explanatory: domicil (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 13.6
## 2 2 22.9
## 3 3 13.5
## 4 4 25.6
## 5 5 11.5
## 6 6 20.5
## 7 7 21.7
## 8 8 7.76
## 9 9 23.6
## 10 10 16.5
## # ℹ 990 more rows
Visualizes the null distribution alongside the observsed test statistic (the red line), indicating here clear evidence, under assumptions of frequentist hypothesis testing, to reject the null. If the observed statistic would fall within the null distribution, you would see shaded red regions of the null distribution that are as (or more) extreme than our observed statistic.
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
## Warning: The first row and first column value of the given `obs_stat` will be
## used.
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.