# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "broom") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.3 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "broom" "modelsummary" "fst" "infer" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
ess <- read_fst("All-ESS-Data.fst")
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
germany_data <- ess %>%
filter(cntry == "DE") %>%
mutate(
prtclede = ifelse(prtclede %in% c(66, 77, 88, 99), NA, prtclede), # Outcome of interest.
gvrfgap = ifelse(gvrfgap %in% c(7, 8, 9), NA, gvrfgap), # Predictive variable 1.
rlgblg = ifelse(rlgblg %in% c(7, 8, 9), NA, rlgblg), # Predictive variable 2
)
# Filtering NA values.
germany_data <- germany_data %>% filter(!is.na(prtclede))
germany_data <- germany_data %>% filter(!is.na(gvrfgap))
germany_data <- germany_data %>% filter(!is.na(rlgblg))
# Tabling data.
unique(germany_data$prtclede)
## [1] 1 3 4 2 5 6 8 9 7
table(germany_data$prtclede)
##
## 1 2 3 4 5 6 7 8 9
## 1104 834 354 461 100 152 30 15 74
count(germany_data %>% select(prtclede, gvrfgap, rlgblg))
## n
## 1 3124
datasummary_skim(germany_data %>% select(prtclede, gvrfgap, rlgblg))
| Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
|---|---|---|---|---|---|---|---|---|
| prtclede | 9 | 0 | 2.6 | 1.8 | 1.0 | 2.0 | 9.0 | |
| gvrfgap | 5 | 0 | 3.0 | 1.1 | 1.0 | 3.0 | 5.0 | |
| rlgblg | 2 | 0 | 1.4 | 0.5 | 1.0 | 1.0 | 2.0 |
Table 1. Summary descriptive statistics.
There are 3124 observations in my sample.
de_data2 <- germany_data # Creating a dataset duplicate.
de_data2 <- ess %>%
filter(cntry == "DE") %>%
mutate(
gvrfgap = case_when(
gvrfgap == 1 ~ "Agree strongly",
gvrfgap == 2 ~ "Agree",
gvrfgap == 3 ~ "Neither agree nor disagree",
gvrfgap == 4 ~ "Disagree",
gvrfgap == 5 ~ "Disagree strongly"
),
prtclede = case_when(
prtclede == 1 ~ "CDU",
prtclede == 2 ~ "SDP",
prtclede == 3 ~ "The Left",
prtclede == 4 ~ "The Greens",
prtclede == 5 ~ "FDP",
prtclede == 6 ~ "AFD",
prtclede == 7 ~ "Pirate Party",
prtclede == 8 ~ "NPD",
prtclede == 9 ~ "Other",
),
#Acronym guide for German political parties located on poster.
rlgblg = case_when(
rlgblg == 1 ~ "Yes",
rlgblg == 2 ~ "No",
))
table(de_data2$prtclede)
##
## AFD CDU FDP NPD Other Pirate Party
## 225 1549 173 15 105 32
## SDP The Greens The Left
## 1089 738 479
de_tabdata <- de_data2
# Creating a crosstable and save it into the object de_tab.
de_tab <- table(de_tabdata$prtclede, de_tabdata$gvrfgap)
rsums <- rowSums(de_tab)
csums <- colSums(de_tab)
N <- sum(de_tab)
ptab <- tcrossprod(rsums/N, csums/N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.014249281 0.0041864048 0.014125240 0.0042174152 0.011768449
## [2,] 0.103494781 0.0304065189 0.102593847 0.0306317524 0.085476103
## [3,] 0.009468273 0.0027817558 0.009385850 0.0028023614 0.007819825
## [4,] 0.001406179 0.0004131320 0.001393938 0.0004161923 0.001161360
## [5,] 0.006937150 0.0020381181 0.006876761 0.0020532153 0.005729376
## [6,] 0.002812358 0.0008262641 0.002787876 0.0008323846 0.002322720
## [7,] 0.078371048 0.0230252263 0.077688819 0.0231957835 0.064726469
## [8,] 0.043497806 0.0127795514 0.043119153 0.0128742148 0.035924739
## [9,] 0.033279572 0.0097774585 0.032989869 0.0098498841 0.027485522
ftab <- N * ptab
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 44.614500 13.107633 44.226126 13.204727 36.847014
## [2,] 324.042159 95.202811 321.221335 95.908017 267.625679
## [3,] 29.645161 8.709677 29.387097 8.774194 24.483871
## [4,] 4.402747 1.293516 4.364420 1.303098 3.636218
## [5,] 21.720217 6.381348 21.531140 6.428617 17.938678
## [6,] 8.805493 2.587033 8.728841 2.606196 7.272437
## [7,] 245.379751 72.091983 243.243692 72.625998 202.658576
## [8,] 136.191632 40.012775 135.006068 40.309166 112.480358
## [9,] 104.198339 30.613223 103.291281 30.839987 86.057170
Coding for the Critical Value for Independence:
alpha <- 0.05 # significance level
c_val <- qchisq(alpha, df = 1, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 3.841
If the Chi-squared test statistic that I will compute exceeds 3.841, then I will reject the null hypothesis at the 0.5 significance level.
test_stat <- sum((de_tab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 563.6064
The X-squared value is much higher than the calculated critical value of 3.841, meaning that there is a statistically significant association between party preference and tolerance towards refugee intake.
p_val <- pchisq(test_stat, df = 1, lower.tail = F)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(de_tab, correct = F))
## Warning in chisq.test(de_tab, correct = F): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: de_tab
## X-squared = 563.61, df = 32, p-value < 2.2e-16
The p-value is < 2.2e-16, extremely close to zero. This further reinforces that the two variables are not independent of each other, as stated in the null hypothesis.
Note: the R-Studio warning is due to the small values of the expected counts, resulting in a p-value approximation that may be poor.
Step 1: (Re-)Calculating the test statistic.
test_stat <- de_data2 %>%
specify(explanatory = gvrfgap, # Predictive variable.
response = prtclede) %>% # Outcome of interest.
hypothesize(null = "independence") %>%
calculate(stat = "Chisq")
## Warning: Removed 31294 rows containing missing values.
print(test_stat$stat)
## X-squared
## 563.6064
A Chi-squared test of independence was chosen as both the predictive and outcome variables are categorical variables with more than three categories. The large X-squared test statistic again indicates a high likelihood that there is an association between the two variables.
Step 2: Simulate the null distribution:
null_distribution <- de_data2 %>%
specify(explanatory = gvrfgap,
response = prtclede) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
calculate(stat = "Chisq")
## Warning: Removed 31294 rows containing missing values.
Step 3: (Re-)Calculating the p-value:
p_val <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
The p-value is likely not 0, but extremely close to it, and not below it either. Under the assumptions of frequentist hypothesis testing, it represents strong evidence to reject the null hypothesis.
Step 4: Visualize the null distribution:
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
Table 2. Simulated null distribution with test statistic and confidence
interval.
null_distribution
## Response: prtclede (factor)
## Explanatory: gvrfgap (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 42.3
## 2 2 22.2
## 3 3 35.4
## 4 4 38.1
## 5 5 20.6
## 6 6 23.4
## 7 7 30.8
## 8 8 26.4
## 9 9 42.8
## 10 10 35.9
## # i 990 more rows
Recording predictor variables into binary categorical responses:
germany_data <- germany_data %>%
mutate(refu_gen = case_when(
gvrfgap %in% c(1, 2) ~ "Agree",
gvrfgap %in% c(4, 5) ~ "Disagree",
gvrfgap %in% c(3, 7, 8, 9) ~ NA_character_,
TRUE ~ as.character(gvrfgap),
),
religious = case_when(
rlgblg == 1 ~ "Yes",
rlgblg == 2 ~ "No",
))
table(germany_data$refu_gen) # refu_gen will be the sub-setted variable without the response 'Neither agree nor disagree'.
##
## Agree Disagree
## 1186 1181
germany_data$weight <- germany_data$dweight * germany_data$pweight
model1 <- lm(prtclede ~ refu_gen, data = germany_data, weights = weight)# Single predictor model.
model2 <- lm(prtclede ~ refu_gen + religious, data = germany_data, weights = weight) #Multivariate model.
model3 <- lm(prtclede ~ refu_gen + religious + refu_gen*religious, data = germany_data, weights = weight) #Interaction model.
modelsummary(
list(model1, model2, model3),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept")
| (1) | (2) | (3) | |
|---|---|---|---|
| refu_genDisagree | -0.1 (0.1) | -0.1 (0.1) | 0.0 (0.1) |
| religiousYes | -0.8 (0.1)*** | -0.7 (0.1)*** | |
| refu_genDisagree × religiousYes | 0.0 (0.2) | ||
| Num.Obs. | 2367 | 2367 | 2367 |
| R2 | 0.000 | 0.038 | 0.039 |
| R2 Adj. | 0.000 | 0.038 | 0.037 |
| AIC | 9860.3 | 9770.1 | 9772.1 |
| BIC | 9877.6 | 9793.2 | 9800.9 |
| Log.Lik. | -4927.134 | -4881.070 | -4881.033 |
| RMSE | 1.86 | 1.83 | 1.83 |
models <- list(
"Model 1" = lm(prtclede ~ refu_gen, data = germany_data, weights = weight),
"Model 2" = lm(prtclede ~ refu_gen + religious, data = germany_data, weights = weight),
"Model 3" = lm(prtclede ~ refu_gen + religious + refu_gen*religious, data = germany_data, weights = weight)
)
modelsummary(models)
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 2.660 | 3.142 | 3.129 |
| (0.054) | (0.072) | (0.087) | |
| refu_genDisagree | -0.067 | -0.057 | -0.029 |
| (0.077) | (0.076) | (0.126) | |
| religiousYes | -0.762 | -0.742 | |
| (0.079) | (0.109) | ||
| refu_genDisagree × religiousYes | -0.043 | ||
| (0.158) | |||
| Num.Obs. | 2367 | 2367 | 2367 |
| R2 | 0.000 | 0.038 | 0.039 |
| R2 Adj. | 0.000 | 0.038 | 0.037 |
| AIC | 9860.3 | 9770.1 | 9772.1 |
| BIC | 9877.6 | 9793.2 | 9800.9 |
| Log.Lik. | -4927.134 | -4881.070 | -4881.033 |
| RMSE | 1.86 | 1.83 | 1.83 |
ggplot(germany_data, aes(x = gvrfgap)) +
geom_histogram(bins = 5, fill = "#CC0000", color = "black") +
theme_minimal() +
labs(title = "Histogram of Refugee Intake Attitude (Germany)",
x = "Attitude towards Refugee Application Lenience (5 = Strongly Disagree)",
y = "Count")