Setting up the environment:

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "broom") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.2     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "broom"        "modelsummary" "fst"          "infer"        "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"
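
The list printed above is just the return value of lapply(); if that output is not wanted in the knitted document, one option (an equivalent alternative, not required for anything below) is to wrap the call in invisible():

invisible(lapply(packages, library, character.only = TRUE)) # attaches the packages silently
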
ess <- read_fst("All-ESS-Data.fst")
# Map ESS round (1-10) to survey year (2002-2020).
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}
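
Because the rounds are exactly two years apart, the same mapping can also be written without a loop (an equivalent one-liner, assuming essround only takes the values 1 through 10):

ess$year <- 2000 + 2 * ess$essround # round 1 -> 2002, round 2 -> 2004, ..., round 10 -> 2020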

Filtering and recoding:

germany_data <- ess %>%
  filter(cntry == "DE") %>% 
  mutate(
    prtclede = ifelse(prtclede %in% c(66, 77, 88, 99), NA, prtclede), # Outcome of interest.
    gvrfgap = ifelse(gvrfgap %in% c(7, 8, 9), NA, gvrfgap), # Predictor variable 1.
    rlgblg = ifelse(rlgblg %in% c(7, 8, 9), NA, rlgblg) # Predictor variable 2.
  )

# Dropping rows with missing values on any of the three variables.
germany_data <- germany_data %>%
  filter(!is.na(prtclede), !is.na(gvrfgap), !is.na(rlgblg))

# Inspecting the outcome variable.
unique(germany_data$prtclede)
## [1] 1 3 4 2 5 6 8 9 7
table(germany_data$prtclede)
## 
##    1    2    3    4    5    6    7    8    9 
## 1104  834  354  461  100  152   30   15   74

Counting observations:

count(germany_data %>% select(prtclede, gvrfgap, rlgblg))
##      n
## 1 3124
datasummary_skim(germany_data %>% select(prtclede, gvrfgap, rlgblg))
           Unique (#)  Missing (%)  Mean   SD   Min  Median  Max
prtclede            9            0   2.6  1.8   1.0     2.0  9.0
gvrfgap             5            0   3.0  1.1   1.0     3.0  5.0
rlgblg              2            0   1.4  0.5   1.0     1.0  2.0

Table 1. Summary descriptive statistics.

There are 3124 observations in my sample.
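
The same figure can be confirmed with a direct row count (a quick sketch):

nrow(germany_data) # 3124 rows after filtering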

Recoding numeric responses to categorical:

# Building a labelled copy of the data, starting again from the full German
# subset of the ESS (codes not matched below become NA via case_when()).
de_data2 <- ess %>%
  filter(cntry == "DE") %>%
  mutate(
     gvrfgap = case_when(
      gvrfgap == 1 ~ "Agree strongly",
      gvrfgap == 2 ~ "Agree",
      gvrfgap == 3 ~ "Neither agree nor disagree",
      gvrfgap == 4 ~ "Disagree",
      gvrfgap == 5 ~ "Disagree strongly"
    ),
    
    prtclede = case_when(
      prtclede == 1 ~ "CDU",
      prtclede == 2 ~ "SPD",
      prtclede == 3 ~ "The Left",
      prtclede == 4 ~ "The Greens",
      prtclede == 5 ~ "FDP",
      prtclede == 6 ~ "AFD",
      prtclede == 7 ~ "Pirate Party",
      prtclede == 8 ~ "NPD",
      prtclede == 9 ~ "Other"
    ),
      # Acronym guide for German political parties located on poster.
      
    rlgblg = case_when(
      rlgblg == 1 ~ "Yes",
      rlgblg == 2 ~ "No"
    ))

table(de_data2$prtclede)
## 
##          AFD          CDU          FDP          NPD        Other Pirate Party 
##          225         1549          173           15          105           32 
##          SPD   The Greens     The Left 
##         1089          738          479
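
For easier comparison across parties, the counts can also be expressed as shares of the valid responses (an optional sketch):

round(prop.table(table(de_data2$prtclede)), 3) # share of respondents closest to each party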

Calculating expected frequencies and the Chi-squared critical value:

de_tabdata <- de_data2

# Creating a crosstable and saving it into the object de_tab.
de_tab <- table(de_tabdata$prtclede, de_tabdata$gvrfgap)

rsums <- rowSums(de_tab) # Row totals.
csums <- colSums(de_tab) # Column totals.
N <- sum(de_tab)         # Total number of observations.

# Expected proportion of each cell under independence: (row total / N) * (column total / N).
ptab <- tcrossprod(rsums / N, csums / N)
cat("Table of Expected Proportions:\n")
## Table of Expected Proportions:
print(ptab)
##              [,1]         [,2]        [,3]         [,4]        [,5]
##  [1,] 0.014249281 0.0041864048 0.014125240 0.0042174152 0.011768449
##  [2,] 0.103494781 0.0304065189 0.102593847 0.0306317524 0.085476103
##  [3,] 0.009468273 0.0027817558 0.009385850 0.0028023614 0.007819825
##  [4,] 0.001406179 0.0004131320 0.001393938 0.0004161923 0.001161360
##  [5,] 0.006937150 0.0020381181 0.006876761 0.0020532153 0.005729376
##  [6,] 0.002812358 0.0008262641 0.002787876 0.0008323846 0.002322720
##  [7,] 0.078371048 0.0230252263 0.077688819 0.0231957835 0.064726469
##  [8,] 0.043497806 0.0127795514 0.043119153 0.0128742148 0.035924739
##  [9,] 0.033279572 0.0097774585 0.032989869 0.0098498841 0.027485522
ftab <- N * ptab # Expected frequencies = N * expected proportions.
cat("Table of Expected Frequencies:\n")
## Table of Expected Frequencies:
print(ftab)
##             [,1]      [,2]       [,3]      [,4]       [,5]
##  [1,]  44.614500 13.107633  44.226126 13.204727  36.847014
##  [2,] 324.042159 95.202811 321.221335 95.908017 267.625679
##  [3,]  29.645161  8.709677  29.387097  8.774194  24.483871
##  [4,]   4.402747  1.293516   4.364420  1.303098   3.636218
##  [5,]  21.720217  6.381348  21.531140  6.428617  17.938678
##  [6,]   8.805493  2.587033   8.728841  2.606196   7.272437
##  [7,] 245.379751 72.091983 243.243692 72.625998 202.658576
##  [8,] 136.191632 40.012775 135.006068 40.309166 112.480358
##  [9,] 104.198339 30.613223 103.291281 30.839987  86.057170
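
As an optional sanity check (a sketch), the hand-computed expected frequencies can be compared with the ones chisq.test() computes internally and returns in its $expected component:

# suppressWarnings() only hides the small-expected-count warning discussed further below.
all.equal(unname(ftab),
          unname(suppressWarnings(chisq.test(de_tab, correct = FALSE))$expected)) # should be TRUE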

Computing the critical value for the test of independence:

alpha <- 0.05 # Significance level.
df_val <- (nrow(de_tab) - 1) * (ncol(de_tab) - 1) # Degrees of freedom: (9 - 1) * (5 - 1) = 32.
c_val <- qchisq(alpha, df = df_val, lower.tail = FALSE)
cat("Critical Value:", round(c_val, 3), "\n")
## Critical Value: 46.194

If the Chi-squared test statistic that I compute exceeds 46.194, then I will reject the null hypothesis at the 0.05 significance level.

Calculating the Pearson’s chi-squared statistic:

test_stat <- sum((de_tab - ftab)^2 / ftab)
cat("Pearson's X^2:", round(test_stat, 4), "\n")
## Pearson's X^2: 563.6064

The X-squared value is much higher than the calculated critical value of 46.194, meaning that there is a statistically significant association between party preference and attitudes towards refugee intake.
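
Written out as an explicit decision rule (a minimal sketch using the objects defined above):

if (test_stat > c_val) {
  cat("Reject H0: evidence of an association at the", alpha, "level\n")
} else {
  cat("Fail to reject H0 at the", alpha, "level\n")
}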

Calculating the p-value:

p_val <- pchisq(test_stat, df = df_val, lower.tail = FALSE)
cat("p-value:", round(p_val, 4), "\n")
## p-value: 0
cat("Chi-squared test result:\n")
## Chi-squared test result:
print(chisq.test(de_tab, correct = F))
## Warning in chisq.test(de_tab, correct = F): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  de_tab
## X-squared = 563.61, df = 32, p-value < 2.2e-16

The p-value is < 2.2e-16, extremely close to zero. This further reinforces that the two variables are not independent of each other, contrary to what the null hypothesis states.

Note: the R warning arises because several expected cell counts are small (below 5), which can make the Chi-squared approximation of the p-value unreliable.
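
The offending cells can be inspected directly (a quick sketch):

sum(ftab < 5)  # how many expected cell counts fall below 5
ftab[ftab < 5] # the cells themselves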

Simulating the Null Distribution with the infer Package:

Step 1: (Re-)Calculating the test statistic.

test_stat <- de_data2 %>%
  specify(explanatory = gvrfgap, # Predictor variable.
          response = prtclede) %>% # Outcome of interest.
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")
## Warning: Removed 31294 rows containing missing values.
print(test_stat$stat)
## X-squared 
##  563.6064

A Chi-squared test of independence was chosen because both the predictor and the outcome are categorical variables with more than two categories. The large X-squared test statistic again provides strong evidence of an association between the two variables.

Step 2: Simulate the null distribution:

null_distribution <- de_data2 %>%
  specify(explanatory = gvrfgap,
          response = prtclede) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # The only new step: generate 1,000 permuted datasets under the null (reps = 1000 is a standard choice).
  calculate(stat = "Chisq")
## Warning: Removed 31294 rows containing missing values.

Step 3: (Re-)Calculating the p-value:

p_val <- null_distribution %>%
  get_pvalue(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

The reported p-value of 0 is an artefact of the simulation: none of the 1,000 permuted statistics was as extreme as the observed one, so the true p-value is not exactly zero, just smaller than the simulation can resolve. Under the assumptions of frequentist hypothesis testing, this is strong evidence against the null hypothesis.
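
A rough sense of that resolution (a sketch): with 1,000 permutations the smallest p-value the simulation can distinguish from zero is on the order of 1/(reps + 1), so reporting p < 0.001 is more accurate than p = 0.

1 / (1000 + 1) # ~0.001, the resolution limit of the permutation p-value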

Step 4: Visualize the null distribution:

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf

Figure 1. Simulated null distribution with the observed test statistic and 95% confidence interval shaded.

null_distribution
## Response: prtclede (factor)
## Explanatory: gvrfgap (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  42.3
##  2         2  22.2
##  3         3  35.4
##  4         4  38.1
##  5         5  20.6
##  6         6  23.4
##  7         7  30.8
##  8         8  26.4
##  9         9  42.8
## 10        10  35.9
## # i 990 more rows

Producing Regression Models:

Recoding the predictor variables into binary categorical responses:

germany_data <- germany_data %>%
  mutate(refu_gen = case_when(
    gvrfgap %in% c(1, 2) ~ "Agree",
    gvrfgap %in% c(4, 5) ~ "Disagree",
    gvrfgap %in% c(3, 7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(gvrfgap)
  ),
  religious = case_when(
    rlgblg == 1 ~ "Yes",
    rlgblg == 2 ~ "No"
  ))

table(germany_data$refu_gen) # refu_gen is the recoded variable without the 'Neither agree nor disagree' responses.
## 
##    Agree Disagree 
##     1186     1181
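
The binary recode keeps 1186 + 1181 = 2367 respondents, which matches the Num.Obs. reported in the regression tables below; a quick check (sketch):

sum(!is.na(germany_data$refu_gen)) # 2367 respondents retained
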
germany_data$weight <- germany_data$dweight * germany_data$pweight # Combined analysis weight (design weight x population size weight).

model1 <- lm(prtclede ~ refu_gen, data = germany_data, weights = weight) # Single-predictor model.
model2 <- lm(prtclede ~ refu_gen + religious, data = germany_data, weights = weight) # Multivariate model.
model3 <- lm(prtclede ~ refu_gen * religious, data = germany_data, weights = weight) # Interaction model (main effects plus their interaction).
modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")
                                 (1)          (2)            (3)
refu_genDisagree                 -0.1 (0.1)   -0.1 (0.1)     0.0 (0.1)
religiousYes                                  -0.8 (0.1)***  -0.7 (0.1)***
refu_genDisagree × religiousYes                              0.0 (0.2)
Num.Obs.                         2367         2367           2367
R2                               0.000        0.038          0.039
R2 Adj.                          0.000        0.038          0.037
AIC                              9860.3       9770.1         9772.1
BIC                              9877.6       9793.2         9800.9
Log.Lik.                         -4927.134    -4881.070      -4881.033
RMSE                             1.86         1.83           1.83

models <- list(
  "Model 1" = lm(prtclede ~ refu_gen, data = germany_data, weights = weight),
  "Model 2" = lm(prtclede ~ refu_gen + religious, data = germany_data, weights = weight),
  "Model 3" = lm(prtclede ~ refu_gen + religious + refu_gen*religious, data = germany_data, weights = weight)
)

modelsummary(models)
                                 Model 1      Model 2      Model 3
(Intercept)                      2.660        3.142        3.129
                                 (0.054)      (0.072)      (0.087)
refu_genDisagree                 -0.067       -0.057       -0.029
                                 (0.077)      (0.076)      (0.126)
religiousYes                                  -0.762       -0.742
                                              (0.079)      (0.109)
refu_genDisagree × religiousYes                            -0.043
                                                           (0.158)
Num.Obs.                         2367         2367         2367
R2                               0.000        0.038        0.039
R2 Adj.                          0.000        0.038        0.037
AIC                              9860.3       9770.1       9772.1
BIC                              9877.6       9793.2       9800.9
Log.Lik.                         -4927.134    -4881.070    -4881.033
RMSE                             1.86         1.83         1.83
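
Since broom is loaded at the top but not otherwise used, the coefficients of any of these models can also be pulled into a tidy data frame, for instance for plotting (an optional sketch):

broom::tidy(models[["Model 3"]], conf.int = TRUE) # tidy coefficient table with 95% confidence intervals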

Producing a visual of choice:

ggplot(germany_data, aes(x = gvrfgap)) +
  geom_histogram(bins = 5, fill = "#CC0000", color = "black") +
  theme_minimal() +
  labs(title = "Histogram of Refugee Intake Attitude (Germany)",
       x = "Attitude towards Refugee Application Lenience (5 = Strongly Disagree)",
       y = "Count")