packages <- c("tidyverse", "infer", "ggplot2", "broom", "dplyr", "readr", "modelsummary", "effects", "fst")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "broom"     "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[5]]
##  [1] "broom"     "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[6]]
##  [1] "broom"     "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[7]]
##  [1] "modelsummary" "broom"        "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "effects"      "carData"      "modelsummary" "broom"        "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[9]]
##  [1] "fst"          "effects"      "carData"      "modelsummary" "broom"       
##  [6] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"
ess <- read_fst("All-ESS-Data.fst")
# Creating a subset of ESS data for German respondents born between 1990 and 2005.
germany_data_subset <- ess %>%
  filter(cntry == "DE", yrbrn > 1989 & yrbrn < 2006) %>%
  mutate(
    stfgov = ifelse(stfgov %in% c(77, 88, 99), NA, stfgov),  # Handling special codes as NA for stfgov
    eisced = ifelse(eisced %in% c(77, 88, 99), NA, eisced),  # Handling special codes as NA for eisced
    gndr = ifelse(gndr %in% c(9), NA, gndr)               # Handling special codes as NA for gndr
  ) %>%
  select(stfgov, eisced, gndr) %>%
  na.omit()  # Removing rows with any NA values

# Generate a summary skim of the subset data
datasummary_skim(germany_data_subset)
Unique (#) Missing (%) Mean SD Min Median Max
stfgov 11 0 4.8 2.1 0.0 5.0 10.0
eisced 8 0 3.7 3.1 1.0 4.0 55.0
gndr 2 0 1.5 0.5 1.0 2.0 2.0
germany_data <- ess %>%
  filter(cntry == "DE") %>%
  select(stfgov, eisced, gndr) %>%
  na.omit()
# Create 'educ_level' variable: classify 'eisced' education levels as 'Low' (<=3), 'Medium' (4-5), 'High' (>=6)
germany_data <- germany_data %>%
  mutate(
    educ_level = case_when(
      eisced <= 3 ~ "Low",
      eisced %in% 4:5 ~ "Medium",
      eisced >= 6 ~ "High",
      TRUE ~ NA_character_
    ),
    # Create 'gender_cat' variable: convert 'gndr' numeric codes to 'Male' (1) or 'Female' (2)
    gender_cat = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_
    )
  ) %>%
  # Filter out rows with missing values in 'stfgov', 'educ_level', or 'gender_cat' for complete case analysis
  filter(!is.na(stfgov) & !is.na(educ_level) & !is.na(gender_cat))
# Calculate summary statistics for government satisfaction by education level
summary_statistics <- germany_data %>%
  group_by(educ_level) %>% # Group data by education level
  summarise(
    # Calculate the mean of 'stfgov', remove missing values with na.rm = TRUE
    Satisfied_Mean = mean(stfgov, na.rm = TRUE),
    # Calculate the standard deviation of 'stfgov', remove missing values with na.rm = TRUE
    Satisfied_SD = sd(stfgov, na.rm = TRUE),
    # Count the number of observations in each education level group
    N = n()
  )
# Build a linear regression model to predict government satisfaction using education level as a predictor
model <- lm(stfgov ~ educ_level, data = germany_data)
# Build a linear regression model to predict government satisfaction using education level and gender category as predictors
model_2 <- lm(stfgov ~ educ_level + gender_cat, data = germany_data)
summary_2 <- summary(model_2)
print(summary_2)
## 
## Call:
## lm(formula = stfgov ~ educ_level + gender_cat, data = germany_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.916 -4.018 -2.413 -0.413 93.982 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        7.0265     0.1836  38.261  < 2e-16 ***
## educ_levelLow      0.8900     0.1938   4.592  4.4e-06 ***
## educ_levelMedium   0.3949     0.2304   1.714   0.0865 .  
## gender_catMale    -2.0081     0.1564 -12.839  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.36 on 34041 degrees of freedom
## Multiple R-squared:  0.005859,   Adjusted R-squared:  0.005771 
## F-statistic: 66.87 on 3 and 34041 DF,  p-value: < 2.2e-16
# Create a plot of model coefficients using ggplot2
coefficients_plot <- ggplot(tidy(model), aes(x = term, y = estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.2) +
  theme_minimal() +
  labs(title = "Effect of Educational Level on How satisfied with the national government",
       x = "Educational Level", # Label for x-axis
       y = "Estimate") # Label for y-axis
# Perform a hypothesis test to assess between 'stfgov' and 'educ_level'
hypothesis_test <- germany_data %>%
  specify(stfgov ~ educ_level) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")
# Display the results of the hypothesis test
hypothesis_test
## Response: stfgov (numeric)
## Explanatory: educ_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 1.13  
##  2         2 0.209 
##  3         3 0.571 
##  4         4 0.176 
##  5         5 0.0299
##  6         6 0.284 
##  7         7 1.14  
##  8         8 1.88  
##  9         9 0.359 
## 10        10 1.30  
## # ℹ 990 more rows
# Calculating the test statistic on observed data
test_stat <- germany_data %>%
  specify(explanatory = educ_level, 
          response = stfgov) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "F") 
print(test_stat$stat)
## [1] 17.80072
# Simulate the null distribution
null_distribution <- germany_data %>%
  specify(explanatory = educ_level,
          response = stfgov) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "F")
# Calculate the p-value
p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
# Visualize
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning: F usually corresponds to right-tailed tests. Proceed with caution.

null_distribution
## Response: stfgov (numeric)
## Explanatory: educ_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.252
##  2         2 1.19 
##  3         3 0.345
##  4         4 0.994
##  5         5 1.73 
##  6         6 0.299
##  7         7 5.48 
##  8         8 5.18 
##  9         9 1.42 
## 10        10 1.20 
## # ℹ 990 more rows
# Also visualize with confidence intervals.
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning: F usually corresponds to right-tailed tests. Proceed with caution.

null_distribution
## Response: stfgov (numeric)
## Explanatory: educ_level (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.252
##  2         2 1.19 
##  3         3 0.345
##  4         4 0.994
##  5         5 1.73 
##  6         6 0.299
##  7         7 5.48 
##  8         8 5.18 
##  9         9 1.42 
## 10        10 1.20 
## # ℹ 990 more rows
library(effects)
# Fitting a linear model with interaction
model_with_interaction <- lm(stfgov ~ educ_level * gndr, data = germany_data)
# Creating an interaction plot
interaction_plot <- interaction.plot(x.factor = germany_data$educ_level, 
                                     trace.factor = germany_data$gndr, 
                                     response = model_with_interaction$fitted.values,
                                     xlab = "Educational Level", 
                                     ylab = " How satisfied with the national government", 
                                     main = "Interaction Effect of Educational Level and Gender on  How satisfied with the national government",
                                     legend = TRUE)

# Printing the interaction plot
print(interaction_plot)
## NULL
print(summary_statistics)
## # A tibble: 3 × 4
##   educ_level Satisfied_Mean Satisfied_SD     N
##   <chr>               <dbl>        <dbl> <int>
## 1 High                 5.91         10.9  7881
## 2 Low                  7.00         16.1 18500
## 3 Medium               6.31         13.2  7664
print(summary(model))
## 
## Call:
## lm(formula = stfgov ~ educ_level, data = germany_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.002 -4.002 -2.002 -0.312 93.090 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.9099     0.1621   36.45  < 2e-16 ***
## educ_levelLow      1.0919     0.1936    5.64 1.72e-08 ***
## educ_levelMedium   0.4018     0.2309    1.74   0.0819 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.39 on 34042 degrees of freedom
## Multiple R-squared:  0.001045,   Adjusted R-squared:  0.000986 
## F-statistic:  17.8 on 2 and 34042 DF,  p-value: 1.876e-08
print(coefficients_plot)

print(interaction_plot)
## NULL
model1 <- lm(stfgov ~ educ_level, data = germany_data)
model2 <- lm(stfgov ~ educ_level + gender_cat, data = germany_data)
model3 <- lm(stfgov ~ educ_level * gender_cat, data = germany_data)

modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate = c("{estimate} ({std.error}){stars}",
               "{estimate} ({std.error}){stars}",
               "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept"
)
 (1)   (2)   (3)
educ_levelLow 1.1 (0.2)*** 0.9 (0.2)*** 1.3 (0.3)***
educ_levelMedium 0.4 (0.2)+ 0.4 (0.2)+ 1.0 (0.3)**
gender_catMale −2.0 (0.2)*** −1.4 (0.3)***
educ_levelLow × gender_catMale −0.7 (0.4)+
educ_levelMedium × gender_catMale −1.1 (0.5)*
Num.Obs. 34045 34045 34045
R2 0.001 0.006 0.006
R2 Adj. 0.001 0.006 0.006
AIC 278202.8 278040.4 278038.4
BIC 278236.6 278082.6 278097.5
Log.Lik. −139097.420 −139015.186 −139012.216
RMSE 14.39 14.36 14.36
anova_test <- aov(stfgov ~ educ_level, data = germany_data)
anova_summary <- summary(anova_test)
# Get the mean square (usually used as an estimate of variance) and F-value from the ANOVA table
anova_stats <- anova_summary[[1]][, c("Mean Sq", "F value")]
# Transform into a dataframe for ggplot
anova_stats_df <- as.data.frame(anova_stats)
# Add the term names to the dataframe
anova_stats_df$Term <- rownames(anova_stats_df)
# Plotting
ggplot(anova_stats_df, aes(x = Term, y = `Mean Sq`)) +
  geom_bar(stat = "identity", fill = 'skyblue') +
  geom_errorbar(aes(ymin = `Mean Sq` - sqrt(`Mean Sq`), ymax = `Mean Sq` + sqrt(`Mean Sq`)), width = 0.4) +
  theme_minimal() +
  labs(title = "ANOVA Test for Educational Level on  How satisfied with the national government",
       x = "Educational Level",
       y = "Mean Square Error")