DATA CLEANING AND PREP

# Recode 'vote' and handle missing values
data$vote[data$vote %in% c(7, 8, 9)] <- NA
data$vote <- factor(data$vote, levels = c(1, 2, 3), labels = c("Yes", "No", "Not eligible"))
# Recode 'eisced' into 'education_level' and handle missing values
data$eisced[data$eisced %in% c(0, 55, 77, 88, 99)] <- NA
data$education_level <- factor(data$eisced, 
                               levels = c(1, 2, 3, 4, 5, 6, 7),
                               labels = c("Low", "Low", "Medium", "Medium", "Medium", "High", "High"))
# Handle missing values for 'agea'
data$agea[data$agea == 999] <- NA
# Ensure 'cntry' is a factor
data$cntry <- factor(data$cntry)
# Recode 'polintr' and handle missing values
data$polintr[data$polintr %in% c(7, 8, 9)] <- NA
data$polintr <- factor(data$polintr, levels = c(1, 2, 3, 4), 
                       labels = c("Very interested", "Quite interested", "Hardly interested", "Not at all interested"))
# Check for missing values in key variables
colSums(is.na(data[c("vote", "education_level", "agea", "cntry", "polintr")]))
##            vote education_level            agea           cntry         polintr 
##             170             140             151               0              23
# Remove rows with missing data in key variables
data_clean <- na.omit(data[c("vote", "education_level", "agea", "cntry", "polintr")])

# Check the structure of the cleaned data
str(data_clean)
## 'data.frame':    21735 obs. of  5 variables:
##  $ vote           : Factor w/ 3 levels "Yes","No","Not eligible": 1 1 1 2 1 1 1 1 1 1 ...
##  $ education_level: Factor w/ 3 levels "Low","Medium",..: 2 2 3 2 2 2 1 3 2 1 ...
##  $ agea           : int  65 21 53 78 64 59 77 69 52 75 ...
##  $ cntry          : Factor w/ 13 levels "AT","CH","DE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ polintr        : Factor w/ 4 levels "Very interested",..: 1 2 2 3 2 2 2 1 3 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:455] 117 182 194 285 347 428 430 437 483 539 ...
##   ..- attr(*, "names")= chr [1:455] "117" "182" "194" "285" ...

DESCRIPTIVE STATISTICS

# Summary for 'vote'
vote_summary <- data.frame(
  Variable = "Vote",
  Category = names(table(data_clean$vote)),
  Count = as.numeric(table(data_clean$vote)),
  Percentage = paste0(round(prop.table(table(data_clean$vote)) * 100, 1), "%")
)

# Summary for 'education_level'
education_summary <- data.frame(
  Variable = "Education Level",
  Category = names(table(data_clean$education_level)),
  Count = as.numeric(table(data_clean$education_level)),
  Percentage = paste0(round(prop.table(table(data_clean$education_level)) * 100, 1), "%")
)

# Summary for 'cntry'
country_summary <- data.frame(
  Variable = "Country",
  Category = names(table(data_clean$cntry)),
  Count = as.numeric(table(data_clean$cntry)),
  Percentage = paste0(round(prop.table(table(data_clean$cntry)) * 100, 1), "%")
)

# Summary for 'polintr'
polintr_summary <- data.frame(
  Variable = "Political Interest",
  Category = names(table(data_clean$polintr)),
  Count = as.numeric(table(data_clean$polintr)),
  Percentage = paste0(round(prop.table(table(data_clean$polintr)) * 100, 1), "%")
)
# Summary for continuous variable 'agea'
age_summary <- data.frame(
  Variable = "Age (years)",
  Mean = round(mean(data_clean$agea, na.rm = TRUE), 2),
  Median = round(median(data_clean$agea, na.rm = TRUE), 2),
  SD = round(sd(data_clean$agea, na.rm = TRUE), 2),
  Min = min(data_clean$agea, na.rm = TRUE),
  Max = max(data_clean$agea, na.rm = TRUE)
)
# Load the packages 
library(kableExtra)
library(knitr)
# Combine categorical summaries
categorical_summary <- rbind(vote_summary, education_summary, country_summary, polintr_summary)

# Add the continuous variable summary
age_summary_df <- data.frame(
  Variable = age_summary$Variable,
  Category = "",
  Count = "",
  Percentage = "",
  Mean = age_summary$Mean,
  Median = age_summary$Median,
  SD = age_summary$SD,
  Min = age_summary$Min,
  Max = age_summary$Max
)

# Add placeholder columns to categorical summary
categorical_summary <- cbind(categorical_summary, Mean = "", Median = "", SD = "", Min = "", Max = "")

# Combine categorical and continuous summaries
final_summary_df <- rbind(categorical_summary, age_summary_df)

# Generate the table using knitr::kable
kable_output <- kable(final_summary_df, format = "html", col.names = c("Variable", "Category", "Count", "Percentage", "Mean", "Median", "SD", "Min", "Max")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

# Save the table as a Word document
save_kable(kable_output, file = "summary_table.docx")
library(flextable)
## 
## Attaching package: 'flextable'
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
# Convert to flextable
ft <- flextable(final_summary_df)

# Save the flextable as a Word document
save_as_docx(ft, path = "summary_table.docx")

VISUALIZATIONS

# Load required packages
library(ggplot2)
# Create the histogram
ggplot(data_clean, aes(x = polintr, fill = polintr)) +
  geom_bar() +
  labs(title = "Distribution of Political Interest",
       x = "Level of Political Interest",
       y = "Count of Respondents",
       fill = "Political Interest") +
  theme_minimal()

# Save the plot
ggsave("political_interest_distribution.png", width = 8, height = 6)
# Box plot of age by political interest
ggplot(data_clean, aes(x = polintr, y = agea, fill = polintr)) +
  geom_boxplot() +
  labs(title = "Age Distribution by Political Interest",
       x = "Political Interest",
       y = "Age",
       fill = "Political Interest") +
  theme_minimal()

# Save the plot
ggsave("age_by_political_interest.png", width = 8, height = 6)

INITIAL REGRESSIONS

# Ensure that the vote variable is binary with 1 for "Yes" and 0 for "No"
data_clean$vote_binary <- ifelse(data_clean$vote == "Yes", 1, 0)
# Fit the first logistic regression model with education_level as the predictor
model1 <- glm(vote_binary ~ education_level, data = data_clean, family = binomial)

# Summarize the model to see the results
summary(model1)
## 
## Call:
## glm(formula = vote_binary ~ education_level, family = binomial, 
##     data = data_clean)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            0.29869    0.03228   9.254   <2e-16 ***
## education_levelMedium  0.85029    0.03869  21.974   <2e-16 ***
## education_levelHigh    1.27172    0.04746  26.798   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24720  on 21734  degrees of freedom
## Residual deviance: 23939  on 21732  degrees of freedom
## AIC: 23945
## 
## Number of Fisher Scoring iterations: 4
# Fit the second logistic regression model with interaction between education_level and agea
model2 <- glm(vote_binary ~ education_level * agea, data = data_clean, family = binomial)

# Summarize the model to see the results
summary(model2)
## 
## Call:
## glm(formula = vote_binary ~ education_level * agea, family = binomial, 
##     data = data_clean)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -2.570668   0.105245 -24.426  < 2e-16 ***
## education_levelMedium       1.970782   0.123106  16.009  < 2e-16 ***
## education_levelHigh         2.562039   0.155623  16.463  < 2e-16 ***
## agea                        0.052837   0.001798  29.385  < 2e-16 ***
## education_levelMedium:agea -0.017552   0.002205  -7.959 1.73e-15 ***
## education_levelHigh:agea   -0.018480   0.003088  -5.985 2.17e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24720  on 21734  degrees of freedom
## Residual deviance: 21757  on 21729  degrees of freedom
## AIC: 21769
## 
## Number of Fisher Scoring iterations: 4
library(pandoc)
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
# Combine the models into a list for comparison
model_list <- list("Model 1: Education Level" = model1, "Model 2: Education + Age Interaction" = model2)

# Create a regression table and save it as a Word document
modelsummary(model_list, statistic = "conf.int", output = "regression_table.docx")