DATA CLEANING AND PREP
# Recode 'vote' and handle missing values
data$vote[data$vote %in% c(7, 8, 9)] <- NA
data$vote <- factor(data$vote, levels = c(1, 2, 3), labels = c("Yes", "No", "Not eligible"))
# Recode 'eisced' into 'education_level' and handle missing values
data$eisced[data$eisced %in% c(0, 55, 77, 88, 99)] <- NA
data$education_level <- factor(data$eisced,
levels = c(1, 2, 3, 4, 5, 6, 7),
labels = c("Low", "Low", "Medium", "Medium", "Medium", "High", "High"))
# Handle missing values for 'agea'
data$agea[data$agea == 999] <- NA
# Ensure 'cntry' is a factor
data$cntry <- factor(data$cntry)
# Recode 'polintr' and handle missing values
data$polintr[data$polintr %in% c(7, 8, 9)] <- NA
data$polintr <- factor(data$polintr, levels = c(1, 2, 3, 4),
labels = c("Very interested", "Quite interested", "Hardly interested", "Not at all interested"))
# Check for missing values in key variables
colSums(is.na(data[c("vote", "education_level", "agea", "cntry", "polintr")]))
## vote education_level agea cntry polintr
## 170 140 151 0 23
# Remove rows with missing data in key variables
data_clean <- na.omit(data[c("vote", "education_level", "agea", "cntry", "polintr")])
# Check the structure of the cleaned data
str(data_clean)
## 'data.frame': 21735 obs. of 5 variables:
## $ vote : Factor w/ 3 levels "Yes","No","Not eligible": 1 1 1 2 1 1 1 1 1 1 ...
## $ education_level: Factor w/ 3 levels "Low","Medium",..: 2 2 3 2 2 2 1 3 2 1 ...
## $ agea : int 65 21 53 78 64 59 77 69 52 75 ...
## $ cntry : Factor w/ 13 levels "AT","CH","DE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ polintr : Factor w/ 4 levels "Very interested",..: 1 2 2 3 2 2 2 1 3 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:455] 117 182 194 285 347 428 430 437 483 539 ...
## ..- attr(*, "names")= chr [1:455] "117" "182" "194" "285" ...
DESCRIPTIVE STATISTICS
# Summary for 'vote'
vote_summary <- data.frame(
Variable = "Vote",
Category = names(table(data_clean$vote)),
Count = as.numeric(table(data_clean$vote)),
Percentage = paste0(round(prop.table(table(data_clean$vote)) * 100, 1), "%")
)
# Summary for 'education_level'
education_summary <- data.frame(
Variable = "Education Level",
Category = names(table(data_clean$education_level)),
Count = as.numeric(table(data_clean$education_level)),
Percentage = paste0(round(prop.table(table(data_clean$education_level)) * 100, 1), "%")
)
# Summary for 'cntry'
country_summary <- data.frame(
Variable = "Country",
Category = names(table(data_clean$cntry)),
Count = as.numeric(table(data_clean$cntry)),
Percentage = paste0(round(prop.table(table(data_clean$cntry)) * 100, 1), "%")
)
# Summary for 'polintr'
polintr_summary <- data.frame(
Variable = "Political Interest",
Category = names(table(data_clean$polintr)),
Count = as.numeric(table(data_clean$polintr)),
Percentage = paste0(round(prop.table(table(data_clean$polintr)) * 100, 1), "%")
)
# Summary for continuous variable 'agea'
age_summary <- data.frame(
Variable = "Age (years)",
Mean = round(mean(data_clean$agea, na.rm = TRUE), 2),
Median = round(median(data_clean$agea, na.rm = TRUE), 2),
SD = round(sd(data_clean$agea, na.rm = TRUE), 2),
Min = min(data_clean$agea, na.rm = TRUE),
Max = max(data_clean$agea, na.rm = TRUE)
)
# Load the packages
library(kableExtra)
library(knitr)
# Combine categorical summaries
categorical_summary <- rbind(vote_summary, education_summary, country_summary, polintr_summary)
# Add the continuous variable summary
age_summary_df <- data.frame(
Variable = age_summary$Variable,
Category = "",
Count = "",
Percentage = "",
Mean = age_summary$Mean,
Median = age_summary$Median,
SD = age_summary$SD,
Min = age_summary$Min,
Max = age_summary$Max
)
# Add placeholder columns to categorical summary
categorical_summary <- cbind(categorical_summary, Mean = "", Median = "", SD = "", Min = "", Max = "")
# Combine categorical and continuous summaries
final_summary_df <- rbind(categorical_summary, age_summary_df)
# Generate the table using knitr::kable
kable_output <- kable(final_summary_df, format = "html", col.names = c("Variable", "Category", "Count", "Percentage", "Mean", "Median", "SD", "Min", "Max")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
# Save the table as a Word document
save_kable(kable_output, file = "summary_table.docx")
library(flextable)
##
## Attaching package: 'flextable'
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
# Convert to flextable
ft <- flextable(final_summary_df)
# Save the flextable as a Word document
save_as_docx(ft, path = "summary_table.docx")
VISUALIZATIONS
# Load required packages
library(ggplot2)
# Create the histogram
ggplot(data_clean, aes(x = polintr, fill = polintr)) +
geom_bar() +
labs(title = "Distribution of Political Interest",
x = "Level of Political Interest",
y = "Count of Respondents",
fill = "Political Interest") +
theme_minimal()
# Save the plot
ggsave("political_interest_distribution.png", width = 8, height = 6)
# Box plot of age by political interest
ggplot(data_clean, aes(x = polintr, y = agea, fill = polintr)) +
geom_boxplot() +
labs(title = "Age Distribution by Political Interest",
x = "Political Interest",
y = "Age",
fill = "Political Interest") +
theme_minimal()
# Save the plot
ggsave("age_by_political_interest.png", width = 8, height = 6)
INITIAL REGRESSIONS
# Ensure that the vote variable is binary with 1 for "Yes" and 0 for "No"
data_clean$vote_binary <- ifelse(data_clean$vote == "Yes", 1, 0)
# Fit the first logistic regression model with education_level as the predictor
model1 <- glm(vote_binary ~ education_level, data = data_clean, family = binomial)
# Summarize the model to see the results
summary(model1)
##
## Call:
## glm(formula = vote_binary ~ education_level, family = binomial,
## data = data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29869 0.03228 9.254 <2e-16 ***
## education_levelMedium 0.85029 0.03869 21.974 <2e-16 ***
## education_levelHigh 1.27172 0.04746 26.798 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24720 on 21734 degrees of freedom
## Residual deviance: 23939 on 21732 degrees of freedom
## AIC: 23945
##
## Number of Fisher Scoring iterations: 4
# Fit the second logistic regression model with interaction between education_level and agea
model2 <- glm(vote_binary ~ education_level * agea, data = data_clean, family = binomial)
# Summarize the model to see the results
summary(model2)
##
## Call:
## glm(formula = vote_binary ~ education_level * agea, family = binomial,
## data = data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.570668 0.105245 -24.426 < 2e-16 ***
## education_levelMedium 1.970782 0.123106 16.009 < 2e-16 ***
## education_levelHigh 2.562039 0.155623 16.463 < 2e-16 ***
## agea 0.052837 0.001798 29.385 < 2e-16 ***
## education_levelMedium:agea -0.017552 0.002205 -7.959 1.73e-15 ***
## education_levelHigh:agea -0.018480 0.003088 -5.985 2.17e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24720 on 21734 degrees of freedom
## Residual deviance: 21757 on 21729 degrees of freedom
## AIC: 21769
##
## Number of Fisher Scoring iterations: 4
library(pandoc)
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
# Combine the models into a list for comparison
model_list <- list("Model 1: Education Level" = model1, "Model 2: Education + Age Interaction" = model2)
# Create a regression table and save it as a Word document
modelsummary(model_list, statistic = "conf.int", output = "regression_table.docx")