This analysis compares three different classification models for predicting voting preferences based on demographic and socioeconomic variables.
This analysis employs three distinct machine learning approaches to predict voting preferences:
Multinomial Logit: A statistical model that estimates the probability of categorical outcomes using linear combinations of predictors, providing interpretable coefficients but assuming linear relationships
Decision Tree: A non-parametric model that recursively partitions data based on feature thresholds, offering high interpretability through clear decision rules but potentially suffering from overfitting
Random Forest: An ensemble method that aggregates multiple decision trees using bootstrap sampling and feature randomization, delivering robust predictions and reliable importance rankings through mean decrease in Gini impurity
Variable importance rankings reveal consistent patterns across models, with demographic and regional factors emerging as primary determinants of voting behavior, while the relative significance of socioeconomic variables varies by modeling approach, highlighting both the complexity of voter decision-making and the complementary insights provided by different analytical techniques.
df <-read_excel("df.xlsx")
df$vote_choice <- as.factor(df$vote_choice)
df$vote_choice <- relevel(df$vote_choice, ref = "Others")
binary_vars <- c("female", "higherEd", "evangelical", "catholic",
"no_religion", "NE", "S", "N", "CW", "SE","other_religion")
df[binary_vars] <- lapply(df[binary_vars], function(x) {
factor(x, levels = c(0, 1))
})
df$race <- as.factor(df$race)Note: NE: Northeast Region; N: North Region; S: South Region; CW: Central-West Region; SE: Southeast Region
## Dataset dimensions: 500 15
## Vote distribution:
##
## Others Bolsonaro Lula
## 112 190 198
model_multinomial <- multinom(vote_choice ~ age + female + income_k + higherEd +
evangelical + catholic + no_religion + NE + S + N + CW +
race, data = df, trace = FALSE)
coef_summary <- summary(model_multinomial)$coefficients
coef_summary_no_intercept <- coef_summary[, -1]
importance_multinomial <- data.frame(
Variable = colnames(coef_summary_no_intercept),
Importance = apply(abs(coef_summary_no_intercept), 2, mean, na.rm = TRUE)
) %>% arrange(desc(Importance))
importance_multinomial## Variable Importance
## raceWhite raceWhite 1.52453084
## NE1 NE1 1.29014850
## S1 S1 1.21021501
## no_religion1 no_religion1 0.94930178
## evangelical1 evangelical1 0.91199655
## raceOther raceOther 0.84204164
## CW1 CW1 0.79861206
## female1 female1 0.54276931
## catholic1 catholic1 0.51295321
## N1 N1 0.48152771
## income_k income_k 0.22821262
## raceMixed raceMixed 0.20472575
## higherEd1 higherEd1 0.12295937
## age age 0.03460616
##
## ==============================================
## Dependent variable:
## ----------------------------
## Bolsonaro Lula
## (1) (2)
## ----------------------------------------------
## age 0.049*** -0.020***
## (0.009) (0.008)
##
## female1 -0.786*** 0.300
## (0.301) (0.270)
##
## income_k 0.331*** 0.126*
## (0.078) (0.073)
##
## higherEd1 0.163 0.083
## (0.302) (0.269)
##
## evangelical1 1.781*** -0.043
## (0.439) (0.467)
##
## catholic1 -0.263 0.763**
## (0.361) (0.323)
##
## no_religion1 -1.763*** -0.135
## (0.492) (0.376)
##
## NE1 0.166 2.415***
## (0.539) (0.454)
##
## S1 2.093*** 0.328
## (0.467) (0.471)
##
## N1 -0.698 0.265
## (0.535) (0.474)
##
## CW1 1.052* 0.545
## (0.547) (0.518)
##
## raceMixed 0.290 -0.120
## (0.613) (0.444)
##
## raceOther 0.185 -1.499
## (1.126) (0.994)
##
## raceWhite 1.541** -1.508***
## (0.615) (0.466)
##
## Constant -4.760*** 0.523
## (0.869) (0.655)
##
## ----------------------------------------------
## Akaike Inf. Crit. 784.853 784.853
## ==============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -362.4265310 -534.8207298 344.7883977 0.3223402 0.4982116 0.5646988
Importance of variables
tree_model <- rpart(vote_choice ~ age + female + income_k + higherEd +
evangelical + catholic + no_religion + NE + S + N + CW + race,
data = df,
method = "class",
control = rpart.control(minsplit = 30, cp = 0.01))
importance_tree <- tree_model$variable.importance
importance_tree_df <- data.frame(
Variable = names(importance_tree),
Importance = importance_tree
) %>% arrange(desc(Importance))
importance_tree_df## Variable Importance
## age age 69.5808042
## race race 22.1535859
## income_k income_k 16.3608745
## NE NE 12.9530948
## evangelical evangelical 6.7000933
## S S 5.4275689
## no_religion no_religion 2.5866393
## catholic catholic 1.7036754
## female female 1.6522829
## N N 1.2983335
## CW CW 0.4697001
## higherEd higherEd 0.2726740
The decision tree highlights clear distinctions in the profiles of Brazilian voters. Lula’s supporters are mainly younger voters (age < 30), especially those who are not evangelical (evangelical = 0), not white (race ≠ White).
Bolsonaro’s voters are generally older, white, and evangelical (evangelical = 1), often with moderate to higher income. This combination represents his core base of support.
Voters for “Others” are more common among younger, non-evangelical individuals and those not from the Northeast (NE = 0), showing weaker attachment to either Lula or Bolsonaro.
Overall, age and religion are the main dividing factors: Lula attracts younger, secular, and non-white voters, Bolsonaro appeals to older white evangelicals, and “Others” concentrate among younger, less religious voters outside the Northeast.
rf_model <- randomForest(vote_choice ~ age + female + income_k + higherEd +
evangelical + catholic + no_religion + NE + S + N + CW + race,
data = df,
ntree = 200,
importance = TRUE)
importance_rf <- importance(rf_model)
importance_rf_df <- data.frame(
Variable = rownames(importance_rf),
Importance = importance_rf[, "MeanDecreaseGini"]
) %>% arrange(desc(Importance))
importance_rf_df## Variable Importance
## age age 92.558449
## income_k income_k 53.834240
## race race 28.616988
## NE NE 17.239346
## evangelical evangelical 16.237962
## S S 11.308581
## female female 10.228674
## catholic catholic 9.921021
## higherEd higherEd 8.751669
## no_religion no_religion 7.844488
## N N 5.403042
## CW CW 5.243616
## ## Multinomial Logit Model
## **Variable Importance Ranking**
multinomial_simple <- importance_multinomial %>%
mutate(Importance = round(Importance, 4)) %>%
arrange(desc(Importance))
knitr::kable(multinomial_simple,
caption = "Multinomial Logit - Variable Importance",
col.names = c("Variable", "Importance Score"))| Variable | Importance Score | |
|---|---|---|
| raceWhite | raceWhite | 1.5245 |
| NE1 | NE1 | 1.2901 |
| S1 | S1 | 1.2102 |
| no_religion1 | no_religion1 | 0.9493 |
| evangelical1 | evangelical1 | 0.9120 |
| raceOther | raceOther | 0.8420 |
| CW1 | CW1 | 0.7986 |
| female1 | female1 | 0.5428 |
| catholic1 | catholic1 | 0.5130 |
| N1 | N1 | 0.4815 |
| income_k | income_k | 0.2282 |
| raceMixed | raceMixed | 0.2047 |
| higherEd1 | higherEd1 | 0.1230 |
| age | age | 0.0346 |
##
##
## ## Decision Tree Model
## **Variable Importance Ranking**
tree_simple <- importance_tree_df %>%
mutate(Importance = round(Importance, 4)) %>%
arrange(desc(Importance))
knitr::kable(tree_simple,
caption = "Decision Tree - Variable Importance",
col.names = c("Variable", "Importance Score"))| Variable | Importance Score | |
|---|---|---|
| age | age | 69.5808 |
| race | race | 22.1536 |
| income_k | income_k | 16.3609 |
| NE | NE | 12.9531 |
| evangelical | evangelical | 6.7001 |
| S | S | 5.4276 |
| no_religion | no_religion | 2.5866 |
| catholic | catholic | 1.7037 |
| female | female | 1.6523 |
| N | N | 1.2983 |
| CW | CW | 0.4697 |
| higherEd | higherEd | 0.2727 |
# Importance - Random Forest
rf_simple <- importance_rf_df %>%
mutate(Importance = round(Importance, 4)) %>%
arrange(desc(Importance))
knitr::kable(rf_simple,
caption = "Random Forest - Variable Importance",
col.names = c("Variable", "Importance Score"))| Variable | Importance Score | |
|---|---|---|
| age | age | 92.5584 |
| income_k | income_k | 53.8342 |
| race | race | 28.6170 |
| NE | NE | 17.2393 |
| evangelical | evangelical | 16.2380 |
| S | S | 11.3086 |
| female | female | 10.2287 |
| catholic | catholic | 9.9210 |
| higherEd | higherEd | 8.7517 |
| no_religion | no_religion | 7.8445 |
| N | N | 5.4030 |
| CW | CW | 5.2436 |
create_prediction_data <- function(var_name, var_range, n_points = 100) {
base_data <- data.frame(
age = median(df$age),
female = factor(0, levels = c(0, 1)),
income_k = median(df$income_k),
higherEd = factor(0, levels = c(0, 1)),
evangelical = factor(0, levels = c(0, 1)),
catholic = factor(1, levels = c(0, 1)),
no_religion = factor(0, levels = c(0, 1)),
NE = factor(0, levels = c(0, 1)),
S = factor(1, levels = c(0, 1)),
N = factor(0, levels = c(0, 1)),
CW = factor(0, levels = c(0, 1)),
race = factor("White", levels = levels(df$race))
)
var_sequence <- seq(var_range[1], var_range[2], length.out = n_points)
pred_data <- base_data[rep(1, n_points), ]
pred_data[[var_name]] <- var_sequence
return(pred_data)
}age_pred_data_logit <- create_prediction_data("age", c(18, 80))
logit_age_probs <- predict(model_multinomial, newdata = age_pred_data_logit, type = "probs")
logit_age_results <- data.frame(
age = age_pred_data_logit$age,
Bolsonaro = logit_age_probs[, "Bolsonaro"],
Lula = logit_age_probs[, "Lula"],
Others = logit_age_probs[, "Others"]
)
income_pred_data_logit <- create_prediction_data("income_k", c(1, 10))
logit_income_probs <- predict(model_multinomial, newdata = income_pred_data_logit, type = "probs")
logit_income_results <- data.frame(
income_k = income_pred_data_logit$income_k,
Bolsonaro = logit_income_probs[, "Bolsonaro"],
Lula = logit_income_probs[, "Lula"],
Others = logit_income_probs[, "Others"]
)
p_logit_age <- ggplot(logit_age_results, aes(x = age)) +
geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
geom_line(aes(y = Others, color = "Others"), size = 1.2) +
labs(title = "Probability Variation - Age (Multinomial Logit)",
x = "Age", y = "Probability", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
p_logit_income <- ggplot(logit_income_results, aes(x = income_k)) +
geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
geom_line(aes(y = Others, color = "Others"), size = 1.2) +
labs(title = "Probability Variation - Income (Multinomial Logit)",
x = "Income (thousands)", y = "Probability", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
p_logit_age / p_logit_incomepdp_tree_age_bolsonaro <- partial(tree_model, pred.var = "age", which.class = "Bolsonaro",
prob = TRUE, plot = FALSE)
pdp_tree_age_lula <- partial(tree_model, pred.var = "age", which.class = "Lula",
prob = TRUE, plot = FALSE)
pdp_tree_age_others <- partial(tree_model, pred.var = "age", which.class = "Others",
prob = TRUE, plot = FALSE)
p_tree_age <- ggplot() +
geom_line(data = pdp_tree_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
geom_line(data = pdp_tree_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
geom_line(data = pdp_tree_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
labs(title = "Partial Dependence - Age (Decision Tree)",
x = "Age", y = "Partial Dependence", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
pdp_tree_income_bolsonaro <- partial(tree_model, pred.var = "income_k", which.class = "Bolsonaro",
prob = TRUE, plot = FALSE)
pdp_tree_income_lula <- partial(tree_model, pred.var = "income_k", which.class = "Lula",
prob = TRUE, plot = FALSE)
pdp_tree_income_others <- partial(tree_model, pred.var = "income_k", which.class = "Others",
prob = TRUE, plot = FALSE)
p_tree_income <- ggplot() +
geom_line(data = pdp_tree_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
geom_line(data = pdp_tree_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
geom_line(data = pdp_tree_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
labs(title = "Partial Dependence - Income (Decision Tree)",
x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
p_tree_age / p_tree_incomepdp_rf_age_bolsonaro <- partial(rf_model, pred.var = "age", which.class = "Bolsonaro",
prob = TRUE, plot = FALSE)
pdp_rf_age_lula <- partial(rf_model, pred.var = "age", which.class = "Lula",
prob = TRUE, plot = FALSE)
pdp_rf_age_others <- partial(rf_model, pred.var = "age", which.class = "Others",
prob = TRUE, plot = FALSE)
p_rf_age <- ggplot() +
geom_line(data = pdp_rf_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
geom_line(data = pdp_rf_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
geom_line(data = pdp_rf_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
labs(title = "Partial Dependence - Age (Random Forest)",
x = "Age", y = "Partial Dependence", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
pdp_rf_income_bolsonaro <- partial(rf_model, pred.var = "income_k", which.class = "Bolsonaro",
prob = TRUE, plot = FALSE)
pdp_rf_income_lula <- partial(rf_model, pred.var = "income_k", which.class = "Lula",
prob = TRUE, plot = FALSE)
pdp_rf_income_others <- partial(rf_model, pred.var = "income_k", which.class = "Others",
prob = TRUE, plot = FALSE)
p_rf_income <- ggplot() +
geom_line(data = pdp_rf_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
geom_line(data = pdp_rf_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
geom_line(data = pdp_rf_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
labs(title = "Partial Dependence - Income (Random Forest)",
x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
theme_minimal()
p_rf_age / p_rf_income## age female income_k higherEd evangelical catholic
## Min. :16.00 0:235 Min. : 1.400 0:197 0:398 0:332
## 1st Qu.:31.75 1:265 1st Qu.: 3.700 1:303 1:102 1:168
## Median :48.00 Median : 5.050
## Mean :47.62 Mean : 5.206
## 3rd Qu.:63.00 3rd Qu.: 6.400
## Max. :79.00 Max. :10.800
## no_religion other_religion SE NE S N CW race
## 0:425 0:345 0:247 0:411 0:421 0:462 0:459 Black: 46
## 1: 75 1:155 1:253 1: 89 1: 79 1: 38 1: 41 Mixed:215
## Other: 10
## White:229
##
##
## vote_choice
## Others :112
## Bolsonaro:190
## Lula :198
##
##
##
Note: NE: Northeast Region; N: North Region; S: South Region; CW: Central-West Region; SE: Southeast Region