Comparative Analysis of Classification Models

Introduction

This analysis compares three different classification models for predicting voting preferences based on demographic and socioeconomic variables.

This analysis employs three distinct machine learning approaches to predict voting preferences:

  1. Multinomial Logit: A statistical model that estimates the probability of categorical outcomes using linear combinations of predictors, providing interpretable coefficients but assuming linear relationships

  2. Decision Tree: A non-parametric model that recursively partitions data based on feature thresholds, offering high interpretability through clear decision rules but potentially suffering from overfitting

  3. Random Forest: An ensemble method that aggregates multiple decision trees using bootstrap sampling and feature randomization, delivering robust predictions and reliable importance rankings through mean decrease in Gini impurity

Variable importance rankings reveal consistent patterns across models, with demographic and regional factors emerging as primary determinants of voting behavior, while the relative significance of socioeconomic variables varies by modeling approach, highlighting both the complexity of voter decision-making and the complementary insights provided by different analytical techniques.

library(dplyr)
library(nnet)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ggplot2)
library(caret)
library(patchwork)
library(pdp)
library(tidyr)
library(stargazer)
library(rpart.plot)
library(readxl)
library(pscl)

Data

df <-read_excel("df.xlsx")

df$vote_choice <- as.factor(df$vote_choice)
df$vote_choice <- relevel(df$vote_choice, ref = "Others")

binary_vars <- c("female", "higherEd", "evangelical", "catholic", 
                 "no_religion", "NE", "S", "N", "CW", "SE","other_religion")

df[binary_vars] <- lapply(df[binary_vars], function(x) {
  factor(x, levels = c(0, 1))
})

df$race <- as.factor(df$race)

Note: NE: Northeast Region; N: North Region; S: South Region; CW: Central-West Region; SE: Southeast Region

Data Overview

cat("Dataset dimensions:", dim(df), "\n")
## Dataset dimensions: 500 15
cat("Vote distribution:\n")
## Vote distribution:
print(table(df$vote_choice))
## 
##    Others Bolsonaro      Lula 
##       112       190       198

1. Multinomial Logit Model

model_multinomial <- multinom(vote_choice ~ age + female + income_k + higherEd +
                                evangelical + catholic + no_religion + NE + S + N + CW +
                                race, data = df, trace = FALSE)

coef_summary <- summary(model_multinomial)$coefficients
coef_summary_no_intercept <- coef_summary[, -1]

importance_multinomial <- data.frame(
  Variable = colnames(coef_summary_no_intercept),
  Importance = apply(abs(coef_summary_no_intercept), 2, mean, na.rm = TRUE)
) %>% arrange(desc(Importance))

importance_multinomial
##                  Variable Importance
## raceWhite       raceWhite 1.52453084
## NE1                   NE1 1.29014850
## S1                     S1 1.21021501
## no_religion1 no_religion1 0.94930178
## evangelical1 evangelical1 0.91199655
## raceOther       raceOther 0.84204164
## CW1                   CW1 0.79861206
## female1           female1 0.54276931
## catholic1       catholic1 0.51295321
## N1                     N1 0.48152771
## income_k         income_k 0.22821262
## raceMixed       raceMixed 0.20472575
## higherEd1       higherEd1 0.12295937
## age                   age 0.03460616
#Summary
stargazer(model_multinomial, type = "text", out = "output.htm")
## 
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                     Bolsonaro        Lula     
##                        (1)            (2)     
## ----------------------------------------------
## age                  0.049***      -0.020***  
##                      (0.009)        (0.008)   
##                                               
## female1             -0.786***        0.300    
##                      (0.301)        (0.270)   
##                                               
## income_k             0.331***       0.126*    
##                      (0.078)        (0.073)   
##                                               
## higherEd1             0.163          0.083    
##                      (0.302)        (0.269)   
##                                               
## evangelical1         1.781***       -0.043    
##                      (0.439)        (0.467)   
##                                               
## catholic1             -0.263        0.763**   
##                      (0.361)        (0.323)   
##                                               
## no_religion1        -1.763***       -0.135    
##                      (0.492)        (0.376)   
##                                               
## NE1                   0.166        2.415***   
##                      (0.539)        (0.454)   
##                                               
## S1                   2.093***        0.328    
##                      (0.467)        (0.471)   
##                                               
## N1                    -0.698         0.265    
##                      (0.535)        (0.474)   
##                                               
## CW1                   1.052*         0.545    
##                      (0.547)        (0.518)   
##                                               
## raceMixed             0.290         -0.120    
##                      (0.613)        (0.444)   
##                                               
## raceOther             0.185         -1.499    
##                      (1.126)        (0.994)   
##                                               
## raceWhite            1.541**       -1.508***  
##                      (0.615)        (0.466)   
##                                               
## Constant            -4.760***        0.523    
##                      (0.869)        (0.655)   
##                                               
## ----------------------------------------------
## Akaike Inf. Crit.    784.853        784.853   
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01
#McFadden's R-squared 
pR2(model_multinomial)
## fitting null model for pseudo-r2
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -362.4265310 -534.8207298  344.7883977    0.3223402    0.4982116    0.5646988

2. Decision Tree Model

Importance of variables

tree_model <- rpart(vote_choice ~ age + female + income_k + higherEd +
                      evangelical + catholic + no_religion + NE + S + N + CW + race,
                    data = df,
                    method = "class",
                    control = rpart.control(minsplit = 30, cp = 0.01))

importance_tree <- tree_model$variable.importance
importance_tree_df <- data.frame(
  Variable = names(importance_tree),
  Importance = importance_tree
) %>% arrange(desc(Importance))

importance_tree_df
##                Variable Importance
## age                 age 69.5808042
## race               race 22.1535859
## income_k       income_k 16.3608745
## NE                   NE 12.9530948
## evangelical evangelical  6.7000933
## S                     S  5.4275689
## no_religion no_religion  2.5866393
## catholic       catholic  1.7036754
## female           female  1.6522829
## N                     N  1.2983335
## CW                   CW  0.4697001
## higherEd       higherEd  0.2726740
rpart.plot(tree_model)

The decision tree highlights clear distinctions in the profiles of Brazilian voters. Lula’s supporters are mainly younger voters (age < 30), especially those who are not evangelical (evangelical = 0), not white (race ≠ White).

Bolsonaro’s voters are generally older, white, and evangelical (evangelical = 1), often with moderate to higher income. This combination represents his core base of support.

Voters for “Others” are more common among younger, non-evangelical individuals and those not from the Northeast (NE = 0), showing weaker attachment to either Lula or Bolsonaro.

Overall, age and religion are the main dividing factors: Lula attracts younger, secular, and non-white voters, Bolsonaro appeals to older white evangelicals, and “Others” concentrate among younger, less religious voters outside the Northeast.

3. Random Forest Model

rf_model <- randomForest(vote_choice ~ age + female + income_k + higherEd +
                           evangelical + catholic + no_religion + NE + S + N + CW + race,
                         data = df,
                         ntree = 200,
                         importance = TRUE)

importance_rf <- importance(rf_model)
importance_rf_df <- data.frame(
  Variable = rownames(importance_rf),
  Importance = importance_rf[, "MeanDecreaseGini"]
) %>% arrange(desc(Importance))

importance_rf_df
##                Variable Importance
## age                 age  92.558449
## income_k       income_k  53.834240
## race               race  28.616988
## NE                   NE  17.239346
## evangelical evangelical  16.237962
## S                     S  11.308581
## female           female  10.228674
## catholic       catholic   9.921021
## higherEd       higherEd   8.751669
## no_religion no_religion   7.844488
## N                     N   5.403042
## CW                   CW   5.243616

Variable Importance Comparison

# Importance- Multinomial Logit
cat("## Multinomial Logit Model\n\n")
## ## Multinomial Logit Model
cat("**Variable Importance Ranking**\n\n")
## **Variable Importance Ranking**
multinomial_simple <- importance_multinomial %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(multinomial_simple, 
             caption = "Multinomial Logit - Variable Importance",
             col.names = c("Variable", "Importance Score"))
Multinomial Logit - Variable Importance
Variable Importance Score
raceWhite raceWhite 1.5245
NE1 NE1 1.2901
S1 S1 1.2102
no_religion1 no_religion1 0.9493
evangelical1 evangelical1 0.9120
raceOther raceOther 0.8420
CW1 CW1 0.7986
female1 female1 0.5428
catholic1 catholic1 0.5130
N1 N1 0.4815
income_k income_k 0.2282
raceMixed raceMixed 0.2047
higherEd1 higherEd1 0.1230
age age 0.0346
# Importance - Decision Tree
cat("\n\n## Decision Tree Model\n\n")
## 
## 
## ## Decision Tree Model
cat("**Variable Importance Ranking**\n\n")
## **Variable Importance Ranking**
tree_simple <- importance_tree_df %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(tree_simple, 
             caption = "Decision Tree - Variable Importance", 
             col.names = c("Variable", "Importance Score"))
Decision Tree - Variable Importance
Variable Importance Score
age age 69.5808
race race 22.1536
income_k income_k 16.3609
NE NE 12.9531
evangelical evangelical 6.7001
S S 5.4276
no_religion no_religion 2.5866
catholic catholic 1.7037
female female 1.6523
N N 1.2983
CW CW 0.4697
higherEd higherEd 0.2727
# Importance - Random Forest

rf_simple <- importance_rf_df %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(rf_simple, 
             caption = "Random Forest - Variable Importance",
             col.names = c("Variable", "Importance Score"))
Random Forest - Variable Importance
Variable Importance Score
age age 92.5584
income_k income_k 53.8342
race race 28.6170
NE NE 17.2393
evangelical evangelical 16.2380
S S 11.3086
female female 10.2287
catholic catholic 9.9210
higherEd higherEd 8.7517
no_religion no_religion 7.8445
N N 5.4030
CW CW 5.2436

Probability Variation Analysis

create_prediction_data <- function(var_name, var_range, n_points = 100) {
  base_data <- data.frame(
    age = median(df$age),
    female = factor(0, levels = c(0, 1)),
    income_k = median(df$income_k),
    higherEd = factor(0, levels = c(0, 1)),
    evangelical = factor(0, levels = c(0, 1)),
    catholic = factor(1, levels = c(0, 1)),
    no_religion = factor(0, levels = c(0, 1)),
    NE = factor(0, levels = c(0, 1)),
    S = factor(1, levels = c(0, 1)),
    N = factor(0, levels = c(0, 1)),
    CW = factor(0, levels = c(0, 1)),
    race = factor("White", levels = levels(df$race))
  )
  
  var_sequence <- seq(var_range[1], var_range[2], length.out = n_points)
  pred_data <- base_data[rep(1, n_points), ]
  pred_data[[var_name]] <- var_sequence
  
  return(pred_data)
}

Multinomial Logit Probability Variations

age_pred_data_logit <- create_prediction_data("age", c(18, 80))
logit_age_probs <- predict(model_multinomial, newdata = age_pred_data_logit, type = "probs")
logit_age_results <- data.frame(
  age = age_pred_data_logit$age,
  Bolsonaro = logit_age_probs[, "Bolsonaro"],
  Lula = logit_age_probs[, "Lula"],
  Others = logit_age_probs[, "Others"]
)

income_pred_data_logit <- create_prediction_data("income_k", c(1, 10))
logit_income_probs <- predict(model_multinomial, newdata = income_pred_data_logit, type = "probs")
logit_income_results <- data.frame(
  income_k = income_pred_data_logit$income_k,
  Bolsonaro = logit_income_probs[, "Bolsonaro"],
  Lula = logit_income_probs[, "Lula"],
  Others = logit_income_probs[, "Others"]
)

p_logit_age <- ggplot(logit_age_results, aes(x = age)) +
  geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
  geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
  geom_line(aes(y = Others, color = "Others"), size = 1.2) +
  labs(title = "Probability Variation - Age (Multinomial Logit)",
       x = "Age", y = "Probability", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_logit_income <- ggplot(logit_income_results, aes(x = income_k)) +
  geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
  geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
  geom_line(aes(y = Others, color = "Others"), size = 1.2) +
  labs(title = "Probability Variation - Income (Multinomial Logit)",
       x = "Income (thousands)", y = "Probability", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_logit_age / p_logit_income

Partial Dependence Plots - Decision Tree

pdp_tree_age_bolsonaro <- partial(tree_model, pred.var = "age", which.class = "Bolsonaro", 
                                  prob = TRUE, plot = FALSE)
pdp_tree_age_lula <- partial(tree_model, pred.var = "age", which.class = "Lula", 
                             prob = TRUE, plot = FALSE)
pdp_tree_age_others <- partial(tree_model, pred.var = "age", which.class = "Others", 
                               prob = TRUE, plot = FALSE)

p_tree_age <- ggplot() +
  geom_line(data = pdp_tree_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_tree_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_tree_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Age (Decision Tree)",
       x = "Age", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

pdp_tree_income_bolsonaro <- partial(tree_model, pred.var = "income_k", which.class = "Bolsonaro", 
                                     prob = TRUE, plot = FALSE)
pdp_tree_income_lula <- partial(tree_model, pred.var = "income_k", which.class = "Lula", 
                                prob = TRUE, plot = FALSE)
pdp_tree_income_others <- partial(tree_model, pred.var = "income_k", which.class = "Others", 
                                  prob = TRUE, plot = FALSE)

p_tree_income <- ggplot() +
  geom_line(data = pdp_tree_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_tree_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_tree_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Income (Decision Tree)",
       x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_tree_age / p_tree_income

Partial Dependence Plots - Random Forest

pdp_rf_age_bolsonaro <- partial(rf_model, pred.var = "age", which.class = "Bolsonaro", 
                                prob = TRUE, plot = FALSE)
pdp_rf_age_lula <- partial(rf_model, pred.var = "age", which.class = "Lula", 
                           prob = TRUE, plot = FALSE)
pdp_rf_age_others <- partial(rf_model, pred.var = "age", which.class = "Others", 
                             prob = TRUE, plot = FALSE)

p_rf_age <- ggplot() +
  geom_line(data = pdp_rf_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_rf_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_rf_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Age (Random Forest)",
       x = "Age", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

pdp_rf_income_bolsonaro <- partial(rf_model, pred.var = "income_k", which.class = "Bolsonaro", 
                                   prob = TRUE, plot = FALSE)
pdp_rf_income_lula <- partial(rf_model, pred.var = "income_k", which.class = "Lula", 
                              prob = TRUE, plot = FALSE)
pdp_rf_income_others <- partial(rf_model, pred.var = "income_k", which.class = "Others", 
                                prob = TRUE, plot = FALSE)

p_rf_income <- ggplot() +
  geom_line(data = pdp_rf_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_rf_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_rf_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Income (Random Forest)",
       x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_rf_age / p_rf_income

Appendix 1: Descriptive Statistics

summary(df)
##       age        female     income_k      higherEd evangelical catholic
##  Min.   :16.00   0:235   Min.   : 1.400   0:197    0:398       0:332   
##  1st Qu.:31.75   1:265   1st Qu.: 3.700   1:303    1:102       1:168   
##  Median :48.00           Median : 5.050                                
##  Mean   :47.62           Mean   : 5.206                                
##  3rd Qu.:63.00           3rd Qu.: 6.400                                
##  Max.   :79.00           Max.   :10.800                                
##  no_religion other_religion SE      NE      S       N       CW         race    
##  0:425       0:345          0:247   0:411   0:421   0:462   0:459   Black: 46  
##  1: 75       1:155          1:253   1: 89   1: 79   1: 38   1: 41   Mixed:215  
##                                                                     Other: 10  
##                                                                     White:229  
##                                                                                
##                                                                                
##     vote_choice 
##  Others   :112  
##  Bolsonaro:190  
##  Lula     :198  
##                 
##                 
## 

Note: NE: Northeast Region; N: North Region; S: South Region; CW: Central-West Region; SE: Southeast Region