Comparative Analysis of Classification Models

Introduction

This analysis compares three different classification models for predicting voting preferences based on demographic and socioeconomic variables.

This analysis employs three distinct machine learning approaches to predict voting preferences:

Multinomial Logit: A statistical model that estimates the probability of categorical outcomes using linear combinations of predictors, providing interpretable coefficients but assuming linear relationships
Decision Tree: A non-parametric model that recursively partitions data based on feature thresholds, offering high interpretability through clear decision rules but potentially suffering from overfitting
Random Forest: An ensemble method that aggregates multiple decision trees using bootstrap sampling and feature randomization, delivering robust predictions and reliable importance rankings through mean decrease in Gini impurity

Variable importance rankings reveal consistent patterns across models, with demographic and regional factors emerging as primary determinants of voting behavior, while the relative significance of socioeconomic variables varies by modeling approach, highlighting both the complexity of voter decision-making and the complementary insights provided by different analytical techniques.

library(dplyr)
library(nnet)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ggplot2)
library(caret)
library(patchwork)
library(pdp)
library(tidyr)
library(stargazer)
library(rpart.plot)
library(readxl)
library(pscl)

Data

df <-read_excel("df.xlsx")

df$vote_choice <- as.factor(df$vote_choice)
df$vote_choice <- relevel(df$vote_choice, ref = "Others")

binary_vars <- c("female", "higherEd", "evangelical", "catholic", 
                 "no_religion", "NE", "S", "N", "CW", "SE","other_religion")

df[binary_vars] <- lapply(df[binary_vars], function(x) {
  factor(x, levels = c(0, 1))
})

df$race <- as.factor(df$race)

Note: NE: Northeast Region; N: North Region; S: South Region; CW: Central-West Region; SE: Southeast Region

Data Overview

cat("Dataset dimensions:", dim(df), "\n")

## Dataset dimensions: 500 15

cat("Vote distribution:\n")

## Vote distribution:

print(table(df$vote_choice))

## 
##    Others Bolsonaro      Lula 
##       112       190       198

1. Multinomial Logit Model

model_multinomial <- multinom(vote_choice ~ age + female + income_k + higherEd +
                                evangelical + catholic + no_religion + NE + S + N + CW +
                                race, data = df, trace = FALSE)

coef_summary <- summary(model_multinomial)$coefficients
coef_summary_no_intercept <- coef_summary[, -1]

importance_multinomial <- data.frame(
  Variable = colnames(coef_summary_no_intercept),
  Importance = apply(abs(coef_summary_no_intercept), 2, mean, na.rm = TRUE)
) %>% arrange(desc(Importance))

importance_multinomial

##                  Variable Importance
## raceWhite       raceWhite 1.52453084
## NE1                   NE1 1.29014850
## S1                     S1 1.21021501
## no_religion1 no_religion1 0.94930178
## evangelical1 evangelical1 0.91199655
## raceOther       raceOther 0.84204164
## CW1                   CW1 0.79861206
## female1           female1 0.54276931
## catholic1       catholic1 0.51295321
## N1                     N1 0.48152771
## income_k         income_k 0.22821262
## raceMixed       raceMixed 0.20472575
## higherEd1       higherEd1 0.12295937
## age                   age 0.03460616

#Summary
stargazer(model_multinomial, type = "text", out = "output.htm")

## 
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                     Bolsonaro        Lula     
##                        (1)            (2)     
## ----------------------------------------------
## age                  0.049***      -0.020***  
##                      (0.009)        (0.008)   
##                                               
## female1             -0.786***        0.300    
##                      (0.301)        (0.270)   
##                                               
## income_k             0.331***       0.126*    
##                      (0.078)        (0.073)   
##                                               
## higherEd1             0.163          0.083    
##                      (0.302)        (0.269)   
##                                               
## evangelical1         1.781***       -0.043    
##                      (0.439)        (0.467)   
##                                               
## catholic1             -0.263        0.763**   
##                      (0.361)        (0.323)   
##                                               
## no_religion1        -1.763***       -0.135    
##                      (0.492)        (0.376)   
##                                               
## NE1                   0.166        2.415***   
##                      (0.539)        (0.454)   
##                                               
## S1                   2.093***        0.328    
##                      (0.467)        (0.471)   
##                                               
## N1                    -0.698         0.265    
##                      (0.535)        (0.474)   
##                                               
## CW1                   1.052*         0.545    
##                      (0.547)        (0.518)   
##                                               
## raceMixed             0.290         -0.120    
##                      (0.613)        (0.444)   
##                                               
## raceOther             0.185         -1.499    
##                      (1.126)        (0.994)   
##                                               
## raceWhite            1.541**       -1.508***  
##                      (0.615)        (0.466)   
##                                               
## Constant            -4.760***        0.523    
##                      (0.869)        (0.655)   
##                                               
## ----------------------------------------------
## Akaike Inf. Crit.    784.853        784.853   
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01

#McFadden's R-squared 
pR2(model_multinomial)

## fitting null model for pseudo-r2

##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -362.4265310 -534.8207298  344.7883977    0.3223402    0.4982116    0.5646988

2. Decision Tree Model

Importance of variables

tree_model <- rpart(vote_choice ~ age + female + income_k + higherEd +
                      evangelical + catholic + no_religion + NE + S + N + CW + race,
                    data = df,
                    method = "class",
                    control = rpart.control(minsplit = 30, cp = 0.01))

importance_tree <- tree_model$variable.importance
importance_tree_df <- data.frame(
  Variable = names(importance_tree),
  Importance = importance_tree
) %>% arrange(desc(Importance))

importance_tree_df

##                Variable Importance
## age                 age 69.5808042
## race               race 22.1535859
## income_k       income_k 16.3608745
## NE                   NE 12.9530948
## evangelical evangelical  6.7000933
## S                     S  5.4275689
## no_religion no_religion  2.5866393
## catholic       catholic  1.7036754
## female           female  1.6522829
## N                     N  1.2983335
## CW                   CW  0.4697001
## higherEd       higherEd  0.2726740

rpart.plot(tree_model)

The decision tree highlights clear distinctions in the profiles of Brazilian voters. Lula’s supporters are mainly younger voters (age < 30), especially those who are not evangelical (evangelical = 0), not white (race ≠ White).

Bolsonaro’s voters are generally older, white, and evangelical (evangelical = 1), often with moderate to higher income. This combination represents his core base of support.

Voters for “Others” are more common among younger, non-evangelical individuals and those not from the Northeast (NE = 0), showing weaker attachment to either Lula or Bolsonaro.

Overall, age and religion are the main dividing factors: Lula attracts younger, secular, and non-white voters, Bolsonaro appeals to older white evangelicals, and “Others” concentrate among younger, less religious voters outside the Northeast.

3. Random Forest Model

rf_model <- randomForest(vote_choice ~ age + female + income_k + higherEd +
                           evangelical + catholic + no_religion + NE + S + N + CW + race,
                         data = df,
                         ntree = 200,
                         importance = TRUE)

importance_rf <- importance(rf_model)
importance_rf_df <- data.frame(
  Variable = rownames(importance_rf),
  Importance = importance_rf[, "MeanDecreaseGini"]
) %>% arrange(desc(Importance))

importance_rf_df

##                Variable Importance
## age                 age  92.558449
## income_k       income_k  53.834240
## race               race  28.616988
## NE                   NE  17.239346
## evangelical evangelical  16.237962
## S                     S  11.308581
## female           female  10.228674
## catholic       catholic   9.921021
## higherEd       higherEd   8.751669
## no_religion no_religion   7.844488
## N                     N   5.403042
## CW                   CW   5.243616

Variable Importance Comparison

# Importance- Multinomial Logit
cat("## Multinomial Logit Model\n\n")

## ## Multinomial Logit Model

cat("**Variable Importance Ranking**\n\n")

## **Variable Importance Ranking**

multinomial_simple <- importance_multinomial %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(multinomial_simple, 
             caption = "Multinomial Logit - Variable Importance",
             col.names = c("Variable", "Importance Score"))

Multinomial Logit - Variable Importance
	Variable	Importance Score
raceWhite	raceWhite	1.5245
NE1	NE1	1.2901
S1	S1	1.2102
no_religion1	no_religion1	0.9493
evangelical1	evangelical1	0.9120
raceOther	raceOther	0.8420
CW1	CW1	0.7986
female1	female1	0.5428
catholic1	catholic1	0.5130
N1	N1	0.4815
income_k	income_k	0.2282
raceMixed	raceMixed	0.2047
higherEd1	higherEd1	0.1230
age	age	0.0346

# Importance - Decision Tree
cat("\n\n## Decision Tree Model\n\n")

## 
## 
## ## Decision Tree Model

cat("**Variable Importance Ranking**\n\n")

## **Variable Importance Ranking**

tree_simple <- importance_tree_df %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(tree_simple, 
             caption = "Decision Tree - Variable Importance", 
             col.names = c("Variable", "Importance Score"))

Decision Tree - Variable Importance
	Variable	Importance Score
age	age	69.5808
race	race	22.1536
income_k	income_k	16.3609
NE	NE	12.9531
evangelical	evangelical	6.7001
S	S	5.4276
no_religion	no_religion	2.5866
catholic	catholic	1.7037
female	female	1.6523
N	N	1.2983
CW	CW	0.4697
higherEd	higherEd	0.2727

# Importance - Random Forest

rf_simple <- importance_rf_df %>%
  mutate(Importance = round(Importance, 4)) %>%
  arrange(desc(Importance))

knitr::kable(rf_simple, 
             caption = "Random Forest - Variable Importance",
             col.names = c("Variable", "Importance Score"))

Random Forest - Variable Importance
	Variable	Importance Score
age	age	92.5584
income_k	income_k	53.8342
race	race	28.6170
NE	NE	17.2393
evangelical	evangelical	16.2380
S	S	11.3086
female	female	10.2287
catholic	catholic	9.9210
higherEd	higherEd	8.7517
no_religion	no_religion	7.8445
N	N	5.4030
CW	CW	5.2436

Probability Variation Analysis

create_prediction_data <- function(var_name, var_range, n_points = 100) {
  base_data <- data.frame(
    age = median(df$age),
    female = factor(0, levels = c(0, 1)),
    income_k = median(df$income_k),
    higherEd = factor(0, levels = c(0, 1)),
    evangelical = factor(0, levels = c(0, 1)),
    catholic = factor(1, levels = c(0, 1)),
    no_religion = factor(0, levels = c(0, 1)),
    NE = factor(0, levels = c(0, 1)),
    S = factor(1, levels = c(0, 1)),
    N = factor(0, levels = c(0, 1)),
    CW = factor(0, levels = c(0, 1)),
    race = factor("White", levels = levels(df$race))
  )
  
  var_sequence <- seq(var_range[1], var_range[2], length.out = n_points)
  pred_data <- base_data[rep(1, n_points), ]
  pred_data[[var_name]] <- var_sequence
  
  return(pred_data)
}

Multinomial Logit Probability Variations

age_pred_data_logit <- create_prediction_data("age", c(18, 80))
logit_age_probs <- predict(model_multinomial, newdata = age_pred_data_logit, type = "probs")
logit_age_results <- data.frame(
  age = age_pred_data_logit$age,
  Bolsonaro = logit_age_probs[, "Bolsonaro"],
  Lula = logit_age_probs[, "Lula"],
  Others = logit_age_probs[, "Others"]
)

income_pred_data_logit <- create_prediction_data("income_k", c(1, 10))
logit_income_probs <- predict(model_multinomial, newdata = income_pred_data_logit, type = "probs")
logit_income_results <- data.frame(
  income_k = income_pred_data_logit$income_k,
  Bolsonaro = logit_income_probs[, "Bolsonaro"],
  Lula = logit_income_probs[, "Lula"],
  Others = logit_income_probs[, "Others"]
)

p_logit_age <- ggplot(logit_age_results, aes(x = age)) +
  geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
  geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
  geom_line(aes(y = Others, color = "Others"), size = 1.2) +
  labs(title = "Probability Variation - Age (Multinomial Logit)",
       x = "Age", y = "Probability", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_logit_income <- ggplot(logit_income_results, aes(x = income_k)) +
  geom_line(aes(y = Bolsonaro, color = "Bolsonaro"), size = 1.2) +
  geom_line(aes(y = Lula, color = "Lula"), size = 1.2) +
  geom_line(aes(y = Others, color = "Others"), size = 1.2) +
  labs(title = "Probability Variation - Income (Multinomial Logit)",
       x = "Income (thousands)", y = "Probability", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_logit_age / p_logit_income

Partial Dependence Plots - Decision Tree

pdp_tree_age_bolsonaro <- partial(tree_model, pred.var = "age", which.class = "Bolsonaro", 
                                  prob = TRUE, plot = FALSE)
pdp_tree_age_lula <- partial(tree_model, pred.var = "age", which.class = "Lula", 
                             prob = TRUE, plot = FALSE)
pdp_tree_age_others <- partial(tree_model, pred.var = "age", which.class = "Others", 
                               prob = TRUE, plot = FALSE)

p_tree_age <- ggplot() +
  geom_line(data = pdp_tree_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_tree_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_tree_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Age (Decision Tree)",
       x = "Age", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

pdp_tree_income_bolsonaro <- partial(tree_model, pred.var = "income_k", which.class = "Bolsonaro", 
                                     prob = TRUE, plot = FALSE)
pdp_tree_income_lula <- partial(tree_model, pred.var = "income_k", which.class = "Lula", 
                                prob = TRUE, plot = FALSE)
pdp_tree_income_others <- partial(tree_model, pred.var = "income_k", which.class = "Others", 
                                  prob = TRUE, plot = FALSE)

p_tree_income <- ggplot() +
  geom_line(data = pdp_tree_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_tree_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_tree_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Income (Decision Tree)",
       x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_tree_age / p_tree_income

Partial Dependence Plots - Random Forest

pdp_rf_age_bolsonaro <- partial(rf_model, pred.var = "age", which.class = "Bolsonaro", 
                                prob = TRUE, plot = FALSE)
pdp_rf_age_lula <- partial(rf_model, pred.var = "age", which.class = "Lula", 
                           prob = TRUE, plot = FALSE)
pdp_rf_age_others <- partial(rf_model, pred.var = "age", which.class = "Others", 
                             prob = TRUE, plot = FALSE)

p_rf_age <- ggplot() +
  geom_line(data = pdp_rf_age_bolsonaro, aes(x = age, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_rf_age_lula, aes(x = age, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_rf_age_others, aes(x = age, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Age (Random Forest)",
       x = "Age", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

pdp_rf_income_bolsonaro <- partial(rf_model, pred.var = "income_k", which.class = "Bolsonaro", 
                                   prob = TRUE, plot = FALSE)
pdp_rf_income_lula <- partial(rf_model, pred.var = "income_k", which.class = "Lula", 
                              prob = TRUE, plot = FALSE)
pdp_rf_income_others <- partial(rf_model, pred.var = "income_k", which.class = "Others", 
                                prob = TRUE, plot = FALSE)

p_rf_income <- ggplot() +
  geom_line(data = pdp_rf_income_bolsonaro, aes(x = income_k, y = yhat, color = "Bolsonaro"), size = 1.2) +
  geom_line(data = pdp_rf_income_lula, aes(x = income_k, y = yhat, color = "Lula"), size = 1.2) +
  geom_line(data = pdp_rf_income_others, aes(x = income_k, y = yhat, color = "Others"), size = 1.2) +
  labs(title = "Partial Dependence - Income (Random Forest)",
       x = "Income (thousands)", y = "Partial Dependence", color = "Candidate") +
  scale_color_manual(values = c("Bolsonaro" = "blue", "Lula" = "black", "Others" = "gray")) +
  theme_minimal()

p_rf_age / p_rf_income

Appendix 1: Descriptive Statistics

summary(df)

##       age        female     income_k      higherEd evangelical catholic
##  Min.   :16.00   0:235   Min.   : 1.400   0:197    0:398       0:332   
##  1st Qu.:31.75   1:265   1st Qu.: 3.700   1:303    1:102       1:168   
##  Median :48.00           Median : 5.050                                
##  Mean   :47.62           Mean   : 5.206                                
##  3rd Qu.:63.00           3rd Qu.: 6.400                                
##  Max.   :79.00           Max.   :10.800                                
##  no_religion other_religion SE      NE      S       N       CW         race    
##  0:425       0:345          0:247   0:411   0:421   0:462   0:459   Black: 46  
##  1: 75       1:155          1:253   1: 89   1: 79   1: 38   1: 41   Mixed:215  
##                                                                     Other: 10  
##                                                                     White:229  
##                                                                                
##                                                                                
##     vote_choice 
##  Others   :112  
##  Bolsonaro:190  
##  Lula     :198  
##                 
##                 
##