Data analysis and classification on a subset of data from the 2006 and 2010 Brazilian elections for the Câmara Federal de Deputados. The data were taken from the TSE portal and originally encompassed approximately 7,300 candidates.
The response variable is the variable we are interested in measuring and drawing conclusions about.
A predictor variable is a variable used to predict the response.
Our response variable will be "situacao" (whether the candidate was elected or not); we want to study how well the predictor variables can predict its behavior and how much they influence the classification models.
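The code below assumes a setup chunk (not reproduced here) that loads the packages used throughout the report; a minimal sketch of those imports, inferred from the functions called later, would be:

library(tidyverse)        # readr, dplyr, tidyr, purrr, tibble and ggplot2
library(magrittr)         # the %$% exposition pipe
library(here)             # project-relative file paths
library(caret)            # preProcess(), nearZeroVar(), train(), trainControl(), varImp()
library(dataPreparation)  # build_encoding(), one_hot_encoder()
library(GGally)           # ggcorr()
library(ggmosaic)         # geom_mosaic()
library(MLmetrics)        # F1_Score()
library(rattle)           # fancyRpartPlot()
library(DMwR)             # SMOTE()

# Read the training data, declaring column types explicitly and converting the categorical columns to factors.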
readr::read_csv(here::here('data/train_class.csv'),
progress = FALSE,
locale=readr::locale("br"),
col_types = cols(ano = col_integer(),
sequencial_candidato = col_character(),
quantidade_doacoes = col_integer(),
quantidade_doadores = col_integer(),
total_receita = col_double(),
media_receita = col_double(),
recursos_de_outros_candidatos.comites = col_double(),
recursos_de_pessoas_fisicas = col_double(),
recursos_de_pessoas_juridicas = col_double(),
recursos_proprios = col_double(),
`recursos_de_partido_politico` = col_double(),
quantidade_despesas = col_integer(),
quantidade_fornecedores = col_integer(),
total_despesa = col_double(),
media_despesa = col_double(),
situacao = col_character(),
.default = col_character())) %>%
mutate(sequencial_candidato = as.numeric(sequencial_candidato),
estado_civil = as.factor(estado_civil),
ocupacao = as.factor(ocupacao),
situacao = as.factor(situacao),
partido = as.factor(partido),
grau = as.factor(grau),
sexo = as.factor(sexo),
uf = as.factor(uf)) -> data
data %>%
glimpse()
## Observations: 7,622
## Variables: 24
## $ ano <int> 2006, 2006, 2006, 2006, ...
## $ sequencial_candidato <dbl> 10001, 10002, 10002, 100...
## $ nome <chr> "JOSÉ LUIZ NOGUEIRA DE S...
## $ uf <fct> AP, RO, AP, MS, RO, AP, ...
## $ partido <fct> PT, PT, PT, PRONA, PT, P...
## $ quantidade_doacoes <int> 6, 13, 17, 6, 48, 8, 6, ...
## $ quantidade_doadores <int> 6, 13, 16, 6, 48, 8, 6, ...
## $ total_receita <dbl> 16600.00, 22826.00, 1581...
## $ media_receita <dbl> 2766.67, 1755.85, 9301.2...
## $ recursos_de_outros_candidatos.comites <dbl> 0.00, 6625.00, 2250.00, ...
## $ recursos_de_pessoas_fisicas <dbl> 9000.00, 15000.00, 34150...
## $ recursos_de_pessoas_juridicas <dbl> 6300.00, 1000.00, 62220....
## $ recursos_proprios <dbl> 1300.00, 201.00, 59500.0...
## $ recursos_de_partido_politico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quantidade_despesas <int> 14, 24, 123, 8, 133, 38,...
## $ quantidade_fornecedores <int> 14, 23, 108, 8, 120, 37,...
## $ total_despesa <dbl> 16583.60, 20325.99, 1460...
## $ media_despesa <dbl> 1184.54, 846.92, 1187.09...
## $ cargo <chr> "DEPUTADO FEDERAL", "DEP...
## $ sexo <fct> MASCULINO, FEMININO, FEM...
## $ grau <fct> ENSINO MÉDIO COMPLETO, S...
## $ estado_civil <fct> CASADO(A), SOLTEIRO(A), ...
## $ ocupacao <fct> VEREADOR, SERVIDOR PÚBLI...
## $ situacao <fct> nao_eleito, nao_eleito, ...
data %>%
map_df(function(x) sum(is.na(x))) %>%
gather(feature, num_nulls) %>%
arrange(desc(num_nulls))
data %>%
ggplot(aes(situacao)) +
geom_bar() +
labs(x="Situation", y="Absolute Frequency")
data %>%
group_by(situacao) %>%
summarise(num = n()) %>%
ungroup() %>%
mutate(total = sum(num),
proportion = num/total)
data %>%
select(-ano,
-sequencial_candidato,
-nome) %>%
select(
quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
`recursos_de_partido_politico`) %>%
na.omit() %>%
ggcorr(palette = "RdBu", label = TRUE,
hjust = 0.95, label_size = 3,size = 3,
nbreaks = 5, layout.exp = 5) +
ggtitle("Correlation plot for the selected funding variables")
data %>%
ggplot(aes(situacao,recursos_proprios)) +
geom_boxplot() +
coord_flip() +
labs(y="Revenue from personal resources (R$)", x="Situation")
data %>%
ggplot(aes(situacao,
recursos_de_partido_politico)) +
geom_boxplot() +
coord_flip() +
labs(y="Revenue from political party. (R$)", x="Situation")
data %>%
ggplot(aes(situacao,
recursos_de_outros_candidatos.comites)) +
geom_boxplot() +
coord_flip() +
labs(y="Revenue from other candidate’s committees (R$)", x="Situation")
data %>%
ggplot(aes(situacao,
recursos_de_pessoas_fisicas)) +
geom_boxplot() +
coord_flip() +
labs(y="Revenue from individuals (R$)", x="Situation")
data %>%
ggplot(aes(situacao,
recursos_de_pessoas_juridicas)) +
geom_boxplot() +
coord_flip() +
labs(y="Revenue from legal entities (R$)", x="Situation")
data %>%
ggplot(aes(situacao,
quantidade_doacoes)) +
geom_boxplot() +
coord_flip() +
labs(y="Number of donations", x="Situation")
data %>%
ggplot(aes(situacao,
quantidade_doadores)) +
geom_boxplot() +
coord_flip() +
labs(y="Number of donators", x="Situation")
data %>%
ggplot(aes(situacao,
media_receita)) +
geom_boxplot() +
coord_flip() +
labs(y="Mean expenditure", x="Situation")
data %>%
ggplot(aes(situacao,
total_receita)) +
geom_boxplot() +
coord_flip() +
labs(y="Total expenditure", x="Situation")
data %>%
ggplot() +
geom_mosaic(aes(x = product(sexo, situacao),
fill=sexo)) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
guides(fill = guide_legend(title = "Sex")) +
labs(x="Situation")
data %>%
ggplot() +
geom_mosaic(aes(x = product(grau, situacao),
fill=grau)) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
guides(fill = guide_legend(title = "Level of education")) +
labs(x="Situation")
data %>%
ggplot() +
geom_mosaic(aes(x = product(estado_civil, situacao),
fill=estado_civil)) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
guides(fill = guide_legend(title = "Marital Status")) +
labs(x="Situation")
set.seed(107)
data$id <- 1:nrow(data)
data %>%
dplyr::sample_frac(.8) -> train
cat("#### Train Shape",
"\n##### Observations: ",nrow(train),
"\n##### Variables: ",ncol(train))
## #### Train Shape
## ##### Observations: 6098
## ##### Variables: 25
dplyr::anti_join(data,
train,
by = 'id') -> test
cat("#### Test Shape",
"\n##### Observations: ",nrow(test),
"\n##### Variables: ",ncol(test))
## #### Test Shape
## ##### Observations: 1524
## ##### Variables: 25
train %>%
select(-ano,-nome,-id,-sequencial_candidato) -> train
test %>%
select(-ano,-nome,-id,-sequencial_candidato) -> test
train %>%
dplyr::select_if(.,is.numeric) -> train.numeric
train %>%
dplyr::select_if(.,negate(is.numeric)) -> train.categorical
test %>%
dplyr::select_if(.,is.numeric) -> test.numeric
test %>%
dplyr::select_if(.,negate(is.numeric)) -> test.categorical
train.numeric %>%
preProcess(.,method = c("center","scale")) -> processParams
processParams %>%
predict(.,train.numeric) -> train.numeric
processParams %>%
predict(.,test.numeric) -> test.numeric
processParams
## Created from 6098 samples and 13 variables
##
## Pre-processing:
## - centered (13)
## - ignored (0)
## - scaled (13)
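Since the centering and scaling parameters are estimated on the training split only and then applied to both splits, the transformed training columns should have mean approximately 0 and standard deviation approximately 1, while the test columns may deviate slightly. A quick check along these lines (not part of the original analysis) could be:

round(colMeans(train.numeric), 3)   # ~0 for every centered column
round(sapply(train.numeric, sd), 3) # ~1 for every scaled column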
train.numeric %>%
dplyr::bind_cols(train.categorical) -> train
test.numeric %>%
dplyr::bind_cols(test.categorical) -> test
encoding <- build_encoding(dataSet = train,
cols = c("uf","sexo","grau","ocupacao",
"partido","estado_civil"),
verbose = F)
train <- one_hot_encoder(dataSet = train,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Shape",
"\n##### Observations: ",nrow(train),
"\n##### Variables: ",ncol(train))
test <- one_hot_encoder(dataSet = test,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Data Shape",
"\n##### Observations: ",nrow(test),
"\n##### Variables: ",ncol(test))
## #### Data Shape
## ##### Observations: 1524
## ##### Variables: 263
train %>%
nearZeroVar(saveMetrics = TRUE) %>%
tibble::rownames_to_column("variable") %>%
filter(nzv == T) %>%
pull(variable) -> near_zero_vars
train %>%
select(-one_of(near_zero_vars)) -> train
test %>%
select(-one_of(near_zero_vars)) -> test
near_zero_vars %>%
glimpse()
## chr [1:224] "cargo" "uf.AC" "uf.AL" "uf.AM" "uf.AP" "uf.BA" "uf.CE" ...
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred,
y_true = data$obs,
positive = lev[1])
c(F1 = f1_val)
}
F_Measure <- function(expected, predicted, ...) {
data.frame(expected=expected,
prediction=predicted) %>%
mutate(TP = ifelse(expected == "eleito" &
prediction == "eleito",1,0),
TN = ifelse(expected == "nao_eleito" &
prediction == "nao_eleito",1,0),
FN = ifelse(expected == "eleito" &
prediction == "nao_eleito",1,0),
FP = ifelse(expected == "nao_eleito" &
prediction == "eleito",1,0)) -> result
result %>%
summarize(TP = sum(TP),
TN = sum(TN),
FP = sum(FP),
FN = sum(FN)) %>%
mutate(recall = TP / (TP + FN),
precision = TP / (TP + FP),
accuracy = (TP + TN)/(TP + TN + FP + FN),
f_measure = 2 * (precision * recall) / (precision + recall)) -> result
return(result)
}
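As a quick sanity check, the helper can be exercised on a pair of hypothetical toy vectors (not part of the original data); with one true positive, one false negative, one true negative and one false positive, precision and recall are both 0.5, so the F-measure, 2 * (precision * recall) / (precision + recall), is also 0.5:

F_Measure(expected  = c("eleito", "eleito", "nao_eleito", "nao_eleito"),
          predicted = c("eleito", "nao_eleito", "nao_eleito", "eleito"))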
rlGrid <- expand.grid( cost = c(200,2,0.02),
loss = c("L1", "L2_dual", "L2_primal"),
epsilon = c(0.001,0.01) )
train %>%
caret::train(situacao ~ .,
data= .,
method = "regLogistic",
metric = "F1",
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final"),
tuneGrid = rlGrid) -> model.rl
model.rl
## Regularized Logistic Regression
##
## 6098 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 6098, 6098, 6098, 6098, 6098, 6098, ...
## Resampling results across tuning parameters:
##
## cost loss epsilon F1
## 2e-02 L1 0.001 0.5841929
## 2e-02 L1 0.010 0.5850193
## 2e-02 L2_dual 0.001 0.5994658
## 2e-02 L2_dual 0.010 0.5994658
## 2e-02 L2_primal 0.001 0.5994150
## 2e-02 L2_primal 0.010 0.6000753
## 2e+00 L1 0.001 0.6342838
## 2e+00 L1 0.010 0.6359184
## 2e+00 L2_dual 0.001 0.6333125
## 2e+00 L2_dual 0.010 0.6333125
## 2e+00 L2_primal 0.001 0.6332243
## 2e+00 L2_primal 0.010 0.6335837
## 2e+02 L1 0.001 0.6329757
## 2e+02 L1 0.010 0.6355253
## 2e+02 L2_dual 0.001 0.6445493
## 2e+02 L2_dual 0.010 0.6487011
## 2e+02 L2_primal 0.001 0.6330532
## 2e+02 L2_primal 0.010 0.6344372
##
## F1 was used to select the optimal model using the largest value.
## The final values used for the model were cost = 200, loss = L2_dual
## and epsilon = 0.01.
model.rl %$%
results %>%
mutate(cost=as.factor(cost)) %>%
ggplot(aes(epsilon,F1,
color=cost)) +
geom_line() +
geom_point() +
labs(y= "F1 (Bootstrap)", x="Tolerance") +
facet_wrap(. ~ loss, labeller = "label_both") +
guides(color = guide_legend(title = "Cost")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
model.rl %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,eleito),y = eleito),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.rl %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.rl,.) %>%
F_Measure(test$situacao,.)
neighborsGrid <- expand.grid(.k = seq(from=1, to=50, by=1))
train %>%
train(situacao ~ .,
data = .,
metric = "F1",
method = "knn",
na.action = na.omit,
tuneGrid = neighborsGrid,
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final")) -> model.knn
model.knn
## k-Nearest Neighbors
##
## 6098 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 6098, 6098, 6098, 6098, 6098, 6098, ...
## Resampling results across tuning parameters:
##
## k F1
## 1 0.5824759
## 2 0.5875895
## 3 0.6007862
## 4 0.6061060
## 5 0.6144416
## 6 0.6200697
## 7 0.6241954
## 8 0.6311975
## 9 0.6356655
## 10 0.6336647
## 11 0.6366401
## 12 0.6399036
## 13 0.6392513
## 14 0.6434043
## 15 0.6464512
## 16 0.6462525
## 17 0.6458874
## 18 0.6491884
## 19 0.6477945
## 20 0.6480499
## 21 0.6472381
## 22 0.6445113
## 23 0.6465735
## 24 0.6457208
## 25 0.6461998
## 26 0.6482132
## 27 0.6486326
## 28 0.6496663
## 29 0.6500242
## 30 0.6499457
## 31 0.6499628
## 32 0.6509105
## 33 0.6506456
## 34 0.6504918
## 35 0.6509949
## 36 0.6496751
## 37 0.6487479
## 38 0.6485889
## 39 0.6476139
## 40 0.6484988
## 41 0.6487328
## 42 0.6487954
## 43 0.6478067
## 44 0.6501784
## 45 0.6495676
## 46 0.6487787
## 47 0.6490995
## 48 0.6492741
## 49 0.6478356
## 50 0.6486335
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was k = 35.
model.knn %$%
bestTune %$%
k -> bestParameter
model.knn %$%
results %>%
ggplot(aes(k,F1)) +
geom_vline(xintercept = bestParameter,
color = "red") +
geom_point(color="#0D98E8") +
geom_line(color="#0D98E8") +
labs(x="#Neighbors",
y="F1 (Bootstrap)")
model.knn %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,eleito),y = eleito),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.knn %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.knn,.) %>%
F_Measure(test$situacao,.)
rpart.grid <- expand.grid(.cp = seq(from=0, to=0.1, by=0.005))
caret::train(x = select(train, -situacao),
y = train$situacao,
metric = "F1",
method = "rpart",
na.action = na.omit,
tuneGrid = rpart.grid,
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final")) -> model.tree
model.tree
## CART
##
## 6098 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 6098, 6098, 6098, 6098, 6098, 6098, ...
## Resampling results across tuning parameters:
##
## cp F1
## 0.000 0.6222605
## 0.005 0.6597754
## 0.010 0.6730765
## 0.015 0.6737400
## 0.020 0.6684035
## 0.025 0.6684085
## 0.030 0.6668214
## 0.035 0.6681258
## 0.040 0.6681258
## 0.045 0.6681258
## 0.050 0.6661671
## 0.055 0.6661671
## 0.060 0.6712018
## 0.065 0.6699034
## 0.070 0.6700850
## 0.075 0.6702300
## 0.080 0.6717618
## 0.085 0.6726706
## 0.090 0.6632295
## 0.095 0.6726590
## 0.100 0.6660222
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.015.
model.tree %$%
bestTune %$%
cp -> bestParameter
model.tree %$%
results %>%
ggplot(aes(cp,F1)) +
geom_vline(xintercept = bestParameter,
color = "red") +
geom_point(color="#0D98E8") +
geom_line(color="#0D98E8") +
labs(x="Complexity Parameter",
y="F1 (Bootstrap)")
model.tree %$%
finalModel %>%
fancyRpartPlot(sub="")
model.tree %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,Overall),y = Overall),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.tree %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.tree,.) %>%
F_Measure(test$situacao,.)
train(x = select(train, -situacao),
y = train$situacao,
metric = "F1",
na.action = na.exclude,
method='adaboost',
tuneLength=2,
trControl = trainControl(savePredictions = "final",
summaryFunction = f1,
classProbs = TRUE,
method = "boot")) -> model.ada
model.ada
## AdaBoost Classification Trees
##
## 6098 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4878, 4879, 4878, 4879, 4878
## Resampling results across tuning parameters:
##
## nIter method F1
## 50 Adaboost.M1 0.6501011
## 50 Real adaboost 0.6483831
## 100 Adaboost.M1 0.6611736
## 100 Real adaboost 0.6670238
##
## F1 was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 100 and method =
## Real adaboost.
model.ada %$%
results %>%
ggplot(aes(nIter,F1,
color=as.factor(method))) +
geom_point(shape=1) +
geom_line() +
labs(x="# Trees",y="F1 (Bootstrap)") +
guides(color = guide_legend(title = "Method"))
model.ada %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.ada,.) %>%
F_Measure(test$situacao,.)
train %>%
SMOTE(situacao ~ .,
data = .,
perc.over = 200,
perc.under=200) -> oversampled
cat("#### Train Shape",
"\n##### Observations: ",nrow(oversampled),
"\n##### Variables: ",ncol(oversampled))
## #### Train Shape
## ##### Observations: 5691
## ##### Variables: 39
oversampled %>%
group_by(situacao) %>%
summarise(num = n()) %>%
ungroup() %>%
mutate(total = sum(num),
proportion = num/total)
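A rough bookkeeping sketch for the SMOTE call above, assuming the DMwR semantics where perc.over = 200 creates two synthetic minority ("eleito") cases per original minority case and perc.under = 200 keeps two majority cases for each synthetic case created:

n_min <- sum(train$situacao == "eleito")           # minority cases before oversampling
c(minority = n_min * (1 + 200 / 100),              # originals plus synthetic cases
  majority = (200 / 100) * (200 / 100) * n_min)    # majority cases retained

Under that reading the resampled set has roughly 7 * n_min rows in total, which is consistent with the 5,691 observations reported above.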
rlGrid <- expand.grid( cost = c(0.02,0.1,2,20,100,200),
loss = c("L1", "L2_dual", "L2_primal"),
epsilon = seq(from=0.001,to=0.1, by=0.005) )
oversampled %>%
caret::train(situacao ~ .,
data= .,
method = "regLogistic",
metric = "F1",
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final"),
tuneGrid = rlGrid) -> model.rl.smote
model.rl.smote
## Regularized Logistic Regression
##
## 5691 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5691, 5691, 5691, 5691, 5691, 5691, ...
## Resampling results across tuning parameters:
##
## cost loss epsilon F1
## 2e-02 L1 0.001 0.8538309
## 2e-02 L1 0.006 0.8540300
## 2e-02 L1 0.011 0.8553297
## 2e-02 L1 0.016 0.8553614
## 2e-02 L1 0.021 0.8550155
## 2e-02 L1 0.026 0.8554397
## 2e-02 L1 0.031 0.8548420
## 2e-02 L1 0.036 0.8535600
## 2e-02 L1 0.041 0.8549967
## 2e-02 L1 0.046 0.8540555
## 2e-02 L1 0.051 0.8539844
## 2e-02 L1 0.056 0.8542698
## 2e-02 L1 0.061 0.8541436
## 2e-02 L1 0.066 0.8539469
## 2e-02 L1 0.071 0.8531028
## 2e-02 L1 0.076 0.8503106
## 2e-02 L1 0.081 0.8526496
## 2e-02 L1 0.086 0.8501218
## 2e-02 L1 0.091 0.8469466
## 2e-02 L1 0.096 0.8481336
## 2e-02 L2_dual 0.001 0.8536869
## 2e-02 L2_dual 0.006 0.8536869
## 2e-02 L2_dual 0.011 0.8536869
## 2e-02 L2_dual 0.016 0.8536869
## 2e-02 L2_dual 0.021 0.8536869
## 2e-02 L2_dual 0.026 0.8536604
## 2e-02 L2_dual 0.031 0.8536863
## 2e-02 L2_dual 0.036 0.8536604
## 2e-02 L2_dual 0.041 0.8536604
## 2e-02 L2_dual 0.046 0.8536869
## 2e-02 L2_dual 0.051 0.8536604
## 2e-02 L2_dual 0.056 0.8536869
## 2e-02 L2_dual 0.061 0.8536604
## 2e-02 L2_dual 0.066 0.8536869
## 2e-02 L2_dual 0.071 0.8536616
## 2e-02 L2_dual 0.076 0.8536877
## 2e-02 L2_dual 0.081 0.8536877
## 2e-02 L2_dual 0.086 0.8536877
## 2e-02 L2_dual 0.091 0.8537141
## 2e-02 L2_dual 0.096 0.8536604
## 2e-02 L2_primal 0.001 0.8536869
## 2e-02 L2_primal 0.006 0.8535285
## 2e-02 L2_primal 0.011 0.8535821
## 2e-02 L2_primal 0.016 0.8533720
## 2e-02 L2_primal 0.021 0.8531790
## 2e-02 L2_primal 0.026 0.8523491
## 2e-02 L2_primal 0.031 0.8524624
## 2e-02 L2_primal 0.036 0.8524892
## 2e-02 L2_primal 0.041 0.8525225
## 2e-02 L2_primal 0.046 0.8525225
## 2e-02 L2_primal 0.051 0.8527130
## 2e-02 L2_primal 0.056 0.8527130
## 2e-02 L2_primal 0.061 0.8528945
## 2e-02 L2_primal 0.066 0.8528945
## 2e-02 L2_primal 0.071 0.8528945
## 2e-02 L2_primal 0.076 0.8523373
## 2e-02 L2_primal 0.081 0.8523373
## 2e-02 L2_primal 0.086 0.8520629
## 2e-02 L2_primal 0.091 0.8517127
## 2e-02 L2_primal 0.096 0.8510357
## 1e-01 L1 0.001 0.8583723
## 1e-01 L1 0.006 0.8590873
## 1e-01 L1 0.011 0.8589444
## 1e-01 L1 0.016 0.8588426
## 1e-01 L1 0.021 0.8579992
## 1e-01 L1 0.026 0.8577336
## 1e-01 L1 0.031 0.8566834
## 1e-01 L1 0.036 0.8569420
## 1e-01 L1 0.041 0.8570155
## 1e-01 L1 0.046 0.8560039
## 1e-01 L1 0.051 0.8563881
## 1e-01 L1 0.056 0.8561794
## 1e-01 L1 0.061 0.8552046
## 1e-01 L1 0.066 0.8544922
## 1e-01 L1 0.071 0.8566927
## 1e-01 L1 0.076 0.8539058
## 1e-01 L1 0.081 0.8552251
## 1e-01 L1 0.086 0.8535566
## 1e-01 L1 0.091 0.8531086
## 1e-01 L1 0.096 0.8523900
## 1e-01 L2_dual 0.001 0.8593243
## 1e-01 L2_dual 0.006 0.8593243
## 1e-01 L2_dual 0.011 0.8593243
## 1e-01 L2_dual 0.016 0.8593243
## 1e-01 L2_dual 0.021 0.8593243
## 1e-01 L2_dual 0.026 0.8593243
## 1e-01 L2_dual 0.031 0.8593243
## 1e-01 L2_dual 0.036 0.8593243
## 1e-01 L2_dual 0.041 0.8593243
## 1e-01 L2_dual 0.046 0.8593243
## 1e-01 L2_dual 0.051 0.8593243
## 1e-01 L2_dual 0.056 0.8593243
## 1e-01 L2_dual 0.061 0.8593243
## 1e-01 L2_dual 0.066 0.8593243
## 1e-01 L2_dual 0.071 0.8593243
## 1e-01 L2_dual 0.076 0.8593243
## 1e-01 L2_dual 0.081 0.8593243
## 1e-01 L2_dual 0.086 0.8593508
## 1e-01 L2_dual 0.091 0.8593508
## 1e-01 L2_dual 0.096 0.8593508
## 1e-01 L2_primal 0.001 0.8593042
## 1e-01 L2_primal 0.006 0.8595058
## 1e-01 L2_primal 0.011 0.8589625
## 1e-01 L2_primal 0.016 0.8588698
## 1e-01 L2_primal 0.021 0.8589009
## 1e-01 L2_primal 0.026 0.8588683
## 1e-01 L2_primal 0.031 0.8587031
## 1e-01 L2_primal 0.036 0.8592199
## 1e-01 L2_primal 0.041 0.8587951
## 1e-01 L2_primal 0.046 0.8586075
## 1e-01 L2_primal 0.051 0.8582162
## 1e-01 L2_primal 0.056 0.8575988
## 1e-01 L2_primal 0.061 0.8573405
## 1e-01 L2_primal 0.066 0.8580074
## 1e-01 L2_primal 0.071 0.8580074
## 1e-01 L2_primal 0.076 0.8580074
## 1e-01 L2_primal 0.081 0.8575835
## 1e-01 L2_primal 0.086 0.8575835
## 1e-01 L2_primal 0.091 0.8575835
## 1e-01 L2_primal 0.096 0.8580478
## 2e+00 L1 0.001 0.8630215
## 2e+00 L1 0.006 0.8620900
## 2e+00 L1 0.011 0.8605691
## 2e+00 L1 0.016 0.8605038
## 2e+00 L1 0.021 0.8598250
## 2e+00 L1 0.026 0.8591707
## 2e+00 L1 0.031 0.8584852
## 2e+00 L1 0.036 0.8575296
## 2e+00 L1 0.041 0.8571389
## 2e+00 L1 0.046 0.8564196
## 2e+00 L1 0.051 0.8568481
## 2e+00 L1 0.056 0.8556583
## 2e+00 L1 0.061 0.8556827
## 2e+00 L1 0.066 0.8550816
## 2e+00 L1 0.071 0.8550941
## 2e+00 L1 0.076 0.8555661
## 2e+00 L1 0.081 0.8541527
## 2e+00 L1 0.086 0.8542077
## 2e+00 L1 0.091 0.8550299
## 2e+00 L1 0.096 0.8515433
## 2e+00 L2_dual 0.001 0.8633611
## 2e+00 L2_dual 0.006 0.8633611
## 2e+00 L2_dual 0.011 0.8633611
## 2e+00 L2_dual 0.016 0.8633611
## 2e+00 L2_dual 0.021 0.8633611
## 2e+00 L2_dual 0.026 0.8633347
## 2e+00 L2_dual 0.031 0.8633611
## 2e+00 L2_dual 0.036 0.8633611
## 2e+00 L2_dual 0.041 0.8633347
## 2e+00 L2_dual 0.046 0.8633611
## 2e+00 L2_dual 0.051 0.8633611
## 2e+00 L2_dual 0.056 0.8633347
## 2e+00 L2_dual 0.061 0.8633086
## 2e+00 L2_dual 0.066 0.8633086
## 2e+00 L2_dual 0.071 0.8633347
## 2e+00 L2_dual 0.076 0.8633347
## 2e+00 L2_dual 0.081 0.8632889
## 2e+00 L2_dual 0.086 0.8633347
## 2e+00 L2_dual 0.091 0.8633867
## 2e+00 L2_dual 0.096 0.8633344
## 2e+00 L2_primal 0.001 0.8631802
## 2e+00 L2_primal 0.006 0.8632088
## 2e+00 L2_primal 0.011 0.8625311
## 2e+00 L2_primal 0.016 0.8620412
## 2e+00 L2_primal 0.021 0.8616338
## 2e+00 L2_primal 0.026 0.8617593
## 2e+00 L2_primal 0.031 0.8615645
## 2e+00 L2_primal 0.036 0.8610353
## 2e+00 L2_primal 0.041 0.8611588
## 2e+00 L2_primal 0.046 0.8606701
## 2e+00 L2_primal 0.051 0.8603380
## 2e+00 L2_primal 0.056 0.8593506
## 2e+00 L2_primal 0.061 0.8598675
## 2e+00 L2_primal 0.066 0.8599000
## 2e+00 L2_primal 0.071 0.8601501
## 2e+00 L2_primal 0.076 0.8603673
## 2e+00 L2_primal 0.081 0.8609143
## 2e+00 L2_primal 0.086 0.8609143
## 2e+00 L2_primal 0.091 0.8609143
## 2e+00 L2_primal 0.096 0.8609143
## 2e+01 L1 0.001 0.8634660
## 2e+01 L1 0.006 0.8620763
## 2e+01 L1 0.011 0.8611021
## 2e+01 L1 0.016 0.8600686
## 2e+01 L1 0.021 0.8597277
## 2e+01 L1 0.026 0.8579331
## 2e+01 L1 0.031 0.8579397
## 2e+01 L1 0.036 0.8579594
## 2e+01 L1 0.041 0.8577182
## 2e+01 L1 0.046 0.8563854
## 2e+01 L1 0.051 0.8563497
## 2e+01 L1 0.056 0.8555920
## 2e+01 L1 0.061 0.8545562
## 2e+01 L1 0.066 0.8552778
## 2e+01 L1 0.071 0.8553063
## 2e+01 L1 0.076 0.8564034
## 2e+01 L1 0.081 0.8536629
## 2e+01 L1 0.086 0.8546537
## 2e+01 L1 0.091 0.8551635
## 2e+01 L1 0.096 0.8532121
## 2e+01 L2_dual 0.001 0.8647225
## 2e+01 L2_dual 0.006 0.8643226
## 2e+01 L2_dual 0.011 0.8645530
## 2e+01 L2_dual 0.016 0.8645646
## 2e+01 L2_dual 0.021 0.8643154
## 2e+01 L2_dual 0.026 0.8649277
## 2e+01 L2_dual 0.031 0.8645553
## 2e+01 L2_dual 0.036 0.8647538
## 2e+01 L2_dual 0.041 0.8647882
## 2e+01 L2_dual 0.046 0.8646306
## 2e+01 L2_dual 0.051 0.8645009
## 2e+01 L2_dual 0.056 0.8643955
## 2e+01 L2_dual 0.061 0.8647655
## 2e+01 L2_dual 0.066 0.8645841
## 2e+01 L2_dual 0.071 0.8646958
## 2e+01 L2_dual 0.076 0.8646026
## 2e+01 L2_dual 0.081 0.8647170
## 2e+01 L2_dual 0.086 0.8646956
## 2e+01 L2_dual 0.091 0.8645020
## 2e+01 L2_dual 0.096 0.8646657
## 2e+01 L2_primal 0.001 0.8632635
## 2e+01 L2_primal 0.006 0.8633430
## 2e+01 L2_primal 0.011 0.8620685
## 2e+01 L2_primal 0.016 0.8615168
## 2e+01 L2_primal 0.021 0.8616624
## 2e+01 L2_primal 0.026 0.8618644
## 2e+01 L2_primal 0.031 0.8618400
## 2e+01 L2_primal 0.036 0.8613683
## 2e+01 L2_primal 0.041 0.8611675
## 2e+01 L2_primal 0.046 0.8604526
## 2e+01 L2_primal 0.051 0.8602061
## 2e+01 L2_primal 0.056 0.8598627
## 2e+01 L2_primal 0.061 0.8602352
## 2e+01 L2_primal 0.066 0.8602479
## 2e+01 L2_primal 0.071 0.8605169
## 2e+01 L2_primal 0.076 0.8605169
## 2e+01 L2_primal 0.081 0.8610384
## 2e+01 L2_primal 0.086 0.8610384
## 2e+01 L2_primal 0.091 0.8610384
## 2e+01 L2_primal 0.096 0.8610384
## 1e+02 L1 0.001 0.8634602
## 1e+02 L1 0.006 0.8623150
## 1e+02 L1 0.011 0.8610625
## 1e+02 L1 0.016 0.8604169
## 1e+02 L1 0.021 0.8586845
## 1e+02 L1 0.026 0.8588839
## 1e+02 L1 0.031 0.8584691
## 1e+02 L1 0.036 0.8581030
## 1e+02 L1 0.041 0.8569379
## 1e+02 L1 0.046 0.8561318
## [ reached getOption("max.print") -- omitted 110 rows ]
##
## F1 was used to select the optimal model using the largest value.
## The final values used for the model were cost = 200, loss = L2_dual
## and epsilon = 0.051.
model.rl.smote %$%
results %>%
mutate(cost=as.factor(cost)) %>%
ggplot(aes(epsilon,F1,
color=cost)) +
geom_line() +
geom_point() +
labs(y= "F1 (Bootstrap)", x="Tolerance") +
facet_wrap(. ~ loss, labeller = "label_both") +
guides(color = guide_legend(title = "Cost")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
model.rl.smote %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,eleito),y = eleito),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.rl.smote %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.rl.smote,.) %>%
F_Measure(test$situacao,.)
neighborsGrid <- expand.grid(.k = seq(from=1, to=50, by=1))
oversampled %>%
train(situacao ~ .,
data = .,
metric = "F1",
method = "knn",
na.action = na.omit,
tuneGrid = neighborsGrid,
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final")) -> model.knn.smote
model.knn.smote
## k-Nearest Neighbors
##
## 5691 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5691, 5691, 5691, 5691, 5691, 5691, ...
## Resampling results across tuning parameters:
##
## k F1
## 1 0.9321092
## 2 0.9132311
## 3 0.9049864
## 4 0.9008830
## 5 0.9008448
## 6 0.8998784
## 7 0.9004886
## 8 0.8970152
## 9 0.8976531
## 10 0.8974074
## 11 0.8969770
## 12 0.8956613
## 13 0.8958010
## 14 0.8950967
## 15 0.8944800
## 16 0.8943504
## 17 0.8943644
## 18 0.8946889
## 19 0.8945043
## 20 0.8938185
## 21 0.8935417
## 22 0.8928582
## 23 0.8927143
## 24 0.8918316
## 25 0.8917310
## 26 0.8914433
## 27 0.8911126
## 28 0.8905211
## 29 0.8904550
## 30 0.8900654
## 31 0.8900685
## 32 0.8895356
## 33 0.8891479
## 34 0.8884953
## 35 0.8877799
## 36 0.8873453
## 37 0.8875215
## 38 0.8874952
## 39 0.8868673
## 40 0.8864589
## 41 0.8863021
## 42 0.8861829
## 43 0.8853884
## 44 0.8852885
## 45 0.8846439
## 46 0.8844596
## 47 0.8846346
## 48 0.8843556
## 49 0.8840617
## 50 0.8842155
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
model.knn.smote %$%
bestTune %$%
k -> bestParameter
model.knn.smote %$%
results %>%
ggplot(aes(k,F1)) +
geom_vline(xintercept = bestParameter,
color = "red") +
geom_point(color="#0D98E8") +
geom_line(color="#0D98E8") +
labs(x="#Neighbors",
y="F1 (Bootstrap)")
model.knn.smote %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,eleito),y = eleito),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.knn.smote %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
The train/validation results are overly optimistic because the imbalance correction was applied before resampling, so synthetic minority cases appear in both the analysis and assessment portions of each bootstrap sample; they should not be taken at face value.
test %>%
select(-situacao) %>%
predict(object=model.knn.smote,.) %>%
F_Measure(test$situacao,.)
rpart.grid <- expand.grid(.cp = seq(from=0, to=0.1, by=0.005))
caret::train(x = select(oversampled, -situacao),
y = oversampled$situacao,
metric = "F1",
method = "rpart",
na.action = na.omit,
tuneGrid = rpart.grid,
trControl = trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = f1,
savePredictions = "final")) -> model.tree.smote
model.tree.smote
## CART
##
## 5691 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5691, 5691, 5691, 5691, 5691, 5691, ...
## Resampling results across tuning parameters:
##
## cp F1
## 0.000 0.8861902
## 0.005 0.8959433
## 0.010 0.8942471
## 0.015 0.8901458
## 0.020 0.8901458
## 0.025 0.8901458
## 0.030 0.8901458
## 0.035 0.8901458
## 0.040 0.8901458
## 0.045 0.8901458
## 0.050 0.8901458
## 0.055 0.8901458
## 0.060 0.8901458
## 0.065 0.8901458
## 0.070 0.8901458
## 0.075 0.8901458
## 0.080 0.8901458
## 0.085 0.8901458
## 0.090 0.8901458
## 0.095 0.8901458
## 0.100 0.8901458
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005.
model.tree.smote %$%
bestTune %$%
cp -> bestParameter
model.tree.smote %$%
results %>%
ggplot(aes(cp,F1)) +
geom_vline(xintercept = bestParameter,
color = "red") +
geom_point(color="#0D98E8") +
geom_line(color="#0D98E8") +
labs(x="Complexity Parameter",
y="F1 (Bootstrap)")
model.tree.smote %>%
varImp() %$%
importance %>%
as.data.frame() %>%
rownames_to_column(var="Feature") %>%
mutate(Feature = tolower(Feature)) %>%
ggplot() +
geom_col(aes(x = reorder(Feature,Overall),y = Overall),
position = position_dodge(width=0.8),width=0.6) +
labs(x="Feature", y="Overall Importance") +
coord_flip()
model.tree.smote %$%
finalModel %>%
fancyRpartPlot(sub="")
model.tree.smote %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
test %>%
select(-situacao) %>%
predict(object=model.tree.smote,.) %>%
F_Measure(test$situacao,.)
train(x = select(oversampled, -situacao),
y = oversampled$situacao,
metric = "F1",
na.action = na.exclude,
method='adaboost',
tuneLength=2,
trControl = trainControl(savePredictions = "final",
summaryFunction = f1,
classProbs = TRUE,
method = "boot")) -> model.ada.smote
model.ada.smote
## AdaBoost Classification Trees
##
## 5691 samples
## 38 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5691, 5691, 5691, 5691, 5691, 5691, ...
## Resampling results across tuning parameters:
##
## nIter method F1
## 50 Adaboost.M1 0.9352572
## 50 Real adaboost 0.9343263
## 100 Adaboost.M1 0.9361092
## 100 Real adaboost 0.9353782
##
## F1 was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 100 and method
## = Adaboost.M1.
model.ada.smote %$%
results %>%
ggplot(aes(nIter,F1,
color=as.factor(method))) +
geom_point(shape=1) +
geom_line() +
labs(x="# Trees",y="F1 (Bootstrap)") +
guides(color = guide_legend(title = "Method"))
model.ada.smote %$%
pred %>%
F_Measure(expected = .$obs,
predicted = .$pred)
The train/validation results are again overly optimistic because of the imbalance correction and should not be taken at face value.
Actual performance should be assessed on the test set, as below and in the comparison sketch at the end of this section.
test %>%
select(-situacao) %>%
predict(object=model.ada.smote,.) %>%
F_Measure(test$situacao,.)
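To make the test-stage comparison across models explicit, the individual evaluations above could be collected into a single table; a sketch along these lines (reusing the F_Measure helper and the fitted model objects already defined):

list(rl = model.rl, knn = model.knn, tree = model.tree, ada = model.ada,
     rl_smote = model.rl.smote, knn_smote = model.knn.smote,
     tree_smote = model.tree.smote, ada_smote = model.ada.smote) %>%
  purrr::map_df(~ F_Measure(expected = test$situacao,
                            predicted = predict(.x, select(test, -situacao))),
                .id = "model")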