data <- read.csv("Data/train.csv")
part <- createDataPartition(y = data$situacao, p=0.8, list=FALSE)
data <- data[part, ]
val <- data[-part, ]
test <- read.csv("Data/test.csv")
data
eleito<-sum(as.numeric(data$situacao == "eleito"))
neleito<-sum(as.numeric(data$situacao == "nao_eleito"))
ballance <- data.frame(group= c("Não Eleito", "Eleito"), value = c(neleito, eleito))
head(ballance)
pie <- ggplot(ballance, aes(x = factor(1), y=value,fill=group) ) + geom_bar(width = 1,stat="identity")+coord_polar(theta = "y")
pie + scale_fill_manual(values=c("#229954", "#C0392B"))
Podemos reparar que Não eleito está em proporção bem maior do que Eleito(1026Eleitos contra 6596NãoEleitos). Esse desbalanceamento pode enviezar o classificador a predizer com uma frequencia maior o não eleito, aumentando então o Falso Negativo, ou seja, predizendo quem seria eleito com o label Não Eleito. Tentativas de balancear as duas classes partem de duas ideias, ou eliminar exemplos de não eleitos (DOWNSAMPLING), ou sintetizar exemplos de Eleitos a partir dos exemplos reais (UPSAMPLING).
#Removendo Redundâncias
formula <- as.formula(situacao ~ partido + quantidade_doacoes + total_despesa + recursos_proprios + recursos_de_partido_politico + sexo + grau)
#Cross Validation
set.seed(1984)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
#Sampling
fitControl$sampling <- "smote"
cross.knn <- train(formula,
data = data,
method = "knn",
tuneLength = 10,
trControl = fitControl,
preProcess = c('scale', 'center'),
na.action = na.omit)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCB
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCB
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCB
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
cross.knn
## k-Nearest Neighbors
##
## 6098 samples
## 7 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## Pre-processing: scaled (41), centered (41)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 5488, 5488, 5488, 5488, 5488, 5488, ...
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8961421 0.5727824
## 7 0.8990409 0.5772318
## 9 0.8979452 0.5684262
## 11 0.9005163 0.5801125
## 13 0.8989842 0.5716431
## 15 0.8995845 0.5736501
## 17 0.8983278 0.5637172
## 19 0.8977261 0.5635306
## 21 0.8978896 0.5627169
## 23 0.8959239 0.5547041
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
cross.reg <- train(formula,
data = data,
method = "glm",
trControl = fitControl,
preProcess = c('scale', 'center'),
na.action = na.omit)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cross.reg
## Generalized Linear Model
##
## 6098 samples
## 7 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## Pre-processing: scaled (41), centered (41)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 5488, 5488, 5489, 5488, 5488, 5488, ...
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results:
##
## Accuracy Kappa
## 0.8875035 0.5752475
cross.ad <- train(formula,
data = data,
method = "rpart",
trControl = fitControl,
preProcess = c('scale', 'center'),
na.action = na.omit)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: partidoPCO
cross.ad
## CART
##
## 6098 samples
## 7 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## Pre-processing: scaled (41), centered (41)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 5488, 5488, 5489, 5489, 5489, 5488, ...
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01339829 0.8678279 0.5809528
## 0.09622412 0.8697414 0.5904495
## 0.25334957 0.8697414 0.5904495
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.2533496.
formulaAda <- as.formula(situacao ~ quantidade_doacoes + total_despesa + recursos_proprios + recursos_de_partido_politico)
fitControlAda <- trainControl(method = "repeatedcv", number = 3, repeats = 2)
#Sampling
fitControlAda$sampling <- "smote"
cross.ada <- train(formulaAda,
data = data,
method = "ada",
trControl = fitControlAda,
preProcess = c('scale', 'center'),
na.action = na.omit)
cross.ada
## Boosted Classification Trees
##
## 6098 samples
## 4 predictors
## 2 classes: 'eleito', 'nao_eleito'
##
## Pre-processing: scaled (4), centered (4)
## Resampling: Cross-Validated (3 fold, repeated 2 times)
## Summary of sample sizes: 4065, 4065, 4066, 4065, 4066, 4065, ...
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## maxdepth iter Accuracy Kappa
## 1 50 0.8658580 0.5841438
## 1 100 0.8681541 0.5886766
## 1 150 0.8689739 0.5896718
## 2 50 0.8749592 0.5990937
## 2 100 0.8765988 0.6019187
## 2 150 0.8779929 0.6045187
## 3 50 0.8807805 0.6087962
## 3 100 0.8811084 0.6079780
## 3 150 0.8811903 0.6069486
##
## Tuning parameter 'nu' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 150, maxdepth = 3 and nu
## = 0.1.
pred = predict(cross.reg, newdata=val, na.action = na.pass)
confMatrix <- confusionMatrix(data=pred, val$situacao)
reg.acc <- confMatrix$overall['Accuracy']
reg.rec <- recall(confMatrix$table, relevant = "eleito")
reg.pre <- precision(confMatrix$table, relevant = "eleito")
reg.fmes <- F_meas(confMatrix$table, relevant = "eleito")
pred = predict(cross.knn, newdata=val, na.action = na.pass)
confMatrix <- confusionMatrix(data=pred, val$situacao)
knn.acc <- confMatrix$overall['Accuracy']
knn.rec <- recall(confMatrix$table, relevant = "eleito")
knn.pre <- precision(confMatrix$table, relevant = "eleito")
knn.fmes <- F_meas(confMatrix$table, relevant = "eleito")
pred = predict(cross.ad, newdata=val, na.action = na.pass)
confMatrix <- confusionMatrix(data=pred, val$situacao)
ad.acc <- confMatrix$overall['Accuracy']
ad.rec <- recall(confMatrix$table, relevant = "eleito")
ad.pre <- precision(confMatrix$table, relevant = "eleito")
ad.fmes <- F_meas(confMatrix$table, relevant = "eleito")
pred = predict(cross.reg, newdata=val, na.action = na.pass)
confMatrix <- confusionMatrix(data=pred, val$situacao)
ada.acc <- confMatrix$overall['Accuracy']
ada.rec <- recall(confMatrix$table, relevant = "eleito")
ada.pre <- precision(confMatrix$table, relevant = "eleito")
ada.fmes <- F_meas(confMatrix$table, relevant = "eleito")
aval <- matrix(c(reg.acc,reg.rec,reg.pre,reg.fmes,knn.acc,knn.rec,knn.pre,knn.fmes,ad.acc,ad.rec,ad.pre,ad.fmes,ada.acc,ada.rec,ada.pre,ada.fmes),ncol=4,byrow=TRUE)
colnames(aval) <- c("acc","rec","pre", "fmes")
rownames(aval) <- c("reg","knn","ad", "ada")
aval <- as.table(aval)
aval
## acc rec pre fmes
## reg 0.8823529 0.6787879 0.5517241 0.6086957
## knn 0.9044118 0.6484848 0.6445783 0.6465257
## ad 0.8700980 0.9575758 0.5096774 0.6652632
## ada 0.8823529 0.6787879 0.5517241 0.6086957
Regressão e Knn tiveram a maior acurácia. Arvore de decisão tem o maior recall, ou seja dentro dos possíveis positidos ele acertou bastante, não deixando muitos eleitos escaparem, mas a precisão foi a menor, ou seja, possivelmente o modelo ád está enviezada a classificar os candidatos como eleitos. O fscore leva em consideração recall e precision, nessa métrica ad tem o melhor resultado.
varImp(cross.reg)
## glm variable importance
##
## only 20 most important variables shown (out of 41)
##
## Overall
## total_despesa 100.000
## partidoPRONA 61.057
## quantidade_doacoes 53.450
## recursos_proprios 50.476
## recursos_de_partido_politico 41.969
## partidoPT 41.386
## sexoMASCULINO 40.713
## partidoPFL 23.118
## partidoPMDB 23.050
## `grauSUPERIOR COMPLETO` 22.804
## partidoPR 20.943
## partidoPP 17.298
## partidoPL 16.316
## partidoPSB 14.417
## partidoPDT 13.903
## `partidoPC do B` 13.540
## partidoPRB 11.980
## partidoPSC 10.254
## partidoPSDB 9.309
## partidoPSL 9.207
varImp(cross.knn)
## ROC curve variable importance
##
## Importance
## total_despesa 100.000
## quantidade_doacoes 86.257
## recursos_proprios 53.208
## recursos_de_partido_politico 31.892
## grau 10.951
## partido 6.723
## sexo 0.000
varImp(cross.ad)
## rpart variable importance
##
## only 20 most important variables shown (out of 42)
##
## Overall
## total_despesa 100.00
## quantidade_doacoes 76.86
## partidoPT 56.87
## recursos_proprios 47.62
## grauSUPERIOR COMPLETO 30.22
## partidoPRP 0.00
## `grauSUPERIOR INCOMPLETO` 0.00
## partidoPDT 0.00
## partidoPAN 0.00
## partidoPSTU 0.00
## partidoPV 0.00
## partidoPCO 0.00
## recursos_de_partido_politico 0.00
## partidoPMN 0.00
## partidoPSOL 0.00
## `grauSUPERIOR COMPLETO` 0.00
## partidoPRB 0.00
## `grauENSINO FUNDAMENTAL INCOMPLETO` 0.00
## partidoPL 0.00
## `grauLÊ E ESCREVE` 0.00
varImp(cross.ada)
## ROC curve variable importance
##
## Importance
## total_despesa 100.00
## quantidade_doacoes 79.82
## recursos_proprios 31.30
## recursos_de_partido_politico 0.00
Alinhado com os resultados anteriores as variáveis relacionadas com financiamento de campanha são os mais importantes na classificação, sendo total_despera, quatidade_doacoes e recursos_propios sempre entre os mais importantes.
Variáveis mais importantes:
REG:total_despesa, quantidade_doacoes, partidoPRONA,recursos_proprios, partidoPT, partidoPFL, sexoMASCULINO
KNN: total_despesa, quantidade_doacoes, recursos_proprios, recursos_de_partido_politico, grau, partido, sexo
AD:total_despesa, quantidade_doacoes, partidoPT, recursos_proprios, grauSUPERIOR COMPLETO, partidoPRONA
ADA: total_despesa, recursos_proprios, recursos_de_partido_politico
prediction <- predict(cross.reg, test)
data_out <- data.frame(ID = test$sequencial_candidato, Predicted = prediction)
data_out$ID <-as.character(data_out$ID)
data_out %>% write_csv(path = "out.csv")