library(readr)
library(ggplot2)
library(dplyr)   # needed below for %>%, group_by() and summarise()
library(leaps)
library(caret)
## Loading required package: lattice
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#library(C50)
library(rpart)
library(rpart.plot)
library(ROSE)
## Loaded ROSE 0.0-3
# path on the lab (lcc) machine
df <- read.csv("/home/joaolaf/Área de Trabalho/train.csv", encoding = "utf-8")
# path on the notebook
#df <- read.csv("C:\\Users\\João Lucas\\Desktop\\train.csv", encoding = "utf-8")
colnames(df)[13] <- "recursos_pessoas_fisicas"
# 75/25 split stratified by the outcome (note: no set.seed(), so the split is not reproducible)
dataPartition <- createDataPartition(y = df$situacao_final, p = 0.75, list = FALSE)
treino <- df[dataPartition,]
teste <- df[-dataPartition,]
Since we want to know whether or not each candidate will be elected, we will predict each candidate's final status (situacao_final) in the 2014 election. The chart below shows a large imbalance between the elected and non-elected classes.
a <- treino %>%
  group_by(situacao_final) %>%
  summarise(totalAparece = n())
a$indexS <- factor(a$situacao_final, levels = a$situacao_final)
p <- plot_ly(a, x = ~indexS, y = ~totalAparece, type = 'bar', name = 'Situations') %>%
  layout(title = 'Frequency of each final status',
         xaxis = list(title = ""),
         yaxis = list(title = "How many times each status appears"),
         barmode = 'stack')
p
Since this imbalance could bias our results, we will adjust the data as best we can. There are three simple resampling strategies: "up", which inflates the minority class; "down", which shrinks the majority class; and "both", which does a bit of each. In this study I chose "both": with "down" we would be left with very little data, while "up" would have to create many new synthetic rows, which could affect the final result. The sketch below puts the three options side by side.
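A minimal sketch of the three options (the names bal_over, bal_under and bal_both are illustrative; note that in ROSE's ovun.sample the "up"/"down" strategies are spelled "over" and "under"):
bal_over  <- ovun.sample(situacao_final ~ ., data = treino, method = "over",  seed = 1)$data
bal_under <- ovun.sample(situacao_final ~ ., data = treino, method = "under", seed = 1)$data
bal_both  <- ovun.sample(situacao_final ~ ., data = treino, method = "both",  p = 0.5, seed = 1)$data
# compare the class counts produced by each strategy
sapply(list(over = bal_over, under = bal_under, both = bal_both),
       function(d) table(d$situacao_final))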
treino_balanced <- ovun.sample(situacao_final ~ ., data = treino, p = 0.5,
                               seed = 1, method = "both")$data
table(treino_balanced$situacao_final)
##
## nao_eleito eleito
## 1615 1487
b <- treino_balanced %>%
  group_by(situacao_final) %>%
  summarise(totalApareceBalanced = n())
b$indexS <- factor(b$situacao_final, levels = b$situacao_final)
p_balanced <- plot_ly(b, x = ~indexS, y = ~totalApareceBalanced, type = 'bar', name = 'Situations') %>%
  layout(title = 'Frequency of each final status (balanced)',
         xaxis = list(title = ""),
         yaxis = list(title = "How many times each status appears"),
         barmode = 'stack')
p_balanced
After balancing, the two classes end up roughly the same size.
For our formula we will use the variables that worked best when we predicted the number of votes (lab 3 of the ad2 course: https://rpubs.com/joaolcaas/predicao_2014_ad2).
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 10,
verboseIter = FALSE)
formula.votos = as.formula(situacao_final ~ total_receita + total_despesa +
                             quantidade_despesas + recursos_de_pessoas_juridicas +
                             recursos_de_partidos + quantidade_doacoes +
                             quantidade_fornecedores + media_receita +
                             quantidade_doadores)
To test our formula we will use logistic regression, a technique similar in spirit to linear regression, with the difference that the dependent variable is discrete: the model maps a linear predictor to a class probability through the logistic function, as the sketch below illustrates.
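A toy illustration of that mapping (the sigmoid function and the z values here are purely illustrative, not taken from the fitted model):
# the logistic link squashes the linear predictor into a probability in (0, 1)
sigmoid <- function(z) 1 / (1 + exp(-z))
z <- c(-4, -1, 0, 1, 4)               # illustrative linear-predictor values
data.frame(z = z,
           prob_eleito = sigmoid(z),  # P(eleito | z) under the model
           classe = ifelse(sigmoid(z) > 0.5, "eleito", "nao_eleito"))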
regressaoLogistica <- train(formula.votos,
                            data = treino_balanced,
                            method = "glm",
                            trControl = ctrl,
                            family = "binomial",  # because the outcome is binary
                            na.action = na.omit)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (warning repeated for many resamples; duplicates omitted)
regressaoLogistica
## Generalized Linear Model
##
## 3102 samples
## 9 predictors
## 2 classes: 'nao_eleito', 'eleito'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2791, 2792, 2791, 2792, 2792, 2792, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9016111 0.8025106
varImp(regressaoLogistica)
## glm variable importance
##
## Overall
## media_receita 100.00
## recursos_de_partidos 97.93
## quantidade_doadores 76.64
## quantidade_doacoes 67.56
## recursos_de_pessoas_juridicas 65.01
## quantidade_fornecedores 45.13
## total_receita 28.65
## total_despesa 21.31
## quantidade_despesas 0.00
Our logistic regression shows that the variables that dominated the vote-count prediction (such as total_receita, which had an importance of 100 there) dropped considerably and no longer lead the ranking. We can also read off the accuracy, about 0.90. Accuracy is an important metric for this kind of classification because it tells us the proportion of correctly classified observations.
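The cross-validated accuracy quoted above can be read straight off the caret object (results is a standard field of caret's train output):
# cross-validated performance stored by caret
regressaoLogistica$results[, c("Accuracy", "Kappa")]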
Let's try another formula, adding new attributes while keeping only the variables the logistic regression ranked as most important.
formula.situacao.final = as.formula(situacao_final ~ media_receita + quantidade_doacoes +
                                      recursos_de_partidos + descricao_cor_raca +
                                      despesa_max_campanha + sexo + grau)
regressaoLogisticaNovaFormula <- train(formula.situacao.final,
                                       data = treino_balanced,
                                       method = "glm",
                                       trControl = ctrl,
                                       family = "binomial",  # because the outcome is binary
                                       na.action = na.omit)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (warning repeated for many resamples; duplicates omitted)
regressaoLogisticaNovaFormula
## Generalized Linear Model
##
## 3102 samples
## 7 predictors
## 2 classes: 'nao_eleito', 'eleito'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2792, 2791, 2792, 2791, 2792, 2792, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8995475 0.7985564
With the new formula the accuracy barely moved, so we will keep using the formula with the variables from the previous lab.
By partitioning the "tree", this model produces its results from successive splits: each split tests whether the value of a given variable pushes an observation toward a positive or a negative outcome.
arvore1 <- train(formula.votos,
                 data = treino_balanced, method = "rpart",
                 trControl = ctrl,
                 cp = 0.001,     # complexity parameter, passed through to rpart
                 maxdepth = 20)
arvore1
## CART
##
## 3102 samples
## 9 predictors
## 2 classes: 'nao_eleito', 'eleito'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2793, 2792, 2791, 2792, 2791, 2791, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.004034970 0.9397173 0.8797151
## 0.006388702 0.9355278 0.8713408
## 0.852051110 0.6910077 0.3588454
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.00403497.
Since the algorithm selects the model with the highest accuracy, the best model uses cp of approximately 0.004. It returned a better accuracy than the logistic regression, suggesting the classes can be separated more cleanly. With rpart.plot already loaded, we can also inspect the fitted tree, as sketched below.
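A quick way to visualize the winning tree (a sketch; it relies on the fact that caret stores the underlying rpart fit in arvore1$finalModel):
# plot the final rpart tree selected by caret
rpart.plot(arvore1$finalModel,
           type = 2,      # label every node
           extra = 104,   # show class probabilities and % of observations
           main = "Final decision tree (cp ~ 0.004)")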
Like the other techniques, AdaBoost will help us predict who gets elected, but with more power: it studies each round's predictions and increases the weights of the observations that were misclassified, so that the next weak learner focuses on them. The sketch below walks through one round of that reweighting.
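One AdaBoost.M1 round on toy data (every value here, y, preds and the weights, is illustrative, not taken from the fitted model):
y     <- c(1, 1, -1, -1, 1)             # true labels
preds <- c(1, -1, -1, 1, 1)             # a weak learner's predictions (2 mistakes)
w     <- rep(1 / length(y), length(y))  # start from uniform observation weights
err   <- sum(w * (preds != y))          # weighted error of the weak learner
alpha <- 0.5 * log((1 - err) / err)     # this learner's vote in the ensemble
w     <- w * exp(-alpha * y * preds)    # up-weight mistakes, down-weight hits
w / sum(w)                              # renormalized weights for the next round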
Since this technique takes a while to run, we will shrink the formula used for training.
formula.adaboost = as.formula(situacao_final ~ media_receita + quantidade_doacoes + recursos_de_partidos)
adaboost <- train(formula.adaboost,
data=treino_balanced,
trControl = ctrl,
method = "adaboost")
adaboost
## AdaBoost Classification Trees
##
## 3102 samples
## 3 predictors
## 2 classes: 'nao_eleito', 'eleito'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2793, 2793, 2791, 2792, 2792, 2792, ...
## Resampling results across tuning parameters:
##
## nIter method Accuracy Kappa
## 50 Adaboost.M1 0.9792744 0.9585467
## 50 Real adaboost 0.9781474 0.9563006
## 100 Adaboost.M1 0.9801769 0.9603483
## 100 Real adaboost 0.9782437 0.9564905
## 150 Adaboost.M1 0.9804348 0.9608638
## 150 Real adaboost 0.9779207 0.9558461
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 150 and method
## = Adaboost.M1.
As we can see, AdaBoost turned out to be very accurate. Across the different numbers of iterations it stays above 0.97, indicating a very high proportion of correctly classified observations.
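The winning tuning values can also be read directly from the caret object (bestTune is a standard field of caret's train output):
# tuning combination selected by cross-validation
adaboost$bestTune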
As we have already seen, accuracy is the proportion of correctly classified observations.
Precision is the share of "hits": of the observations the model labeled as a class, how many really belonged to it? Recall is how often the classifier finds the examples of a class: of everything that truly is class Y, how much did we label Y? F-measure is the harmonic mean of precision and recall, balancing false positives against false negatives. Since the same four computations are repeated for every model below, a compact helper is sketched next.
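A compact helper for the four metrics (a sketch; confusion_metrics is my own name, not from the original code, and it assumes the same "eleito"/"nao_eleito" labels):
# accuracy, precision, recall and F-measure, treating "eleito" as the positive class
confusion_metrics <- function(truth, pred, positive = "eleito") {
  TP <- sum(truth == positive & pred == positive)
  TN <- sum(truth != positive & pred != positive)
  FP <- sum(truth != positive & pred == positive)
  FN <- sum(truth == positive & pred != positive)
  c(accuracy  = (TP + TN) / (TP + TN + FP + FN),
    precision = TP / (TP + FP),
    recall    = TP / (TP + FN),
    fmeasure  = 2 * TP / (2 * TP + FP + FN))  # harmonic mean of precision and recall
}
# e.g. confusion_metrics(teste$situacao_final, teste$predicao.reg)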
First, let's compute these values on the test set, one algorithm at a time.
teste$predicao.reg <- predict(regressaoLogistica, teste)
TP.teste.reg <- teste %>% filter(situacao_final == "eleito", predicao.reg== "eleito") %>% nrow()
TN.teste.reg <- teste %>% filter(situacao_final == "nao_eleito" , predicao.reg == "nao_eleito" ) %>% nrow()
FP.teste.reg <- teste %>% filter(situacao_final == "nao_eleito" , predicao.reg == "eleito") %>% nrow()
FN.teste.reg <- teste %>% filter(situacao_final == "eleito", predicao.reg == "nao_eleito" ) %>% nrow()
accuracy.teste.reg <- (TP.teste.reg + TN.teste.reg)/(TP.teste.reg + TN.teste.reg + FP.teste.reg + FN.teste.reg)
precision.teste.reg <- TP.teste.reg / (TP.teste.reg + FP.teste.reg)
recall.teste.reg <- TP.teste.reg / (TP.teste.reg + FN.teste.reg)
Fmeasure.teste.reg <- 2*(recall.teste.reg*precision.teste.reg)/(recall.teste.reg+precision.teste.reg)
accuracy.teste.reg
## [1] 0.9119071
precision.teste.reg
## [1] 0.5384615
recall.teste.reg
## [1] 0.875
Fmeasure.teste.reg
## [1] 0.6666667
teste$predicao.arvore <- predict(arvore1, teste)
TP.teste.arvore <- teste %>% filter(situacao_final == "eleito", predicao.arvore== "eleito") %>% nrow()
TN.teste.arvore <- teste %>% filter(situacao_final == "nao_eleito" , predicao.arvore == "nao_eleito" ) %>% nrow()
FP.teste.arvore <- teste %>% filter(situacao_final == "nao_eleito" , predicao.arvore == "eleito") %>% nrow()
FN.teste.arvore <- teste %>% filter(situacao_final == "eleito", predicao.arvore == "nao_eleito" ) %>% nrow()
accuracy.teste.arvore <- (TP.teste.arvore + TN.teste.arvore)/(TP.teste.arvore + TN.teste.arvore + FP.teste.arvore+ FN.teste.arvore)
precision.teste.arvore <- TP.teste.arvore / (TP.teste.arvore + FP.teste.arvore)
recall.teste.arvore <- TP.teste.arvore / (TP.teste.arvore + FN.teste.arvore)
Fmeasure.teste.arvore <- 2*(recall.teste.arvore*precision.teste.arvore)/(recall.teste.arvore+precision.teste.arvore)
accuracy.teste.arvore
## [1] 0.8838335
precision.teste.arvore
## [1] 0.4607843
recall.teste.arvore
## [1] 0.9038462
Fmeasure.teste.arvore
## [1] 0.6103896
teste$predicao.adaboost<- predict(adaboost, teste)
TP.teste.adaboost <- teste %>% filter(situacao_final == "eleito", predicao.adaboost == "eleito") %>% nrow()
TN.teste.adaboost <- teste %>% filter(situacao_final == "nao_eleito" , predicao.adaboost == "nao_eleito" ) %>% nrow()
FP.teste.adaboost <- teste %>% filter(situacao_final == "nao_eleito" , predicao.adaboost == "eleito") %>% nrow()
FN.teste.adaboost <- teste %>% filter(situacao_final == "eleito", predicao.adaboost == "nao_eleito" ) %>% nrow()
accuracy.teste.adaboost <- (TP.teste.adaboost + TN.teste.adaboost)/(TP.teste.adaboost + TN.teste.adaboost + FP.teste.adaboost + FN.teste.adaboost)
precision.teste.adaboost <- TP.teste.adaboost / (TP.teste.adaboost + FP.teste.adaboost)
recall.teste.adaboost <- TP.teste.adaboost / (TP.teste.adaboost + FN.teste.adaboost)
Fmeasure.teste.adaboost <- 2*(recall.teste.adaboost*precision.teste.adaboost)/(recall.teste.adaboost+precision.teste.adaboost)
accuracy.teste.adaboost
## [1] 0.9215876
precision.teste.adaboost
## [1] 0.5905512
recall.teste.adaboost
## [1] 0.7211538
Fmeasure.teste.adaboost
## [1] 0.6493506
On the test set, the accuracies are quite close: logistic regression and AdaBoost both stay above 0.90. The other metrics vary more. The tree has the highest recall (about 0.90), showing it finds most of the truly elected candidates, but its precision is well below the other two. F-measure, in turn, is very similar across the three models.
treino$predicao.reg<- predict(regressaoLogistica, treino)
TP.treino.reg <- treino %>% filter(situacao_final == "eleito", predicao.reg == "eleito") %>% nrow()
TN.treino.reg <- treino %>% filter(situacao_final == "nao_eleito" , predicao.reg == "nao_eleito" ) %>% nrow()
FP.treino.reg <- treino %>% filter(situacao_final == "nao_eleito" , predicao.reg == "eleito") %>% nrow()
FN.treino.reg <- treino %>% filter(situacao_final == "eleito", predicao.reg == "nao_eleito" ) %>% nrow()
accuracy.treino.reg <- (TP.treino.reg + TN.treino.reg)/(TP.treino.reg + TN.treino.reg + FP.treino.reg + FN.treino.reg)
precision.treino.reg <- TP.treino.reg / (TP.treino.reg + FP.treino.reg)
recall.treino.reg <- TP.treino.reg / (TP.treino.reg + FN.treino.reg)
Fmeasure.treino.reg <- 2*(recall.treino.reg*precision.treino.reg)/(recall.treino.reg+precision.treino.reg)
accuracy.treino.reg
## [1] 0.9229529
precision.treino.reg
## [1] 0.5768421
recall.treino.reg
## [1] 0.8782051
Fmeasure.treino.reg
## [1] 0.6963151
treino$predicao.arvore<- predict(arvore1, treino)
TP.predicao.arvore <- treino %>% filter(situacao_final == "eleito", predicao.arvore == "eleito") %>% nrow()
TN.predicao.arvore <- treino %>% filter(situacao_final == "nao_eleito" , predicao.arvore == "nao_eleito" ) %>% nrow()
FP.predicao.arvore <- treino %>% filter(situacao_final == "nao_eleito" , predicao.arvore == "eleito") %>% nrow()
FN.predicao.arvore <- treino %>% filter(situacao_final == "eleito", predicao.arvore == "nao_eleito" ) %>% nrow()
accuracy.predicao.arvore <- (TP.predicao.arvore + TN.predicao.arvore)/(TP.predicao.arvore + TN.predicao.arvore + FP.predicao.arvore + FN.predicao.arvore)
precision.predicao.arvore <- TP.predicao.arvore / (TP.predicao.arvore + FP.predicao.arvore)
recall.predicao.arvore <- TP.predicao.arvore / (TP.predicao.arvore + FN.predicao.arvore)
Fmeasure.predicao.arvore <- 2*(recall.predicao.arvore*precision.predicao.arvore)/(recall.predicao.arvore+precision.predicao.arvore)
accuracy.predicao.arvore
## [1] 0.9019987
precision.predicao.arvore
## [1] 0.5066007
recall.predicao.arvore
## [1] 0.9839744
Fmeasure.predicao.arvore
## [1] 0.6688453
treino$predicao.adaboost<- predict(adaboost, treino)
TP.predicao.adaboost <- treino %>% filter(situacao_final == "eleito", predicao.adaboost == "eleito") %>% nrow()
TN.predicao.adaboost <- treino %>% filter(situacao_final == "nao_eleito" , predicao.adaboost == "nao_eleito" ) %>% nrow()
FP.predicao.adaboost <- treino %>% filter(situacao_final == "nao_eleito" , predicao.adaboost == "eleito") %>% nrow()
FN.predicao.adaboost <- treino %>% filter(situacao_final == "eleito", predicao.adaboost == "nao_eleito" ) %>% nrow()
accuracy.predicao.adaboost <- (TP.predicao.adaboost + TN.predicao.adaboost)/(TP.predicao.adaboost + TN.predicao.adaboost + FP.predicao.adaboost + FN.predicao.adaboost)
precision.predicao.adaboost <- TP.predicao.adaboost / (TP.predicao.adaboost + FP.predicao.adaboost)
recall.predicao.adaboost <- TP.predicao.adaboost / (TP.predicao.adaboost + FN.predicao.adaboost)
Fmeasure.predicao.adaboost <- 2*(recall.predicao.adaboost*precision.predicao.adaboost)/(recall.predicao.adaboost+precision.predicao.adaboost)
accuracy.predicao.adaboost
## [1] 0.9677627
precision.predicao.adaboost
## [1] 0.7585366
recall.predicao.adaboost
## [1] 0.9967949
Fmeasure.predicao.adaboost
## [1] 0.8614958
On the training set the results are not very different, but AdaBoost reaches a recall of nearly 1 (0.997), a positive sign for how this model classifies the data; such a high train-set recall can also hint at overfitting, so it helps to line up the train and test numbers, as below.
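A side-by-side view of the metrics already computed above (this only rearranges existing objects; the table layout and the metricas name are mine):
# gather the train/test metrics computed above into one comparison table
metricas <- rbind(
  reg.teste       = c(accuracy.teste.reg,       precision.teste.reg,       recall.teste.reg,       Fmeasure.teste.reg),
  arvore.teste    = c(accuracy.teste.arvore,    precision.teste.arvore,    recall.teste.arvore,    Fmeasure.teste.arvore),
  adaboost.teste  = c(accuracy.teste.adaboost,  precision.teste.adaboost,  recall.teste.adaboost,  Fmeasure.teste.adaboost),
  reg.treino      = c(accuracy.treino.reg,      precision.treino.reg,      recall.treino.reg,      Fmeasure.treino.reg),
  arvore.treino   = c(accuracy.predicao.arvore, precision.predicao.arvore, recall.predicao.arvore, Fmeasure.predicao.arvore),
  adaboost.treino = c(accuracy.predicao.adaboost, precision.predicao.adaboost, recall.predicao.adaboost, Fmeasure.predicao.adaboost))
colnames(metricas) <- c("accuracy", "precision", "recall", "fmeasure")
round(metricas, 3)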
varImp(regressaoLogistica)
## glm variable importance
##
## Overall
## media_receita 100.00
## recursos_de_partidos 97.93
## quantidade_doadores 76.64
## quantidade_doacoes 67.56
## recursos_de_pessoas_juridicas 65.01
## quantidade_fornecedores 45.13
## total_receita 28.65
## total_despesa 21.31
## quantidade_despesas 0.00
Showing the most important variables through varImp, the logistic regression confirms what we had already seen in question 2: media_receita is extremely important for this prediction.
varImp(arvore1)
## rpart variable importance
##
## Overall
## total_receita 100.0000
## total_despesa 99.5956
## quantidade_despesas 87.9690
## quantidade_fornecedores 78.7474
## recursos_de_pessoas_juridicas 73.8307
## recursos_de_partidos 2.9576
## quantidade_doadores 1.7309
## media_receita 0.6944
## quantidade_doacoes 0.0000
In the tree, as in our previous lab, total_receita has an importance of 100. The picture is almost the reverse of the logistic model: media_receita, the top variable there, carries almost no weight here (about 0.7), while total_receita, which dominates the tree, ranked fairly low (about 29) in the logistic regression.
varImp(adaboost)
## ROC curve variable importance
##
## Importance
## quantidade_doacoes 100.00
## media_receita 95.57
## recursos_de_partidos 0.00
Since we kept only a few variables here (so that AdaBoost would finish in reasonable time), there is not much to analyze. quantidade_doacoes comes out on top, with media_receita, the strongest variable in the logistic regression, close behind.
For the new variables, we will test them in the model that gave the best overall result in our study, the logistic regression. Its most important variable was media_receita, so we will square it, and do the same with quantidade_doadores, another of its top-ranked variables. The short illustration below shows what poly() actually produces.
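Note that poly(x, 2) returns orthogonal polynomial columns rather than literally x and x squared (toy numbers, just for illustration):
# orthogonal polynomial columns of degree 1 and 2, not raw powers
round(poly(1:5, 2), 3)
# use raw = TRUE if plain x and x^2 are wanted instead
round(poly(1:5, 2, raw = TRUE), 3)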
formula.nova.variavel = as.formula(situacao_final ~ poly(media_receita, 2) +
                                     poly(quantidade_doadores, 2) + total_receita +
                                     total_despesa + quantidade_despesas +
                                     recursos_de_pessoas_juridicas + recursos_de_partidos +
                                     quantidade_doacoes + quantidade_fornecedores)
regressaoLogisticaNovaVariavel <- train(formula.nova.variavel,
                                        data = treino_balanced,
                                        method = "glm",
                                        trControl = ctrl,
                                        family = "binomial",  # because the outcome is binary
                                        na.action = na.omit)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
regressaoLogisticaNovaVariavel
## Generalized Linear Model
##
## 3102 samples
## 9 predictors
## 2 classes: 'nao_eleito', 'eleito'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2791, 2792, 2793, 2791, 2791, 2791, ...
## Resampling results:
##
## Accuracy Kappa
## 0.907286 0.8140922
With these two new variables the result did not change much, but it did improve slightly, a good sign that the engineered features help the prediction.
The Kaggle submission must contain ID + situacao_final.
# path on the lab (lcc) machine
testeKaggle <- read.csv("/home/joaolaf/Área de Trabalho/test.csv", encoding = "utf-8")
# path on the notebook
#testeKaggle <- read.csv("C:\\Users\\João Lucas\\Desktop\\test.csv", encoding = "utf-8")
colnames(testeKaggle)[13] <- "recursos_pessoas_fisicas"
regressaoLogistica.kaggle <- train(formula.votos,  # the formula that performed best above
                                   data = treino_balanced,
                                   method = "glm",
                                   trControl = ctrl,
                                   family = "binomial",  # because the outcome is binary
                                   na.action = na.omit)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (warning repeated for many resamples; duplicates omitted)
reg.final.kaggle <- predict(regressaoLogistica.kaggle,testeKaggle %>% select(-ID))
kaggle.final <- data.frame(ID = testeKaggle$ID, prediction = reg.final.kaggle)
write.csv(kaggle.final,"teste.csv",row.names = F)
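A quick sanity check of the file before uploading (the exact rows shown depend on the random split):
# confirm the submission has exactly the two required columns
str(kaggle.final)
head(kaggle.final)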