#i-RIDGE
# Resampling setup used by the models below: 10-fold cross-validation,
# with parallel resampling allowed.
# `repeats` is intentionally not supplied: caret ignores it for
# method = "cv" and emits "`repeats` has no meaning for this resampling
# method." Switch to method = "repeatedcv" if repeated CV is wanted.
fitControl <- trainControl(method = "cv", number = 10, allowParallel = TRUE)

# Ridge regression of `votos` on every other column of eleicoes_train.
# Predictors are scaled and centered; rows with missing values are dropped.
model.ridge1 <- train(
  votos ~ .,
  data = eleicoes_train,
  method = "ridge",
  trControl = fitControl,
  preProcess = c("scale", "center"),
  na.action = na.omit
)

# Print the resampling results and the selected lambda.
model.ridge1
## Ridge Regression
##
## 7476 samples
## 17 predictors
##
## Pre-processing: scaled (76), centered (76)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6728, 6730, 6728, 6728, 6730, 6728, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 37104.04 0.4419414 16490.15
## 1e-04 37104.44 0.4419696 16490.21
## 1e-01 37962.71 0.4378781 16552.72
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
#ii-LASSO
# Lasso fit of `votos` on all remaining columns, using the shared
# fitControl resampling setup and scale/center pre-processing.
model.lasso1 <- train(
  votos ~ .,
  method = "lasso",
  data = eleicoes_train,
  preProcess = c("scale", "center"),
  trControl = fitControl,
  na.action = na.omit
)

# Unscaled variable importance for the lasso fit.
# NOTE(review): the printed header reads "loess r-squared variable
# importance", so these scores look model-free rather than lasso
# coefficients -- confirm before reading a 0 as "dropped by the lasso".
importance <- varImp(model.lasso1, scale = FALSE)
print(importance)
## loess r-squared variable importance
##
## Overall
## total_receita 0.481646
## total_despesa 0.480237
## recursos_de_pessoas_juridicas 0.434003
## recursos_de_pessoas_fisicas 0.235768
## quantidade_fornecedores 0.177902
## quantidade_despesas 0.176090
## media_receita 0.143573
## recursos_de_partido_politico 0.121428
## quantidade_doadores 0.077183
## quantidade_doacoes 0.072337
## grau 0.020436
## partido 0.006144
## sexo 0.005891
## uf 0.001435
## media_despesa 0.000000
## recursos_de_outros_candidatos.comites 0.000000
## recursos_proprios 0.000000
# Show the resampling results and the selected `fraction` for the lasso fit.
print(model.lasso1)
## The lasso
##
## 7476 samples
## 17 predictors
##
## Pre-processing: scaled (76), centered (76)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6729, 6728, 6728, 6729, 6729, 6728, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 36636.51 0.4280098 16538.70
## 0.5 42622.72 0.4026728 16818.36
## 0.9 49933.64 0.3998484 17098.03
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.1.
#iii-KNN
# (A second, identical print of model.lasso1 and its output used to sit
# here; it repeated the lasso summary shown just above and was removed.)
# k-nearest-neighbours regression of `votos` on all remaining columns,
# using the shared fitControl resampling setup and scale/center
# pre-processing; rows with missing values are dropped.
model.knn <- train(
  votos ~ .,
  data = eleicoes_train,
  method = "knn",
  trControl = fitControl,
  preProcess = c("scale", "center"),
  na.action = na.omit
)

# Print the resampling results and the selected k.
model.knn
## k-Nearest Neighbors
##
## 7476 samples
## 17 predictors
##
## Pre-processing: scaled (76), centered (76)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6729, 6729, 6728, 6728, 6728, 6729, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 36291.70 0.4277048 15120.88
## 7 36277.52 0.4295987 15312.10
## 9 36529.85 0.4212308 15560.22
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
# Collect the resampling results of the three fitted models and draw a
# parallel-coordinates plot of their RMSE values.
# NOTE(review): the "Summary of sample sizes" printed for each model
# differs, so the folds were apparently not shared between the fits;
# consider setting the same seed before every train() call -- confirm.
allResamples <- resamples(
  list(
    Ridge = model.ridge1,
    Lasso = model.lasso1,
    KNN = model.knn
  )
)
parallelplot(allResamples, metric = "RMSE")
No modelo Ridge o menor RMSE foi de 36289.08. No modelo Lasso o menor RMSE foi de 42929.28. No modelo KNN o menor RMSE foi de 36111.03. Além de ter o melhor RMSE, os valores do modelo KNN não se distanciaram muito.
# 3-Quais as variáveis mais importantes segundo o modelo de regressão Ridge e Lasso? Variáveis foram descartadas pelo Lasso? Quais?
As variáveis mais importantes para o lasso foram:
- total_receita
- total_despesa
- recursos_de_pessoas_juridicas
- recursos_de_pessoas_fisicas
- quantidade_fornecedores
- quantidade_despesas
- media_receita
- recursos_de_partido_politico
- quantidade_doadores
- quantidade_doacoes
- grau
- partido
- sexo
- uf
As variáveis descartadas pelo lasso foram:
- media_despesa
- recursos_de_outros_candidatos.comites
- recursos_proprios
Algumas variáveis não foram utilizadas no modelo, como nome e sequencial_candidato. Assumimos que essas variáveis são inúteis aos modelos.
# Model used for the final predictions.
# NOTE(review): despite the `ridgeall` name, this call fits method = "knn".
# NOTE(review): no trControl is passed here, unlike the earlier fits; the
# printed summary reports bootstrapped resampling (25 reps) -- confirm
# this difference is intended.
model.ridgeall <- train(
  votos ~ .,
  method = "knn",
  data = eleicoes_train,
  na.action = na.omit,
  preProcess = c("scale", "center")
)

# Print the resampling results and the selected k.
model.ridgeall
## k-Nearest Neighbors
##
## 7476 samples
## 17 predictors
##
## Pre-processing: scaled (76), centered (76)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 7476, 7476, 7476, 7476, 7476, 7476, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 38107.91 0.3620070 15738.29
## 7 37547.83 0.3744638 15813.44
## 9 37297.23 0.3807376 15918.60
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
# Predict `votos` for the test set with the fitted model.
pred <- predict(model.ridgeall, newdata = eleicoes_test)

# Build the submission table: candidate identifier (stored as text)
# next to the predicted vote count.
ans <- data.frame(ID = eleicoes_test$sequencial_candidato, votos = pred)
ans[["ID"]] <- as.character(ans[["ID"]])

# Display the table, then write it to disk.
print(ans)
write_csv(ans, "kaggleKNN.csv")