Loading libraries
library(tidyr)
library(dplyr)
library(caret)
library(doParallel)
Enabling parallel processing
cls <- makeCluster(3)   # create a cluster with 3 workers
registerDoParallel(cls) # register it so caret/foreach can use the workers
Loading the training and validation data
dados <- read.csv("train.csv", encoding="latin1")
validacao <- read.csv("test.csv", encoding="latin1")
Removing columns that cause errors during model fitting
treino <- dados %>% select(-nome, -cargo, -numero_cadidato, -setor_economico_receita, -setor_economico_despesa)
validacao <- validacao %>% select(-nome, -cargo, -numero_cadidato, -setor_economico_receita, -setor_economico_despesa)
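Defining the resampling scheme
Both train() calls below reference a fitControl object that is not defined in this excerpt. Judging by the resampling summary caret prints for both models ("Cross-Validated (10 fold, repeated 2 times)"), a plausible reconstruction is the sketch below; the allowParallel flag is an assumption tied to the doParallel setup above.
fitControl <- trainControl(method = "repeatedcv",  # repeated k-fold cross-validation
                           number = 10,            # 10 folds
                           repeats = 2,            # repeated 2 times
                           allowParallel = TRUE)   # assumption: use the registered workers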
Tuning the Lasso model
set.seed(2346)
fractionGrid <- expand.grid(fraction = seq(0.001, 1, length = 100))  # 100 candidate values for the lasso fraction
model_lasso <- train(votos ~ .,
                     data = treino,
                     method = "lasso",
                     trControl = fitControl,
                     tuneGrid = fractionGrid,
                     preProcess = c("scale", "center", "nzv"),
                     na.action = na.omit)
## Aggregating results
## Selecting tuning parameters
## Fitting fraction = 0.606 on full training set
model_lasso
## The lasso
##
## 4152 samples
## 20 predictor
##
## Pre-processing: scaled (31), centered (31), remove (52)
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 3736, 3737, 3736, 3737, 3738, 3737, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.00100000 43288.20 0.4608114 23599.57
## 0.01109091 42226.94 0.4608114 22824.30
## 0.02118182 41201.95 0.4608114 22055.67
## 0.03127273 40216.54 0.4608114 21291.61
## 0.04136364 39274.30 0.4608114 20542.39
## 0.05145455 38379.09 0.4608114 19806.75
## 0.06154545 37535.03 0.4608114 19102.89
## 0.07163636 36746.85 0.4607725 18436.98
## 0.08172727 36032.46 0.4602413 17820.40
## 0.09181818 35418.33 0.4591225 17261.75
## 0.10190909 34912.59 0.4585020 16783.64
## 0.11200000 34493.29 0.4608476 16373.15
## 0.12209091 34128.33 0.4628078 15996.65
## 0.13218182 33798.27 0.4648204 15639.06
## 0.14227273 33497.09 0.4670992 15299.08
## 0.15236364 33225.66 0.4704251 14972.07
## 0.16245455 32975.91 0.4740102 14654.86
## 0.17254545 32760.72 0.4767886 14350.83
## 0.18263636 32581.27 0.4787238 14057.09
## 0.19272727 32437.65 0.4799086 13773.97
## 0.20281818 32325.31 0.4807074 13502.22
## 0.21290909 32233.69 0.4816421 13244.15
## 0.22300000 32160.04 0.4826025 13004.73
## 0.23309091 32107.49 0.4834164 12784.95
## 0.24318182 32069.13 0.4842551 12598.59
## 0.25327273 32047.36 0.4849566 12433.73
## 0.26336364 32028.00 0.4858067 12291.43
## 0.27345455 32014.53 0.4865394 12182.93
## 0.28354545 32010.67 0.4871553 12118.88
## 0.29363636 32004.87 0.4877938 12083.87
## 0.30372727 32003.39 0.4882444 12064.57
## 0.31381818 32001.68 0.4885991 12054.81
## 0.32390909 31994.67 0.4890544 12046.99
## 0.33400000 31985.71 0.4895973 12039.27
## 0.34409091 31978.53 0.4900982 12032.79
## 0.35418182 31972.47 0.4905828 12026.91
## 0.36427273 31967.69 0.4910451 12022.83
## 0.37436364 31966.11 0.4914288 12019.93
## 0.38445455 31967.06 0.4917560 12019.27
## 0.39454545 31969.69 0.4920335 12020.16
## 0.40463636 31972.34 0.4922634 12022.48
## 0.41472727 31976.18 0.4924460 12025.27
## 0.42481818 31976.94 0.4927121 12028.24
## 0.43490909 31975.00 0.4930556 12031.21
## 0.44500000 31967.73 0.4935353 12034.99
## 0.45509091 31959.99 0.4940022 12039.99
## 0.46518182 31952.51 0.4944654 12045.92
## 0.47527273 31944.82 0.4949277 12053.32
## 0.48536364 31936.75 0.4953998 12061.21
## 0.49545455 31928.22 0.4958737 12070.14
## 0.50554545 31919.37 0.4963486 12080.05
## 0.51563636 31911.26 0.4967960 12090.83
## 0.52572727 31904.84 0.4971852 12103.02
## 0.53581818 31902.69 0.4974059 12116.98
## 0.54590909 31900.64 0.4976220 12132.43
## 0.55600000 31897.37 0.4978667 12148.85
## 0.56609091 31894.71 0.4980928 12166.57
## 0.57618182 31892.60 0.4983032 12185.56
## 0.58627273 31891.25 0.4984904 12206.04
## 0.59636364 31891.35 0.4986319 12226.90
## 0.60645455 31890.81 0.4987762 12245.32
## 0.61654545 31891.01 0.4989000 12264.70
## 0.62663636 31892.65 0.4989794 12284.82
## 0.63672727 31894.90 0.4990419 12304.67
## 0.64681818 31897.21 0.4990861 12323.69
## 0.65690909 31899.08 0.4991086 12341.05
## 0.66700000 31901.55 0.4991064 12357.59
## 0.67709091 31904.45 0.4990854 12372.95
## 0.68718182 31907.40 0.4990620 12387.51
## 0.69727273 31910.78 0.4990258 12400.76
## 0.70736364 31914.86 0.4989700 12413.50
## 0.71745455 31918.69 0.4989233 12425.06
## 0.72754545 31921.27 0.4989006 12434.45
## 0.73763636 31923.30 0.4988889 12443.50
## 0.74772727 31925.58 0.4988705 12452.62
## 0.75781818 31928.12 0.4988447 12461.71
## 0.76790909 31930.93 0.4988109 12470.92
## 0.77800000 31934.00 0.4987698 12480.19
## 0.78809091 31937.33 0.4987209 12489.56
## 0.79818182 31940.92 0.4986637 12499.13
## 0.80827273 31944.77 0.4985988 12508.98
## 0.81836364 31948.89 0.4985259 12518.95
## 0.82845455 31953.24 0.4984459 12528.98
## 0.83854545 31957.87 0.4983582 12539.08
## 0.84863636 31962.76 0.4982628 12549.22
## 0.85872727 31967.86 0.4981615 12559.39
## 0.86881818 31973.21 0.4980533 12569.60
## 0.87890909 31978.83 0.4979377 12579.88
## 0.88900000 31984.71 0.4978144 12590.28
## 0.89909091 31990.85 0.4976843 12600.83
## 0.90918182 31997.18 0.4975486 12611.37
## 0.91927273 32003.84 0.4974038 12621.99
## 0.92936364 32010.77 0.4972516 12632.70
## 0.93945455 32017.94 0.4970929 12643.49
## 0.94954545 32025.35 0.4969279 12654.38
## 0.95963636 32033.01 0.4967562 12665.39
## 0.96972727 32040.92 0.4965777 12676.54
## 0.97981818 32049.06 0.4963930 12687.72
## 0.98990909 32057.45 0.4962016 12699.02
## 1.00000000 32066.10 0.4960036 12710.39
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.6064545.
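The selected fraction and its cross-validated performance can also be read directly from the train object instead of the printed table:
model_lasso$bestTune       # fraction = 0.6064545
getTrainPerf(model_lasso)  # cross-validated RMSE, Rsquared and MAE of the chosen fit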
Tuning the KNN model
set.seed(2346)
neighborsGrid <- expand.grid(k = 1:30)  # test every k from 1 to 30
model_knn <- train(votos ~ .,
                   data = treino,
                   method = "knn",
                   trControl = fitControl,
                   tuneGrid = neighborsGrid,
                   preProcess = c("scale", "center", "nzv"),
                   na.action = na.omit)
## Aggregating results
## Selecting tuning parameters
## Fitting k = 22 on full training set
model_knn
## k-Nearest Neighbors
##
## 4152 samples
## 20 predictor
##
## Pre-processing: scaled (31), centered (31), remove (52)
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 3736, 3737, 3736, 3737, 3738, 3737, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 41177.75 0.3535770 13307.94
## 2 38090.57 0.3823543 12200.49
## 3 35412.44 0.4149788 11701.17
## 4 34050.66 0.4384779 11475.34
## 5 33406.42 0.4543372 11315.46
## 6 32838.00 0.4670258 11195.68
## 7 32459.28 0.4756630 11116.49
## 8 32287.67 0.4804375 11081.40
## 9 31989.12 0.4895893 10962.66
## 10 31799.64 0.4956950 10946.59
## 11 31657.62 0.5012021 10874.85
## 12 31492.71 0.5069404 10839.65
## 13 31440.76 0.5091176 10856.70
## 14 31389.96 0.5117038 10826.45
## 15 31382.15 0.5120714 10835.73
## 16 31385.35 0.5122732 10868.59
## 17 31470.19 0.5097622 10913.05
## 18 31444.53 0.5110191 10911.16
## 19 31394.60 0.5134570 10913.80
## 20 31359.44 0.5151175 10922.59
## 21 31385.19 0.5149211 10942.65
## 22 31358.43 0.5162241 10949.21
## 23 31359.19 0.5167009 10958.55
## 24 31392.01 0.5161162 10981.84
## 25 31378.89 0.5169801 10987.73
## 26 31386.79 0.5172323 10991.62
## 27 31390.89 0.5178791 10998.03
## 28 31406.70 0.5178645 10997.37
## 29 31392.69 0.5193645 10996.98
## 30 31397.25 0.5196575 11008.20
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 22.
Comparing the models' RMSE
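No comparison code appears in this excerpt; below is a minimal sketch using caret's resamples(), which pairs the cross-validation folds of models trained with the same seed and trControl:
results <- resamples(list(lasso = model_lasso, knn = model_knn))
summary(results)                   # RMSE, Rsquared and MAE per model
dotplot(results, metric = "RMSE")  # visual comparison of the RMSE distributions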
Viewing the variables deemed most important, and those discarded, by the Ridge and Lasso models
ggplot(varImp(model_ridge))

ggplot(varImp(model_lasso))

Prediction
Using the model to predict the votes
predicao <- predict(model_lasso, newdata = validacao)
Saving to a file
predicao <- abs(predicao)  # clamp negative predictions: vote counts cannot be negative
predictiondf <- data.frame(votos = predicao)  # assumed output layout: a single votos column
write.csv(predictiondf, file = "prediction.csv", row.names = FALSE)
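Finally, the workers created for parallel processing should be released once everything is done:
stopCluster(cls)  # shut down the cluster registered with registerDoParallel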