Loading libraries

library(tidyr)       # data tidying
library(dplyr)       # data manipulation
library(caret)       # model training and tuning (also attaches ggplot2, used below)
library(doParallel)  # parallel backend for caret

Setting up a parallel backend

cls <- makeCluster(3)  # spin up 3 worker processes
registerDoParallel(cls)
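The cluster size above is hard-coded to 3 workers. A portable alternative (a sketch) sizes it from the machine's core count instead:

cls <- makeCluster(max(1, detectCores() - 1))  # leave one core free for the OS
registerDoParallel(cls)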

Loading the training and validation data

dados <- read.csv("train.csv", encoding="latin1")

validacao <- read.csv("test.csv", encoding="latin1")

Replacing NA values with the column mean (for the validation set this is done only to avoid errors at prediction time)

for(i in 1:ncol(dados)){
  dados[is.na(dados[,i]), i] <- mean(dados[,i], na.rm = TRUE)
}

for(i in 1:ncol(validacao)){
  validacao[is.na(validacao[,i]), i] <- mean(validacao[,i], na.rm = TRUE)
}
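Note that mean() on a character column returns NA with a warning, so the loops above effectively skip non-numeric columns. A more defensive variant (a sketch) imputes only the numeric columns explicitly:

num_cols <- which(sapply(dados, is.numeric))
for (i in num_cols) {
  dados[is.na(dados[, i]), i] <- mean(dados[, i], na.rm = TRUE)
}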

Removing identifier and free-text columns that cause errors during training

treino <- dados %>% select(-nome, -cargo, -numero_cadidato,  -setor_economico_receita, -setor_economico_despesa)
validacao <- validacao %>% select(-nome, -cargo, -numero_cadidato,  -setor_economico_receita, -setor_economico_despesa)

Configuring the fit for 10-fold cross-validation with 2 repetitions and with allowParallel = TRUE

fitControl <- trainControl(method = "repeatedcv", repeats = 2, number = 10, verboseIter = TRUE, allowParallel = TRUE)

Tuning the model's lambda over a grid and fitting the Ridge model

set.seed(2346)

lambdaGrid <- expand.grid(lambda = 10^seq(2, -10, length=5))

model_ridge <- train(votos ~ ., 
               data = treino,
               method = "ridge",
               trControl = fitControl,
               tuneGrid = lambdaGrid,
               preProcess = c('scale', 'center', 'nzv'),
               na.action = na.omit)
## Aggregating results
## Selecting tuning parameters
## Fitting lambda = 1e-04 on full training set
model_ridge
## Ridge Regression 
## 
## 4152 samples
##   20 predictor
## 
## Pre-processing: scaled (31), centered (31), remove (52) 
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 3736, 3737, 3736, 3737, 3738, 3737, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE     
##   1e-10    32066.10  0.4960036  12710.39
##   1e-07    32066.08  0.4960040  12710.38
##   1e-04    32052.14  0.4964027  12700.90
##   1e-01    32069.11  0.4990033  12371.78
##   1e+02   154718.67  0.4996509  84842.83
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
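The selected lambda can also be read programmatically rather than from the printout; a minimal sketch using caret's standard bestTune field:

model_ridge$bestTune  # one-row data frame holding the winning lambda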

Tuning the Lasso model

set.seed(2346)

fractionGrid <- expand.grid(fraction = seq(0.001, 1, length = 100))

model_lasso <- train(votos ~ . , 
                    data = treino, 
                    method = "lasso", 
                    trControl = fitControl,
                    tuneGrid = fractionGrid,
                    preProcess = c('scale', 'center', 'nzv'),
                    na.action = na.omit)
## Aggregating results
## Selecting tuning parameters
## Fitting fraction = 0.606 on full training set
model_lasso
## The lasso 
## 
## 4152 samples
##   20 predictor
## 
## Pre-processing: scaled (31), centered (31), remove (52) 
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 3736, 3737, 3736, 3737, 3738, 3737, ... 
## Resampling results across tuning parameters:
## 
##   fraction    RMSE      Rsquared   MAE     
##   0.00100000  43288.20  0.4608114  23599.57
##   0.01109091  42226.94  0.4608114  22824.30
##   0.02118182  41201.95  0.4608114  22055.67
##   0.03127273  40216.54  0.4608114  21291.61
##   0.04136364  39274.30  0.4608114  20542.39
##   0.05145455  38379.09  0.4608114  19806.75
##   0.06154545  37535.03  0.4608114  19102.89
##   0.07163636  36746.85  0.4607725  18436.98
##   0.08172727  36032.46  0.4602413  17820.40
##   0.09181818  35418.33  0.4591225  17261.75
##   0.10190909  34912.59  0.4585020  16783.64
##   0.11200000  34493.29  0.4608476  16373.15
##   0.12209091  34128.33  0.4628078  15996.65
##   0.13218182  33798.27  0.4648204  15639.06
##   0.14227273  33497.09  0.4670992  15299.08
##   0.15236364  33225.66  0.4704251  14972.07
##   0.16245455  32975.91  0.4740102  14654.86
##   0.17254545  32760.72  0.4767886  14350.83
##   0.18263636  32581.27  0.4787238  14057.09
##   0.19272727  32437.65  0.4799086  13773.97
##   0.20281818  32325.31  0.4807074  13502.22
##   0.21290909  32233.69  0.4816421  13244.15
##   0.22300000  32160.04  0.4826025  13004.73
##   0.23309091  32107.49  0.4834164  12784.95
##   0.24318182  32069.13  0.4842551  12598.59
##   0.25327273  32047.36  0.4849566  12433.73
##   0.26336364  32028.00  0.4858067  12291.43
##   0.27345455  32014.53  0.4865394  12182.93
##   0.28354545  32010.67  0.4871553  12118.88
##   0.29363636  32004.87  0.4877938  12083.87
##   0.30372727  32003.39  0.4882444  12064.57
##   0.31381818  32001.68  0.4885991  12054.81
##   0.32390909  31994.67  0.4890544  12046.99
##   0.33400000  31985.71  0.4895973  12039.27
##   0.34409091  31978.53  0.4900982  12032.79
##   0.35418182  31972.47  0.4905828  12026.91
##   0.36427273  31967.69  0.4910451  12022.83
##   0.37436364  31966.11  0.4914288  12019.93
##   0.38445455  31967.06  0.4917560  12019.27
##   0.39454545  31969.69  0.4920335  12020.16
##   0.40463636  31972.34  0.4922634  12022.48
##   0.41472727  31976.18  0.4924460  12025.27
##   0.42481818  31976.94  0.4927121  12028.24
##   0.43490909  31975.00  0.4930556  12031.21
##   0.44500000  31967.73  0.4935353  12034.99
##   0.45509091  31959.99  0.4940022  12039.99
##   0.46518182  31952.51  0.4944654  12045.92
##   0.47527273  31944.82  0.4949277  12053.32
##   0.48536364  31936.75  0.4953998  12061.21
##   0.49545455  31928.22  0.4958737  12070.14
##   0.50554545  31919.37  0.4963486  12080.05
##   0.51563636  31911.26  0.4967960  12090.83
##   0.52572727  31904.84  0.4971852  12103.02
##   0.53581818  31902.69  0.4974059  12116.98
##   0.54590909  31900.64  0.4976220  12132.43
##   0.55600000  31897.37  0.4978667  12148.85
##   0.56609091  31894.71  0.4980928  12166.57
##   0.57618182  31892.60  0.4983032  12185.56
##   0.58627273  31891.25  0.4984904  12206.04
##   0.59636364  31891.35  0.4986319  12226.90
##   0.60645455  31890.81  0.4987762  12245.32
##   0.61654545  31891.01  0.4989000  12264.70
##   0.62663636  31892.65  0.4989794  12284.82
##   0.63672727  31894.90  0.4990419  12304.67
##   0.64681818  31897.21  0.4990861  12323.69
##   0.65690909  31899.08  0.4991086  12341.05
##   0.66700000  31901.55  0.4991064  12357.59
##   0.67709091  31904.45  0.4990854  12372.95
##   0.68718182  31907.40  0.4990620  12387.51
##   0.69727273  31910.78  0.4990258  12400.76
##   0.70736364  31914.86  0.4989700  12413.50
##   0.71745455  31918.69  0.4989233  12425.06
##   0.72754545  31921.27  0.4989006  12434.45
##   0.73763636  31923.30  0.4988889  12443.50
##   0.74772727  31925.58  0.4988705  12452.62
##   0.75781818  31928.12  0.4988447  12461.71
##   0.76790909  31930.93  0.4988109  12470.92
##   0.77800000  31934.00  0.4987698  12480.19
##   0.78809091  31937.33  0.4987209  12489.56
##   0.79818182  31940.92  0.4986637  12499.13
##   0.80827273  31944.77  0.4985988  12508.98
##   0.81836364  31948.89  0.4985259  12518.95
##   0.82845455  31953.24  0.4984459  12528.98
##   0.83854545  31957.87  0.4983582  12539.08
##   0.84863636  31962.76  0.4982628  12549.22
##   0.85872727  31967.86  0.4981615  12559.39
##   0.86881818  31973.21  0.4980533  12569.60
##   0.87890909  31978.83  0.4979377  12579.88
##   0.88900000  31984.71  0.4978144  12590.28
##   0.89909091  31990.85  0.4976843  12600.83
##   0.90918182  31997.18  0.4975486  12611.37
##   0.91927273  32003.84  0.4974038  12621.99
##   0.92936364  32010.77  0.4972516  12632.70
##   0.93945455  32017.94  0.4970929  12643.49
##   0.94954545  32025.35  0.4969279  12654.38
##   0.95963636  32033.01  0.4967562  12665.39
##   0.96972727  32040.92  0.4965777  12676.54
##   0.97981818  32049.06  0.4963930  12687.72
##   0.98990909  32057.45  0.4962016  12699.02
##   1.00000000  32066.10  0.4960036  12710.39
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.6064545.

Tuning the KNN model

set.seed(2346)

neighborsGrid <- expand.grid(k = 1:30)

model_knn <- train(votos ~ ., 
                    data = treino,
                    method = "knn", 
                    trControl = fitControl,
                    tuneGrid = neighborsGrid,
                    preProcess = c('scale', 'center', 'nzv'),
                    na.action = na.omit)
## Aggregating results
## Selecting tuning parameters
## Fitting k = 22 on full training set
model_knn
## k-Nearest Neighbors 
## 
## 4152 samples
##   20 predictor
## 
## Pre-processing: scaled (31), centered (31), remove (52) 
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 3736, 3737, 3736, 3737, 3738, 3737, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  41177.75  0.3535770  13307.94
##    2  38090.57  0.3823543  12200.49
##    3  35412.44  0.4149788  11701.17
##    4  34050.66  0.4384779  11475.34
##    5  33406.42  0.4543372  11315.46
##    6  32838.00  0.4670258  11195.68
##    7  32459.28  0.4756630  11116.49
##    8  32287.67  0.4804375  11081.40
##    9  31989.12  0.4895893  10962.66
##   10  31799.64  0.4956950  10946.59
##   11  31657.62  0.5012021  10874.85
##   12  31492.71  0.5069404  10839.65
##   13  31440.76  0.5091176  10856.70
##   14  31389.96  0.5117038  10826.45
##   15  31382.15  0.5120714  10835.73
##   16  31385.35  0.5122732  10868.59
##   17  31470.19  0.5097622  10913.05
##   18  31444.53  0.5110191  10911.16
##   19  31394.60  0.5134570  10913.80
##   20  31359.44  0.5151175  10922.59
##   21  31385.19  0.5149211  10942.65
##   22  31358.43  0.5162241  10949.21
##   23  31359.19  0.5167009  10958.55
##   24  31392.01  0.5161162  10981.84
##   25  31378.89  0.5169801  10987.73
##   26  31386.79  0.5172323  10991.62
##   27  31390.89  0.5178791  10998.03
##   28  31406.70  0.5178645  10997.37
##   29  31392.69  0.5193645  10996.98
##   30  31397.25  0.5196575  11008.20
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 22.

Comparing the models' RMSE

The model with the lowest cross-validated RMSE was the KNN (31358.43), followed by the Lasso (31890.81) and the Ridge (32052.14). The Lasso is nevertheless the model retrained below for the final prediction.

model_ridge$results[which.min(model_ridge$results[, "RMSE"]), ]["RMSE"]
##       RMSE
## 3 32052.14
model_lasso$results[which.min(model_lasso$results[, "RMSE"]), ]["RMSE"]
##        RMSE
## 61 31890.81
model_knn$results[which.min(model_knn$results[, "RMSE"]), ]["RMSE"]
##        RMSE
## 22 31358.43
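caret also supports a more direct comparison through resamples(), which lines the models up on their shared resampling indices (these should match here, since each train() call was preceded by the same set.seed(2346)); a minimal sketch:

resamps <- resamples(list(Ridge = model_ridge, Lasso = model_lasso, KNN = model_knn))
summary(resamps)  # RMSE, Rsquared and MAE distributions side by side

The tuning profiles below show how each model's RMSE varies across its grid.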
ggplot(model_ridge)

ggplot(model_lasso)

ggplot(model_knn)

Inspecting the most important variables and those discarded by the Ridge and Lasso models

ggplot(varImp(model_ridge))

ggplot(varImp(model_lasso))
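varImp() ranks the predictors, but the variables the lasso discarded can also be listed explicitly by querying the final model for its coefficients at the chosen fraction; a hedged sketch, assuming the elasticnet (enet) backend that caret's "lasso" method wraps:

library(elasticnet)
coefs <- predict(model_lasso$finalModel,
                 type = "coefficients",
                 s = model_lasso$bestTune$fraction,
                 mode = "fraction")$coefficients
names(coefs[coefs == 0])  # predictors shrunk exactly to zero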

Retraining the Lasso model with a fixed parameter and without cross-validation

set.seed(2346)

fractionGrid <- expand.grid(fraction = 0.3068136)  # fixed fraction (note: the repeated-CV search above selected 0.6064545)

model_lasso <- train(votos ~ . , 
                    data = treino, 
                    method = "lasso",
                    tuneGrid = fractionGrid,
                    metric = "RMSE",
                    preProcess = c('scale', 'center', 'nzv'),
                    na.action = na.omit)

model_lasso
## The lasso 
## 
## 4152 samples
##   20 predictor
## 
## Pre-processing: scaled (31), centered (31), remove (52) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4152, 4152, 4152, 4152, 4152, 4152, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   33736.25  0.4366099  12399.43
## 
## Tuning parameter 'fraction' was held constant at a value of 0.3068136
model_lasso$results[which.min(model_lasso$results[, "RMSE"]), ]["RMSE"]
##       RMSE
## 1 33736.25
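With no trControl argument, "without cross-validation" in practice means caret's default of 25 bootstrap resamples, which is what the printout above reports. To genuinely fit the model once with no resampling, caret accepts trainControl(method = "none"); a sketch using the same single-row grid:

fitNone <- trainControl(method = "none")
model_lasso_single <- train(votos ~ .,
                            data = treino,
                            method = "lasso",
                            trControl = fitNone,
                            tuneGrid = fractionGrid,  # method = "none" requires exactly one row
                            preProcess = c('scale', 'center', 'nzv'),
                            na.action = na.omit)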

Prediction

Using the model to predict the votes

predicao <- predict(model_lasso, newdata = validacao)

Transforming the data

predictiondf <- data.frame(votos = predicao)
predictiondf <- predictiondf %>% mutate(ID = validacao$ID) %>% select(ID, votos)
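Since vote counts are integers, the predictions could also be rounded before export; a sketch, not part of the original pipeline:

predictiondf$votos <- round(predictiondf$votos)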

Saving to a file

predictiondf$votos <- abs(predictiondf$votos)  # predicted vote counts cannot be negative
write.csv(predictiondf, file = "prediction.csv", row.names = F)
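Finally, the parallel cluster registered at the top should be released once the work is done; the standard cleanup:

stopCluster(cls)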