library(tidyverse)
library(here)
library(caret)
library(ggplot2)

options(max.print = .Machine$integer.max)  # print full model summaries
set.seed(37)                               # reproducible resampling
theme_set(theme_minimal())
In this analysis we will build predictive regression models to forecast the votes received by candidates for the Câmara Federal de Deputados (Brazil's federal Chamber of Deputies). In a previous analysis, we sought to explain these vote counts through multivariate linear regression.
A regression with many coefficients makes the model very complex, which makes it hard to interpret. One consequence is overfitting, which happens when the model fits the noise in the data rather than the signal. Ridge and Lasso address this by shrinking the coefficients (Lasso can zero some out entirely), reducing the model's complexity and the chance of overfitting; KNN is a non-parametric alternative whose flexibility is controlled by the number of neighbors, k.
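Concretely, both penalized methods minimize the residual sum of squares plus a penalty on the coefficient sizes; the standard textbook objectives (stated here for reference, not taken from the code) are:

$$\hat{\beta}^{\text{ridge}} = \arg\min_{\beta} \; \sum_{i=1}^{n} \big(y_i - x_i^{\top}\beta\big)^2 + \lambda \sum_{j=1}^{p} \beta_j^{2}$$

$$\hat{\beta}^{\text{lasso}} = \arg\min_{\beta} \; \sum_{i=1}^{n} \big(y_i - x_i^{\top}\beta\big)^2 + \lambda \sum_{j=1}^{p} \lvert \beta_j \rvert$$

The squared (L2) penalty shrinks coefficients toward zero; the absolute-value (L1) penalty can set some exactly to zero, performing variable selection.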
In this analysis, we will use these methods to predict the number of votes a candidate will receive.
First, let's import our data. A more detailed description of the variables can also be found here.
train <- read.csv(here("data/train.csv"))
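It can help to inspect the imported data before modeling; glimpse() from dplyr (loaded via tidyverse) shows each column's type and first few values:

# quick structural overview of the training data
glimpse(train)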
To start, we remove the columns that serve only to identify a candidate, the variable cargo, which takes a single value, and the categorical variables with many levels.
train <- train %>%
  select(-sequencial_candidato,
         -nome,
         -cargo,
         -uf,
         -ocupacao)
Now let's set up 10-fold cross-validation, which all three models below will share. We start with the Ridge model, tuning its penalty lambda over a wide log-spaced grid.
fitControl <- trainControl(method = "cv",
                           number = 10,
                           search = "random")

# 100 lambda values spanning 1e-2 to 1e10 on a log scale; since an explicit
# tuneGrid is supplied below, caret uses it rather than random search
lambdaGrid <- expand.grid(lambda = 10^seq(10, -2, length = 100))
ridge <- train(votos ~ .,
               data = train,
               method = "ridge",
               trControl = fitControl,
               preProcess = c('scale', 'center', 'nzv'),
               tuneGrid = lambdaGrid)
ridge
# Ridge Regression
#
# 7476 samples
# 18 predictor
#
# Pre-processing: scaled (27), centered (27), remove (28)
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 6728, 6728, 6729, 6728, 6729, 6728, ...
# Resampling results across tuning parameters:
#
# lambda RMSE Rsquared MAE
# 1.000000e-02 37285.91 0.4108784 16191.83
# 1.321941e-02 37311.86 0.4106387 16192.27
# 1.747528e-02 37345.55 0.4103062 16191.31
# 2.310130e-02 37387.56 0.4098917 16187.08
# 3.053856e-02 37438.91 0.4094078 16177.98
# 4.037017e-02 37501.53 0.4088640 16163.30
# 5.336699e-02 37578.79 0.4082613 16141.69
# 7.054802e-02 37676.22 0.4075885 16110.63
# 9.326033e-02 37802.50 0.4068199 16069.10
# 1.232847e-01 37970.89 0.4059166 16023.47
# 1.629751e-01 38201.46 0.4048305 15976.69
# 2.154435e-01 38524.55 0.4035123 15935.85
# 2.848036e-01 38985.86 0.4019229 15917.25
# 3.764936e-01 39653.28 0.4000447 15957.85
# 4.977024e-01 40624.93 0.3978911 16099.52
# 6.579332e-01 42035.99 0.3955093 16416.33
# 8.697490e-01 44059.65 0.3929765 17012.19
# 1.149757e+00 46896.07 0.3903894 18045.90
# 1.519911e+00 50744.43 0.3878512 19634.59
# 2.009233e+00 55759.84 0.3854564 21959.49
# 2.656088e+00 62005.33 0.3832791 24998.55
# 3.511192e+00 69414.73 0.3813659 28829.52
# 4.641589e+00 77780.33 0.3797344 33252.63
# 6.135907e+00 86772.26 0.3783782 38022.49
# 8.111308e+00 95986.84 0.3772743 42917.56
# 1.072267e+01 105011.22 0.3763908 47711.89
# 1.417474e+01 113485.88 0.3756930 52226.75
# 1.873817e+01 121148.12 0.3751474 56312.88
# 2.477076e+01 127848.40 0.3747242 59887.26
# 3.274549e+01 133541.96 0.3743979 62926.80
# 4.328761e+01 138265.10 0.3741474 65446.96
# 5.722368e+01 142106.50 0.3739558 67494.13
# 7.564633e+01 145181.19 0.3738097 69131.47
# 1.000000e+02 147611.02 0.3736984 70425.22
# 1.321941e+02 149512.05 0.3736139 71437.73
# 1.747528e+02 150987.74 0.3735497 72225.15
# 2.310130e+02 152126.30 0.3735010 72833.07
# 3.053856e+02 153000.66 0.3734641 73299.77
# 4.037017e+02 153669.70 0.3734361 73656.83
# 5.336699e+02 154180.23 0.3734149 73929.25
# 7.054802e+02 154569.00 0.3733989 74136.66
# 9.326033e+02 154864.56 0.3733867 74294.32
# 1.232847e+03 155089.00 0.3733775 74414.03
# 1.629751e+03 155259.27 0.3733706 74504.84
# 2.154435e+03 155388.35 0.3733653 74573.68
# 2.848036e+03 155486.16 0.3733613 74625.84
# 3.764936e+03 155560.24 0.3733583 74665.35
# 4.977024e+03 155616.33 0.3733560 74695.26
# 6.579332e+03 155658.79 0.3733543 74717.90
# 8.697490e+03 155690.93 0.3733530 74735.04
# 1.149757e+04 155715.25 0.3733520 74748.01
# 1.519911e+04 155733.66 0.3733513 74757.82
# 2.009233e+04 155747.58 0.3733507 74765.25
# 2.656088e+04 155758.12 0.3733503 74770.86
# 3.511192e+04 155766.09 0.3733500 74775.12
# 4.641589e+04 155772.12 0.3733497 74778.33
# 6.135907e+04 155776.68 0.3733495 74780.76
# 8.111308e+04 155780.13 0.3733494 74782.60
# 1.072267e+05 155782.74 0.3733493 74784.00
# 1.417474e+05 155784.72 0.3733492 74785.05
# 1.873817e+05 155786.21 0.3733492 74785.85
# 2.477076e+05 155787.34 0.3733491 74786.45
# 3.274549e+05 155788.20 0.3733491 74786.90
# 4.328761e+05 155788.84 0.3733490 74787.25
# 5.722368e+05 155789.33 0.3733490 74787.51
# 7.564633e+05 155789.70 0.3733490 74787.71
# 1.000000e+06 155789.98 0.3733490 74787.86
# 1.321941e+06 155790.20 0.3733490 74787.97
# 1.747528e+06 155790.36 0.3733490 74788.05
# 2.310130e+06 155790.48 0.3733490 74788.12
# 3.053856e+06 155790.57 0.3733490 74788.17
# 4.037017e+06 155790.64 0.3733490 74788.21
# 5.336699e+06 155790.69 0.3733490 74788.23
# 7.054802e+06 155790.73 0.3733490 74788.25
# 9.326033e+06 155790.76 0.3733490 74788.27
# 1.232847e+07 155790.78 0.3733490 74788.28
# 1.629751e+07 155790.80 0.3733490 74788.29
# 2.154435e+07 155790.81 0.3733490 74788.30
# 2.848036e+07 155790.82 0.3733490 74788.30
# 3.764936e+07 155790.83 0.3733490 74788.31
# 4.977024e+07 155790.84 0.3733490 74788.31
# 6.579332e+07 155790.84 0.3733490 74788.31
# 8.697490e+07 155790.84 0.3733490 74788.32
# 1.149757e+08 155790.85 0.3733490 74788.32
# 1.519911e+08 155790.85 0.3733490 74788.32
# 2.009233e+08 155790.85 0.3733490 74788.32
# 2.656088e+08 155790.85 0.3733490 74788.32
# 3.511192e+08 155790.85 0.3733490 74788.32
# 4.641589e+08 155790.85 0.3733490 74788.32
# 6.135907e+08 155790.85 0.3733490 74788.32
# 8.111308e+08 155790.85 0.3733490 74788.32
# 1.072267e+09 155790.85 0.3733490 74788.32
# 1.417474e+09 155790.85 0.3733490 74788.32
# 1.873817e+09 155790.85 0.3733490 74788.32
# 2.477076e+09 155790.85 0.3733490 74788.32
# 3.274549e+09 155790.85 0.3733490 74788.32
# 4.328761e+09 155790.85 0.3733490 74788.32
# 5.722368e+09 155790.85 0.3733490 74788.32
# 7.564633e+09 155790.85 0.3733490 74788.32
# 1.000000e+10 155790.85 0.3733490 74788.32
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was lambda = 0.01.
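Because the lambda grid spans twelve orders of magnitude, the resampling profile is easier to read on a log scale; a quick sketch using caret's ggplot method for train objects:

# RMSE across the lambda grid, log-scaled x axis for readability
ggplot(ridge) + scale_x_log10()
# the tuning value caret selected
ridge$bestTune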
Next, we tune the Lasso; its tuning parameter, fraction, is the ratio of the L1 norm of the coefficient vector to that of the full least-squares fit.

fractionGrid <- expand.grid(fraction = seq(.1, .9, length = 100))

lasso <- train(votos ~ .,
               data = train,
               method = "lasso",
               trControl = fitControl,
               preProcess = c('scale', 'center', 'nzv'),
               tuneGrid = fractionGrid)
lasso
# The lasso
#
# 7476 samples
# 18 predictor
#
# Pre-processing: scaled (27), centered (27), remove (28)
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 6728, 6730, 6728, 6728, 6728, 6729, ...
# Resampling results across tuning parameters:
#
# fraction RMSE Rsquared MAE
# 0.1000000 38642.19 0.3848477 16315.32
# 0.1080808 38834.80 0.3828931 16324.38
# 0.1161616 39032.77 0.3811543 16333.44
# 0.1242424 39235.42 0.3796083 16342.50
# 0.1323232 39442.17 0.3782330 16351.55
# 0.1404040 39652.53 0.3770081 16360.61
# 0.1484848 39866.06 0.3759153 16369.67
# 0.1565657 40082.41 0.3749386 16378.73
# 0.1646465 40301.24 0.3740637 16387.79
# 0.1727273 40522.30 0.3732780 16396.85
# 0.1808081 40745.34 0.3725709 16405.91
# 0.1888889 40970.15 0.3719329 16414.97
# 0.1969697 41196.56 0.3713559 16424.03
# 0.2050505 41424.41 0.3708327 16433.09
# 0.2131313 41653.56 0.3703572 16442.15
# 0.2212121 41883.88 0.3699240 16451.21
# 0.2292929 42115.28 0.3695285 16460.27
# 0.2373737 42347.66 0.3691666 16469.33
# 0.2454545 42580.92 0.3688347 16478.39
# 0.2535354 42815.00 0.3685298 16487.45
# 0.2616162 43049.83 0.3682489 16496.51
# 0.2696970 43285.35 0.3679899 16505.57
# 0.2777778 43521.50 0.3677504 16514.63
# 0.2858586 43758.24 0.3675287 16523.69
# 0.2939394 43995.52 0.3673230 16532.75
# 0.3020202 44233.30 0.3671319 16541.81
# 0.3101010 44471.54 0.3669541 16550.87
# 0.3181818 44710.22 0.3667884 16559.93
# 0.3262626 44949.30 0.3666337 16568.99
# 0.3343434 45188.76 0.3664891 16578.05
# 0.3424242 45428.57 0.3663537 16587.11
# 0.3505051 45668.71 0.3662268 16596.17
# 0.3585859 45909.15 0.3661077 16605.23
# 0.3666667 46149.89 0.3659958 16614.29
# 0.3747475 46390.89 0.3658905 16623.35
# 0.3828283 46632.15 0.3657913 16632.41
# 0.3909091 46873.65 0.3656978 16641.47
# 0.3989899 47115.37 0.3656095 16650.52
# 0.4070707 47357.31 0.3655260 16659.58
# 0.4151515 47599.45 0.3654471 16668.64
# 0.4232323 47841.78 0.3653723 16677.70
# 0.4313131 48084.29 0.3653015 16686.76
# 0.4393939 48326.97 0.3652343 16695.82
# 0.4474747 48569.81 0.3651705 16704.88
# 0.4555556 48812.80 0.3651098 16713.94
# 0.4636364 49055.94 0.3650521 16723.00
# 0.4717172 49299.22 0.3649972 16732.06
# 0.4797980 49542.64 0.3649449 16741.12
# 0.4878788 49786.17 0.3648950 16750.18
# 0.4959596 50029.83 0.3648474 16759.24
# 0.5040404 50273.60 0.3648019 16768.30
# 0.5121212 50517.48 0.3647585 16777.36
# 0.5202020 50761.46 0.3647170 16786.42
# 0.5282828 51005.54 0.3646773 16795.48
# 0.5363636 51249.72 0.3646393 16804.54
# 0.5444444 51493.99 0.3646028 16813.60
# 0.5525253 51738.35 0.3645679 16822.66
# 0.5606061 51982.78 0.3645344 16831.72
# 0.5686869 52227.30 0.3645022 16840.78
# 0.5767677 52471.90 0.3644714 16849.84
# 0.5848485 52716.57 0.3644417 16858.90
# 0.5929293 52961.31 0.3644132 16867.96
# 0.6010101 53206.11 0.3643858 16877.02
# 0.6090909 53450.98 0.3643594 16886.08
# 0.6171717 53695.92 0.3643340 16895.14
# 0.6252525 53940.91 0.3643095 16904.20
# 0.6333333 54185.97 0.3642859 16913.26
# 0.6414141 54431.08 0.3642632 16922.32
# 0.6494949 54676.24 0.3642412 16931.38
# 0.6575758 54921.45 0.3642201 16940.44
# 0.6656566 55166.72 0.3641996 16949.49
# 0.6737374 55412.03 0.3641799 16958.55
# 0.6818182 55657.39 0.3641608 16967.61
# 0.6898990 55902.80 0.3641424 16976.67
# 0.6979798 56148.24 0.3641245 16985.73
# 0.7060606 56393.73 0.3641073 16994.79
# 0.7141414 56639.27 0.3640906 17003.85
# 0.7222222 56884.84 0.3640744 17012.91
# 0.7303030 57130.44 0.3640588 17021.97
# 0.7383838 57376.09 0.3640436 17031.03
# 0.7464646 57621.77 0.3640289 17040.09
# 0.7545455 57867.48 0.3640147 17049.15
# 0.7626263 58113.23 0.3640008 17058.21
# 0.7707071 58359.01 0.3639874 17067.27
# 0.7787879 58604.82 0.3639744 17076.33
# 0.7868687 58850.67 0.3639618 17085.39
# 0.7949495 59096.54 0.3639495 17094.45
# 0.8030303 59342.44 0.3639376 17103.51
# 0.8111111 59588.37 0.3639260 17112.57
# 0.8191919 59834.32 0.3639148 17121.63
# 0.8272727 60080.30 0.3639038 17130.69
# 0.8353535 60326.31 0.3638932 17139.75
# 0.8434343 60572.34 0.3638828 17148.81
# 0.8515152 60818.40 0.3638728 17157.87
# 0.8595960 61064.48 0.3638629 17166.93
# 0.8676768 61310.58 0.3638534 17175.99
# 0.8757576 61556.70 0.3638441 17185.05
# 0.8838384 61802.85 0.3638350 17194.11
# 0.8919192 62049.01 0.3638262 17203.17
# 0.9000000 62295.20 0.3638176 17212.23
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was fraction = 0.1.
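Note that the selected fraction = 0.1 is the smallest value in the grid, so the optimum sits on the boundary and a smaller fraction might do even better. A hedged sketch for inspecting and extending the search:

# resampling profile: RMSE rises steadily with fraction
ggplot(lasso)
lasso$bestTune
# a finer grid below the boundary could be explored (not run in the original analysis)
# fractionGrid2 <- expand.grid(fraction = seq(0.01, 0.1, length = 20))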
Finally, we tune KNN over k from 1 to 100.

knnGrid <- expand.grid(k = 1:100)

knn <- train(votos ~ .,
             data = train,
             method = "knn",
             trControl = fitControl,
             preProcess = c('scale', 'center', 'nzv'),
             tuneGrid = knnGrid)
knn
# k-Nearest Neighbors
#
# 7476 samples
# 18 predictor
#
# Pre-processing: scaled (27), centered (27), remove (28)
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 6728, 6728, 6728, 6728, 6728, 6729, ...
# Resampling results across tuning parameters:
#
# k RMSE Rsquared MAE
# 1 39590.44 0.3862799 15381.88
# 2 37167.56 0.4294439 13930.65
# 3 35371.46 0.4630306 13314.64
# 4 34622.11 0.4798480 13073.82
# 5 34238.83 0.4892339 12835.11
# 6 34698.04 0.4785924 12911.47
# 7 34357.34 0.4886469 12837.88
# 8 34049.18 0.4980518 12759.87
# 9 34030.43 0.4990378 12799.41
# 10 34026.00 0.4991038 12812.29
# 11 33943.69 0.5019108 12786.70
# 12 33984.58 0.5011603 12840.54
# 13 33937.09 0.5031835 12869.59
# 14 33938.13 0.5037618 12882.91
# 15 33996.95 0.5022974 12941.52
# 16 33972.10 0.5037643 12965.29
# 17 33972.50 0.5040242 12991.06
# 18 33989.73 0.5041524 13013.16
# 19 33985.07 0.5046022 13047.05
# 20 34026.85 0.5036187 13067.02
# 21 34020.33 0.5043436 13106.36
# 22 34013.41 0.5047910 13109.82
# 23 33998.49 0.5053982 13111.13
# 24 34021.16 0.5045920 13132.46
# 25 34085.60 0.5026646 13168.65
# 26 34104.19 0.5022285 13176.22
# 27 34141.88 0.5008675 13209.97
# 28 34148.22 0.5011873 13212.88
# 29 34169.11 0.5012985 13215.41
# 30 34178.75 0.5014839 13230.16
# 31 34179.34 0.5018145 13220.68
# 32 34185.16 0.5021520 13214.24
# 33 34201.40 0.5018754 13215.76
# 34 34246.49 0.5009640 13223.09
# 35 34296.27 0.4994913 13260.76
# 36 34307.88 0.4993427 13271.57
# 37 34322.59 0.4992152 13283.17
# 38 34359.88 0.4985052 13299.53
# 39 34374.36 0.4979220 13318.58
# 40 34419.33 0.4965822 13348.06
# 41 34466.76 0.4953434 13379.65
# 42 34484.54 0.4950057 13411.82
# 43 34534.96 0.4935595 13455.70
# 44 34570.02 0.4926484 13498.04
# 45 34579.36 0.4927892 13513.46
# 46 34608.78 0.4920655 13540.12
# 47 34614.31 0.4923375 13561.26
# 48 34652.54 0.4912735 13589.55
# 49 34675.37 0.4906998 13616.94
# 50 34702.54 0.4901829 13637.19
# 51 34723.20 0.4898901 13649.42
# 52 34740.71 0.4896901 13668.01
# 53 34771.23 0.4888510 13686.12
# 54 34799.81 0.4881109 13719.47
# 55 34828.53 0.4872162 13752.63
# 56 34852.06 0.4866434 13779.02
# 57 34857.89 0.4865359 13790.75
# 58 34843.17 0.4874145 13797.83
# 59 34881.75 0.4862853 13815.79
# 60 34901.92 0.4857017 13834.16
# 61 34909.96 0.4857047 13839.64
# 62 34922.06 0.4855083 13854.50
# 63 34948.12 0.4847626 13881.75
# 64 34955.30 0.4846196 13900.76
# 65 34977.99 0.4841329 13918.85
# 66 35014.56 0.4830444 13944.08
# 67 35029.38 0.4824660 13965.46
# 68 35047.16 0.4819754 13985.39
# 69 35063.29 0.4814069 14000.23
# 70 35077.96 0.4809808 14018.89
# 71 35101.53 0.4802459 14045.34
# 72 35121.86 0.4796296 14066.82
# 73 35140.56 0.4791648 14079.44
# 74 35162.45 0.4786162 14100.32
# 75 35184.67 0.4780427 14113.74
# 76 35200.04 0.4776865 14132.02
# 77 35216.61 0.4773027 14149.96
# 78 35235.86 0.4767582 14163.50
# 79 35244.84 0.4767540 14176.32
# 80 35250.27 0.4768065 14191.70
# 81 35254.06 0.4769629 14200.59
# 82 35273.77 0.4765515 14219.34
# 83 35287.13 0.4762929 14232.99
# 84 35289.64 0.4763490 14244.49
# 85 35310.79 0.4758519 14260.76
# 86 35294.40 0.4766080 14267.88
# 87 35305.69 0.4764606 14276.26
# 88 35323.03 0.4760973 14284.75
# 89 35348.90 0.4753998 14305.21
# 90 35366.46 0.4750482 14312.07
# 91 35380.56 0.4748401 14322.59
# 92 35393.08 0.4746048 14329.96
# 93 35390.75 0.4748567 14335.14
# 94 35407.90 0.4745384 14346.81
# 95 35428.63 0.4739101 14354.87
# 96 35444.09 0.4735954 14363.20
# 97 35454.19 0.4735241 14372.79
# 98 35462.65 0.4735148 14379.34
# 99 35480.99 0.4730622 14387.06
# 100 35490.63 0.4729729 14391.26
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was k = 13.
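The RMSE profile against k can be plotted the same way; per the table above, the error drops steeply up to roughly k = 10 and then creeps back up:

# RMSE as a function of the number of neighbors
ggplot(knn)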
The model with the lowest RMSE was KNN, whose final value was 33937.09 at k = 13. The Ridge model comes next with the second-lowest value, 37285.91 at lambda = 0.01. Finally, the model with the highest RMSE was Lasso, which, having selected fraction = 0.1, yielded 38642.19.
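caret's resamples() can summarize this comparison directly; a minimal sketch (each train call drew its own folds, so the fold-by-fold pairing is only approximate):

# collect the cross-validated metrics from the three fits
results <- resamples(list(ridge = ridge, lasso = lasso, knn = knn))
summary(results)
dotplot(results, metric = "RMSE")  # lattice is attached along with caret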
Turning to variable importance, for the Ridge model we have:
ggplot(varImp(ridge))
According to the plot, the most important variables are, in order of importance:
1. total_receita
2. total_despesa
3. recursos_de_pessoas_juridicas
4. recursos_de_pessoas_fisicas
5. quantidade_fornecedores
6. quantidade_despesas
7. media_receita
8. recursos_de_partido_politico
9. quantidade_doadores
10. quantidade_doacoes
11. grau
12. estado_civil
13. partido
14. sexo
For the Lasso model, we have:
ggplot(varImp(lasso))
According to the plot, most of the variables that are important for the Ridge model are also important for the Lasso model. The variables discarded by the Lasso model (verified on the fitted object below) are:
* ano
* recursos_proprios
* recursos_de_outros_candidatos.comites
* media_despesa
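We can check this directly on the fitted object; lasso$finalModel is an elasticnet::enet fit, so evaluating the coefficient path at the selected fraction shows which coefficients were set exactly to zero (a sketch, assuming caret's default elasticnet backend for method = "lasso"):

# coefficients of the final lasso model at the chosen fraction
coefs <- predict(lasso$finalModel,
                 s = lasso$bestTune$fraction,
                 type = "coefficients",
                 mode = "fraction")$coefficients
names(coefs[coefs == 0])  # the predictors the lasso dropped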
Finally, we retrain KNN at the best k found above. Note that the same pre-processing used during tuning must also be applied here; otherwise the final model would compute distances on unscaled data.

best.grid <- knn$bestTune  # already a one-row data frame with column k

best.model <- train(votos ~ .,
                    data = train,
                    method = "knn",
                    preProcess = c('scale', 'center', 'nzv'),
                    tuneGrid = best.grid)
best.model
Next, we load the test set, keep the candidate identifier for the submission file, and drop the same columns we removed from the training data.

test <- read.csv(here("data/test.csv"))

submission <- test %>%
  select(sequencial_candidato)

test <- test %>%
  select(-sequencial_candidato,
         -nome,
         -cargo,
         -uf,
         -ocupacao)
predictions <- predict(best.model, test)
submission$votos <- predictions

submission <- submission %>%
  select(ID = sequencial_candidato,
         votos)
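Before writing the file, a couple of quick sanity checks guard against row mismatches or missing predictions (a minimal sketch; none of this is required by the submission format):

stopifnot(nrow(submission) == nrow(test),  # one prediction per test candidate
          !anyNA(submission$votos))        # no missing predictions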
write.csv(x = submission,
          file = "sample_submission.csv",
          row.names = FALSE)