Tendo por base os dados da votação para Deputado Federal nas eleições de 2014, queremos gerar um modelo de predição de votos. Utilizando os dados disponíveis, treinamos modelos de regressão com regularização Ridge e Lasso, além de um modelo KNN. Possuímos 23 variáveis que podem explicar a quantidade de votos do candidato e utilizaremos todas elas para treinar os modelos.
# Load the 2014 election data (Latin-1 encoded CSV) and drop the `cargo`
# and `nome` columns before modelling.
dados <- read.csv("eleicoes2014.csv", fileEncoding = "latin1") %>%
  select(-cargo, -nome)
# Resampling setup passed to train() via `trControl`: 5-fold
# cross-validation, with tuning-parameter candidates drawn at random
# rather than from a fixed grid.
fitControl <- trainControl(
  search = "random",
  method = "cv",
  number = 5
)
# Ridge regression of `votos` on every remaining column of `dados`.
# Rows with missing values are dropped; predictors are centred, scaled and
# near-zero-variance columns removed before fitting. Up to 50 randomly drawn
# lambda candidates are evaluated with the resampling scheme in `fitControl`.
#
# The seed is fixed because both the fold assignment and the random lambda
# candidates depend on the RNG; without it every run prints a different
# table and selects a different lambda.
set.seed(123)
modeloRidge <- train(
  votos ~ .,
  data = dados,
  method = "ridge",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)
# Lasso regression of `votos` on every remaining column of `dados`.
# Rows with missing values are dropped; predictors are centred, scaled and
# near-zero-variance columns removed before fitting. Up to 50 randomly drawn
# `fraction` candidates are evaluated with the resampling scheme in
# `fitControl`.
#
# The seed is fixed because both the fold assignment and the random
# `fraction` candidates depend on the RNG; without it every run prints a
# different table and selects a different value.
set.seed(123)
modeloLasso <- train(
  votos ~ .,
  data = dados,
  method = "lasso",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)
# k-nearest-neighbours regression of `votos` on every remaining column of
# `dados`. Rows with missing values are dropped; predictors are centred,
# scaled and near-zero-variance columns removed before fitting. Up to 50
# randomly drawn values of k are evaluated with the resampling scheme in
# `fitControl`.
#
# The seed is fixed because both the fold assignment and the random k
# candidates depend on the RNG; without it every run prints a different
# table and may select a different k.
set.seed(123)
modeloKnn <- train(
  votos ~ .,
  data = dados,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)
Iremos agora avaliar cada modelo gerado:
Modelo Ridge:
# Show the resampling results for every lambda tried and the one selected.
print(modeloRidge)
## Ridge Regression
##
## 4152 samples
## 23 predictor
##
## Pre-processing: scaled (34), centered (34), remove (192)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 322, 324, 322, 324, 324
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.744541e-05 90997.02 0.1520829 46529.03
## 2.061781e-05 90999.47 0.1519737 46519.63
## 2.429987e-05 91001.02 0.1518787 46510.33
## 3.521466e-05 91000.72 0.1517075 46489.38
## 5.393777e-05 90991.90 0.1515809 46466.38
## 6.120874e-05 90987.04 0.1515572 46459.78
## 7.643879e-05 90975.44 0.1515307 46448.59
## 8.794180e-05 90965.84 0.1515240 46441.55
## 1.094079e-04 90946.77 0.1515293 46430.43
## 1.207521e-04 90936.30 0.1515383 46425.27
## 1.478632e-04 90910.66 0.1515701 46414.23
## 2.871048e-04 90775.86 0.1518234 46369.94
## 3.549495e-04 90711.15 0.1519642 46351.41
## 4.024354e-04 90666.61 0.1520646 46339.16
## 5.793576e-04 90506.66 0.1524416 46296.01
## 1.787711e-03 89633.74 0.1547913 46053.33
## 1.949108e-03 89539.74 0.1550699 46026.26
## 2.007038e-03 89507.01 0.1551680 46016.75
## 2.260217e-03 89369.83 0.1555856 45976.48
## 3.351117e-03 88867.63 0.1571962 45821.39
## 3.728503e-03 88720.28 0.1576916 45773.68
## 4.263354e-03 88529.25 0.1583480 45710.20
## 5.246455e-03 88221.76 0.1594351 45604.21
## 6.749798e-03 87831.39 0.1608618 45461.07
## 8.580583e-03 87444.06 0.1623207 45309.16
## 1.183486e-02 86903.01 0.1644234 45079.81
## 1.434914e-02 86567.51 0.1657698 44929.25
## 2.598279e-02 85495.58 0.1704324 44420.71
## 2.615636e-02 85483.42 0.1704899 44414.54
## 3.180911e-02 85127.18 0.1722423 44226.24
## 3.184488e-02 85125.15 0.1722528 44225.12
## 4.426055e-02 84540.32 0.1754987 43891.78
## 8.328461e-02 83540.68 0.1830013 43216.59
## 8.705405e-02 83479.71 0.1836004 43168.74
## 1.100822e-01 83182.28 0.1869646 42956.08
## 1.282933e-01 83014.79 0.1893414 42832.83
## 1.540841e-01 82847.21 0.1923909 42690.31
## 1.636393e-01 82801.07 0.1934439 42642.61
## 1.882913e-01 82713.09 0.1960017 42535.65
## 2.316638e-01 82642.00 0.2000476 42412.97
## 2.431698e-01 82637.22 0.2010400 42385.54
## 3.142448e-01 82704.14 0.2065734 42298.04
## 3.923185e-01 82918.83 0.2117172 42323.12
## 1.131206e+00 87849.14 0.2377533 45205.93
## 1.242620e+00 88788.81 0.2398525 45959.53
## 1.286476e+00 89164.32 0.2406063 46260.43
## 2.060445e+00 95970.46 0.2494681 51791.76
## 4.534857e+00 114895.51 0.2580413 68306.89
## 6.072819e+00 123726.60 0.2595815 76153.48
## 6.309782e+00 124921.04 0.2597368 77198.88
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.2431698.
Modelo Lasso:
# Show the resampling results for every fraction tried and the one selected.
print(modeloLasso)
## The lasso
##
## 4152 samples
## 23 predictor
##
## Pre-processing: scaled (34), centered (34), remove (192)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 324, 323, 324, 323, 322
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.005943615 79331.04 0.1009459 47030.81
## 0.013699861 82333.42 0.1252992 45633.77
## 0.024702298 83729.40 0.1461212 45265.44
## 0.050887137 87063.53 0.1477040 47185.16
## 0.057572983 88391.60 0.1447525 47628.89
## 0.067466039 90498.46 0.1406493 48452.42
## 0.079759124 91459.51 0.1398032 48988.47
## 0.080964732 91513.01 0.1397579 49005.14
## 0.088989410 91946.82 0.1381290 49150.54
## 0.142677798 93652.80 0.1305405 49899.56
## 0.151292309 93730.59 0.1295569 49952.04
## 0.163578472 93852.95 0.1280519 50035.54
## 0.229814705 94748.74 0.1185728 50624.35
## 0.253764323 95080.57 0.1158121 50837.50
## 0.276223923 95389.12 0.1135843 51048.95
## 0.302669838 95756.73 0.1111571 51302.03
## 0.323647498 96066.54 0.1093128 51510.11
## 0.326940490 96116.56 0.1090308 51544.47
## 0.344671358 96336.83 0.1077355 51695.82
## 0.367835980 96414.17 0.1072273 51757.26
## 0.377762078 96419.72 0.1071773 51766.21
## 0.390114318 96426.42 0.1071227 51777.24
## 0.411020396 96436.48 0.1070432 51795.13
## 0.445884952 96453.47 0.1069152 51824.67
## 0.464355299 96463.50 0.1068424 51840.57
## 0.477400043 96470.78 0.1067908 51851.79
## 0.486469708 96475.92 0.1067549 51859.60
## 0.494293786 96480.42 0.1067239 51866.33
## 0.513477667 96491.70 0.1066476 51883.38
## 0.519650466 96495.40 0.1066231 51888.94
## 0.566353685 96524.53 0.1064354 51931.31
## 0.604193960 96549.55 0.1062834 51965.37
## 0.614928176 96556.88 0.1062401 51975.03
## 0.672109262 96597.67 0.1060088 52026.50
## 0.682619243 96605.49 0.1059661 52035.96
## 0.695096032 96614.89 0.1059154 52047.20
## 0.698460856 96617.45 0.1059017 52050.26
## 0.707444218 96624.33 0.1058651 52058.44
## 0.723369671 96636.70 0.1058002 52072.95
## 0.744971777 96653.85 0.1057120 52093.26
## 0.754569635 96661.60 0.1056727 52102.39
## 0.767249841 96671.96 0.1056208 52114.45
## 0.796108794 96696.06 0.1055025 52141.90
## 0.798804601 96698.35 0.1054914 52144.46
## 0.807645044 96705.89 0.1054551 52152.87
## 0.840740281 96734.76 0.1053190 52184.35
## 0.850337762 96743.30 0.1052794 52193.48
## 0.927205836 96814.61 0.1049620 52266.60
## 0.947604574 96834.38 0.1048776 52286.00
## 0.951166808 96837.87 0.1048628 52289.39
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.005943615.
Modelo KNN:
# Show the resampling results for every k tried and the one selected.
print(modeloKnn)
## k-Nearest Neighbors
##
## 4152 samples
## 23 predictor
##
## Pre-processing: scaled (34), centered (34), remove (192)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 323, 324, 324, 323, 322
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 2 95982.45 0.1282634 46202.91
## 3 83967.54 0.1786967 42435.06
## 11 74398.27 0.2121227 39929.43
## 16 73197.71 0.2327735 39015.45
## 21 72779.20 0.2451751 38746.95
## 24 72831.85 0.2465568 38738.78
## 25 72870.78 0.2466117 38851.32
## 28 72974.48 0.2473753 38715.29
## 29 72989.32 0.2479031 38731.71
## 33 73116.56 0.2482141 39080.50
## 36 73292.43 0.2450937 39253.20
## 42 73648.39 0.2396234 39655.74
## 43 73624.88 0.2422435 39670.31
## 48 73708.65 0.2459433 39694.96
## 50 73739.91 0.2484549 39756.60
## 52 73918.77 0.2429786 39873.28
## 53 73949.89 0.2439150 39863.45
## 56 74011.78 0.2462706 40022.53
## 61 74351.30 0.2378904 40264.89
## 63 74456.98 0.2369678 40258.10
## 66 74654.45 0.2321592 40434.99
## 68 74782.63 0.2300014 40532.93
## 77 74901.94 0.2346986 40670.60
## 83 74996.46 0.2360150 40778.30
## 84 75094.26 0.2323799 40858.80
## 90 75261.73 0.2345475 40936.50
## 91 75301.12 0.2332870 41031.70
## 93 75285.43 0.2356736 41066.28
## 96 75348.86 0.2324535 41192.42
## 112 75925.60 0.2196537 41715.20
## 114 75963.71 0.2207351 41784.09
## 115 75980.47 0.2219083 41807.31
## 117 76041.54 0.2214600 41847.31
## 120 76095.84 0.2228208 41947.45
## 125 76149.63 0.2246522 42054.77
## 126 76171.97 0.2241664 42072.09
## 127 76200.78 0.2239581 42099.18
## 128 76196.39 0.2244830 42105.15
## 133 76295.46 0.2245279 42285.61
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 21.
Uma forma de avaliarmos qual o melhor modelo gerado é comparar o RMSE. O RMSE (Root Mean Square Error) é uma medida frequentemente usada que calcula a diferença entre os valores previstos por um modelo e os valores realmente observados da variável em estudo. Desta forma, percebemos que o modelo com menor RMSE é o KNN, com RMSE de 72779.20 utilizando k = 21.
Iremos agora comparar a importância das variáveis para os modelos Ridge e Lasso:
Importância das variáveis para o modelo Lasso:
# Plot the variable-importance scores computed for the lasso fit.
# NOTE(review): if caret has no model-specific importance for this method,
# varImp() may be falling back to a model-independent filter score -- confirm
# before interpreting these scores as properties of the lasso itself.
modeloLasso %>%
  varImp() %>%
  ggplot()
Importância das variáveis para o modelo Ridge:
# Plot the variable-importance scores computed for the ridge fit.
# NOTE(review): if caret has no model-specific importance for this method,
# varImp() may be falling back to a model-independent filter score -- confirm
# before interpreting these scores as properties of the ridge fit itself.
modeloRidge %>%
  varImp() %>%
  ggplot()
Pelos gráficos acima percebemos que as variáveis possuem a mesma importância tanto para o modelo Lasso quanto para o modelo Ridge. As variáveis mais importantes segundo nossos modelos são: total_receita, recursos_de_partidos, recursos_de_pessoas_juridicas e total_despesa.
Analisando os valores dos coeficientes de cada variável para o modelo Lasso:
# Extract the coefficients of the final lasso fit at the selected `fraction`
# value (the tuning parameter stored in bestTune).
predict.enet(
  modeloLasso$finalModel,
  s = modeloLasso$bestTune$fraction,
  mode = "fraction",
  type = "coefficients"
)
## $s
## [1] 0.005943615
##
## $fraction
## 0
## 0.005943615
##
## $mode
## [1] "fraction"
##
## $coefficients
## sequencial_candidato
## 0.000
## numero_cadidato
## 0.000
## UFBA
## 0.000
## UFMG
## 0.000
## UFPR
## 0.000
## UFRJ
## 0.000
## UFRS
## 0.000
## UFSP
## 0.000
## partidoPMDB
## 0.000
## partidoPSB
## 0.000
## partidoPSD
## 0.000
## partidoPSDB
## 0.000
## partidoPT
## 0.000
## setor_economico_receitaAtividades de organizações políticas
## 0.000
## quantidade_doacoes
## 0.000
## quantidade_doadores
## 0.000
## total_receita
## 0.000
## media_receita
## 0.000
## recursos_de_outros_candidatos.comites
## 19256.837
## recursos_de_partidos
## 0.000
## recursos_de_pessoas_físicas
## 0.000
## recursos_de_pessoas_juridicas
## 0.000
## recursos_proprios
## 0.000
## quantidade_despesas
## 9853.591
## quantidade_fornecedores
## 0.000
## total_despesa
## 0.000
## media_despesa
## 0.000
## idade
## 0.000
## sexoMASCULINO
## 0.000
## grauENSINO MÉDIO COMPLETO
## 0.000
## grauSUPERIOR COMPLETO
## 0.000
## grauSUPERIOR INCOMPLETO
## 0.000
## estado_civilDIVORCIADO(A)
## 0.000
## estado_civilSOLTEIRO(A)
## 0.000
Pela tabela acima percebemos que quase todas as variáveis possuem coeficiente zero: com fraction = 0.005943615, apenas recursos_de_outros_candidatos.comites e quantidade_despesas têm coeficiente diferente de zero. As variáveis com coeficiente zero são as que foram descartadas pelo modelo Lasso; desta forma, valores como estado_civil DIVORCIADO(A), grau ENSINO MÉDIO COMPLETO, idade, total_despesa, quantidade_fornecedores, recursos_de_pessoas_juridicas, recursos_de_pessoas_físicas, quantidade_doacoes, setor_economico_receita Atividades de organizações políticas, partido PSB, partido PT, UF RS, UF MG, numero_cadidato e sequencial_candidato são irrelevantes para o modelo de predição.
Utilizaremos agora o melhor modelo de predição e iremos realizar o treino do mesmo utilizando todos os dados disponíveis.
# Refit the selected KNN model on the full data set.
#
# The neighbour count already chosen by the cross-validated search
# (modeloKnn$bestTune) is passed as a one-row tuning grid and resampling is
# switched off, so the model is fitted exactly once with that k. Calling
# train() with only `tuneLength = 50` would instead run a fresh search over
# 50 values of k under train()'s default resampling and could end up with a
# different k than the one reported above.
modeloKnn <- train(
  votos ~ .,
  data = dados,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = trainControl(method = "none"),
  tuneGrid = modeloKnn$bestTune
)

# In-sample predictions: the model is applied to the same data it was
# fitted on, so these values are not an estimate of out-of-sample error.
predictions <- predict(modeloKnn, dados)
predictions
## [1] 140961.03 57056.08 104744.30 28522.70 44019.68 37461.32 61837.22
## [8] 40785.92 63740.84 76100.70 122741.49 50706.30 56892.05 61012.32
## [15] 49138.84 50188.46 33609.62 74153.62 68238.03 86967.78 80455.38
## [22] 69398.03 54469.86 69233.97 118932.51 67972.62 33344.38 110752.81
## [29] 45297.84 59863.35 48078.54 124010.24 40491.92 68488.63 61212.16
## [36] 57359.46 43975.05 34275.49 77933.65 44629.78 42410.08 56830.11
## [43] 44572.92 59824.27 41327.43 45042.76 130514.38 75693.78 69157.19
## [50] 67287.76 57450.84 69224.24 48742.32 98217.22 37200.32 70785.59
## [57] 56809.08 77738.49 42025.86 47003.27 68362.14 71530.68 123384.43
## [64] 66125.32 63208.46 69745.41 38196.89 62324.81 44513.81 133246.24
## [71] 78663.03 71550.68 56605.95 33341.22 59917.76 55510.73 87425.51
## [78] 77194.97 87472.97 45142.76 58131.54 62217.30 84521.62 85967.65
## [85] 32285.08 47305.08 59651.11 58870.16 65200.14 36703.92 80330.57
## [92] 46946.78 51911.05 72703.08 81295.86 52367.11 45230.95 73845.62
## [99] 89862.03 58319.73 57986.65 123542.14 47573.24 158522.49 74422.30
## [106] 40868.08 39039.49 32220.24 48746.22 42625.54 42991.43 43497.84
## [113] 47524.00 61218.03 62813.70 55832.00 51671.68 51701.05 37393.97
## [120] 40730.19 58465.19 54545.35 159081.95 29386.73 37220.03 68925.49
## [127] 100485.27 99581.14 35659.54 53764.38 44865.43 37859.08 55294.68
## [134] 129315.81 50360.27 39724.35 42187.32 56820.84 47863.08 62823.43
## [141] 53408.32 127580.14 108427.16 55449.59 102473.41 40393.57 88970.70
## [148] 83288.95 63840.16 46116.59 67804.08 37942.32 80175.92 97004.86
## [155] 96933.35 100926.30 62410.59 60618.38 65732.92 93801.11 63693.27
## [162] 43818.16 51251.38 44876.51 44920.38 43621.97 60286.35 36657.84
## [169] 28450.51 83895.16 70853.70 64659.19 39663.65 44142.76 58409.49
## [176] 54713.22 49107.57 37120.03 65979.86 40949.05 79605.51 47721.30
## [183] 60691.59 57688.08 75654.24 60791.89 103578.14 48431.43 56335.32
## [190] 34623.27 36811.38 37666.11 49301.51 53376.86 69483.81 87816.68
## [197] 47961.16 68703.16 47529.86 116427.59 52808.54 33386.57 58686.05
## [204] 33471.92 65628.16 135611.76 40187.22 40601.05 86792.27 36795.76
## [211] 128897.73 51939.89 72401.46 54637.54 42013.76 34783.35 56579.76
## [218] 110594.95 69059.57 96185.73 39305.16 50611.32 34984.73 98342.65
## [225] 108703.27 41436.68 56455.76 32391.00 79328.41 43007.73 66251.57
## [232] 41953.22 70475.05 98165.62 47610.14 50349.13 42802.73 118003.24
## [239] 45863.38 36669.00 65757.32 72064.59 51218.84 82231.65 77718.27
## [246] 61792.84 95067.73 86763.00 63316.16 90634.19 35648.97 44518.05
## [253] 50651.97 46937.46 49528.70 47099.95 34262.92 39025.57 65791.54
## [260] 49962.19 60334.89 45866.73 35531.27 72067.73 93549.43 56012.95
## [267] 108889.95 105885.46 49439.32 31662.51 135641.95 114391.97 74971.54
## [274] 71347.51 29735.76 56306.49 44555.00 40460.24 37779.92 62101.78
## [281] 54761.76 72949.35 87373.46 49442.78 49837.16 119391.00 55879.16
## [288] 54627.41 101814.11 37490.51 40631.86 42220.46 53845.43 47198.14
## [295] 128506.54 51184.76 102353.59 37152.27 62560.08 31369.62 39379.11
## [302] 43310.78 45284.86 56999.41 34090.41 38248.54 24647.62 40834.05
## [309] 32559.57 29351.46 44237.97 62519.38 70721.70 30236.97 57658.24
## [316] 37962.97 73830.38 95241.24 45600.05 55316.16 62647.46 62920.97
## [323] 48085.24 40734.30 70690.27 48870.97 65556.62 94089.05 83859.92
## [330] 35914.51 51184.30 68830.03 90738.59 40505.97 38900.92 93709.89
## [337] 61561.41 99325.35 94041.92 41872.19 60806.22 30091.03 31741.24
## [344] 61814.30 81321.41 51766.08 100657.73 34515.19 89081.73 57559.22
## [351] 67291.70 41347.73 123278.14 44446.08 46256.78 55823.11 94660.11
## [358] 42661.76 50960.81 42797.81 38965.97 47815.30 41208.81 62247.00
## [365] 39298.70 70246.05 32379.84 38735.27 44735.08 59350.16 76768.78
## [372] 52802.59 50536.43 69313.22 65786.97 47579.65 35850.73 51537.00
## [379] 64684.73 48947.27 53940.22 67402.51 73415.81 70957.92 84888.46
## [386] 111786.32 42491.49 36177.32 37783.92 107341.03 46184.62 37829.54
## [393] 91275.32 38469.86 40721.65 63105.46 95180.00 62001.73 88563.16
## [400] 62030.32 67817.78 37275.81 34584.51 86924.11
Agora iremos treinar nosso modelo com os dados de treino da competição do Kaggle e gerar as predições para os dados de teste.
# Kaggle submission: fit a KNN model on train.csv, predict `votos` for
# test.csv and write the (ID, votos) pairs to kaggle.csv.

# The same columns are dropped from both files so the predictors line up.
# NOTE(review): any identifier column left in train.csv (e.g. ID) is used as
# a predictor by `votos ~ .` -- confirm that this is intended.
dadosKaggleTreino <- read.csv("train.csv") %>%
  select(-cargo, -nome, -setor_economico_receita, -setor_economico_despesa)
dadosKaggleTeste <- read.csv("test.csv") %>%
  select(-cargo, -nome, -setor_economico_receita, -setor_economico_despesa)

# Missing values are replaced with 0 in both sets.
dadosKaggleTeste[is.na(dadosKaggleTeste)] <- 0
dadosKaggleTreino[is.na(dadosKaggleTreino)] <- 0

# Tune k with the same 5-fold CV / random-search setup (`fitControl`) used
# for the model comparison, instead of `tuneLength = 1`, which evaluates a
# single k and therefore performs no tuning. The seed makes the folds and
# the sampled k candidates reproducible.
set.seed(123)
modeloKnnKaggle <- train(
  votos ~ .,
  data = dadosKaggleTreino,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)

predicoes <- predict(modeloKnnKaggle, dadosKaggleTeste)

# Guard against a silently recycled column: `$<-` on a data frame repeats a
# shorter vector whenever its length divides the row count, which would
# attach predictions to the wrong IDs.
stopifnot(length(predicoes) == nrow(dadosKaggleTeste))

dadosKaggleTeste$votos <- predicoes
dadosKaggleTeste <- dadosKaggleTeste %>% select(ID, votos)
write.csv(dadosKaggleTeste, file = "kaggle.csv", row.names = FALSE)