Tendo por base os dados da votação para Deputado Federal nas eleições de 2014, queremos gerar um modelo de predição de votos. Com os dados disponíveis, treinamos modelos de regressão regularizada (Ridge e Lasso) e um modelo KNN. Dispomos de 23 variáveis que podem explicar a quantidade de votos de cada candidato e utilizaremos todas elas para treinar os modelos.

# Read the 2014 election results (file declared as Latin-1) and discard the
# `cargo` and `nome` columns before modelling.
dados <- read.csv("eleicoes2014.csv", fileEncoding = "latin1") %>%
  select(-cargo, -nome)

# Resampling scheme passed to the train() calls below: 5-fold
# cross-validation with randomly drawn hyper-parameter candidates.
fitControl <- trainControl(
  search = "random",
  number = 5,
  method = "cv"
)


# Ridge regression of `votos` on every remaining column of `dados`.
# The RNG is seeded right before training so that the cross-validation folds
# and the randomly drawn lambda candidates are identical on every run;
# without a seed the reported RMSE changes from one execution to the next.
# Predictors are scaled, centered and near-zero-variance columns are removed.
# NOTE(review): na.omit drops every row that contains a missing value, which
# may discard a large share of the candidates -- confirm this is intended.
set.seed(123)
modeloRidge <- train(
  votos ~ .,
  data = dados,
  method = "ridge",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)

# Lasso regression of `votos` on every remaining column of `dados`.
# The RNG is seeded right before training so that the cross-validation folds
# and the randomly drawn `fraction` candidates are identical on every run;
# without a seed the reported RMSE changes from one execution to the next.
# Predictors are scaled, centered and near-zero-variance columns are removed.
# NOTE(review): na.omit drops every row that contains a missing value, which
# may discard a large share of the candidates -- confirm this is intended.
set.seed(123)
modeloLasso <- train(
  votos ~ .,
  data = dados,
  method = "lasso",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)

# k-nearest-neighbours regression of `votos` on every remaining column of
# `dados`.
# The RNG is seeded right before training so that the cross-validation folds
# and the randomly drawn k candidates are identical on every run; without a
# seed the reported RMSE changes from one execution to the next.
# Predictors are scaled, centered and near-zero-variance columns are removed.
# NOTE(review): na.omit drops every row that contains a missing value, which
# may discard a large share of the candidates -- confirm this is intended.
set.seed(123)
modeloKnn <- train(
  votos ~ .,
  data = dados,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)

Iremos agora avaliar cada modelo gerado:

Modelo Ridge:

# Show the resampling summary of the ridge fit.
print(modeloRidge)
## Ridge Regression 
## 
## 4152 samples
##   23 predictor
## 
## Pre-processing: scaled (34), centered (34), remove (192) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 322, 324, 322, 324, 324 
## Resampling results across tuning parameters:
## 
##   lambda        RMSE       Rsquared   MAE     
##   1.744541e-05   90997.02  0.1520829  46529.03
##   2.061781e-05   90999.47  0.1519737  46519.63
##   2.429987e-05   91001.02  0.1518787  46510.33
##   3.521466e-05   91000.72  0.1517075  46489.38
##   5.393777e-05   90991.90  0.1515809  46466.38
##   6.120874e-05   90987.04  0.1515572  46459.78
##   7.643879e-05   90975.44  0.1515307  46448.59
##   8.794180e-05   90965.84  0.1515240  46441.55
##   1.094079e-04   90946.77  0.1515293  46430.43
##   1.207521e-04   90936.30  0.1515383  46425.27
##   1.478632e-04   90910.66  0.1515701  46414.23
##   2.871048e-04   90775.86  0.1518234  46369.94
##   3.549495e-04   90711.15  0.1519642  46351.41
##   4.024354e-04   90666.61  0.1520646  46339.16
##   5.793576e-04   90506.66  0.1524416  46296.01
##   1.787711e-03   89633.74  0.1547913  46053.33
##   1.949108e-03   89539.74  0.1550699  46026.26
##   2.007038e-03   89507.01  0.1551680  46016.75
##   2.260217e-03   89369.83  0.1555856  45976.48
##   3.351117e-03   88867.63  0.1571962  45821.39
##   3.728503e-03   88720.28  0.1576916  45773.68
##   4.263354e-03   88529.25  0.1583480  45710.20
##   5.246455e-03   88221.76  0.1594351  45604.21
##   6.749798e-03   87831.39  0.1608618  45461.07
##   8.580583e-03   87444.06  0.1623207  45309.16
##   1.183486e-02   86903.01  0.1644234  45079.81
##   1.434914e-02   86567.51  0.1657698  44929.25
##   2.598279e-02   85495.58  0.1704324  44420.71
##   2.615636e-02   85483.42  0.1704899  44414.54
##   3.180911e-02   85127.18  0.1722423  44226.24
##   3.184488e-02   85125.15  0.1722528  44225.12
##   4.426055e-02   84540.32  0.1754987  43891.78
##   8.328461e-02   83540.68  0.1830013  43216.59
##   8.705405e-02   83479.71  0.1836004  43168.74
##   1.100822e-01   83182.28  0.1869646  42956.08
##   1.282933e-01   83014.79  0.1893414  42832.83
##   1.540841e-01   82847.21  0.1923909  42690.31
##   1.636393e-01   82801.07  0.1934439  42642.61
##   1.882913e-01   82713.09  0.1960017  42535.65
##   2.316638e-01   82642.00  0.2000476  42412.97
##   2.431698e-01   82637.22  0.2010400  42385.54
##   3.142448e-01   82704.14  0.2065734  42298.04
##   3.923185e-01   82918.83  0.2117172  42323.12
##   1.131206e+00   87849.14  0.2377533  45205.93
##   1.242620e+00   88788.81  0.2398525  45959.53
##   1.286476e+00   89164.32  0.2406063  46260.43
##   2.060445e+00   95970.46  0.2494681  51791.76
##   4.534857e+00  114895.51  0.2580413  68306.89
##   6.072819e+00  123726.60  0.2595815  76153.48
##   6.309782e+00  124921.04  0.2597368  77198.88
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.2431698.

Modelo Lasso:

# Show the resampling summary of the lasso fit.
print(modeloLasso)
## The lasso 
## 
## 4152 samples
##   23 predictor
## 
## Pre-processing: scaled (34), centered (34), remove (192) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 324, 323, 324, 323, 322 
## Resampling results across tuning parameters:
## 
##   fraction     RMSE      Rsquared   MAE     
##   0.005943615  79331.04  0.1009459  47030.81
##   0.013699861  82333.42  0.1252992  45633.77
##   0.024702298  83729.40  0.1461212  45265.44
##   0.050887137  87063.53  0.1477040  47185.16
##   0.057572983  88391.60  0.1447525  47628.89
##   0.067466039  90498.46  0.1406493  48452.42
##   0.079759124  91459.51  0.1398032  48988.47
##   0.080964732  91513.01  0.1397579  49005.14
##   0.088989410  91946.82  0.1381290  49150.54
##   0.142677798  93652.80  0.1305405  49899.56
##   0.151292309  93730.59  0.1295569  49952.04
##   0.163578472  93852.95  0.1280519  50035.54
##   0.229814705  94748.74  0.1185728  50624.35
##   0.253764323  95080.57  0.1158121  50837.50
##   0.276223923  95389.12  0.1135843  51048.95
##   0.302669838  95756.73  0.1111571  51302.03
##   0.323647498  96066.54  0.1093128  51510.11
##   0.326940490  96116.56  0.1090308  51544.47
##   0.344671358  96336.83  0.1077355  51695.82
##   0.367835980  96414.17  0.1072273  51757.26
##   0.377762078  96419.72  0.1071773  51766.21
##   0.390114318  96426.42  0.1071227  51777.24
##   0.411020396  96436.48  0.1070432  51795.13
##   0.445884952  96453.47  0.1069152  51824.67
##   0.464355299  96463.50  0.1068424  51840.57
##   0.477400043  96470.78  0.1067908  51851.79
##   0.486469708  96475.92  0.1067549  51859.60
##   0.494293786  96480.42  0.1067239  51866.33
##   0.513477667  96491.70  0.1066476  51883.38
##   0.519650466  96495.40  0.1066231  51888.94
##   0.566353685  96524.53  0.1064354  51931.31
##   0.604193960  96549.55  0.1062834  51965.37
##   0.614928176  96556.88  0.1062401  51975.03
##   0.672109262  96597.67  0.1060088  52026.50
##   0.682619243  96605.49  0.1059661  52035.96
##   0.695096032  96614.89  0.1059154  52047.20
##   0.698460856  96617.45  0.1059017  52050.26
##   0.707444218  96624.33  0.1058651  52058.44
##   0.723369671  96636.70  0.1058002  52072.95
##   0.744971777  96653.85  0.1057120  52093.26
##   0.754569635  96661.60  0.1056727  52102.39
##   0.767249841  96671.96  0.1056208  52114.45
##   0.796108794  96696.06  0.1055025  52141.90
##   0.798804601  96698.35  0.1054914  52144.46
##   0.807645044  96705.89  0.1054551  52152.87
##   0.840740281  96734.76  0.1053190  52184.35
##   0.850337762  96743.30  0.1052794  52193.48
##   0.927205836  96814.61  0.1049620  52266.60
##   0.947604574  96834.38  0.1048776  52286.00
##   0.951166808  96837.87  0.1048628  52289.39
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.005943615.

Modelo KNN:

# Show the resampling summary of the KNN fit.
print(modeloKnn)
## k-Nearest Neighbors 
## 
## 4152 samples
##   23 predictor
## 
## Pre-processing: scaled (34), centered (34), remove (192) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 323, 324, 324, 323, 322 
## Resampling results across tuning parameters:
## 
##   k    RMSE      Rsquared   MAE     
##     2  95982.45  0.1282634  46202.91
##     3  83967.54  0.1786967  42435.06
##    11  74398.27  0.2121227  39929.43
##    16  73197.71  0.2327735  39015.45
##    21  72779.20  0.2451751  38746.95
##    24  72831.85  0.2465568  38738.78
##    25  72870.78  0.2466117  38851.32
##    28  72974.48  0.2473753  38715.29
##    29  72989.32  0.2479031  38731.71
##    33  73116.56  0.2482141  39080.50
##    36  73292.43  0.2450937  39253.20
##    42  73648.39  0.2396234  39655.74
##    43  73624.88  0.2422435  39670.31
##    48  73708.65  0.2459433  39694.96
##    50  73739.91  0.2484549  39756.60
##    52  73918.77  0.2429786  39873.28
##    53  73949.89  0.2439150  39863.45
##    56  74011.78  0.2462706  40022.53
##    61  74351.30  0.2378904  40264.89
##    63  74456.98  0.2369678  40258.10
##    66  74654.45  0.2321592  40434.99
##    68  74782.63  0.2300014  40532.93
##    77  74901.94  0.2346986  40670.60
##    83  74996.46  0.2360150  40778.30
##    84  75094.26  0.2323799  40858.80
##    90  75261.73  0.2345475  40936.50
##    91  75301.12  0.2332870  41031.70
##    93  75285.43  0.2356736  41066.28
##    96  75348.86  0.2324535  41192.42
##   112  75925.60  0.2196537  41715.20
##   114  75963.71  0.2207351  41784.09
##   115  75980.47  0.2219083  41807.31
##   117  76041.54  0.2214600  41847.31
##   120  76095.84  0.2228208  41947.45
##   125  76149.63  0.2246522  42054.77
##   126  76171.97  0.2241664  42072.09
##   127  76200.78  0.2239581  42099.18
##   128  76196.39  0.2244830  42105.15
##   133  76295.46  0.2245279  42285.61
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 21.

Uma forma de avaliarmos qual é o melhor modelo gerado é comparar o RMSE. O RMSE (Root Mean Square Error) é uma medida frequentemente usada que resume a diferença entre os valores previstos por um modelo e os valores realmente observados da variável em estudo; quanto menor, melhor. Desta forma percebemos que o modelo com menor RMSE é o KNN, com RMSE de 72779.20 utilizando k = 21.

Iremos agora comparar a importância das variáveis para os modelos Ridge e Lasso:

Importância das variáveis para o modelo Lasso:

# Plot caret's variable-importance scores for the lasso fit.
# NOTE(review): presumably caret falls back to a model-free (filter)
# importance for this method -- confirm before interpreting the ranking.
modeloLasso %>%
  varImp() %>%
  ggplot()

Importância das variáveis para o modelo Ridge:

# Plot caret's variable-importance scores for the ridge fit.
# NOTE(review): presumably caret falls back to a model-free (filter)
# importance for this method -- confirm before interpreting the ranking.
modeloRidge %>%
  varImp() %>%
  ggplot()

Pelos gráficos acima percebemos que as variáveis possuem a mesma importância tanto para o modelo Lasso quanto para o modelo Ridge. As variáveis mais importantes segundo nossos modelos são: total_receita, recursos_de_partidos, recursos_de_pessoas_juridicas e total_despesa.

Analisando os valores dos coeficientes de cada variável para o modelo Lasso:

# Extract the coefficients of the final lasso fit at the tuned `fraction`.
predict.enet(
  modeloLasso$finalModel,
  s = modeloLasso$bestTune$fraction,
  mode = "fraction",
  type = "coefficients"
)
## $s
## [1] 0.005943615
## 
## $fraction
##           0 
## 0.005943615 
## 
## $mode
## [1] "fraction"
## 
## $coefficients
##                                        sequencial_candidato 
##                                                       0.000 
##                                             numero_cadidato 
##                                                       0.000 
##                                                        UFBA 
##                                                       0.000 
##                                                        UFMG 
##                                                       0.000 
##                                                        UFPR 
##                                                       0.000 
##                                                        UFRJ 
##                                                       0.000 
##                                                        UFRS 
##                                                       0.000 
##                                                        UFSP 
##                                                       0.000 
##                                                 partidoPMDB 
##                                                       0.000 
##                                                  partidoPSB 
##                                                       0.000 
##                                                  partidoPSD 
##                                                       0.000 
##                                                 partidoPSDB 
##                                                       0.000 
##                                                   partidoPT 
##                                                       0.000 
## setor_economico_receitaAtividades de organizações políticas 
##                                                       0.000 
##                                          quantidade_doacoes 
##                                                       0.000 
##                                         quantidade_doadores 
##                                                       0.000 
##                                               total_receita 
##                                                       0.000 
##                                               media_receita 
##                                                       0.000 
##                       recursos_de_outros_candidatos.comites 
##                                                   19256.837 
##                                        recursos_de_partidos 
##                                                       0.000 
##                                 recursos_de_pessoas_físicas 
##                                                       0.000 
##                               recursos_de_pessoas_juridicas 
##                                                       0.000 
##                                           recursos_proprios 
##                                                       0.000 
##                                         quantidade_despesas 
##                                                    9853.591 
##                                     quantidade_fornecedores 
##                                                       0.000 
##                                               total_despesa 
##                                                       0.000 
##                                               media_despesa 
##                                                       0.000 
##                                                       idade 
##                                                       0.000 
##                                               sexoMASCULINO 
##                                                       0.000 
##                                   grauENSINO MÉDIO COMPLETO 
##                                                       0.000 
##                                       grauSUPERIOR COMPLETO 
##                                                       0.000 
##                                     grauSUPERIOR INCOMPLETO 
##                                                       0.000 
##                                   estado_civilDIVORCIADO(A) 
##                                                       0.000 
##                                     estado_civilSOLTEIRO(A) 
##                                                       0.000

Pela tabela acima percebemos que a maioria das variáveis possui coeficiente zero. As variáveis com coeficiente zero são aquelas descartadas pelo modelo Lasso; desta forma, estado_civil DIVORCIADO(A), grau ENSINO MÉDIO COMPLETO, idade, total_despesa, quantidade_fornecedores, recursos_de_pessoas_juridicas, recursos_de_pessoas_físicas, quantidade_doacoes, setor_economico_receita Atividades de organizações políticas, partido PSB, partido PT, UF RS, UF MG, numero_candidato e sequencial_candidato, entre outras, são irrelevantes para o modelo de predição. Na tabela acima, apenas recursos_de_outros_candidatos.comites e quantidade_despesas mantêm coeficiente diferente de zero.

Utilizaremos agora o melhor modelo de predição e iremos realizar o treino do mesmo utilizando todos os dados disponíveis.

# Refit the selected KNN model on `dados` and predict the votes of the same
# candidates.
# The number of neighbours already selected for `modeloKnn` is reused as-is:
# with trainControl(method = "none") caret performs no resampling and fits a
# single model. The previous call re-tuned 50 candidate values with caret's
# default bootstrap, which is slow and can select a different k than the one
# analysed earlier in the report.
modeloKnn <- train(
  votos ~ .,
  data = dados,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = trainControl(method = "none"),
  tuneGrid = modeloKnn$bestTune
)

# In-sample predictions for the rows of `dados`.
predictions <- predict(modeloKnn, dados)

predictions
##   [1] 140961.03  57056.08 104744.30  28522.70  44019.68  37461.32  61837.22
##   [8]  40785.92  63740.84  76100.70 122741.49  50706.30  56892.05  61012.32
##  [15]  49138.84  50188.46  33609.62  74153.62  68238.03  86967.78  80455.38
##  [22]  69398.03  54469.86  69233.97 118932.51  67972.62  33344.38 110752.81
##  [29]  45297.84  59863.35  48078.54 124010.24  40491.92  68488.63  61212.16
##  [36]  57359.46  43975.05  34275.49  77933.65  44629.78  42410.08  56830.11
##  [43]  44572.92  59824.27  41327.43  45042.76 130514.38  75693.78  69157.19
##  [50]  67287.76  57450.84  69224.24  48742.32  98217.22  37200.32  70785.59
##  [57]  56809.08  77738.49  42025.86  47003.27  68362.14  71530.68 123384.43
##  [64]  66125.32  63208.46  69745.41  38196.89  62324.81  44513.81 133246.24
##  [71]  78663.03  71550.68  56605.95  33341.22  59917.76  55510.73  87425.51
##  [78]  77194.97  87472.97  45142.76  58131.54  62217.30  84521.62  85967.65
##  [85]  32285.08  47305.08  59651.11  58870.16  65200.14  36703.92  80330.57
##  [92]  46946.78  51911.05  72703.08  81295.86  52367.11  45230.95  73845.62
##  [99]  89862.03  58319.73  57986.65 123542.14  47573.24 158522.49  74422.30
## [106]  40868.08  39039.49  32220.24  48746.22  42625.54  42991.43  43497.84
## [113]  47524.00  61218.03  62813.70  55832.00  51671.68  51701.05  37393.97
## [120]  40730.19  58465.19  54545.35 159081.95  29386.73  37220.03  68925.49
## [127] 100485.27  99581.14  35659.54  53764.38  44865.43  37859.08  55294.68
## [134] 129315.81  50360.27  39724.35  42187.32  56820.84  47863.08  62823.43
## [141]  53408.32 127580.14 108427.16  55449.59 102473.41  40393.57  88970.70
## [148]  83288.95  63840.16  46116.59  67804.08  37942.32  80175.92  97004.86
## [155]  96933.35 100926.30  62410.59  60618.38  65732.92  93801.11  63693.27
## [162]  43818.16  51251.38  44876.51  44920.38  43621.97  60286.35  36657.84
## [169]  28450.51  83895.16  70853.70  64659.19  39663.65  44142.76  58409.49
## [176]  54713.22  49107.57  37120.03  65979.86  40949.05  79605.51  47721.30
## [183]  60691.59  57688.08  75654.24  60791.89 103578.14  48431.43  56335.32
## [190]  34623.27  36811.38  37666.11  49301.51  53376.86  69483.81  87816.68
## [197]  47961.16  68703.16  47529.86 116427.59  52808.54  33386.57  58686.05
## [204]  33471.92  65628.16 135611.76  40187.22  40601.05  86792.27  36795.76
## [211] 128897.73  51939.89  72401.46  54637.54  42013.76  34783.35  56579.76
## [218] 110594.95  69059.57  96185.73  39305.16  50611.32  34984.73  98342.65
## [225] 108703.27  41436.68  56455.76  32391.00  79328.41  43007.73  66251.57
## [232]  41953.22  70475.05  98165.62  47610.14  50349.13  42802.73 118003.24
## [239]  45863.38  36669.00  65757.32  72064.59  51218.84  82231.65  77718.27
## [246]  61792.84  95067.73  86763.00  63316.16  90634.19  35648.97  44518.05
## [253]  50651.97  46937.46  49528.70  47099.95  34262.92  39025.57  65791.54
## [260]  49962.19  60334.89  45866.73  35531.27  72067.73  93549.43  56012.95
## [267] 108889.95 105885.46  49439.32  31662.51 135641.95 114391.97  74971.54
## [274]  71347.51  29735.76  56306.49  44555.00  40460.24  37779.92  62101.78
## [281]  54761.76  72949.35  87373.46  49442.78  49837.16 119391.00  55879.16
## [288]  54627.41 101814.11  37490.51  40631.86  42220.46  53845.43  47198.14
## [295] 128506.54  51184.76 102353.59  37152.27  62560.08  31369.62  39379.11
## [302]  43310.78  45284.86  56999.41  34090.41  38248.54  24647.62  40834.05
## [309]  32559.57  29351.46  44237.97  62519.38  70721.70  30236.97  57658.24
## [316]  37962.97  73830.38  95241.24  45600.05  55316.16  62647.46  62920.97
## [323]  48085.24  40734.30  70690.27  48870.97  65556.62  94089.05  83859.92
## [330]  35914.51  51184.30  68830.03  90738.59  40505.97  38900.92  93709.89
## [337]  61561.41  99325.35  94041.92  41872.19  60806.22  30091.03  31741.24
## [344]  61814.30  81321.41  51766.08 100657.73  34515.19  89081.73  57559.22
## [351]  67291.70  41347.73 123278.14  44446.08  46256.78  55823.11  94660.11
## [358]  42661.76  50960.81  42797.81  38965.97  47815.30  41208.81  62247.00
## [365]  39298.70  70246.05  32379.84  38735.27  44735.08  59350.16  76768.78
## [372]  52802.59  50536.43  69313.22  65786.97  47579.65  35850.73  51537.00
## [379]  64684.73  48947.27  53940.22  67402.51  73415.81  70957.92  84888.46
## [386] 111786.32  42491.49  36177.32  37783.92 107341.03  46184.62  37829.54
## [393]  91275.32  38469.86  40721.65  63105.46  95180.00  62001.73  88563.16
## [400]  62030.32  67817.78  37275.81  34584.51  86924.11

Agora iremos treinar nosso modelo com os dados de treino da competição do Kaggle e gerar as predições para os dados de teste.

# Kaggle submission: fit KNN on the competition training set, predict `votos`
# for the competition test set and write the submission file.

# Both sets lose the same four columns so their predictors line up.
dadosKaggleTreino <- read.csv("train.csv") %>%
  select(-cargo, -nome, -setor_economico_receita, -setor_economico_despesa)

dadosKaggleTeste <- read.csv("test.csv") %>%
  select(-cargo, -nome, -setor_economico_receita, -setor_economico_despesa)

# Missing values are replaced by 0 in both sets.
dadosKaggleTeste[is.na(dadosKaggleTeste)] <- 0
dadosKaggleTreino[is.na(dadosKaggleTreino)] <- 0

# k is tuned with the same 5-fold cross-validation used for the models above.
# The previous tuneLength = 1 made caret consider a single default value of k
# (5, per caret's knn grid definition), so the submission used an untuned
# neighbourhood size far from the range that scored best in the report.
# The seed keeps the folds and the random candidates reproducible.
# NOTE(review): if train.csv also carries the ID column, it enters the model
# as a predictor through `votos ~ .` -- confirm whether that is intended.
set.seed(123)
modeloKnnKaggle <- train(
  votos ~ .,
  data = dadosKaggleTreino,
  method = "knn",
  na.action = na.omit,
  preProcess = c("scale", "center", "nzv"),
  trControl = fitControl,
  tuneLength = 50
)

predicoes <- predict(modeloKnnKaggle, dadosKaggleTeste)

# Keep only the columns written to the submission file.
dadosKaggleTeste$votos <- predicoes
dadosKaggleTeste <- dadosKaggleTeste %>% select(ID, votos)

write.csv(dadosKaggleTeste, file = "kaggle.csv", row.names = FALSE)