Assessment 03

Packages

R

library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
options(warn=-1)
library(MASS)
library(caret)
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
options(warn=0)

Helper functions

# Area under the ROC curve via a rectangle rule; fpr is assumed sorted in
# decreasing order, and the result is returned as a percentage.
auc = function(fpr, tpr){
  a = fpr[1:(length(fpr)-1)]
  b = fpr[2:(length(fpr))]
  round(sum((a-b)*tpr[-length(tpr)]),4)*100
}
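
A quick sanity check on made-up values (illustrative numbers only): a perfect classifier should score 100, while this coarse rectangle rule overestimates the diagonal ROC on a 3-point grid.

auc(fpr = c(1, 0.5, 0), tpr = c(1, 1, 1))   # perfect classifier: 100
auc(fpr = c(1, 0.5, 0), tpr = c(1, 0.5, 0)) # diagonal: 75 here; tends to 50 on finer grids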

Loading the data

DIR = 'data'; FILE = 'data.csv'
URL = file.path(DIR, FILE)
data = read.csv(URL, sep = ';')
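# The first column later shows up as 'ï..Marital.status': the tell-tale of a
# UTF-8 byte-order mark at the start of the file. Assuming that is the cause,
# reading with an explicit encoding would fix the header, e.g.:
# data = read.csv(URL, sep = ';', fileEncoding = 'UTF-8-BOM')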

Splitting off the continuous variables

continuos = c("Previous.qualification..grade.","Admission.grade",names(data)[22:33],
              "Unemployment.rate", "Inflation.rate", "GDP")
data_c = data[continuos]
data = data[,!(colnames(data) %in% continuos)]
dim(data_c)
## [1] 4424   17
colnames(data)
##  [1] "ï..Marital.status"           "Application.mode"           
##  [3] "Application.order"           "Course"                     
##  [5] "Daytime.evening.attendance." "Previous.qualification"     
##  [7] "Nacionality"                 "Mother.s.qualification"     
##  [9] "Father.s.qualification"      "Mother.s.occupation"        
## [11] "Father.s.occupation"         "Displaced"                  
## [13] "Educational.special.needs"   "Debtor"                     
## [15] "Tuition.fees.up.to.date"     "Gender"                     
## [17] "Scholarship.holder"          "Age.at.enrollment"          
## [19] "International"               "Target"
colnames(data_c) = c('PrevQualifiGrade', "AdmissionGrade","1stCredited",
                  "1stEnrolled","1stEvaluations",
                  "1stApproved","1stGrade","1stWithoutEva",
                  "2ndCredited",
                  "2ndEnrolled","2ndEvaluations",
                  "2ndApproved","2ndGrade","2ndWithoutEva",
                  colnames(data_c)[15:17])
attach(data)
attach(data_c)

Pre-Processing

Checking for NaNs

sum(is.na(data)) 
## [1] 0
sum(is.na(data_c)) 
## [1] 0
knitr::kable(summary(data))
|ï..Marital.status |Application.mode |Application.order |Course |Daytime.evening.attendance. |Previous.qualification |Nacionality |Mother.s.qualification |Father.s.qualification |Mother.s.occupation |Father.s.occupation |Displaced |Educational.special.needs |Debtor |Tuition.fees.up.to.date |Gender |Scholarship.holder |Age.at.enrollment |International |Target |
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|
|Min. :1.000 |Min. : 1.00 |Min. :0.000 |Min. : 33 |Min. :0.0000 |Min. : 1.000 |Min. : 1.000 |Min. : 1.00 |Min. : 1.00 |Min. : 0.00 |Min. : 0.00 |Min. :0.0000 |Min. :0.00000 |Min. :0.0000 |Min. :0.0000 |Min. :0.0000 |Min. :0.0000 |Min. :17.00 |Min. :0.00000 |Length:4424 |
|1st Qu.:1.000 |1st Qu.: 1.00 |1st Qu.:1.000 |1st Qu.:9085 |1st Qu.:1.0000 |1st Qu.: 1.000 |1st Qu.: 1.000 |1st Qu.: 2.00 |1st Qu.: 3.00 |1st Qu.: 4.00 |1st Qu.: 4.00 |1st Qu.:0.0000 |1st Qu.:0.00000 |1st Qu.:0.0000 |1st Qu.:1.0000 |1st Qu.:0.0000 |1st Qu.:0.0000 |1st Qu.:19.00 |1st Qu.:0.00000 |Class :character |
|Median :1.000 |Median :17.00 |Median :1.000 |Median :9238 |Median :1.0000 |Median : 1.000 |Median : 1.000 |Median :19.00 |Median :19.00 |Median : 5.00 |Median : 7.00 |Median :1.0000 |Median :0.00000 |Median :0.0000 |Median :1.0000 |Median :0.0000 |Median :0.0000 |Median :20.00 |Median :0.00000 |Mode :character |
|Mean :1.179 |Mean :18.67 |Mean :1.728 |Mean :8857 |Mean :0.8908 |Mean : 4.578 |Mean : 1.873 |Mean :19.56 |Mean :22.28 |Mean : 10.96 |Mean : 11.03 |Mean :0.5484 |Mean :0.01153 |Mean :0.1137 |Mean :0.8807 |Mean :0.3517 |Mean :0.2484 |Mean :23.27 |Mean :0.02486 |NA |
|3rd Qu.:1.000 |3rd Qu.:39.00 |3rd Qu.:2.000 |3rd Qu.:9556 |3rd Qu.:1.0000 |3rd Qu.: 1.000 |3rd Qu.: 1.000 |3rd Qu.:37.00 |3rd Qu.:37.00 |3rd Qu.: 9.00 |3rd Qu.: 9.00 |3rd Qu.:1.0000 |3rd Qu.:0.00000 |3rd Qu.:0.0000 |3rd Qu.:1.0000 |3rd Qu.:1.0000 |3rd Qu.:0.0000 |3rd Qu.:25.00 |3rd Qu.:0.00000 |NA |
|Max. :6.000 |Max. :57.00 |Max. :9.000 |Max. :9991 |Max. :1.0000 |Max. :43.000 |Max. :109.000 |Max. :44.00 |Max. :44.00 |Max. :194.00 |Max. :195.00 |Max. :1.0000 |Max. :1.00000 |Max. :1.0000 |Max. :1.0000 |Max. :1.0000 |Max. :1.0000 |Max. :70.00 |Max. :1.00000 |NA |
knitr::kable(summary(data_c))
|PrevQualifiGrade |AdmissionGrade |1stCredited |1stEnrolled |1stEvaluations |1stApproved |1stGrade |1stWithoutEva |2ndCredited |2ndEnrolled |2ndEvaluations |2ndApproved |2ndGrade |2ndWithoutEva |Unemployment.rate |Inflation.rate |GDP |
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|
|Min. : 95.0 |Min. : 95.0 |Min. : 0.00 |Min. : 0.000 |Min. : 0.000 |Min. : 0.000 |Min. : 0.00 |Min. : 0.0000 |Min. : 0.0000 |Min. : 0.000 |Min. : 0.000 |Min. : 0.000 |Min. : 0.00 |Min. : 0.0000 |Min. : 7.60 |Min. :-0.800 |Min. :-4.060000 |
|1st Qu.:125.0 |1st Qu.:117.9 |1st Qu.: 0.00 |1st Qu.: 5.000 |1st Qu.: 6.000 |1st Qu.: 3.000 |1st Qu.:11.00 |1st Qu.: 0.0000 |1st Qu.: 0.0000 |1st Qu.: 5.000 |1st Qu.: 6.000 |1st Qu.: 2.000 |1st Qu.:10.75 |1st Qu.: 0.0000 |1st Qu.: 9.40 |1st Qu.: 0.300 |1st Qu.:-1.700000 |
|Median :133.1 |Median :126.1 |Median : 0.00 |Median : 6.000 |Median : 8.000 |Median : 5.000 |Median :12.29 |Median : 0.0000 |Median : 0.0000 |Median : 6.000 |Median : 8.000 |Median : 5.000 |Median :12.20 |Median : 0.0000 |Median :11.10 |Median : 1.400 |Median : 0.320000 |
|Mean :132.6 |Mean :127.0 |Mean : 0.71 |Mean : 6.271 |Mean : 8.299 |Mean : 4.707 |Mean :10.64 |Mean : 0.1377 |Mean : 0.5418 |Mean : 6.232 |Mean : 8.063 |Mean : 4.436 |Mean :10.23 |Mean : 0.1503 |Mean :11.57 |Mean : 1.228 |Mean : 0.001969 |
|3rd Qu.:140.0 |3rd Qu.:134.8 |3rd Qu.: 0.00 |3rd Qu.: 7.000 |3rd Qu.:10.000 |3rd Qu.: 6.000 |3rd Qu.:13.40 |3rd Qu.: 0.0000 |3rd Qu.: 0.0000 |3rd Qu.: 7.000 |3rd Qu.:10.000 |3rd Qu.: 6.000 |3rd Qu.:13.33 |3rd Qu.: 0.0000 |3rd Qu.:13.90 |3rd Qu.: 2.600 |3rd Qu.: 1.790000 |
|Max. :190.0 |Max. :190.0 |Max. :20.00 |Max. :26.000 |Max. :45.000 |Max. :26.000 |Max. :18.88 |Max. :12.0000 |Max. :19.0000 |Max. :23.000 |Max. :33.000 |Max. :20.000 |Max. :18.57 |Max. :12.0000 |Max. :16.20 |Max. : 3.700 |Max. : 3.510000 |

We can see that the data contain no NaNs. Moreover, from the summary tables above, no variable shows values outside its plausible range. This range check matters because missing values are often encoded with sentinel numbers far outside a variable's natural range (for example 999 or -1); none of that appears here.
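
One way to automate that range check is to list each variable's observed extremes, where disguised missing codes such as 999 or -1 would stand out immediately (a minimal sketch):

# Observed minimum and maximum of each continuous variable
t(sapply(data_c, range))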

Defining the target variable

Let's define the target variable for our classification problem.

target <- as.factor(data$Target)
levels(target)
## [1] "Dropout"  "Enrolled" "Graduate"

Let's look at the distribution of the target classes against the variables Unemployment.rate, Inflation.rate, and Age.at.enrollment:

  • Unemployment.rate:
boxplot(Unemployment.rate ~ target)

Above we can see that the Dropout and Graduate groups behave similarly with respect to Unemployment.rate, while both differ from the Enrolled class, which shows a narrower interquartile range and hence less within-class variation in the unemployment rate. Note also that the median is the same across the three groups.

  • Inflation.rate:
boxplot(Inflation.rate ~ target)

In this scenario the Graduate class stands apart from the other two, showing a lower median Inflation.rate than the other groups.

  • Age.at.enrollment:
boxplot(Age.at.enrollment ~ target)

With respect to Age.at.enrollment, the three classes now behave differently: the lowest values are concentrated in the Graduate class, which also shows more outliers, while the Dropout class is dominated by higher values. Since this variable records the student's age at enrollment, students who enroll younger tend to fall into the Graduate class, i.e. they actually graduate, whereas students who enroll at older ages tend to drop out, i.e. they quit or abandon the course.

  • Renaming the classes:

Let's replace the class names of the target variable with the numeric labels 0, 1 and 2, respectively.

levels(target) <- c(0,1,2)
levels(target)
## [1] "0" "1" "2"

Class proportions of the target variable

round((table(target)/length(target))*100,2)
## target
##     0     1     2 
## 32.12 17.95 49.93

The classes are clearly imbalanced, which may cause problems for the classification task; this will be addressed later with the methods we have seen. The majority class is class 2 (Graduate), with 49.93% of the data, and the minority class is class 1 (Enrolled), with 17.95%.

Standardizing the continuous variables

Now let's standardize the continuous variables, since they are on different scales.

data_c_std = data.frame(scale(data_c))
knitr::kable(head(data_c_std))
|PrevQualifiGrade |AdmissionGrade |X1stCredited |X1stEnrolled |X1stEvaluations |X1stApproved |X1stGrade |X1stWithoutEva |X2ndCredited |X2ndEnrolled |X2ndEvaluations |X2ndApproved |X2ndGrade |X2ndWithoutEva |Unemployment.rate |Inflation.rate |GDP |
|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
|-0.8047503 |0.0222263 |-0.3007791 |-2.5282738 |-1.9858437 |-1.5210854 |-2.1968541 |-0.1992505 |-0.2824104 |-2.8380158 |-2.0423990 |-1.4713606 |-1.9632667 |-0.1994184 |-0.2876059 |0.1243724 |0.7656743 |
|2.0765846 |1.0718050 |-0.3007791 |-0.1090928 |-0.5501298 |0.4180026 |0.6935202 |-0.1992505 |-0.2824104 |-0.1057141 |-0.5226233 |0.5188450 |0.6594872 |-0.1994184 |0.8761230 |-1.1050966 |0.3471602 |
|-0.8047503 |-0.1504018 |-0.3007791 |-0.1090928 |-1.9858437 |-1.5210854 |-2.1968541 |-0.1992505 |-0.2824104 |-0.1057141 |-2.0423990 |-1.4713606 |-1.9632667 |-0.1994184 |-0.2876059 |0.1243724 |0.7656743 |
|-0.8047503 |-0.5094682 |-0.3007791 |-0.1090928 |-0.0715585 |0.4180026 |0.5755457 |-0.1992505 |-0.2824104 |-0.1057141 |0.4905605 |0.1871441 |0.4164027 |-0.1994184 |-0.8131610 |-1.4667052 |-1.3753558 |
|-2.4728915 |1.0027538 |-0.3007791 |-0.1090928 |0.1677271 |0.0948213 |0.3494280 |-0.1992505 |-0.2824104 |-0.1057141 |-0.5226233 |0.5188450 |0.5315479 |-0.1994184 |0.8761230 |-1.1050966 |0.3471602 |
|0.0369028 |-0.8409141 |-0.3007791 |-0.5122897 |0.4070128 |0.0948213 |0.2511160 |-0.1992505 |-0.2824104 |-0.5610977 |2.2636322 |0.1871441 |0.2436847 |6.4338689 |1.7395349 |-0.6711664 |-0.4061652 |
round(colMeans(data_c_std),4)
##  PrevQualifiGrade    AdmissionGrade      X1stCredited      X1stEnrolled 
##                 0                 0                 0                 0 
##   X1stEvaluations      X1stApproved         X1stGrade    X1stWithoutEva 
##                 0                 0                 0                 0 
##      X2ndCredited      X2ndEnrolled   X2ndEvaluations      X2ndApproved 
##                 0                 0                 0                 0 
##         X2ndGrade    X2ndWithoutEva Unemployment.rate    Inflation.rate 
##                 0                 0                 0                 0 
##               GDP 
##                 0

Data split

Merging the continuous and non-continuous data

Let's concatenate the non-continuous variables with the standardized continuous ones.

data_final = data.frame(data, data_c_std)
data_final$Target <- as.factor(target)

Train and test

We will split the data into training and test sets, in an 80%/20% proportion, respectively.

set.seed(13)
in.trn <- createDataPartition(target, p = .80, list = FALSE)
trn <- data_final[in.trn,]
tst <- data_final[-in.trn,]

tst.features = subset(tst, select = -c(Target))
tst.target = subset(tst, select = Target)[,1]

trn.features = subset(trn, select = -c(Target))
trn.target = subset(trn, select = Target)[,1]

Correcting the imbalance in the training data

  • Imbalanced:
props = table(trn.target)
(round((props/length(trn.target))*100,2))
## trn.target
##     0     1     2 
## 32.11 17.96 49.93
  • Balanced:
trn.balanced <- SMOTE(Target  ~ ., trn,
                 perc.over=max(props)-min(props))
round((table(trn.balanced$Target)/length(trn.balanced$Target))*100,2)
## 
##     0     1     2 
## 25.09 35.29 39.61
trn.features.balanced = subset(trn.balanced, select = -c(Target))
trn.target.balanced = subset(trn.balanced, select = Target)[,1]

The class proportions are now much closer to balanced.

Model training

The models below address the following supervised learning task:

  • Task: classify whether a student belongs to the Dropout (withdrew from the course), Enrolled (still taking the course) or Graduate (completed the course) class. In addition, the best model will be used to return the probability that a student drops out, is still enrolled, or graduates.

This task has many practical uses: predicting dropout-driven attrition along with its probability; predicting the probability that a student graduates; and helping a department or center forecast the overall attrition of its students so that it can act to reduce it. The model's feature importance can also be extracted to reveal which variables most drive dropout, so that they can be addressed, or which variables matter most for completing the course, so that they can be invested in; both uses are sketched below.
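
A minimal sketch of those two uses with caret's API, assuming fit stands for the best model trained below and new_students is a hypothetical data frame with the same feature columns:

# Per-student class probabilities: columns '0' (Dropout), '1' (Enrolled), '2' (Graduate)
probs = predict(fit, newdata = new_students, type = 'prob')
# Variables with the greatest influence on the fitted model
varImp(fit)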

Decision tree

Cross-Validation

A 10-fold cross-validation will be used.

set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

Parameter tuning

Let's run a parameter grid to find the value of cp that maximizes the model's accuracy.

set.seed(13)
tuneGrid <- expand.grid(
  cp = seq(0, 1, by = .01)
)

Training the model

set.seed(13)
fit.dtc <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'rpart',
  preProcess = c("scale"),
  trControl = ctrl,
  #summaryFunction=twoClassSummary,
  #classProbs=T,
  #savePredictions=T,
  tuneGrid = tuneGrid
)
fit.dtc
## CART 
## 
## 21624 samples
##    36 predictor
##     3 classes: '0', '1', '2' 
## 
## Pre-processing: scaled (36) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa    
##   0.00  0.9313256  0.8952968
##   0.01  0.7911574  0.6786442
##   0.02  0.7363573  0.5922283
##   0.03  0.7202179  0.5654364
##   0.04  0.7202179  0.5654364
##   0.05  0.7202179  0.5654364
##   0.06  0.7202179  0.5654364
##   0.07  0.7202179  0.5654364
##   0.08  0.7202179  0.5654364
##   0.09  0.7202179  0.5654364
##   0.10  0.7202179  0.5654364
##   0.11  0.7202179  0.5654364
##   0.12  0.7202179  0.5654364
##   0.13  0.7202179  0.5654364
##   0.14  0.7202179  0.5654364
##   0.15  0.7202179  0.5654364
##   0.16  0.7202179  0.5654364
##   0.17  0.7202179  0.5654364
##   0.18  0.7202179  0.5654364
##   0.19  0.7202179  0.5654364
##   0.20  0.6010444  0.3618105
##   0.21  0.6010444  0.3618105
##   0.22  0.6010444  0.3618105
##   0.23  0.6010444  0.3618105
##   0.24  0.6010444  0.3618105
##   0.25  0.6010444  0.3618105
##   0.26  0.6010444  0.3618105
##   0.27  0.6010444  0.3618105
##   0.28  0.6010444  0.3618105
##   0.29  0.6010444  0.3618105
##   0.30  0.6010444  0.3618105
##   0.31  0.6010444  0.3618105
##   0.32  0.6010444  0.3618105
##   0.33  0.6010444  0.3618105
##   0.34  0.4746165  0.1391573
##   0.35  0.3961339  0.0000000
##   0.36  0.3961339  0.0000000
##   0.37  0.3961339  0.0000000
##   0.38  0.3961339  0.0000000
##   0.39  0.3961339  0.0000000
##   0.40  0.3961339  0.0000000
##   0.41  0.3961339  0.0000000
##   0.42  0.3961339  0.0000000
##   0.43  0.3961339  0.0000000
##   0.44  0.3961339  0.0000000
##   0.45  0.3961339  0.0000000
##   0.46  0.3961339  0.0000000
##   0.47  0.3961339  0.0000000
##   0.48  0.3961339  0.0000000
##   0.49  0.3961339  0.0000000
##   0.50  0.3961339  0.0000000
##   0.51  0.3961339  0.0000000
##   0.52  0.3961339  0.0000000
##   0.53  0.3961339  0.0000000
##   0.54  0.3961339  0.0000000
##   0.55  0.3961339  0.0000000
##   0.56  0.3961339  0.0000000
##   0.57  0.3961339  0.0000000
##   0.58  0.3961339  0.0000000
##   0.59  0.3961339  0.0000000
##   0.60  0.3961339  0.0000000
##   0.61  0.3961339  0.0000000
##   0.62  0.3961339  0.0000000
##   0.63  0.3961339  0.0000000
##   0.64  0.3961339  0.0000000
##   0.65  0.3961339  0.0000000
##   0.66  0.3961339  0.0000000
##   0.67  0.3961339  0.0000000
##   0.68  0.3961339  0.0000000
##   0.69  0.3961339  0.0000000
##   0.70  0.3961339  0.0000000
##   0.71  0.3961339  0.0000000
##   0.72  0.3961339  0.0000000
##   0.73  0.3961339  0.0000000
##   0.74  0.3961339  0.0000000
##   0.75  0.3961339  0.0000000
##   0.76  0.3961339  0.0000000
##   0.77  0.3961339  0.0000000
##   0.78  0.3961339  0.0000000
##   0.79  0.3961339  0.0000000
##   0.80  0.3961339  0.0000000
##   0.81  0.3961339  0.0000000
##   0.82  0.3961339  0.0000000
##   0.83  0.3961339  0.0000000
##   0.84  0.3961339  0.0000000
##   0.85  0.3961339  0.0000000
##   0.86  0.3961339  0.0000000
##   0.87  0.3961339  0.0000000
##   0.88  0.3961339  0.0000000
##   0.89  0.3961339  0.0000000
##   0.90  0.3961339  0.0000000
##   0.91  0.3961339  0.0000000
##   0.92  0.3961339  0.0000000
##   0.93  0.3961339  0.0000000
##   0.94  0.3961339  0.0000000
##   0.95  0.3961339  0.0000000
##   0.96  0.3961339  0.0000000
##   0.97  0.3961339  0.0000000
##   0.98  0.3961339  0.0000000
##   0.99  0.3961339  0.0000000
##   1.00  0.3961339  0.0000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
plot(fit.dtc)

Let's generate predictions to evaluate the model after tuning:

predictions.dtc = predict(fit.dtc, newdata = tst.features)
  • Confusion matrix:
(cm.dtc = confusionMatrix(predictions.dtc, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2
##          0 197  49  48
##          1  39  43  43
##          2  48  66 350
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6682         
##                  95% CI : (0.636, 0.6992)
##     No Information Rate : 0.4994         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.4517         
##                                          
##  Mcnemar's Test P-Value : 0.1121         
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Precision              0.6701   0.3440   0.7543
## Recall                 0.6937   0.2722   0.7937
## F1                     0.6817   0.3039   0.7735
## Prevalence             0.3216   0.1789   0.4994
## Detection Rate         0.2231   0.0487   0.3964
## Detection Prevalence   0.3330   0.1416   0.5255
## Balanced Accuracy      0.7659   0.5795   0.7679
# Manual sanity check of the overall accuracy from the confusion matrix
a = sum(diag(cm.dtc$table))
a/sum(cm.dtc$table)
## [1] 0.6681767

The model performs reasonably well on class 2 (Graduate), with a precision of about 75%, but it clearly struggles on class 1 (Enrolled). Overall, the decision tree reaches an accuracy of approximately 66.82%.
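
The per-class figures quoted above can be extracted directly from the confusionMatrix object, for example:

cm.dtc$byClass[, c('Precision', 'Recall', 'F1')]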

Random Forest

Cross-Validation

A 10-fold cross-validation will be used.

set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

Parameter tuning

Let's run a parameter grid to find the value of mtry that maximizes the model's accuracy.

set.seed(13)
tuneGrid <- expand.grid(
  mtry = 1:4
)

Training the model

set.seed(13)
fit.rf <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'rf',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)
fit.rf
## Random Forest 
## 
## 21624 samples
##    36 predictor
##     3 classes: '0', '1', '2' 
## 
## Pre-processing: scaled (36) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.9171753  0.8727255
##   2     0.9850627  0.9771926
##   3     0.9910745  0.9863805
##   4     0.9912598  0.9866641
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
plot(fit.rf)

Let's generate predictions to evaluate the model after tuning:

predictions.rf = predict(fit.rf, newdata = tst.features)
  • Confusion matrix:
(cm.rf = confusionMatrix(predictions.rf, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2
##          0 204  21  17
##          1  46  73  34
##          2  34  64 390
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7554          
##                  95% CI : (0.7256, 0.7834)
##     No Information Rate : 0.4994          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5956          
##                                           
##  Mcnemar's Test P-Value : 2.292e-05       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Precision              0.8430  0.47712   0.7992
## Recall                 0.7183  0.46203   0.8844
## F1                     0.7757  0.46945   0.8396
## Prevalence             0.3216  0.17894   0.4994
## Detection Rate         0.2310  0.08267   0.4417
## Detection Prevalence   0.2741  0.17327   0.5527
## Balanced Accuracy      0.8274  0.67584   0.8313
# Manual sanity check of the overall accuracy from the confusion matrix
a = sum(diag(cm.rf$table))
a/sum(cm.rf$table)
## [1] 0.7553794

The random forest performed considerably better than the decision tree, as expected, across the per-class precisions. Its overall accuracy of approximately 75.54% is also well above the decision tree's. So far, the random forest is the best-performing model.
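
Since the random forest is the best model so far, it is the natural candidate for the feature-importance analysis mentioned earlier; a minimal sketch using caret's varImp (output omitted):

imp.rf = varImp(fit.rf)
plot(imp.rf, top = 10)  # the ten most influential variables in the fitted forest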

Multinom

Cross-Validation

A 10-fold cross-validation will be used.

set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

Parameter tuning

Let's run a parameter grid to find the value of decay that maximizes the model's accuracy.

set.seed(13)
tuneGrid <- expand.grid(
  decay = seq((10**(-3)), 1, length.out=10)
)

Training the model

set.seed(13)
fit.multinom <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'multinom',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid,
  trace = FALSE  # passed through to nnet::multinom; silences the per-iteration log
)
## iter  90 value 11588.202942
## iter  90 value 11588.202891
## iter  90 value 11588.202890
## final  value 11588.202890 
## converged
## # weights:  114 (74 variable)
## initial  value 21378.995137 
## iter  10 value 14669.016417
## iter  20 value 14476.725390
## iter  30 value 14225.080657
## iter  40 value 13329.000302
## iter  50 value 13052.559635
## iter  60 value 12647.012524
## iter  70 value 12391.605828
## iter  80 value 11782.019018
## iter  90 value 11592.564449
## iter  90 value 11592.564399
## iter  90 value 11592.564399
## final  value 11592.564399 
## converged
## # weights:  114 (74 variable)
## initial  value 21378.995137 
## iter  10 value 14670.211763
## iter  20 value 14478.384176
## iter  30 value 14225.837811
## iter  40 value 13330.953133
## iter  50 value 13057.994315
## iter  60 value 12653.972149
## iter  70 value 12398.801485
## iter  80 value 11787.693054
## iter  90 value 11596.895393
## iter  90 value 11596.895336
## iter  90 value 11596.895335
## final  value 11596.895335 
## converged
## # weights:  114 (74 variable)
## initial  value 21378.995137 
## iter  10 value 14671.406496
## iter  20 value 14480.040487
## iter  30 value 14226.578695
## iter  40 value 13333.013335
## iter  50 value 13063.398922
## iter  60 value 12661.527870
## iter  70 value 12419.750691
## iter  80 value 11798.628201
## iter  90 value 11601.196342
## iter  90 value 11601.196279
## iter  90 value 11601.196278
## final  value 11601.196278 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14675.138442
## iter  20 value 14480.583687
## iter  30 value 14190.741693
## iter  40 value 13338.303193
## iter  50 value 12991.464877
## iter  60 value 12621.114193
## iter  70 value 12389.295278
## iter  80 value 11739.245369
## iter  90 value 11531.340165
## iter  90 value 11531.340068
## iter  90 value 11531.340067
## final  value 11531.340067 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14676.374962
## iter  20 value 14482.314385
## iter  30 value 14191.394986
## iter  40 value 13337.087725
## iter  50 value 12996.551212
## iter  60 value 12630.412730
## iter  70 value 12381.099114
## iter  80 value 11747.427471
## iter  90 value 11536.068638
## final  value 11536.068469 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14677.610826
## iter  20 value 14484.042382
## iter  30 value 14192.023548
## iter  40 value 13335.956821
## iter  50 value 13001.617307
## iter  60 value 12657.461537
## iter  70 value 12374.397632
## iter  80 value 11775.268898
## iter  90 value 11540.760392
## final  value 11540.760135 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14678.846035
## iter  20 value 14485.767690
## iter  30 value 14192.627454
## iter  40 value 13382.848346
## iter  50 value 13055.945431
## iter  60 value 12680.251182
## iter  70 value 12399.909099
## iter  80 value 11754.490234
## iter  90 value 11545.415946
## iter  90 value 11545.415866
## iter  90 value 11545.415866
## final  value 11545.415866 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14680.080590
## iter  20 value 14487.490321
## iter  30 value 14193.206775
## iter  40 value 13381.428542
## iter  50 value 13060.064646
## iter  60 value 12684.679285
## iter  70 value 12396.118432
## iter  80 value 11759.424331
## iter  90 value 11550.036506
## iter  90 value 11550.036432
## iter  90 value 11550.036432
## final  value 11550.036432 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14681.314492
## iter  20 value 14489.210287
## iter  30 value 14193.761580
## iter  40 value 13380.109028
## iter  50 value 13064.152397
## iter  60 value 12689.515629
## iter  70 value 12410.603586
## iter  80 value 11752.036323
## iter  90 value 11554.622651
## iter  90 value 11554.622579
## iter  90 value 11554.622579
## final  value 11554.622579 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14682.547742
## iter  20 value 14490.927600
## iter  30 value 14194.291932
## iter  40 value 13378.892514
## iter  50 value 13068.206802
## iter  60 value 12694.978441
## iter  70 value 12405.392305
## iter  80 value 11765.219619
## iter  90 value 11559.175094
## iter  90 value 11559.175010
## iter  90 value 11559.175010
## final  value 11559.175010 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14683.780340
## iter  20 value 14492.642271
## iter  30 value 14194.797896
## iter  40 value 13377.781668
## iter  50 value 13072.225986
## iter  60 value 12625.922053
## iter  70 value 12377.674760
## iter  80 value 11774.814969
## iter  90 value 11563.694497
## iter  90 value 11563.694406
## iter  90 value 11563.694405
## final  value 11563.694405 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14685.012287
## iter  20 value 14494.354313
## iter  30 value 14195.279529
## iter  40 value 13376.779109
## iter  50 value 13076.208051
## iter  60 value 12629.301459
## iter  70 value 12380.513065
## iter  80 value 11778.474534
## iter  90 value 11568.181468
## iter  90 value 11568.181393
## iter  90 value 11568.181392
## final  value 11568.181392 
## converged
## # weights:  114 (74 variable)
## initial  value 21380.093750 
## iter  10 value 14686.243583
## iter  20 value 14496.063736
## iter  30 value 14195.736890
## iter  40 value 13375.887375
## iter  50 value 13080.151012
## iter  60 value 12633.108251
## iter  70 value 12384.747044
## iter  80 value 11782.697420
## iter  90 value 11572.636690
## iter  90 value 11572.636606
## iter  90 value 11572.636605
## final  value 11572.636605 
## converged
## # weights:  114 (74 variable)
## initial  value 23756.392130 
## iter  10 value 16170.664970
## iter  20 value 15907.576583
## iter  30 value 15714.844017
## iter  40 value 14990.471317
## iter  50 value 14555.964670
## iter  60 value 14114.451142
## iter  70 value 13783.215874
## iter  80 value 13104.469011
## iter  90 value 12874.863196
## iter  90 value 12874.863144
## iter  90 value 12874.863143
## final  value 12874.863143 
## converged
fit.multinom
## Penalized Multinomial Regression 
## 
## 21624 samples
##    36 predictor
##     3 classes: '0', '1', '2' 
## 
## Pre-processing: scaled (36) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ... 
## Resampling results across tuning parameters:
## 
##   decay  Accuracy   Kappa    
##   0.001  0.7609608  0.6319636
##   0.112  0.7607759  0.6316827
##   0.223  0.7607759  0.6316809
##   0.334  0.7610996  0.6321867
##   0.445  0.7611921  0.6323248
##   0.556  0.7614232  0.6326802
##   0.667  0.7613306  0.6325346
##   0.778  0.7613307  0.6325250
##   0.889  0.7614231  0.6326621
##   1.000  0.7612844  0.6324512
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 0.556.
plot(fit.multinom)
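As an aside, the long convergence log above is printed by nnet::multinom for every resampling fit. Since caret::train forwards extra arguments to the underlying fitting function, it can be silenced with trace = FALSE — a minimal sketch, assuming the same ctrl and tuneGrid objects used to fit this model:

# Sketch: same training call, with the per-iteration log silenced;
# trace = FALSE is passed through train's '...' to nnet::multinom.
set.seed(13)
fit.multinom <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'multinom',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid,
  trace = FALSE
)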

Let us generate predictions to evaluate the model after tuning:

predictions.multinom = predict(fit.multinom, newdata = tst.features)
  • Confusion matrix:
(cm.multinom = confusionMatrix(predictions.multinom, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2
##          0 193  16  11
##          1  65  99  69
##          2  26  43 361
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7395          
##                  95% CI : (0.7092, 0.7682)
##     No Information Rate : 0.4994          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5862          
##                                           
##  Mcnemar's Test P-Value : 4.514e-09       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Precision              0.8773   0.4249   0.8395
## Recall                 0.6796   0.6266   0.8186
## F1                     0.7659   0.5064   0.8289
## Prevalence             0.3216   0.1789   0.4994
## Detection Rate         0.2186   0.1121   0.4088
## Detection Prevalence   0.2492   0.2639   0.4870
## Balanced Accuracy      0.8173   0.7209   0.8312
# correct predictions over all predictions except the Enrolled/Graduate
# confusions: an accuracy that only penalizes errors involving class 0 (Dropout)
a = cm.multinom$table[1,1]+cm.multinom$table[2,2]+cm.multinom$table[3,3]
(a)/(a + cm.multinom$table[2,1]+cm.multinom$table[3,1]+cm.multinom$table[1,2]+cm.multinom$table[1,3])
## [1] 0.846952
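Since the same ad-hoc measure is reused for the LDA model further below, it can also be written as a small reusable helper (a sketch; cm is any 3-class confusionMatrix object):

acc.dropout.focused = function(cm){
  # correct predictions over everything except Enrolled/Graduate confusions
  correct = sum(diag(cm$table))
  correct / (sum(cm$table) - cm$table[2,3] - cm$table[3,2])
}
# e.g.: acc.dropout.focused(cm.multinom)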

We can see that the multinomial model performed considerably better than the previous models with respect to precision for all 3 classes. Moreover, comparing overall accuracy, we get a value of approximately 73.95%, slightly below the random forest model. If the goal is to recover the class probabilities, or to single out individuals of the Dropout and Graduate classes, this would be the recommended model, since it presents the highest precision for both of those classes.

LDA

Cross-Validation

A 10-fold cross-validation will be used.

set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

Parameter Tuning

A grid of parameters will be considered, with dimen varying from 0 to 5.

set.seed(13)
tuneGrid <- expand.grid(
  dimen = 0:5
)

Training the model

set.seed(13)
fit.lda <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'lda2',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)
fit.lda
## Linear Discriminant Analysis 
## 
## 21624 samples
##    36 predictor
##     3 classes: '0', '1', '2' 
## 
## Pre-processing: scaled (36) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ... 
## Resampling results across tuning parameters:
## 
##   dimen  Accuracy   Kappa    
##   0      0.7211428  0.5697152
##   1      0.7211428  0.5697152
##   2      0.7325196  0.5862354
##   3      0.7325196  0.5862354
##   4      0.7325196  0.5862354
##   5      0.7325196  0.5862354
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was dimen = 2.
plot(fit.lda)

Let us generate predictions to evaluate the model after tuning:

predictions.lda = predict(fit.lda, newdata = tst.features)
  • Confusion matrix:
(cm.lda = confusionMatrix(predictions.lda, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2
##          0 184  12   4
##          1  72  95  92
##          2  28  51 345
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7067          
##                  95% CI : (0.6754, 0.7365)
##     No Information Rate : 0.4994          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.538           
##                                           
##  Mcnemar's Test P-Value : 1.177e-15       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Precision              0.9200   0.3668   0.8137
## Recall                 0.6479   0.6013   0.7823
## F1                     0.7603   0.4556   0.7977
## Prevalence             0.3216   0.1789   0.4994
## Detection Rate         0.2084   0.1076   0.3907
## Detection Prevalence   0.2265   0.2933   0.4802
## Balanced Accuracy      0.8106   0.6875   0.8018
# same Dropout-focused accuracy as computed for the multinomial model
a = cm.lda$table[1,1]+cm.lda$table[2,2]+cm.lda$table[3,3]
(a)/(a + cm.lda$table[2,1]+cm.lda$table[3,1]+cm.lda$table[1,2]+cm.lda$table[1,3])
## [1] 0.8432432

We can see that the LDA-based model performed better than the previous models with respect to precision for class 0 (Dropout), though worse than the multinomial model for classes 1 and 2. Moreover, comparing overall accuracy, we get a value of approximately 70.67%, below the multinomial model. The high precision values for classes 0 and 2 (Dropout and Graduate) are nonetheless worth highlighting.
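Before moving on, it helps to see the per-class precision of the four models side by side — a sketch, assuming cm.dtc and cm.rf are the confusionMatrix objects computed for the decision tree and random forest in the earlier sections:

# Per-class precision for each fitted model (rows = models, columns = classes).
precision.by.class = rbind(
  DecisionTree = cm.dtc$byClass[, "Precision"],
  RandomForest = cm.rf$byClass[, "Precision"],
  Multinom     = cm.multinom$byClass[, "Precision"],
  LDA          = cm.lda$byClass[, "Precision"]
)
knitr::kable(round(precision.by.class, 4))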

Results

predictions.dtc.prob = predict(fit.dtc, newdata = tst.features,type='prob')
predictions.rf.prob = predict(fit.rf, newdata = tst.features,type='prob')
predictions.multinom.prob = predict(fit.multinom, newdata = tst.features,type='prob')
predictions.lda.prob = predict(fit.lda, newdata = tst.features,type='prob')

n = 1000
cutoffs = seq(0,1, length.out=n)
c(min(cutoffs),max(cutoffs))
## [1] 0 1

ROC curve per class

Let us analyze the ROC curve for classes 0, 1, and 2.

cols = c('red','blue','purple','brown')
models = c('DecisionTree','RandomForest','Multinom','LDA')
types = 1:length(models)

Dropout Class

n_class = 1
tpr.dtc = numeric(n);fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)

tprs = data.frame(tpr.dtc, fpr.dtc,
                  tpr.rf, fpr.rf,
                  tpr.multinom,fpr.multinom,
                  tpr.lda, fpr.lda)

tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)

names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
  for (i in 1:length(cutoffs)){
    # binarize the predicted probability of the current class at this cutoff
    if (names[j] == 'dtc'){
      temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'rf'){
      temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'multinom'){
      temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'lda'){
      temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
    } else {
      break
    }
    
    # 2x2 confusion counts (rows = prediction, columns = truth)
    m = matrix(0, 2, 2)
    m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0)) # true negatives
    m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1)) # true positives
    m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1)) # false negatives
    m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0)) # false positives
    
    tp = m[2,2]; fn = m[1,2]
    tn = m[1,1]; fp = m[2,1]
    
    if (names[j] == 'dtc'){
      tpr.dtc[i] = tp/(tp+fn)
      fpr.dtc[i] = fp/(fp+tn)
    } else if (names[j] == 'rf'){
      tpr.rf[i] = tp/(tp+fn)
      fpr.rf[i] = fp/(fp+tn)
    } else if (names[j] == 'multinom'){
      tpr.multinom[i] = tp/(tp+fn)
      fpr.multinom[i] = fp/(fp+tn)
    } else if (names[j] == 'lda'){
      tpr.lda[i] = tp/(tp+fn)
      fpr.lda[i] = fp/(fp+tn)
    }
  }
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))
## [1] 89.81
(auc.rf = auc(fpr.rf, tpr.rf))
## [1] 92.01
(auc.multinom = auc(fpr.multinom, tpr.multinom))
## [1] 90.87
(auc.lda = auc(fpr.lda, tpr.lda))
## [1] 91.15
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
length(fpr.dtc); length(tpr.dtc)
## [1] 1001
## [1] 1001
plot(y=tpr.dtc,x=fpr.dtc,
     type='l',col='red',
     xlim=c(0,1),ylim=c(0,1),
     xlab='fpr',ylab='tpr',
     lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Dropout Class')
legend('bottomright',
       legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
       col=c(cols,'black'), lty=c(types,1), cex=0.8)
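The nested branching above repeats the same bookkeeping for every model; the per-model logic can be factored into a small helper (a sketch: prob is one column of a predicted-probability matrix, target the binarized response):

roc.points = function(prob, target, cutoffs){
  # sweep the cutoffs and return the (fpr, tpr) pair traced at each one
  t(sapply(cutoffs, function(cut){
    pred = ifelse(prob >= cut, 1, 0)
    tp = sum(pred == 1 & target == 1)
    fp = sum(pred == 1 & target == 0)
    fn = sum(pred == 0 & target == 1)
    tn = sum(pred == 0 & target == 0)
    c(fpr = fp/(fp+tn), tpr = tp/(tp+fn))
  }))
}
# e.g.: pts = roc.points(predictions.rf.prob[,n_class], tst.target.roc, cutoffs)

The Enrolled and Graduate analyses below could reuse this helper instead of repeating the loops.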

Enrolled Class

n_class = 2
tpr.dtc = numeric(n);fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)

tprs = data.frame(tpr.dtc, fpr.dtc,tpr.rf, fpr.rf,tpr.multinom, fpr.multinom,tpr.lda, fpr.lda)
tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)

names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
  for (i in 1:length(cutoffs)){
    # binarize the predicted probability of the current class at this cutoff
    if (names[j] == 'dtc'){
      temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'rf'){
      temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'multinom'){
      temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'lda'){
      temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
    } else {
      break
    }
    
    # 2x2 confusion counts (rows = prediction, columns = truth)
    m = matrix(0, 2, 2)
    m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0)) # true negatives
    m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1)) # true positives
    m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1)) # false negatives
    m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0)) # false positives
    
    tp = m[2,2]; fn = m[1,2]
    tn = m[1,1]; fp = m[2,1]
    
    if (names[j] == 'dtc'){
      tpr.dtc[i] = tp/(tp+fn)
      fpr.dtc[i] = fp/(fp+tn)
    } else if (names[j] == 'rf'){
      tpr.rf[i] = tp/(tp+fn)
      fpr.rf[i] = fp/(fp+tn)
    } else if (names[j] == 'multinom'){
      tpr.multinom[i] = tp/(tp+fn)
      fpr.multinom[i] = fp/(fp+tn)
    } else if (names[j] == 'lda'){
      tpr.lda[i] = tp/(tp+fn)
      fpr.lda[i] = fp/(fp+tn)
    }
  }
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))
## [1] 68.64
(auc.rf = auc(fpr.rf, tpr.rf))
## [1] 80.83
(auc.multinom = auc(fpr.multinom, tpr.multinom))
## [1] 78.17
(auc.lda = auc(fpr.lda, tpr.lda))
## [1] 77.22
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
plot(y=tpr.dtc,x=fpr.dtc,
     type='l',col='red',
     xlim=c(0,1),ylim=c(0,1),
     xlab='fpr',ylab='tpr',
     lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Enrolled Class')
legend('bottomright',
       legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
       col=c(cols,'black'), lty=c(types,1), cex=0.8)

Graduate Class

n_class = 3
tpr.dtc = numeric(n);fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)

tprs = data.frame(tpr.dtc, fpr.dtc,tpr.rf, fpr.rf,tpr.multinom, fpr.multinom,tpr.lda, fpr.lda)
tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)

names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
  for (i in 1:length(cutoffs)){
    # binarize the predicted probability of the current class at this cutoff
    if (names[j] == 'dtc'){
      temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'rf'){
      temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'multinom'){
      temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
    } else if (names[j] == 'lda'){
      temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
    } else {
      break
    }
    
    # 2x2 confusion counts (rows = prediction, columns = truth)
    m = matrix(0, 2, 2)
    m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0)) # true negatives
    m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1)) # true positives
    m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1)) # false negatives
    m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0)) # false positives
    
    tp = m[2,2]; fn = m[1,2]
    tn = m[1,1]; fp = m[2,1]
    
    if (names[j] == 'dtc'){
      tpr.dtc[i] = tp/(tp+fn)
      fpr.dtc[i] = fp/(fp+tn)
    } else if (names[j] == 'rf'){
      tpr.rf[i] = tp/(tp+fn)
      fpr.rf[i] = fp/(fp+tn)
    } else if (names[j] == 'multinom'){
      tpr.multinom[i] = tp/(tp+fn)
      fpr.multinom[i] = fp/(fp+tn)
    } else if (names[j] == 'lda'){
      tpr.lda[i] = tp/(tp+fn)
      fpr.lda[i] = fp/(fp+tn)
    }
  }
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))
## [1] 88.29
(auc.rf = auc(fpr.rf, tpr.rf))
## [1] 92.15
(auc.multinom = auc(fpr.multinom, tpr.multinom))
## [1] 90.83
(auc.lda = auc(fpr.lda, tpr.lda))
## [1] 89.74
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
plot(y=tpr.dtc,x=fpr.dtc,
     type='l',col='red',
     xlim=c(0,1),ylim=c(0,1),
     xlab='fpr',ylab='tpr',
     lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Graduate Class')
legend('bottomright',
       legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
       col=c(cols,'black'), lty=c(types,1), cex=0.8)
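Collecting the AUC values printed above into a single table makes the comparison across classes easier to read (values in %, transcribed from the outputs above):

# AUC (in %) per class and per model, transcribed from the results above.
auc.table = rbind(
  Dropout  = c(DecisionTree = 89.81, RandomForest = 92.01, Multinom = 90.87, LDA = 91.15),
  Enrolled = c(DecisionTree = 68.64, RandomForest = 80.83, Multinom = 78.17, LDA = 77.22),
  Graduate = c(DecisionTree = 88.29, RandomForest = 92.15, Multinom = 90.83, LDA = 89.74)
)
knitr::kable(auc.table)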

We can see that the RandomForest model achieved the largest area under the ROC curve for the Dropout class; close behind come the Multinom and LDA models, which performed similarly for this class. The DecisionTree model showed the worst performance.

Analogously, the RandomForest and Multinom models performed similarly for the Graduate class with respect to the area under the ROC curve, followed by the LDA model, with the DecisionTree model again performing worst. For the Enrolled class the gap is wider: RandomForest leads with 80.83%, well ahead of the other three models.

The RandomForest model will be used, given that it performed best on the test set for all classes, presenting the largest area under the ROC curve.

Thus, as stated at the beginning of the model-training section, our task is to “classify whether the student belongs to the class Dropout (a student who gave up), Enrolled (still enrolled in the course) or Graduate (a graduated student). Moreover, once the best model is obtained, it will return the probability of the student dropping out, still being enrolled, or graduating.” The RandomForest-based model is therefore selected. In addition, the LDA model will also be analyzed, for the sake of the analysis regarding the Dropout class.

Feature Importance

LDA

predictions.probs = predict(fit.lda, newdata = tst.features, type='prob')
predictions.raw = predict(fit.lda, newdata = tst.features, type='raw')
lda.probs.raw = data.frame(predictions.probs, 'class' = predictions.raw)
knitr::kable(head(lda.probs.raw))
     X0        X1        X2        class
6    0.0302128 0.4988740 0.4709132 1
11   0.0143699 0.3188791 0.6667509 2
14   0.0022112 0.1450597 0.8527291 2
21   0.0968446 0.5205162 0.3826392 1
24   0.0018025 0.0891333 0.9090642 2
27   0.0052371 0.3641687 0.6305942 2
lda.probs = NULL
for (i in 1:(dim(lda.probs.raw)[1])){
  # probability assigned to the predicted class: as.integer() maps the
  # factor levels '0','1','2' to the column indices 1, 2 and 3
  lda.probs[i] = predictions.probs[i,as.integer(lda.probs.raw$class[i])]
}
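The same extraction can also be done without the loop, via matrix indexing (a sketch, equivalent to the loop above):

# Vectorized equivalent: pick, in each row, the column of the predicted class.
lda.probs = as.matrix(predictions.probs)[
  cbind(1:nrow(predictions.probs), as.integer(lda.probs.raw$class))
]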
probs.class = data.frame('probs'=lda.probs,'class'=lda.probs.raw$class)
knitr::kable(head(probs.class))
probs class
0.4988740 1
0.6667509 2
0.8527291 2
0.5205162 1
0.9090642 2
0.6305942 2
attach(probs.class)
plot(probs ~ class)

plot(probs, col=c('blue','black','red')[as.integer(class)])
legend('bottomright', legend=c("Dropout", "Enrolled","Graduate"),
       col=c("blue", "black","red"), lty=1, cex=0.8)

We can see that the predictions for the Dropout class received the highest probabilities, evidencing that the model classifies the Dropout class well. Similarly, though with probabilities not as close to 1, the Graduate class also mostly received high values. For the Enrolled class, however, the uncertainty is greater, with probabilities oscillating roughly between 0.5 and 0.9, a considerable range. In short, the model predicts the probability of a student dropping out well, information of great value to the centers and departments of the student's course. It may also be worth examining which variables most influence each classification.
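The claim above can be checked numerically with a quick summary of the winning-class probability within each predicted class (a sketch, reusing probs.class built above):

# Distribution of the predicted-class probability, by predicted class
# (0 = Dropout, 1 = Enrolled, 2 = Graduate).
tapply(probs.class$probs, probs.class$class, summary)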

  • Feature importance
lda.Imp <- varImp(fit.lda, scale = FALSE)
lda.Imp$importance[order(-lda.Imp$importance$X0),]
##                                    X0        X1        X2
## X2ndApproved                0.8986842 0.8217307 0.8986842
## X1stApproved                0.8635956 0.7919111 0.8635956
## X2ndGrade                   0.8492189 0.7401704 0.8492189
## X1stGrade                   0.8120677 0.7214242 0.8120677
## Age.at.enrollment           0.6907442 0.6322727 0.6907442
## X2ndEvaluations             0.6497901 0.6702177 0.6702177
## Tuition.fees.up.to.date     0.6474801 0.6339366 0.6474801
## X2ndEnrolled                0.6450975 0.6426893 0.6450975
## Scholarship.holder          0.6443052 0.6133253 0.6443052
## X1stEnrolled                0.6385021 0.6372147 0.6385021
## Gender                      0.6333169 0.5686504 0.6333169
## Application.mode            0.6255652 0.5707959 0.6255652
## X1stEvaluations             0.6045037 0.6508959 0.6508959
## Debtor                      0.5839102 0.5565720 0.5839102
## AdmissionGrade              0.5706766 0.5665177 0.5706766
## Application.order           0.5696637 0.5696637 0.5529060
## PrevQualifiGrade            0.5654050 0.5902006 0.5902006
## Displaced                   0.5635014 0.5374181 0.5635014
## Previous.qualification      0.5617439 0.5538519 0.5617439
## Mother.s.qualification      0.5444876 0.5444876 0.5194112
## ï..Marital.status           0.5429879 0.5253515 0.5429879
## GDP                         0.5386890 0.5336235 0.5386890
## Unemployment.rate           0.5287641 0.5371242 0.5371242
## Daytime.evening.attendance. 0.5267579 0.5267579 0.5236288
## X2ndWithoutEva              0.5254434 0.5499886 0.5499886
## Mother.s.occupation         0.5218603 0.5188915 0.5218603
## X1stWithoutEva              0.5216097 0.5433383 0.5433383
## Father.s.qualification      0.5210370 0.5324671 0.5324671
## Inflation.rate              0.5207763 0.5146957 0.5207763
## Father.s.occupation         0.5190514 0.5119659 0.5190514
## Course                      0.5175558 0.5175558 0.5107242
## X1stCredited                0.5147028 0.5147028 0.5079589
## X2ndCredited                0.5140765 0.5114905 0.5140765
## Educational.special.needs   0.5024780 0.5013524 0.5024780
## Nacionality                 0.5017635 0.5017635 0.5011255
## International               0.5017442 0.5017442 0.5010535
plot(lda.Imp)

We can see that the variables X2ndApproved, X1stApproved, X2ndGrade and X1stGrade are highly influential in determining the class to which the individual belongs. This makes sense: X2ndApproved and X1stApproved record the number of curricular units the student passed in the first and second semesters, which strongly influences the decision to remain in the course or drop out. Analogously, X2ndGrade and X1stGrade record the student's grade average in the first and second semesters; low grades can hold a student back, whether by accumulating failed units or because failed units are often prerequisites for more advanced ones, again weighing on the decision to drop out.

RandomForest

predictions.probs = predict(fit.rf, newdata = tst.features, type='prob')
predictions.raw = predict(fit.rf, newdata = tst.features, type='raw')
rf.probs.raw = data.frame(predictions.probs, 'class' = predictions.raw)
knitr::kable(head(rf.probs.raw))
     X0    X1    X2    class
6    0.352 0.250 0.398 2
11   0.116 0.242 0.642 2
14   0.146 0.206 0.648 2
21   0.510 0.172 0.318 0
24   0.054 0.074 0.872 2
27   0.078 0.216 0.706 2
rf.probs = NULL
for (i in 1:(dim(rf.probs.raw)[1])){
  # probability assigned to the predicted class (same logic as for LDA)
  rf.probs[i] = predictions.probs[i,as.integer(rf.probs.raw$class[i])]
}
probs.class = data.frame('probs'=rf.probs,'class'=rf.probs.raw$class)
knitr::kable(head(probs.class))
probs class
0.398 2
0.642 2
0.648 2
0.510 0
0.872 2
0.706 2
attach(probs.class)
## The following objects are masked from probs.class (pos = 3):
## 
##     class, probs
plot(probs ~ class)

plot(probs, col=c('blue','black','red')[as.integer(class)])
legend('bottomright', legend=c("Dropout", "Enrolled","Graduate"),
       col=c("blue", "black","red"), lty=1, cex=0.8)

Above we can see that, unlike the LDA-based model, the probability values appear more dispersed, suggesting the model classifies with a higher degree of uncertainty about each class.

  • Feature importance
rf.Imp <- varImp(fit.rf, scale = FALSE)
data_imp <- rf.Imp$importance          # one 'Overall' importance value per variable
data_imp['var'] <- rownames(data_imp)  # keep the variable names as a column
row.names(data_imp) <- seq(1, dim(data_imp)[1])
p <- ggplot(data_imp, aes(x = reorder(var, Overall), y = Overall))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=40, hjust=1))
p <- p + ggtitle("Feature Importance para o Random Forest")
p <- p + labs(y = "Feature Importance", x = "Variável")
p.final <- p + coord_flip()
p.final

We can see that the variables X2ndApproved, X1stApproved, X2ndGrade, X2ndEvaluations and X1stGrade are highly influential in determining the class to which the individual belongs, a scenario similar to the results obtained with the LDA model.
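For a direct side-by-side check against the LDA ranking above, the top of the Random Forest ranking can be printed as well (a sketch, reusing the data_imp frame built above):

# The ten most important variables according to the Random Forest.
head(data_imp[order(-data_imp$Overall), c('var', 'Overall')], 10)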