Assessment 03
Packages
library(GGally)## Warning: package 'GGally' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(DMwR)## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
options(warn=-1)
library(MASS)
library(caret)
library(rpart)
library(randomForest)## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
options(warn=0)
Extra functions
# Riemann-sum approximation of the area under the ROC curve, in percent.
# Assumes fpr and tpr are sorted so that fpr is decreasing; each width
# fpr[i] - fpr[i+1] is paired with the height tpr[i].
auc = function(fpr, tpr){
  a = fpr[1:(length(fpr)-1)]
  b = fpr[2:length(fpr)]
  round(sum((a - b) * tpr[-length(tpr)]), 4)*100
}
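The helper is not exercised later in the report, so a quick toy call (hypothetical values) confirms the convention it expects:
# Perfect classifier: tpr stays at 1 while fpr falls from 1 to 0
fpr_toy = c(1, 0)
tpr_toy = c(1, 1)
auc(fpr_toy, tpr_toy)  # 100 (the full unit square, in %)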
Loading the data
DIR = 'data'; FILE = 'data.csv'
URL = file.path(DIR, FILE)
data = read.csv(URL, sep = ';')
Separating the continuous variables
continuos = c("Previous.qualification..grade.","Admission.grade",names(data)[22:33],
"Unemployment.rate", "Inflation.rate", "GDP")
data_c = data[continuos]
data = data[,!(colnames(data) %in% continuos)]
dim(data_c)## [1] 4424 17
colnames(data)## [1] "ï..Marital.status" "Application.mode"
## [3] "Application.order" "Course"
## [5] "Daytime.evening.attendance." "Previous.qualification"
## [7] "Nacionality" "Mother.s.qualification"
## [9] "Father.s.qualification" "Mother.s.occupation"
## [11] "Father.s.occupation" "Displaced"
## [13] "Educational.special.needs" "Debtor"
## [15] "Tuition.fees.up.to.date" "Gender"
## [17] "Scholarship.holder" "Age.at.enrollment"
## [19] "International" "Target"
colnames(data_c) = c('PrevQualifiGrade', "AdmissionGrade","1stCredited",
                     "1stEnrolled","1stEvaluations",
                     "1stApproved","1stGrade","1stWithoutEva",
                     "2ndCredited",
                     "2ndEnrolled","2ndEvaluations",
                     "2ndApproved","2ndGrade","2ndWithoutEva",
                     colnames(data_c)[15:17])
attach(data)
attach(data_c)
Pre-Processing
Checking for missing values (NA's)
sum(is.na(data)) ## [1] 0
sum(is.na(data_c)) ## [1] 0
knitr::kable(summary(data))
| ï..Marital.status | Application.mode | Application.order | Course | Daytime.evening.attendance. | Previous.qualification | Nacionality | Mother.s.qualification | Father.s.qualification | Mother.s.occupation | Father.s.occupation | Displaced | Educational.special.needs | Debtor | Tuition.fees.up.to.date | Gender | Scholarship.holder | Age.at.enrollment | International | Target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :1.000 | Min. : 1.00 | Min. :0.000 | Min. : 33 | Min. :0.0000 | Min. : 1.000 | Min. : 1.000 | Min. : 1.00 | Min. : 1.00 | Min. : 0.00 | Min. : 0.00 | Min. :0.0000 | Min. :0.00000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :17.00 | Min. :0.00000 | Length:4424 | |
| 1st Qu.:1.000 | 1st Qu.: 1.00 | 1st Qu.:1.000 | 1st Qu.:9085 | 1st Qu.:1.0000 | 1st Qu.: 1.000 | 1st Qu.: 1.000 | 1st Qu.: 2.00 | 1st Qu.: 3.00 | 1st Qu.: 4.00 | 1st Qu.: 4.00 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.0000 | 1st Qu.:1.0000 | 1st Qu.:0.0000 | 1st Qu.:0.0000 | 1st Qu.:19.00 | 1st Qu.:0.00000 | Class :character | |
| Median :1.000 | Median :17.00 | Median :1.000 | Median :9238 | Median :1.0000 | Median : 1.000 | Median : 1.000 | Median :19.00 | Median :19.00 | Median : 5.00 | Median : 7.00 | Median :1.0000 | Median :0.00000 | Median :0.0000 | Median :1.0000 | Median :0.0000 | Median :0.0000 | Median :20.00 | Median :0.00000 | Mode :character | |
| Mean :1.179 | Mean :18.67 | Mean :1.728 | Mean :8857 | Mean :0.8908 | Mean : 4.578 | Mean : 1.873 | Mean :19.56 | Mean :22.28 | Mean : 10.96 | Mean : 11.03 | Mean :0.5484 | Mean :0.01153 | Mean :0.1137 | Mean :0.8807 | Mean :0.3517 | Mean :0.2484 | Mean :23.27 | Mean :0.02486 | NA | |
| 3rd Qu.:1.000 | 3rd Qu.:39.00 | 3rd Qu.:2.000 | 3rd Qu.:9556 | 3rd Qu.:1.0000 | 3rd Qu.: 1.000 | 3rd Qu.: 1.000 | 3rd Qu.:37.00 | 3rd Qu.:37.00 | 3rd Qu.: 9.00 | 3rd Qu.: 9.00 | 3rd Qu.:1.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.0000 | 3rd Qu.:1.0000 | 3rd Qu.:1.0000 | 3rd Qu.:0.0000 | 3rd Qu.:25.00 | 3rd Qu.:0.00000 | NA | |
| Max. :6.000 | Max. :57.00 | Max. :9.000 | Max. :9991 | Max. :1.0000 | Max. :43.000 | Max. :109.000 | Max. :44.00 | Max. :44.00 | Max. :194.00 | Max. :195.00 | Max. :1.0000 | Max. :1.00000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :70.00 | Max. :1.00000 | NA |
knitr::kable(summary(data_c))
| PrevQualifiGrade | AdmissionGrade | 1stCredited | 1stEnrolled | 1stEvaluations | 1stApproved | 1stGrade | 1stWithoutEva | 2ndCredited | 2ndEnrolled | 2ndEvaluations | 2ndApproved | 2ndGrade | 2ndWithoutEva | Unemployment.rate | Inflation.rate | GDP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 95.0 | Min. : 95.0 | Min. : 0.00 | Min. : 0.000 | Min. : 0.000 | Min. : 0.000 | Min. : 0.00 | Min. : 0.0000 | Min. : 0.0000 | Min. : 0.000 | Min. : 0.000 | Min. : 0.000 | Min. : 0.00 | Min. : 0.0000 | Min. : 7.60 | Min. :-0.800 | Min. :-4.060000 | |
| 1st Qu.:125.0 | 1st Qu.:117.9 | 1st Qu.: 0.00 | 1st Qu.: 5.000 | 1st Qu.: 6.000 | 1st Qu.: 3.000 | 1st Qu.:11.00 | 1st Qu.: 0.0000 | 1st Qu.: 0.0000 | 1st Qu.: 5.000 | 1st Qu.: 6.000 | 1st Qu.: 2.000 | 1st Qu.:10.75 | 1st Qu.: 0.0000 | 1st Qu.: 9.40 | 1st Qu.: 0.300 | 1st Qu.:-1.700000 | |
| Median :133.1 | Median :126.1 | Median : 0.00 | Median : 6.000 | Median : 8.000 | Median : 5.000 | Median :12.29 | Median : 0.0000 | Median : 0.0000 | Median : 6.000 | Median : 8.000 | Median : 5.000 | Median :12.20 | Median : 0.0000 | Median :11.10 | Median : 1.400 | Median : 0.320000 | |
| Mean :132.6 | Mean :127.0 | Mean : 0.71 | Mean : 6.271 | Mean : 8.299 | Mean : 4.707 | Mean :10.64 | Mean : 0.1377 | Mean : 0.5418 | Mean : 6.232 | Mean : 8.063 | Mean : 4.436 | Mean :10.23 | Mean : 0.1503 | Mean :11.57 | Mean : 1.228 | Mean : 0.001969 | |
| 3rd Qu.:140.0 | 3rd Qu.:134.8 | 3rd Qu.: 0.00 | 3rd Qu.: 7.000 | 3rd Qu.:10.000 | 3rd Qu.: 6.000 | 3rd Qu.:13.40 | 3rd Qu.: 0.0000 | 3rd Qu.: 0.0000 | 3rd Qu.: 7.000 | 3rd Qu.:10.000 | 3rd Qu.: 6.000 | 3rd Qu.:13.33 | 3rd Qu.: 0.0000 | 3rd Qu.:13.90 | 3rd Qu.: 2.600 | 3rd Qu.: 1.790000 | |
| Max. :190.0 | Max. :190.0 | Max. :20.00 | Max. :26.000 | Max. :45.000 | Max. :26.000 | Max. :18.88 | Max. :12.0000 | Max. :19.0000 | Max. :23.000 | Max. :33.000 | Max. :20.000 | Max. :18.57 | Max. :12.0000 | Max. :16.20 | Max. : 3.700 | Max. : 3.510000 |
Note that the data set has no missing values. Moreover, from the last table above, no variable contains values outside its plausible range. This range check was done to rule out missing values encoded as sentinel codes such as 999 or -1, since in many data sets missing observations are recorded as values wildly inconsistent with the nature of the variable.
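As a quick check on that claim, the observed extremes of each continuous variable can be listed directly:
# Min and max of every continuous column; sentinel codes such as 999 or -1
# would stand out here as implausible extremes
sapply(data_c, range)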
Defining the target variable
Let us define the target variable for our classification problem
target <- as.factor(data$Target)
levels(target)## [1] "Dropout" "Enrolled" "Graduate"
Let us examine the distribution of the target classes against the variables Unemployment.rate, Inflation.rate and Age.at.enrollment:
- Unemployment.rate:
boxplot(Unemployment.rate ~ target)
Above, the Dropout and Graduate groups appear to behave similarly with respect to Unemployment.rate, while both differ from the Enrolled class, which shows a narrower interquartile range and hence less within-class variation in the unemployment rate. Note also that the median is the same across the three groups.
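The equal-medians observation can be verified numerically:
# Median unemployment rate within each class of the target
tapply(Unemployment.rate, target, median)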
boxplot(Inflation.rate ~ target)
In this scenario the Graduate class is the one that stands apart, showing a lower median Inflation.rate than the other two groups.
boxplot(Age.at.enrollment ~ target)
With respect to Age.at.enrollment the three classes do behave differently: the lowest values (and the largest number of outliers) are found in the Graduate class, while the Dropout class is dominated by higher values. This variable records the student's age at enrollment, so students who enroll younger tend to end up in the Graduate class, i.e. they actually graduate, whereas students who enroll at older ages tend to drop out of the course.
- Renaming the levels:
Let us replace the class names of the target variable with the numeric values 0, 1 and 2, respectively
levels(target) <- c(0,1,2)
levels(target)## [1] "0" "1" "2"
Class proportions of the target variable
round((table(target)/length(target))*100,2)## target
## 0 1 2
## 32.12 17.95 49.93
Note that the classes are unbalanced, which may cause problems for the classification task; this will be addressed further on with the methods seen in class. Note also that the majority class is class 2 (Graduate), with 49.93% of the data, and the minority class is class 1 (Enrolled), with 17.95%.
Standardizing the continuous variables
Next we standardize the continuous variables, since they are on different scales.
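scale() centers each column to mean 0 and rescales it to unit standard deviation; for a single column, the equivalent manual computation is:
# Manual z-score for one column, matching what scale() does per column
z = (data_c$AdmissionGrade - mean(data_c$AdmissionGrade)) / sd(data_c$AdmissionGrade)
all.equal(z, as.numeric(scale(data_c$AdmissionGrade)))  # TRUE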
data_c_std = data.frame(scale(data_c))
knitr::kable(head(data_c_std))
| PrevQualifiGrade | AdmissionGrade | X1stCredited | X1stEnrolled | X1stEvaluations | X1stApproved | X1stGrade | X1stWithoutEva | X2ndCredited | X2ndEnrolled | X2ndEvaluations | X2ndApproved | X2ndGrade | X2ndWithoutEva | Unemployment.rate | Inflation.rate | GDP |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -0.8047503 | 0.0222263 | -0.3007791 | -2.5282738 | -1.9858437 | -1.5210854 | -2.1968541 | -0.1992505 | -0.2824104 | -2.8380158 | -2.0423990 | -1.4713606 | -1.9632667 | -0.1994184 | -0.2876059 | 0.1243724 | 0.7656743 |
| 2.0765846 | 1.0718050 | -0.3007791 | -0.1090928 | -0.5501298 | 0.4180026 | 0.6935202 | -0.1992505 | -0.2824104 | -0.1057141 | -0.5226233 | 0.5188450 | 0.6594872 | -0.1994184 | 0.8761230 | -1.1050966 | 0.3471602 |
| -0.8047503 | -0.1504018 | -0.3007791 | -0.1090928 | -1.9858437 | -1.5210854 | -2.1968541 | -0.1992505 | -0.2824104 | -0.1057141 | -2.0423990 | -1.4713606 | -1.9632667 | -0.1994184 | -0.2876059 | 0.1243724 | 0.7656743 |
| -0.8047503 | -0.5094682 | -0.3007791 | -0.1090928 | -0.0715585 | 0.4180026 | 0.5755457 | -0.1992505 | -0.2824104 | -0.1057141 | 0.4905605 | 0.1871441 | 0.4164027 | -0.1994184 | -0.8131610 | -1.4667052 | -1.3753558 |
| -2.4728915 | 1.0027538 | -0.3007791 | -0.1090928 | 0.1677271 | 0.0948213 | 0.3494280 | -0.1992505 | -0.2824104 | -0.1057141 | -0.5226233 | 0.5188450 | 0.5315479 | -0.1994184 | 0.8761230 | -1.1050966 | 0.3471602 |
| 0.0369028 | -0.8409141 | -0.3007791 | -0.5122897 | 0.4070128 | 0.0948213 | 0.2511160 | -0.1992505 | -0.2824104 | -0.5610977 | 2.2636322 | 0.1871441 | 0.2436847 | 6.4338689 | 1.7395349 | -0.6711664 | -0.4061652 |
round(colMeans(data_c_std),4)## PrevQualifiGrade AdmissionGrade X1stCredited X1stEnrolled
## 0 0 0 0
## X1stEvaluations X1stApproved X1stGrade X1stWithoutEva
## 0 0 0 0
## X2ndCredited X2ndEnrolled X2ndEvaluations X2ndApproved
## 0 0 0 0
## X2ndGrade X2ndWithoutEva Unemployment.rate Inflation.rate
## 0 0 0 0
## GDP
## 0
Splitting the data
Merging the continuous and non-continuous data
Let us concatenate the non-continuous information with the standardized continuous variables.
data_final = data.frame(data, data_c_std)
data_final$Target <- as.factor(target)
Train and Test
We split the data into training and test sets, with proportions of 80% and 20%, respectively.
set.seed(13)
in.trn <- createDataPartition(target, p = .80, list = FALSE)
trn <- data_final[in.trn,]
tst <- data_final[-in.trn,]
tst.features = subset(tst, select = -c(Target))
tst.target = subset(tst, select = Target)[,1]
trn.features = subset(trn, select = -c(Target))
trn.target = subset(trn, select = Target)[,1]
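createDataPartition samples within each class, so both folds should preserve the original class proportions; a quick check on the test fold:
# Class proportions in the test set (compare with the full-data 32.12/17.95/49.93)
round((table(tst.target)/length(tst.target))*100, 2)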
Correcting the class imbalance in the training data
- Unbalanced
props = table(trn.target)
(round((props/length(trn.target))*100,2))## trn.target
## 0 1 2
## 32.11 17.96 49.93
- Balanced
trn.balanced <- SMOTE(Target ~ ., trn,
perc.over=max(props)-min(props))
round((table(trn.balanced$Target)/length(trn.balanced$Target))*100,2)##
## 0 1 2
## 25.09 35.29 39.61
trn.features.balanced = subset(trn.balanced, select = -c(Target))
trn.target.balanced = subset(trn.balanced, select = Target)[,1]
The class proportions are now much closer to balanced.
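For reference, DMwR's SMOTE builds synthetic minority-class cases by interpolating between a minority observation and its k nearest neighbours; perc.over sets how many synthetic cases are generated and perc.under how many majority cases are kept. A toy call with explicit percentages (values chosen only for illustration):
# 200% oversampling: two synthetic cases per original minority case;
# perc.under = 150: 1.5 majority cases kept per synthetic case created
toy.balanced <- SMOTE(Target ~ ., trn, perc.over = 200, perc.under = 150)
table(toy.balanced$Target)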
Model training
The models below address the following supervised learning task:
- Task: classify whether a student belongs to the class Dropout (a student who quit the course), Enrolled (still enrolled), or Graduate (graduated). In addition, once the best model is obtained, it will return the probability that a student drops out, remains enrolled, or graduates.
This task is useful in many ways: predicting dropout and returning its probability; predicting the probability that a student graduates; and helping a department or centre forecast overall attrition among its students in order to reduce it. The model's feature importance can also be extracted to discover which variables most influence dropout, so that they can be acted upon, or which variables matter most for completing the course, so that they can receive further investment.
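With caret, both outputs mentioned above are available directly from a fitted model; a sketch, where fit stands for one of the models trained below:
# Per-class probabilities for the test students (fit is a caret train object)
head(predict(fit, newdata = tst.features, type = "prob"))
# Ranking of the most influential predictors
varImp(fit)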
Decision tree
Cross-Validation
We will use 10-fold cross-validation
set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)
Parameter Tuning
We run a grid search to find the value of cp (rpart's complexity parameter) that maximizes the model's accuracy.
set.seed(13)
tuneGrid <- expand.grid(
  cp = seq(0, 1, by = .01)
)
Training the model
set.seed(13)
fit.dtc <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'rpart',
  preProcess = c("scale"),
  trControl = ctrl,
  #summaryFunction=twoClassSummary,
  #classProbs=T,
  #savePredictions=T,
  tuneGrid = tuneGrid
)
fit.dtc## CART
##
## 21624 samples
## 36 predictor
## 3 classes: '0', '1', '2'
##
## Pre-processing: scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00 0.9313256 0.8952968
## 0.01 0.7911574 0.6786442
## 0.02 0.7363573 0.5922283
## 0.03 0.7202179 0.5654364
## 0.04 0.7202179 0.5654364
## 0.05 0.7202179 0.5654364
## 0.06 0.7202179 0.5654364
## 0.07 0.7202179 0.5654364
## 0.08 0.7202179 0.5654364
## 0.09 0.7202179 0.5654364
## 0.10 0.7202179 0.5654364
## 0.11 0.7202179 0.5654364
## 0.12 0.7202179 0.5654364
## 0.13 0.7202179 0.5654364
## 0.14 0.7202179 0.5654364
## 0.15 0.7202179 0.5654364
## 0.16 0.7202179 0.5654364
## 0.17 0.7202179 0.5654364
## 0.18 0.7202179 0.5654364
## 0.19 0.7202179 0.5654364
## 0.20 0.6010444 0.3618105
## 0.21 0.6010444 0.3618105
## 0.22 0.6010444 0.3618105
## 0.23 0.6010444 0.3618105
## 0.24 0.6010444 0.3618105
## 0.25 0.6010444 0.3618105
## 0.26 0.6010444 0.3618105
## 0.27 0.6010444 0.3618105
## 0.28 0.6010444 0.3618105
## 0.29 0.6010444 0.3618105
## 0.30 0.6010444 0.3618105
## 0.31 0.6010444 0.3618105
## 0.32 0.6010444 0.3618105
## 0.33 0.6010444 0.3618105
## 0.34 0.4746165 0.1391573
## 0.35 0.3961339 0.0000000
## 0.36 0.3961339 0.0000000
## 0.37 0.3961339 0.0000000
## 0.38 0.3961339 0.0000000
## 0.39 0.3961339 0.0000000
## 0.40 0.3961339 0.0000000
## 0.41 0.3961339 0.0000000
## 0.42 0.3961339 0.0000000
## 0.43 0.3961339 0.0000000
## 0.44 0.3961339 0.0000000
## 0.45 0.3961339 0.0000000
## 0.46 0.3961339 0.0000000
## 0.47 0.3961339 0.0000000
## 0.48 0.3961339 0.0000000
## 0.49 0.3961339 0.0000000
## 0.50 0.3961339 0.0000000
## 0.51 0.3961339 0.0000000
## 0.52 0.3961339 0.0000000
## 0.53 0.3961339 0.0000000
## 0.54 0.3961339 0.0000000
## 0.55 0.3961339 0.0000000
## 0.56 0.3961339 0.0000000
## 0.57 0.3961339 0.0000000
## 0.58 0.3961339 0.0000000
## 0.59 0.3961339 0.0000000
## 0.60 0.3961339 0.0000000
## 0.61 0.3961339 0.0000000
## 0.62 0.3961339 0.0000000
## 0.63 0.3961339 0.0000000
## 0.64 0.3961339 0.0000000
## 0.65 0.3961339 0.0000000
## 0.66 0.3961339 0.0000000
## 0.67 0.3961339 0.0000000
## 0.68 0.3961339 0.0000000
## 0.69 0.3961339 0.0000000
## 0.70 0.3961339 0.0000000
## 0.71 0.3961339 0.0000000
## 0.72 0.3961339 0.0000000
## 0.73 0.3961339 0.0000000
## 0.74 0.3961339 0.0000000
## 0.75 0.3961339 0.0000000
## 0.76 0.3961339 0.0000000
## 0.77 0.3961339 0.0000000
## 0.78 0.3961339 0.0000000
## 0.79 0.3961339 0.0000000
## 0.80 0.3961339 0.0000000
## 0.81 0.3961339 0.0000000
## 0.82 0.3961339 0.0000000
## 0.83 0.3961339 0.0000000
## 0.84 0.3961339 0.0000000
## 0.85 0.3961339 0.0000000
## 0.86 0.3961339 0.0000000
## 0.87 0.3961339 0.0000000
## 0.88 0.3961339 0.0000000
## 0.89 0.3961339 0.0000000
## 0.90 0.3961339 0.0000000
## 0.91 0.3961339 0.0000000
## 0.92 0.3961339 0.0000000
## 0.93 0.3961339 0.0000000
## 0.94 0.3961339 0.0000000
## 0.95 0.3961339 0.0000000
## 0.96 0.3961339 0.0000000
## 0.97 0.3961339 0.0000000
## 0.98 0.3961339 0.0000000
## 0.99 0.3961339 0.0000000
## 1.00 0.3961339 0.0000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
plot(fit.dtc)
Let us make predictions so we can evaluate the model after tuning:
predictions.dtc = predict(fit.dtc, newdata = tst.features)
- Confusion matrix:
(cm.dtc = confusionMatrix(predictions.dtc, tst.target, mode='prec_recall'))## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 197 49 48
## 1 39 43 43
## 2 48 66 350
##
## Overall Statistics
##
## Accuracy : 0.6682
## 95% CI : (0.636, 0.6992)
## No Information Rate : 0.4994
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4517
##
## Mcnemar's Test P-Value : 0.1121
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Precision 0.6701 0.3440 0.7543
## Recall 0.6937 0.2722 0.7937
## F1 0.6817 0.3039 0.7735
## Prevalence 0.3216 0.1789 0.4994
## Detection Rate 0.2231 0.0487 0.3964
## Detection Prevalence 0.3330 0.1416 0.5255
## Balanced Accuracy 0.7659 0.5795 0.7679
# Overall accuracy computed directly from the diagonal of the confusion matrix
sum(diag(cm.dtc$table))/sum(cm.dtc$table)## [1] 0.6681767
The model performs reasonably well with respect to the precision of class 2. In terms of overall accuracy, the decision tree reaches approximately 66.82%.
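The per-class statistics quoted above can also be extracted programmatically from the confusionMatrix object:
# Precision, recall and F1 per class, as a matrix
cm.dtc$byClass[, c("Precision", "Recall", "F1")]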
Random Forest
Cross-Validation
We will use 10-fold cross-validation
set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)
Parameter Tuning
We run a grid search to find the value of mtry that maximizes the model's accuracy.
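mtry is the number of predictors sampled at random as split candidates at each node. For classification, randomForest's default is the square root of the number of predictors, so this grid searches below the default:
# Default mtry for classification with the 36 predictors used here
floor(sqrt(36))  # 6; the grid below tries mtry = 1..4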
set.seed(13)
tuneGrid <- expand.grid(
  mtry = 1:4
)
Training the model
set.seed(13)
fit.rf <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'rf',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)
fit.rf## Random Forest
##
## 21624 samples
## 36 predictor
## 3 classes: '0', '1', '2'
##
## Pre-processing: scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.9171753 0.8727255
## 2 0.9850627 0.9771926
## 3 0.9910745 0.9863805
## 4 0.9912598 0.9866641
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
plot(fit.rf)
Let us make predictions so we can evaluate the model after tuning:
predictions.rf = predict(fit.rf, newdata = tst.features)
- Confusion matrix:
(cm.rf = confusionMatrix(predictions.rf, tst.target, mode='prec_recall'))## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 204 21 17
## 1 46 73 34
## 2 34 64 390
##
## Overall Statistics
##
## Accuracy : 0.7554
## 95% CI : (0.7256, 0.7834)
## No Information Rate : 0.4994
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5956
##
## Mcnemar's Test P-Value : 2.292e-05
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Precision 0.8430 0.47712 0.7992
## Recall 0.7183 0.46203 0.8844
## F1 0.7757 0.46945 0.8396
## Prevalence 0.3216 0.17894 0.4994
## Detection Rate 0.2310 0.08267 0.4417
## Detection Prevalence 0.2741 0.17327 0.5527
## Balanced Accuracy 0.8274 0.67584 0.8313
# Overall accuracy computed directly from the diagonal of the confusion matrix
sum(diag(cm.rf$table))/sum(cm.rf$table)## [1] 0.7553794
The random forest performs considerably better than the decision tree, as expected, when we compare the per-class precision. Its overall accuracy of approximately 75.54% is also well above the decision tree's. So far, the random forest is the best-performing model.
Multinom
Cross-Validation
We will use 10-fold cross-validation
set.seed(13)
ctrl <- trainControl(
  method = "cv",
  number = 10
)
Parameter Tuning
We run a grid search to find the value of decay that maximizes the model's accuracy.
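In caret's 'multinom' (fitted via nnet), decay is the weight-decay parameter: an L2 penalty decay * sum(beta^2) added to the negative log-likelihood, shrinking the coefficients toward zero. A tiny illustration of the penalty term:
# L2 penalty contributed by a coefficient vector for a given decay value
l2_penalty <- function(decay, beta) decay * sum(beta^2)
l2_penalty(0.1, c(1, -2, 0.5))  # 0.525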
set.seed(13)
tuneGrid <- expand.grid(
  decay = seq(10**(-3), 1, length.out = 10)
)
Training the model
set.seed(13)
fit.multinom <- train(
  Target ~ .,
  data = trn.balanced,
  method = 'multinom',
  preProcess = c("scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)## # weights: 114 (74 variable)
## initial value 21381.192362
## iter 10 value 14716.676382
## iter 20 value 14516.680449
## iter 30 value 14225.777056
## iter 40 value 13385.968827
## iter 50 value 13051.744865
## iter 60 value 12595.397115
## iter 70 value 12363.592893
## iter 80 value 11739.749537
## iter 90 value 11526.661140
## final value 11526.661069
## converged
multinom prints one convergence trace like the one above for every fold and decay combination fitted during cross-validation; the remaining traces are omitted here. They can be suppressed entirely by passing trace = FALSE through train().
## iter 30 value 14192.627454
## iter 40 value 13382.848346
## iter 50 value 13055.945431
## iter 60 value 12680.251182
## iter 70 value 12399.909099
## iter 80 value 11754.490234
## iter 90 value 11545.415946
## iter 90 value 11545.415866
## iter 90 value 11545.415866
## final value 11545.415866
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14680.080590
## iter 20 value 14487.490321
## iter 30 value 14193.206775
## iter 40 value 13381.428542
## iter 50 value 13060.064646
## iter 60 value 12684.679285
## iter 70 value 12396.118432
## iter 80 value 11759.424331
## iter 90 value 11550.036506
## iter 90 value 11550.036432
## iter 90 value 11550.036432
## final value 11550.036432
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14681.314492
## iter 20 value 14489.210287
## iter 30 value 14193.761580
## iter 40 value 13380.109028
## iter 50 value 13064.152397
## iter 60 value 12689.515629
## iter 70 value 12410.603586
## iter 80 value 11752.036323
## iter 90 value 11554.622651
## iter 90 value 11554.622579
## iter 90 value 11554.622579
## final value 11554.622579
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14682.547742
## iter 20 value 14490.927600
## iter 30 value 14194.291932
## iter 40 value 13378.892514
## iter 50 value 13068.206802
## iter 60 value 12694.978441
## iter 70 value 12405.392305
## iter 80 value 11765.219619
## iter 90 value 11559.175094
## iter 90 value 11559.175010
## iter 90 value 11559.175010
## final value 11559.175010
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14683.780340
## iter 20 value 14492.642271
## iter 30 value 14194.797896
## iter 40 value 13377.781668
## iter 50 value 13072.225986
## iter 60 value 12625.922053
## iter 70 value 12377.674760
## iter 80 value 11774.814969
## iter 90 value 11563.694497
## iter 90 value 11563.694406
## iter 90 value 11563.694405
## final value 11563.694405
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14685.012287
## iter 20 value 14494.354313
## iter 30 value 14195.279529
## iter 40 value 13376.779109
## iter 50 value 13076.208051
## iter 60 value 12629.301459
## iter 70 value 12380.513065
## iter 80 value 11778.474534
## iter 90 value 11568.181468
## iter 90 value 11568.181393
## iter 90 value 11568.181392
## final value 11568.181392
## converged
## # weights: 114 (74 variable)
## initial value 21380.093750
## iter 10 value 14686.243583
## iter 20 value 14496.063736
## iter 30 value 14195.736890
## iter 40 value 13375.887375
## iter 50 value 13080.151012
## iter 60 value 12633.108251
## iter 70 value 12384.747044
## iter 80 value 11782.697420
## iter 90 value 11572.636690
## iter 90 value 11572.636606
## iter 90 value 11572.636605
## final value 11572.636605
## converged
## # weights: 114 (74 variable)
## initial value 23756.392130
## iter 10 value 16170.664970
## iter 20 value 15907.576583
## iter 30 value 15714.844017
## iter 40 value 14990.471317
## iter 50 value 14555.964670
## iter 60 value 14114.451142
## iter 70 value 13783.215874
## iter 80 value 13104.469011
## iter 90 value 12874.863196
## iter 90 value 12874.863144
## iter 90 value 12874.863143
## final value 12874.863143
## converged
fit.multinom
## Penalized Multinomial Regression
##
## 21624 samples
## 36 predictor
## 3 classes: '0', '1', '2'
##
## Pre-processing: scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ...
## Resampling results across tuning parameters:
##
## decay Accuracy Kappa
## 0.001 0.7609608 0.6319636
## 0.112 0.7607759 0.6316827
## 0.223 0.7607759 0.6316809
## 0.334 0.7610996 0.6321867
## 0.445 0.7611921 0.6323248
## 0.556 0.7614232 0.6326802
## 0.667 0.7613306 0.6325346
## 0.778 0.7613307 0.6325250
## 0.889 0.7614231 0.6326621
## 1.000 0.7612844 0.6324512
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 0.556.
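As a quick check, the winning tuning parameter can also be read directly from the fitted caret object (a minimal sketch using the object above):
# bestTune holds the row of the tuning grid selected by resampling
fit.multinom$bestTune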
plot(fit.multinom)
Let us now generate predictions to evaluate the model after tuning:
predictions.multinom = predict(fit.multinom, newdata = tst.features)
- Confusion matrix:
(cm.multinom = confusionMatrix(predictions.multinom, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 193 16 11
## 1 65 99 69
## 2 26 43 361
##
## Overall Statistics
##
## Accuracy : 0.7395
## 95% CI : (0.7092, 0.7682)
## No Information Rate : 0.4994
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5862
##
## Mcnemar's Test P-Value : 4.514e-09
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Precision 0.8773 0.4249 0.8395
## Recall 0.6796 0.6266 0.8186
## F1 0.7659 0.5064 0.8289
## Prevalence 0.3216 0.1789 0.4994
## Detection Rate 0.2186 0.1121 0.4088
## Detection Prevalence 0.2492 0.2639 0.4870
## Balanced Accuracy 0.8173 0.7209 0.8312
a = cm.multinom$table[1,1]+cm.multinom$table[2,2]+cm.multinom$table[1,1] # note: [1,1] appears twice; [3,3] was likely intended
(a)/(a + cm.multinom$table[2,1]+cm.multinom$table[3,1]+cm.multinom$table[1,2]+cm.multinom$table[1,3])## [1] 0.8043118
We can see that the multinomial model performed considerably better than the previous models with respect to precision for all three classes. Moreover, its overall accuracy of approximately 73.95% is slightly below that of the random forest model. If the goal is to recover class probabilities or to single out individuals in the Dropout and Graduate classes, this would be the recommended model, since it shows the highest precision for both of those classes.
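Since the ad-hoc ratio above is hard to interpret, a cleaner, hedged alternative is to collapse the 3x3 table into a one-vs-rest 2x2 table for a given class and compute a binary accuracy; ovr_accuracy below is a hypothetical helper, not part of caret.
# Hypothetical helper: one-vs-rest binary accuracy for class k,
# from a confusionMatrix table (rows = Prediction, columns = Reference)
ovr_accuracy = function(tab, k){
  tp = tab[k, k]          # predicted k, actually k
  fp = sum(tab[k, -k])    # predicted k, actually another class
  fn = sum(tab[-k, k])    # predicted another class, actually k
  tn = sum(tab[-k, -k])   # everything else
  (tp + tn) / (tp + fp + fn + tn)
}
ovr_accuracy(cm.multinom$table, 1)  # Dropout (class 0) vs rest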
LDA
Cross-Validation
A 10-fold cross-validation will be used.
set.seed(13)
ctrl <- trainControl(
method = "cv",
number = 10,
)
Parameter Tuning
A grid of parameters will be considered, with dimen varying from 0 to 5.
set.seed(13)
tuneGrid <- expand.grid(
dimen = 0:5
)
Training the model
set.seed(13)
fit.lda <- train(
Target ~ .,
data = trn.balanced,
method = 'lda2',
preProcess = c("scale"),
trControl = ctrl,
tuneGrid = tuneGrid
)
fit.lda
## Linear Discriminant Analysis
##
## 21624 samples
## 36 predictor
## 3 classes: '0', '1', '2'
##
## Pre-processing: scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19462, 19462, 19462, 19462, 19461, 19463, ...
## Resampling results across tuning parameters:
##
## dimen Accuracy Kappa
## 0 0.7211428 0.5697152
## 1 0.7211428 0.5697152
## 2 0.7325196 0.5862354
## 3 0.7325196 0.5862354
## 4 0.7325196 0.5862354
## 5 0.7325196 0.5862354
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was dimen = 2.
plot(fit.lda)
Let us now generate predictions to evaluate the model after tuning:
predictions.lda = predict(fit.lda, newdata = tst.features)
- Confusion matrix:
(cm.lda = confusionMatrix(predictions.lda, tst.target, mode='prec_recall'))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 184 12 4
## 1 72 95 92
## 2 28 51 345
##
## Overall Statistics
##
## Accuracy : 0.7067
## 95% CI : (0.6754, 0.7365)
## No Information Rate : 0.4994
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.538
##
## Mcnemar's Test P-Value : 1.177e-15
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Precision 0.9200 0.3668 0.8137
## Recall 0.6479 0.6013 0.7823
## F1 0.7603 0.4556 0.7977
## Prevalence 0.3216 0.1789 0.4994
## Detection Rate 0.2084 0.1076 0.3907
## Detection Prevalence 0.2265 0.2933 0.4802
## Balanced Accuracy 0.8106 0.6875 0.8018
a = cm.lda$table[1,1]+cm.lda$table[2,2]+cm.lda$table[1,1] # note: [1,1] appears twice; [3,3] was likely intended
(a)/(a + cm.lda$table[2,1]+cm.lda$table[3,1]+cm.lda$table[1,2]+cm.lda$table[1,3])## [1] 0.7996546
We can see that the LDA-based model achieved the highest precision so far for class 0, but lower precision for classes 1 and 2 than the multinomial model. Moreover, its overall accuracy of approximately 70.67% falls below that of the multinomial model. The high precision values for classes 0 and 2 (Dropout and Graduate) are worth highlighting: the model rarely mislabels an individual it assigns to either of these classes.
Results
predictions.dtc.prob = predict(fit.dtc, newdata = tst.features,type='prob')
predictions.rf.prob = predict(fit.rf, newdata = tst.features,type='prob')
predictions.multinom.prob = predict(fit.multinom, newdata = tst.features,type='prob')
predictions.lda.prob = predict(fit.lda, newdata = tst.features,type='prob')
n = 1000
cutoffs = seq(0,1, length.out=n)
c(min(cutoffs),max(cutoffs))## [1] 0 1
ROC curve per class
Let us examine the ROC curve for classes 0, 1, and 2.
cols = c('red','blue','purple','brown')
models = c('DecisionTree','RandomForest','Multinom','LDA')
types = 1:length(models)
Dropout class
n_class = 1
tpr.dtc = numeric(n); fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)
tprs = data.frame(tpr.dtc, fpr.dtc,
tpr.rf, fpr.rf,
tpr.multinom,fpr.multinom,
tpr.lda, fpr.lda)
tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)
names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
for (i in 1:length(cutoffs)){
if (names[j] == 'dtc'){
temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'rf'){
temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'multinom'){
temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'lda'){
temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
break
}
}
}
}
m = matrix(c(0,0,0,0),2,2)
#print(cutoffs[i])
m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0))
m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1))
m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1))
m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0))
tp = m[2,2]
fn = m[1,2]
tn = m[1,1]
fp = m[2,1]
if (names[j] == 'dtc'){
tpr.dtc[i] = tp/(tp+fn)
fpr.dtc[i] = fp/(fp+tn)
}else{
if(names[j] == 'rf'){
tpr.rf[i] = tp/(tp+fn)
fpr.rf[i] = fp/(fp+tn)
}else{
if(names[j] == 'multinom'){
tpr.multinom[i] = tp/(tp+fn)
fpr.multinom[i] = fp/(fp+tn)
}else{
if(names[j] == 'lda'){
tpr.lda[i] = tp/(tp+fn)
fpr.lda[i] = fp/(fp+tn)
}else{
break
}
}
}
}
}
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))## [1] 89.81
(auc.rf = auc(fpr.rf, tpr.rf))## [1] 92.01
(auc.multinom = auc(fpr.multinom, tpr.multinom))## [1] 90.87
(auc.lda = auc(fpr.lda, tpr.lda))## [1] 91.15
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
length(fpr.dtc); length(tpr.dtc)## [1] 1001
## [1] 1001
plot(y=tpr.dtc,x=fpr.dtc,
type='l',col='red',
xlim=c(0,1),ylim=c(0,1),
xlab='fpr',ylab='tpr',
lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Classe Dropout')
legend('bottomright',
legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
col=c(cols,'black'), lty=c(types,1), cex=0.8)
Enrolled class
n_class = 2
tpr.dtc = numeric(n); fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)
tprs = data.frame(tpr.dtc, fpr.dtc,tpr.rf, fpr.rf,tpr.multinom, fpr.multinom,tpr.lda, fpr.lda)
tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)
names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
for (i in 1:length(cutoffs)){
if (names[j] == 'dtc'){
temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'rf'){
temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'multinom'){
temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'lda'){
temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
break
}
}
}
}
m = matrix(c(0,0,0,0),2,2)
#print(cutoffs[i])
m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0))
m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1))
m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1))
m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0))
tp = m[2,2]
fn = m[1,2]
tn = m[1,1]
fp = m[2,1]
if (names[j] == 'dtc'){
tpr.dtc[i] = tp/(tp+fn)
fpr.dtc[i] = fp/(fp+tn)
}else{
if(names[j] == 'rf'){
tpr.rf[i] = tp/(tp+fn)
fpr.rf[i] = fp/(fp+tn)
}else{
if(names[j] == 'multinom'){
tpr.multinom[i] = tp/(tp+fn)
fpr.multinom[i] = fp/(fp+tn)
}else{
if(names[j] == 'lda'){
tpr.lda[i] = tp/(tp+fn)
fpr.lda[i] = fp/(fp+tn)
}else{
break
}
}
}
}
#print(c(tpr,fpr))
#break
}
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))## [1] 68.64
(auc.rf = auc(fpr.rf, tpr.rf))## [1] 80.83
(auc.multinom = auc(fpr.multinom, tpr.multinom))## [1] 78.17
(auc.lda = auc(fpr.lda, tpr.lda))## [1] 77.22
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
plot(y=tpr.dtc,x=fpr.dtc,
type='l',col='red',
xlim=c(0,1),ylim=c(0,1),
xlab='fpr',ylab='tpr',
lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Classe Enrolled')
legend('bottomright',
legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
col=c(cols,'black'), lty=c(types,1), cex=0.8)
Graduate class
n_class = 3
tpr.dtc = numeric(n); fpr.dtc = numeric(n)
tpr.rf = numeric(n);fpr.rf = numeric(n)
tpr.multinom = numeric(n);fpr.multinom = numeric(n)
tpr.lda = numeric(n);fpr.lda = numeric(n)
tprs = data.frame(tpr.dtc, fpr.dtc,tpr.rf, fpr.rf,tpr.multinom, fpr.multinom,tpr.lda, fpr.lda)
tst.target.roc = ifelse(tst.target == n_class-1, 1, 0)
names = c('dtc','rf','multinom','lda')
for (j in 1:length(names)){
for (i in 1:length(cutoffs)){
if (names[j] == 'dtc'){
temp = ifelse(predictions.dtc.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'rf'){
temp = ifelse(predictions.rf.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'multinom'){
temp = ifelse(predictions.multinom.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
if(names[j] == 'lda'){
temp = ifelse(predictions.lda.prob[,n_class] >= cutoffs[i], 1, 0)
}else{
break
}
}
}
}
m = matrix(c(0,0,0,0),2,2)
#print(cutoffs[i])
m[1,1] = sum((temp == tst.target.roc) & (tst.target.roc == 0))
m[2,2] = sum((temp == tst.target.roc) & (tst.target.roc == 1))
m[1,2] = sum((temp != tst.target.roc) & (tst.target.roc == 1))
m[2,1] = sum((temp != tst.target.roc) & (tst.target.roc == 0))
tp = m[2,2]
fn = m[1,2]
tn = m[1,1]
fp = m[2,1]
if (names[j] == 'dtc'){
tpr.dtc[i] = tp/(tp+fn)
fpr.dtc[i] = fp/(fp+tn)
}else{
if(names[j] == 'rf'){
tpr.rf[i] = tp/(tp+fn)
fpr.rf[i] = fp/(fp+tn)
}else{
if(names[j] == 'multinom'){
tpr.multinom[i] = tp/(tp+fn)
fpr.multinom[i] = fp/(fp+tn)
}else{
if(names[j] == 'lda'){
tpr.lda[i] = tp/(tp+fn)
fpr.lda[i] = fp/(fp+tn)
}else{
break
}
}
}
}
#print(c(tpr,fpr))
#break
}
}
(auc.dtc = auc(fpr.dtc, tpr.dtc))## [1] 88.29
(auc.rf = auc(fpr.rf, tpr.rf))## [1] 92.15
(auc.multinom = auc(fpr.multinom, tpr.multinom))## [1] 90.83
(auc.lda = auc(fpr.lda, tpr.lda))## [1] 89.74
tpr.dtc = c(tpr.dtc, 0)
fpr.dtc = c(fpr.dtc, 0)
plot(y=tpr.dtc,x=fpr.dtc,
type='l',col='red',
xlim=c(0,1),ylim=c(0,1),
xlab='fpr',ylab='tpr',
lty=types[1])
lines(fpr.rf,tpr.rf,type='l',col='blue',lty=types[2])
lines(fpr.multinom,tpr.multinom,type='l',col='purple',lty=types[3])
lines(fpr.lda,tpr.lda,type='l',col='brown',lty=types[4])
abline(0,1)
grid()
title('Classe Graduate')
legend('bottomright',
legend=paste(c(models,'Random Classifier'), '(AUC =',paste(c(auc.dtc,auc.rf,auc.multinom,auc.lda, '50.00'),'%',sep=''),')'),
col=c(cols,'black'), lty=c(types,1), cex=0.8)
We can see that the RandomForest model achieved the largest area under the ROC curve for the Dropout class, followed closely by the Multinom and LDA models, which performed similarly for this class. The DecisionTree model had the worst performance.
Analogously, the RandomForest and Multinom models performed similarly for the Graduate class with respect to the area under the ROC curve, followed by the LDA model, with the DecisionTree model again showing the worst performance.
The RandomForest model will therefore be used, since it performed best on the test set for all classes, showing the largest area under the ROC curve.
Thus, as stated at the beginning of the model-training section, our task is to "classify whether the student belongs to the Dropout class (a student who withdrew), Enrolled (still enrolled in the course), or Graduate (a student who graduated); in addition, once the best model is obtained, it will return the probability of the student dropping out, still being in the course, or graduating." The RandomForest-based model is therefore selected. We will also examine the LDA model, for comparison purposes with respect to the Dropout class.
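Since the nested if/else ROC loop above is repeated verbatim for each of the three classes, here is a hedged sketch of how it could be factored into a single helper; roc_points is a hypothetical function that assumes the probability data frames and tst.target defined earlier are in scope.
# Hypothetical helper: one-vs-rest ROC points for one class,
# given a matrix/data frame of class probabilities
roc_points = function(prob, target, n_class, cutoffs){
  y = ifelse(target == n_class - 1, 1, 0)            # one-vs-rest labels
  tpr = numeric(length(cutoffs)); fpr = numeric(length(cutoffs))
  for (i in seq_along(cutoffs)){
    pred = ifelse(prob[, n_class] >= cutoffs[i], 1, 0)
    tpr[i] = sum(pred == 1 & y == 1) / sum(y == 1)   # TP / (TP + FN)
    fpr[i] = sum(pred == 1 & y == 0) / sum(y == 0)   # FP / (FP + TN)
  }
  list(tpr = tpr, fpr = fpr)
}
# Usage, e.g. the Dropout class with the random-forest probabilities:
# r = roc_points(predictions.rf.prob, tst.target, 1, cutoffs)
# auc(r$fpr, r$tpr)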
Feature Importance
LDA
predictions.probs = predict(fit.lda, newdata = tst.features, type='prob')
predictions.raw = predict(fit.lda, newdata = tst.features, type='raw')
lda.probs.raw = data.frame(predictions.probs, 'class' = predictions.raw)
knitr::kable(head(lda.probs.raw))
|  | X0 | X1 | X2 | class |
|---|---|---|---|---|
| 6 | 0.0302128 | 0.4988740 | 0.4709132 | 1 |
| 11 | 0.0143699 | 0.3188791 | 0.6667509 | 2 |
| 14 | 0.0022112 | 0.1450597 | 0.8527291 | 2 |
| 21 | 0.0968446 | 0.5205162 | 0.3826392 | 1 |
| 24 | 0.0018025 | 0.0891333 | 0.9090642 | 2 |
| 27 | 0.0052371 | 0.3641687 | 0.6305942 | 2 |
lda.probs = NULL
for (i in 1:(dim(lda.probs.raw)[1])){
lda.probs[i] = predictions.probs[i,as.integer(lda.probs.raw$class[i])]
}
probs.class = data.frame('probs'=lda.probs,'class'=lda.probs.raw$class)
knitr::kable(head(probs.class))
| probs | class |
|---|---|
| 0.4988740 | 1 |
| 0.6667509 | 2 |
| 0.8527291 | 2 |
| 0.5205162 | 1 |
| 0.9090642 | 2 |
| 0.6305942 | 2 |
attach(probs.class)
plot(probs ~ class)
plot(probs, col=c('blue','black','red')[as.integer(class)])
legend('bottomright', legend=c("Dropout", "Enrolled","Graduate"),
col=c("blue", "black","red"), lty=1, cex=0.8)
We can see that the predictions for the Dropout class had the highest probabilities, showing that the model classifies this class well. The Graduate class behaves similarly, mostly with high values, although not as close to 1. For the Enrolled class, however, there is greater uncertainty: the winning probabilities oscillate roughly between 0.5 and 0.9, a considerable range. In short, the model predicts the probability of a student dropping out well, which is valuable information for the centers and departments the student is affiliated with. It is also worth examining which variables have the greatest impact on these classifications.
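To put numbers on this pattern, a minimal sketch using the probs.class data frame built above:
# Mean and spread of the winning-class probability, per predicted class
tapply(probs.class$probs, probs.class$class, mean)
tapply(probs.class$probs, probs.class$class, sd)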
- Feature importance
lda.Imp <- varImp(fit.lda, scale = FALSE)
lda.Imp$importance[order(-lda.Imp$importance$X0),]
## X0 X1 X2
## X2ndApproved 0.8986842 0.8217307 0.8986842
## X1stApproved 0.8635956 0.7919111 0.8635956
## X2ndGrade 0.8492189 0.7401704 0.8492189
## X1stGrade 0.8120677 0.7214242 0.8120677
## Age.at.enrollment 0.6907442 0.6322727 0.6907442
## X2ndEvaluations 0.6497901 0.6702177 0.6702177
## Tuition.fees.up.to.date 0.6474801 0.6339366 0.6474801
## X2ndEnrolled 0.6450975 0.6426893 0.6450975
## Scholarship.holder 0.6443052 0.6133253 0.6443052
## X1stEnrolled 0.6385021 0.6372147 0.6385021
## Gender 0.6333169 0.5686504 0.6333169
## Application.mode 0.6255652 0.5707959 0.6255652
## X1stEvaluations 0.6045037 0.6508959 0.6508959
## Debtor 0.5839102 0.5565720 0.5839102
## AdmissionGrade 0.5706766 0.5665177 0.5706766
## Application.order 0.5696637 0.5696637 0.5529060
## PrevQualifiGrade 0.5654050 0.5902006 0.5902006
## Displaced 0.5635014 0.5374181 0.5635014
## Previous.qualification 0.5617439 0.5538519 0.5617439
## Mother.s.qualification 0.5444876 0.5444876 0.5194112
## ï..Marital.status 0.5429879 0.5253515 0.5429879
## GDP 0.5386890 0.5336235 0.5386890
## Unemployment.rate 0.5287641 0.5371242 0.5371242
## Daytime.evening.attendance. 0.5267579 0.5267579 0.5236288
## X2ndWithoutEva 0.5254434 0.5499886 0.5499886
## Mother.s.occupation 0.5218603 0.5188915 0.5218603
## X1stWithoutEva 0.5216097 0.5433383 0.5433383
## Father.s.qualification 0.5210370 0.5324671 0.5324671
## Inflation.rate 0.5207763 0.5146957 0.5207763
## Father.s.occupation 0.5190514 0.5119659 0.5190514
## Course 0.5175558 0.5175558 0.5107242
## X1stCredited 0.5147028 0.5147028 0.5079589
## X2ndCredited 0.5140765 0.5114905 0.5140765
## Educational.special.needs 0.5024780 0.5013524 0.5024780
## Nacionality 0.5017635 0.5017635 0.5011255
## International 0.5017442 0.5017442 0.5010535
plot(lda.Imp)
We can see that the variables X2ndApproved, X1stApproved, X2ndGrade, and X1stGrade are highly influential in determining the class an individual belongs to. This makes sense: X1stApproved and X2ndApproved give the number of curricular units the individual passed in the first and second semesters, which strongly influences a student's decision to remain in the course or drop out. Analogously, X1stGrade and X2ndGrade refer to the grades obtained in the first and second semesters; low grades signal difficulty progressing in the course, for instance when failed units are prerequisites for more advanced ones, and can likewise push a student toward dropping out.
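Since class 1 (Enrolled) is the hardest to predict, it may also be worth ranking the variables by their importance for that class specifically (a minimal sketch using the lda.Imp object above):
# Top variables by importance for the Enrolled class (column X1)
head(lda.Imp$importance[order(-lda.Imp$importance$X1), ], 5)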
RandomForest
predictions.probs = predict(fit.rf, newdata = tst.features, type='prob')
predictions.raw = predict(fit.rf, newdata = tst.features, type='raw')
rf.probs.raw = data.frame(predictions.probs, 'class' = predictions.raw)
knitr::kable(head(rf.probs.raw))
|  | X0 | X1 | X2 | class |
|---|---|---|---|---|
| 6 | 0.352 | 0.250 | 0.398 | 2 |
| 11 | 0.116 | 0.242 | 0.642 | 2 |
| 14 | 0.146 | 0.206 | 0.648 | 2 |
| 21 | 0.510 | 0.172 | 0.318 | 0 |
| 24 | 0.054 | 0.074 | 0.872 | 2 |
| 27 | 0.078 | 0.216 | 0.706 | 2 |
rf.probs = NULL
for (i in 1:(dim(rf.probs.raw)[1])){
rf.probs[i] = predictions.probs[i,as.integer(rf.probs.raw$class[i])]
}
probs.class = data.frame('probs'=rf.probs,'class'=rf.probs.raw$class)
knitr::kable(head(probs.class))
| probs | class |
|---|---|
| 0.398 | 2 |
| 0.642 | 2 |
| 0.648 | 2 |
| 0.510 | 0 |
| 0.872 | 2 |
| 0.706 | 2 |
attach(probs.class)
## The following objects are masked from probs.class (pos = 3):
##
## class, probs
plot(probs ~ class)
plot(probs, col=c('blue','black','red')[as.integer(class)])
legend('bottomright', legend=c("Dropout", "Enrolled","Graduate"),
col=c("blue", "black","red"), lty=1, cex=0.8)
Above we can see that, unlike the LDA-based model, the probability values appear more dispersed, suggesting the random forest classifies with a greater degree of uncertainty about each class.
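One way to quantify this dispersion, sketched under the assumption that the lda.probs and rf.probs vectors from the previous subsections are still in scope:
# Five-number summaries of the winning-class probabilities; a wider
# spread and lower median for the forest would confirm the impression
summary(lda.probs)
summary(rf.probs)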
- Feature importance
rf.Imp <- varImp(fit.rf, scale = FALSE)
data_imp <- rf.Imp$importance #unique
data_imp['var'] <- rownames(data_imp)
row.names(data_imp) <- seq(1, dim(data_imp)[1])
p <- ggplot(data_imp, aes(x = reorder(var, Overall), y = Overall))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=40, hjust=1))
p <- p + ggtitle("Feature Importance para o Random Forest")
p <- p + labs(y = "Feature Importance", x = "Variável")
p.final <- p + coord_flip()
p.final
We can see that the variables X2ndApproved, X1stApproved, X2ndGrade, X2ndEvaluations, and X1stGrade are highly influential in determining the class an individual belongs to, a picture similar to the results obtained with the LDA model.
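For a compact numeric view to accompany the plot, a minimal sketch using the data_imp data frame built above:
# Top 5 variables by Overall importance
head(data_imp[order(-data_imp$Overall), c('var', 'Overall')], 5)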