setwd("C:/Users/fabri/MEGA/Tiroid")
library(dplyr)
library(tidyr)
library(readxl)
library(corrplot)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)
library(caret)
library(rminer)
library(ROCit)
library(ROCR)
library(kableExtra)
library(gbm)
library(pdp)
library(ggplot2) # needed below for the variable-importance plots
dat <- read_xlsx(path = "tiroid_unico.xlsx")
# fix the column names imported from Excel
colnames(dat) <- make.names(colnames(dat),unique = T)
# convert some character columns to factor
dat$hypo_eu <- factor(dat$hypo_eu)
dat$Razza <- factor(dat$Razza)
dat$Sesso <- factor(dat$Sesso)
dat$AUMENTO.TSH <- factor(dat$AUMENTO.TSH)
dat$AUMENTO.COLESTEROLO <- factor(dat$AUMENTO.COLESTEROLO)
dat$USG <- factor(dat$USG)
dat$UPC <- factor(dat$UPC)
dat$ASTENIA <- factor(dat$ASTENIA)
dat$LETARGIA.DEPRESSIONE <- factor(dat$LETARGIA.DEPRESSIONE)
dat$PU.PD <- factor(dat$PU.PD)
dat$APPETITO <- factor(dat$APPETITO)
dat$OBESITÀ <- factor(dat$OBESITÀ)
dat$ALOPECIA <- factor(dat$ALOPECIA)
dat$DERMATOPATIA <- factor(dat$DERMATOPATIA)
dat$ALTERAZIONI.NEURO <- factor(dat$ALTERAZIONI.NEURO)
dat$ALTRI.SINTOMI <- factor(dat$ALTRI.SINTOMI)
dat$Motivo.test <- factor(dat$Motivo.test)
dat$DIAGNOSI <- factor(dat$DIAGNOSI)
# strip the "." characters from the column names
colnames(dat) <- gsub("\\.","",colnames(dat))
summary(dat)
## hypo_eu Codice Proprietario Paziente
## eu :233 Length:315 Length:315 Length:315
## hypo: 82 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Razza Sesso Etàaa7 Xmm
## Meticcio : 84 C: 37 Length:315 Length:315
## Dobermann : 25 F: 58 Class :character Class :character
## Labrador : 17 M:126 Mode :character Mode :character
## Golden Retriever: 13 S: 94
## Pinscher : 7
## Beagle : 6
## (Other) :163
## Etàaa9 Peso BCS T4nmoll
## Min. : 1.250 Min. : 1.00 Min. :2.000 Min. : 1.29
## 1st Qu.: 6.250 1st Qu.: 14.93 1st Qu.:5.000 1st Qu.: 6.68
## Median : 8.917 Median : 26.30 Median :6.000 Median :16.00
## Mean : 8.704 Mean : 28.19 Mean :5.835 Mean :16.94
## 3rd Qu.:11.292 3rd Qu.: 37.50 3rd Qu.:7.000 3rd Qu.:23.15
## Max. :17.250 Max. :323.00 Max. :9.000 Max. :51.86
## NA's :3 NA's :230
## T4µgdl1398 T4postTSHnmolml T4postTSHµgdl Incrementodalbasale
## Min. :0.1006 Min. : 1.29 Min. :0.1006 Min. :0.06251
## 1st Qu.:0.5210 1st Qu.: 6.40 1st Qu.:0.4992 1st Qu.:0.07800
## Median :1.2480 Median : 27.50 Median :2.1450 Median :1.82934
## Mean :1.3216 Mean : 29.91 Mean :2.3332 Mean :2.00862
## 3rd Qu.:1.8057 3rd Qu.: 43.65 3rd Qu.:3.4047 3rd Qu.:3.02477
## Max. :4.0451 Max. :126.00 Max. :9.8280 Max. :7.59292
## NA's :209 NA's :209 NA's :209
## TSHngml003038 AUMENTOTSH Colesterolomgdl AUMENTOCOLESTEROLO
## Min. :0.0100 NO:246 Min. : 76.0 NO:166
## 1st Qu.:0.1000 SI: 69 1st Qu.: 227.5 SI:149
## Median :0.1700 Median : 316.0
## Mean :0.4017 Mean : 374.7
## 3rd Qu.:0.3200 3rd Qu.: 446.0
## Max. :8.9000 Max. :2025.0
## NA's :12
## Trigliceridi Crea ALT AST
## Min. : 22.0 Min. :0.410 Min. : 8.00 Min. : 12.00
## 1st Qu.: 61.0 1st Qu.:0.790 1st Qu.: 40.00 1st Qu.: 30.00
## Median : 86.5 Median :0.950 Median : 61.50 Median : 37.00
## Mean : 209.8 Mean :1.005 Mean : 97.74 Mean : 45.33
## 3rd Qu.: 240.2 3rd Qu.:1.145 3rd Qu.: 103.00 3rd Qu.: 49.00
## Max. :1922.0 Max. :6.300 Max. :1285.00 Max. :1078.00
## NA's :171 NA's :8 NA's :9 NA's :17
## Hgb Hct GR MCV
## Min. : 1.30 Min. : 4.90 Min. : 483000 Min. :54.80
## 1st Qu.:12.93 1st Qu.:38.60 1st Qu.:5792500 1st Qu.:65.72
## Median :14.95 Median :43.80 Median :6510000 Median :68.00
## Mean :14.77 Mean :43.62 Mean :6438343 Mean :67.84
## 3rd Qu.:16.60 3rd Qu.:49.00 3rd Qu.:7065000 3rd Qu.:70.30
## Max. :21.10 Max. :60.70 Max. :9420000 Max. :82.00
## NA's :9 NA's :8 NA's :9 NA's :9
## MCHC USG UPC ASTENIA LETARGIADEPRESSIONE
## Min. :21.2 1044 : 9 0.1 : 6 NO:187 NO:209
## 1st Qu.:33.0 1020 : 8 0.2 : 6 SI:128 SI:106
## Median :33.8 1010 : 7 0.12 : 3
## Mean :33.9 1012 : 6 0.3 : 3
## 3rd Qu.:34.9 1052 : 6 0.09 : 2
## Max. :41.3 (Other): 74 (Other): 45
## NA's :12 NA's :205 NA's :250
## PUPD APPETITO OBESITÀ ALOPECIA DERMATOPATIA ALTERAZIONINEURO
## NO:262 Aumentato: 45 NO:240 NO:196 NO:243 NO:227
## SI: 53 Diminuito: 31 SI: 75 SI:119 SI: 72 SI: 88
## Normale :238
## NA's : 1
##
##
##
## ALTRISINTOMI Motivotest
## NO :98 Sospetto clinico :168
## Intolleranza al freddo: 7 sospetto clinico : 89
## Ipotermia : 7 Sospetto clinico/clinicopatologico: 17
## massa tiroidea : 7 sospetto clinicopatologico : 15
## Dispnea : 6 sospetto clinico/clinicopatologico: 14
## (Other) :93 Sospetto clinicopatologico : 7
## NA's :97 (Other) : 5
## DIAGNOSI
## Iperadrenocorticismo: 9
## PDH : 8
## neoplasia tiroidea : 6
## Diabete mellito : 5
## Epilessia idiopatica: 4
## (Other) :166
## NA's :117
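The dfMissing data frame printed below is built by a chunk not shown in the document; a minimal sketch that reproduces its columns (names taken from the output below) could be:

# sketch (not in the original): per-variable summary of type, missing values and number of levels
dfMissing <- data.frame(
  variable = colnames(dat),
  tipo = sapply(dat, function(x) class(x)[1]),
  n.Missing = sapply(dat, function(x) sum(is.na(x))),
  perc.Missing = round(100 * sapply(dat, function(x) mean(is.na(x))), 1),
  n.modalita = sapply(dat, function(x) if (is.factor(x)) nlevels(x) else 0),
  row.names = NULL
)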
dfMissing
## variable tipo n.Missing perc.Missing n.modalita
## 1 hypo_eu factor 0 0.0 2
## 2 Codice character 0 0.0 0
## 3 Proprietario character 0 0.0 0
## 4 Paziente character 0 0.0 0
## 5 Razza factor 0 0.0 92
## 6 Sesso factor 0 0.0 4
## 7 Etàaa7 character 0 0.0 0
## 8 Xmm character 0 0.0 0
## 9 Etàaa9 numeric 0 0.0 0
## 10 Peso numeric 3 1.0 0
## 11 BCS numeric 230 73.0 0
## 12 T4nmoll numeric 0 0.0 0
## 13 T4µgdl1398 numeric 0 0.0 0
## 14 T4postTSHnmolml numeric 209 66.3 0
## 15 T4postTSHµgdl numeric 209 66.3 0
## 16 Incrementodalbasale numeric 209 66.3 0
## 17 TSHngml003038 numeric 0 0.0 0
## 18 AUMENTOTSH factor 0 0.0 2
## 19 Colesterolomgdl numeric 12 3.8 0
## 20 AUMENTOCOLESTEROLO factor 0 0.0 2
## 21 Trigliceridi numeric 171 54.3 0
## 22 Crea numeric 8 2.5 0
## 23 ALT numeric 9 2.9 0
## 24 AST numeric 17 5.4 0
## 25 Hgb numeric 9 2.9 0
## 26 Hct numeric 8 2.5 0
## 27 GR numeric 9 2.9 0
## 28 MCV numeric 9 2.9 0
## 29 MCHC numeric 12 3.8 0
## 30 USG factor 205 65.1 39
## 31 UPC factor 250 79.4 43
## 32 ASTENIA factor 0 0.0 2
## 33 LETARGIADEPRESSIONE factor 0 0.0 2
## 34 PUPD factor 0 0.0 2
## 35 APPETITO factor 1 0.3 3
## 36 OBESITÀ factor 0 0.0 2
## 37 ALOPECIA factor 0 0.0 2
## 38 DERMATOPATIA factor 0 0.0 2
## 39 ALTERAZIONINEURO factor 0 0.0 2
## 40 ALTRISINTOMI factor 97 30.8 77
## 41 Motivotest factor 0 0.0 9
## 42 DIAGNOSI factor 117 37.1 128
A first look at the descriptive statistics shows several variables with a high number of missing values or of levels:
Razza, BCS, T4postTSHnmolml, T4postTSHµgdl, Incrementodalbasale, Trigliceridi, USG, UPC, ALTRISINTOMI, DIAGNOSI.
There is also a decidedly anomalous value in Peso (I would struggle to walk a 323 kg dog to the dog park).
These variables can therefore be excluded from the analyses.
# keep numeric/factor variables with less than 10% missing values and fewer than 20 levels
varok <- dfMissing[dfMissing$perc.Missing < 10 & (dfMissing$tipo == "numeric" | dfMissing$tipo == "factor") & dfMissing$n.modalita < 20, "variable"]
# drop Peso (anomalous value) and Motivotest (inconsistently coded levels)
varok <- varok[varok != "Peso"]
varok <- varok[varok != "Motivotest"]
varok
## [1] "hypo_eu" "Sesso" "Etàaa9"
## [4] "T4nmoll" "T4µgdl1398" "TSHngml003038"
## [7] "AUMENTOTSH" "Colesterolomgdl" "AUMENTOCOLESTEROLO"
## [10] "Crea" "ALT" "AST"
## [13] "Hgb" "Hct" "GR"
## [16] "MCV" "MCHC" "ASTENIA"
## [19] "LETARGIADEPRESSIONE" "PUPD" "APPETITO"
## [22] "OBESITÀ" "ALOPECIA" "DERMATOPATIA"
## [25] "ALTERAZIONINEURO"
The correlation analysis of the quantitative variables shows that some of them are strongly correlated and can therefore be excluded to simplify the analyses.
# collect the names of the numeric variables
xdat <- as.data.frame(dat[, varok])
vCont <- colnames(xdat)[sapply(xdat, is.numeric)]
mcr <- cor(xdat[,vCont],use="pairwise.complete.obs")
corrplot(mcr, type="upper", order="hclust",diag = F)
# drop variables with correlation > 0.8
varok <- varok[!varok %in% c("T4µgdl1398","Hct","GR")]
varok
## [1] "hypo_eu" "Sesso" "Etàaa9"
## [4] "T4nmoll" "TSHngml003038" "AUMENTOTSH"
## [7] "Colesterolomgdl" "AUMENTOCOLESTEROLO" "Crea"
## [10] "ALT" "AST" "Hgb"
## [13] "MCV" "MCHC" "ASTENIA"
## [16] "LETARGIADEPRESSIONE" "PUPD" "APPETITO"
## [19] "OBESITÀ" "ALOPECIA" "DERMATOPATIA"
## [22] "ALTERAZIONINEURO"
# build the data.frame used in the analyses
xdat <- dat[,varok]
Check of distributional normality; boxplots of the quantitative variables by hypo_eu (figures omitted).
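The code producing those boxplots is not shown; a minimal sketch using the packages already loaded (the variable selection via sapply(..., is.numeric) is an assumption):

# sketch (not in the original): one boxplot per quantitative variable, split by hypo_eu
numvars <- colnames(xdat)[sapply(xdat, is.numeric)]
xdat %>%
  pivot_longer(all_of(numvars), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = hypo_eu, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free_y") +
  theme_light() + xlab("") + ylab("")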
Define a training set with 70% of the observations:
set.seed(1111)
train_index <- sample(1:nrow(xdat), 0.7 * nrow(xdat))
test_index <- setdiff(1:nrow(xdat), train_index)
train <- xdat[train_index, ]
test <- xdat[test_index, ]
# training and test sets without missing values
trainr <- na.omit(train)
testr <- na.omit(test)
# tuning of the classification-tree parameters
fm <- formula(hypo_eu~.)
audit.rpart <- tune.rpart(fm, data = trainr, minsplit=seq(5,10,1), cp = seq(0.001,0.01,0.005),tunecontrol = tune.control(nboot=20))
audit.rpart
##
## Parameter tuning of 'rpart.wrapper':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## minsplit cp
## 5 0.001
##
## - best performance: 0.05078947
xminsplit <- 5 # audit.rpart$best.parameters$minsplit
xcp <- 0.001 # audit.rpart$best.parameters$cp
mCT <- rpart(hypo_eu~. ,data = trainr, minsplit = xminsplit, cp = xcp)
rpart.plot(mCT,type = 4,extra = 2)
# training-set performance: AUC via ROCR, confusion matrix via caret
pred <- prediction(predict(mCT, type = "prob")[, 2], trainr$hypo_eu)
auc <- performance(pred, "auc")
auc <- as.numeric(auc@y.values)
cmtx <- confusionMatrix(predict(mCT, type = "class"), trainr$hypo_eu, mode = "everything", positive = "hypo")
# confusion matrix
addmargins(cmtx$table)
## Reference
## Prediction eu hypo Sum
## eu 139 0 139
## hypo 3 54 57
## Sum 142 54 196
round(c(cmtx$overall,AUC=auc),5)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.98469 0.96231 0.95592 0.99683 0.72449 0.00000 0.24821 0.99283
round(cmtx$byClass[c(1,2,5,7,8,11)],5)
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 1.00000 0.97887 0.94737 0.97297 0.27551 0.98944
# test-set performance
mCT.p <- predict(mCT, newdata = testr, type = "prob")[, 2]
pred <- prediction(mCT.p, testr$hypo_eu)
auc <- performance(pred, "auc")
auc <- as.numeric(auc@y.values)
aucCT <- auc
cmtxT <- confusionMatrix(predict(mCT, newdata = testr, type = "class"), testr$hypo_eu, mode = "everything", positive = "hypo")
# confusion matrix
addmargins(cmtxT$table)
## Reference
## Prediction eu hypo Sum
## eu 58 1 59
## hypo 5 21 26
## Sum 63 22 85
round(c(cmtxT$overall,AUC=auc),5)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.92941 0.82629 0.85267 0.97366 0.74118 0.00001 0.22067 0.94336
round(cmtxT$byClass[c(1,2,5,7,8,11)],5)
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.92063 0.80769 0.87500 0.25882 0.93759
dmCTVI <- data.frame(variable=names(mCT$variable.importance),importance=mCT$variable.importance)
ggplot(dmCTVI, aes(x=reorder(variable,importance),y=importance))+
geom_bar(stat="identity")+
coord_flip()+theme_light()+xlab("")+ylab("")+ggtitle("Variable Importance - Classification Tree")
set.seed(1234)
mRF <- randomForest(hypo_eu~. ,data = trainr,na.action = na.roughfix,ntree=500)
mRF
##
## Call:
## randomForest(formula = hypo_eu ~ ., data = trainr, ntree = 500, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 5.1%
## Confusion matrix:
## eu hypo class.error
## eu 137 5 0.03521127
## hypo 5 49 0.09259259
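The confusion matrices and metrics below (training set first, then test set) come from evaluation code not shown in the document; a minimal sketch mirroring the classification-tree evaluation above. Note that predict() on a randomForest object without newdata returns out-of-bag predictions, which is why the first matrix matches the OOB confusion matrix.

# sketch (not in the original): RF evaluation on the training set (OOB) ...
predRF <- prediction(predict(mRF, type = "prob")[, 2], trainr$hypo_eu)
aucRF <- as.numeric(performance(predRF, "auc")@y.values)
cmtxRF <- confusionMatrix(predict(mRF), trainr$hypo_eu, mode = "everything", positive = "hypo")
# ... and on the test set
mRF.p <- predict(mRF, newdata = testr, type = "prob")[, 2]
predRFt <- prediction(mRF.p, testr$hypo_eu)
aucRFt <- as.numeric(performance(predRFt, "auc")@y.values)
cmtxRFt <- confusionMatrix(predict(mRF, newdata = testr), testr$hypo_eu, mode = "everything", positive = "hypo")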
## Reference
## Prediction eu hypo Sum
## eu 137 5 142
## hypo 5 49 54
## Sum 142 54 196
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.94898 0.87220 0.90818 0.97527 0.72449 0.00000 1.00000 0.99204
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.90741 0.96479 0.90741 0.90741 0.27551 0.93610
## Reference
## Prediction eu hypo Sum
## eu 60 1 61
## hypo 3 21 24
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.95294 0.88087 0.88387 0.98703 0.74118 0.00000 0.61708 0.97655
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.95238 0.87500 0.91304 0.25882 0.95346
# tuning of the SVM parameters
obj <- tune(svm, hypo_eu~. ,data = trainr,
ranges = list(gamma = seq(0.0001,1,by=0.01), cost = seq(1,1000,by=100)),
tunecontrol = tune.control(sampling = "fix"))
obj
##
## Parameter tuning of 'svm':
##
## - sampling method: fixed training/validation set
##
## - best parameters:
## gamma cost
## 0.0201 1
##
## - best performance: 0.01515152
mSVM <- svm(hypo_eu~. ,data = trainr, cost = obj$best.parameters$cost, gamma = obj$best.parameters$gamma,probability=T)
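The SVM results below (training set first, then test set) likewise come from code not shown; a minimal sketch for the test set, where the probability extraction via attr(..., "probabilities") is the part specific to e1071::svm. This also defines mSVM.p, referenced in the commented-out ROCit block further below.

# sketch (not in the original): class probabilities require probability = TRUE
# both when fitting and when predicting
mSVM.p <- attr(predict(mSVM, newdata = testr, probability = TRUE), "probabilities")[, "hypo"]
predSVM <- prediction(mSVM.p, testr$hypo_eu)
aucSVM <- as.numeric(performance(predSVM, "auc")@y.values)
cmtxSVM <- confusionMatrix(predict(mSVM, newdata = testr), testr$hypo_eu, mode = "everything", positive = "hypo")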
## Reference
## Prediction eu hypo Sum
## eu 141 5 146
## hypo 1 49 50
## Sum 142 54 196
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.96939 0.92152 0.93456 0.98868 0.72449 0.00000 0.22067 0.99674
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.90741 0.99296 0.98000 0.94231 0.27551 0.95018
## Reference
## Prediction eu hypo Sum
## eu 62 3 65
## hypo 1 19 20
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.95294 0.87361 0.88387 0.98703 0.74118 0.00000 0.61708 0.99134
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.86364 0.98413 0.95000 0.90476 0.25882 0.92388
To avoid an overly complex model containing all the variables in the dataset, the variables entering the logistic model were selected in two steps:
1. the 10 most important variables according to the random forest model were retained (via dmRFVI, sketched below);
2. a stepwise procedure was applied.
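dmRFVI, holding the random-forest variable importances, is not built in the code shown; a minimal sketch consistent with how it is used below:

# sketch (not in the original): variable importances from the random forest
imp <- importance(mRF)
dmRFVI <- data.frame(variable = rownames(imp), MeanDecreaseGini = imp[, "MeanDecreaseGini"], row.names = NULL)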
xtrainr <- trainr[,c("hypo_eu",dmRFVI[order(dmRFVI$MeanDecreaseGini,decreasing = T),"variable"][1:10])]
xtestr <- testr[,c("hypo_eu",dmRFVI[order(dmRFVI$MeanDecreaseGini,decreasing = T),"variable"][1:10])]
mLG <- glm(hypo_eu~. ,data = xtrainr,family = "binomial")
mLG <- step(mLG,trace = F)
summary(mLG)
##
## Call:
## glm(formula = hypo_eu ~ T4nmoll + TSHngml003038 + MCV + Etàaa9,
## family = "binomial", data = xtrainr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.04746 -0.00684 -0.00013 0.00001 1.82028
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1841 6.9523 -0.314 0.753401
## T4nmoll -1.1100 0.3119 -3.559 0.000373 ***
## TSHngml003038 6.7475 3.3584 2.009 0.044519 *
## MCV 0.2025 0.1059 1.912 0.055881 .
## Etàaa9 -0.3638 0.2336 -1.557 0.119388
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 230.756 on 195 degrees of freedom
## Residual deviance: 23.582 on 191 degrees of freedom
## AIC: 33.582
##
## Number of Fisher Scoring iterations: 10
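The logistic-model results below (training set first, then test set) come from code not shown; a minimal sketch for the test set, assuming a 0.5 cutoff on the predicted probability of hypo:

# sketch (not in the original): glm with family binomial models P(hypo),
# the second level of hypo_eu
mLG.p <- predict(mLG, newdata = xtestr, type = "response")
predLG <- prediction(mLG.p, xtestr$hypo_eu)
aucLG <- as.numeric(performance(predLG, "auc")@y.values)
clsLG <- factor(ifelse(mLG.p > 0.5, "hypo", "eu"), levels = levels(xtestr$hypo_eu))
cmtxLG <- confusionMatrix(clsLG, xtestr$hypo_eu, mode = "everything", positive = "hypo")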
## Reference
## Prediction eu hypo Sum
## eu 140 3 143
## hypo 2 51 53
## Sum 142 54 196
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.97449 0.93573 0.94147 0.99167 0.72449 0.00000 1.00000 0.99700
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.94444 0.98592 0.96226 0.95327 0.27551 0.96518
## Reference
## Prediction eu hypo Sum
## eu 59 1 60
## hypo 4 21 25
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.94118 0.85320 0.86804 0.98063 0.74118 0.00000 0.37109 0.98413
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.93651 0.84000 0.89362 0.25882 0.94553
mNB <- naiveBayes(hypo_eu ~ ., data = trainr)
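The naive Bayes results below (training set first, then test set) come from code not shown; a minimal sketch for the test set, where type = "raw" returns the class probabilities:

# sketch (not in the original): naive Bayes evaluation on the test set
mNB.p <- predict(mNB, newdata = testr, type = "raw")[, "hypo"]
predNB <- prediction(mNB.p, testr$hypo_eu)
aucNB <- as.numeric(performance(predNB, "auc")@y.values)
cmtxNB <- confusionMatrix(predict(mNB, newdata = testr), testr$hypo_eu, mode = "everything", positive = "hypo")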
## Reference
## Prediction eu hypo Sum
## eu 138 5 143
## hypo 4 49 53
## Sum 142 54 196
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.95408 0.88431 0.91462 0.97879 0.72449 0.00000 1.00000 0.98826
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.90741 0.97183 0.92453 0.91589 0.27551 0.93962
## Reference
## Prediction eu hypo Sum
## eu 61 1 62
## hypo 2 21 23
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.96471 0.90935 0.90030 0.99266 0.74118 0.00000 1.00000 0.97980
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.96825 0.91304 0.93333 0.25882 0.96140
# gbm with distribution = "bernoulli" requires a 0/1 response
xtrainr <- trainr
xtrainr$hypo_eu <- ifelse(xtrainr$hypo_eu == "eu", 0, 1)
mGB <- gbm(
formula = hypo_eu ~ .,
distribution = "bernoulli",
data = xtrainr,
n.trees = 10000,
interaction.depth = 1,
shrinkage = 0.001,
cv.folds = 5,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
No search for the optimal parameters was carried out for the GBM model; a tuning step could be added here as well.
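The GBM results below (training set first, then test set) come from code not shown; a minimal sketch for the test set, where the cross-validated optimal number of trees is selected via gbm.perf (the 0.5 cutoff is an assumption):

# sketch (not in the original): CV-optimal number of trees, then predicted probabilities
best.iter <- gbm.perf(mGB, method = "cv", plot.it = FALSE)
mGB.p <- predict(mGB, newdata = testr, n.trees = best.iter, type = "response")
predGB <- prediction(mGB.p, ifelse(testr$hypo_eu == "eu", 0, 1))
aucGB <- as.numeric(performance(predGB, "auc")@y.values)
clsGB <- factor(ifelse(mGB.p > 0.5, "hypo", "eu"), levels = levels(testr$hypo_eu))
cmtxGB <- confusionMatrix(clsGB, testr$hypo_eu, mode = "everything", positive = "hypo")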
## Reference
## Prediction eu hypo Sum
## eu 140 0 140
## hypo 2 54 56
## Sum 142 54 196
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.98980 0.97473 0.96363 0.99876 0.72449 0.00000 0.47950 0.99896
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 1.00000 0.98592 0.96429 0.98182 0.27551 0.99296
## Reference
## Prediction eu hypo Sum
## eu 59 1 60
## hypo 4 21 25
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.94118 0.85320 0.86804 0.98063 0.74118 0.00000 0.37109 0.97114
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.93651 0.84000 0.89362 0.25882 0.94553
| Model | Accuracy | Kappa | AUC | Sensitivity | Specificity | Precision | F1 | Prevalence | Balanced Accuracy |
|---|---|---|---|---|---|---|---|---|---|
| Tree | 0.9294 | 0.8263 | 0.9434 | 0.9545 | 0.9206 | 0.8077 | 0.8750 | 0.2588 | 0.9376 |
| randomForest | 0.9529 | 0.8809 | 0.9766 | 0.9545 | 0.9524 | 0.8750 | 0.9130 | 0.2588 | 0.9535 |
| SVM | 0.9529 | 0.8736 | 0.9913 | 0.8636 | 0.9841 | 0.9500 | 0.9048 | 0.2588 | 0.9239 |
| Logit | 0.9412 | 0.8532 | 0.9841 | 0.9545 | 0.9365 | 0.8400 | 0.8936 | 0.2588 | 0.9455 |
| NaiveBayes | 0.9647 | 0.9093 | 0.9798 | 0.9545 | 0.9683 | 0.9130 | 0.9333 | 0.2588 | 0.9614 |
| GBM | 0.9412 | 0.8532 | 0.9711 | 0.9545 | 0.9365 | 0.8400 | 0.8936 | 0.2588 | 0.9455 |
# ROCit_obj <- rocit(score=mSVM.p,class=ifelse(testr$hypo_eu=="hypo",1,0))
# pp <- plot(ROCit_obj,YIndex = T,values = T)
# text(0.5,0.3,paste("AUC=",round(ROCit_obj$AUC,4)),font=2,pos = 4)
# text(0.5,0.2,
# paste("Youden Index=",round(pp$`optimal Youden Index point`[1],4),
# "\n(cutoff=",round(pp$`optimal Youden Index point`[4],4),")",sep=""),cex = 1,pos = 4)
The different algorithms were evaluated on 50 different train/test splits (always in a 70%/30% proportion); at each iteration the optimal parameters for the classification tree and the SVM were re-estimated as well. A sketch of the procedure follows; the table below reports the averaged metrics.
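The repeated-split loop itself is not shown; a minimal sketch of the procedure for a single model (shown here for the random forest; the other models follow the same pattern, and the seed is an assumption):

# sketch (not in the original): 50 repeated 70/30 splits, metrics averaged at the end
set.seed(2222)  # assumed seed
nrep <- 50
res <- data.frame(Accuracy = numeric(nrep), AUC = numeric(nrep))
for (k in 1:nrep) {
  idx <- sample(1:nrow(xdat), 0.7 * nrow(xdat))
  trn <- na.omit(xdat[idx, ])
  tst <- na.omit(xdat[-idx, ])
  m <- randomForest(hypo_eu ~ ., data = trn, ntree = 500)
  cm <- confusionMatrix(predict(m, newdata = tst), tst$hypo_eu, positive = "hypo")
  p <- predict(m, newdata = tst, type = "prob")[, 2]
  res$Accuracy[k] <- cm$overall["Accuracy"]
  res$AUC[k] <- as.numeric(performance(prediction(p, tst$hypo_eu), "auc")@y.values)
}
colMeans(res)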
| method | Accuracy | Kappa | AUC | Sensitivity | Specificity | Precision | F1 | Balanced.Accuracy |
|---|---|---|---|---|---|---|---|---|
| GBM | 0.9510 | 0.8724 | 0.9893 | 0.8975 | 0.9722 | 0.9210 | 0.9055 | 0.9349 |
| Logistic | 0.9498 | 0.8709 | 0.9812 | 0.9194 | 0.9612 | 0.8957 | 0.9054 | 0.9403 |
| NaiveBayes | 0.9392 | 0.8397 | 0.9714 | 0.8590 | 0.9678 | 0.9080 | 0.8810 | 0.9134 |
| randomForest | 0.9545 | 0.8820 | 0.9908 | 0.9040 | 0.9730 | 0.9257 | 0.9132 | 0.9385 |
| SVM | 0.9366 | 0.8293 | 0.9858 | 0.8311 | 0.9745 | 0.9248 | 0.8715 | 0.9028 |
| Tree | 0.9342 | 0.8272 | 0.9266 | 0.8615 | 0.9608 | 0.8904 | 0.8718 | 0.9112 |
| | Obs: Negative (eu) | Obs: Positive (hypo) |
|---|---|---|
| Pred: Negative (eu) | TN | FN |
| Pred: Positive (hypo) | FP | TP |
\(Accuracy = \frac{(TN+TP)}{N}\)
\(Sensitivity = \frac{TP}{(TP+FN)}\)
\(Specificity = \frac{TN}{(TN+FP)}\)
\(Precision = \frac{TP}{(TP+FP)}\)
\(F1\ Score = 2 \frac{(Recall * Precision)}{(Recall + Precision)}\)
\(Balanced\ accuracy = \frac{(Sensitivity + Specificity)}{2}\)
Accuracy is the most intuitive measure: the ratio of correctly labeled subjects to the whole pool of subjects.
Accuracy answers the following question: how many patients did we correctly label out of all the patients?
Accuracy = (TP+TN)/(TP+FP+FN+TN)
numerator: all correctly labeled subjects (all trues)
denominator: all subjects
Recall is the ratio of subjects correctly labeled +ve by our program to all who are hypo in reality.
Recall answers the following question: of all the patients who are hypo, how many did we correctly predict?
Recall = TP/(TP+FN)
numerator: +ve labeled hypo patients.
denominator: all patients who are hypo (whether detected by our program or not)
Specificity is the ratio of subjects correctly labeled -ve by the program to all who are eu in reality.
Specificity answers the following question: of all the patients who are healthy, how many did we correctly predict?
Specificity = TN/(TN+FP)
numerator: -ve labeled healthy patients.
denominator: all patients who are healthy in reality (whether labeled +ve or -ve)
Precision is the ratio of subjects correctly labeled +ve by our program to all labeled +ve.
Precision answers the following question: how many of those we labeled as hypo are actually hypo?
Precision = TP/(TP+FP)
numerator: +ve labeled hypo.
denominator: all +ve labeled by our program (whether they’re hypo or not in reality).
F1 Score considers both precision and recall: it is the harmonic mean of the two.
F1 Score is highest when there is some sort of balance between precision (P) and recall (R); conversely, it is not as high when one measure is improved at the expense of the other.
For example, if P is 1 and R is 0, the F1 score is 0.
F1 Score = 2*(Recall * Precision) / (Recall + Precision)
Balanced accuracy is based on sensitivity and specificity
Balanced accuracy = (Sensitivity + Specificity) / 2
Accuracy is a good measure, but only when the dataset is symmetric (the counts of false negatives and false positives are close) and the costs of false negatives and false positives are similar.
If the costs of false positives and false negatives differ, F1 is your savior.
F1 is also the better choice if you have an uneven class distribution.
An ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds. This curve plots two parameters:
- True Positive Rate
- False Positive Rate
True Positive Rate (TPR), also known as Recall or Sensitivity, is defined as follows:
\(TPR = \frac{TP}{(TP + FN)}\)
False Positive Rate (FPR) is defined as follows:
\(FPR = \frac{FP}{(FP + TN)}\)
An ROC curve plots TPR vs. FPR at different classification thresholds. Lowering the classification threshold classifies more items as positive, thus increasing both False Positives and True Positives.
AUC stands for “Area under the ROC Curve.” That is, AUC measures the entire two-dimensional area underneath the entire ROC curve.
AUC provides an aggregate measure of performance across all possible classification thresholds. One way of interpreting AUC is as the probability that the model ranks a random positive example more highly than a random negative example.
The AUC value, between 0 and 1, is in fact equivalent to the probability that the classifier score of an individual randomly drawn from the hypo group is higher than that of an individual randomly drawn from the healthy group.
library(party)
mPT <- ctree(hypo_eu ~ ., data = trainr,
             controls = ctree_control(minsplit = 10, testtype = "Bonferroni", mincriterion = 0.95))
plot(mPT)
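The test-set metrics below come from code not shown; a minimal sketch using party::treeresponse to extract the class probabilities (indexing the second element assumes hypo is the second factor level):

# sketch (not in the original): conditional-tree evaluation on the test set
mPT.p <- sapply(treeresponse(mPT, newdata = testr), "[", 2)  # P(hypo)
predPT <- prediction(mPT.p, testr$hypo_eu)
aucPT <- as.numeric(performance(predPT, "auc")@y.values)
cmtxPT <- confusionMatrix(predict(mPT, newdata = testr), testr$hypo_eu, mode = "everything", positive = "hypo")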
## Reference
## Prediction eu hypo Sum
## eu 58 1 59
## hypo 5 21 26
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.92941 0.82629 0.85267 0.97366 0.74118 0.00001 0.22067 0.96032
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.92063 0.80769 0.87500 0.25882 0.93759
mPF <- cforest(hypo_eu~.,data = trainr,controls = cforest_unbiased(mtry = 6))
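The cforest test-set metrics below likewise come from code not shown; a minimal sketch following the same pattern (treeresponse on a cforest object is an assumption):

# sketch (not in the original): conditional-forest evaluation on the test set
mPF.p <- sapply(treeresponse(mPF, newdata = testr), "[", 2)  # P(hypo)
predPF <- prediction(mPF.p, testr$hypo_eu)
aucPF <- as.numeric(performance(predPF, "auc")@y.values)
cmtxPF <- confusionMatrix(predict(mPF, newdata = testr), testr$hypo_eu, mode = "everything", positive = "hypo")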
## Reference
## Prediction eu hypo Sum
## eu 59 1 60
## hypo 4 21 25
## Sum 63 22 85
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull AccuracyPValue McnemarPValue AUC
## 0.94118 0.85320 0.86804 0.98063 0.74118 0.00000 0.37109 0.96753
## Sensitivity Specificity Precision F1 Prevalence Balanced Accuracy
## 0.95455 0.93651 0.84000 0.89362 0.25882 0.94553