Data import and pre-processing

setwd("C:/Users/fabri/MEGA/Tiroid")
library(dplyr)
library(tidyr)
library(readxl)
library(corrplot)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)
library(caret)
library(rminer)
library(ROCit)
library(ROCR)
library(kableExtra)
library(gbm)
library(pdp)
library(ggplot2) # made explicit for the plots below (caret also attaches it)
dat <- read_xlsx(path = "tiroid_unico.xlsx")
# fix the field names imported from Excel
colnames(dat) <- make.names(colnames(dat),unique = T)

# convert some character fields to factor
dat$hypo_eu <- factor(dat$hypo_eu)
dat$Razza <- factor(dat$Razza)
dat$Sesso <- factor(dat$Sesso)
dat$AUMENTO.TSH <- factor(dat$AUMENTO.TSH)
dat$AUMENTO.COLESTEROLO <- factor(dat$AUMENTO.COLESTEROLO)
dat$USG <- factor(dat$USG)
dat$UPC <- factor(dat$UPC)
dat$ASTENIA <- factor(dat$ASTENIA)
dat$LETARGIA.DEPRESSIONE <- factor(dat$LETARGIA.DEPRESSIONE)
dat$PU.PD <- factor(dat$PU.PD)
dat$APPETITO <- factor(dat$APPETITO)
dat$OBESITÀ <- factor(dat$OBESITÀ)
dat$ALOPECIA <- factor(dat$ALOPECIA)
dat$DERMATOPATIA <- factor(dat$DERMATOPATIA)
dat$ALTERAZIONI.NEURO <- factor(dat$ALTERAZIONI.NEURO)
dat$ALTRI.SINTOMI <- factor(dat$ALTRI.SINTOMI)
dat$Motivo.test <- factor(dat$Motivo.test)
dat$DIAGNOSI <- factor(dat$DIAGNOSI)

# strip the "." characters from the field names
colnames(dat) <- gsub("\\.","",colnames(dat))
summary(dat)
##  hypo_eu       Codice          Proprietario         Paziente        
##  eu  :233   Length:315         Length:315         Length:315        
##  hypo: 82   Class :character   Class :character   Class :character  
##             Mode  :character   Mode  :character   Mode  :character  
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##               Razza     Sesso      Etàaa7              Xmm           
##  Meticcio        : 84   C: 37   Length:315         Length:315        
##  Dobermann       : 25   F: 58   Class :character   Class :character  
##  Labrador        : 17   M:126   Mode  :character   Mode  :character  
##  Golden Retriever: 13   S: 94                                        
##  Pinscher        :  7                                                
##  Beagle          :  6                                                
##  (Other)         :163                                                
##      Etàaa9            Peso             BCS           T4nmoll     
##  Min.   : 1.250   Min.   :  1.00   Min.   :2.000   Min.   : 1.29  
##  1st Qu.: 6.250   1st Qu.: 14.93   1st Qu.:5.000   1st Qu.: 6.68  
##  Median : 8.917   Median : 26.30   Median :6.000   Median :16.00  
##  Mean   : 8.704   Mean   : 28.19   Mean   :5.835   Mean   :16.94  
##  3rd Qu.:11.292   3rd Qu.: 37.50   3rd Qu.:7.000   3rd Qu.:23.15  
##  Max.   :17.250   Max.   :323.00   Max.   :9.000   Max.   :51.86  
##                   NA's   :3        NA's   :230                    
##    T4µgdl1398     T4postTSHnmolml  T4postTSHµgdl    Incrementodalbasale
##  Min.   :0.1006   Min.   :  1.29   Min.   :0.1006   Min.   :0.06251    
##  1st Qu.:0.5210   1st Qu.:  6.40   1st Qu.:0.4992   1st Qu.:0.07800    
##  Median :1.2480   Median : 27.50   Median :2.1450   Median :1.82934    
##  Mean   :1.3216   Mean   : 29.91   Mean   :2.3332   Mean   :2.00862    
##  3rd Qu.:1.8057   3rd Qu.: 43.65   3rd Qu.:3.4047   3rd Qu.:3.02477    
##  Max.   :4.0451   Max.   :126.00   Max.   :9.8280   Max.   :7.59292    
##                   NA's   :209      NA's   :209      NA's   :209        
##  TSHngml003038    AUMENTOTSH Colesterolomgdl  AUMENTOCOLESTEROLO
##  Min.   :0.0100   NO:246     Min.   :  76.0   NO:166            
##  1st Qu.:0.1000   SI: 69     1st Qu.: 227.5   SI:149            
##  Median :0.1700              Median : 316.0                     
##  Mean   :0.4017              Mean   : 374.7                     
##  3rd Qu.:0.3200              3rd Qu.: 446.0                     
##  Max.   :8.9000              Max.   :2025.0                     
##                              NA's   :12                         
##   Trigliceridi         Crea            ALT               AST         
##  Min.   :  22.0   Min.   :0.410   Min.   :   8.00   Min.   :  12.00  
##  1st Qu.:  61.0   1st Qu.:0.790   1st Qu.:  40.00   1st Qu.:  30.00  
##  Median :  86.5   Median :0.950   Median :  61.50   Median :  37.00  
##  Mean   : 209.8   Mean   :1.005   Mean   :  97.74   Mean   :  45.33  
##  3rd Qu.: 240.2   3rd Qu.:1.145   3rd Qu.: 103.00   3rd Qu.:  49.00  
##  Max.   :1922.0   Max.   :6.300   Max.   :1285.00   Max.   :1078.00  
##  NA's   :171      NA's   :8       NA's   :9         NA's   :17       
##       Hgb             Hct              GR               MCV       
##  Min.   : 1.30   Min.   : 4.90   Min.   : 483000   Min.   :54.80  
##  1st Qu.:12.93   1st Qu.:38.60   1st Qu.:5792500   1st Qu.:65.72  
##  Median :14.95   Median :43.80   Median :6510000   Median :68.00  
##  Mean   :14.77   Mean   :43.62   Mean   :6438343   Mean   :67.84  
##  3rd Qu.:16.60   3rd Qu.:49.00   3rd Qu.:7065000   3rd Qu.:70.30  
##  Max.   :21.10   Max.   :60.70   Max.   :9420000   Max.   :82.00  
##  NA's   :9       NA's   :8       NA's   :9         NA's   :9      
##       MCHC           USG           UPC      ASTENIA  LETARGIADEPRESSIONE
##  Min.   :21.2   1044   :  9   0.1    :  6   NO:187   NO:209             
##  1st Qu.:33.0   1020   :  8   0.2    :  6   SI:128   SI:106             
##  Median :33.8   1010   :  7   0.12   :  3                               
##  Mean   :33.9   1012   :  6   0.3    :  3                               
##  3rd Qu.:34.9   1052   :  6   0.09   :  2                               
##  Max.   :41.3   (Other): 74   (Other): 45                               
##  NA's   :12     NA's   :205   NA's   :250                               
##  PUPD          APPETITO   OBESITÀ  ALOPECIA DERMATOPATIA ALTERAZIONINEURO
##  NO:262   Aumentato: 45   NO:240   NO:196   NO:243       NO:227          
##  SI: 53   Diminuito: 31   SI: 75   SI:119   SI: 72       SI: 88          
##           Normale  :238                                                  
##           NA's     :  1                                                  
##                                                                          
##                                                                          
##                                                                          
##                  ALTRISINTOMI                              Motivotest 
##  NO                    :98    Sospetto clinico                  :168  
##  Intolleranza al freddo: 7    sospetto clinico                  : 89  
##  Ipotermia             : 7    Sospetto clinico/clinicopatologico: 17  
##  massa tiroidea        : 7    sospetto clinicopatologico        : 15  
##  Dispnea               : 6    sospetto clinico/clinicopatologico: 14  
##  (Other)               :93    Sospetto clinicopatologico        :  7  
##  NA's                  :97    (Other)                           :  5  
##                  DIAGNOSI  
##  Iperadrenocorticismo:  9  
##  PDH                 :  8  
##  neoplasia tiroidea  :  6  
##  Diabete mellito     :  5  
##  Epilessia idiopatica:  4  
##  (Other)             :166  
##  NA's                :117
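The data frame dfMissing printed below is not constructed in the code shown; a minimal sketch that reproduces the displayed columns (an assumption about the original implementation):

# for each field: type, number and percentage of missing values,
# and number of levels (0 for non-factors)
dfMissing <- data.frame(
  variable     = colnames(dat),
  tipo         = sapply(dat, function(x) class(x)[1]),
  n.Missing    = sapply(dat, function(x) sum(is.na(x))),
  perc.Missing = round(sapply(dat, function(x) 100*mean(is.na(x))), 1),
  n.modalita   = sapply(dat, function(x) if(is.factor(x)) nlevels(x) else 0),
  row.names    = NULL
)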
dfMissing
##               variable      tipo n.Missing perc.Missing n.modalita
## 1              hypo_eu    factor         0          0.0          2
## 2               Codice character         0          0.0          0
## 3         Proprietario character         0          0.0          0
## 4             Paziente character         0          0.0          0
## 5                Razza    factor         0          0.0         92
## 6                Sesso    factor         0          0.0          4
## 7               Etàaa7 character         0          0.0          0
## 8                  Xmm character         0          0.0          0
## 9               Etàaa9   numeric         0          0.0          0
## 10                Peso   numeric         3          1.0          0
## 11                 BCS   numeric       230         73.0          0
## 12             T4nmoll   numeric         0          0.0          0
## 13          T4µgdl1398   numeric         0          0.0          0
## 14     T4postTSHnmolml   numeric       209         66.3          0
## 15       T4postTSHµgdl   numeric       209         66.3          0
## 16 Incrementodalbasale   numeric       209         66.3          0
## 17       TSHngml003038   numeric         0          0.0          0
## 18          AUMENTOTSH    factor         0          0.0          2
## 19     Colesterolomgdl   numeric        12          3.8          0
## 20  AUMENTOCOLESTEROLO    factor         0          0.0          2
## 21        Trigliceridi   numeric       171         54.3          0
## 22                Crea   numeric         8          2.5          0
## 23                 ALT   numeric         9          2.9          0
## 24                 AST   numeric        17          5.4          0
## 25                 Hgb   numeric         9          2.9          0
## 26                 Hct   numeric         8          2.5          0
## 27                  GR   numeric         9          2.9          0
## 28                 MCV   numeric         9          2.9          0
## 29                MCHC   numeric        12          3.8          0
## 30                 USG    factor       205         65.1         39
## 31                 UPC    factor       250         79.4         43
## 32             ASTENIA    factor         0          0.0          2
## 33 LETARGIADEPRESSIONE    factor         0          0.0          2
## 34                PUPD    factor         0          0.0          2
## 35            APPETITO    factor         1          0.3          3
## 36             OBESITÀ    factor         0          0.0          2
## 37            ALOPECIA    factor         0          0.0          2
## 38        DERMATOPATIA    factor         0          0.0          2
## 39    ALTERAZIONINEURO    factor         0          0.0          2
## 40        ALTRISINTOMI    factor        97         30.8         77
## 41          Motivotest    factor         0          0.0          9
## 42            DIAGNOSI    factor       117         37.1        128

A first look at the descriptive statistics shows several variables with a large number of missing values or of levels:
Razza, BCS, T4postTSHnmolml, T4postTSHµgdl, Incrementodalbasale, Trigliceridi, USG, UPC, ALTRISINTOMI, DIAGNOSI.
There is also a rather anomalous value in Peso (I would have some trouble taking a 323 kg dog to the dog park).
These variables can therefore be excluded from the analyses.
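As a quick check of the weight anomaly (not part of the original code; read-only, since Peso is dropped below anyway):

# locate the record with the implausible maximum weight
dat[which.max(dat$Peso), c("Codice", "Peso")]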

# keep variables with <10% missing values, of type numeric or factor, and with fewer than 20 levels
varok <- dfMissing[dfMissing$perc.Missing<10 & (dfMissing$tipo=="numeric" | dfMissing$tipo=="factor") & dfMissing$n.modalita<20,"variable"]
varok <- varok[varok!="Peso"]       # anomalous value (323 kg)
varok <- varok[varok!="Motivotest"] # inconsistently coded levels (see the summary above)
varok
##  [1] "hypo_eu"             "Sesso"               "Etàaa9"             
##  [4] "T4nmoll"             "T4µgdl1398"          "TSHngml003038"      
##  [7] "AUMENTOTSH"          "Colesterolomgdl"     "AUMENTOCOLESTEROLO" 
## [10] "Crea"                "ALT"                 "AST"                
## [13] "Hgb"                 "Hct"                 "GR"                 
## [16] "MCV"                 "MCHC"                "ASTENIA"            
## [19] "LETARGIADEPRESSIONE" "PUPD"                "APPETITO"           
## [22] "OBESITÀ"             "ALOPECIA"            "DERMATOPATIA"       
## [25] "ALTERAZIONINEURO"

The correlation analysis of the quantitative variables shows that some of them are strongly correlated and can therefore be excluded to simplify the analyses.

# select the quantitative variables among those retained
xdat <- as.data.frame(dat[,varok])
vCont <- names(xdat)[sapply(xdat, is.numeric)]
mcr <- cor(xdat[,vCont],use="pairwise.complete.obs")
corrplot(mcr, type="upper", order="hclust",diag = F)

# drop variables with correlation > 0.8
varok <- varok[!varok %in% c("T4µgdl1398","Hct","GR")]
varok
##  [1] "hypo_eu"             "Sesso"               "Etàaa9"             
##  [4] "T4nmoll"             "TSHngml003038"       "AUMENTOTSH"         
##  [7] "Colesterolomgdl"     "AUMENTOCOLESTEROLO"  "Crea"               
## [10] "ALT"                 "AST"                 "Hgb"                
## [13] "MCV"                 "MCHC"                "ASTENIA"            
## [16] "LETARGIADEPRESSIONE" "PUPD"                "APPETITO"           
## [19] "OBESITÀ"             "ALOPECIA"            "DERMATOPATIA"       
## [22] "ALTERAZIONINEURO"
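As an alternative to reading the highly correlated variables off the corrplot by eye, caret's findCorrelation can flag them programmatically (a sketch; the original selection was manual):

# columns whose pairwise correlation exceeds the 0.8 threshold used above
findCorrelation(mcr, cutoff = 0.8, names = TRUE)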
# build the data.frame used in the analyses
xdat <- dat[,varok]

Descriptive analysis

Distribution of the quantitative variables with respect to hypo_eu

Check of distributional normality
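The chunk for this step is not shown; one possible check is a per-group Shapiro-Wilk test, sketched here for a single variable:

# normality test of T4nmoll within each hypo_eu group
by(xdat$T4nmoll, xdat$hypo_eu, shapiro.test)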

Boxplots of the quantitative variables by hypo_eu
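The plotting chunk is not shown; a minimal sketch using the tidyr/ggplot2 functions already loaded (num_vars is a hypothetical name):

num_vars <- names(xdat)[sapply(xdat, is.numeric)]  # quantitative variables
xdat %>%
  pivot_longer(all_of(num_vars), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = hypo_eu, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free_y") +
  theme_light()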

Distribution of the qualitative variables with respect to hypo_eu
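The chunk is not shown; a sketch of a column-proportion table for one qualitative sign:

# share of ALOPECIA within each hypo_eu group
round(prop.table(table(xdat$ALOPECIA, xdat$hypo_eu), margin = 2), 3)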

Creation of the training and test datasets

A training set with 70% of the observations is defined.

set.seed(1111)
train_index <- sample(1:nrow(xdat), 0.7 * nrow(xdat))
test_index <- setdiff(1:nrow(xdat), train_index)

train <- xdat[train_index, ]
test <- xdat[test_index, ]

# training and test sets without missing values
trainr <- na.omit(train)
testr <- na.omit(test)

Classification analysis

Classification Tree (CART)

# tuning of the tree parameters
fm <- formula(hypo_eu~.)
audit.rpart <- tune.rpart(fm, data = trainr, minsplit=seq(5,10,1), cp = seq(0.001,0.01,0.005),tunecontrol = tune.control(nboot=20))
audit.rpart
## 
## Parameter tuning of 'rpart.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  minsplit    cp
##         5 0.001
## 
## - best performance: 0.05078947
xminsplit <- 5 # audit.rpart$best.parameters$minsplit
xcp <- 0.001   # audit.rpart$best.parameters$cp

mCT <- rpart(hypo_eu~. ,data = trainr, minsplit = xminsplit, cp = xcp)

rpart.plot(mCT,type = 4,extra = 2)

Results on the training dataset

pred <- prediction(predict(mCT,type = "prob")[,2], trainr$hypo_eu) 
auc <- performance(pred,"auc")
auc <- as.numeric(auc@y.values) 

cmtx <- confusionMatrix(predict(mCT,type = "class"),trainr$hypo_eu,mode = "everything",positive = "hypo")
# confusion matrix
addmargins(cmtx$table)
##           Reference
## Prediction  eu hypo Sum
##       eu   139    0 139
##       hypo   3   54  57
##       Sum  142   54 196
round(c(cmtx$overall,AUC=auc),5)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.98469        0.96231        0.95592        0.99683        0.72449        0.00000        0.24821        0.99283
round(cmtx$byClass[c(1,2,5,7,8,11)],5)
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           1.00000           0.97887           0.94737           0.97297           0.27551           0.98944

Results on the test dataset

mCT.p <- predict(mCT,newdata = testr,type = "prob")[,2]
pred <- prediction(mCT.p, testr$hypo_eu) 
auc <- performance(pred,"auc")
auc <- as.numeric(auc@y.values) 
aucCT <- auc
cmtxT <- confusionMatrix(predict(mCT,newdata = testr,type = "class"),testr$hypo_eu,mode = "everything",positive = "hypo")
# confusion matrix
addmargins(cmtxT$table)
##           Reference
## Prediction eu hypo Sum
##       eu   58    1  59
##       hypo  5   21  26
##       Sum  63   22  85
round(c(cmtxT$overall,AUC=auc),5)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.92941        0.82629        0.85267        0.97366        0.74118        0.00001        0.22067        0.94336
round(cmtxT$byClass[c(1,2,5,7,8,11)],5)
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.92063           0.80769           0.87500           0.25882           0.93759
dmCTVI <- data.frame(variable=names(mCT$variable.importance),importance=mCT$variable.importance)
ggplot(dmCTVI, aes(x=reorder(variable,importance),y=importance))+
  geom_bar(stat="identity")+
  coord_flip()+theme_light()+xlab("")+ylab("")+ggtitle("Variable Importance - Classification Tree")

Random Forest

set.seed(1234)
mRF <- randomForest(hypo_eu~. ,data = trainr,na.action = na.roughfix,ntree=500)
mRF
## 
## Call:
##  randomForest(formula = hypo_eu ~ ., data = trainr, ntree = 500,      na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 5.1%
## Confusion matrix:
##       eu hypo class.error
## eu   137    5  0.03521127
## hypo   5   49  0.09259259

Results on the training dataset
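The evaluation chunk is hidden here and in the following sections; presumably it mirrors the CART block above. For the random forest, predict() without newdata returns the out-of-bag predictions, which match the confusion matrix shown below (a sketch, not the original code):

# OOB probabilities and class predictions on the training data
pred <- prediction(predict(mRF, type = "prob")[,2], trainr$hypo_eu)
auc  <- as.numeric(performance(pred, "auc")@y.values)
cmtx <- confusionMatrix(predict(mRF), trainr$hypo_eu, mode = "everything", positive = "hypo")
addmargins(cmtx$table)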

##           Reference
## Prediction  eu hypo Sum
##       eu   137    5 142
##       hypo   5   49  54
##       Sum  142   54 196
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.94898        0.87220        0.90818        0.97527        0.72449        0.00000        1.00000        0.99204
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.90741           0.96479           0.90741           0.90741           0.27551           0.93610

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   60    1  61
##       hypo  3   21  24
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.95294        0.88087        0.88387        0.98703        0.74118        0.00000        0.61708        0.97655
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.95238           0.87500           0.91304           0.25882           0.95346

Support Vector Machine

# tuning of the SVM parameters
obj <- tune(svm, hypo_eu~. ,data = trainr, 
            ranges = list(gamma = seq(0.0001,1,by=0.01), cost = seq(1,1000,by=100)),
            tunecontrol = tune.control(sampling = "fix"))
obj
## 
## Parameter tuning of 'svm':
## 
## - sampling method: fixed training/validation set 
## 
## - best parameters:
##   gamma cost
##  0.0201    1
## 
## - best performance: 0.01515152
mSVM <- svm(hypo_eu~. ,data = trainr, cost = obj$best.parameters$cost, gamma = obj$best.parameters$gamma,probability=T)

Results on the training dataset
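The evaluation chunk is hidden; for the SVM the class probabilities are read from the "probabilities" attribute, available because the model was fitted with probability=T (a sketch; p.train is a hypothetical name):

p.train <- attr(predict(mSVM, newdata = trainr, probability = TRUE), "probabilities")[,"hypo"]
pred <- prediction(p.train, trainr$hypo_eu)
cmtx <- confusionMatrix(predict(mSVM, newdata = trainr), trainr$hypo_eu,
                        mode = "everything", positive = "hypo")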

##           Reference
## Prediction  eu hypo Sum
##       eu   141    5 146
##       hypo   1   49  50
##       Sum  142   54 196
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.96939        0.92152        0.93456        0.98868        0.72449        0.00000        0.22067        0.99674
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.90741           0.99296           0.98000           0.94231           0.27551           0.95018

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   62    3  65
##       hypo  1   19  20
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.95294        0.87361        0.88387        0.98703        0.74118        0.00000        0.61708        0.99134
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.86364           0.98413           0.95000           0.90476           0.25882           0.92388

Logistic regression

To avoid an overly complex model containing every variable in the dataset, the variables entering the model were selected in two steps:
1. the 10 most important variables according to the random forest model were retained (via dmRFVI, sketched below);
2. a stepwise procedure was then applied.
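The data frame dmRFVI used below is not constructed in the code shown; presumably it collects the random-forest Gini importances, e.g.:

# assumed construction of dmRFVI from the fitted random forest
dmRFVI <- data.frame(variable = rownames(importance(mRF)),
                     MeanDecreaseGini = importance(mRF)[,"MeanDecreaseGini"],
                     row.names = NULL)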

xtrainr <- trainr[,c("hypo_eu",dmRFVI[order(dmRFVI$MeanDecreaseGini,decreasing = T),"variable"][1:10])]
xtestr <- testr[,c("hypo_eu",dmRFVI[order(dmRFVI$MeanDecreaseGini,decreasing = T),"variable"][1:10])]
mLG <- glm(hypo_eu~. ,data = xtrainr,family = "binomial")
mLG <- step(mLG,trace = F)
summary(mLG)
## 
## Call:
## glm(formula = hypo_eu ~ T4nmoll + TSHngml003038 + MCV + Etàaa9, 
##     family = "binomial", data = xtrainr)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.04746  -0.00684  -0.00013   0.00001   1.82028  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.1841     6.9523  -0.314 0.753401    
## T4nmoll        -1.1100     0.3119  -3.559 0.000373 ***
## TSHngml003038   6.7475     3.3584   2.009 0.044519 *  
## MCV             0.2025     0.1059   1.912 0.055881 .  
## Etàaa9         -0.3638     0.2336  -1.557 0.119388    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 230.756  on 195  degrees of freedom
## Residual deviance:  23.582  on 191  degrees of freedom
## AIC: 33.582
## 
## Number of Fisher Scoring iterations: 10

Results on the training dataset

##           Reference
## Prediction  eu hypo Sum
##       eu   140    3 143
##       hypo   2   51  53
##       Sum  142   54 196
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.97449        0.93573        0.94147        0.99167        0.72449        0.00000        1.00000        0.99700
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.94444           0.98592           0.96226           0.95327           0.27551           0.96518

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   59    1  60
##       hypo  4   21  25
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.94118        0.85320        0.86804        0.98063        0.74118        0.00000        0.37109        0.98413
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.93651           0.84000           0.89362           0.25882           0.94553

Naive Bayes

mNB <- naiveBayes(hypo_eu~. ,data = trainr)
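The evaluation chunk is hidden; naiveBayes returns class probabilities with type = "raw" (a sketch; p.nb is a hypothetical name):

p.nb <- predict(mNB, newdata = trainr, type = "raw")[,"hypo"]
cmtx <- confusionMatrix(predict(mNB, newdata = trainr), trainr$hypo_eu,
                        mode = "everything", positive = "hypo")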

Results on the training dataset

##           Reference
## Prediction  eu hypo Sum
##       eu   138    5 143
##       hypo   4   49  53
##       Sum  142   54 196
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.95408        0.88431        0.91462        0.97879        0.72449        0.00000        1.00000        0.98826
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.90741           0.97183           0.92453           0.91589           0.27551           0.93962

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   61    1  62
##       hypo  2   21  23
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.96471        0.90935        0.90030        0.99266        0.74118        0.00000        1.00000        0.97980
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.96825           0.91304           0.93333           0.25882           0.96140

Gradient Boosting Machine

xtrainr <- trainr
xtrainr$hypo_eu <- ifelse(xtrainr$hypo_eu=="eu",0,1)
mGB <- gbm(
  formula = hypo_eu ~ .,
  distribution = "bernoulli",
  data = xtrainr,
  n.trees = 10000,
  interaction.depth = 1,
  shrinkage = 0.001,
  cv.folds = 5,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)  

No search for the optimal parameters was carried out for the GBM model; tuning could be applied here as well, as sketched below.
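A possible tuning sketch (not performed in the original analysis; the grid values are illustrative assumptions), selecting the combination with the lowest cross-validation error:

# small grid search over shrinkage and interaction depth
grid <- expand.grid(shrinkage = c(0.001, 0.01, 0.1), interaction.depth = c(1, 3, 5))
grid$cv.error <- NA
for(i in seq_len(nrow(grid))){
  m <- gbm(hypo_eu ~ ., distribution = "bernoulli", data = xtrainr,
           n.trees = 5000, shrinkage = grid$shrinkage[i],
           interaction.depth = grid$interaction.depth[i],
           cv.folds = 5, verbose = FALSE)
  grid$cv.error[i] <- min(m$cv.error)  # best CV deviance for this combination
}
grid[which.min(grid$cv.error), ]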

Results on the training dataset

##           Reference
## Prediction  eu hypo Sum
##       eu   140    0 140
##       hypo   2   54  56
##       Sum  142   54 196
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.98980        0.97473        0.96363        0.99876        0.72449        0.00000        0.47950        0.99896
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           1.00000           0.98592           0.96429           0.98182           0.27551           0.99296

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   59    1  60
##       hypo  4   21  25
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.94118        0.85320        0.86804        0.98063        0.74118        0.00000        0.37109        0.97114
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.93651           0.84000           0.89362           0.25882           0.94553

Summary comparison table

Method        Accuracy  Kappa   AUC     Sensitivity  Specificity  Precision  F1      Prevalence  Balanced Accuracy
Tree          0.9294    0.8263  0.9434  0.9545       0.9206       0.8077     0.8750  0.2588      0.9376
randomForest  0.9529    0.8809  0.9766  0.9545       0.9524       0.8750     0.9130  0.2588      0.9535
SVM           0.9529    0.8736  0.9913  0.8636       0.9841       0.9500     0.9048  0.2588      0.9239
Logit         0.9412    0.8532  0.9841  0.9545       0.9365       0.8400     0.8936  0.2588      0.9455
NaiveBayes    0.9647    0.9093  0.9798  0.9545       0.9683       0.9130     0.9333  0.2588      0.9614
GBM           0.9412    0.8532  0.9711  0.9545       0.9365       0.8400     0.8936  0.2588      0.9455

ROC curves on the test dataset, by method

# Example for the SVM model; the same pattern was repeated for each method.
# mSVM.p is not defined in the code shown: the line below is an assumed
# reconstruction using the "probabilities" attribute of predict.svm.
mSVM.p <- attr(predict(mSVM, newdata = testr, probability = TRUE), "probabilities")[,"hypo"]
ROCit_obj <- rocit(score=mSVM.p,class=ifelse(testr$hypo_eu=="hypo",1,0))
pp <- plot(ROCit_obj,YIndex = T,values = T)
text(0.5,0.3,paste("AUC=",round(ROCit_obj$AUC,4)),font=2,pos = 4)
text(0.5,0.2,
     paste("Youden Index=",round(pp$`optimal Youden Index point`[1],4),
           "\n(cutoff=",round(pp$`optimal Youden Index point`[4],4),")",sep=""),cex = 1,pos = 4)

Bootstrap comparison

The algorithms were compared on 50 different train/test samples (always split 70%/30%); in each round the optimal parameters for the classification tree and the SVM were also re-estimated. A sketch of the loop is given below.
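The resampling code is not shown in the original; a minimal sketch of one possible implementation (the names, the seed, and the restriction to random forest are assumptions):

set.seed(2222)  # illustrative seed
res <- list()
for(b in 1:50){
  idx <- sample(seq_len(nrow(xdat)), 0.7 * nrow(xdat))
  trb <- na.omit(xdat[idx, ])   # training portion, complete cases
  teb <- na.omit(xdat[-idx, ])  # test portion, complete cases
  m   <- randomForest(hypo_eu ~ ., data = trb, ntree = 500)
  cm  <- confusionMatrix(predict(m, newdata = teb), teb$hypo_eu,
                         mode = "everything", positive = "hypo")
  res[[b]] <- c(cm$overall[c("Accuracy","Kappa")],
                cm$byClass[c("Sensitivity","Specificity","Precision","F1","Balanced Accuracy")])
}
round(colMeans(do.call(rbind, res)), 4)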

Mean values of the indices by algorithm

Method        Accuracy  Kappa   AUC     Sensitivity  Specificity  Precision  F1      Balanced Accuracy
GBM           0.9510    0.8724  0.9893  0.8975       0.9722       0.9210     0.9055  0.9349
Logistic      0.9498    0.8709  0.9812  0.9194       0.9612       0.8957     0.9054  0.9403
NaiveBayes    0.9392    0.8397  0.9714  0.8590       0.9678       0.9080     0.8810  0.9134
randomForest  0.9545    0.8820  0.9908  0.9040       0.9730       0.9257     0.9132  0.9385
SVM           0.9366    0.8293  0.9858  0.8311       0.9745       0.9248     0.8715  0.9028
Tree          0.9342    0.8272  0.9266  0.8615       0.9608       0.8904     0.8718  0.9112

Distribution of the results by index and algorithm



Appendix

Confusion matrix

                       Obs
                       Negative (eu)   Positive (hypo)
Pred  Negative (eu)    TN              FN
      Positive (hypo)  FP              TP

\(Accuracy = \frac{(TN+TP)}{N}\)

\(Sensitivity = \frac{TP}{(TP+FN)}\)

\(Specificity = \frac{TN}{(TN+FP)}\)

\(Precision = \frac{TP}{(TP+FP)}\)

\(F1\ Score = 2 \frac{(Recall * Precision)}{(Recall + Precision)}\)

\(Balanced\ accuracy = \frac{(Sensitivity + Specificity)}{2}\)

Accuracy

It is the ratio of the correctly labeled subjects to the whole pool of subjects, and the most intuitive of these measures.
Accuracy answers the following question: how many patients did we correctly label out of all the patients?
Accuracy = (TP+TN)/(TP+FP+FN+TN)
numerator: all correctly labeled subjects (all trues)
denominator: all subjects

Sensitivity (aka Recall)

Recall is the ratio of the subjects correctly labeled +ve by our program to all who are hypo in reality.
Recall answers the following question: of all the patients who are hypo, how many did we correctly predict?
Recall = TP/(TP+FN)
numerator: +ve labeled hypo patients.
denominator: all patients who are hypo (whether detected by our program or not)

Specificity

Specificity is the ratio of the subjects correctly labeled -ve by our program to all who are eu in reality.
Specificity answers the following question: of all the patients who are healthy, how many did we correctly predict?
Specificity = TN/(TN+FP)
numerator: -ve labeled healthy patients.
denominator: all patients who are healthy in reality (whether labeled +ve or -ve)

Precision

Precision is the ratio of the subjects correctly labeled +ve by our program to all +ve labeled subjects.
Precision answers the following question: how many of those we labeled as hypo are actually hypo?
Precision = TP/(TP+FP)
numerator: +ve labeled hypo patients.
denominator: all subjects labeled +ve by our program (whether they are hypo or not in reality).

F1-score (aka F-Score / F-Measure)

The F1 Score considers both precision and recall: it is their harmonic mean.
The F1 Score is highest when there is a balance between precision (P) and recall (R); it drops when one measure is improved at the expense of the other.
For example, if P is 1 and R is 0, the F1 Score is 0.
F1 Score = 2*(Recall * Precision) / (Recall + Precision)

Balanced accuracy

Balanced accuracy is based on sensitivity and specificity
Balanced accuracy = (Sensitivity + Specificity) / 2

General Notes

Accuracy is a great measure, but only when the dataset is symmetric (the counts of false negatives and false positives are close) and false negatives and false positives have similar costs.
If the costs of false positives and false negatives differ, F1 is your savior.
F1 is also best if you have an uneven class distribution.

(https://towardsdatascience.com/accuracy-recall-precision-f-score-specificity-which-to-optimize-on-867d3f11124)

ROC curve

An ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds. This curve plots two parameters:

True Positive Rate
False Positive Rate

The True Positive Rate (TPR), also known as Recall or Sensitivity, is defined as follows:
\(TPR = \frac{TP}{(TP + FN)}\)

False Positive Rate (FPR) is defined as follows:
\(FPR = \frac{FP}{(FP + TN)}\)

An ROC curve plots TPR vs. FPR at different classification thresholds. Lowering the classification threshold classifies more items as positive, thus increasing both False Positives and True Positives.

AUC: Area Under the ROC Curve

AUC stands for “Area Under the ROC Curve”: it measures the entire two-dimensional area underneath the whole ROC curve.
AUC provides an aggregate measure of performance across all possible classification thresholds. One way of interpreting AUC is as the probability that the model ranks a random positive example more highly than a random negative example.
The AUC value, between 0 and 1, is in fact the probability that the classifier’s score for an individual randomly drawn from the hypo group is higher than its score for an individual randomly drawn from the healthy group.


Conditional Classification Tree

library(party)
mPT <- ctree(hypo_eu~.,data = trainr, 
             controls = ctree_control(minsplit = 10, testtype = "Bonferroni", mincriterion = 0.95))
plot(mPT)

Results on the test dataset

##           Reference
## Prediction eu hypo Sum
##       eu   58    1  59
##       hypo  5   21  26
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.92941        0.82629        0.85267        0.97366        0.74118        0.00001        0.22067        0.96032
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.92063           0.80769           0.87500           0.25882           0.93759

Conditional Random Forest

mPF <- cforest(hypo_eu~.,data = trainr,controls = cforest_unbiased(mtry = 6))

Results on the test dataset
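The evaluation chunk is hidden; for party::cforest the class predictions come from predict() and the probabilities from treeresponse() (a sketch; p.cf is a hypothetical name):

p.cf <- sapply(treeresponse(mPF, newdata = testr), function(p) p[2])
cmtx <- confusionMatrix(predict(mPF, newdata = testr), testr$hypo_eu,
                        mode = "everything", positive = "hypo")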

##           Reference
## Prediction eu hypo Sum
##       eu   59    1  60
##       hypo  4   21  25
##       Sum  63   22  85
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull AccuracyPValue  McnemarPValue            AUC 
##        0.94118        0.85320        0.86804        0.98063        0.74118        0.00000        0.37109        0.96753
##       Sensitivity       Specificity         Precision                F1        Prevalence Balanced Accuracy 
##           0.95455           0.93651           0.84000           0.89362           0.25882           0.94553