Regression

## Logistic Regression Model
##  
##  lrm(formula = neo ~ sexoc + soh + nuevofum + glucosa, data = data.frame(cccrs), 
##      x = T, y = T)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs          1000    LR chi2      87.15    R2       0.112    C       0.669    
##   0            589    d.f.             4    g        0.711    Dxy     0.338    
##   1            411    Pr(> chi2) <0.0001    gr       2.035    gamma   0.369    
##  max |deriv| 2e-12                          gp       0.162    tau-a   0.164    
##                                             Brier    0.222                     
##  
##             Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  -1.3805 0.1390 -9.93  <0.0001 
##  sexoc=1     0.6674 0.1393  4.79  <0.0001 
##  soh=1       0.5978 0.1347  4.44  <0.0001 
##  nuevofum=1  0.7746 0.1489  5.20  <0.0001 
##  glucosa=1   0.4068 0.1513  2.69  0.0072  
## 

Una primera lista de índices y el modelo de regresión, con la importancia relativa de las covariables.

Bondad de ajuste (Hosmer-Lemeshow) y gráfico de calibración

 HLgof.test(
   fit = preds,     # fitted values handed to the test
   obs = cccrs$neo  # observed outcome column
 ) # Hosmer-Lemeshow goodness of fit; prints the C and H statistics
## Warning in HLgof.test(fit = preds, obs = cccrs$neo): Found only 7 different
## groups for Hosmer-Lemesho C statistic.
## $C
## 
##  Hosmer-Lemeshow C statistic
## 
## data:  preds and cccrs$neo
## X-squared = 4.9841, df = 5, p-value = 0.4178
## 
## 
## $H
## 
##  Hosmer-Lemeshow H statistic
## 
## data:  preds and cccrs$neo
## X-squared = 10.013, df = 8, p-value = 0.2641
my.calib <- rms::calibrate(
  lmod,
  method = "boot", # bootstrap-based calibration of the fitted model
  B = 500          # number of bootstrap repetitions
  # NOTE(review): no seed is set in this view, so the curve may vary
  # slightly between runs -- confirm whether set.seed() is called earlier
)
plot(my.calib, main = "Calibration", las = 1) # draw the calibration curve

## 
## n=1000   Mean absolute error=0.011   Mean squared error=0.00028
## 0.9 Quantile of absolute error=0.015

Buena calibración

Nomograma

plot(
  nomogram(
    lmod,
    fun = plogis,             # inverse logit: linear predictor -> probability
    funlabel = "Risk of neo"  # label shown on the transformed axis
  )
)

El nomograma es un recurso gráfico para trasladar la combinación de valores a una probabilidad; también se puede generar una herramienta on-line.

Performance Measure

perfMeasures(
  preds,
  truth = cccrs$neo,
  namePos = 1  # value of `truth` treated as the positive class
  # no cutoff is passed, so the function's default threshold applies
)
## 
##      Performance Measure(s)
## 
##                                        Measure      Value
## 1                               accuracy (ACC) 0.64600000
## 2  probability of correct classification (PCC) 0.64600000
## 3                        fraction correct (FC) 0.64600000
## 4            simple matching coefficient (SMC) 0.64600000
## 5                Rand (similarity) index (RSI) 0.64600000
## 6       probability of misclassification (PMC) 0.35400000
## 7                              error rate (ER) 0.35400000
## 8                     fraction incorrect (FIC) 0.35400000
## 9                           sensitivity (SENS) 0.35279805
## 10                                recall (REC) 0.35279805
## 11                    true positive rate (TPR) 0.35279805
## 12               probability of detection (PD) 0.35279805
## 13                               hit rate (HR) 0.35279805
## 14                          specificity (SPEC) 0.85059423
## 15                    true negative rate (TNR) 0.85059423
## 16                           selectivity (SEL) 0.85059423
## 17                         detection rate (DR) 0.14500000
## 18                   false positive rate (FPR) 0.14940577
## 19                               fall-out (FO) 0.14940577
## 20                    false alarm (rate) (FAR) 0.14940577
## 21            probability of false alarm (PFA) 0.14940577
## 22                   false negative rate (FNR) 0.64720195
## 23                              miss rate (MR) 0.64720195
## 24                  false discovery rate (FDR) 0.37768240
## 25                   false omission rate (FOR) 0.34680574
## 26                           prevalence (PREV) 0.41100000
## 27      (positive) pre-test probability (PREP) 0.41100000
## 28             (positive) pre-test odds (PREO) 0.69779287
## 29                detection prevalence (DPREV) 0.23300000
## 30       negative pre-test probability (NPREP) 0.58900000
## 31              negative pre-test odds (NPREO) 1.43309002
## 32                   no information rate (NIR) 0.58900000
## 33                    weighted accuracy (WACC) 0.60169614
## 34                    balanced accuracy (BACC) 0.60169614
## 35              (bookmaker) informedness (INF) 0.20339228
## 36                  Youden's J statistic (YJS) 0.20339228
## 37                               deltap' (DPp) 0.20339228
## 38             positive likelihood ratio (PLR) 2.36134152
## 39             negative likelihood ratio (NLR) 0.76088213
## 40             weighted likelihood ratio (WLR) 1.56111182
## 41             balanced likelihood ratio (BLR) 1.56111182
## 42                 diagnostic odds ratio (DOR) 3.10342618
## 43             positive predictive value (PPV) 0.62231760
## 44                            precision (PREC) 0.62231760
## 45    (positive) post-test probability (POSTP) 0.62231760
## 46           (positive) post-test odds (POSTO) 1.64772727
## 47                      Bayes factor G1 (BFG1) 2.36134152
## 48             negative predictive value (NPV) 0.65319426
## 49     negative post-test probability (NPOSTP) 0.65319426
## 50            negative post-test odds (NPOSTO) 1.88345865
## 51                      Bayes factor G0 (BFG0) 1.31426401
## 52                           markedness (MARK) 0.27551186
## 53                                 deltap (DP) 0.27551186
## 54             weighted predictive value (WPV) 0.63775593
## 55             balanced predictive value (BPV) 0.63775593
## 56                              F1 score (F1S) 0.45031056
## 57           Dice similarity coefficient (DSC) 0.45031056
## 58                          F beta score (FBS) 0.45031056
## 59        Jaccard similarity coefficient (JSC) 0.29058116
## 60                           threat score (TS) 0.29058116
## 61                critical success index (CSI) 0.29058116
## 62     Matthews' correlation coefficient (MCC) 0.23672132
## 63        Pearson's correlation (r phi) (RPHI) 0.23672132
## 64                      Phi coefficient (PHIC) 0.23672132
## 65                            Cramer's V (CRV) 0.23672132
## 66    proportion of positive predictions (PPP) 0.23300000
## 67                    expected accuracy (EACC) 0.54752600
## 68             Cohen's kappa coefficient (CKC) 0.21763460
## 69            mutual information in bits (MI2) 0.03996672
## 70                 joint entropy in bits (JE2) 1.19239357
## 71      variation of information in bits (VI2) 1.68029358
## 72                       Jaccard distance (JD) 0.97676705
## 73           information quality ratio (INFQR) 0.02323295
## 74                uncertainty coefficient (UC) 0.04090665
## 75                    entropy coefficient (EC) 0.04090665
## 76                 proficiency (metric) (PROF) 0.04090665
## 77                   deficiency (metric) (DFM) 0.95909335
## 78                            redundancy (RED) 0.02270543
## 79                  symmetric uncertainty (SU) 0.04541087
## 80                 normalized uncertainty (NU) 0.04568868

Multitud de denominaciones/medidas de rendimiento (empleando como corte 0.5)

Performance Score and optimal cutoff

perfScores(
  preds,
  truth = cccrs$neo,
  namePos = 1  # value of `truth` treated as the positive class
)
## 
##      Performance Score(s)
## 
##                        Score     Value
## 1     area under curve (AUC) 0.6690130
## 2          Gini index (GINI) 0.3380260
## 3           Brier score (BS) 0.2216517
## 4 positive Brier score (PBS) 0.3104348
## 5 negative Brier score (NBS) 0.1596995
## 6 weighted Brier score (WBS) 0.2350672
## 7 balanced Brier score (BBS) 0.2350672
optCutoff(
  preds,
  truth = cccrs$neo,
  namePos = 1  # value of `truth` treated as the positive class
  # no perfMeasure is passed, so the function's default criterion is used
)
## Optimal Cut-off             YJS 
##       0.4503838       0.2494227

Punto de corte óptimo según alguna métrica (a mirar). En realidad hay que optar por un corte en función del “coste” de los falsos positivos/negativos, es decir, del uso que se espera y de las consecuencias de los diversos errores. Para eso también podemos buscar gráficos. Da igual que trabajemos, como aquí, con la probabilidad predicha por el modelo o con un sistema de puntuación; esto es menos relevante.

Confusion Matrix and Statistics (threshold=0.45)

confusionMatrix(
  # predicted class: 1 when the prediction exceeds 0.45, otherwise 0
  data = factor(as.numeric(preds > 0.45)),
  reference = factor(cccrs$neo),
  dnn = c("Predicted", "Actual"),  # dimension names of the printed table
  mode = "everything",
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted   0   1
##         0 402 178
##         1 187 233
##                                           
##                Accuracy : 0.635           
##                  95% CI : (0.6043, 0.6649)
##     No Information Rate : 0.589           
##     P-Value [Acc > NIR] : 0.001636        
##                                           
##                   Kappa : 0.2486          
##                                           
##  Mcnemar's Test P-Value : 0.675407        
##                                           
##             Sensitivity : 0.5669          
##             Specificity : 0.6825          
##          Pos Pred Value : 0.5548          
##          Neg Pred Value : 0.6931          
##               Precision : 0.5548          
##                  Recall : 0.5669          
##                      F1 : 0.5608          
##              Prevalence : 0.4110          
##          Detection Rate : 0.2330          
##    Detection Prevalence : 0.4200          
##       Balanced Accuracy : 0.6247          
##                                           
##        'Positive' Class : 1               
## 

### ROCs, Scoring classifiers

perf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,  # colour the curve by cutoff value
  lwd = 3,
  main = " ROC curves ..."
)
plot(perf, lty = 3, col = "grey78", add = TRUE)  # overlay as grey dotted line
abline(a = 0, b = 1)  # diagonal reference line

perf <- performance(ROCRpred, measure = "sens", x.measure = "spec")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,  # colour the curve by cutoff value
  lwd = 3,
  main = "... Sensitivity/Specificity plots ..."
)
plot(perf, lty = 3, col = "grey78", add = TRUE)  # overlay as grey dotted line

perf <- performance(ROCRpred, measure = "prec", x.measure = "rec")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,  # colour the curve by cutoff value
  lwd = 3,
  main = "... Precision/Recall graphs (tpr/sens)->F1 metric"
)
plot(perf, lty = 3, col = "grey78", add = TRUE)  # overlay as grey dotted line

Aquí, con diferentes métricas y un código de color para los posibles puntos de corte.