## Logistic Regression Model
##
## lrm(formula = neo ~ sexoc + soh + nuevofum + glucosa, data = data.frame(cccrs),
## x = T, y = T)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1000 LR chi2 87.15 R2 0.112 C 0.669
## 0 589 d.f. 4 g 0.711 Dxy 0.338
## 1 411 Pr(> chi2) <0.0001 gr 2.035 gamma 0.369
## max |deriv| 2e-12 gp 0.162 tau-a 0.164
## Brier 0.222
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -1.3805 0.1390 -9.93 <0.0001
## sexoc=1 0.6674 0.1393 4.79 <0.0001
## soh=1 0.5978 0.1347 4.44 <0.0001
## nuevofum=1 0.7746 0.1489 5.20 <0.0001
## glucosa=1 0.4068 0.1513 2.69 0.0072
##
Una primera lista de índices: el modelo de regresión, con la importancia relativa de las covariables.
# Hosmer-Lemeshow goodness-of-fit test: compares preds (presumably the
# model's predicted probabilities) with the observed outcome cccrs$neo.
HLgof.test(
  fit = preds,
  obs = cccrs$neo
)
## Warning in HLgof.test(fit = preds, obs = cccrs$neo): Found only 7 different
## groups for Hosmer-Lemesho C statistic.
## $C
##
## Hosmer-Lemeshow C statistic
##
## data: preds and cccrs$neo
## X-squared = 4.9841, df = 5, p-value = 0.4178
##
##
## $H
##
## Hosmer-Lemeshow H statistic
##
## data: preds and cccrs$neo
## X-squared = 10.013, df = 8, p-value = 0.2641
# Model calibration: bootstrap-based calibration of the fitted model lmod
# using 500 resamples, stored for plotting.
my.calib <- rms::calibrate(lmod, method = "boot", B = 500)

# Draw the calibration curve; las = 1 keeps the axis labels horizontal.
plot(my.calib, main = "Calibration", las = 1)
##
## n=1000 Mean absolute error=0.011 Mean squared error=0.00028
## 0.9 Quantile of absolute error=0.015
Buena calibración
# Nomogram of the fitted model lmod. plogis (the logistic CDF) converts the
# linear predictor into a probability, shown on the "Risk of neo" axis.
plot(
  nomogram(
    lmod,
    fun = plogis,
    funlabel = "Risk of neo"
  )
)
Nomograma: un recurso gráfico para trasladar la combinación de valores a una probabilidad; a partir de él se puede generar una herramienta on-line.
# Threshold-based classification measures for preds against the observed
# outcome. namePos = 1 presumably marks the value 1 as the positive class
# (confirm against the package documentation).
perfMeasures(
  preds,
  truth = cccrs$neo,
  namePos = 1
)
##
## Performance Measure(s)
##
## Measure Value
## 1 accuracy (ACC) 0.64600000
## 2 probability of correct classification (PCC) 0.64600000
## 3 fraction correct (FC) 0.64600000
## 4 simple matching coefficient (SMC) 0.64600000
## 5 Rand (similarity) index (RSI) 0.64600000
## 6 probability of misclassification (PMC) 0.35400000
## 7 error rate (ER) 0.35400000
## 8 fraction incorrect (FIC) 0.35400000
## 9 sensitivity (SENS) 0.35279805
## 10 recall (REC) 0.35279805
## 11 true positive rate (TPR) 0.35279805
## 12 probability of detection (PD) 0.35279805
## 13 hit rate (HR) 0.35279805
## 14 specificity (SPEC) 0.85059423
## 15 true negative rate (TNR) 0.85059423
## 16 selectivity (SEL) 0.85059423
## 17 detection rate (DR) 0.14500000
## 18 false positive rate (FPR) 0.14940577
## 19 fall-out (FO) 0.14940577
## 20 false alarm (rate) (FAR) 0.14940577
## 21 probability of false alarm (PFA) 0.14940577
## 22 false negative rate (FNR) 0.64720195
## 23 miss rate (MR) 0.64720195
## 24 false discovery rate (FDR) 0.37768240
## 25 false omission rate (FOR) 0.34680574
## 26 prevalence (PREV) 0.41100000
## 27 (positive) pre-test probability (PREP) 0.41100000
## 28 (positive) pre-test odds (PREO) 0.69779287
## 29 detection prevalence (DPREV) 0.23300000
## 30 negative pre-test probability (NPREP) 0.58900000
## 31 negative pre-test odds (NPREO) 1.43309002
## 32 no information rate (NIR) 0.58900000
## 33 weighted accuracy (WACC) 0.60169614
## 34 balanced accuracy (BACC) 0.60169614
## 35 (bookmaker) informedness (INF) 0.20339228
## 36 Youden's J statistic (YJS) 0.20339228
## 37 deltap' (DPp) 0.20339228
## 38 positive likelihood ratio (PLR) 2.36134152
## 39 negative likelihood ratio (NLR) 0.76088213
## 40 weighted likelihood ratio (WLR) 1.56111182
## 41 balanced likelihood ratio (BLR) 1.56111182
## 42 diagnostic odds ratio (DOR) 3.10342618
## 43 positive predictive value (PPV) 0.62231760
## 44 precision (PREC) 0.62231760
## 45 (positive) post-test probability (POSTP) 0.62231760
## 46 (positive) post-test odds (POSTO) 1.64772727
## 47 Bayes factor G1 (BFG1) 2.36134152
## 48 negative predictive value (NPV) 0.65319426
## 49 negative post-test probability (NPOSTP) 0.65319426
## 50 negative post-test odds (NPOSTO) 1.88345865
## 51 Bayes factor G0 (BFG0) 1.31426401
## 52 markedness (MARK) 0.27551186
## 53 deltap (DP) 0.27551186
## 54 weighted predictive value (WPV) 0.63775593
## 55 balanced predictive value (BPV) 0.63775593
## 56 F1 score (F1S) 0.45031056
## 57 Dice similarity coefficient (DSC) 0.45031056
## 58 F beta score (FBS) 0.45031056
## 59 Jaccard similarity coefficient (JSC) 0.29058116
## 60 threat score (TS) 0.29058116
## 61 critical success index (CSI) 0.29058116
## 62 Matthews' correlation coefficient (MCC) 0.23672132
## 63 Pearson's correlation (r phi) (RPHI) 0.23672132
## 64 Phi coefficient (PHIC) 0.23672132
## 65 Cramer's V (CRV) 0.23672132
## 66 proportion of positive predictions (PPP) 0.23300000
## 67 expected accuracy (EACC) 0.54752600
## 68 Cohen's kappa coefficient (CKC) 0.21763460
## 69 mutual information in bits (MI2) 0.03996672
## 70 joint entropy in bits (JE2) 1.19239357
## 71 variation of information in bits (VI2) 1.68029358
## 72 Jaccard distance (JD) 0.97676705
## 73 information quality ratio (INFQR) 0.02323295
## 74 uncertainty coefficient (UC) 0.04090665
## 75 entropy coefficient (EC) 0.04090665
## 76 proficiency (metric) (PROF) 0.04090665
## 77 deficiency (metric) (DFM) 0.95909335
## 78 redundancy (RED) 0.02270543
## 79 symmetric uncertainty (SU) 0.04541087
## 80 normalized uncertainty (NU) 0.04568868
Multitud de denominaciones/medidas de rendimiento (empleando 0.5 como punto de corte).
# Score-based performance summaries (the output lists AUC, Gini index and
# Brier scores) for preds against the observed outcome.
perfScores(
  preds,
  truth = cccrs$neo,
  namePos = 1
)
##
## Performance Score(s)
##
## Score Value
## 1 area under curve (AUC) 0.6690130
## 2 Gini index (GINI) 0.3380260
## 3 Brier score (BS) 0.2216517
## 4 positive Brier score (PBS) 0.3104348
## 5 negative Brier score (NBS) 0.1596995
## 6 weighted Brier score (WBS) 0.2350672
## 7 balanced Brier score (BBS) 0.2350672
# Search for the cut-off on preds that is optimal under the function's
# default criterion (the output reports it as YJS, Youden's J statistic).
optCutoff(
  preds,
  truth = cccrs$neo,
  namePos = 1
)
## Optimal Cut-off YJS
## 0.4503838 0.2494227
Punto de corte óptimo según alguna métrica (aquí el estadístico J de Youden, YJS; a revisar). En realidad hay que optar por un corte en función del “coste” de los falsos positivos/negativos, es decir, del uso que se espera y de las consecuencias de los diversos errores. Para eso también podemos buscar gráficos. Da igual que trabajemos, como aquí, con la probabilidad predicha por el modelo o con un sistema de puntuación; esto es menos relevante.
# Confusion matrix at the 0.45 cut-off (the optimum reported by optCutoff(),
# rounded). Predictions above the cut-off are coded 1, the rest 0.
# Both factors are given explicit 0/1 levels so that predictions and
# reference always share the same level set, even when every prediction
# falls on one side of the cut-off (a single-level factor would otherwise
# make confusionMatrix() fail).
confusionMatrix(
  data = factor(as.numeric(preds > 0.45), levels = c(0, 1)),
  reference = factor(cccrs$neo, levels = c(0, 1)),
  dnn = c("Predicted", "Actual"),
  mode = "everything",
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Actual
## Predicted 0 1
## 0 402 178
## 1 187 233
##
## Accuracy : 0.635
## 95% CI : (0.6043, 0.6649)
## No Information Rate : 0.589
## P-Value [Acc > NIR] : 0.001636
##
## Kappa : 0.2486
##
## Mcnemar's Test P-Value : 0.675407
##
## Sensitivity : 0.5669
## Specificity : 0.6825
## Pos Pred Value : 0.5548
## Neg Pred Value : 0.6931
## Precision : 0.5548
## Recall : 0.5669
## F1 : 0.5608
## Prevalence : 0.4110
## Detection Rate : 0.2330
## Detection Prevalence : 0.4200
## Balanced Accuracy : 0.6247
##
## 'Positive' Class : 1
##
### ROCs, Scoring classifiers
# ROC curve: true positive rate against false positive rate, coloured by
# cut-off, with the raw curve overlaid as a dotted grey line.
perf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,
  lwd = 3,
  main = " ROC curves ..."
)
plot(perf, lty = 3, col = "grey78", add = TRUE)
abline(0, 1)  # reference line with intercept 0 and slope 1

# Sensitivity against specificity, same layout.
perf <- performance(ROCRpred, measure = "sens", x.measure = "spec")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,
  lwd = 3,
  main = "... Sensitivity/Specificity plots ..."
)
plot(perf, lty = 3, col = "grey78", add = TRUE)

# Precision against recall, same layout.
perf <- performance(ROCRpred, measure = "prec", x.measure = "rec")
plot(
  perf,
  avg = "threshold",
  colorize = TRUE,
  lwd = 3,
  main = "... Precision/Recall graphs (tpr/sens)->F1 metric"
)
plot(perf, lty = 3, col = "grey78", add = TRUE)
Aquí, con diferentes métricas y un código de color que indica los posibles puntos de corte.