Hastie T, Tibshirani R, Friedman J. The Elements of Statistical Learning. http://www-stat.stanford.edu/~tibs/ElemStatLearn/
Moons KGM, et al. Risk prediction models: I. Development, internal validation, and assessing the incremental value of a new (bio)marker. Heart 2012; 98:683-90. http://www.pubmed.org/22397945
Moons KGM, et al. Risk prediction models: II. External validation, model updating, and impact assessment. Heart 2012; 98:691-98.
Steyerberg EW, et al. Assessing the performance of prediction models: A framework for traditional and novel measures. Epidemiol 2010; 21:128-38. http://www.epibiostat.ucsf.edu/courses/roadmapk12/sigs/steyerberg_epidemiol_2010.pdf
Harrell FE, et al. Multivariable prognostic models: Issues in developing models, evaluating assumptions and adequacy, and measuring and reducing errors. Stat Med 1996; 15:361-87.
Cook NR, et al. Advances in measuring the effect of individual predictors of cardiovascular risk: the role of reclassification measures. Ann Intern Med 2009; 150:795-802. http://annals.org/article.aspx?articleid=744526
CalculateAucFromDxy <- function(validate) {
## Check that the input is an rms validate object
stopifnot(inherits(validate, "validate"))
## Calculate AUCs from Dxy's
aucs <- (validate["Dxy", c("index.orig","training","test","optimism","index.corrected")])/2 + 0.5
## Get n
n <- validate["Dxy", c("n")]
## Combine as result
res <- rbind(validate, AUC = c(aucs, n))
## The optimism column is a difference (training - test), so remove the 0.5 shift added above
res["AUC","optimism"] <- res["AUC","optimism"] - 0.5
## Return results
res
}
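The helper is applied later to rms::validate() output, e.g. CalculateAucFromDxy(validate(fit, B = 200)) for an lrm fit created with x = TRUE, y = TRUE (fit here is a placeholder, not an object defined in these notes).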
library(sas7bdat)
cpain <- read.sas7bdat("./cpdata06.sas7bdat")
names(cpain) <- tolower(names(cpain))
Variable selection algorithms can be based on likelihood ratio test p-values or on fit + penalty measures of model performance; the latter are used below.
Akaike Information Criterion (AIC)
AIC = \( -2 \log(L) + 2p \), where \( p \) is the number of parameters in the model.
The AIC-based criterion is the default used by R's step() function.
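As a quick check, AIC can be computed directly from the log-likelihood; a minimal sketch, assuming a simple illustrative one-predictor glm (not a model used below):
## Hypothetical check: AIC by hand for a one-predictor glm
fit.check <- glm(mi ~ age, data = cpain, family = binomial)
p.check <- length(coef(fit.check))                  # number of estimated parameters
-2 * as.numeric(logLik(fit.check)) + 2 * p.check    # same value as AIC(fit.check)
AIC(fit.check)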
logistic.null <- glm(mi ~ 1, data = cpain, family = binomial(link = "logit"))
forward.aic <- step(object = logistic.null,
scope = ~ age + systol + diastol + asso1 + asso2 + asso3 + asso4 + asso5 + past1 + past2,
direction = "forward", trace = 0)
summary(forward.aic)
Call:
glm(formula = mi ~ age + asso3 + asso1 + systol + diastol + asso4 +
asso2 + past1, family = binomial(link = "logit"), data = cpain)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.226 -0.450 -0.362 -0.291 2.744
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.15512 0.42553 -9.76 < 2e-16 ***
age 0.03060 0.00404 7.57 3.8e-14 ***
asso3 0.85780 0.18502 4.64 3.5e-06 ***
asso1 0.46129 0.12125 3.80 0.00014 ***
systol -0.01252 0.00303 -4.14 3.5e-05 ***
diastol 0.01727 0.00503 3.43 0.00060 ***
asso4 -0.30001 0.11897 -2.52 0.01168 *
asso2 0.26968 0.13305 2.03 0.04266 *
past1 0.23511 0.12208 1.93 0.05413 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2521.0 on 4372 degrees of freedom
Residual deviance: 2383.4 on 4364 degrees of freedom
AIC: 2401
Number of Fisher Scoring iterations: 5
Bayes Information Criterion (BIC)
BIC = \( -2 \log(L) + \log(n) \, p \), where \( n \) is the sample size.
In step(), BIC-based selection is obtained by changing the penalty argument k from its default of 2 to log(n). From the step() documentation:
k: the multiple of the number of degrees of freedom used for the
penalty. Only ‘k = 2’ gives the genuine AIC: ‘k = log(n)’ is
sometimes referred to as BIC or SBC.
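The equivalence can be verified with stats::AIC and stats::BIC; a minimal sketch using the illustrative fit.check object from the AIC check above:
## Hypothetical check: BIC equals AIC with penalty k = log(n)
AIC(fit.check, k = log(nobs(fit.check)))
BIC(fit.check)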
forward.bic <- step(object = logistic.null,
scope = ~ age + systol + diastol + asso1 + asso2 + asso3 + asso4 + asso5 + past1 + past2,
direction = "forward", trace = 0,
k = log(nrow(cpain)))
summary(forward.bic)
Call:
glm(formula = mi ~ age + asso3 + asso1, family = binomial(link = "logit"),
data = cpain)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.026 -0.451 -0.374 -0.304 2.601
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.20027 0.24218 -17.34 < 2e-16 ***
age 0.02666 0.00376 7.10 1.3e-12 ***
asso3 0.98851 0.17110 5.78 7.6e-09 ***
asso1 0.47245 0.11691 4.04 5.3e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2521.0 on 4372 degrees of freedom
Residual deviance: 2415.3 on 4369 degrees of freedom
AIC: 2423
Number of Fisher Scoring iterations: 5
Uniform shrinkage
## Load rms package
library(rms)
## Fit Logistic regression
res.lrm1 <- lrm(formula = mi ~ age + systol + diastol + asso1 + asso2 + asso3 + asso4 + asso5 + past1 + past2,
data = cpain,
x = TRUE, y = TRUE) # Keep the design matrix and response; required by validate() and pentrace() below
## Validate to get the optimism-corrected slope
res.val <- validate(res.lrm1, B = 200)
res.val
index.orig training test optimism index.corrected n
Dxy 0.3574 0.3658 0.3457 0.0201 0.3373 200
R2 0.0707 0.0752 0.0662 0.0090 0.0618 200
Intercept 0.0000 0.0000 -0.1268 0.1268 -0.1268 200
Slope 1.0000 1.0000 0.9438 0.0562 0.9438 200
Emax 0.0000 0.0000 0.0379 0.0379 0.0379 200
D 0.0313 0.0333 0.0292 0.0041 0.0272 200
U -0.0005 -0.0005 0.0001 -0.0006 0.0001 200
Q 0.0317 0.0338 0.0291 0.0047 0.0271 200
B 0.0743 0.0743 0.0746 -0.0003 0.0746 200
g 0.6921 0.7144 0.6706 0.0438 0.6483 200
gp 0.0532 0.0548 0.0516 0.0032 0.0500 200
## Use the optimism-corrected slope as the shrinkage factor
shrinkage.factor <- res.val["Slope","index.corrected"]
## Show results
res.coef <- data.frame(Original = coef(res.lrm1),
Shrunk.boot = c(coef(res.lrm1)[1], coef(res.lrm1)[-1] * shrinkage.factor))
round(res.coef, 3)
Original Shrunk.boot
Intercept -4.161 -4.161
age 0.031 0.029
systol -0.013 -0.012
diastol 0.017 0.016
asso1 0.461 0.435
asso2 0.267 0.252
asso3 0.857 0.809
asso4 -0.301 -0.284
asso5 0.024 0.023
past1 0.238 0.225
past2 -0.007 -0.006
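Note that uniform shrinkage only rescales the slopes; strictly speaking, the intercept should then be re-estimated so that the average predicted risk still matches the observed risk. A minimal sketch, assuming the objects above (this step is not part of the output shown):
## Re-estimate the intercept with the shrunken linear predictor entered as an offset
lp.shrunk <- as.vector(res.lrm1$x %*% (coef(res.lrm1)[-1] * shrinkage.factor))
refit.intercept <- glm(res.lrm1$y ~ offset(lp.shrunk), family = binomial)
coef(refit.intercept)  # re-calibrated intercept for the shrunken slopes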
Penalized maximum likelihood estimation
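With a penalty, lrm maximizes a penalized log-likelihood, roughly \( \log L - \frac{1}{2} \lambda \sum_j (s_j \beta_j)^2 \), where \( \lambda \) is the penalty factor and \( s_j \) scales each non-intercept coefficient; pentrace() searches a grid of \( \lambda \) values for the one giving the best penalized information criterion.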
## Check AIC/BIC at each penalty value
res.pentrance <- pentrace(fit = res.lrm1, penalty = 0:30)
res.pentrance
Best penalty:
penalty df
13 9.552
penalty df aic bic aic.c
0 10.000 117.7 53.85 117.6
1 9.963 117.7 54.15 117.7
2 9.927 117.8 54.44 117.8
3 9.891 117.9 54.73 117.8
4 9.856 117.9 55.01 117.9
5 9.821 118.0 55.27 117.9
6 9.786 118.0 55.53 117.9
7 9.751 118.0 55.78 118.0
8 9.717 118.1 56.03 118.0
9 9.684 118.1 56.27 118.0
10 9.650 118.1 56.50 118.0
11 9.617 118.1 56.72 118.1
12 9.585 118.1 56.93 118.1
13 9.552 118.1 57.15 118.1
14 9.520 118.1 57.35 118.1
15 9.488 118.1 57.55 118.1
16 9.457 118.1 57.74 118.1
17 9.426 118.1 57.93 118.0
18 9.395 118.1 58.11 118.0
19 9.364 118.1 58.29 118.0
20 9.334 118.0 58.46 118.0
21 9.304 118.0 58.63 118.0
22 9.274 118.0 58.79 117.9
23 9.244 118.0 58.95 117.9
24 9.215 117.9 59.11 117.9
25 9.186 117.9 59.26 117.9
26 9.157 117.9 59.41 117.8
27 9.129 117.8 59.55 117.8
28 9.100 117.8 59.69 117.7
29 9.072 117.7 59.83 117.7
30 9.044 117.7 59.96 117.6
plot(res.pentrance, which = c("aic","aic.c"), ylim = c(117,119))
res.pentrance$penalty
[1] 13
## Perform penalized maximum likelihood estimation
res.lrm1.penal <- update(res.lrm1, penalty = res.pentrance$penalty)
## Show results
res.coef$Penalized <- res.lrm1.penal$coefficients
round(res.coef, 3)
Original Shrunk.boot Penalized
Intercept -4.161 -4.161 -4.051
age 0.031 0.029 0.029
systol -0.013 -0.012 -0.011
diastol 0.017 0.016 0.015
asso1 0.461 0.435 0.445
asso2 0.267 0.252 0.262
asso3 0.857 0.809 0.844
asso4 -0.301 -0.284 -0.285
asso5 0.024 0.023 0.023
past1 0.238 0.225 0.233
past2 -0.007 -0.006 0.006
Least absolute shrinkage and selection operator (Lasso)
## Load glmpath (L1 Regularization Path for Generalized Linear Models and Cox Proportional Hazards Model)
library(glmpath)
## Use the lrm object (with x and y stored) as the data argument
## Fit logistic models over a range of L1 penalty bounds
res.path <- glmpath(data = res.lrm1)
## Plot results
par(mar = c(5.1, 4.1, 4.1, 5.1))
plot(res.path, type = c("coefficients", "aic", "bic")[1]) # coefficients
plot(res.path, type = c("coefficients", "aic", "bic")[2]) # AIC
plot(res.path, type = c("coefficients", "aic", "bic")[3]) # BIC
## List b.predictor (matrix of coefficient estimates from the predictor steps) with aic at each L1 bound
b.pred.aic <- cbind(res.path$b.predictor, aic = res.path$aic)
b.pred.aic
Intercept age systol diastol asso1 asso2 asso3 asso4 asso5 past1 past2 aic
1 -2.390 0.0000000 0.0000000 0.00000 0.0000 0.00000 0.0000 0.00000 0.00000 0.00000 0.000000 2523
2 -2.426 0.0006307 0.0000000 0.00000 0.0000 0.00000 0.0000 0.00000 0.00000 0.00000 0.000000 2523
3 -2.951 0.0093323 0.0000000 0.00000 0.0000 0.00000 0.5561 0.00000 0.00000 0.00000 0.000000 2472
4 -3.338 0.0147716 0.0000000 0.00000 0.1657 0.00000 0.6744 0.00000 0.00000 0.00000 0.000000 2446
5 -3.464 0.0164212 0.0000000 0.00000 0.2006 0.03450 0.6913 0.00000 0.00000 0.00000 0.000000 2441
6 -3.578 0.0177652 0.0000000 0.00000 0.2362 0.06685 0.7213 0.00000 0.00000 0.03915 0.000000 2436
7 -3.580 0.0177789 0.0000000 0.00000 0.2361 0.06669 0.7208 0.00000 0.00000 0.03886 0.000000 2436
8 -3.568 0.0189574 -0.0007545 0.00000 0.2616 0.09031 0.7411 0.00000 0.00000 0.06289 0.000000 2433
9 -3.525 0.0213586 -0.0022450 0.00000 0.3266 0.14253 0.7797 -0.08880 0.00000 0.11500 0.000000 2423
10 -3.526 0.0213805 -0.0022584 0.00000 0.3265 0.14238 0.7793 -0.08978 0.00000 0.11440 0.000000 2423
11 -4.061 0.0293778 -0.0110140 0.01486 0.4453 0.25403 0.8449 -0.27005 0.00000 0.22264 0.000000 2402
12 -4.145 0.0304208 -0.0122779 0.01688 0.4575 0.26499 0.8553 -0.29623 0.01941 0.23282 0.000000 2403
13 -4.145 0.0304242 -0.0122825 0.01688 0.4575 0.26503 0.8554 -0.29632 0.01937 0.23284 0.000000 2403
14 -4.161 0.0306835 -0.0125122 0.01726 0.4608 0.26731 0.8567 -0.30095 0.02434 0.23813 -0.006538 2405
## Show coefficients with the smallest AIC
b.pred.aic[which.min(res.path$aic), , drop = FALSE]
Intercept age systol diastol asso1 asso2 asso3 asso4 asso5 past1 past2 aic
11 -4.061 0.02938 -0.01101 0.01486 0.4453 0.254 0.8449 -0.2701 0 0.2226 0 2402
## Extract coefficients for which the AIC is the smallest
res.coef$Lasso <- res.path$b.predictor[which.min(res.path$aic),]
## Show the results (asso5 and past2 coefficients are set to zero)
round(res.coef, 3)
Original Shrunk.boot Penalized Lasso
Intercept -4.161 -4.161 -4.051 -4.061
age 0.031 0.029 0.029 0.029
systol -0.013 -0.012 -0.011 -0.011
diastol 0.017 0.016 0.015 0.015
asso1 0.461 0.435 0.445 0.445
asso2 0.267 0.252 0.262 0.254
asso3 0.857 0.809 0.844 0.845
asso4 -0.301 -0.284 -0.285 -0.270
asso5 0.024 0.023 0.023 0.000
past1 0.238 0.225 0.233 0.223
past2 -0.007 -0.006 0.006 0.000
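The same L1 path can also be fit with the glmnet package, which typically chooses the penalty by cross-validation rather than AIC; a minimal sketch, assuming the x and y stored in res.lrm1 (glmnet is not used elsewhere in these notes):
## Alternative: L1-penalized logistic regression via glmnet
library(glmnet)
res.glmnet <- glmnet(x = res.lrm1$x, y = res.lrm1$y, family = "binomial", alpha = 1)
plot(res.glmnet, xvar = "lambda", label = TRUE)    # coefficient paths
cv.fit <- cv.glmnet(x = res.lrm1$x, y = res.lrm1$y, family = "binomial", alpha = 1)
coef(cv.fit, s = "lambda.min")                     # coefficients at the CV-optimal lambda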
Resubstitution (performance assessed on the training set)
## Load pROC
library(pROC)
## AIC model
roc.forward.aic <- roc(forward.aic$model$mi ~ forward.aic$fitted.values)
plot(roc.forward.aic, print.thres = quantile(fitted(forward.aic)))
Call:
roc.formula(formula = forward.aic$model$mi ~ forward.aic$fitted.values)
Data: forward.aic$fitted.values in 4006 controls (forward.aic$model$mi 0) < 367 cases (forward.aic$model$mi 1).
Area under the curve: 0.678
## BIC model
roc.forward.bic <- roc(forward.bic$model$mi ~ forward.bic$fitted.values)
plot(roc.forward.bic, print.thres = quantile(fitted(forward.bic)))
Call:
roc.formula(formula = forward.bic$model$mi ~ forward.bic$fitted.values)
Data: forward.bic$fitted.values in 4006 controls (forward.bic$model$mi 0) < 367 cases (forward.bic$model$mi 1).
Area under the curve: 0.658
## Penalized maximum likelihood estimation model
roc.pmle <- roc(res.lrm1.penal$y ~ predict(res.lrm1.penal, type = "fitted"))
plot(roc.pmle, print.thres = quantile(predict(res.lrm1.penal, type = "fitted")))
Call:
roc.formula(formula = res.lrm1.penal$y ~ predict(res.lrm1.penal, type = "fitted"))
Data: predict(res.lrm1.penal, type = "fitted") in 4006 controls (res.lrm1.penal$y 0) < 367 cases (res.lrm1.penal$y 1).
Area under the curve: 0.679
Calibration
Several packages provide the Hosmer-Lemeshow goodness-of-fit test and calibration plots:
MKmisc::HLgof.test: Hosmer-Lemeshow goodness-of-fit tests
PredictABEL::plotCalibration: calibration plot and Hosmer-Lemeshow goodness-of-fit test
ResourceSelection::hoslem.test: Hosmer-Lemeshow goodness-of-fit (GOF) test
vcdExtra::HosmerLemeshow: Hosmer-Lemeshow goodness-of-fit test
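For example, ResourceSelection::hoslem.test can be applied directly to the observed outcomes and fitted probabilities; a minimal sketch using the AIC model (the output shown below was produced with PredictABEL::plotCalibration):
## Hypothetical alternative: Hosmer-Lemeshow test via ResourceSelection
library(ResourceSelection)
hoslem.test(forward.aic$model$mi, fitted(forward.aic), g = 10)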
## Load PredictABEL
library(PredictABEL)
## AIC model (cOutcome = 31 is the column number of the outcome mi in cpain)
plotCalibration(data = cpain, cOutcome = 31, predRisk = fitted(forward.aic))
$Table_HLtest
total meanpred meanobs predicted observed
[0.0116,0.0352) 438 0.029 0.025 12.69 11
[0.0352,0.0436) 437 0.040 0.032 17.28 14
[0.0436,0.0515) 437 0.048 0.050 20.81 22
[0.0515,0.0604) 438 0.056 0.059 24.52 26
[0.0604,0.0703) 437 0.065 0.062 28.42 27
[0.0703,0.0809) 437 0.075 0.082 32.94 36
[0.0809,0.0948) 438 0.088 0.096 38.43 42
[0.0948,0.1133) 437 0.104 0.103 45.28 45
[0.1133,0.1494) 437 0.129 0.103 56.33 45
[0.1494,0.5283] 437 0.207 0.227 90.29 99
$Chi_square
[1] 5.459
$df
[1] 8
$p_value
[1] 0.7075
## BIC model
plotCalibration(data = cpain, cOutcome = 31, predRisk = fitted(forward.bic))
$Table_HLtest
total meanpred meanobs predicted observed
[0.0323,0.0407) 481 0.036 0.025 17.35 12
[0.0407,0.0486) 442 0.044 0.034 19.62 15
[0.0486,0.0562) 425 0.052 0.059 22.08 25
[0.0562,0.0653) 408 0.060 0.051 24.53 21
[0.0653,0.0758) 511 0.070 0.070 35.84 36
[0.0758,0.0836) 373 0.079 0.088 29.48 33
[0.0836,0.0952) 444 0.089 0.124 39.66 55
[0.0952,0.1116) 437 0.103 0.105 45.02 46
[0.1116,0.1354) 417 0.122 0.113 50.88 47
[0.1354,0.4093] 435 0.190 0.177 82.53 77
$Chi_square
[1] 11.62
$df
[1] 8
$p_value
[1] 0.1688
## Penalized maximum likelihood estimation model
plotCalibration(data = cpain, cOutcome = 31, predRisk = predict(res.lrm1.penal, type = "fitted"))
$Table_HLtest
total meanpred meanobs predicted observed
[0.0142,0.0370) 439 0.031 0.021 13.60 9
[0.0370,0.0450) 436 0.041 0.032 17.99 14
[0.0450,0.0529) 437 0.049 0.057 21.48 25
[0.0529,0.0617) 438 0.057 0.055 25.12 24
[0.0617,0.0707) 438 0.066 0.068 28.98 30
[0.0707,0.0818) 437 0.076 0.080 33.24 35
[0.0818,0.0948) 437 0.088 0.092 38.44 40
[0.0948,0.1123) 437 0.103 0.110 45.00 48
[0.1123,0.1460) 437 0.127 0.098 55.46 43
[0.1460,0.5038] 437 0.201 0.227 87.69 99
$Chi_square
[1] 8.654
$df
[1] 8
$p_value
[1] 0.3723
Somers' D rank correlation (\( D_{xy} \)) is defined as
\( D_{xy} = 2 \times (\text{C-statistic} - 0.5) \)
This represents the proportion of the upper-left triangle of an ROC plot that is covered by the ROC curve.
Therefore,
C-statistic = \( (D_{xy} + 1) / 2 \)
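A quick check of this relation with Hmisc::somers2 (Hmisc is loaded along with rms; the AIC model is used purely for illustration):
## Check: recover the C-statistic (AUC) from Dxy
library(Hmisc)
sd.check <- somers2(fitted(forward.aic), forward.aic$model$mi)
sd.check["C"]              # C-statistic
(sd.check["Dxy"] + 1) / 2  # same value recovered from Dxy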
Full model
## Cross validation for optimism-corrected performance measures
res.lrm1.valid.cross <- validate(res.lrm1, method = "crossvalidation") # can be abbreviated to "cross"
CalculateAucFromDxy(res.lrm1.valid.cross)
index.orig training test optimism index.corrected n
Dxy 0.3573876 0.357504 0.3355955 0.0219086 0.3354790 40
R2 0.0707380 0.070873 0.0787577 -0.0078847 0.0786227 40
Intercept 0.0000000 0.000000 -0.1078626 0.1078626 -0.1078626 40
Slope 1.0000000 1.000000 0.9911957 0.0088043 0.9911957 40
Emax 0.0000000 0.000000 0.0272571 0.0272571 0.0272571 40
D 0.0312549 0.031309 0.0258063 0.0055026 0.0257522 40
U -0.0004574 -0.000469 -0.0003499 -0.0001191 -0.0003382 40
Q 0.0317122 0.031778 0.0261562 0.0056218 0.0260905 40
B 0.0743002 0.074288 0.0750333 -0.0007454 0.0750456 40
g 0.6920656 0.692719 0.6915207 0.0011988 0.6908668 40
gp 0.0532117 0.053254 0.0507137 0.0025402 0.0506715 40
AUC 0.6786938 0.678752 0.6677978 0.0109543 0.6677395 40
## Bootstrap validation for optimism-corrected performance measures
res.lrm1.valid.boot <- validate(res.lrm1, method = "boot")
CalculateAucFromDxy(res.lrm1.valid.boot)
index.orig training test optimism index.corrected n
Dxy 0.3573876 0.3753310 0.3460767 0.0292543 0.3281333 40
R2 0.0707380 0.0796286 0.0663369 0.0132917 0.0574463 40
Intercept 0.0000000 0.0000000 -0.1858767 0.1858767 -0.1858767 40
Slope 1.0000000 1.0000000 0.9185389 0.0814611 0.9185389 40
Emax 0.0000000 0.0000000 0.0560911 0.0560911 0.0560911 40
D 0.0312549 0.0354081 0.0292681 0.0061401 0.0251148 40
U -0.0004574 -0.0004574 0.0002744 -0.0007318 0.0002744 40
Q 0.0317122 0.0358655 0.0289937 0.0068718 0.0248404 40
B 0.0743002 0.0742659 0.0745846 -0.0003187 0.0746189 40
g 0.6920656 0.7326782 0.6682172 0.0644610 0.6276046 40
gp 0.0532117 0.0564110 0.0515543 0.0048568 0.0483549 40
AUC 0.6786938 0.6876655 0.6730383 0.0146272 0.6640667 40
Penalized maximum likelihood estimation model
## Cross validation for optimism-corrected performance measures
res.lrm1.penal.valid.cross <- validate(res.lrm1.penal, method = "crossvalidation") # can be abbreviated to "cross"
CalculateAucFromDxy(res.lrm1.penal.valid.cross)
index.orig training test optimism index.corrected n
Dxy 0.3574720 0.357532 0.328953 0.0285789 0.328893 40
R2 0.0676797 0.067728 0.079530 -0.0118020 0.079482 40
Intercept 0.0000000 0.000000 -0.053875 0.0538753 -0.053875 40
Slope 1.0000000 1.000000 1.017126 -0.0171261 1.017126 40
Emax 0.0000000 0.000000 0.014559 0.0145593 0.014559 40
D 0.0311509 0.029888 0.026247 0.0036417 0.027509 40
U -0.0004574 -0.000469 -0.001586 0.0011171 -0.001574 40
Q 0.0316083 0.030357 0.027833 0.0025246 0.029084 40
B 0.0742938 0.074282 0.074997 -0.0007149 0.075009 40
g 0.6581415 0.657905 0.661767 -0.0038628 0.662004 40
gp 0.0509246 0.050908 0.048784 0.0021241 0.048800 40
AUC 0.6787360 0.678766 0.664477 0.0142895 0.664447 40
## Bootstrap validation for optimism-corrected performance measures
res.lrm1.penal.valid.boot <- validate(res.lrm1.penal, method = "boot")
CalculateAucFromDxy(res.lrm1.penal.valid.boot)
index.orig training test optimism index.corrected n
Dxy 0.3574720 0.3600932 0.34598528 0.0141080 0.34336401 40
R2 0.0676797 0.0691092 0.06582083 0.0032884 0.06439133 40
Intercept 0.0000000 0.0000000 0.02090288 -0.0209029 0.02090288 40
Slope 1.0000000 1.0000000 1.00430575 -0.0043058 1.00430575 40
Emax 0.0000000 0.0000000 0.00542448 0.0054245 0.00542448 40
D 0.0311509 0.0303455 0.02903458 0.0013109 0.02984005 40
U -0.0004574 -0.0004574 -0.00003696 -0.0004204 -0.00003696 40
Q 0.0316083 0.0308028 0.02907155 0.0017313 0.02987701 40
B 0.0742938 0.0736271 0.07454811 -0.0009210 0.07521485 40
g 0.6581415 0.6647026 0.66407790 0.0006247 0.65751675 40
gp 0.0509246 0.0509189 0.05131133 -0.0003924 0.05131698 40
AUC 0.6787360 0.6800466 0.67299264 0.0070540 0.67168201 40
## Create uncorrected AUC
library(pROC)
original_model <- glm(formula = mi ~ age + asso3 + asso1 + systol + diastol + asso4 + asso2 + past1,
data = cpain, family = binomial)
original_model_predicted <- predict(original_model, type = "response")
uncorrected_roc <- roc(cpain$mi ~ original_model_predicted)
uncorrected_auc <- uncorrected_roc$auc
## Create function for bootstrapping
fun.auc <- function(DF, i) {
## DF # Original full sample
## DF[i,] # Bootstrap sample (use index i as row numbers)
## Develop model on bootstrap sample
boot_model <- glm(formula = mi ~ age + asso3 + asso1 + systol + diastol + asso4 + asso2 + past1,
data = DF[i,], family = binomial)
## Measure AUC on bootstrap sample using bootstrap model (training set AUC)
boot_predicted <- predict(boot_model, type = "response")
boot_roc <- roc(DF[i,"mi"] ~ boot_predicted)
boot_auc <- as.numeric(boot_roc$auc)
## Measure AUC on original sample using bootstrap model (testing set AUC)
original_predicted <- predict(boot_model, type = "response", newdata = DF)
original_roc <- roc(DF[ ,"mi"] ~ original_predicted) # No index i
original_auc <- as.numeric(original_roc$auc)
## training set AUC - testing set AUC = optimism in AUC
auc_optimism <- (boot_auc - original_auc)
c(training = boot_auc, testing = original_auc, optimism = auc_optimism)
}
## Perform bootstrapping 100 times
library(boot)
## No parallel backend needs to be loaded explicitly; parallel = "multicore" is handled by boot via the parallel package
system.time(res.boot <- boot(data = cpain, statistic = fun.auc, R = 100, parallel = "multicore", ncpus = 8))
user system elapsed
98.061 1.816 20.895
res.boot.table <- as.data.frame(res.boot$t)
## Extract result for each bootstrap sample
names(res.boot.table) <- c("training.auc","testing.auc","optimism")
head(res.boot.table)
training.auc testing.auc optimism
1 0.6844 0.6683 0.016066
2 0.6766 0.6716 0.005027
3 0.6949 0.6702 0.024704
4 0.6786 0.6754 0.003188
5 0.6755 0.6741 0.001397
6 0.6863 0.6768 0.009549
## Mean optimism
mean.optimism <- mean(res.boot.table$optimism)
mean.optimism
[1] 0.007859
## Uncorrected AUC from original sample
uncorrected_auc
Area under the curve: 0.678
## Optimism-corrected AUC = Uncorrected AUC - Mean optimism
uncorrected_auc - mean.optimism
Area under the curve: 0.671
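Note that subtracting a scalar from a pROC auc object keeps the auc class, which is why the corrected value is still printed with the "Area under the curve" label; wrapping it in as.numeric(), e.g. as.numeric(uncorrected_auc) - mean.optimism, returns a plain number.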