To train the models, the hyperparameter grids are defined first, followed by the training control parameters.
###Grids & training control
{
ELGrid = expand.grid(alpha = seq(0,1,by=0.1),lambda = seq(0,0.1,by = 0.005))
RFGrid = expand.grid(
.mtry=2:10,
.splitrule=c("gini","extratrees"),
.min.node.size=c(1,3,5)
)
XGBGrid <- expand.grid(
nrounds = seq(from = 0, to = 1000, by = 200),
eta = c(0.05, 0.1, 0.3),
max_depth = c(2, 4, 6),
gamma = c(0.5,0.7,1.0),
colsample_bytree = c(0.4,0.8),
min_child_weight = c(1,2,3),
subsample = c(0.5,0.75))
# 10-fold cross-validation repeated 5 times: used for hyperparameter tuning
cctrl1 <- trainControl(method="repeatedcv", number=10, returnResamp = "all", repeats=5,
classProbs = TRUE, summaryFunction = multiClassSummary, savePredictions = TRUE, verboseIter = TRUE, allowParallel = TRUE)
# 1000 bootstrap resamples: used to validate the selected hyperparameters
cctrl3 <- trainControl(method="boot", number=1000, returnResamp = "all",
classProbs = TRUE, summaryFunction = multiClassSummary, savePredictions = TRUE, verboseIter = TRUE)
# No resampling: a single fit at fixed hyperparameters (used for the ensemble base models)
nocctrl <- trainControl(method="none", classProbs = TRUE)
p="EarlyRec"
fm <- as.formula( paste( p, ".", sep=" ~ "))
rm(p)
}
Each model is then fitted, and the hyperparameter combination that yields the highest mean AUC across 5 repeats of 10-fold cross-validation is chosen as the optimum. This combination can then be validated using 1000 bootstrap resamples, as below. Note the importance='permutation' argument given to the random forest model, which is required for the variable importance calculations later.
####Elastic Net
registerDoParallel(cores=3)
system.time(ELRFull<- train(fm, data=dataC, method = "glmnet",preProcess=c('knnImpute'), trControl = cctrl1,
metric = "AUC",na.action=na.pass,tuneGrid = ELGrid))
system.time(ELRFinal<-train(fm, data=dataC, method = "glmnet",preProcess=c('knnImpute'), trControl = cctrl3,
metric = "AUC",na.action=na.pass,tuneGrid = expand.grid(alpha = ELRFull$bestTune$alpha,lambda = ELRFull$bestTune$lambda)))
registerDoSEQ()
saveRDS(ELRFull,file="ELRFull4.1.rds")
saveRDS(ELRFinal,file="ELRFinal4.1.rds")
####Random Forest
registerDoParallel(cores=3)
system.time(RFFull<-train(fm, data=dataC, method="ranger", num.trees=1000,preProcess=c('knnImpute'),
na.action=na.pass,replace=TRUE, trControl= cctrl1, tuneGrid=RFGrid, metric ="AUC"))
system.time(RFFinal<-train(fm, data=dataC, method="ranger", num.trees=1000,preProcess=c('knnImpute'),
na.action=na.pass,replace=TRUE,importance='permutation',trControl= cctrl3,
tuneGrid=RFFull$bestTune, metric ="AUC"))
registerDoSEQ()
saveRDS(RFFull,file="RFFull4.1.rds")
saveRDS(RFFinal,file="RFFinal4.1.rds")
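Because the final random forest is fitted with importance='permutation', its variable importance can be inspected directly from the caret object once training has finished; a minimal sketch, assuming the RFFinal object created above:
# Permutation variable importance from the final ranger fit (assumes RFFinal from above)
rf_imp <- caret::varImp(RFFinal, scale = TRUE)
rf_imp                    # ranked importance table
plot(rf_imp, top = 10)    # dot plot of the ten most influential variables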
####XGBoost
registerDoParallel(cores=3)
XGBFull <-train(fm,data=dataC,preProcess=c('knnImpute'),method="xgbTree",trControl=cctrl1,tuneGrid=XGBGrid,
verbose=T,metric="AUC",na.action = na.pass)
final_grid<-XGBFull$bestTune
XGBFinal<-train(fm,data=dataC,preProcess=c('knnImpute'),method="xgbTree",trControl=cctrl3,tuneGrid=final_grid,
verbose=T,metric="AUC",na.action = na.pass)
registerDoSEQ()
saveRDS(XGBFull,'XGBFull4.1.rds')
saveRDS(XGBFinal,'XGBFinal4.1.rds')
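The tuning objects can be queried to confirm which hyperparameter combination won and how it performed under the repeated cross-validation; a minimal sketch, assuming the objects fitted above:
# Winning hyperparameter combinations (highest mean AUC over 5 x 10-fold CV)
ELRFull$bestTune
RFFull$bestTune
XGBFull$bestTune
# Resampled performance of the winning elastic net combination
merge(ELRFull$bestTune, ELRFull$results)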
The ensemble model needs to be trained slightly differently because of the linear blending of probabilities. The dataset is first split into in-bootstrap and out-of-bootstrap samples, and the base models are trained as above on the in-bootstrap sample. These models are then used to predict on the out-of-bootstrap sample, and the predicted probabilities are combined as a linear blend using logistic regression. The prediction function defined below ('Enspredfct') is then used to generate predictions from the 'EnsembleFinal' object.
####Ensemble
{
ensboot<-createResample(dataC$EarlyRec,times=1)
enstrain1<-dataC[ensboot[[1]],]
enstrain2<-dataC[-ensboot[[1]],]
registerDoParallel(cores=3)
# Base models are tuned and refitted on the in-bootstrap sample (enstrain1)
ELRFull2<- train(EarlyRec~., data=enstrain1, method = "glmnet",preProcess=c('knnImpute'), trControl = cctrl1,
metric = "AUC",na.action=na.pass,tuneGrid = ELGrid)
ELRFinal2<- train(EarlyRec~., data=enstrain1, method = "glmnet",preProcess=c('knnImpute'), trControl = nocctrl,
metric = "AUC",na.action=na.pass,tuneGrid = ELRFull2$bestTune)
qsave(ELRFinal2,'ELRFinal2ens.q')
RFFull2<-train(EarlyRec~., data=enstrain1, method="ranger", num.trees=1000,preProcess=c('knnImpute') ,na.action=na.pass,replace=TRUE,
trControl= cctrl1, tuneGrid=RFGrid, metric ="AUC")
RFFinal2<-train(EarlyRec~., data=enstrain1, method="ranger", num.trees=1000,preProcess=c('knnImpute') ,na.action=na.pass,replace=TRUE,
trControl= nocctrl, tuneGrid=RFFull2$bestTune, metric ="AUC")
qsave(RFFinal2,'RFFinal2ens.q')
system.time(XGBFull2 <-train(EarlyRec~.,data=enstrain1,preProcess=c('knnImpute'),method="xgbTree",trControl=cctrl1,tuneGrid=XGBGrid,
verbose=T,metric="AUC",na.action = na.pass))
XGBFinal2 <-train(EarlyRec~.,data=enstrain1,preProcess=c('knnImpute'),method="xgbTree",trControl=nocctrl,tuneGrid=XGBFull2$bestTune,
verbose=T,metric="AUC",na.action = na.pass)
qsave(XGBFinal2,'XGBFinal2ens.q')
registerDoSEQ()
rm(ELRFull2,RFFull2,XGBFull2)
RFFinal2<-qread('RFFinal2ens.q')
ELRFinal2<-qread('ELRFinal2ens.q')
# Out-of-bootstrap predicted probabilities from each base model, to be blended below
preds<-as.data.frame(1:nrow(na.omit(enstrain2)))
preds$ELR<-predict(ELRFinal2,enstrain2,type='prob')[,'Yes']
preds$RF<-predict(RFFinal2,enstrain2,type='prob')[,'Yes']
preds$XGB<-predict(XGBFinal2,enstrain2,type='prob')[,'Yes']
preds$Obs<-na.omit(enstrain2)$EarlyRec
preds<-select(preds,-1)
EnsLRM<-train(Obs~., data=preds, method = "glm",preProcess=c('knnImpute'), trControl = nocctrl,
metric = "AUC",na.action=na.pass)
EnsembleFinal<-list(ELRFinal2,RFFinal2,XGBFinal2,EnsLRM)
# Prediction function for the ensemble: expects a list whose fourth element is the
# EnsembleFinal list (the three base models plus the blending glm), as used later via 'Finalmodels'
Enspredfct=function(model,x){
preds<-as.data.frame(predict(model[[4]][[1]],x,type='prob')[,'Yes'])
colnames(preds)<-'ELR'
preds$RF<-predict(model[[4]][[2]],x,type='prob')[,'Yes']
preds$XGB<-predict(model[[4]][[3]],x,type='prob')[,'Yes']
preds$LRM<-predict(model[[4]][[4]],preds,type='prob')[,'Yes']
preds$LRM
}
}
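Because Enspredfct indexes the fourth element of whatever list it is given, it should be passed the packaged model list rather than EnsembleFinal itself. A minimal usage sketch, assuming a hypothetical packaging consistent with how 'Finalmodels' is indexed later (elements 1-3 the base models, element 4 the ensemble list, element 5 the predict function):
# Hypothetical packaging of the fitted objects (assumption; matches the indexing used later)
Finalmodels <- list(ELRFinal2, RFFinal2, XGBFinal2, EnsembleFinal, Enspredfct)
# Blended probability of 'Yes' for the out-of-bootstrap sample
ens_probs <- Enspredfct(Finalmodels, na.omit(enstrain2))
head(ens_probs)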
As above, the data are first split into in-bootstrap and out-of-bootstrap samples (the outer bootstrap, 500 times). Each model is then trained on the outer-bootstrap training samples, with hyperparameters optimised by an inner bootstrap (1000 times). The same inner and outer bootstrap cases are used for each modelling strategy to ensure the models can be combined and compared easily. The inner bootstrap resamples are then extracted, and isotonic regression and logistic regression (for the ensemble) models are fitted using the inner bootstrap testing resamples.
The base models and the isotonic and logistic regression models can then be applied to the outer-bootstrap testing data to obtain performance metrics. For each outer bootstrap resample, the 1000 inner bootstrap outputs are averaged, and these are then averaged across all the outer bootstrap resamples for the final metrics and plots (a short pooling sketch is given after the loop below).
###Outer bootstrap
part<-createResample(dataC$EarlyRec,times=500)   # outer bootstrap indices
outerbootdata<-1                                 # container, filled one outer resample at a time
####Inner bootstrap
system.time(for (p in 1:500){
trainpart<-dataC[part[[p]],]
testpart<-dataC[-part[[p]],]
boot<-createResample(trainpart$EarlyRec,times=1000)
cctrl3x <- trainControl(index=boot,method="boot", number=1000,returnResamp = "all",
classProbs = TRUE, summaryFunction = multiClassSummary, savePredictions = TRUE, verboseIter = TRUE)
####Train models
registerDoParallel(cores=3)
ELR<-train(EarlyRec~., data=trainpart,preProcess=c('knnImpute'),method = "glmnet", trControl = cctrl3x,
metric = "AUC",na.action=na.pass,tuneGrid = Finalmodels[[1]]$bestTune)
RF<-train(EarlyRec~., data=trainpart,preProcess=c('knnImpute'), method="ranger", num.trees=1000,
na.action=na.pass,replace=TRUE,importance='none',trControl= cctrl3x,
tuneGrid=Finalmodels[[2]]$bestTune, metric ="AUC")
XGB<-train(EarlyRec~.,data=trainpart,preProcess=c('knnImpute'),method="xgbTree",
trControl=cctrl3x,tuneGrid=Finalmodels[[3]]$bestTune,verbose=T,metric="AUC",na.action = na.pass)
registerDoSEQ()
cabs<-1
system.time(for (z in 1:length(levels(as.factor(ELR$pred$Resample)))){
ELR$pred$Resample<-as.factor(ELR$pred$Resample)
RF$pred$Resample<-as.factor(RF$pred$Resample)
XGB$pred$Resample<-as.factor(XGB$pred$Resample)
x1<-ELR$pred[ELR$pred$Resample==levels(ELR$pred$Resample)[z],]
x2<-RF$pred[RF$pred$Resample==levels(RF$pred$Resample)[z],]
x3<-XGB$pred[XGB$pred$Resample==levels(XGB$pred$Resample)[z],]
preds<-as.data.frame(cbind(x1$Yes,x2$Yes,x3$Yes,as.numeric(x1$obs)-1))
colnames(preds)<-c('ELR','RF','XGB','obs')
elrcal<-CORElearn::calibrate(as.factor(preds$obs),preds$ELR,class1=2,method='isoReg',assumeProbabilities = TRUE)
rfcal<-CORElearn::calibrate(as.factor(preds$obs),preds$RF,class1=2,method='isoReg',assumeProbabilities = TRUE)
xgbcal<-CORElearn::calibrate(as.factor(preds$obs),preds$XGB,class1=2,method='isoReg',assumeProbabilities = TRUE)
preds$ELRc<-applyCalibration(preds$ELR,elrcal)
preds$RFc<-applyCalibration(preds$RF,rfcal)
preds$XGBc<-applyCalibration(preds$XGB,xgbcal)
lrm<-lrm(obs~ELR+RF+XGB,data=preds)
lrmc<-lrm(obs~ELRc+RFc+XGBc,data=preds)
preds$LRM<-predict(lrm,preds,type='fitted')
lrmc2<-lrm(obs~LRM,data=preds)
preds$LRMc<-predict(lrmc,preds,type='fitted')
cabs[z]<-list(list(elrcal,rfcal,xgbcal,lrm,lrmc,lrmc2))
})####30s
predictions<-1
system.time( for (z in 1:length(levels(as.factor(ELR$pred$Resample)))){
elrcal<-cabs[[z]][[1]]
rfcal<-cabs[[z]][[2]]
xgbcal<-cabs[[z]][[3]]
lrm<-cabs[[z]][[4]]
lrmc<-cabs[[z]][[5]]
lrmc2<-cabs[[z]][[6]]
preds2<-as.data.frame(predict(ELR,testpart,type='prob')[,'Yes'])
colnames(preds2)<-'ELR'
preds2$RF<-predict(RF,testpart,type='prob')[,'Yes']
preds2$XGB<-predict(XGB,testpart,type='prob')[,'Yes']
preds2$LRM<-predict(lrm,preds2,type='fitted')
preds2$ELRc<-applyCalibration(preds2$ELR,elrcal)
preds2$RFc<-applyCalibration(preds2$RF,rfcal)
preds2$XGBc<-applyCalibration(preds2$XGB,xgbcal)
preds2$LRMc<-predict(lrmc,preds2,type='fitted')
preds2$LRMc2<-predict(lrmc2,preds2,type='fitted')
preds2$obs<-as.numeric(na.omit(testpart)$EarlyRec)-1
predictions[z]<-list(preds2)
})#####64s
system.time({
avefoldLL<-1
avefoldBR<-1
avefoldCID<-1
calplot1<-1
ROCplot1<-1
cabdata2<-1
rocdata2<-1
for (i in 1:4){
cabdata<-1
brs<-1
LLs<-1
cids<-1
lgt<-1
roc<-1
for (z in 1:1){
df<-as.data.frame(cbind(predictions[[z]][,i],predictions[[z]][,10]))
colnames(df)<-c('Preds','Obs')
brs[z]<-BrierScore(df$Obs,df$Preds)
LLs[z]<-logLoss(df$Obs,df$Preds)
cids[z]<-auc(df$Obs,df$Preds)
lgt[z]<-nrow(df)
xx<-ggplot_build(ggplot(data = df) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = df, aes(Preds, Obs),
color = "#F8766D",se=TRUE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2))
for (m in 1:(length(xx$data[[1]]$y)-1)){
xx$data[[1]]$y[m+1]<-ifelse(xx$data[[1]]$y[m]>0.9&is.na(xx$data[[1]]$y[m+1]),1.0,
ifelse(xx$data[[1]]$y[m]<0.1&is.na(xx$data[[1]]$y[m+1]),0,xx$data[[1]]$y[m+1]))
}
cabdata[z]<-list(select(xx$data[[1]],x,y))
rocobj<-roc(df$Obs,df$Preds)
roc[z]<-list(ggplot_build(ggroc(rocobj,alpha=0.5,colour='red',linetype=6,size=1))$data)
}
avefoldLL[i]<-list(c(mean(LLs),std.error(LLs)))
avefoldBR[i]<-list(c(mean(brs),std.error(brs)))
avefoldCID[i]<-list(c(mean(cids),std.error(cids)))
lgt2<-1
for (n in 1:length(roc)){
lgt2[n]<-length(roc[[n]][[1]]$x)
}
xR<-as.data.frame(roc[[1]][[1]]$x[(1:min(lgt2))])
yR<-as.data.frame(roc[[1]][[1]]$y[(1:min(lgt2))])
for (n in 1:length(roc)){
seqs<-ceiling(seq(from=1,to=length(roc[[n]][[1]]$x),length.out=min(lgt2)))
xR[,n]<-roc[[n]][[1]]$x[seqs]
yR[,n]<-roc[[n]][[1]]$y[seqs]
}
rocdat<-as.data.frame(cbind(-rowMeans(xR),
rowMeans(yR),
rowMeans(yR)+1.96*std.error(t(yR)),
rowMeans(yR)-1.96*std.error(t(yR))))
colnames(rocdat)<-c('x','y','ymax','ymin')
ROCplot1[i]<-list(ggplot(data = rocdat) + xlab("Specificity") + ylab('Sensitivity')+
geom_line(data = rocdat, aes(y=y, x=x),
color = "#F8766D")+xlim(c(1,0))+ylim(c(0,1))+
geom_abline(intercept = 1, slope = 1,linetype=2)+
geom_ribbon(data=rocdat,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
yy<-ggplot_build(ggplot(data = rocdat) + xlab("Specificity") + ylab('Sensitivity')+
geom_line(data = rocdat, aes(y=y, x=x),
color = "#F8766D")+xlim(c(1,0))+ylim(c(0,1))+
geom_abline(intercept = 1, slope = 1,linetype=2)+
geom_ribbon(data=rocdat,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
rocdata2[i]<-list(as.data.frame(c(select(yy$data[[1]],x,y),select(yy$data[[3]],ymax,ymin))))
xC<-as.data.frame(cabdata[[1]]$x)
yC<-as.data.frame(cabdata[[1]]$y)
for (n in 1:length(cabdata)){
xC[,n]<-cabdata[[n]]$x
yC[,n]<-cabdata[[n]]$y
}
yC[is.na(yC)]<-0
cal_plot_data2<-cabdata[[1]]
cal_plot_data2$x<-rowMeans(xC)
cal_plot_data2$y<-rowMeans(yC)
cal_plot_data2$ymin<-rowMeans(yC)-1.96*std.error(t(yC))
cal_plot_data2$ymax<-rowMeans(yC)+1.96*std.error(t(yC))
calplot1[i]<-list(ggplot(data = cal_plot_data2) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = cal_plot_data2, aes(x, y),
color = "#F8766D",se=FALSE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2)+
geom_ribbon(data=cal_plot_data2,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
yy<-ggplot_build(ggplot(data = cal_plot_data2) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = cal_plot_data2, aes(x, y),
color = "#F8766D",se=FALSE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2)+
geom_ribbon(data=cal_plot_data2,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
cabdata2[i]<-list(as.data.frame(c(select(yy$data[[1]],x,y),select(yy$data[[3]],ymax,ymin))))
}
for (i in 5:9){
cabdata<-1
brs<-1
LLs<-1
cids<-1
lgt<-1
roc<-1
for (z in 1:length(predictions)){
df<-as.data.frame(cbind(predictions[[z]][,i],predictions[[z]][,10]))
colnames(df)<-c('Preds','Obs')
brs[z]<-BrierScore(df$Obs,df$Preds)
LLs[z]<-logLoss(df$Obs,df$Preds)
cids[z]<-auc(df$Obs,df$Preds)
lgt[z]<-nrow(df)
xx<-ggplot_build(ggplot(data = df) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = df, aes(Preds, Obs),
color = "#F8766D",se=TRUE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2))
if(nrow(xx$data[[1]])>0){
for (m in 1:(length(xx$data[[1]]$y)-1)){
xx$data[[1]]$y[m+1]<-ifelse(xx$data[[1]]$y[m]>0.9&is.na(xx$data[[1]]$y[m+1]),1.0,
ifelse(xx$data[[1]]$y[m]<0.1&is.na(xx$data[[1]]$y[m+1]),0,xx$data[[1]]$y[m+1]))
}
}else
{
xx<-ggplot_build(ggplot(data = df) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = df, aes(Preds, Obs),
color = "#F8766D",se=TRUE,level=0.95,span=1)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2))
for (m in 1:(length(xx$data[[1]]$y)-1)){
xx$data[[1]]$y[m+1]<-ifelse(xx$data[[1]]$y[m]>0.9&is.na(xx$data[[1]]$y[m+1]),1.0,
ifelse(xx$data[[1]]$y[m]<0.1&is.na(xx$data[[1]]$y[m+1]),0,xx$data[[1]]$y[m+1]))
}
}
cabdata[z]<-list(select(xx$data[[1]],x,y))
rocobj<-roc(df$Obs,df$Preds)
roc[z]<-list(ggplot_build(ggroc(rocobj,alpha=0.5,colour='red',linetype=6,size=1))$data)
}
avefoldLL[i]<-list(c(mean(LLs),std.error(LLs)))
avefoldBR[i]<-list(c(mean(brs),std.error(brs)))
avefoldCID[i]<-list(c(mean(cids),std.error(cids)))
lgt2<-1
for (n in 1:length(roc)){
lgt2[n]<-length(roc[[n]][[1]]$x)
}
xR<-as.data.frame(roc[[1]][[1]]$x[(1:min(lgt2))])
yR<-as.data.frame(roc[[1]][[1]]$y[(1:min(lgt2))])
for (n in 1:length(roc)){
seqs<-ceiling(seq(from=1,to=length(roc[[n]][[1]]$x),length.out=min(lgt2)))
xR[,n]<-roc[[n]][[1]]$x[seqs]
yR[,n]<-roc[[n]][[1]]$y[seqs]
}
rocdat<-as.data.frame(cbind(-rowMeans(xR),
rowMeans(yR),
rowMeans(yR)+1.96*std.error(t(yR)),
rowMeans(yR)-1.96*std.error(t(yR))))
colnames(rocdat)<-c('x','y','ymax','ymin')
ROCplot1[i]<-list(ggplot(data = rocdat) + xlab("Specificity") + ylab('Sensitivity')+
geom_line(data = rocdat, aes(y=y, x=x),
color = "#F8766D")+xlim(c(1,0))+ylim(c(0,1))+
geom_abline(intercept = 1, slope = 1,linetype=2)+
geom_ribbon(data=rocdat,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
yy<-ggplot_build(ggplot(data = rocdat) + xlab("Specificity") + ylab('Sensitivity')+
geom_line(data = rocdat, aes(y=y, x=x),
color = "#F8766D")+xlim(c(1,0))+ylim(c(0,1))+
geom_abline(intercept = 1, slope = 1,linetype=2)+
geom_ribbon(data=rocdat,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
rocdata2[i]<-list(as.data.frame(c(select(yy$data[[1]],x,y),select(yy$data[[3]],ymax,ymin))))
xC<-as.data.frame(cabdata[[1]]$x)
yC<-as.data.frame(cabdata[[1]]$y)
for (n in 1:length(cabdata)){
xC[,n]<-cabdata[[n]]$x
yC[,n]<-cabdata[[n]]$y
}
yC[is.na(yC)]<-0
cal_plot_data2<-cabdata[[1]]
cal_plot_data2$x<-rowMeans(xC)
cal_plot_data2$y<-rowMeans(yC)
cal_plot_data2$ymin<-rowMeans(yC)-1.96*std.error(t(yC))
cal_plot_data2$ymax<-rowMeans(yC)+1.96*std.error(t(yC))
calplot1[i]<-list(ggplot(data = cal_plot_data2) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = cal_plot_data2, aes(x, y),
color = "#F8766D",se=FALSE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2)+
geom_ribbon(data=cal_plot_data2,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
yy<-ggplot_build(ggplot(data = cal_plot_data2) + xlab("Predicted Probability") + ylab('Observed Average')+
geom_smooth(data = cal_plot_data2, aes(x, y),
color = "#F8766D",se=FALSE,level=0.95,span=2/3)+xlim(c(0,1))+ylim(c(0,1))+
geom_abline(intercept = 0, slope = 1,linetype=2)+
geom_ribbon(data=cal_plot_data2,aes(x=x,ymin=ymin,ymax=ymax), fill="grey", alpha=0.25))
cabdata2[i]<-list(as.data.frame(c(select(yy$data[[1]],x,y),select(yy$data[[3]],ymax,ymin))))
}
names(avefoldLL)<-colnames(predictions[[1]])[1:9]
names(avefoldBR)<-colnames(predictions[[1]])[1:9]
names(avefoldCID)<-colnames(predictions[[1]])[1:9]
names(calplot1)<-colnames(predictions[[1]])[1:9]
names(ROCplot1)<-colnames(predictions[[1]])[1:9]
names(cabdata2)<-colnames(predictions[[1]])[1:9]
names(rocdata2)<-colnames(predictions[[1]])[1:9]
})
outerbootdata[p]<-list(list(avefoldLL,avefoldBR,avefoldCID,calplot1,ROCplot1,cabdata2,rocdata2))
names(outerbootdata[[p]])<-c('LogLoss','Brier','C-index','Calibration Plots','ROC Plots','Calibration Data','ROC Data')
qsave(outerbootdata,'outerbootdata.q')
})
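Once the loop has finished, the saved outerbootdata object holds, for every outer resample, the inner-bootstrap-averaged metrics and plot data, so the outer-level averaging described above can be done directly. A minimal pooling sketch, assuming the outerbootdata.q file written above and the plotrix std.error helper used throughout:
# Pool the inner-bootstrap-averaged C-index across the 500 outer resamples
outerbootdata <- qread('outerbootdata.q')
cindex_LRMc <- sapply(outerbootdata, function(ob) ob[['C-index']][['LRMc']][1])   # one value per outer resample
mean(cindex_LRMc)                                              # overall C-index for the calibrated blended model
mean(cindex_LRMc) + c(-1.96, 1.96) * std.error(cindex_LRMc)    # approximate 95% confidence interval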
The partial dependence function is derived below with two example variables (Age and Gender). The prediction function for the ensemble model is defined as earlier (i.e. 'Enspredfct'). Raw values can be extracted from the 'ft' object if required (a short example follows the plotting code).
dataNA<-na.omit(dataC)
expELR<-explain(Finalmodels[[1]],data=dataNA,y=as.numeric(dataNA$EarlyRec)-1,label='ELR')
## Preparation of a new explainer is initiated
## -> model label : ELR
## -> data : 812 rows 12 cols
## -> data : tibble converted into a data.frame
## -> target variable : 812 values
## -> model_info : package caret , ver. 6.0.86 , task Classification ( default )
## -> predict function : yhat.train will be used ( default )
## -> predicted values : numerical, min = 0.04264604 , mean = 0.2842885 , max = 0.9953722
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.9017763 , mean = 0.006351904 , max = 0.9364555
## A new explainer has been created!
expRF<-explain(Finalmodels[[2]],data=dataNA,y=as.numeric(dataNA$EarlyRec)-1,label='RF')
## Preparation of a new explainer is initiated
## -> model label : RF
## -> data : 812 rows 12 cols
## -> data : tibble converted into a data.frame
## -> target variable : 812 values
## -> model_info : package caret , ver. 6.0.86 , task Classification ( default )
## -> predict function : yhat.train will be used ( default )
## -> predicted values : numerical, min = 0.008338665 , mean = 0.2875508 , max = 0.8797991
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.6304979 , mean = 0.00308962 , max = 0.771725
## A new explainer has been created!
expXGB<-explain(Finalmodels[[3]],data=dataNA,y=as.numeric(dataNA$EarlyRec)-1,label='XGBoost')
## Preparation of a new explainer is initiated
## -> model label : XGBoost
## -> data : 812 rows 12 cols
## -> data : tibble converted into a data.frame
## -> target variable : 812 values
## -> model_info : package caret , ver. 6.0.86 , task Classification ( default )
## -> predict function : yhat.train will be used ( default )
## -> predicted values : numerical, min = 0.02298862 , mean = 0.2865324 , max = 0.925137
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.866363 , mean = 0.004108034 , max = 0.9587249
## A new explainer has been created!
# The packaged model list is passed with its custom predict function (element 5; cf. 'Enspredfct' above)
expENS<-explain(Finalmodels,data=dataNA,y=as.numeric(dataNA$EarlyRec)-1,label='Ensemble',predict_function=Finalmodels[[5]])
## Preparation of a new explainer is initiated
## -> model label : Ensemble
## -> data : 812 rows 12 cols
## -> data : tibble converted into a data.frame
## -> target variable : 812 values
## -> model_info : package Model of class: list package unrecognized , ver. Unknown , task regression ( default )
## -> predict function : Finalmodels[[5]]
## -> predicted values : numerical, min = 3.732555e-08 , mean = 0.2998897 , max = 1
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.9847481 , mean = -0.009249295 , max = 0.9784245
## A new explainer has been created!
x<-(ingredients::partial_dependency(expXGB,'Age'))
x1<-(ingredients::partial_dependency(expRF,'Age'))
x2<-(ingredients::partial_dependency(expELR,'Age'))
x3<-(ingredients::partial_dependency(expENS,'Age'))
fintab<-rbind(x,x1,x2,x3)
ft<-rbind(x,x1,x2,x3)
colnames(fintab)<-c('Variable.Name','Model','x','y','ids')
agepdp<-ggplotly(ggplot(fintab,aes(x,y))+
geom_line(data=fintab,mapping=aes(colour=Model))+theme_bw()+
ylim(0,0.6)+
ylab('Predicted Probability')+
xlab('Age (years)')+ggtitle('Age at diagnosis'))
agepdp
x<-(ingredients::partial_dependency(expXGB,'Gender'))
x1<-(ingredients::partial_dependency(expRF,'Gender'))
x2<-(ingredients::partial_dependency(expELR,'Gender'))
x3<-(ingredients::partial_dependency(expENS,'Gender'))
fintab<-rbind(x,x1,x2,x3)
ft<-rbind(ft,fintab)
colnames(fintab)<-c('Variable.Name','Model','x','y','ids')
colnames(fintab)[3]<-'Gender'
Genderpdp<-ggplot(fintab,aes(x=Gender,y=y,fill=Gender))+
geom_bar(position='dodge',stat='identity')+theme_bw()+
ylab('Marginal Predicted Probability')+
xlab('')+
facet_wrap(~Model)+ggtitle('Gender')
Genderpdp
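As noted above, the raw partial-dependence values sit in the 'ft' object; a small sketch of pulling them out, reusing the column renaming applied to 'fintab':
# Raw partial-dependence values accumulated in 'ft' (Age and Gender, all four models)
ftraw <- as.data.frame(ft)
colnames(ftraw) <- c('Variable.Name','Model','x','y','ids')   # same renaming as above
head(ftraw)
subset(ftraw, Model == 'Ensemble' & Variable.Name == 'Age')   # e.g. the ensemble's Age profile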