# Packages used in this analysis (mlbench provides the PimaIndiansDiabetes data)
library(mlbench)
library(caret)          # loads ggplot2 and lattice
library(caretEnsemble)
library(purrr)
library(dplyr)
library(magrittr)
library(tibble)         # rownames_to_column()
library(reshape2)       # melt()
library(corrplot)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
##     autoplot
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
##     lift
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
##     set_names
## corrplot 0.92 loaded
data(PimaIndiansDiabetes)
# Visualize diabetes status for all subjects
dat <- PimaIndiansDiabetes %>% dplyr::mutate(status = ifelse(diabetes == "neg", 0, 1))
# Select glucose, age, and status
df <- dat %>% select(glucose, age, status)
# Convert status to a factor
df$status <- as.factor(df$status)
col <- c("0" = "blue", "1" = "red")
p <- df %>%
  ggplot(., aes(x = age, y = glucose, color = status)) +
  geom_point(size = 4, shape = 19, alpha = 0.6) +
  scale_color_manual(values = col, labels = c("negative", "positive"), name = "Test Results")
p
set.seed(1234)
# Hold out 25% of the data as a test set, stratified by diabetes status
idx   <- createDataPartition(PimaIndiansDiabetes$diabetes, p = .75, list = FALSE)
train <- PimaIndiansDiabetes[idx, ]
test  <- PimaIndiansDiabetes[-idx, ]
# Repeated 5-fold cross-validation with three repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 3,
                        verboseIter = FALSE, classProbs = TRUE,
                        savePredictions = "final")
# Helper to build a simple integer tuning grid (1:tune_length for every
# tuning parameter the model exposes, as listed by caret::modelLookup)
create_tune_grid <- function(model, tune_length){
  params <- modelLookup(model)$parameter
  grid   <- expand.grid(lapply(seq_along(params), function(x) 1:tune_length))
  names(grid) <- as.character(params)
  grid
}
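As a quick, illustrative check of the helper: knn exposes a single tuning parameter, k, according to modelLookup, so the call below should return a one-column grid with k = 1, 2, 3.
# Illustrative check: knn has one tuning parameter (k)
create_tune_grid("knn", 3)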
# Classification models to train
regModels <- c("rpart", "lda", "svmRadial", "knn", "glm")
# Build a tuning grid for each model
m <- lapply(regModels, function(x){
  create_tune_grid(x, 10)
})
reg_df <- lapply(m, function(x){
  as.data.frame(x)
})
names(reg_df) <- regModels
models    <- names(reg_df)
tunelen   <- rep(5, length(models))
tune_grid <- reg_df
# One caretModelSpec() per model; when both are supplied, tuneGrid takes
# precedence over tuneLength
args2 <- list(method = regModels, tuneGrid = tune_grid, tuneLength = tunelen)
my_models <- pmap(args2, caretModelSpec)
names(my_models) <- regModels
# Train all base models with caretList(); note that classes = "Warning" does not
# match the lowercase condition class "warning", so warnings are still printed
suppressWarnings(model_list <- caretList(
  diabetes ~ .,
  data = train,
  trControl = control,
  metric = "Accuracy",
  tuneList = my_models
), classes = "Warning")
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble
## will have the same resampling indexes.
# Extract a metrics table from each train object by positional index
# (indexing by name would be more robust across caret versions)
metrics <- lapply(model_list, "[[", 14)
# First column: Accuracy; second column: Kappa
acc <- lapply(metrics, "[[", 1)
acc_df <- lapply(acc, function(x){
  data.frame(x)
})
kappa <- lapply(metrics, "[[", 2)
kappa_df <- lapply(kappa, function(x){
  data.frame(x)
})
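Optionally, the per-model accuracy tables built above can be stacked into a single long data frame for a quick look; this is just a convenience sketch using the acc_df list from the previous step.
# Sketch: stack the per-model resample accuracies into one long data frame
acc_long <- dplyr::bind_rows(acc_df, .id = "Model")
head(acc_long)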
# Combine the base models into a single ensemble model
fit <- caretEnsemble(model_list)
# Cross-validated predictions stored for one of the base models
head(fit$models$rpart$pred)
## cp pred obs rowIndex neg pos Resample
## 1 10 neg neg 326 0.6507592 0.3492408 Fold2.Rep1
## 2 10 neg neg 327 0.6507592 0.3492408 Fold2.Rep1
## 3 10 neg pos 341 0.6507592 0.3492408 Fold2.Rep1
## 4 10 neg pos 407 0.6507592 0.3492408 Fold2.Rep1
## 5 10 neg neg 336 0.6507592 0.3492408 Fold2.Rep1
## 6 10 neg neg 424 0.6507592 0.3492408 Fold2.Rep1
plot(fit)
# Resampled accuracy of the ensemble model
ens_mod_acc <- fit$ens_model$results[2]
ens_mod_acc
## Accuracy
## 1 0.7564499
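Note that the held-out test set created earlier has not been used yet. As a rough sketch (using model_list and test from above), each base model can be scored on it; predict() on a caret train object returns class labels by default for classification. The ensemble can be scored the same way with predict(fit, newdata = test), although its return type depends on the caretEnsemble version.
# Sketch: accuracy of each base model on the held-out test set
test_acc <- sapply(model_list, function(m){
  pred <- predict(m, newdata = test)
  mean(pred == test$diabetes)
})
test_acc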
# Tuning-parameter performance profile of each base model
all_mod_performance <- fit$models
rpart_perf <- ggplot(all_mod_performance$rpart)
rpart_perf
svmRadial_perf <- ggplot(all_mod_performance$svmRadial)
svmRadial_perf
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 10 values. Consider specifying shapes manually if you need
## that many of them.
## Warning: Removed 40 rows containing missing values (`geom_point()`).
knn_perf <- ggplot(all_mod_performance$knn)
knn_perf
# Collect the resampling results across the base models
results <- resamples(model_list)
summ <- summary(results)
# Plot model accuracies (mean accuracy per model)
accuracy <- data.frame(summ$statistics$Accuracy)
accuracy <- rownames_to_column(accuracy, var = "Model")
ap <- ggplot(accuracy, aes(x=Model,y=Mean))+
geom_col( aes(fill = Model ), position = "dodge")+
ggtitle("Accuracy comparison across trained models")+
labs(x="Model", y="Accuracy")+
geom_text(aes(label = paste0(round(Mean*100, digits = 2), "%")), vjust = -0.5)+
theme(
axis.title.x=element_text(size=12, face = "bold", color = "black"),
axis.text.x=element_text(size=12,face="bold",angle = 45, hjust = 1, vjust = 0.5),
plot.title=element_text(color="darkgreen", size=18, hjust=0.5),
axis.text.y=element_text(size=12,face="bold"),
axis.title.y=element_text(size=12, face = "bold", color = "black")
)
ap
#Visualization of Model Kappa
kappa <- data.frame(summ$statistics$Kappa)
kappa <- rownames_to_column(kappa,var="Model")
kp <- ggplot(kappa, aes(x=Model,y=Mean))+
geom_col( aes(fill = Model ), position = "dodge")+
ggtitle("Kappa comparison across trained models")+
labs(x="Model", y="Kappa")+
geom_text(aes(label = round(Mean, 2)), vjust = -0.5)+
theme(
axis.title.x=element_text(size=12, face = "bold", color = "black"),
axis.text.x=element_text(size=12,face="bold",angle = 45, hjust = 1, vjust = 0.5),
plot.title=element_text(color="darkgreen", size=18, hjust=0.5),
axis.text.y=element_text(size=12,face="bold"),
axis.title.y=element_text(size=12, face = "bold", color = "black")
)
kp
If the predictions from the sub-models were highly correlated (> 0.75), they would be making the same or very similar predictions most of the time, reducing the benefit of combining them.
# Correlation between the models' resampled accuracy values
mod_cor <- modelCor(results)
corrplot(mod_cor)
The most correlated pair of models shows a correlation of about 0.27, which is well below the 0.75 threshold usually considered high.
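A small sketch (using mod_cor from modelCor() above) reads the most correlated pair off the matrix instead of the plot.
# Sketch: find the most correlated pair of models in the correlation matrix
cor_no_diag <- mod_cor
diag(cor_no_diag) <- NA
idx <- which(cor_no_diag == max(cor_no_diag, na.rm = TRUE), arr.ind = TRUE)[1, ]
rownames(mod_cor)[idx]          # the two most correlated models
max(cor_no_diag, na.rm = TRUE)  # their correlation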
# Scatterplot matrix of the resampled metrics
splom(results)
# Box-and-whisker plots of Accuracy and Kappa for each model
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)
Density plots are a useful way to evaluate the overlap in the estimated behaviour of the models (Accuracy and Kappa). Note that a higher Accuracy does not automatically translate into a higher Kappa.
scales <- list(x=list(relation="free"), y=list(relation="free"))
densityplot(results, scales=scales, pch = "|")
# Dot plots of accuracy: they show the mean estimated accuracy for each model
# together with its 95% confidence interval.
scales <- list(x=list(relation="free"), y=list(relation="free"))
dotplot(results, scales=scales)
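For reference, a minimal sketch that computes each model's mean accuracy with a t-based 95% confidence interval from the raw resample values (these intervals may not match exactly what dotplot() draws).
# Sketch: mean accuracy and a t-based 95% CI per model from the resample values
acc_cols <- grep("~Accuracy", names(results$values), value = TRUE)
ci_tab <- t(sapply(acc_cols, function(col){
  v  <- results$values[[col]]
  ci <- t.test(v)$conf.int
  c(mean = mean(v), lower = ci[1], upper = ci[2])
}))
ci_tab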
# Parallel plots: show how each cross-validation fold behaved for each of the
# algorithms tested, which helps reveal whether hold-out subsets that were
# difficult for one algorithm were also difficult for the others.
parallelplot(results)
The lower diagonal of the table shows p-values for the null hypothesis that the metric distributions of a pair of models are the same; smaller values indicate stronger evidence of a real difference. From the output we see essentially no difference between LDA and GLM (p ≈ 1), only weak evidence of a difference between KNN and the other non-tree models, and clear differences between RPART and every other model (and between SVMRADIAL and both LDA and GLM).
The upper diagonal of the table shows the estimated difference between the distributions.
diffs <- diff(results)
# summarize p-values for pair-wise comparisons
summary(diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## rpart lda svmRadial knn glm
## rpart -0.118606 -0.058431 -0.086752 -0.119770
## lda 5.146e-10 0.060175 0.031854 -0.001164
## svmRadial 3.404e-06 0.001232 -0.028321 -0.061339
## knn 4.147e-06 0.096067 0.169452 -0.033018
## glm 6.130e-10 1.000000 0.001242 0.080043
##
## Kappa
## rpart lda svmRadial knn glm
## rpart -0.461696 -0.307720 -0.395984 -0.465357
## lda 1.121e-12 0.153976 0.065713 -0.003661
## svmRadial 9.044e-10 0.0010959 -0.088264 -0.157637
## knn 6.684e-10 0.1676919 0.0513506 -0.069374
## glm 1.128e-12 1.0000000 0.0008486 0.1269139
# Reshape the per-resample metric values to long format
plot_data <- melt(results$values)
## Using Resample as id variables
df <- as.data.frame(plot_data)
names(df) <- c("Resample", "Model", "Accuracy")
# Keep only the Accuracy columns; the melted data also contains Kappa values
df <- df[grepl("~Accuracy", df$Model), ]
df$Model <- gsub("~.*", "", df$Model)
rp <- ggplot(df, aes(x = Model, y = Accuracy, color = Model)) +
  geom_boxplot() +
  # fun (not fun.data), because mean() returns a single value per group
  stat_summary(fun = mean, colour = "red", geom = "point") +
  ggtitle("Resample accuracy for the trained models") +
  theme_bw()
rp
all_models <- fit$models
# Variable-importance plot for each base model, titled with the model's method
imp_plots <- lapply(all_models, function(x){
  imp <- varImp(x)
  ggplot(imp) +
    ggtitle(x$method)
})
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
ldaplot <- imp_plots[2]
ldaplot
## $lda
svmRadialplot <- imp_plots[3]
svmRadialplot
## $svmRadial
knn_plot <- imp_plots[4]
knn_plot
## $knn
glm_plot <- imp_plots[5]
glm_plot
## $glm
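Finally, a rough sketch (using all_models from above) that pulls out the top-ranked predictor for each base model rather than inspecting every plot; for models where caret cannot compute an importance score (note the warnings above) the result may be empty.
# Sketch: top-ranked predictor per base model according to varImp()
top_vars <- sapply(all_models, function(x){
  imp <- varImp(x)$importance
  rownames(imp)[which.max(imp[, 1])]
})
top_vars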