Load Libraries
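
The library() calls themselves were not echoed in the rendered output; judging from the startup messages below and the functions used later, the session loads something like the following (mlbench, tibble, and reshape2 are assumptions inferred from PimaIndiansDiabetes, rownames_to_column(), and melt()):

library(caret)          # also attaches ggplot2 and lattice
library(caretEnsemble)  # masks ggplot2::autoplot
library(purrr)          # masks caret::lift
library(dplyr)          # masks stats::filter and stats::lag
library(magrittr)       # masks purrr::set_names
library(corrplot)       # prints "corrplot 0.92 loaded"
library(mlbench)        # provides PimaIndiansDiabetes (assumed)
library(tibble)         # provides rownames_to_column() (assumed)
library(reshape2)       # provides melt() (assumed)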

## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## corrplot 0.92 loaded

Visualize diabetes status for all subjects

data(PimaIndiansDiabetes)

# Visualize diabetes status for all subjects
dat <- PimaIndiansDiabetes %>% dplyr::mutate(status = ifelse(diabetes == "neg", 0, 1))
# select glucose, age, and status
df <- dat %>% select(glucose, age, status)
# convert status to factor
df$status <- as.factor(df$status)
col <- c("0" = "blue", "1" = "red")
p <- df %>%
      ggplot(aes(x = age, y = glucose, color = status)) +
      geom_point(size = 4, shape = 19, alpha = 0.6) +
      scale_color_manual(values = col, labels = c("negative", "positive"), name = "Test Results")
p

Split data into training and test datasets

set.seed(1234)
idx <- createDataPartition(PimaIndiansDiabetes$diabetes, p = .75, list = FALSE)
train <- PimaIndiansDiabetes[idx, ]
test <- PimaIndiansDiabetes[-idx, ]

# Repeated cross-validation: 5 folds repeated 3 times (15 resamples per model)
control <- trainControl(method = "repeatedcv", number = 5, repeats = 3,
                        verboseIter = FALSE, classProbs = TRUE,
                        savePredictions = "final")

# A helper that builds a placeholder tuning grid for a model:
# one integer column (1:tune_length) per tuning parameter
create_tune_grid <- function(model, tune_length){
  params <- modelLookup(model)$parameter
  grid <- expand.grid(lapply(seq_along(params), function(x) 1:tune_length))
  names(grid) <- as.character(params)
  grid
}
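
As a quick sanity check of the helper, the grid it returns is just an integer placeholder named after the model's tuning parameters:

# e.g. knn has a single tuning parameter, k
create_tune_grid("knn", 5)
#   k
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# Note: for non-integer parameters (e.g. rpart's cp, which lives in (0, 1)),
# this integer grid is only a placeholder, not a sensible search range.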

# Classification models to train
regModels <- c("rpart", "lda", "svmRadial", "knn", "glm")


# Build a placeholder grid for each model
m <- lapply(regModels, function(x) create_tune_grid(x, 10))

reg_df <- lapply(m, as.data.frame)
names(reg_df) <- regModels

models <- names(reg_df)
tunelen <- rep(5, length(models))
tune_grid <- reg_df

# Build one caretModelSpec per model with pmap()
args2 <- list(method = regModels, tuneGrid = tune_grid, tuneLength = tunelen)
my_models <- pmap(args2, caretModelSpec)
names(my_models) <- regModels
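
pmap() simply applies caretModelSpec() across the three argument vectors; for a single model the resulting spec is equivalent to writing, for example:

# hand-written spec for one model (illustration only)
caretModelSpec(method = "knn", tuneGrid = reg_df$knn, tuneLength = 5)
# Note: train() ignores tuneLength whenever an explicit tuneGrid is supplied,
# so the tuneLength entries here are effectively inert.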


# caretList() warns if resampling indexes are not pre-set in trControl;
# it then sets them itself so every model shares the same resamples
model_list <- caretList(
  diabetes ~ .,
  data = train,
  trControl = control,
  metric = "Accuracy",
  tuneList = my_models
)
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl.  Attempting to set them ourselves, so each model in the ensemble
## will have the same resampling indexes.
# Extract per-resample performance by name rather than by positional index
metrics <- lapply(model_list, "[[", "resample")

acc <- lapply(metrics, "[[", "Accuracy")
acc_df <- lapply(acc, function(x) data.frame(Accuracy = x))

kappa <- lapply(metrics, "[[", "Kappa")
kappa_df <- lapply(kappa, function(x) data.frame(Kappa = x))

fit<-caretEnsemble(model_list)
head(fit$models$rpart$pred)
##   cp pred obs rowIndex       neg       pos   Resample
## 1 10  neg neg      326 0.6507592 0.3492408 Fold2.Rep1
## 2 10  neg neg      327 0.6507592 0.3492408 Fold2.Rep1
## 3 10  neg pos      341 0.6507592 0.3492408 Fold2.Rep1
## 4 10  neg pos      407 0.6507592 0.3492408 Fold2.Rep1
## 5 10  neg neg      336 0.6507592 0.3492408 Fold2.Rep1
## 6 10  neg neg      424 0.6507592 0.3492408 Fold2.Rep1
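
The held-out test set created earlier is never used above; a minimal sketch of evaluating the ensemble on it might look like the following (assuming this version of caretEnsemble returns class labels for type = "raw"; some versions return probabilities instead, which would need to be thresholded first):

# sketch: assess the ensemble on the 25% hold-out data
ens_preds <- predict(fit, newdata = test, type = "raw")
confusionMatrix(ens_preds, test$diabetes)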

A plot of the fitted models

plot(fit)

Accuracy of the ensemble model

ens_mod_acc <- fit$ens_model$results["Accuracy"]
ens_mod_acc
##    Accuracy
## 1 0.7564499

rpart performance

all_mod_performance <- fit$models
rpart_perf <- ggplot(all_mod_performance$rpart)
rpart_perf

svmRadial performance

svmRadial_perf <- ggplot(all_mod_performance$svmRadial)
svmRadial_perf
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 10 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 40 rows containing missing values (`geom_point()`).

knn performance

knn_perf <- ggplot(all_mod_performance$knn)
knn_perf

Plot Model Accuracies

results <- resamples(model_list)
summ <- summary(results)

# Plot model accuracies
accuracy <- data.frame(summ$statistics$Accuracy)
accuracy <- rownames_to_column(accuracy, var = "Model")
ap <- ggplot(accuracy, aes(x = Model, y = Mean)) +
    geom_col(aes(fill = Model), position = "dodge") +
    ggtitle("Accuracy comparison across trained models") +
    labs(x = "Model", y = "Accuracy") +
    geom_text(aes(label = paste(round(Mean * 100, digits = 2), "%")), vjust = -0.5) +
    theme(
        axis.title.x = element_text(size = 12, face = "bold", color = "black"),
        axis.text.x = element_text(size = 12, face = "bold", angle = 45, hjust = 1, vjust = 0.5),
        plot.title = element_text(color = "darkgreen", size = 18, hjust = 0.5),
        axis.text.y = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold", color = "black")
    )
ap

Visualization of Model Kappa

kappa <- data.frame(summ$statistics$Kappa)
kappa <- rownames_to_column(kappa, var = "Model")
kp <- ggplot(kappa, aes(x = Model, y = Mean)) +
    geom_col(aes(fill = Model), position = "dodge") +
    ggtitle("Kappa comparison across trained models") +
    labs(x = "Model", y = "Kappa") +
    geom_text(aes(label = paste(round(Mean * 100, digits = 2), "%")), vjust = -0.5) +
    theme(
        axis.title.x = element_text(size = 12, face = "bold", color = "black"),
        axis.text.x = element_text(size = 12, face = "bold", angle = 45, hjust = 1, vjust = 0.5),
        plot.title = element_text(color = "darkgreen", size = 18, hjust = 0.5),
        axis.text.y = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold", color = "black")
    )

kp

Model Correlation

If the predictions of the sub-models were highly correlated (> 0.75), they would be making the same or very similar predictions most of the time, reducing the benefit of combining them.

# Visualize models correlation
mod_cor <- modelCor(results)
corrplot(mod_cor)
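
modelCor() is essentially a correlation matrix of the per-resample accuracies, so the same numbers can be reproduced by hand from the resample values:

# same correlations computed directly from the resampled accuracies
acc_cols <- grep("~Accuracy$", names(results$values), value = TRUE)
cor(results$values[, acc_cols])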

The highest correlation between any two models' predictions is about 0.27, which is well below the 0.75 threshold, so the models are diverse enough to be worth combining.

splom(results)

Visually inspect model accuracies using a box-and-whisker plot

scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)

Evaluate the overlap of the estimated metrics (Accuracy and Kappa). Note that each model's Kappa is lower than its Accuracy, since Kappa corrects for chance agreement.

Density plots of accuracy

scales <- list(x=list(relation="free"), y=list(relation="free"))
densityplot(results, scales=scales, pch = "|")

Dot plots of accuracy

They show both the mean estimated accuracy and the 95% confidence interval (i.e. the range in which 95% of the observed scores fell).

scales <- list(x=list(relation="free"), y=list(relation="free"))
dotplot(results, scales=scales)
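
For a single model, the interval dotplot() draws can be approximated by hand with a t-based confidence interval for the mean of the resampled accuracies:

# sketch: 95% confidence interval for rpart's mean resampled accuracy
x <- results$values[["rpart~Accuracy"]]
n <- length(x)
mean(x) + c(-1, 1) * qt(0.975, df = n - 1) * sd(x) / sqrt(n)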

Parallel plots

They show how each trial of each cross-validation fold behaved for each of the algorithms tested. This can help you see how hold-out subsets that were difficult for one algorithm affected the others.

parallelplot(results)

Statistical significance of the difference in model predictions

The lower diagonal of the table shows p-values for the null hypothesis that the distributions are the same; smaller is better. We can see no difference between LDA and GLM (p = 1.0), and no significant difference between KNN and LDA, KNN and svmRadial, or KNN and GLM. RPART, by contrast, differs significantly from all the other models.

The upper diagonal of the table shows the estimated difference between the distributions.

diffs <- diff(results)
# summarize p-values for pair-wise comparisons
summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##           rpart     lda       svmRadial knn       glm      
## rpart               -0.118606 -0.058431 -0.086752 -0.119770
## lda       5.146e-10            0.060175  0.031854 -0.001164
## svmRadial 3.404e-06 0.001232            -0.028321 -0.061339
## knn       4.147e-06 0.096067  0.169452            -0.033018
## glm       6.130e-10 1.000000  0.001242  0.080043           
## 
## Kappa 
##           rpart     lda       svmRadial knn       glm      
## rpart               -0.461696 -0.307720 -0.395984 -0.465357
## lda       1.121e-12            0.153976  0.065713 -0.003661
## svmRadial 9.044e-10 0.0010959           -0.088264 -0.157637
## knn       6.684e-10 0.1676919 0.0513506           -0.069374
## glm       1.128e-12 1.0000000 0.0008486 0.1269139
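
Under the hood these comparisons are essentially paired t-tests on the per-resample metrics, with a Bonferroni correction applied; the unadjusted test for one pair looks like:

# sketch: unadjusted paired t-test for one model pair
t.test(results$values[["lda~Accuracy"]],
       results$values[["glm~Accuracy"]],
       paired = TRUE)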

Resample Accuracies

plot_data <- melt(results$values)
## Using Resample as id variables
df <- as.data.frame(plot_data)
names(df) <- c("Resample", "Metric", "Accuracy")
# melt() stacks both the Accuracy and Kappa columns; keep only the accuracies
df <- df[grepl("~Accuracy$", df$Metric), ]
df$Model <- gsub("~.*", "", df$Metric)

rp <- ggplot(df, aes(x = Model, y = Accuracy, color = Model)) +
      geom_boxplot() +
      # stat_summary needs fun (not fun.data) for a simple mean point, and
      # inherits data/aes from ggplot() so the red points actually draw
      stat_summary(fun = mean, colour = "red", geom = "point") +
      ggtitle("Resample accuracy for the trained models") +
      theme_bw()

rp

Variable Importance plots

all_models <- fit$models
imp_plots <- lapply(all_models, function(x){
    imp <- varImp(x)
    # title each plot with the model's method name rather than the train object
    ggplot(imp) + ggtitle(x$method)
})
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## (warning repeated several times)

LDA Variable Importance plot

ldaplot <- imp_plots["lda"]
ldaplot
## $lda

svmRadial Variable Importance plot

svmRadialplot <- imp_plots["svmRadial"]
svmRadialplot
## $svmRadial

knn Variable Importance plot

knn_plot <- imp_plots["knn"]
knn_plot
## $knn

glm Variable Importance plot

glm_plot <- imp_plots["glm"]
glm_plot
## $glm