Choosing the best model

Including Plots

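The example below uses caret to compare five classifiers (CART, LDA, SVM, KNN and random forest) on the Pima Indians Diabetes data and then plots the resampled results: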

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Attach mlbench and load the PimaIndiansDiabetes data set from the attached
# packages, then print a per-column summary of it.
library(mlbench)
data(PimaIndiansDiabetes)
summary(PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# Show the structure of the data frame: its dimensions and, for every column,
# the type and the first few values.
str(PimaIndiansDiabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

# Open the data in the spreadsheet-style viewer, but only in an interactive
# session: View() needs a data viewer and serves no purpose while rendering.
if (interactive()) {
  View(PimaIndiansDiabetes)
}

# Prepare the training scheme: 10-fold cross-validation repeated 3 times.
# The result is stored as `train_control` so that it does not shadow caret's
# trainControl() function.
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Every model below uses the same formula, data and training scheme, and the
# seed is reset to 7 before each fit (presumably so that all models are
# evaluated on the same resamples -- confirm against the caret documentation).

# CART: decision tree
set.seed(7)
fit.cart <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "rpart",
                  trControl = train_control)

# LDA: linear discriminant analysis
set.seed(7)
fit.lda <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "lda",
                 trControl = train_control)

# SVM: support vector machine (method "svmRadial")
set.seed(7)
fit.svm <- train(diabetes ~ ., data = PimaIndiansDiabetes,
                 method = "svmRadial", trControl = train_control)

# KNN: k-nearest neighbors
set.seed(7)
fit.knn <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "knn",
                 trControl = train_control)

# RF: random forest
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

set.seed(7)
fit.rf <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "rf",
                trControl = train_control)

# Collect the resampling results of all five models and summarise them.
results <- resamples(list(CART = fit.cart, LDA = fit.lda, SVM = fit.svm, KNN = fit.knn, RF = fit.rf))
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: CART, LDA, SVM, KNN, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.6753247 0.7272727 0.7532468 0.7469697 0.7662338 0.7922078    0
## LDA  0.7142857 0.7508117 0.7662338 0.7791069 0.8000256 0.9078947    0
## SVM  0.7236842 0.7508117 0.7631579 0.7712919 0.7915243 0.8947368    0
## KNN  0.6753247 0.7036056 0.7272727 0.7369503 0.7662338 0.8311688    0
## RF   0.6842105 0.7305195 0.7597403 0.7638528 0.8019481 0.8421053    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.2762566 0.3620724 0.4241878 0.4151867 0.4861107 0.5250000    0
## LDA  0.3011551 0.4192537 0.4662541 0.4862025 0.5308596 0.7812500    0
## SVM  0.3391908 0.3997116 0.4460612 0.4621585 0.5234605 0.7475083    0
## KNN  0.2553191 0.3406000 0.3841761 0.3984995 0.4539789 0.6195363    0
## RF   0.2951613 0.3778304 0.4640696 0.4630809 0.5447483 0.6426332    0

# Print the fitted k-nearest-neighbors model: the resampling results for each
# candidate k and the value of k that was finally selected.
fit.knn

## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7191900  0.3580128
##   7  0.7261734  0.3779733
##   9  0.7369503  0.3984995
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

# Count how many observations fall into each level of the outcome.
table(PimaIndiansDiabetes[["diabetes"]])

## 
## neg pos 
## 500 268

# Same class counts, expressed as proportions of all observations.
prop.table(table(PimaIndiansDiabetes[["diabetes"]]))

## 
##       neg       pos 
## 0.6510417 0.3489583

# Box-and-whisker plots of the collected resampling results; both axes are
# given "free" scales.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
bwplot(results, scales = scales)

# Density plots of the collected resampling results, with free scales on both
# axes.
# pch = "|" is the vertical bar (not the letter "l"), so the individual
# resample values are marked with a small tick rather than an "l" glyph.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
densityplot(results, scales = scales, pch = "|")

# Dot plots of the collected resampling results, again with free scales on
# both axes.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
dotplot(results, scales = scales)

# Scatterplot matrix (splom) of the collected resampling results
splom(results)

# Pairwise differences between the models' resampled metrics. The printed
# summary shows the estimated differences above the diagonal and the
# Bonferroni-adjusted p-values (H0: difference = 0) below it.
diffs <- diff(results)
summary(diffs)

## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      CART      LDA       SVM       KNN       RF       
## CART           -0.032137 -0.024322  0.010019 -0.016883
## LDA  0.0011862            0.007815  0.042157  0.015254
## SVM  0.0116401 0.9156892            0.034342  0.007439
## KNN  1.0000000 6.68e-05  0.0002941           -0.026902
## RF   0.2727542 0.4490617 1.0000000 0.0183793          
## 
## Kappa 
##      CART      LDA        SVM        KNN        RF        
## CART           -0.0710158 -0.0469717  0.0166872 -0.0478942
## LDA  0.0008086             0.0240440  0.0877029  0.0231215
## SVM  0.0258079 0.3562734              0.0636589 -0.0009225
## KNN  1.0000000 0.0003858  0.0040823             -0.0645814
## RF   0.0211763 1.0000000  1.0000000  0.0158974

Note that adding the `echo = FALSE` parameter to a code chunk prevents printing of the R code that generated its output; the chunks above were left visible so the code can be read alongside the results.

Choosing the best model

Bezawit Tilahun

2024-04-30

R Markdown

Including Plots