library(ggplot2)
library(lattice)
library(mlbench)
library(caret)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
data(PimaIndiansDiabetes)
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
# CART
set.seed(7)
fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=trainControl)
# LDA
set.seed(7)
fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=trainControl)
# SVM
set.seed(7)
fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=trainControl)
# KNN
set.seed(7)
fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=trainControl)
# Random Forest
set.seed(7)
fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=trainControl)
# Collect resamples
results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: CART, LDA, SVM, KNN, RF
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.6753247 0.7272727 0.7532468 0.7469697 0.7662338 0.7922078 0
## LDA 0.7142857 0.7508117 0.7662338 0.7791069 0.8000256 0.9078947 0
## SVM 0.7236842 0.7508117 0.7631579 0.7712919 0.7915243 0.8947368 0
## KNN 0.6753247 0.7036056 0.7272727 0.7369503 0.7662338 0.8311688 0
## RF 0.6842105 0.7305195 0.7597403 0.7638528 0.8019481 0.8421053 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.2762566 0.3620724 0.4241878 0.4151867 0.4861107 0.5250000 0
## LDA 0.3011551 0.4192537 0.4662541 0.4862025 0.5308596 0.7812500 0
## SVM 0.3391908 0.3997116 0.4460612 0.4621585 0.5234605 0.7475083 0
## KNN 0.2553191 0.3406000 0.3841761 0.3984995 0.4539789 0.6195363 0
## RF 0.2951613 0.3778304 0.4640696 0.4630809 0.5447483 0.6426332 0
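# The raw per-resample metrics can also be extracted for further analysis.
# Sketch only: this assumes the resamples object exposes them as
# results$values with column names of the form "Model~Metric".
acc_cols <- grep("~Accuracy$", names(results$values), value = TRUE)
colMeans(results$values[, acc_cols])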
# Visualization of model performance
# box and whisker plots to compare models
scales <- list(x = list(relation="free"), y = list(relation="free"))
bwplot(results, scales=scales)
# density plots of accuracy
scales <- list(x = list(relation="free"), y = list(relation="free"))
densityplot(results, scales=scales, pch = "|")
# dot plots of accuracy
scales <- list(x = list(relation="free"), y = list(relation="free"))
dotplot(results, scales=scales)
# pairwise scatter plots of resampled accuracy to compare models
splom(results)
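# Two further views caret provides for resamples objects; sketch only,
# assuming parallelplot.resamples and modelCor are available as in recent
# caret versions.
parallelplot(results)   # one line per resample across all models
modelCor(results)       # correlation of resampled accuracy between models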
# differences in resampled performance between models
diffs <- diff(results)
# summarize p-values for pairwise comparisons
summary(diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## CART LDA SVM KNN RF
## CART -0.032137 -0.024322 0.010019 -0.016883
## LDA 0.0011862 0.007815 0.042157 0.015254
## SVM 0.0116401 0.9156892 0.034342 0.007439
## KNN 1.0000000 6.68e-05 0.0002941 -0.026902
## RF 0.2727542 0.4490617 1.0000000 0.0183793
##
## Kappa
## CART LDA SVM KNN RF
## CART -0.0710158 -0.0469717 0.0166872 -0.0478942
## LDA 0.0008086 0.0240440 0.0877029 0.0231215
## SVM 0.0258079 0.3562734 0.0636589 -0.0009225
## KNN 1.0000000 0.0003858 0.0040823 -0.0645814
## RF 0.0211763 1.0000000 1.0000000 0.0158974
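# For a single pairwise comparison, caret's compare_models() runs the same
# paired test on two fits; shown here as a sketch for LDA versus SVM.
compare_models(fit.lda, fit.svm)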
# Iris dataset model comparison
trainControl <- trainControl(method="cv", number=10)
metric <- "Accuracy"
# Building five models:
# LDA
set.seed(7)
fit.lda <- train(Species~., data=iris, method="lda", metric=metric, trControl=trainControl)
# CART
set.seed(7)
fit.cart <- train(Species~., data=iris, method="rpart", metric=metric, trControl=trainControl)
# KNN
set.seed(7)
fit.knn <- train(Species~., data=iris, method="knn", metric=metric, trControl=trainControl)
# SVM
set.seed(7)
fit.svm <- train(Species~., data=iris, method="svmRadial", metric=metric, trControl=trainControl)
# Random Forest
set.seed(7)
fit.rf <- train(Species~., data=iris, method="rf", metric=metric, trControl=trainControl)
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.9333333 0.9500000 1.0000000 0.9800000 1.0000000 1 0
## cart 0.8666667 0.9333333 0.9333333 0.9400000 0.9833333 1 0
## knn 0.8666667 0.9333333 1.0000000 0.9666667 1.0000000 1 0
## svm 0.8000000 0.9333333 0.9666667 0.9466667 1.0000000 1 0
## rf 0.8666667 0.9333333 0.9666667 0.9600000 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.9 0.925 1.00 0.97 1.000 1 0
## cart 0.8 0.900 0.90 0.91 0.975 1 0
## knn 0.8 0.900 1.00 0.95 1.000 1 0
## svm 0.7 0.900 0.95 0.92 1.000 1 0
## rf 0.8 0.900 0.95 0.94 1.000 1 0
dotplot(results)
# summarize Best Model
print(fit.lda)
## Linear Discriminant Analysis
##
## 150 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results:
##
## Accuracy Kappa
## 0.98 0.97
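# The underlying MASS::lda fit can be inspected directly; finalModel is the
# slot caret uses for the final model refit on the full training data.
fit.lda$finalModel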
# Using a validation dataset
# (Note: fit.lda above was trained on the full iris data, so the hold-out
# results below are optimistic; in practice the model would be refit on the
# training split only.)
TrainIndex <- createDataPartition(iris$Species, p=0.70, list=FALSE)
LeftData <- iris[-TrainIndex, ]
ValidIndex <- createDataPartition(LeftData$Species, p=0.5, list=FALSE)
train <- iris[TrainIndex, ]
test <- LeftData[-ValidIndex, ]
validate <- LeftData[ValidIndex, ]
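# Quick check that createDataPartition preserved the class balance in each split
table(train$Species)
table(validate$Species)
table(test$Species)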
predictions <- predict(fit.lda, validate)
confusionMatrix(predictions, validate$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 8 0 0
## versicolor 0 8 0
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8575, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 3.541e-12
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
predictionsTest <- predict(fit.lda, test)
confusionMatrix(predictionsTest, test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 7 0 0
## versicolor 0 7 0
## virginica 0 0 7
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8389, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 9.56e-11
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
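# As a final check, all five iris models can be scored on the held-out test
# set in one pass; sketch only, assuming caret's postResample() to compute
# Accuracy and Kappa from the predictions.
fits <- list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf)
sapply(fits, function(f) postResample(predict(f, test), test$Species))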