This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
# Load the PimaIndiansDiabetes data set into the workspace and print
# summary statistics for every column.
data(list = "PimaIndiansDiabetes")
summary(object = PimaIndiansDiabetes)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age diabetes
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# Show the type and the first few values of each column.
str(object = PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Open the spreadsheet-style data viewer only in an interactive session.
# When the document is knitted there is nobody to look at the viewer, and
# the call may fail on systems without a display.
if (interactive()) {
  View(PimaIndiansDiabetes)
}
# Prepare the training scheme: 10-fold cross-validation, repeated 3 times.
# The object is called train_control (not trainControl) so that it does not
# mask caret's trainControl() function in the workspace.
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# The seed is reset to 7 before every fit so that each model starts from the
# same random-number state, keeping the resampling comparable across models.

# CART, decision tree
set.seed(7)
fit.cart <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "rpart",
  trControl = train_control
)

# LDA, linear discriminant analysis
set.seed(7)
fit.lda <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "lda",
  trControl = train_control
)

# SVM, support vector machine with a radial kernel
set.seed(7)
fit.svm <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "svmRadial",
  trControl = train_control
)

# KNN, k-nearest neighbors
set.seed(7)
fit.knn <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "knn",
  trControl = train_control
)

# RF, random forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(7)
fit.rf <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "rf",
  trControl = train_control
)
# Gather the resampling results of the five fitted models under short labels
# and print their summary.
results <- resamples(
  list(
    CART = fit.cart,
    LDA = fit.lda,
    SVM = fit.svm,
    KNN = fit.knn,
    RF = fit.rf
  )
)
summary(object = results)
##
## Call:
## summary.resamples(object = results)
##
## Models: CART, LDA, SVM, KNN, RF
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.6753247 0.7272727 0.7532468 0.7469697 0.7662338 0.7922078 0
## LDA 0.7142857 0.7508117 0.7662338 0.7791069 0.8000256 0.9078947 0
## SVM 0.7236842 0.7508117 0.7631579 0.7712919 0.7915243 0.8947368 0
## KNN 0.6753247 0.7036056 0.7272727 0.7369503 0.7662338 0.8311688 0
## RF 0.6842105 0.7305195 0.7597403 0.7638528 0.8019481 0.8421053 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.2762566 0.3620724 0.4241878 0.4151867 0.4861107 0.5250000 0
## LDA 0.3011551 0.4192537 0.4662541 0.4862025 0.5308596 0.7812500 0
## SVM 0.3391908 0.3997116 0.4460612 0.4621585 0.5234605 0.7475083 0
## KNN 0.2553191 0.3406000 0.3841761 0.3984995 0.4539789 0.6195363 0
## RF 0.2951613 0.3778304 0.4640696 0.4630809 0.5447483 0.6426332 0
# Print the fitted k-nearest-neighbors model.
print(fit.knn)
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7191900 0.3580128
## 7 0.7261734 0.3779733
## 9 0.7369503 0.3984995
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Number of observations in each level of the diabetes outcome.
table(PimaIndiansDiabetes[["diabetes"]])
##
## neg pos
## 500 268
# The same class counts expressed as proportions of all observations.
prop.table(x = table(PimaIndiansDiabetes[["diabetes"]]))
##
## neg pos
## 0.6510417 0.3489583
# Let every panel choose its own axis ranges. Defined once and shared by the
# box, density and dot plots below.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))

# Box-and-whisker plots of the resampling results
bwplot(results, scales = scales)

# Density plots of the resampling results. pch = "|" marks the individual
# resample values with vertical ticks (the previous "l" drew the letter l).
densityplot(results, scales = scales, pch = "|")

# Dot plots of the resampling results
dotplot(results, scales = scales)

# Scatterplot matrix of the resampling results
splom(results)
# Pairwise differences between the models' resampling results, followed by
# the table of difference estimates and p-values.
diffs <- diff(x = results)
summary(object = diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## CART LDA SVM KNN RF
## CART -0.032137 -0.024322 0.010019 -0.016883
## LDA 0.0011862 0.007815 0.042157 0.015254
## SVM 0.0116401 0.9156892 0.034342 0.007439
## KNN 1.0000000 6.68e-05 0.0002941 -0.026902
## RF 0.2727542 0.4490617 1.0000000 0.0183793
##
## Kappa
## CART LDA SVM KNN RF
## CART -0.0710158 -0.0469717 0.0166872 -0.0478942
## LDA 0.0008086 0.0240440 0.0877029 0.0231215
## SVM 0.0258079 0.3562734 0.0636589 -0.0009225
## KNN 1.0000000 0.0003858 0.0040823 -0.0645814
## RF 0.0211763 1.0000000 1.0000000 0.0158974
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.