R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column summary statistics (min, quartiles, mean, max) of the `cars` data.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the PimaIndiansDiabetes data set into the workspace; it is the single
# data source for every model fitted below (outcome column: `diabetes`).
data("PimaIndiansDiabetes")



# Shared resampling scheme: 10-fold cross-validation, repeated 3 times.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Every model below is fitted on the full data set, using all remaining
# columns as predictors of `diabetes`. The RNG seed is reset to the same value
# before each fit -- presumably so that all models are resampled on identical
# folds and can be compared pairwise later on (confirm against caret docs).

# Classification tree (rpart)
set.seed(7)
cart_model <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "rpart",
  trControl = ctrl
)

# Linear discriminant analysis
set.seed(7)
lda_model <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "lda",
  trControl = ctrl
)

# Support vector machine with a radial basis kernel
set.seed(7)
svm_model <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "svmRadial",
  trControl = ctrl
)

# k-nearest neighbours
set.seed(7)
knn_model <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "knn",
  trControl = ctrl
)

# Random forest
set.seed(7)
rf_model <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "rf",
  trControl = ctrl
)

# CART: resampled Accuracy/Kappa for each candidate complexity parameter (cp).
print(cart_model)
## CART 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.01741294  0.7469697  0.4151867
##   0.10447761  0.7178742  0.3614570
##   0.24253731  0.6991684  0.2776661
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01741294.
# LDA: resampled Accuracy/Kappa (a single row; no tuning grid is reported).
print(lda_model)
## Linear Discriminant Analysis 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7791069  0.4862025
# SVM (radial kernel): resampled Accuracy/Kappa for each candidate cost C.
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.7712919  0.4621585
##   0.50  0.7625769  0.4485309
##   1.00  0.7560549  0.4339951
## 
## Tuning parameter 'sigma' was held constant at a value of 0.124824
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.124824 and C = 0.25.
# KNN: resampled Accuracy/Kappa for each candidate number of neighbours k.
print(knn_model)
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7191900  0.3580128
##   7  0.7261734  0.3779733
##   9  0.7369503  0.3984995
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Random forest: resampled Accuracy/Kappa for each candidate mtry value.
print(rf_model)
## Random Forest 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7638528  0.4630809
##   5     0.7634256  0.4664261
##   8     0.7599738  0.4596437
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Gather the resampling results of all five fitted models under short labels
# so they can be compared side by side.
results <- resamples(
  list(
    CART = cart_model,
    LDA = lda_model,
    SVM = svm_model,
    KNN = knn_model,
    RF = rf_model
  )
)

# Overview: model labels, number of resamples and the available metrics.
print(results)
## 
## Call:
## resamples.default(x = list(CART = cart_model, LDA = lda_model, SVM =
##  svm_model, KNN = knn_model, RF = rf_model))
## 
## Models: CART, LDA, SVM, KNN, RF 
## Number of resamples: 30 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
# Summarize the results: distribution (min, quartiles, mean, max) of Accuracy
# and Kappa over the resamples, one row per model.
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: CART, LDA, SVM, KNN, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.6753247 0.7272727 0.7532468 0.7469697 0.7662338 0.7922078    0
## LDA  0.7142857 0.7508117 0.7662338 0.7791069 0.8000256 0.9078947    0
## SVM  0.7236842 0.7508117 0.7631579 0.7712919 0.7915243 0.8947368    0
## KNN  0.6753247 0.7036056 0.7272727 0.7369503 0.7662338 0.8311688    0
## RF   0.6842105 0.7305195 0.7597403 0.7638528 0.8019481 0.8421053    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.2762566 0.3620724 0.4241878 0.4151867 0.4861107 0.5250000    0
## LDA  0.3011551 0.4192537 0.4662541 0.4862025 0.5308596 0.7812500    0
## SVM  0.3391908 0.3997116 0.4460612 0.4621585 0.5234605 0.7475083    0
## KNN  0.2553191 0.3406000 0.3841761 0.3984995 0.4539789 0.6195363    0
## RF   0.2951613 0.3778304 0.4640696 0.4630809 0.5447483 0.6426332    0
# Class counts of the outcome variable.
table(PimaIndiansDiabetes[["diabetes"]])
## 
## neg pos 
## 500 268
# Class proportions of the outcome variable.
prop.table(table(PimaIndiansDiabetes[["diabetes"]]))
## 
##       neg       pos 
## 0.6510417 0.3489583
# Let every panel pick its own x and y axis ranges.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)

# Box-and-whisker plots of the resampled metrics.
bwplot(results, scales = scales)

# Density plots of the resampled metrics.
densityplot(results, scales = scales)

# Note: a densityplot variant with pch = "|" and a custom x-axis label was
# left disabled here; it was never run.

# Dot plots of the resampled metrics.
dotplot(results, scales = scales)

# Scatterplot matrix of the resampling results.
splom(results)

# Pairwise differences between the models' resampled metrics.
diffs <- diff(results)

# Estimated differences together with p-values for H0: difference = 0.
summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      CART      LDA       SVM       KNN       RF       
## CART           -0.032137 -0.024322  0.010019 -0.016883
## LDA  0.0011862            0.007815  0.042157  0.015254
## SVM  0.0116401 0.9156892            0.034342  0.007439
## KNN  1.0000000 6.68e-05  0.0002941           -0.026902
## RF   0.2727542 0.4490617 1.0000000 0.0183793          
## 
## Kappa 
##      CART      LDA        SVM        KNN        RF        
## CART           -0.0710158 -0.0469717  0.0166872 -0.0478942
## LDA  0.0008086             0.0240440  0.0877029  0.0231215
## SVM  0.0258079 0.3562734              0.0636589 -0.0009225
## KNN  1.0000000 0.0003858  0.0040823             -0.0645814
## RF   0.0211763 1.0000000  1.0000000  0.0158974

Note that adding the `echo = FALSE` parameter to a code chunk prevents the R code that generated the plot from being printed.