library(ggplot2)
library(caret)
library(ellipse)
# Load the built-in iris data and draw a scatter plot of sepal length against
# petal length, with one colour per species.
data("iris")
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, colour = Species)
) +
  geom_point() +
  ggtitle("Iris Species by Petal and Sepal Length")
# Share of each species in the full data set, expressed in percent.
percentage <- 100 * prop.table(table(iris[["Species"]]))
# Show the absolute counts next to the percentages, one row per species.
cbind(
  freq = table(iris[["Species"]]),
  percentage = percentage
)
## freq percentage
## setosa 50 33.33333
## versicolor 50 33.33333
## virginica 50 33.33333
# Work on a copy so the built-in iris data frame itself is left untouched.
dataset <- iris
# Fix the RNG seed so the train/validation split is identical on every run;
# without it the partition (and every result derived from it) changes each time.
set.seed(7)
# Row indices of an 80% sample drawn with respect to the Species labels.
# NOTE: despite its name, validation_index holds the rows KEPT for training;
# the validation set is everything that is NOT in it.
validation_index <- createDataPartition(dataset$Species, p = 0.8, list = FALSE)
# The remaining 20% of the rows are held back for the final validation.
validation <- dataset[-validation_index, ]
# The selected 80% of the rows are used to train and cross-validate the models.
dataset <- dataset[validation_index, ]
# Keep the numeric measurement columns (columns 1-4) as the predictors.
x <- dataset[, 1:4]
# Draw one boxplot per predictor side by side. par() returns the previous
# settings, which are restored below so later base plots are not squeezed
# into the 1 x 4 layout.
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  # Title each panel with its own column name. The index has to be applied to
  # the names vector inside the call, not to the value boxplot() returns.
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)
# Column 5 holds the species label, i.e. the class to be predicted.
y <- dataset[[5]]
# Views of the predictors grouped by class: ellipse plot first, then box plots.
featurePlot(x = x, y = y, plot = "ellipse")
featurePlot(x = x, y = y, plot = "box")
# Density plots, with both axes of every panel scaled independently.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x = x, y = y, plot = "density", scales = scales)
# Models are compared on accuracy ...
metric <- "Accuracy"
# ... which is estimated with 10-fold cross-validation.
control <- trainControl(method = "cv", number = 10)
# Fit five classifiers on the training data. The seed is reset to the same
# value before every fit, and all fits share the same resampling control and
# metric.

# Linear discriminant analysis
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = control
)

# Classification and regression tree (CART)
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = dataset,
  method = "rpart",
  metric = metric,
  trControl = control
)

# k-nearest neighbours
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = dataset,
  method = "knn",
  metric = metric,
  trControl = control
)

# Support vector machine with a radial kernel
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = dataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)

# Random forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dataset,
  method = "rf",
  metric = metric,
  trControl = control
)
# Collect the resampling results of all five models and summarise them.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.8333333 0.9375000 1.0000000 0.9666667 1.0000000 1 0
## cart 0.7500000 0.9166667 0.9166667 0.9166667 0.9791667 1 0
## knn 0.9166667 0.9166667 1.0000000 0.9666667 1.0000000 1 0
## svm 0.8333333 0.9166667 1.0000000 0.9583333 1.0000000 1 0
## rf 0.7500000 0.9166667 0.9583333 0.9416667 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.750 0.90625 1.0000 0.9500 1.00000 1 0
## cart 0.625 0.87500 0.8750 0.8750 0.96875 1 0
## knn 0.875 0.87500 1.0000 0.9500 1.00000 1 0
## svm 0.750 0.87500 1.0000 0.9375 1.00000 1 0
## rf 0.625 0.87500 0.9375 0.9125 1.00000 1 0
# Compare the resampled metrics of all models in a dot plot.
dotplot(results)
# Print the fitted LDA model and its cross-validation results.
print(fit.lda)
## Linear Discriminant Analysis
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9666667 0.95
# Print the fitted kNN model and its cross-validation results.
print(fit.knn)
## k-Nearest Neighbors
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9666667 0.950
## 7 0.9500000 0.925
## 9 0.9500000 0.925
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Print the fitted CART model and its cross-validation results.
print(fit.cart)
## CART
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00 0.9166667 0.875
## 0.45 0.7166667 0.575
## 0.50 0.3333333 0.000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Print the fitted random forest model and its cross-validation results.
print(fit.rf)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9416667 0.9125
## 3 0.9333333 0.9000
## 4 0.9416667 0.9125
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Print the fitted SVM model and its cross-validation results.
print(fit.svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9416667 0.9125
## 0.50 0.9583333 0.9375
## 1.00 0.9583333 0.9375
##
## Tuning parameter 'sigma' was held constant at a value of 0.608061
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.608061 and C = 0.5.
# Predict the species of the held-out validation rows with the LDA model and
# compare the predictions against the true labels.
predictions <- predict(fit.lda, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500