# prediction-IRIS.utf8

library(ggplot2)
library(caret)
library(ellipse)
data("iris")

# Scatter plot of sepal length against petal length, one colour per species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, colour = Species)
) +
  geom_point() +
  ggtitle("Iris Species by Petal and Sepal Length")

# Class balance: number of rows per species and the share (in percent)
# each species has of the whole data set.
species_counts <- table(iris$Species)
percentage <- 100 * prop.table(species_counts)
cbind(freq = species_counts, percentage = percentage)

##            freq percentage
## setosa       50   33.33333
## versicolor   50   33.33333
## virginica    50   33.33333

# Work on a copy so the built-in iris data frame itself is never modified.
dataset <- iris

# createDataPartition() samples rows at random; fix the RNG state first so
# the train/validation split (and every result derived from it) is the same
# on each run of the script.
set.seed(7)

# Row indices of an 80% sample of the data, drawn with respect to Species.
# NOTE: despite its name, validation_index holds the rows that are KEPT for
# training; the validation rows are the ones it leaves out.
validation_index <- createDataPartition(dataset$Species, p = 0.8, list = FALSE)

# Remaining 20%: held back and only used for the final validation.
validation <- dataset[-validation_index, ]

# The sampled 80%: used from here on for training and cross-validation.
dataset <- dataset[validation_index, ]

# Numeric predictors: columns 1-4 of the training data.
x <- dataset[, 1:4]

# One boxplot per predictor, laid out side by side in a single row. Each
# panel is titled with the name of the column it shows (the subscript must be
# applied to the names vector, not to the value returned by boxplot()).
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  boxplot(x[, i], main = names(x)[i])
}
# Put the previous layout back so later base-graphics plots are not forced
# into the 1 x 4 grid.
par(old_par)

# Class labels: column 5 (Species) of the training data.
y <- dataset[[5]]

# Feature plots of the four predictors grouped by species, drawn three ways:
# "ellipse", "box" and "density".
featurePlot(x = x, y = y, plot = "ellipse")
featurePlot(x = x, y = y, plot = "box")

# The density plot gets independent ("free") x and y axis scales.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Resampling setup shared by every model fitted in this script: 10-fold
# cross-validation, with "Accuracy" as the metric used to pick the best
# tuning values.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Fit five classifiers on the training data with identical settings. The RNG
# is re-seeded with the same value before each call so that every model
# starts from the same random state.

# Linear discriminant analysis
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = control
)

# Classification and regression tree (CART)
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = dataset,
  method = "rpart",
  metric = metric,
  trControl = control
)

# k-nearest neighbours
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = dataset,
  method = "knn",
  metric = metric,
  trControl = control
)

# Support vector machine with a radial kernel
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = dataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)

# Random forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dataset,
  method = "rf",
  metric = metric,
  trControl = control
)

# Gather the resampling results of all five models and summarise them.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, cart, knn, svm, rf 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## lda  0.8333333 0.9375000 1.0000000 0.9666667 1.0000000    1    0
## cart 0.7500000 0.9166667 0.9166667 0.9166667 0.9791667    1    0
## knn  0.9166667 0.9166667 1.0000000 0.9666667 1.0000000    1    0
## svm  0.8333333 0.9166667 1.0000000 0.9583333 1.0000000    1    0
## rf   0.7500000 0.9166667 0.9583333 0.9416667 1.0000000    1    0
## 
## Kappa 
##       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## lda  0.750 0.90625 1.0000 0.9500 1.00000    1    0
## cart 0.625 0.87500 0.8750 0.8750 0.96875    1    0
## knn  0.875 0.87500 1.0000 0.9500 1.00000    1    0
## svm  0.750 0.87500 1.0000 0.9375 1.00000    1    0
## rf   0.625 0.87500 0.9375 0.9125 1.00000    1    0

# Dot plot of the collected resampling results.
dotplot(results)

# Resampling summary of the LDA model.
print(fit.lda)

## Linear Discriminant Analysis 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.9666667  0.95

# Resampling summary of the kNN model.
print(fit.knn)

## k-Nearest Neighbors 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa
##   5  0.9666667  0.950
##   7  0.9500000  0.925
##   9  0.9500000  0.925
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

# Resampling summary of the CART model.
print(fit.cart)

## CART 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa
##   0.00  0.9166667  0.875
##   0.45  0.7166667  0.575
##   0.50  0.3333333  0.000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.

# Resampling summary of the random forest model.
print(fit.rf)

## Random Forest 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa 
##   2     0.9416667  0.9125
##   3     0.9333333  0.9000
##   4     0.9416667  0.9125
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

# Resampling summary of the SVM model.
print(fit.svm)

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa 
##   0.25  0.9416667  0.9125
##   0.50  0.9583333  0.9375
##   1.00  0.9583333  0.9375
## 
## Tuning parameter 'sigma' was held constant at a value of 0.608061
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.608061 and C = 0.5.

# Predict the species of the held-back validation rows with the LDA model,
# then cross-tabulate the predictions against the true labels.
predictions <- predict(fit.lda, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500