Project steps:
1. Define problem.
2. Prepare data.
3. Evaluate algorithms.
4. Improve results.
5. Present results.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
data(iris)
dataset <- iris
# createDataPartition() returns row indices for the 80% training split
train_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
test <- dataset[-train_index, ]
train <- dataset[train_index, ]
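Note that createDataPartition() samples at random, so the exact split changes on every run; the output below reflects one such run. A minimal sketch of making the split reproducible (the seed value 123 is an arbitrary choice, not part of the original tutorial):
# set a seed before partitioning so the 80/20 split is reproducible
set.seed(123)  # arbitrary illustrative seed
train_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)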
# Dimensions of the train and test datasets
dim(train)
## [1] 120 5
dim(test)
## [1] 30 5
#Types of attributes
sapply(train, class)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## "numeric" "numeric" "numeric" "numeric" "factor"
# Peek at the first 6 rows of the data
head(train)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#levels for the class (target)
levels(train$Species)
## [1] "setosa" "versicolor" "virginica"
#summarize the class distribution
percentage <- prop.table(table(train$Species)) * 100
cbind(freq = table(train$Species), percentage = percentage)
## freq percentage
## setosa 40 33.33333
## versicolor 40 33.33333
## virginica 40 33.33333
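Because createDataPartition() stratifies on Species, the held-out test set should show the same one-third balance; a quick check (not part of the original tutorial):
# the stratified split should preserve the class balance in the test set too
prop.table(table(test$Species)) * 100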
#Statistical Summary
summary(train)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.200 Min. :1.100 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.575 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.300 Median :1.300
## Mean :5.847 Mean :3.057 Mean :3.772 Mean :1.197
## 3rd Qu.:6.400 3rd Qu.:3.400 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.200 Max. :6.900 Max. :2.500
## Species
## setosa :40
## versicolor:40
## virginica :40
##
##
##
#Split into features and target
X_train <- train[,1:4]
y_train <- train[,5]
# Boxplots for each attribute on one image
par(mfrow = c(1, 4))
for (i in 1:4) {
  boxplot(X_train[, i], main = names(train)[i])
}
par(mfrow = c(1, 1))  # reset the plotting layout
#Barplot for class breakdown
plot(y_train)
# Box-and-whisker plots for each attribute, broken down by class
featurePlot(X_train, y_train, plot="box")
# Scatterplot matrix, colored by class
featurePlot(X_train, y_train, plot="pairs", auto.key = list(columns = 3))
#Density Plots for each attribute by class value
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(X_train, y_train, plot="density", scales=scales, auto.key= list(columns = 3))
# Run algorithms using 10-fold cross-validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"
#Build models
#(a) linear algorithms
set.seed(7)
fit.lda <- train(Species~., data=train, method="lda", metric=metric, trControl=control)
#(b) nonlinear algorithms
#CART
set.seed(7)
fit.cart <- train(Species~., data=train, method="rpart", metric=metric, trControl=control)
#kNN
set.seed(7)
fit.knn <- train(Species~., data=train, method="knn", metric=metric, trControl=control)
#(c) advanced algorithms
#SVM
set.seed(7)
fit.svm <- train(Species~., data=train, method="svmRadial", metric=metric, trControl=control)
#Random Forest
set.seed(7)
fit.rf <- train(Species~., data=train, method="rf", metric=metric, trControl=control)
#Select best model
# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.8333333 1.0000000 1.0000000 0.9750000 1.0000000 1 0
## cart 0.8333333 0.9166667 0.9166667 0.9166667 0.9166667 1 0
## knn 0.9166667 0.9166667 0.9583333 0.9583333 1.0000000 1 0
## svm 0.8333333 0.9166667 0.9166667 0.9333333 0.9791667 1 0
## rf 0.8333333 0.9166667 0.9583333 0.9500000 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.750 1.000 1.0000 0.9625 1.00000 1 0
## cart 0.750 0.875 0.8750 0.8750 0.87500 1 0
## knn 0.875 0.875 0.9375 0.9375 1.00000 1 0
## svm 0.750 0.875 0.8750 0.9000 0.96875 1 0
## rf 0.750 0.875 0.9375 0.9250 1.00000 1 0
# compare accuracy of models
dotplot(results)
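dotplot() is one of several lattice methods that caret defines for resamples objects; bwplot() gives the same comparison as box-and-whisker plots:
# alternative view: box-and-whisker plots of the resampling distributions
bwplot(results)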
# Summarize the best model
print(fit.knn)
## k-Nearest Neighbors
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9500000 0.9250
## 7 0.9583333 0.9375
## 9 0.9500000 0.9250
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
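By default, train() tries only three values of k (5, 7, and 9 above). A wider search can be requested with tuneGrid; a sketch assuming odd k from 1 to 21 (the range is an arbitrary choice):
# widen the kNN search beyond caret's default grid of three k values
set.seed(7)
fit.knn.wide <- train(Species ~ ., data = train, method = "knn", metric = metric,
                      trControl = control, tuneGrid = expand.grid(k = seq(1, 21, by = 2)))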
# Estimate the skill of kNN on the held-out test set
predictions <- predict(fit.knn, test)
confusionMatrix(predictions, test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 0 10
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8843, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 4.857e-15
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
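Once the final model is chosen, it can be saved for later use with base R serialization; a minimal sketch (the file name is an arbitrary choice):
# persist the fitted model and reload it to score new observations
saveRDS(fit.knn, "iris_knn_model.rds")
model <- readRDS("iris_knn_model.rds")
predict(model, newdata = test[1:3, ])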
Brownlee, J. (2019, October 7). Your first machine learning project in R step-by-step. Machine Learning Mastery. Retrieved February 15, 2021, from https://machinelearningmastery.com/machine-learning-in-r-step-by-step/