A complete machine learning project example using the iris dataset

Step 1: Install Package

if (!require("caret")) {
  install.packages("caret", repos="http://cran.rstudio.com/") 
  library("caret")
}
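
featurePlot() used later depends on suggested packages such as ellipse. As an optional precaution (a sketch, not required by the steps below), caret can be installed together with its suggested dependencies:

# optional: also installs packages such as ellipse used by featurePlot()
install.packages("caret", dependencies = c("Depends", "Suggests"))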

Step 2: Load the Package and Data

data("iris")

# Copy the dataset into a new object named myfirstdataset
myfirstdataset <- iris

# Set the column names in the dataset
colnames(myfirstdataset) <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
names(myfirstdataset)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.length" "Petal.Width" 
## [5] "Species"
# Create a Validation Dataset
# We will split the loaded dataset in two: 80% to train our models and 20% to hold back as a validation dataset.

validation_index <- createDataPartition(myfirstdataset$Species,p=0.80,list = FALSE)
# validation_index holds the row indices of the 80% of the original dataset used for training.

validation <- myfirstdataset[-validation_index,]
# validation holds the remaining 20% of the original dataset for testing.

myfirstdataset <- myfirstdataset[validation_index,]
# keep the 80% training portion in myfirstdataset
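
# As a quick sanity check (optional, not part of the original output):
# createDataPartition() samples within each class, so both subsets should
# preserve the roughly equal class balance of iris.
table(myfirstdataset$Species)  # expect about 40 rows per class
table(validation$Species)      # expect about 10 rows per class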

Step 3: Summarize the Dataset

# Find the dimensions of the dataset
dim(myfirstdataset)
## [1] 120   5
# Check the types of the attributes
sapply(myfirstdataset, class)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"
# Take a look at the Data
head(myfirstdataset)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## 7          4.6         3.4          1.4         0.3  setosa
# Check the levels of the class variable
levels(myfirstdataset$Species)
## [1] "setosa"     "versicolor" "virginica"
# Look at the number of instances (rows) in each class, both as a frequency and as a percentage
percentage <- prop.table(table(myfirstdataset$Species)) * 100
cbind(freq = table(myfirstdataset$Species), percentage = percentage)
##            freq percentage
## setosa       40   33.33333
## versicolor   40   33.33333
## virginica    40   33.33333
# summarize attribute distributions
summary(myfirstdataset)
##   Sepal.Length    Sepal.Width     Petal.Length   Petal.Width  
##  Min.   :4.300   Min.   :2.000   Min.   :1.00   Min.   :0.10  
##  1st Qu.:5.100   1st Qu.:2.775   1st Qu.:1.50   1st Qu.:0.30  
##  Median :5.800   Median :3.000   Median :4.40   Median :1.30  
##  Mean   :5.853   Mean   :3.048   Mean   :3.76   Mean   :1.19  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.10   3rd Qu.:1.80  
##  Max.   :7.900   Max.   :4.400   Max.   :6.90   Max.   :2.50  
##        Species  
##  setosa    :40  
##  versicolor:40  
##  virginica :40  
##                 
##                 
## 
# We can see that all of the numeric attributes share the same unit (centimeters) and similar ranges, roughly [0, 8] centimeters.
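
# As an optional check (not in the original output), the shared scale can be
# confirmed numerically with the minimum and maximum of each attribute:
sapply(myfirstdataset[, 1:4], range)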

Step 4: Visualize the Dataset

# We start with some univariate plots, that is, plots of each individual variable.
# split input and output
x <- myfirstdataset[,1:4]
y <- myfirstdataset[,5]

# Given that the input variables are numeric, we can create box and whisker plots of each.
par(mfrow=c(1,4))
for (i in 1:4) {
  boxplot(x[, i], main = names(x)[i])
}
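
# Housekeeping (optional): restore the default single-panel layout so later
# plots are not squeezed into the 1x4 grid set above
par(mfrow = c(1, 1))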

# We can also create a barplot of the Species class variable to get a
# graphical representation of the class distribution (plot() on a factor
# draws a barplot)
plot(y)

### Multivariate plots

# Make an ellipse plot to show the interactions between the variables
# (plot = "ellipse" requires the ellipse package)
featurePlot(x=x,y=y,plot="ellipse")
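
# If the ellipse package is not available, a base-R fallback (a sketch, not
# part of the original run) is a scatterplot matrix colored by species:
pairs(x, col = as.integer(y))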

# Box-and-whisker plots of the iris data by class value
featurePlot(x=x,y=y,plot="box") 

# Density plots of each attribute by class value
scales <- list(x =list(relation="free"), y=list(relation="free"))
featurePlot(x=x,y=y,plot ="density",scales=scales)

Step 5: Set Up the Test Harness

# We will use 10-fold cross-validation to estimate accuracy
control <- trainControl(method = "cv",number = 10)
metric <- "Accuracy"
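
# A common variant (an assumption, not used below): repeated k-fold
# cross-validation averages the estimate over several reshuffles
# control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)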

Step 6: Build Models

Let’s evaluate five different algorithms: Linear Discriminant Analysis (LDA), Classification and Regression Trees (CART), a radial-kernel Support Vector Machine (SVM), k-Nearest Neighbors (kNN), and Random Forest (RF).

# a) Linear algorithms
# LDA
set.seed(7)
fit.lda <- train(Species~., data = myfirstdataset, method = "lda", metric = metric, trControl = control)

# b) Non-linear algorithms
# CART (note: the argument is method, not methods; passing methods="rpart"
# would be silently ignored and train() would fall back to its default model)
set.seed(7)
fit.cart <- train(Species~., data = myfirstdataset, method = "rpart", metric = metric, trControl = control)
# c) Advanced algorithms
# SVM with a radial kernel
set.seed(7)
fit.svm <- train(Species~., data = myfirstdataset, method = "svmRadial", metric = metric, trControl = control)

# kNN
set.seed(7)
fit.knn <- train(Species~., data = myfirstdataset, method = "knn", metric = metric, trControl = control)

# Random Forest
set.seed(7)
fit.rf <- train(Species~., data = myfirstdataset, method = "rf", metric = metric, trControl = control)
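
# Optional aside: caret's varImp() reports which predictors a fitted model
# relies on, e.g. for the random forest just trained:
varImp(fit.rf)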

Step 7: Select the Best Model

# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, cart, knn, svm, rf 
## Number of resamples: 10 
## 
## Accuracy 
##        Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## lda  0.9167  1.0000 1.0000 0.9833       1    1    0
## cart 0.8333  0.9167 0.9167 0.9417       1    1    0
## knn  0.8333  0.9375 1.0000 0.9667       1    1    0
## svm  0.8333  0.9167 1.0000 0.9583       1    1    0
## rf   0.8333  0.9167 0.9167 0.9417       1    1    0
## 
## Kappa 
##       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## lda  0.875  1.0000  1.000 0.9750       1    1    0
## cart 0.750  0.8750  0.875 0.9125       1    1    0
## knn  0.750  0.9062  1.000 0.9500       1    1    0
## svm  0.750  0.8750  1.000 0.9375       1    1    0
## rf   0.750  0.8750  0.875 0.9125       1    1    0
# We can see the accuracy of each classifier and also other metrics like Kappa:

# Compare Accuracy of models
dotplot(results)
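
# An alternative lattice view (optional) of the same resampling results:
bwplot(results)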

# The results of the LDA model can be summarized as follows
print(fit.lda)
## Linear Discriminant Analysis 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.9833333  0.975
## 
## 
# We can run the LDA model directly on the validation set and summarize the results in a confusion matrix.
# estimate skill of LDA on the validation data set
predictions <- predict(fit.lda, validation)
confusionMatrix(predictions,validation$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500
# We can see that the accuracy on the validation set is 96.67%, which falls within our expected margin of 97% +/- 4% from cross-validation. We therefore conclude that LDA is the best-fit model.
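
Finally, as an optional housekeeping step (a sketch with a hypothetical filename, not part of the original tutorial), the chosen model can be saved to disk and reloaded later without retraining:

# save the final model for later use
saveRDS(fit.lda, "iris_lda_model.rds")
# reload it later with:
# fit.lda <- readRDS("iris_lda_model.rds")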