Iris Dataset Analysis and Model Deployment

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(randomForest)

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(e1071)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Bring the iris data frame into the workspace
data("iris")

# Preview the leading rows as a styled table
iris %>%
  head() %>%
  kable() %>%
  kable_styling()

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5.1	3.5	1.4	0.2	setosa
4.9	3.0	1.4	0.2	setosa
4.7	3.2	1.3	0.2	setosa
4.6	3.1	1.5	0.2	setosa
5.0	3.6	1.4	0.2	setosa
5.4	3.9	1.7	0.4	setosa

# Total number of NA cells, counted column by column and then summed
sum(vapply(iris, function(column) sum(is.na(column)), integer(1)))

## [1] 0

# Make sure the response column is a factor (a no-op when it already is one)
iris[["Species"]] <- as.factor(iris[["Species"]])

# Per-column descriptive statistics and class counts
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Engineered feature: petal length expressed as a fraction of sepal length
iris[["Petal.Sepal.Ratio"]] <- iris[["Petal.Length"]] / iris[["Sepal.Length"]]

# Preview the augmented data frame as a styled table
iris %>%
  head() %>%
  kable() %>%
  kable_styling()

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species	Petal.Sepal.Ratio
5.1	3.5	1.4	0.2	setosa	0.2745098
4.9	3.0	1.4	0.2	setosa	0.2857143
4.7	3.2	1.3	0.2	setosa	0.2765957
4.6	3.1	1.5	0.2	setosa	0.3260870
5.0	3.6	1.4	0.2	setosa	0.2800000
5.4	3.9	1.7	0.4	setosa	0.3148148

# Sepal length against petal length, one colour per species
ggplot(data = iris) +
  geom_point(
    mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species),
    size = 3
  ) +
  ggtitle("Sepal Length vs Petal Length by Species") +
  xlab("Sepal Length") +
  ylab("Petal Length") +
  theme_minimal()

# Distribution of the petal/sepal length ratio within each species
ggplot(data = iris) +
  geom_boxplot(
    mapping = aes(x = Species, y = Petal.Sepal.Ratio, fill = Species)
  ) +
  ggtitle("Petal to Sepal Length Ratio by Species") +
  xlab("Species") +
  ylab("Petal to Sepal Length Ratio") +
  theme_minimal()

# Fix the RNG state so the partition and the forest below are repeatable
set.seed(123)

# Hold out 30% of the rows: trainIndex selects the training rows, and its
# complement becomes the test set
trainIndex <- createDataPartition(y = iris[["Species"]], p = 0.7, list = FALSE)
trainData <- iris[trainIndex, ]
testData <- iris[-trainIndex, ]

# Fit a Random Forest classifier on every other column of trainData;
# importance = TRUE keeps the importance measures used for plotting later
rfModel <- randomForest(Species ~ ., data = trainData, importance = TRUE)

# Print the fitted model (OOB error estimate and confusion matrix)
rfModel

## 
## Call:
##  randomForest(formula = Species ~ ., data = trainData, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.71%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         35          0         0  0.00000000
## versicolor      0         32         3  0.08571429
## virginica       0          3        32  0.08571429

# Train a Support Vector Machine classifier on the training split, using every
# other column of trainData as a predictor. No kernel or cost is passed, so the
# package defaults apply (the printed summary reports a radial kernel, cost 1).
svmModel <- svm(Species ~ ., data = trainData)

# Display model summary (auto-printed because this is a top-level expression)
svmModel

## 
## Call:
## svm(formula = Species ~ ., data = trainData)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  43

# Score the held-out rows with the Random Forest
rfPred <- predict(object = rfModel, newdata = testData)

# Cross-tabulate predictions against the true labels and print the statistics
rfConfMat <- confusionMatrix(
  data = rfPred,
  reference = testData[["Species"]]
)
rfConfMat

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         14         2
##   virginica       0          1        13
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9333         
##                  95% CI : (0.8173, 0.986)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9            
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9333           0.8667
## Specificity                 1.0000            0.9333           0.9667
## Pos Pred Value              1.0000            0.8750           0.9286
## Neg Pred Value              1.0000            0.9655           0.9355
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3111           0.2889
## Detection Prevalence        0.3333            0.3556           0.3111
## Balanced Accuracy           1.0000            0.9333           0.9167

# Score the held-out rows with the SVM
svmPred <- predict(object = svmModel, newdata = testData)

# Cross-tabulate predictions against the true labels and print the statistics
svmConfMat <- confusionMatrix(
  data = svmPred,
  reference = testData[["Species"]]
)
svmConfMat

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         14         2
##   virginica       0          1        13
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9333         
##                  95% CI : (0.8173, 0.986)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9            
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9333           0.8667
## Specificity                 1.0000            0.9333           0.9667
## Pos Pred Value              1.0000            0.8750           0.9286
## Neg Pred Value              1.0000            0.9655           0.9355
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3111           0.2889
## Detection Prevalence        0.3333            0.3556           0.3111
## Balanced Accuracy           1.0000            0.9333           0.9167

# Chart the importance measures stored in the fitted Random Forest
varImpPlot(x = rfModel, main = "Random Forest - Feature Importance")

# Deploy the best model (let's assume Random Forest) for predictions on new data.
# Only the raw measurements are entered by hand; the engineered feature is
# derived from them below.
newData <- data.frame(
  Sepal.Length = c(5.1, 6.2),
  Sepal.Width = c(3.5, 3.4),
  Petal.Length = c(1.4, 4.5),
  Petal.Width = c(0.2, 1.5)
)

# Compute Petal.Sepal.Ratio with the same formula used to build the training
# data (Petal.Length / Sepal.Length) rather than typing rounded values, so the
# model is scored on exactly the feature it was trained on.
newData$Petal.Sepal.Ratio <- newData$Petal.Length / newData$Sepal.Length

# Predict species using the deployed Random Forest model
deployedPred <- predict(rfModel, newdata = newData)
newData$PredictedSpecies <- deployedPred

# Display the predictions
kable(newData) %>% kable_styling()

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Petal.Sepal.Ratio	PredictedSpecies
5.1	3.5	1.4	0.2	0.274	setosa
6.2	3.4	4.5	1.5	0.726	versicolor

# 10-fold cross-validation of a Random Forest over the full data frame
control <- trainControl(method = "cv", number = 10)
cvModel <- train(
  Species ~ .,
  data = iris,
  method = "rf",
  trControl = control
)

# Print the resampling summary for each candidate mtry
cvModel

## Random Forest 
## 
## 150 samples
##   5 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa
##   2     0.9533333  0.93 
##   3     0.9533333  0.93 
##   5     0.9333333  0.90 
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Iris Dataset Analysis and Model Deployment

Prof. Y.Mohammed Iqbal

2024-08-23