# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
# Bring the built-in iris data frame into the workspace
data("iris")
# Preview the first few rows as a styled table
iris %>%
  head() %>%
  kable() %>%
  kable_styling()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa |
# Count NA cells across the whole data frame
sum(is.na(iris))
## [1] 0
# Make sure the response column is a factor (a no-op when it already is one)
iris[["Species"]] <- as.factor(iris[["Species"]])
# Per-column summary: quantiles and mean for numeric columns, counts per level
# for the factor
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Derive a shape feature: petal length expressed as a fraction of sepal length
iris <- mutate(iris, Petal.Sepal.Ratio = Petal.Length / Sepal.Length)
# Preview the data frame with the derived column appended
iris %>%
  head() %>%
  kable() %>%
  kable_styling()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | Petal.Sepal.Ratio |
|---|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0.2745098 |
| 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0.2857143 |
| 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0.2765957 |
| 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0.3260870 |
| 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0.2800000 |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa | 0.3148148 |
# Scatter plot: sepal length (x) against petal length (y), one colour per species
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, colour = Species)
) +
  geom_point(size = 3) +
  labs(
    title = "Sepal Length vs Petal Length by Species",
    x = "Sepal Length",
    y = "Petal Length"
  ) +
  theme_minimal()

# Box plot: distribution of the derived petal/sepal length ratio per species
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Sepal.Ratio, fill = Species)
) +
  geom_boxplot() +
  labs(
    title = "Petal to Sepal Length Ratio by Species",
    x = "Species",
    y = "Petal to Sepal Length Ratio"
  ) +
  theme_minimal()

# Fix the RNG state so the partition and the forest are repeatable
set.seed(123)
# Hold out 30% of the rows for testing; the partition is drawn on Species and
# list = FALSE returns the selected row indices as a matrix rather than a list
trainIndex <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
trainData <- iris[trainIndex, ]
testData <- iris[-trainIndex, ]
# Fit a classification forest using every other column as a predictor;
# importance = TRUE keeps the variable-importance measures on the model
rfModel <- randomForest(
  formula = Species ~ .,
  data = trainData,
  importance = TRUE
)
# Print the fitted forest (call, OOB error estimate, OOB confusion matrix)
rfModel
##
## Call:
## randomForest(formula = Species ~ ., data = trainData, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 5.71%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 35 0 0 0.00000000
## versicolor 0 32 3 0.08571429
## virginica 0 3 32 0.08571429
# Fit a support vector classifier on the same training split, leaving every
# tuning argument at e1071's defaults
svmModel <- svm(formula = Species ~ ., data = trainData)
# Print the fitted model (call, SVM type, kernel, cost, support-vector count)
svmModel
##
## Call:
## svm(formula = Species ~ ., data = trainData)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 43
# Score the held-out rows with the fitted forest
rfPred <- predict(object = rfModel, newdata = testData)
# Cross-tabulate predicted against actual species and derive caret's
# accuracy statistics
rfConfMat <- confusionMatrix(data = rfPred, reference = testData$Species)
rfConfMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 14 2
## virginica 0 1 13
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.8173, 0.986)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9333 0.8667
## Specificity 1.0000 0.9333 0.9667
## Pos Pred Value 1.0000 0.8750 0.9286
## Neg Pred Value 1.0000 0.9655 0.9355
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3111 0.2889
## Detection Prevalence 0.3333 0.3556 0.3111
## Balanced Accuracy 1.0000 0.9333 0.9167
# Score the held-out rows with the fitted SVM
svmPred <- predict(object = svmModel, newdata = testData)
# Cross-tabulate predicted against actual species and derive caret's
# accuracy statistics
svmConfMat <- confusionMatrix(data = svmPred, reference = testData$Species)
svmConfMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 14 2
## virginica 0 1 13
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.8173, 0.986)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9333 0.8667
## Specificity 1.0000 0.9333 0.9667
## Pos Pred Value 1.0000 0.8750 0.9286
## Neg Pred Value 1.0000 0.9655 0.9355
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3111 0.2889
## Detection Prevalence 0.3333 0.3556 0.3111
## Balanced Accuracy 1.0000 0.9333 0.9167
# Chart the variable-importance measures kept on the forest
# (it was fitted with importance = TRUE)
varImpPlot(x = rfModel, main = "Random Forest - Feature Importance")

# Score previously unseen flowers with the Random Forest model (taken here as
# the model to deploy)
newData <- data.frame(
  Sepal.Length = c(5.1, 6.2),
  Sepal.Width = c(3.5, 3.4),
  Petal.Length = c(1.4, 4.5),
  Petal.Width = c(0.2, 1.5)
)
# Compute the derived predictor with the same formula used to build the
# training feature, instead of typing rounded values by hand, so it can never
# drift from the measurements it is derived from
newData <- newData %>%
  mutate(Petal.Sepal.Ratio = Petal.Length / Sepal.Length)
# Predict species; newdata must carry every predictor the forest was fitted on
deployedPred <- predict(rfModel, newdata = newData)
newData$PredictedSpecies <- deployedPred
# Display the inputs alongside the predicted class
kable(newData) %>% kable_styling()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Petal.Sepal.Ratio | PredictedSpecies |
|---|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | 0.274 | setosa |
| 6.2 | 3.4 | 4.5 | 1.5 | 0.726 | versicolor |
# Use 10-fold cross-validation as the resampling scheme
control <- trainControl(method = "cv", number = 10)
# Fit a random forest through caret on the full data set; caret scores each
# candidate mtry under the resampling scheme configured above
cvModel <- train(
  Species ~ .,
  data = iris,
  method = "rf",
  trControl = control
)
# Print the resampling results and the selected tuning value
cvModel
## Random Forest
##
## 150 samples
## 5 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9533333 0.93
## 3 0.9533333 0.93
## 5 0.9333333 0.90
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.