Iris_KNN.knit

#Iris_KNN_project

### This project implements the K-Nearest Neighbor (KNN) algorithm on the Iris dataset in the R programming language. The aim is to classify the species of Iris flowers based on their four sepal and petal measurements. The code loads the necessary libraries, preprocesses the data, creates a train-test split, trains the KNN model, evaluates its performance, and visualizes the results (plotted against petal length and width) using ggplot2. The project can be useful for learning how to implement machine learning algorithms on real-world datasets and visualize the results.

# Load the required libraries
library(ggplot2)   # for data visualization
library(caret)     # for creating the data partition

## Loading required package: lattice

library(class)     # for implementing the KNN algorithm

# Inspect the dataset: open its help page, then print a summary of each column
help("iris")
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Load the iris dataset
data(iris)

# Split the data into training and test sets
# The split is drawn before scaling so that the scaling parameters can be
# estimated from the training rows alone.
set.seed(123)   # set a seed for reproducibility
# create a 70/30 split for training and testing (per the caret documentation,
# sampling is done within each level of the factor, i.e. within each species)
trainIndex <- createDataPartition(iris$Species, p = 0.7, list = FALSE)

# Scale the data
# Column means and standard deviations come from the training rows only and
# are then applied to every row. Fitting them on the full dataset would leak
# test-set information into the features the model is evaluated on.
# NOTE: outputs recorded elsewhere in this document (e.g. the accuracy) were
# produced with scaling fitted on the full dataset and may change slightly
# when the document is re-run.
train_features <- iris[trainIndex, 1:4]
train_center <- colMeans(train_features)
train_spread <- vapply(train_features, sd, numeric(1))
scaled_data <- scale(iris[, 1:4], center = train_center, scale = train_spread)

# Features (scaled) and labels for the training and the held-out test rows
train_data <- scaled_data[trainIndex, ]
train_labels <- iris$Species[trainIndex]
test_data <- scaled_data[-trainIndex, ]
test_labels <- iris$Species[-trainIndex]

# Classify the test rows with KNN
# k = 5 was kept by the author after trying several values of k and getting
# the highest accuracy with 5.
predicted_labels <- knn(
  train = train_data,
  test = test_data,
  cl = train_labels,
  k = 5
)

# Evaluate model performance
# Accuracy = number of test rows whose predicted species equals the actual
# species, divided by the number of test rows.
is_correct <- predicted_labels == test_labels
accuracy <- sum(is_correct) / length(test_labels)
accuracy

## [1] 0.9111111

# Visualize results
# Combine the (scaled) test features with the actual species ("Species") and
# the predicted species ("predicted") in one data frame for plotting.
iris_test <- data.frame(
  test_data,
  Species = test_labels,
  predicted = predicted_labels
)

# Name the manual scale values by species. Unnamed values are matched by
# position, so a species that never appears among the predictions would shift
# the remaining colours/shapes onto the wrong species.
species_colors <- c(
  setosa = "#005E7D",
  versicolor = "#F4A460",
  virginica = "#4CAF50"
)
species_shapes <- c(setosa = 21, versicolor = 22, virginica = 24)

# Scatter plot of the test rows: x = petal length, y = petal width,
# colour = predicted species, shape = actual species. A point whose colour
# and shape disagree is a misclassification.
ggplot(
  iris_test,
  aes(x = Petal.Length, y = Petal.Width, color = predicted, shape = Species)
) +
  geom_point(size = 7) +
  scale_color_manual(values = species_colors) +   # colour per species
  scale_shape_manual(values = species_shapes) +   # shape per species
  theme_light() +
  # put the legend at the bottom and remove the grid lines
  theme(
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # the plotted values are the scaled features, so the axes say so
  labs(
    title = "KNN Prediction on Iris Dataset",
    subtitle = "Predicted Species vs. Actual Species (Test Data)",
    x = "Petal Length (standardized)",
    y = "Petal Width (standardized)",
    color = "Predicted Species",
    shape = "Actual Species"
  )