#Iris_KNN_project
### The project implements the K-Nearest Neighbor (KNN) algorithm on the Iris dataset in the R programming language. The aim is to classify the species of Iris flowers from their four measurements (sepal length, sepal width, petal length, and petal width); the final plot shows the test-set predictions in the petal length / petal width plane. The code loads the necessary libraries, preprocesses the data, creates a train-test split, trains the KNN model, evaluates its performance, and visualizes the results using ggplot2. The project can be useful for learning how to implement machine learning algorithms on real-world datasets and how to visualize the results.
# Load the required libraries
library(ggplot2) # for data visualization
library(caret) # for creating the data partition
## Loading required package: lattice
library(class) # for implementing the KNN algorithm
# Quick look at the dataset ----
# Open the built-in help page for iris, then show per-column summary
# statistics.
help("iris")
summary(iris)
# Output captured from a previous run:
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
##        Species
##  setosa    :50
##  versicolor:50
##  virginica :50
# Load the iris dataset
data(iris)

# Split the data into training and test sets ----
# The split is made BEFORE scaling so that the test rows cannot influence the
# scaling parameters (scaling the full dataset first leaks test-set
# information into the training data).
set.seed(123) # set a seed for reproducibility
# 70/30 split for training and testing; list = FALSE yields row indices.
trainIndex <- createDataPartition(iris$Species, p = 0.7, list = FALSE)

# Scale the data ----
# Learn centre and spread from the training rows only, then apply that same
# transformation to every row of the four measurement columns.
train_scaled <- scale(iris[trainIndex, 1:4])
scaled_data <- scale(
  iris[, 1:4],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

train_data <- scaled_data[trainIndex, ]
train_labels <- iris$Species[trainIndex]
test_data <- scaled_data[-trainIndex, ]
test_labels <- iris$Species[-trainIndex]

# Train the KNN model ----
# k = 5 was chosen because it gave the highest accuracy among the values tried.
# NOTE(review): if those trials were scored on this same test set, the accuracy
# below is optimistic; consider choosing k by cross-validation on train_data.
predicted_labels <- knn(
  train = train_data, test = test_data, cl = train_labels, k = 5
)

# Evaluate model performance ----
# Accuracy = share of test rows whose predicted label equals the actual label.
accuracy <- mean(predicted_labels == test_labels)
accuracy
# A run that scaled with full-dataset statistics printed 0.9111111; re-run to
# obtain the figure for training-only scaling.
# Visualize results ----
# Test-set features plus the actual ("Species") and predicted labels.
iris_test <- data.frame(test_data, Species = test_labels)
iris_test$predicted <- predicted_labels

# Colours and shapes are named by species so each species keeps its own
# colour/shape even if some level is absent from the predictions (unnamed
# values are matched by position instead).
species_colors <- c(
  setosa = "#005E7D", versicolor = "#F4A460", virginica = "#4CAF50"
)
species_shapes <- c(setosa = 21, versicolor = 22, virginica = 24)

# Scatter plot of the test rows: colour = predicted species, shape = actual
# species. The plotted features are the scaled values, so the axis labels say
# "standardized" rather than implying raw measurements.
ggplot(
  iris_test,
  aes(x = Petal.Length, y = Petal.Width, color = predicted, shape = Species)
) +
  geom_point(size = 7) +
  scale_color_manual(values = species_colors) +
  scale_shape_manual(values = species_shapes) +
  theme_light() +
  # Legend at the bottom; grid lines removed.
  theme(
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "KNN Prediction on Iris Dataset",
    subtitle = "Predicted Species vs. Actual Species (Test Data)",
    x = "Petal Length (standardized)",
    y = "Petal Width (standardized)",
    color = "Predicted Species",
    shape = "Actual Species"
  )