# The most popular package for KNN models is `class`; k-means is available as kmeans() in the base `stats` package:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:purrr':
##
## compact
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(class)
# Load the built-in iris data set into the working data frame
data <- datasets::iris
# Print summary statistics for every column of the data set
summary(data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Histogram for each species
# The four measurement columns are reshaped to long form (one row per
# flower/attribute pair) so that all attributes share a single value axis.
# pivot_longer() replaces the superseded gather().
data %>%
  pivot_longer(cols = 1:4, names_to = "Attributes", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = Attributes)) +
  # bins is set to the value stat_bin() would otherwise fall back to (30):
  # the plot is unchanged and the "Pick better value" message is not emitted
  geom_histogram(colour = "black", bins = 30) +
  facet_wrap(~Species) +
  theme_bw() +
  labs(
    x = "Values", y = "Frequency",
    title = "Iris data set",
    subtitle = "Histogram for each species"
  ) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom"
  )

# Density plot for each attribute, with one semi-transparent curve per species
# The four measurement columns are reshaped to long form with pivot_longer()
# (the replacement for the superseded gather()) so each attribute gets its
# own facet; scales = "free_x" lets every facet use its own value range.
data %>%
  pivot_longer(cols = 1:4, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Species)) +
  geom_density(colour = "black", alpha = 0.5) +
  facet_wrap(~Attributes, scales = "free_x") +
  labs(
    x = "Values", y = "Density",
    title = "Iris data set",
    subtitle = "Density plot for each attribute"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  )

# Count the flowers per species, print the counts, then draw them as a pie chart
species_counts <- table(data$Species)
species_counts
##
## setosa versicolor virginica
## 50 50 50
pie(
  x = species_counts,
  main = "Pie Chart of the Iris data set Species",
  col = c("orange1", "chocolate", "coral"),
  radius = 1
)

# Pairwise scatter plots and correlations of the four measurements,
# coloured by species (copied into a `Cluster` column for the legend name)
pairs_input <- cbind(data, Cluster = as.factor(data$Species))
ggpairs(
  data = pairs_input,
  mapping = aes(colour = Cluster, alpha = 0.5),
  columns = 1:4,
  lower = list(continuous = "points"),
  axisLabels = "none",
  switch = "both"
) +
  theme_bw()

# Summary statistics of Sepal.Length, computed separately for each species
aggregate(Sepal.Length ~ Species, data = data, FUN = summary)
## Species Sepal.Length.Min. Sepal.Length.1st Qu. Sepal.Length.Median
## 1 setosa 4.300 4.800 5.000
## 2 versicolor 4.900 5.600 5.900
## 3 virginica 4.900 6.225 6.500
## Sepal.Length.Mean Sepal.Length.3rd Qu. Sepal.Length.Max.
## 1 5.006 5.200 5.800
## 2 5.936 6.300 7.000
## 3 6.588 6.900 7.900
# Box plots of every measurement by species, drawn on a 2 x 2 grid.
# par() returns the settings it replaces; they are restored after the last
# plot so the multi-panel layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
# Fill colour for each measurement column, in plotting order
box_colours <- c(
  Sepal.Length = "lightpink3",
  Sepal.Width = "antiquewhite1",
  Petal.Length = "lightskyblue4",
  Petal.Width = "orange1"
)
for (feature in names(box_colours)) {
  boxplot(
    # Builds the formula `<feature> ~ Species`
    reformulate("Species", response = feature),
    data = data,
    # "Sepal.Length" -> "Sepal Length wrt Species"
    main = paste(sub(".", " ", feature, fixed = TRUE), "wrt Species"),
    col = box_colours[[feature]]
  )
}
par(old_par)

# True species of every row, kept as a factor. Both the training and the test
# labels are taken from this vector so that knn() predicts species names that
# can be compared directly with data_test_labels (a numeric re-encoding of the
# training labels would yield predictions "1"/"2"/"3" instead).
row_labels <- data[, 5]
# Standardise the four measurements (mean 0, sd 1). KNN is distance based, so
# without scaling the columns with the largest numeric range would dominate
# the distance between two flowers.
feature_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
data[, feature_cols] <- scale(data[, feature_cols])
# Split into train and test 80/20; the seed makes the random split repeatable
set.seed(123)
size <- floor(0.8 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = size)
train_labels <- row_labels[train_ind]
data_train <- data[train_ind, feature_cols]
data_test <- data[-train_ind, feature_cols]
data_test_labels <- row_labels[-train_ind]
# Fit the KNN model: classify every test row from its 11 nearest training rows
predictions <- knn(
  train = data_train,
  test = data_test,
  cl = train_labels,
  k = 11
)
# Pair the four test-set measurements with the predicted class for plotting.
# row.names = NULL discards the original row indices in favour of 1..n.
measure_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
plot_predictions <- data.frame(
  data_test[, measure_cols],
  predicted = predictions,
  row.names = NULL
)
# Visualize the KNN algorithm results.

# Scatter plot of two columns of `predictions_df`, with points coloured by the
# `predicted` column and each point annotated with its entry of `true_labels`.
#   predictions_df: data frame holding x_var, y_var and a `predicted` column
#   true_labels:    one label per row of predictions_df, drawn next to the point
#   x_var, y_var:   column names (strings) for the two axes
#   title:          plot title
# Returns the ggplot object.
plot_knn_pair <- function(predictions_df, true_labels, x_var, y_var, title) {
  ggplot(
    predictions_df,
    aes(.data[[x_var]], .data[[y_var]], color = predicted, fill = predicted)
  ) +
    geom_point(size = 5) +
    geom_text(aes(label = true_labels), hjust = 1, vjust = 2) +
    # Label the axes with the plain column names
    labs(x = x_var, y = y_var) +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(legend.position = "none")
}

p1 <- plot_knn_pair(
  plot_predictions, data_test_labels,
  "Petal.Length", "Petal.Width",
  "Predicted relationship between Petal Length and Width"
)
# The sepal title previously ended in "... and Sepal"; it now names the width
p2 <- plot_knn_pair(
  plot_predictions, data_test_labels,
  "Sepal.Length", "Sepal.Width",
  "Predicted relationship between Sepal Length and Width"
)
# Show the petal and sepal views side by side
grid.arrange(p1, p2, ncol = 2)

# True species label of every flower
iris.labels <- iris$Species
# Number of flowers per species
table(iris.labels)
## iris.labels
## setosa versicolor virginica
## 50 50 50
# The four numeric measurement columns; Species (column 5) is left out
iris_data <- iris[1:4]
# Scale data: centre every column to mean 0 and rescale it to unit standard
# deviation so that each measurement carries equal weight in distances
iris_data_scale <- scale(iris_data)
# Distance: pairwise distances between the scaled observations.
# Stored under its own name instead of overwriting iris_data with an object
# of a different type; the clustering calls below take iris_data_scale.
iris_dist <- dist(iris_data_scale)
# Choose the number of clusters with the elbow method:
# plot the within-cluster sum of squares ("wss") for a range of k values
fviz_nbclust(x = iris_data_scale, FUNcluster = kmeans, method = "wss") +
  labs(subtitle = "Elbow Method")

# Fit k-means with 3 centres on the scaled data, using 100 random starts,
# then print the fitted model
km.out <- kmeans(x = iris_data_scale, centers = 3, nstart = 100)
print(km.out)
## K-means clustering with 3 clusters of sizes 53, 47, 50
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.05005221 -0.88042696 0.3465767 0.2805873
## 2 1.13217737 0.08812645 0.9928284 1.0141287
## 3 -1.01119138 0.85041372 -1.3006301 -1.2507035
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
## [75] 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
## [112] 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
## [149] 2 1
##
## Within cluster sum of squares by cluster:
## [1] 44.08754 47.45019 47.35062
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualize the clustering algorithm results.
# Cluster assignment of every observation
km.clusters <- km.out$cluster
# Name each row "<species>_<row number>" so plotted points can be identified
point_ids <- paste(iris$Species, seq_len(nrow(iris)), sep = "_")
rownames(iris_data_scale) <- point_ids
fviz_cluster(list(data = iris_data_scale, cluster = km.clusters))

# Cross-tabulate cluster membership against the true species
table(km.clusters, iris$Species)
##
## km.clusters setosa versicolor virginica
## 1 0 39 14
## 2 0 11 36
## 3 50 0 0