# load libraries (packages must already be installed)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'readr' was built under R version 3.6.1
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # data manipulation
library(cluster) # clustering algorithms
## Warning: package 'cluster' was built under R version 3.6.1
library(factoextra) # clustering algorithms & visualization
## Warning: package 'factoextra' was built under R version 3.6.1
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Read the raw observations from the CSV file.
clusterdf <- read.csv(
  file = "C:/Users/Owner/Desktop/Lenin Files/Data Sciences/Assignments/clustering-data.csv"
)
# Drop every row that contains a missing value, then pass the remaining
# columns through scale() with its default settings.
clusterdfnew <- clusterdf %>%
  na.omit() %>%
  scale()
# Show the first rows of the scaled data as a quick sanity check.
head(clusterdfnew)
## x y
## 1 -0.8482235 1.561107
## 2 -0.5415045 1.561107
## 3 0.4586659 1.561107
## 4 0.8187273 1.561107
## 5 1.1254462 1.561107
## 6 1.1387818 1.561107
# a. scatter plot of the raw (unscaled) data
library(ggplot2)
# Base plot: column x on the horizontal axis, column y on the vertical axis.
scatterclus <- ggplot(data = clusterdf, mapping = aes(x = x, y = y))
# Add the points, a linear-model smoother with a faint confidence band,
# and the axis/legend labels.
scatterclustpl <- scatterclus +
  geom_point() +
  geom_smooth(method = "lm", alpha = 0.1) +
  labs(x = "Value of X", y = "Value of y", colour = "label")
scatterclustpl
# c. get distance between points and plot k on x-axis and average distance on y-axis
# Distance matrix of the scaled observations (get_dist() defaults). In the
# visible script it is only referenced by the commented-out heatmap below.
clusterdist <- get_dist(clusterdfnew)
# fviz_dist(clusterdist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# kmeans() draws its starting centres at random; fix the seed so the fits and
# the cluster plots below are identical on every run.
set.seed(123)

# Fit k-means with `k` centres on the scaled data using 25 random starts.
fit_kmeans <- function(k) {
  kmeans(clusterdfnew, centers = k, nstart = 25)
}

# Point plot of the cluster assignment in `fit`, titled "k = <k>".
plot_kmeans <- function(fit, k) {
  fviz_cluster(fit, geom = "point", data = clusterdfnew) +
    ggtitle(paste("k =", k))
}

# One fit per candidate number of clusters (k = 2 ... 12).
kcluster2 <- fit_kmeans(2)
kcluster3 <- fit_kmeans(3)
kcluster4 <- fit_kmeans(4)
kcluster5 <- fit_kmeans(5)
kcluster6 <- fit_kmeans(6)
kcluster7 <- fit_kmeans(7)
kcluster8 <- fit_kmeans(8)
kcluster9 <- fit_kmeans(9)
kcluster10 <- fit_kmeans(10)
kcluster11 <- fit_kmeans(11)
kcluster12 <- fit_kmeans(12)

# One cluster plot per fit; these are arranged into grids further down.
pc2 <- plot_kmeans(kcluster2, 2)
pc3 <- plot_kmeans(kcluster3, 3)
pc4 <- plot_kmeans(kcluster4, 4)
pc5 <- plot_kmeans(kcluster5, 5)
pc6 <- plot_kmeans(kcluster6, 6)
pc7 <- plot_kmeans(kcluster7, 7)
pc8 <- plot_kmeans(kcluster8, 8)
pc9 <- plot_kmeans(kcluster9, 9)
pc10 <- plot_kmeans(kcluster10, 10)
pc11 <- plot_kmeans(kcluster11, 11)
pc12 <- plot_kmeans(kcluster12, 12)
# b. create scatter plots to compare
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# First panel: cluster plots for k = 2 ... 7, laid out in two rows.
grid.arrange(
  grobs = list(pc2, pc3, pc4, pc5, pc6, pc7),
  nrow = 2
)
# Second panel: cluster plots for k = 8 ... 12, laid out in two rows.
grid.arrange(
  grobs = list(pc8, pc9, pc10, pc11, pc12),
  nrow = 2
)
set.seed(123)
# Total within-cluster sum of squares of a k-means fit.
#
# Arguments:
#   k       number of clusters to fit.
#   data    numeric matrix or data frame to cluster; defaults to the scaled
#           data `clusterdfnew` built earlier in this script (looked up when
#           the function is called).
#   nstart  number of random starts passed to kmeans().
#
# Returns the `tot.withinss` component of the kmeans() result.
wss <- function(k, data = clusterdfnew, nstart = 10) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}
# Compute and plot wss for k = 2 to k = 12
k.values <- 2:12
# Total within-cluster sum of squares for each candidate k in 2-12.
wss_values <- map_dbl(k.values, wss)
# Elbow plot: points joined by lines, solid dots, no box around the plot.
# `frame.plot` is spelled out instead of relying on partial matching of `frame`.
plot(
  k.values, wss_values,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# verify optimal number of clusters using fviz_nbclust function
# Use the same upper bound on k (12) and the same number of random starts (10)
# as the manual elbow computation, so the two curves are comparable; the extra
# `nstart` argument is forwarded to kmeans().
set.seed(123)
fviz_nbclust(clusterdfnew, kmeans, method = "wss", k.max = 12, nstart = 10)
### Looking at the graph, the elbow point is determined to be 4.