Assignment8.3.rmd

# load libraries
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.1

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.0       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0

## Warning: package 'readr' was built under R version 3.6.1

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)  # data manipulation
library(cluster)    # clustering algorithms

## Warning: package 'cluster' was built under R version 3.6.1

library(factoextra) # clustering algorithms & visualization

## Warning: package 'factoextra' was built under R version 3.6.1

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

# read csv file and remove any missing values
# Load the raw clustering data from its CSV file.
clusterdf <- read.csv(
  file = "C:/Users/Owner/Desktop/Lenin Files/Data Sciences/Assignments/clustering-data.csv"
)

# Drop every row that contains a missing value, then standardise the columns
# with scale() so the result can be fed to the clustering steps further down.
clusterdfnew <- clusterdf %>%
  na.omit() %>%
  scale()

# Preview the first rows of the scaled data.
head(clusterdfnew)

##            x        y
## 1 -0.8482235 1.561107
## 2 -0.5415045 1.561107
## 3  0.4586659 1.561107
## 4  0.8187273 1.561107
## 5  1.1254462 1.561107
## 6  1.1387818 1.561107

# a. plot the data using a scatter plot
library(ggplot2)
# Base plot: map the raw (unscaled) x and y columns onto the two axes.
scatterclus <- ggplot(data = clusterdf, mapping = aes(x = x, y = y))

# Add the points, a linear-model smoother with a faint confidence band,
# and the axis/legend labels.
scatterclustpl <- scatterclus +
  geom_point() +
  geom_smooth(method = "lm", alpha = 0.1) +
  labs(
    x = "Value of X",
    y = "Value of y",
    colour = "label"
  )

# Render the finished scatter plot.
scatterclustpl

# c. get the distance between points and plot k on the x-axis and the average distance on the y-axis
# Pairwise distances between the scaled observations. Only the (commented-out)
# fviz_dist() heatmap below consumes this object.
clusterdist <- get_dist(clusterdfnew)
# fviz_dist(clusterdist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# kmeans() chooses its starting centres at random, so seed the RNG first;
# without this every run of the script can produce different clusterings
# and therefore different plots.
set.seed(123)

# Fit k-means once per candidate cluster count, using 25 random starts each.
cluster_counts <- 2:12
kcluster_fits <- lapply(cluster_counts, function(k) {
  kmeans(clusterdfnew, centers = k, nstart = 25)
})
names(kcluster_fits) <- paste0("kcluster", cluster_counts)

# Build one cluster plot per fit, titled with the k it was fitted for.
cluster_plots <- Map(
  function(fit, k) {
    fviz_cluster(fit, geom = "point", data = clusterdfnew) +
      ggtitle(paste0("k = ", k))
  },
  kcluster_fits,
  cluster_counts
)
names(cluster_plots) <- paste0("pc", cluster_counts)

# Publish kcluster2..kcluster12 and pc2..pc12 as individual objects, because
# the rest of the script refers to the fits and plots by those names.
# invisible() stops the returned environment from being printed.
invisible(list2env(c(kcluster_fits, cluster_plots), envir = environment()))

# b. create scatter plots to compare 
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Compare the clusterings side by side in two figures, each laid out in
# two rows: first k = 2..7, then k = 8..12.
grid.arrange(grobs = list(pc2, pc3, pc4, pc5, pc6, pc7), nrow = 2)

grid.arrange(grobs = list(pc8, pc9, pc10, pc11, pc12), nrow = 2)

# Fix the RNG seed so the randomly started k-means runs in the elbow
# computation below give the same result on every run.
set.seed(123)

# Total within-cluster sum of squares of a k-means fit with `k` clusters.
#
# Args:
#   k:      number of clusters to fit.
#   data:   numeric matrix or data frame to cluster. Defaults to the scaled
#           `clusterdfnew`, so existing `wss(k)` calls behave as before.
#   nstart: number of random starts handed to kmeans() (default 10).
#
# Returns:
#   The `tot.withinss` component of the fitted model (a single number).
wss <- function(k, data = clusterdfnew, nstart = 10) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}

# Candidate cluster counts for the elbow analysis: k = 2 through k = 12.
k.values <- seq.int(from = 2L, to = 12L)

# Total within-cluster sum of squares for each candidate k (2-12 clusters).
wss_values <- map_dbl(.x = k.values, .f = wss)

# Elbow plot: number of clusters on the x-axis against the total
# within-cluster sum of squares on the y-axis, drawn as connected points.
plot(
  x = k.values,
  y = wss_values,
  type = "b",
  pch = 19,
  frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# verify optimal number of clusters using fviz_nbclust function
# Re-seed so the k-means runs inside fviz_nbclust() are reproducible.
set.seed(seed = 123)

# Let factoextra draw its own within-sum-of-squares ("wss") elbow plot
# for the scaled data, as a cross-check of the manual plot.
fviz_nbclust(x = clusterdfnew, FUNcluster = kmeans, method = "wss")

### Looking at the graph, the elbow point is determined to be 4.