R Markdown

library(datasets)
dataset <- USArrests    # assumption: USArrests, since the code below uses UrbanPop, Murder, and US state row names
str(dataset)
row.names(dataset)

## Data Preprocessing

sum(!complete.cases(dataset))   # count rows with any missing value
summary(dataset)

## Remove or Impute Missing Observations

df <- na.omit(dataset)    # drop rows containing missing values

## Rescale (Normalization)

df <- scale(df, center = TRUE, scale = TRUE)   # center each column to mean 0 and rescale to sd 1
summary(df)

## Standardization

apply(dataset, 2, sd)    # column standard deviations before scaling
apply(dataset, 2, mean)  # column means before scaling
apply(df, 2, sd)         # after scaling, every column has sd = 1

## Distance Function and Visualization

library(factoextra)
distance <- get_dist(df)    # pairwise distances (Euclidean by default)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

## K-means

km_output <- kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")   # 25 random starts; the best run is kept
str(km_output)
names(km_output)       # cluster, centers, totss, withinss, tot.withinss, betweenss, size, ...
typeof(km_output)      # a list
length(km_output)
km_output$cluster      # cluster assignment for each observation

## Cluster Validation and Evaluation

Objective function: the sum of squared errors (SSE) measures cluster cohesion, i.e. how tightly the observations sit around their cluster centroids:

$$SSE = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2$$

SSE can be used to compare clustering performance only for the same number of clusters, because SSE inevitably shrinks as more clusters are added.
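To connect the formula to the kmeans output, here is a minimal sketch that recomputes SSE directly from the cluster assignments and centroids (sse_manual is an illustrative name, not part of the original notes):

sse_manual <- sum(sapply(1:2, function(i) {            # loop over the 2 clusters
  members <- df[km_output$cluster == i, , drop = FALSE]
  sum(sweep(members, 2, km_output$centers[i, ])^2)     # squared distances to centroid i
}))
all.equal(sse_manual, km_output$tot.withinss)          # should be TRUE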

km_output$totss                                   # total sum of squares
km_output$withinss                                # within-cluster sum of squares, one value per cluster
km_output$betweenss                               # between-cluster sum of squares
sum(c(km_output$withinss, km_output$betweenss))   # equals totss
cohesion <- sum(km_output$withinss) / km_output$totss
cohesion
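The complementary ratio based on betweenss (labelled separation here only for this note) describes how well the clusters are pulled apart; by construction it sums with cohesion to 1:

separation <- km_output$betweenss / km_output$totss   # fraction of total SS between clusters
separation                                            # cohesion + separation = 1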

## Visualize Clusters

fviz_cluster(km_output, data = df)
library(dplyr)
library(ggplot2)

df %>%
  as.data.frame() %>%
  mutate(cluster = km_output$cluster, state = row.names(dataset)) %>%   # assumption: row names are US state names
  ggplot(aes(x = UrbanPop, y = Murder, color = factor(cluster), label = state)) +
  geom_text()

## Put Cluster Output on the Map

cluster_df <- data.frame(state = tolower(row.names(dataset)), cluster = unname(km_output$cluster))
library(maps)
states_map <- map_data("state")    # lower-48 US state polygons; region names are lowercase
states_map %>%
  left_join(cluster_df, by = c("region" = "state")) %>%
  ggplot() +
  geom_polygon(aes(x = long, y = lat, fill = as.factor(cluster), group = group), color = "white") +
  coord_fixed(1.3) +
  guides(fill = "none") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())

## Elbow Method to Decide the Optimal Number of Clusters

set.seed(8)
wss <- function(k) {
  kmeans(df, k, nstart = 25)$tot.withinss
}
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
     type = "b", frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-cluster sum of squares")
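factoextra (already loaded above) can draw the same wss-based elbow plot in one call; a sketch of the equivalent, assuming extra arguments such as nstart are passed through to kmeans:

fviz_nbclust(df, kmeans, method = "wss", k.max = 15, nstart = 25)   # same elbow criterion as the loop above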

## Hierarchical Clustering

hac_output <- hclust(dist(dataset, method = "euclidean"), method = "complete")   # pairwise Euclidean distances, merged with complete linkage
plot(hac_output)    # dendrogram of the hierarchical clustering
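Because complete linkage always merges the two clusters whose largest pairwise distance is smallest, the height of the final merge in the dendrogram equals the largest Euclidean distance between the two top-level branches. A minimal check (d and branches are throwaway names for this illustration):

d <- as.matrix(dist(dataset, method = "euclidean"))
branches <- cutree(hac_output, 2)                       # the two top-level branches of the dendrogram
all.equal(max(hac_output$height), max(d[branches == 1, branches == 2]))   # should be TRUE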

## Output the Desired Number of Clusters after Modeling

hac_cut <- cutree(hac_output, 2)    # cut the dendrogram into 2 clusters
for (i in 1:length(hac_cut)) {
  # print observations whose hierarchical label differs from their k-means label
  # (both methods number clusters arbitrarily, so labels 1 and 2 may simply be swapped)
  if (hac_cut[i] != km_output$cluster[i]) print(names(hac_cut)[i])
}
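Since the two algorithms number their clusters arbitrarily, a cross-tabulation gives a label-free view of how closely the partitions agree:

table(hac = hac_cut, kmeans = km_output$cluster)   # off-diagonal counts show disagreement (up to label swapping)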