if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}

Measuring Distance Between Two Records

Euclidean Distance

library(tidyverse)
# load data and use Company column as row names
utilities.df <- mlba::Utilities %>%
  column_to_rownames("Company")

# compute Euclidean distance
# (to compute other distance measures, change the value in the method argument)
d <- dist(utilities.df, method = "euclidean")

Normalizing Numerical Variables

# normalize input variables
utilities.df.norm <- scale(utilities.df)

# compute normalized distance based on Sales and Fuel Cost
d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean")

Hierarchical (Agglomerative) Clustering

Dendrograms: Displaying Clustering Process and Results

library(ggplot2)
library(ggdendro)
d.norm <- dist(utilities.df.norm, method="euclidean")

# in hclust() set argument method \galit{to}
# "ward.D", "single", "complete", "average", "median", or "centroid"
hc1 <- hclust(d.norm, method="single")
plot(hc1, hang=-1, ann=FALSE)  # use baseR

ggdendrogram(hc1)      # use ggdendro package (shown in figure below)

hc2 <- hclust(d.norm, method="average")
plot(hc2, hang=-1, ann=FALSE)

ggdendrogram(hc2)

library(gridExtra)
addCutline <- function(g, hc, ncluster) {
  heights <- rev(hc$height)
  cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1])
  return (g + geom_hline(yintercept=cut_at, color='red', linetype=2))
}
g1 <- ggdendrogram(hc1)
g2 <- ggdendrogram(hc2)
grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2)

g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-dendrograms.pdf"),
       g, width=5, height=8, units="in")

#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-dendrograms.pdf'

memb <- cutree(hc1, k = 6)
memb

#>     Arizona       Boston      Central  Commonwealth           NY     Florida  
#>            1            1            2            1            3            1 
#>    Hawaiian         Idaho     Kentucky     Madison        Nevada  New England 
#>            1            4            1            1            5            1 
#>     Northern     Oklahoma     Pacific         Puget    San Diego     Southern 
#>            1            1            1            4            6            1 
#>        Texas    Wisconsin       United     Virginia 
#>            1            1            1            1

memb <- cutree(hc2, k = 6)
memb

#>     Arizona       Boston      Central  Commonwealth           NY     Florida  
#>            1            2            1            2            3            1 
#>    Hawaiian         Idaho     Kentucky     Madison        Nevada  New England 
#>            4            5            1            2            5            4 
#>     Northern     Oklahoma     Pacific         Puget    San Diego     Southern 
#>            2            1            4            5            6            1 
#>        Texas    Wisconsin       United     Virginia 
#>            1            2            4            2

Validating Clusters

# set labels as cluster membership and utility name
row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "")

# plot heatmap
heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust)

# grey scale
# rev() reverses the color mapping to large = dark
heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust,
        col=rev(paste("gray",1:99,sep="")))

pdf(file=file.path(getwd(), "figures", "chapter_16", "utilities-heatmap.pdf"),
    width=5, height=5)

#> Error in pdf(file = file.path(getwd(), "figures", "chapter_16", "utilities-heatmap.pdf"), : cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-heatmap.pdf'

heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust)

dev.off()

#> null device 
#>           1

Non-hierarchical Clustering: The

Choosing the Number of Clusters (\(k\))

set.seed(123) # set random seed for reproducability
# load and preprocess data
utilities.df <- mlba::Utilities %>%
  column_to_rownames("Company")

# normalized distance:
utilities.df.norm <- scale(utilities.df)

# run kmeans algorithm
km <- kmeans(utilities.df.norm, 6)

# show cluster membership
sort(km$cluster)

#>      Boston     Hawaiian   New England     Pacific     San Diego       United 
#>            1            1            1            1            1            1 
#>     Central      Florida      Oklahoma        Texas     Arizona      Kentucky 
#>            2            2            2            2            3            3 
#>     Southern     Virginia           NY Commonwealth     Madison      Northern 
#>            3            3            4            5            5            5 
#>    Wisconsin        Idaho       Nevada        Puget 
#>            5            6            6            6

# centroids
  km$centers

#>   Fixed_charge        RoR       Cost Load_factor Demand_growth      Sales
#> 1  -0.61834147 -0.6252226  0.2019400   1.1482980   0.056364166 -0.7402978
#> 2   0.73659004  1.0755719 -1.5095844  -0.6225467  -0.999249196  0.5537221
#> 3   0.08622291  0.1286230 -0.1804218  -0.1181922   0.355677321  0.1450583
#> 4   2.03732429 -0.8628882  0.5782326  -1.2950193  -0.718643112 -1.5814284
#> 5   0.04557497  0.5742460  0.2383554  -0.2975182  -0.005101929 -0.5853676
#> 6  -0.60027572 -0.8331800  1.3389101  -0.4805802   0.991717777  1.8565214
#>      Nuclear  Fuel_Cost
#> 1 -0.3722028  1.1759426
#> 2 -0.3796469 -0.3991693
#> 3 -0.3186056 -0.2278866
#> 4  0.2143888  1.6926380
#> 5  1.7389316 -0.8356930
#> 6 -0.7146294 -0.9657660

  # within-cluster sum of squares
  km$withinss

#> [1] 21.187976 12.200433 10.773938  0.000000  5.830596  9.533522

  # cluster size
  km$size

#> [1] 6 4 4 1 4 3

library(GGally)
centroids <- data.frame(km$centers)
centroids['Cluster'] = paste('Cluster', seq(1, 6))

ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) +
    scale_color_viridis_d() +
    labs(x='Variable', y='Value')

ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-clusterProfile.pdf"),
         last_plot() + theme_bw(), width=8.5, height=3.2, units="in")

#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-clusterProfile.pdf'

result <- tibble()
for (k in 1:6) {
  km <- kmeans(utilities.df.norm, k)
  result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss)))
}

ggplot(result, aes(x=k, y=average_withinss)) +
  geom_line() +
  geom_point() +
  labs(y="Average within-cluster squared distance",
       x=expression(paste("Number of clusters ", italic("k")))) +
  theme_bw()

ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-ellbow.pdf"),
       last_plot(), width=4, height=4, units="in")

#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-ellbow.pdf'

dist(km$centers)

#>          1        2        3        4        5
#> 2 4.946218                                    
#> 3 5.522029 4.417278                           
#> 4 3.554242 4.125795 3.196652                  
#> 5 4.992803 3.499350 2.313238 3.235940         
#> 6 4.735629 3.753375 2.951934 3.019693 2.924783

Machine Learning for Business Analytics

Chapter 16: Cluster Analysis

Measuring Distance Between Two Records

Euclidean Distance

Normalizing Numerical Variables

Hierarchical (Agglomerative) Clustering

Dendrograms: Displaying Clustering Process and Results

Validating Clusters

Non-hierarchical Clustering: The

Choosing the Number of Clusters (\(k\))