if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
library(tidyverse)
# load data and use Company column as row names
utilities.df <- mlba::Utilities %>%
column_to_rownames("Company")
# compute Euclidean distance
# (to compute other distance measures, change the value in the method argument)
d <- dist(utilities.df, method = "euclidean")
# normalize input variables
utilities.df.norm <- scale(utilities.df)
# compute normalized distance based on Sales and Fuel Cost
d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean")
library(ggplot2)
library(ggdendro)
d.norm <- dist(utilities.df.norm, method="euclidean")
# in hclust() set argument method \galit{to}
# "ward.D", "single", "complete", "average", "median", or "centroid"
hc1 <- hclust(d.norm, method="single")
plot(hc1, hang=-1, ann=FALSE) # use baseR
ggdendrogram(hc1) # use ggdendro package (shown in figure below)
hc2 <- hclust(d.norm, method="average")
plot(hc2, hang=-1, ann=FALSE)
ggdendrogram(hc2)
library(gridExtra)
addCutline <- function(g, hc, ncluster) {
heights <- rev(hc$height)
cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1])
return (g + geom_hline(yintercept=cut_at, color='red', linetype=2))
}
g1 <- ggdendrogram(hc1)
g2 <- ggdendrogram(hc2)
grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2)
g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-dendrograms.pdf"),
g, width=5, height=8, units="in")
#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-dendrograms.pdf'
memb <- cutree(hc1, k = 6)
memb
#> Arizona Boston Central Commonwealth NY Florida
#> 1 1 2 1 3 1
#> Hawaiian Idaho Kentucky Madison Nevada New England
#> 1 4 1 1 5 1
#> Northern Oklahoma Pacific Puget San Diego Southern
#> 1 1 1 4 6 1
#> Texas Wisconsin United Virginia
#> 1 1 1 1
memb <- cutree(hc2, k = 6)
memb
#> Arizona Boston Central Commonwealth NY Florida
#> 1 2 1 2 3 1
#> Hawaiian Idaho Kentucky Madison Nevada New England
#> 4 5 1 2 5 4
#> Northern Oklahoma Pacific Puget San Diego Southern
#> 2 1 4 5 6 1
#> Texas Wisconsin United Virginia
#> 1 2 4 2
# set labels as cluster membership and utility name
row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "")
# plot heatmap
heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust)
# grey scale
# rev() reverses the color mapping to large = dark
heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust,
col=rev(paste("gray",1:99,sep="")))
pdf(file=file.path(getwd(), "figures", "chapter_16", "utilities-heatmap.pdf"),
width=5, height=5)
#> Error in pdf(file = file.path(getwd(), "figures", "chapter_16", "utilities-heatmap.pdf"), : cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-heatmap.pdf'
heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust)
dev.off()
#> null device
#> 1
set.seed(123) # set random seed for reproducability
# load and preprocess data
utilities.df <- mlba::Utilities %>%
column_to_rownames("Company")
# normalized distance:
utilities.df.norm <- scale(utilities.df)
# run kmeans algorithm
km <- kmeans(utilities.df.norm, 6)
# show cluster membership
sort(km$cluster)
#> Boston Hawaiian New England Pacific San Diego United
#> 1 1 1 1 1 1
#> Central Florida Oklahoma Texas Arizona Kentucky
#> 2 2 2 2 3 3
#> Southern Virginia NY Commonwealth Madison Northern
#> 3 3 4 5 5 5
#> Wisconsin Idaho Nevada Puget
#> 5 6 6 6
# centroids
km$centers
#> Fixed_charge RoR Cost Load_factor Demand_growth Sales
#> 1 -0.61834147 -0.6252226 0.2019400 1.1482980 0.056364166 -0.7402978
#> 2 0.73659004 1.0755719 -1.5095844 -0.6225467 -0.999249196 0.5537221
#> 3 0.08622291 0.1286230 -0.1804218 -0.1181922 0.355677321 0.1450583
#> 4 2.03732429 -0.8628882 0.5782326 -1.2950193 -0.718643112 -1.5814284
#> 5 0.04557497 0.5742460 0.2383554 -0.2975182 -0.005101929 -0.5853676
#> 6 -0.60027572 -0.8331800 1.3389101 -0.4805802 0.991717777 1.8565214
#> Nuclear Fuel_Cost
#> 1 -0.3722028 1.1759426
#> 2 -0.3796469 -0.3991693
#> 3 -0.3186056 -0.2278866
#> 4 0.2143888 1.6926380
#> 5 1.7389316 -0.8356930
#> 6 -0.7146294 -0.9657660
# within-cluster sum of squares
km$withinss
#> [1] 21.187976 12.200433 10.773938 0.000000 5.830596 9.533522
# cluster size
km$size
#> [1] 6 4 4 1 4 3
library(GGally)
centroids <- data.frame(km$centers)
centroids['Cluster'] = paste('Cluster', seq(1, 6))
ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) +
scale_color_viridis_d() +
labs(x='Variable', y='Value')
ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-clusterProfile.pdf"),
last_plot() + theme_bw(), width=8.5, height=3.2, units="in")
#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-clusterProfile.pdf'
result <- tibble()
for (k in 1:6) {
km <- kmeans(utilities.df.norm, k)
result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss)))
}
ggplot(result, aes(x=k, y=average_withinss)) +
geom_line() +
geom_point() +
labs(y="Average within-cluster squared distance",
x=expression(paste("Number of clusters ", italic("k")))) +
theme_bw()
ggsave(file=file.path(getwd(), "figures", "chapter_16", "utilities-ellbow.pdf"),
last_plot(), width=4, height=4, units="in")
#> Error in grDevices::pdf(file = filename, ..., version = version): cannot open file 'C:/mlba/rmdFiles/mlba-Rmd/figures/chapter_16/utilities-ellbow.pdf'
dist(km$centers)
#> 1 2 3 4 5
#> 2 4.946218
#> 3 5.522029 4.417278
#> 4 3.554242 4.125795 3.196652
#> 5 4.992803 3.499350 2.313238 3.235940
#> 6 4.735629 3.753375 2.951934 3.019693 2.924783