#### Table 15.2: Distance Matrix Between Pairs of the First Five Utilities, Using Euclidean Distance
utilities.df <- read.csv("~/Box/Teaching (jmmejia@iu.edu)/2020 - K-513/Public Files/Public Data/Utilities.csv")
# set row names to the utilities column
row.names(utilities.df) <- utilities.df[,1]
# remove the utility column
utilities.df <- utilities.df[,-1]
# compute Euclidean distance
# (to compute other distance measures, change the value in method = )
d <- dist(utilities.df, method = "euclidean")
d
## Arizona Boston Central Commonwealth NY
## Boston 3989.40808
## Central 140.40286 4125.04413
## Commonwealth 2654.27763 1335.46650 2789.75967
## NY 5777.16767 1788.06803 5912.55291 3123.15322
## Florida 2050.52944 6039.68908 1915.15515 4704.36310 7827.42921
## Hawaiian 1435.26502 2554.28716 1571.29540 1219.56001 4342.09380
## Idaho 4006.10419 7994.15599 3872.25763 6659.53457 9782.15818
## Kentucky 671.27635 3318.27656 807.92079 1983.31435 5106.09415
## Madison 2622.69900 1367.09063 2758.55966 43.64889 3155.09559
## Nevada 8364.03105 12353.06270 8229.22328 11018.05781 14141.02258
## New England 2923.13610 1066.57943 3058.70743 271.45273 2854.09948
## Northern 1899.27982 2091.16049 2035.44152 756.83195 3879.16746
## Oklahoma 598.55663 4586.30256 461.34167 3250.98459 6373.74325
## Pacific 2609.04536 1380.74996 2744.50285 56.64463 3168.17746
## Puget 6914.74206 10903.14646 6780.43031 9568.43443 12691.15511
## San Diego 3363.06163 629.76075 3498.11301 710.29296 2414.69876
## Southern 1063.00907 5052.33167 928.74925 3717.20296 6840.15029
## Texas 4430.25159 8419.61054 4295.01469 7084.37284 10207.39263
## Wisconsin 1790.48565 2199.72167 1925.77256 864.27315 3987.33596
## United 2427.58887 1562.21081 2563.63736 232.47687 3350.07312
## Virginia 1016.61769 5005.08126 883.53546 3670.01819 6793.03530
## Florida Hawaiian Idaho Kentucky Madison
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian 3485.67156
## Idaho 1959.73108 5440.46178
## Kentucky 2721.70630 764.08319 4676.63838
## Madison 4672.82929 1187.94114 6627.29178 1951.62858
## Nevada 6314.35909 9799.01555 4359.59960 9035.00749 10986.09801
## New England 4973.50684 1488.01491 6928.32617 2252.02672 304.27703
## Northern 3949.09232 466.55912 5903.39545 1228.43633 724.09618
## Oklahoma 1454.29260 2032.61425 3412.26397 1269.10210 3219.82511
## Pacific 4659.35626 1174.07562 6614.49924 1938.02656 53.30140
## Puget 4866.11165 8349.36644 2909.01468 7585.46729 9536.24219
## San Diego 5413.09300 1928.44148 7368.81544 2692.21236 744.25367
## Southern 988.04456 2498.14902 2943.53557 1734.10330 3685.51009
## Texas 2380.12497 5865.44719 447.82867 5101.41414 7052.72388
## Wisconsin 3840.22794 358.47629 5795.95881 1119.94001 833.47299
## United 4478.02887 992.45325 6432.13220 1756.37897 199.22840
## Virginia 1035.98148 2451.18516 2989.96398 1687.23603 3638.09755
## Nevada New England Northern Oklahoma Pacific
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England 11287.00691
## Northern 10262.15729 1026.48299
## Oklahoma 7768.38479 3519.97756 2496.63889
## Pacific 10973.01095 314.35403 713.66505 3205.74888
## Puget 1452.16201 9837.28183 8812.30356 6319.93384 9523.41350
## San Diego 11727.06629 442.13276 1466.99195 3959.24075 754.61209
## Southern 7301.04086 3986.10243 2961.83475 470.16479 3672.03540
## Texas 3934.61752 7353.37915 6328.91795 3834.01226 7039.26207
## Wisconsin 10154.11879 1134.14501 119.98126 2386.94275 820.16430
## United 10791.04927 496.68741 531.47633 3024.95235 186.38865
## Virginia 7348.04902 3939.10035 2914.20499 428.06526 3625.11887
## Puget San Diego Southern Texas Wisconsin
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego 10277.66038
## Southern 5851.89331 4426.04189
## Texas 2488.43222 7793.08395 3367.31887
## Wisconsin 8704.72128 1573.40838 2853.29878 6220.29673
## United 9341.12661 938.52273 3490.42292 6857.73586 640.78677
## Virginia 5898.57696 4379.21182 59.32529 3414.83146 2806.16571
## United
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego
## Southern
## Texas
## Wisconsin
## United
## Virginia 3443.24097
# normalize input variables
utilities.df.norm <- sapply(utilities.df, scale)
# add row names: utilities
row.names(utilities.df.norm) <- row.names(utilities.df)
# compute normalized distance based on variables Sales (col 6) and Fuel_Cost (col 8)
d.norm <- dist(utilities.df.norm[,c(6,8)], method = "euclidean")
d.norm
## Arizona Boston Central Commonwealth NY Florida
## Boston 2.0103293
## Central 0.7741795 1.4657027
## Commonwealth 0.7587375 1.5828208 1.0157104
## NY 3.0219066 1.0133700 2.4325285 2.5719693
## Florida 1.2444219 1.7923968 0.6318918 1.6438566 2.6355728
## Hawaiian 1.8852481 0.7402833 1.1560922 1.7460268 1.4116954 1.2288047
## Idaho 1.2656380 3.1766540 1.7327770 2.0032300 4.1625615 1.7641233
## Kentucky 0.4612918 1.5577377 0.4192538 0.6299937 2.5664387 1.0256629
## Madison 0.7386496 1.7196319 1.1022872 0.1387579 2.7054453 1.7225099
## Nevada 2.3694792 3.7565131 2.3759746 3.1060838 4.5970059 1.9715184
## New England 2.4259752 0.6843933 1.7373219 2.1538314 0.8462906 1.8313804
## Northern 0.5646572 1.9401658 1.1134329 0.3770043 2.9386369 1.6986240
## Oklahoma 0.1826480 2.1660781 0.8550928 0.9373890 3.1745882 1.2436342
## Pacific 1.5707796 0.4783340 0.9877719 1.2588346 1.4620188 1.3431847
## Puget 1.9476675 3.5013904 2.0656431 2.6990600 4.3974331 1.7675811
## San Diego 2.5090434 0.6796342 1.8367621 2.2029297 0.7156293 1.9534230
## Southern 0.9136210 1.6344254 0.2764402 1.2785143 2.5584087 0.3667437
## Texas 1.2479759 2.8905601 1.4281594 1.9988179 3.8311318 1.2779197
## Wisconsin 0.5214913 1.6542554 0.8389668 0.2434079 2.6617861 1.4524174
## United 2.7617447 1.1005949 2.0348238 2.5471162 0.9525069 2.0164926
## Virginia 1.2523502 1.4792607 0.5103653 1.5020926 2.3286909 0.3138469
## Hawaiian Idaho Kentucky Madison Nevada New England
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho 2.8601888
## Kentucky 1.4368218 1.6504169
## Madison 1.8803606 1.9502960 0.6976742
## Nevada 3.1853105 1.4795256 2.5506890 3.1056271
## New England 0.6081070 3.4587708 1.9663226 2.2925313 3.7723588
## Northern 2.0272242 1.7084093 0.6945241 0.2671984 2.9230227 2.4804563
## Oklahoma 1.9970362 1.0834492 0.6084009 0.9086653 2.2119896 2.5541088
## Pacific 0.5609973 2.7055789 1.1108538 1.3972404 3.2933099 0.8980935
## Puget 2.9958483 0.9920924 2.1804959 2.6862147 0.4875080 3.5988456
## San Diego 0.7260955 3.5637271 2.0480980 2.3416442 3.8992116 0.1306629
## Southern 1.2050342 1.6586708 0.6589960 1.3557857 2.1455853 1.8093545
## Texas 2.4632271 0.6000891 1.4932742 1.9866254 1.1333108 3.0711777
## Wisconsin 1.7112561 1.7788126 0.4267801 0.2740610 2.8627558 2.1724729
## United 0.8799342 3.7204215 2.3086131 2.6853401 3.8879179 0.4178660
## Virginia 0.9294143 1.9807148 0.9291409 1.5995869 2.2848031 1.5364364
## Northern Oklahoma Pacific Puget San Diego Southern
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma 0.7110504
## Pacific 1.5825914 1.7167394
## Puget 2.4878919 1.7806564 3.0271161
## San Diego 2.5387195 2.6421549 0.9589052 3.7209695
## Southern 1.3368874 0.9442949 1.1600171 1.8672352 1.9200350
## Texas 1.7932874 1.0834486 2.4122780 0.7003127 3.1859416 1.2727841
## Wisconsin 0.3161598 0.7026837 1.2762004 2.4562721 2.2346324 1.0857741
## United 2.8612926 2.8766457 1.2885630 3.7630655 0.4401629 2.0620666
## Virginia 1.6236142 1.2965484 1.0350276 2.0693140 1.6554978 0.3562984
## Texas Wisconsin United
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego
## Southern
## Texas
## Wisconsin 1.7561356
## United 3.2884604 2.5490405
## Virginia 1.5415758 1.3433064 1.7499296
A dendrogram is a treelike diagram that summarizes the process of clustering. On the x-axis are the records. Similar records are joined by lines whose vertical length reflects the distance between the records. Figure 15.3 shows the dendrograms that result from clustering the 22 utilities using all eight normalized measurements and Euclidean distance, once with single linkage (top) and once with average linkage (bottom).
#### Figure 15.3
# compute normalized distance based on all 8 variables
d.norm <- dist(utilities.df.norm, method = "euclidean")
# in hclust() set argument method =
# to "ward.D", "single", "complete", "average", "median", or "centroid"
hc1 <- hclust(d.norm, method = "single")
plot(hc1, hang = -1, ann = TRUE)
hc2 <- hclust(d.norm, method = "average")
plot(hc2, hang = -1, ann = TRUE)
By choosing a cutoff distance on the y-axis, a set of clusters is created. Visually, this means drawing a horizontal line on a dendrogram. Records with connections below the horizontal line (that is, their distance is smaller than the cutoff distance) belong to the same cluster. For example, setting the cutoff distance to 2.7 on the single linkage dendrogram in Figure 15.3 (top) results in six clusters. The six clusters are (from left to right on the dendrogram):
{NY}, {Nevada}, {San Diego}, {Idaho, Puget}, {Central}, {Others}.
If we want six clusters using average linkage, we can choose a cutoff distance of 3.5. The resulting six clusters are slightly different.
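The cutoff can also be drawn directly on the dendrogram. A minimal sketch, assuming hc1 from Figure 15.3 is still in the workspace; rect.hclust() additionally draws boxes around the resulting clusters:
# redraw the single linkage dendrogram, mark the 2.7 cutoff, box the clusters
plot(hc1, hang = -1, ann = TRUE)
abline(h = 2.7, lty = 2)
rect.hclust(hc1, k = 6, border = "grey")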
Membership for six (or any other number of) clusters can be computed in R by applying the function cutree() to the hclust output. Table 15.6 shows the results for the single linkage and the average linkage clustering with six clusters. Each record is assigned a cluster number. While some records remain in the same cluster under both methods (e.g., Arizona, Florida, Kentucky, Oklahoma, Texas), others change.
#### Table 15.6
memb <- cutree(hc1, k = 6)
memb
## Arizona Boston Central Commonwealth NY Florida
## 1 1 2 1 3 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 1 4 1 1 5 1
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 1 1 4 6 1
## Texas Wisconsin United Virginia
## 1 1 1 1
memb <- cutree(hc2, k = 6)
memb
## Arizona Boston Central Commonwealth NY Florida
## 1 2 1 2 3 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 4 5 1 2 5 4
## Northern Oklahoma Pacific Puget San Diego Southern
## 2 1 4 5 6 1
## Texas Wisconsin United Virginia
## 1 2 4 2
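To see directly which records switch clusters between the two methods, the two membership vectors can be cross-tabulated; a minimal sketch:
# cross-tabulate single linkage vs. average linkage memberships (k = 6)
table(single = cutree(hc1, k = 6), average = cutree(hc2, k = 6))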
One important goal of cluster analysis is to come up with meaningful clusters. Since many choices are made along the way (e.g., distance measure, linkage method, number of clusters), it is important to make sure that the resulting clusters are valid, in the sense that they really generate some insight. To see whether the cluster analysis is useful, consider each of the following aspects:
- Cluster interpretability. Is the interpretation of the resulting clusters reasonable? To interpret the clusters, explore the characteristics of each cluster by:
  - obtaining summary statistics (e.g., average, min, max) from each cluster on each measurement that was used in the cluster analysis;
  - examining the clusters for separation along some common feature (variable) that was not used in the cluster analysis;
  - labeling the clusters: based on the interpretation, trying to assign a name or label to each cluster.
- Cluster stability. Do cluster assignments change significantly if some of the inputs are altered slightly? Another way to check stability is to partition the data and see how well clusters formed based on one part apply to the other part (see the sketch after this list). To do this:
  1. Cluster partition A.
  2. Use the cluster centroids from A to assign each record in partition B (each record is assigned to the cluster with the closest centroid).
  3. Assess how consistent the cluster assignments are compared to the assignments based on all the data.
- Cluster separation. Examine the ratio of between-cluster variation to within-cluster variation to see whether the separation is reasonable. There exist statistical tests for this task (an F-ratio), but their usefulness is somewhat controversial.
- Number of clusters. The number of resulting clusters must be useful, given the purpose of the analysis. For example, suppose the goal of the clustering is to identify categories of customers and assign labels to them for market segmentation purposes. If the marketing department can only manage to sustain three different marketing presentations, it would probably not make sense to identify more than three clusters.
Returning to the utilities example, we notice that both methods (single and average linkage) identify {NY} and {San Diego} as singleton clusters. Also, both dendrograms imply that a reasonable number of clusters in this dataset is four. One insight that can be derived from the average linkage clustering is that clusters tend to group geographically. The four non-singleton clusters form (approximately) a southern group, a northern group, an east/west seaboard group, and a west group.
We can further characterize each of the clusters by examining the summary statistics of their measurements, or by visually inspecting a heatmap of their individual measurements. Figure 15.4 shows a heatmap of the four clusters and two singletons, highlighting the distinct profile that each cluster has in terms of the eight measurements. We see, for instance, that cluster 2 is characterized by utilities with a high percent of nuclear power; cluster 1 is characterized by high fixed charge and RoR; cluster 4 has high fuel costs.
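For the summary statistics route, per-cluster means of the normalized measurements can be computed directly from the membership vector; a minimal sketch using memb from the average linkage cut above:
# mean of each (normalized) measurement within each cluster
aggregate(as.data.frame(utilities.df.norm), by = list(cluster = memb), FUN = mean)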
#### Figure 15.4
# set labels as cluster membership and utility name
row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "")
# plot heatmap
# rev() reverses the color mapping to large = dark
heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust,
        col = rev(paste("gray", 1:99, sep = "")))
A non-hierarchical approach to forming good clusters is to pre-specify a desired number of clusters, k, and assign each case to one of the k clusters so as to minimize a measure of dispersion within the clusters. In other words, the goal is to divide the sample into a predetermined number k of non-overlapping clusters so that clusters are as homogeneous as possible with respect to the measurements used.
A common measure of within-cluster dispersion is the sum of distances (or sum of squared Euclidean distances) of records from their cluster centroid. The problem can be set up as an optimization problem involving integer programming, but because solving integer programs with a large number of variables is time-consuming, clusters are often computed using a fast, heuristic method that produces good (although not necessarily optimal) solutions. The k-means algorithm is one such method.
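In symbols, with clusters $C_1, \ldots, C_k$ and cluster centroids (means) $\bar{x}_1, \ldots, \bar{x}_k$, the k-means criterion is to choose the partition minimizing the total within-cluster sum of squares

$$\sum_{j=1}^{k} \sum_{i \in C_j} \lVert x_i - \bar{x}_j \rVert^2 .$$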
The k-means algorithm starts with an initial partition of the records into k clusters. Subsequent steps modify the partition to reduce the sum of the distances of each record from its cluster centroid. The modification consists of allocating each record to the nearest of the k centroids of the previous partition. This leads to a new partition for which the sum of distances is smaller than before. The means of the new clusters are computed and the improvement step is repeated until the improvement is very small.
The k-means clustering algorithm:
1. Start with k initial clusters (user chooses k).
2. At every step, each record is reassigned to the cluster with the "closest" centroid.
3. Recompute the centroids of clusters that lost or gained a record, and repeat Step 2.
4. Stop when moving any more records between clusters increases cluster dispersion.
#### Table 15.9
# load and preprocess data
#utilities.df <- read.csv("Utilities.csv")
#row.names(utilities.df) <- utilities.df[,1]
#utilities.df <- utilities.df[,-1]
# normalized distance:
utilities.df.norm <- sapply(utilities.df, scale)
row.names(utilities.df.norm) <- row.names(utilities.df)
# run kmeans algorithm
set.seed(1)
km <- kmeans(utilities.df.norm, 6)
km
## K-means clustering with 6 clusters of sizes 4, 3, 4, 4, 3, 4
##
## Cluster means:
## Fixed_charge RoR Cost Load_factor Demand_growth Sales
## 1 0.04557497 0.57424600 0.2383554 -0.2975182 -0.005101929 -0.5853676
## 2 0.89918182 -0.04591267 0.3273709 0.7373425 -0.301742646 -0.9057446
## 3 0.08622291 0.12862300 -0.1804218 -0.1181922 0.355677321 0.1450583
## 4 -1.09256750 -1.11912138 0.2019400 0.8456853 0.131192455 -0.8264954
## 5 -0.60027572 -0.83317996 1.3389101 -0.4805802 0.991717777 1.8565214
## 6 0.73659004 1.07557186 -1.5095844 -0.6225467 -0.999249196 0.5537221
## Nuclear Fuel_Cost
## 1 1.7389316 -0.8356930
## 2 -0.4049567 1.3695535
## 3 -0.3186056 -0.2278866
## 4 -0.2009895 1.1599082
## 5 -0.7146294 -0.9657660
## 6 -0.3796469 -0.3991693
##
## Clustering vector:
## Arizona Boston Central Commonwealth NY Florida
## 3 4 6 1 2 6
## Hawaiian Idaho Kentucky Madison Nevada New England
## 2 5 3 1 5 2
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 6 4 5 4 3
## Texas Wisconsin United Virginia
## 6 1 4 3
##
## Within cluster sum of squares by cluster:
## [1] 5.830596 12.289734 10.773938 12.574083 9.533522 12.200433
## (between_SS / total_SS = 62.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# show cluster membership
km$cluster
## Arizona Boston Central Commonwealth NY Florida
## 3 4 6 1 2 6
## Hawaiian Idaho Kentucky Madison Nevada New England
## 2 5 3 1 5 2
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 6 4 5 4 3
## Texas Wisconsin United Virginia
## 6 1 4 3
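The between_SS / total_SS ratio reported above (62.4%) can be recomputed from the components returned by kmeans():
# reproduce the reported between_SS / total_SS ratio
km$betweenss / km$totss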
{
#### Figure 15.5
# plot an empty scatter plot
plot(c(0), xaxt = 'n', ylab = "", type = "l",
ylim = c(min(km$centers), max(km$centers)), xlim = c(0, 8))
# label x-axes
axis(1, at = c(1:8), labels = names(utilities.df))
# plot centroids
for (i in c(1:6))
lines(km$centers[i,], lty = i, lwd = 2, col = ifelse(i %in% c(1, 3, 5),
"black", "dark grey"))
# name clusters
text(x = 0.5, y = km$centers[, 1], labels = paste("Cluster", c(1:6)))
}
#### Table 15.11
dist(km$centers)
## 1 2 3 4 5
## 2 3.441236
## 3 2.382086 2.403501
## 4 3.646031 2.329288 2.637383
## 5 4.078732 4.497497 2.805529 4.012131
## 6 3.294624 3.500492 2.318757 4.346614 4.433858
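These centroid-to-centroid distances are easiest to judge alongside the within-cluster spread; a minimal sketch using the withinss and size components returned by kmeans():
# average within-cluster sum of squares per record, by cluster
km$withinss / km$size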