#### Table 15.2: Distance Matrix Between Pairs of the First Five Utilities, Using Euclidean Distance
utilities.df <- read.csv("~/Box/Teaching (jmmejia@iu.edu)/2020 - K-513/Public Files/Public Data/Utilities.csv")
# set row names to the utilities column
row.names(utilities.df) <- utilities.df[,1]
# remove the utility column
utilities.df <- utilities.df[,-1]
# compute Euclidean distance
# (to compute other distance measures, change the value in method = )
d <- dist(utilities.df, method = "euclidean")
d
## Arizona Boston Central Commonwealth NY
## Boston 3989.40808
## Central 140.40286 4125.04413
## Commonwealth 2654.27763 1335.46650 2789.75967
## NY 5777.16767 1788.06803 5912.55291 3123.15322
## Florida 2050.52944 6039.68908 1915.15515 4704.36310 7827.42921
## Hawaiian 1435.26502 2554.28716 1571.29540 1219.56001 4342.09380
## Idaho 4006.10419 7994.15599 3872.25763 6659.53457 9782.15818
## Kentucky 671.27635 3318.27656 807.92079 1983.31435 5106.09415
## Madison 2622.69900 1367.09063 2758.55966 43.64889 3155.09559
## Nevada 8364.03105 12353.06270 8229.22328 11018.05781 14141.02258
## New England 2923.13610 1066.57943 3058.70743 271.45273 2854.09948
## Northern 1899.27982 2091.16049 2035.44152 756.83195 3879.16746
## Oklahoma 598.55663 4586.30256 461.34167 3250.98459 6373.74325
## Pacific 2609.04536 1380.74996 2744.50285 56.64463 3168.17746
## Puget 6914.74206 10903.14646 6780.43031 9568.43443 12691.15511
## San Diego 3363.06163 629.76075 3498.11301 710.29296 2414.69876
## Southern 1063.00907 5052.33167 928.74925 3717.20296 6840.15029
## Texas 4430.25159 8419.61054 4295.01469 7084.37284 10207.39263
## Wisconsin 1790.48565 2199.72167 1925.77256 864.27315 3987.33596
## United 2427.58887 1562.21081 2563.63736 232.47687 3350.07312
## Virginia 1016.61769 5005.08126 883.53546 3670.01819 6793.03530
## Florida Hawaiian Idaho Kentucky Madison
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian 3485.67156
## Idaho 1959.73108 5440.46178
## Kentucky 2721.70630 764.08319 4676.63838
## Madison 4672.82929 1187.94114 6627.29178 1951.62858
## Nevada 6314.35909 9799.01555 4359.59960 9035.00749 10986.09801
## New England 4973.50684 1488.01491 6928.32617 2252.02672 304.27703
## Northern 3949.09232 466.55912 5903.39545 1228.43633 724.09618
## Oklahoma 1454.29260 2032.61425 3412.26397 1269.10210 3219.82511
## Pacific 4659.35626 1174.07562 6614.49924 1938.02656 53.30140
## Puget 4866.11165 8349.36644 2909.01468 7585.46729 9536.24219
## San Diego 5413.09300 1928.44148 7368.81544 2692.21236 744.25367
## Southern 988.04456 2498.14902 2943.53557 1734.10330 3685.51009
## Texas 2380.12497 5865.44719 447.82867 5101.41414 7052.72388
## Wisconsin 3840.22794 358.47629 5795.95881 1119.94001 833.47299
## United 4478.02887 992.45325 6432.13220 1756.37897 199.22840
## Virginia 1035.98148 2451.18516 2989.96398 1687.23603 3638.09755
## Nevada New England Northern Oklahoma Pacific
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England 11287.00691
## Northern 10262.15729 1026.48299
## Oklahoma 7768.38479 3519.97756 2496.63889
## Pacific 10973.01095 314.35403 713.66505 3205.74888
## Puget 1452.16201 9837.28183 8812.30356 6319.93384 9523.41350
## San Diego 11727.06629 442.13276 1466.99195 3959.24075 754.61209
## Southern 7301.04086 3986.10243 2961.83475 470.16479 3672.03540
## Texas 3934.61752 7353.37915 6328.91795 3834.01226 7039.26207
## Wisconsin 10154.11879 1134.14501 119.98126 2386.94275 820.16430
## United 10791.04927 496.68741 531.47633 3024.95235 186.38865
## Virginia 7348.04902 3939.10035 2914.20499 428.06526 3625.11887
## Puget San Diego Southern Texas Wisconsin
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego 10277.66038
## Southern 5851.89331 4426.04189
## Texas 2488.43222 7793.08395 3367.31887
## Wisconsin 8704.72128 1573.40838 2853.29878 6220.29673
## United 9341.12661 938.52273 3490.42292 6857.73586 640.78677
## Virginia 5898.57696 4379.21182 59.32529 3414.83146 2806.16571
## United
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego
## Southern
## Texas
## Wisconsin
## United
## Virginia 3443.24097
# normalize input variables
utilities.df.norm <- sapply(utilities.df, scale)
# add row names: utilities
row.names(utilities.df.norm) <- row.names(utilities.df)
# compute normalized distance based on variables Sales (col 6) and Fuel_Cost (col 8)
d.norm <- dist(utilities.df.norm[,c(6,8)], method = "euclidean")
d.norm
## Arizona Boston Central Commonwealth NY Florida
## Boston 2.0103293
## Central 0.7741795 1.4657027
## Commonwealth 0.7587375 1.5828208 1.0157104
## NY 3.0219066 1.0133700 2.4325285 2.5719693
## Florida 1.2444219 1.7923968 0.6318918 1.6438566 2.6355728
## Hawaiian 1.8852481 0.7402833 1.1560922 1.7460268 1.4116954 1.2288047
## Idaho 1.2656380 3.1766540 1.7327770 2.0032300 4.1625615 1.7641233
## Kentucky 0.4612918 1.5577377 0.4192538 0.6299937 2.5664387 1.0256629
## Madison 0.7386496 1.7196319 1.1022872 0.1387579 2.7054453 1.7225099
## Nevada 2.3694792 3.7565131 2.3759746 3.1060838 4.5970059 1.9715184
## New England 2.4259752 0.6843933 1.7373219 2.1538314 0.8462906 1.8313804
## Northern 0.5646572 1.9401658 1.1134329 0.3770043 2.9386369 1.6986240
## Oklahoma 0.1826480 2.1660781 0.8550928 0.9373890 3.1745882 1.2436342
## Pacific 1.5707796 0.4783340 0.9877719 1.2588346 1.4620188 1.3431847
## Puget 1.9476675 3.5013904 2.0656431 2.6990600 4.3974331 1.7675811
## San Diego 2.5090434 0.6796342 1.8367621 2.2029297 0.7156293 1.9534230
## Southern 0.9136210 1.6344254 0.2764402 1.2785143 2.5584087 0.3667437
## Texas 1.2479759 2.8905601 1.4281594 1.9988179 3.8311318 1.2779197
## Wisconsin 0.5214913 1.6542554 0.8389668 0.2434079 2.6617861 1.4524174
## United 2.7617447 1.1005949 2.0348238 2.5471162 0.9525069 2.0164926
## Virginia 1.2523502 1.4792607 0.5103653 1.5020926 2.3286909 0.3138469
## Hawaiian Idaho Kentucky Madison Nevada New England
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho 2.8601888
## Kentucky 1.4368218 1.6504169
## Madison 1.8803606 1.9502960 0.6976742
## Nevada 3.1853105 1.4795256 2.5506890 3.1056271
## New England 0.6081070 3.4587708 1.9663226 2.2925313 3.7723588
## Northern 2.0272242 1.7084093 0.6945241 0.2671984 2.9230227 2.4804563
## Oklahoma 1.9970362 1.0834492 0.6084009 0.9086653 2.2119896 2.5541088
## Pacific 0.5609973 2.7055789 1.1108538 1.3972404 3.2933099 0.8980935
## Puget 2.9958483 0.9920924 2.1804959 2.6862147 0.4875080 3.5988456
## San Diego 0.7260955 3.5637271 2.0480980 2.3416442 3.8992116 0.1306629
## Southern 1.2050342 1.6586708 0.6589960 1.3557857 2.1455853 1.8093545
## Texas 2.4632271 0.6000891 1.4932742 1.9866254 1.1333108 3.0711777
## Wisconsin 1.7112561 1.7788126 0.4267801 0.2740610 2.8627558 2.1724729
## United 0.8799342 3.7204215 2.3086131 2.6853401 3.8879179 0.4178660
## Virginia 0.9294143 1.9807148 0.9291409 1.5995869 2.2848031 1.5364364
## Northern Oklahoma Pacific Puget San Diego Southern
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma 0.7110504
## Pacific 1.5825914 1.7167394
## Puget 2.4878919 1.7806564 3.0271161
## San Diego 2.5387195 2.6421549 0.9589052 3.7209695
## Southern 1.3368874 0.9442949 1.1600171 1.8672352 1.9200350
## Texas 1.7932874 1.0834486 2.4122780 0.7003127 3.1859416 1.2727841
## Wisconsin 0.3161598 0.7026837 1.2762004 2.4562721 2.2346324 1.0857741
## United 2.8612926 2.8766457 1.2885630 3.7630655 0.4401629 2.0620666
## Virginia 1.6236142 1.2965484 1.0350276 2.0693140 1.6554978 0.3562984
## Texas Wisconsin United
## Boston
## Central
## Commonwealth
## NY
## Florida
## Hawaiian
## Idaho
## Kentucky
## Madison
## Nevada
## New England
## Northern
## Oklahoma
## Pacific
## Puget
## San Diego
## Southern
## Texas
## Wisconsin 1.7561356
## United 3.2884604 2.5490405
## Virginia 1.5415758 1.3433064 1.7499296
A dendrogram is a treelike diagram that summarizes the process of clustering. On the x-axis are the records. Similar records are joined by lines whose vertical length reflects the distance between the records. Figure 15.3 shows the dendrograms that result from clustering the 22 utilities using all eight normalized measurements and Euclidean distance, once with single linkage (top) and once with average linkage (bottom).
#### Figure 15.3
# compute normalized distance based on all 8 variables
d.norm <- dist(utilities.df.norm, method = "euclidean")
# in hclust() set argument method =
# to "ward.D", "single", "complete", "average", "median", or "centroid"
hc1 <- hclust(d.norm, method = "single")
plot(hc1, hang = -1, ann = TRUE)
hc2 <- hclust(d.norm, method = "average")
plot(hc2, hang = -1, ann = TRUE)
By choosing a cutoff distance on the y-axis, a set of clusters is created. Visually, this means drawing a horizontal line on a dendrogram. Records with connections below the horizontal line (that is, their distance is smaller than the cutoff distance) belong to the same cluster. For example, setting the cutoff distance to 2.7 on the single linkage dendrogram in Figure 15.3 (top) results in six clusters. The six clusters are (from left to right on the dendrogram):
{NY}, {Nevada}, {San Diego}, {Idaho, Puget}, {Central}, {Others}.
If we want six clusters using average linkage, we can choose a cutoff distance of 3.5. The resulting six clusters are slightly different.
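The cutoff can also be drawn directly on the dendrogram. A minimal sketch, assuming hc1 from Figure 15.3 is still in the workspace; rect.hclust() additionally draws boxes around the resulting clusters:
# redraw the single linkage dendrogram, mark the 2.7 cutoff, box the clusters
plot(hc1, hang = -1, ann = TRUE)
abline(h = 2.7, lty = 2)
rect.hclust(hc1, k = 6, border = "grey")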
Membership for six (or any other number of) clusters can be computed in R by applying the function cutree() to the hclust output. Table 15.6 shows the results for the single linkage and the average linkage clustering with six clusters. Each record is assigned a cluster number. While some records remain in the same cluster under both methods (e.g., Arizona, Florida, Kentucky, Oklahoma, Texas), others change.
#### Table 15.6
memb <- cutree(hc1, k = 6)
memb
## Arizona Boston Central Commonwealth NY Florida
## 1 1 2 1 3 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 1 4 1 1 5 1
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 1 1 4 6 1
## Texas Wisconsin United Virginia
## 1 1 1 1
memb <- cutree(hc2, k = 6)
memb
## Arizona Boston Central Commonwealth NY Florida
## 1 2 1 2 3 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 4 5 1 2 5 4
## Northern Oklahoma Pacific Puget San Diego Southern
## 2 1 4 5 6 1
## Texas Wisconsin United Virginia
## 1 2 4 2
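To see directly which records switch clusters between the two methods, the two membership vectors can be cross-tabulated; a minimal sketch:
# cross-tabulate single linkage vs. average linkage memberships (k = 6)
table(single = cutree(hc1, k = 6), average = cutree(hc2, k = 6))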
One important goal of cluster analysis is to come up with meaningful clusters. Since many choices are made along the way (e.g., distance measure, linkage method, number of clusters), it is important to make sure that the resulting clusters are valid, in the sense that they really generate some insight. To see whether the cluster analysis is useful, consider each of the following aspects:
- Cluster interpretability. Is the interpretation of the resulting clusters reasonable? To interpret the clusters, explore the characteristics of each cluster by:
  - obtaining summary statistics (e.g., average, min, max) from each cluster on each measurement that was used in the cluster analysis;
  - examining the clusters for separation along some common feature (variable) that was not used in the cluster analysis;
  - labeling the clusters: based on the interpretation, trying to assign a name or label to each cluster.
- Cluster stability. Do cluster assignments change significantly if some of the inputs are altered slightly? Another way to check stability is to partition the data and see how well clusters formed based on one part apply to the other part (see the sketch after this list). To do this:
  1. Cluster partition A.
  2. Use the cluster centroids from A to assign each record in partition B (each record is assigned to the cluster with the closest centroid).
  3. Assess how consistent the cluster assignments are compared to the assignments based on all the data.
- Cluster separation. Examine the ratio of between-cluster variation to within-cluster variation to see whether the separation is reasonable. There exist statistical tests for this task (an F-ratio), but their usefulness is somewhat controversial.
- Number of clusters. The number of resulting clusters must be useful, given the purpose of the analysis. For example, suppose the goal of the clustering is to identify categories of customers and assign labels to them for market segmentation purposes. If the marketing department can only manage to sustain three different marketing presentations, it would probably not make sense to identify more than three clusters.
Returning to the utilities example, we notice that both methods (single and average linkage) identify {NY} and {San Diego} as singleton clusters. Also, both dendrograms imply that a reasonable number of clusters in this dataset is four. One insight that can be derived from the average linkage clustering is that clusters tend to group geographically. The four non-singleton clusters form (approximately) a southern group, a northern group, an east/west seaboard group, and a west group.
We can further characterize each of the clusters by examining the summary statistics of their measurements, or by visually inspecting a heatmap of their individual measurements. Figure 15.4 shows a heatmap of the four clusters and two singletons, highlighting the distinct profile that each cluster has in terms of the eight measurements. We see, for instance, that cluster 2 is characterized by utilities with a high percent of nuclear power; cluster 1 is characterized by high fixed charge and RoR; cluster 4 has high fuel costs.
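For the summary statistics route, per-cluster means of the normalized measurements can be computed directly from the membership vector; a minimal sketch using memb from the average linkage cut above:
# mean of each (normalized) measurement within each cluster
aggregate(as.data.frame(utilities.df.norm), by = list(cluster = memb), FUN = mean)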
#### Figure 15.4
# set labels as cluster membership and utility name
row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "")
# plot heatmap
# rev() reverses the color mapping to large = dark
heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust,
        col = rev(paste("gray", 1:99, sep = "")))
A non-hierarchical approach to forming good clusters is to pre-specify a desired number of clusters, k, and assign each case to one of the k clusters so as to minimize a measure of dispersion within the clusters. In other words, the goal is to divide the sample into a predetermined number k of non-overlapping clusters so that clusters are as homogeneous as possible with respect to the measurements used.
A common measure of within-cluster dispersion is the sum of distances (or sum of squared Euclidean distances) of records from their cluster centroid. The problem can be set up as an optimization problem involving integer programming, but because solving integer programs with a large number of variables is time-consuming, clusters are often computed using a fast, heuristic method that produces good (although not necessarily optimal) solutions. The k-means algorithm is one such method.
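In symbols, with clusters $C_1, \ldots, C_k$ and cluster centroids (means) $\bar{x}_1, \ldots, \bar{x}_k$, the k-means criterion is to choose the partition minimizing the total within-cluster sum of squares

$$\sum_{j=1}^{k} \sum_{i \in C_j} \lVert x_i - \bar{x}_j \rVert^2 .$$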
The k-means algorithm starts with an initial partition of the records into k clusters. Subsequent steps modify the partition to reduce the sum of the distances of each record from its cluster centroid. The modification consists of allocating each record to the nearest of the k centroids of the previous partition. This leads to a new partition for which the sum of distances is smaller than before. The means of the new clusters are computed and the improvement step is repeated until the improvement is very small.
The k-means clustering algorithm:
1. Start with k initial clusters (user chooses k).
2. At every step, each record is reassigned to the cluster with the "closest" centroid.
3. Recompute the centroids of clusters that lost or gained a record, and repeat Step 2.
4. Stop when moving any more records between clusters increases cluster dispersion.
#### Table 15.9
# load and preprocess data
#utilities.df <- read.csv("Utilities.csv")
#row.names(utilities.df) <- utilities.df[,1]
#utilities.df <- utilities.df[,-1]
# normalized distance:
utilities.df.norm <- sapply(utilities.df, scale)
row.names(utilities.df.norm) <- row.names(utilities.df)
# run kmeans algorithm
set.seed(1)
km <- kmeans(utilities.df.norm, 6)
km
## K-means clustering with 6 clusters of sizes 4, 3, 4, 4, 3, 4
##
## Cluster means:
## Fixed_charge RoR Cost Load_factor Demand_growth Sales
## 1 0.04557497 0.57424600 0.2383554 -0.2975182 -0.005101929 -0.5853676
## 2 0.89918182 -0.04591267 0.3273709 0.7373425 -0.301742646 -0.9057446
## 3 0.08622291 0.12862300 -0.1804218 -0.1181922 0.355677321 0.1450583
## 4 -1.09256750 -1.11912138 0.2019400 0.8456853 0.131192455 -0.8264954
## 5 -0.60027572 -0.83317996 1.3389101 -0.4805802 0.991717777 1.8565214
## 6 0.73659004 1.07557186 -1.5095844 -0.6225467 -0.999249196 0.5537221
## Nuclear Fuel_Cost
## 1 1.7389316 -0.8356930
## 2 -0.4049567 1.3695535
## 3 -0.3186056 -0.2278866
## 4 -0.2009895 1.1599082
## 5 -0.7146294 -0.9657660
## 6 -0.3796469 -0.3991693
##
## Clustering vector:
## Arizona Boston Central Commonwealth NY Florida
## 3 4 6 1 2 6
## Hawaiian Idaho Kentucky Madison Nevada New England
## 2 5 3 1 5 2
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 6 4 5 4 3
## Texas Wisconsin United Virginia
## 6 1 4 3
##
## Within cluster sum of squares by cluster:
## [1] 5.830596 12.289734 10.773938 12.574083 9.533522 12.200433
## (between_SS / total_SS = 62.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# show cluster membership
km$cluster
## Arizona Boston Central Commonwealth NY Florida
## 3 4 6 1 2 6
## Hawaiian Idaho Kentucky Madison Nevada New England
## 2 5 3 1 5 2
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 6 4 5 4 3
## Texas Wisconsin United Virginia
## 6 1 4 3
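The between_SS / total_SS ratio reported above (62.4%) can be recomputed from the components returned by kmeans():
# reproduce the reported between_SS / total_SS ratio
km$betweenss / km$totss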
{
#### Figure 15.5
# plot an empty scatter plot
plot(c(0), xaxt = 'n', ylab = "", type = "l",
ylim = c(min(km$centers), max(km$centers)), xlim = c(0, 8))
# label x-axes
axis(1, at = c(1:8), labels = names(utilities.df))
# plot centroids
for (i in c(1:6))
lines(km$centers[i,], lty = i, lwd = 2, col = ifelse(i %in% c(1, 3, 5),
"black", "dark grey"))
# name clusters
text(x = 0.5, y = km$centers[, 1], labels = paste("Cluster", c(1:6)))
}
#### Table 15.11
dist(km$centers)
## 1 2 3 4 5
## 2 3.441236
## 3 2.382086 2.403501
## 4 3.646031 2.329288 2.637383
## 5 4.078732 4.497497 2.805529 4.012131
## 6 3.294624 3.500492 2.318757 4.346614 4.433858
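These centroid-to-centroid distances are easiest to judge alongside the within-cluster spread; a minimal sketch using the withinss and size components returned by kmeans():
# average within-cluster sum of squares per record, by cluster
km$withinss / km$size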