The Challenge

Find a moderately sized data set where hierarchical clustering is a useful technique and perform hierarchical clustering on that data set. Try different distance measures and types of linkages and state clearly your final findings.

Spotify Song Attribute Dataset

X acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence target song_title artist
0 0.01020 0.833 204600 0.434 0.021900 2 0.1650 -8.795 1 0.4310 150.062 4 0.286 1 Mask Off Future
1 0.19900 0.743 326933 0.359 0.006110 1 0.1370 -10.401 1 0.0794 160.083 4 0.588 1 Redbone Childish Gambino
2 0.03440 0.838 185707 0.412 0.000234 2 0.1590 -7.148 1 0.2890 75.044 4 0.173 1 Xanny Family Future
3 0.60400 0.494 199413 0.338 0.510000 5 0.0922 -15.236 1 0.0261 86.468 4 0.230 1 Master Of None Beach House
4 0.18000 0.678 392893 0.561 0.512000 5 0.4390 -11.648 0 0.0694 174.004 4 0.904 1 Parallel Lines Junior Boys
5 0.00479 0.804 251333 0.560 0.000000 8 0.1640 -6.682 1 0.1850 85.023 4 0.264 1 Sneakin’ Drake
       X         acousticness        danceability     duration_ms     
 Min.   :   0   Min.   :0.0000028   Min.   :0.1220   Min.   :  16042  
 1st Qu.: 504   1st Qu.:0.0096300   1st Qu.:0.5140   1st Qu.: 200015  
 Median :1008   Median :0.0633000   Median :0.6310   Median : 229261  
 Mean   :1008   Mean   :0.1875900   Mean   :0.6184   Mean   : 246306  
 3rd Qu.:1512   3rd Qu.:0.2650000   3rd Qu.:0.7380   3rd Qu.: 270333  
 Max.   :2016   Max.   :0.9950000   Max.   :0.9840   Max.   :1004627  
                                                                      
     energy       instrumentalness         key            liveness     
 Min.   :0.0148   Min.   :0.0000000   Min.   : 0.000   Min.   :0.0188  
 1st Qu.:0.5630   1st Qu.:0.0000000   1st Qu.: 2.000   1st Qu.:0.0923  
 Median :0.7150   Median :0.0000762   Median : 6.000   Median :0.1270  
 Mean   :0.6816   Mean   :0.1332855   Mean   : 5.343   Mean   :0.1908  
 3rd Qu.:0.8460   3rd Qu.:0.0540000   3rd Qu.: 9.000   3rd Qu.:0.2470  
 Max.   :0.9980   Max.   :0.9760000   Max.   :11.000   Max.   :0.9690  
                                                                       
    loudness            mode         speechiness          tempo       
 Min.   :-33.097   Min.   :0.0000   Min.   :0.02310   Min.   : 47.86  
 1st Qu.: -8.394   1st Qu.:0.0000   1st Qu.:0.03750   1st Qu.:100.19  
 Median : -6.248   Median :1.0000   Median :0.05490   Median :121.43  
 Mean   : -7.086   Mean   :0.6123   Mean   :0.09266   Mean   :121.60  
 3rd Qu.: -4.746   3rd Qu.:1.0000   3rd Qu.:0.10800   3rd Qu.:137.85  
 Max.   : -0.307   Max.   :1.0000   Max.   :0.81600   Max.   :219.33  
                                                                      
 time_signature     valence           target                song_title  
 Min.   :1.000   Min.   :0.0348   Min.   :0.0000   Jack          :   3  
 1st Qu.:4.000   1st Qu.:0.2950   1st Qu.:0.0000   River         :   3  
 Median :4.000   Median :0.4920   Median :1.0000   1-800-273-8255:   2  
 Mean   :3.968   Mean   :0.4968   Mean   :0.5057   Acamar        :   2  
 3rd Qu.:4.000   3rd Qu.:0.6910   3rd Qu.:1.0000   Alright       :   2  
 Max.   :5.000   Max.   :0.9920   Max.   :1.0000   Annie         :   2  
                                                   (Other)       :2003  
             artist    
 Drake          :  16  
 Rick Ross      :  13  
 Disclosure     :  12  
 Backstreet Boys:  10  
 WALK THE MOON  :  10  
 Crystal Castles:   9  
 (Other)        :1947  

Distances

d.euc <- dist(df, method = "euclidean")
d.max <- dist(df, method = "maximum")
d.hattan <- dist(df, method = "manhattan")
d.berra <- dist(df, method = "canberra")
d.bin <- dist(df, method = "binary")
d.mink <- dist(df, method = "minkowski")

Clustering

Euclidean Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.euc, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.euc, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Maximum Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.max, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.max, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Manhattan Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.hattan, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.hattan, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Canberra Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.berra, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.berra, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Binomial Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.bin, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.bin, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Minkowski Distance

set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.mink, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.mink, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Analysis

Interesting Observations

kable(dat[1536, ])
X acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence target song_title artist
1536 1535 0.992 0.447 1004627 0.163 0.776 8 0.654 -14.835 1 0.0707 74.933 3 0.19 0 The Nearness of You Joshua Redman
kable(df %>% select(acousticness, danceability, duration_ms, energy, instrumentalness, 
    key, liveness, loudness, speechiness, tempo, valence) %>% summarise_all(mean))
acousticness danceability duration_ms energy instrumentalness key liveness loudness speechiness tempo valence
0.18759 0.6184219 246306.2 0.6815771 0.1332855 5.342588 0.190844 -7.085624 0.0926643 121.6033 0.496815
summary(df)
  acousticness        danceability     duration_ms          energy      
 Min.   :0.0000028   Min.   :0.1220   Min.   :  16042   Min.   :0.0148  
 1st Qu.:0.0096300   1st Qu.:0.5140   1st Qu.: 200015   1st Qu.:0.5630  
 Median :0.0633000   Median :0.6310   Median : 229261   Median :0.7150  
 Mean   :0.1875900   Mean   :0.6184   Mean   : 246306   Mean   :0.6816  
 3rd Qu.:0.2650000   3rd Qu.:0.7380   3rd Qu.: 270333   3rd Qu.:0.8460  
 Max.   :0.9950000   Max.   :0.9840   Max.   :1004627   Max.   :0.9980  
 instrumentalness         key            liveness         loudness      
 Min.   :0.0000000   Min.   : 0.000   Min.   :0.0188   Min.   :-33.097  
 1st Qu.:0.0000000   1st Qu.: 2.000   1st Qu.:0.0923   1st Qu.: -8.394  
 Median :0.0000762   Median : 6.000   Median :0.1270   Median : -6.248  
 Mean   :0.1332855   Mean   : 5.343   Mean   :0.1908   Mean   : -7.086  
 3rd Qu.:0.0540000   3rd Qu.: 9.000   3rd Qu.:0.2470   3rd Qu.: -4.746  
 Max.   :0.9760000   Max.   :11.000   Max.   :0.9690   Max.   : -0.307  
      mode         speechiness          tempo        time_signature 
 Min.   :0.0000   Min.   :0.02310   Min.   : 47.86   Min.   :1.000  
 1st Qu.:0.0000   1st Qu.:0.03750   1st Qu.:100.19   1st Qu.:4.000  
 Median :1.0000   Median :0.05490   Median :121.43   Median :4.000  
 Mean   :0.6123   Mean   :0.09266   Mean   :121.60   Mean   :3.968  
 3rd Qu.:1.0000   3rd Qu.:0.10800   3rd Qu.:137.85   3rd Qu.:4.000  
 Max.   :1.0000   Max.   :0.81600   Max.   :219.33   Max.   :5.000  
    valence      
 Min.   :0.0348  
 1st Qu.:0.2950  
 Median :0.4920  
 Mean   :0.4968  
 3rd Qu.:0.6910  
 Max.   :0.9920