The Challenge
Find a moderately sized data set where hierarchical clustering is a useful technique and perform hierarchical clustering on that data set. Try different distance measures and types of linkages and state clearly your final findings.
Spotify Song Attribute Dataset
| 0 |
0.01020 |
0.833 |
204600 |
0.434 |
0.021900 |
2 |
0.1650 |
-8.795 |
1 |
0.4310 |
150.062 |
4 |
0.286 |
1 |
Mask Off |
Future |
| 1 |
0.19900 |
0.743 |
326933 |
0.359 |
0.006110 |
1 |
0.1370 |
-10.401 |
1 |
0.0794 |
160.083 |
4 |
0.588 |
1 |
Redbone |
Childish Gambino |
| 2 |
0.03440 |
0.838 |
185707 |
0.412 |
0.000234 |
2 |
0.1590 |
-7.148 |
1 |
0.2890 |
75.044 |
4 |
0.173 |
1 |
Xanny Family |
Future |
| 3 |
0.60400 |
0.494 |
199413 |
0.338 |
0.510000 |
5 |
0.0922 |
-15.236 |
1 |
0.0261 |
86.468 |
4 |
0.230 |
1 |
Master Of None |
Beach House |
| 4 |
0.18000 |
0.678 |
392893 |
0.561 |
0.512000 |
5 |
0.4390 |
-11.648 |
0 |
0.0694 |
174.004 |
4 |
0.904 |
1 |
Parallel Lines |
Junior Boys |
| 5 |
0.00479 |
0.804 |
251333 |
0.560 |
0.000000 |
8 |
0.1640 |
-6.682 |
1 |
0.1850 |
85.023 |
4 |
0.264 |
1 |
Sneakin’ |
Drake |
X acousticness danceability duration_ms
Min. : 0 Min. :0.0000028 Min. :0.1220 Min. : 16042
1st Qu.: 504 1st Qu.:0.0096300 1st Qu.:0.5140 1st Qu.: 200015
Median :1008 Median :0.0633000 Median :0.6310 Median : 229261
Mean :1008 Mean :0.1875900 Mean :0.6184 Mean : 246306
3rd Qu.:1512 3rd Qu.:0.2650000 3rd Qu.:0.7380 3rd Qu.: 270333
Max. :2016 Max. :0.9950000 Max. :0.9840 Max. :1004627
energy instrumentalness key liveness
Min. :0.0148 Min. :0.0000000 Min. : 0.000 Min. :0.0188
1st Qu.:0.5630 1st Qu.:0.0000000 1st Qu.: 2.000 1st Qu.:0.0923
Median :0.7150 Median :0.0000762 Median : 6.000 Median :0.1270
Mean :0.6816 Mean :0.1332855 Mean : 5.343 Mean :0.1908
3rd Qu.:0.8460 3rd Qu.:0.0540000 3rd Qu.: 9.000 3rd Qu.:0.2470
Max. :0.9980 Max. :0.9760000 Max. :11.000 Max. :0.9690
loudness mode speechiness tempo
Min. :-33.097 Min. :0.0000 Min. :0.02310 Min. : 47.86
1st Qu.: -8.394 1st Qu.:0.0000 1st Qu.:0.03750 1st Qu.:100.19
Median : -6.248 Median :1.0000 Median :0.05490 Median :121.43
Mean : -7.086 Mean :0.6123 Mean :0.09266 Mean :121.60
3rd Qu.: -4.746 3rd Qu.:1.0000 3rd Qu.:0.10800 3rd Qu.:137.85
Max. : -0.307 Max. :1.0000 Max. :0.81600 Max. :219.33
time_signature valence target song_title
Min. :1.000 Min. :0.0348 Min. :0.0000 Jack : 3
1st Qu.:4.000 1st Qu.:0.2950 1st Qu.:0.0000 River : 3
Median :4.000 Median :0.4920 Median :1.0000 1-800-273-8255: 2
Mean :3.968 Mean :0.4968 Mean :0.5057 Acamar : 2
3rd Qu.:4.000 3rd Qu.:0.6910 3rd Qu.:1.0000 Alright : 2
Max. :5.000 Max. :0.9920 Max. :1.0000 Annie : 2
(Other) :2003
artist
Drake : 16
Rick Ross : 13
Disclosure : 12
Backstreet Boys: 10
WALK THE MOON : 10
Crystal Castles: 9
(Other) :1947
Distances
d.euc <- dist(df, method = "euclidean")
d.max <- dist(df, method = "maximum")
d.hattan <- dist(df, method = "manhattan")
d.berra <- dist(df, method = "canberra")
d.bin <- dist(df, method = "binary")
d.mink <- dist(df, method = "minkowski")
Clustering
Euclidean Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.euc, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.euc, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Maximum Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.max, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.max, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Manhattan Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.hattan, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.hattan, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Canberra Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.berra, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.berra, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Binomial Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.bin, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.bin, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Minkowski Distance
set.seed(2634)
# Single Linkage Hierarchical Clustering
clust.single <- hclust(d.mink, method = "single")
plot(clust.single, cex = 0.5)
groups <- cutree(clust.single, k = clusters)
rect.hclust(clust.single, border = "red", k = clusters)

# Complete Linkage Hierarchical Clustering
cluster.complete <- hclust(d.mink, method = "ave")
plot(cluster.complete, cex = 0.5)
groups <- cutree(cluster.complete, k = clusters)
rect.hclust(cluster.complete, border = "red", k = clusters)

Analysis
Interesting Observations
| 1536 |
1535 |
0.992 |
0.447 |
1004627 |
0.163 |
0.776 |
8 |
0.654 |
-14.835 |
1 |
0.0707 |
74.933 |
3 |
0.19 |
0 |
The Nearness of You |
Joshua Redman |
kable(df %>% select(acousticness, danceability, duration_ms, energy, instrumentalness,
key, liveness, loudness, speechiness, tempo, valence) %>% summarise_all(mean))
| 0.18759 |
0.6184219 |
246306.2 |
0.6815771 |
0.1332855 |
5.342588 |
0.190844 |
-7.085624 |
0.0926643 |
121.6033 |
0.496815 |
acousticness danceability duration_ms energy
Min. :0.0000028 Min. :0.1220 Min. : 16042 Min. :0.0148
1st Qu.:0.0096300 1st Qu.:0.5140 1st Qu.: 200015 1st Qu.:0.5630
Median :0.0633000 Median :0.6310 Median : 229261 Median :0.7150
Mean :0.1875900 Mean :0.6184 Mean : 246306 Mean :0.6816
3rd Qu.:0.2650000 3rd Qu.:0.7380 3rd Qu.: 270333 3rd Qu.:0.8460
Max. :0.9950000 Max. :0.9840 Max. :1004627 Max. :0.9980
instrumentalness key liveness loudness
Min. :0.0000000 Min. : 0.000 Min. :0.0188 Min. :-33.097
1st Qu.:0.0000000 1st Qu.: 2.000 1st Qu.:0.0923 1st Qu.: -8.394
Median :0.0000762 Median : 6.000 Median :0.1270 Median : -6.248
Mean :0.1332855 Mean : 5.343 Mean :0.1908 Mean : -7.086
3rd Qu.:0.0540000 3rd Qu.: 9.000 3rd Qu.:0.2470 3rd Qu.: -4.746
Max. :0.9760000 Max. :11.000 Max. :0.9690 Max. : -0.307
mode speechiness tempo time_signature
Min. :0.0000 Min. :0.02310 Min. : 47.86 Min. :1.000
1st Qu.:0.0000 1st Qu.:0.03750 1st Qu.:100.19 1st Qu.:4.000
Median :1.0000 Median :0.05490 Median :121.43 Median :4.000
Mean :0.6123 Mean :0.09266 Mean :121.60 Mean :3.968
3rd Qu.:1.0000 3rd Qu.:0.10800 3rd Qu.:137.85 3rd Qu.:4.000
Max. :1.0000 Max. :0.81600 Max. :219.33 Max. :5.000
valence
Min. :0.0348
1st Qu.:0.2950
Median :0.4920
Mean :0.4968
3rd Qu.:0.6910
Max. :0.9920