Hierarchical Clustering of Whiskey

These data were obtained online at http://www-958.ibm.com/software/analytics/manyeyes/. The data frame includes information on rating, country of origin, type of whiskey, price, alcohol by volume (ABV), age, and brand for each of 284 whiskeys.

## Load the whiskey data set.
## data was obtained online at
## http://www-958.ibm.com/software/analytics/manyeyes/datasets/whiskey-data-4/versions/1
## posted by user bretwcollins
whiskey <- read.delim("whiskey.txt")

## The raw file marks missing values with "*"; recode them as NA.
whiskey[whiskey == "*"] <- NA

## Convert the measurement columns to numeric. Going through as.character()
## first keeps the conversion correct when a column was read as a factor.
## Each column is converted from itself (Age was previously built from ABV,
## which made the two columns identical).
numeric_cols <- c("Rating", "Price", "ABV", "Age")
whiskey[numeric_cols] <- lapply(whiskey[numeric_cols], function(column) {
    as.numeric(as.character(column))
})

## Use the whiskey names as row labels, then keep only columns 2 to 8
## (presumably this drops the Name column -- confirm against the raw file).
row.names(whiskey) <- whiskey$Name
whiskey <- whiskey[, 2:8]

## Pairwise Gower dissimilarities between whiskeys, followed by agglomerative
## clustering with hclust's default complete linkage.
distance <- daisy(x = whiskey, metric = "gower")
whiskeyclust <- hclust(d = distance, method = "complete")

To choose a natural number of clusters, the overall structure of the hierarchical clustering was examined. In the plot below, there appear to be four fairly distinct clusters; beyond those four, the splits are less distinct. For now, each of the whiskeys will be classified as belonging to one of four clusters.

## Dendrogram of the full clustering. The tip labels are drawn very small
## (cex = 0.1) and the x-axis label and subtitle are blanked out.
## TRUE is spelled out because T is an ordinary variable that can be reassigned.
plot(whiskeyclust, cex = 0.1, axes = TRUE, xlab = "", ylab = "Height", sub = NA,
    main = "Structure of Hierarchical Clusters")

plot of chunk unnamed-chunk-4

## One colour per cluster.
col4 <- c("slateblue", "mediumseagreen", "palevioletred", "royalblue3")

## Cut the tree into exactly as many clusters as there are colours. Asking for
## the number of clusters directly (rather than cutting at a fixed height)
## guarantees that every label indexes a valid colour and that the four-cluster
## code further down always sees four groups.
clust4 <- cutree(whiskeyclust, k = length(col4))

## par(mar=c(0,0,1.5,0))
## Redraw the tree with each tip label coloured by its cluster.
plot(as.phylo(whiskeyclust), tip.color = col4[clust4], cex = 0.5, label.offset = 0.002,
    main = "Hierarchical Clusters of Whiskeys", cex.main = 1.5)

plot of chunk unnamed-chunk-5

## Split the whiskeys into one data frame per cluster label (1 to 4), collected
## in a list whose elements are named cluster1 ... cluster4.
clusterList <- lapply(
    setNames(1:4, paste0("cluster", 1:4)),
    function(label) whiskey[clust4 == label, ]
)

## Also expose each subset under its own variable name.
cluster1 <- clusterList[["cluster1"]]
cluster2 <- clusterList[["cluster2"]]
cluster3 <- clusterList[["cluster3"]]
cluster4 <- clusterList[["cluster4"]]

## Mean of one column within a cluster, ignoring NAs, rounded to three
## significant figures.
cluster_mean <- function(cluster, column) {
    signif(mean(cluster[[column]], na.rm = TRUE), 3)
}

## Per-cluster means. vapply() is used so that each result is guaranteed to be
## a single number per cluster.
Age <- vapply(clusterList, cluster_mean, numeric(1), column = "Age")
Rating <- vapply(clusterList, cluster_mean, numeric(1), column = "Rating")
ABV <- vapply(clusterList, cluster_mean, numeric(1), column = "ABV")

## Most frequent country in each cluster, together with the share of the
## cluster's rows (denominator: all rows in the cluster) that it accounts for.
Country <- vapply(clusterList, function(x) {
    top <- sort(table(x$Country), decreasing = TRUE)[1]
    share <- signif(as.numeric(top) / nrow(x), 3)
    paste(names(top), share, sep = ", ")
}, character(1))

## Assemble the summary table (one row per cluster) and print it.
clustChar <- cbind.data.frame(`Country, Proportion` = Country, `Mean Age` = Age,
    `Mean Rating` = Rating, `Mean ABV` = ABV)
clustChar

##          Country, Proportion Mean Age Mean Rating Mean ABV
## cluster1          Ireland, 1     41.8        80.2     41.8
## cluster2         Scotland, 1     43.1        92.3     43.1
## cluster3          USA, 0.461     46.1        86.8     46.1
## cluster4     Scotland, 0.538     39.3        72.6     39.3

## Keep only the country name from each "Country, Proportion" entry by
## stripping everything from the first comma to the end of the string.
countries <- sub(",.*$", "", as.character(clustChar[[1]]))

## Scatter plot of Age against Rating with one colour per cluster.
## Only rows where both Rating and Age are present are drawn.
cluster_points <- lapply(clusterList, function(x) {
    pair <- x[, c("Rating", "Age")]
    pair[complete.cases(pair), ]
})

## Take the x-axis limits from the data so the points stay visible whatever
## the Age values are. c(35, 65) (the previous fixed limits) is only a
## fallback for the case where there is nothing to plot.
ages <- unlist(lapply(cluster_points, function(d) d$Age), use.names = FALSE)
age_limits <- if (length(ages) > 0) range(ages) else c(35, 65)

## Set up empty axes first, so that every cluster -- including the first -- is
## drawn exactly once by the loop below.
plot(x = numeric(0), y = numeric(0), type = "n", xlab = "Age in Cluster",
    ylab = "Rating in Cluster", main = "Age and Rating Characteristics by Cluster",
    xlim = age_limits, ylim = c(40, 100))

## Jitter spreads out whiskeys that share the same age and rating.
for (i in seq_along(cluster_points)) {
    current <- cluster_points[[i]]
    points(jitter(current$Age, factor = 3), jitter(current$Rating, factor = 3),
        pch = 16, col = alpha(col4[i], alpha = 0.7))
}

plot of chunk unnamed-chunk-8