These data were obtained online at http://www-958.ibm.com/software/analytics/manyeyes. The data frame includes information on rating, country of origin, type of whiskey, price, alcohol by volume (ABV), age, and brand for each of 284 whiskeys.
## Load the raw whiskey data.
## data was obtained online at
## http://www-958.ibm.com/software/analytics/manyeyes/datasets/whiskey-data-4/versions/1
## posted by user bretwcollins
whiskey <- read.delim("whiskey.txt")

## Cells holding "*" are recoded as missing.
whiskey[whiskey == "*"] <- NA

## Convert the measurement columns to numeric. Going through as.character()
## first converts the printed values rather than the internal level codes if a
## column was read in as a factor. Each column is converted from itself: Age
## was previously filled from ABV by mistake, which made the two identical.
## NOTE(review): the cut height, recorded output and plot limits further down
## were produced while Age held ABV values; re-run and re-check them.
numericCols <- c("Rating", "Price", "ABV", "Age")
whiskey[numericCols] <- lapply(
  whiskey[numericCols],
  function(x) as.numeric(as.character(x))
)

## Label rows by whiskey name, then keep columns 2 through 8
## (presumably everything except the Name column -- confirm against the file).
row.names(whiskey) <- whiskey$Name
whiskey <- whiskey[, 2:8]

## Pairwise Gower dissimilarities between whiskeys, followed by hierarchical
## clustering with hclust()'s default settings.
distance <- daisy(whiskey, metric = "gower")
whiskeyclust <- hclust(distance)
To choose a natural number of clusters, the overall structure of the hierarchical clustering was examined. From the plot below, it's clear that there are four fairly distinct clusters; beyond those four, however, the splits seem less distinct. For now, each of the whiskeys will be classified as belonging to one of four clusters.
## Dendrogram of the full hierarchy; labels are drawn very small (cex = 0.1).
plot(whiskeyclust,
  cex = 0.1, axes = TRUE, xlab = "", ylab = "Height", sub = NA,
  main = "Structure of Hierarchical Clusters"
)

## One colour per cluster.
col4 <- c("slateblue", "mediumseagreen", "palevioletred", "royalblue3")

## Cut the tree at height 0.69 to assign each whiskey a cluster label.
## NOTE(review): this height is assumed to yield exactly four clusters, matching
## the four colours in col4 -- confirm whenever the distance matrix changes.
clust4 <- cutree(whiskeyclust, h = 0.69)

## par(mar=c(0,0,1.5,0))
## Same tree drawn as a phylogram, with tip labels coloured by cluster.
plot(as.phylo(whiskeyclust),
  tip.color = col4[clust4], cex = 0.5, label.offset = 0.002,
  main = "Hierarchical Clusters of Whiskeys", cex.main = 1.5
)
## Split the whiskeys into one data frame per cluster label (1 to 4). The
## pieces are collected in a named list and also exposed individually as
## cluster1 ... cluster4.
clusterList <- lapply(
  c(cluster1 = 1, cluster2 = 2, cluster3 = 3, cluster4 = 4),
  function(label) whiskey[clust4 == label, ]
)
cluster1 <- clusterList[["cluster1"]]
cluster2 <- clusterList[["cluster2"]]
cluster3 <- clusterList[["cluster3"]]
cluster4 <- clusterList[["cluster4"]]
## Per-cluster means of Age, Rating and ABV, ignoring missing values and
## rounded to three significant digits. vapply() pins each result to a single
## number per cluster.
Age <- vapply(
  clusterList,
  function(x) signif(mean(x$Age, na.rm = TRUE), 3),
  numeric(1)
)
Rating <- vapply(
  clusterList,
  function(x) signif(mean(x$Rating, na.rm = TRUE), 3),
  numeric(1)
)
ABV <- vapply(
  clusterList,
  function(x) signif(mean(x$ABV, na.rm = TRUE), 3),
  numeric(1)
)

## Most frequent country in each cluster, reported as "Country, proportion".
## The proportion is that country's count divided by all rows in the cluster.
Country <- vapply(
  clusterList,
  function(x) {
    topCountry <- sort(table(x$Country), decreasing = TRUE)[1]
    proportionCountry <- signif(as.numeric(topCountry) / nrow(x), 3)
    paste(names(topCountry), proportionCountry, sep = ", ")
  },
  character(1)
)

## Assemble the summary table; check.names = FALSE keeps the column headings
## with their spaces and commas intact.
clustChar <- data.frame(
  `Country, Proportion` = Country,
  `Mean Age` = Age,
  `Mean Rating` = Rating,
  `Mean ABV` = ABV,
  check.names = FALSE
)
clustChar
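## Country, Proportion Mean Age Mean Rating Mean ABV
## cluster1 Ireland, 1 41.8 80.2 41.8
## cluster2 Scotland, 1 43.1 92.3 43.1
## cluster3 USA, 0.461 46.1 86.8 46.1
## cluster4 Scotland, 0.538 39.3 72.6 39.3
## NOTE: Mean Age equals Mean ABV in every row of this recorded output, which
## is what results when Age is populated from the ABV column.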
## Keep only the country name from each "Country, Proportion" label by
## removing everything from the first comma onward.
countries <- sub(",.*$", "", as.character(clustChar[[1]]))
## Scatter plot of Age against Rating with one colour per cluster.

## Derive the x-axis limits from the ages actually present instead of
## hard-coding them, padding each side so jittered points are not clipped.
## Fall back to the previous fixed limits when no age is available.
allAges <- unlist(lapply(clusterList, function(x) x$Age), use.names = FALSE)
allAges <- allAges[is.finite(allAges)]
if (length(allAges) > 0) {
  agePad <- max(1, 0.05 * diff(range(allAges)))
  ageLimits <- range(allAges) + c(-1, 1) * agePad
} else {
  ageLimits <- c(35, 65)
}

## Open an empty frame first so every cluster, including the first, is drawn
## exactly once by the loop below.
plot(NA, NA,
  type = "n", xlim = ageLimits, ylim = c(40, 100),
  xlab = "Age in Cluster", ylab = "Rating in Cluster",
  main = "Age and Rating Characteristics by Cluster"
)

for (i in seq_along(clusterList)) {
  ## Columns 1 and 6 are read below as Rating and Age; rows missing either
  ## value are dropped.
  clusterPts <- clusterList[[i]][, c(1, 6)]
  clusterPts <- as.data.frame(clusterPts[complete.cases(clusterPts), ])
  if (nrow(clusterPts) == 0) {
    next
  }
  ## Jitter both coordinates to reduce overplotting of tied values.
  points(jitter(clusterPts$Age, factor = 3),
    jitter(clusterPts$Rating, factor = 3),
    pch = 16, col = alpha(col4[i], alpha = 0.7)
  )
}