# Load the cereal data, standardize the numeric columns, and shuffle the rows.
#
# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept as-is because code outside this view may rely on the working
# directory; consider building paths with file.path() instead.
setwd("~/Desktop/R Materials/mih140/Lecture 24 - Unsupervised Learning II")

# 43 cereals from Europe (tab-separated file with a header row).
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
cereal <- read.table("BreakfastCereals.txt", sep = "\t", header = TRUE)

# Center and scale columns 3-10 so every feature contributes comparably to
# distances. Assumes columns 3-10 are the numeric ones -- TODO confirm.
cereal.std <- data.frame(scale(cereal[, 3:10]))

# Shuffle the rows. Row subsetting keeps each row's original row name, so
# rownames(cereal.std) still identifies the matching row of `cereal`.
cereal.std <- cereal.std[sample(nrow(cereal.std)), ]
# Question: What's a good grouping of the cereals? For instance, you might
# wonder which sorts of cereals are similarly nutritious, so that you can
# categorize cereals as Healthy, Okay, or Pure Sugar. This sort of
# classification can determine how advertisers market their cereals, how they
# should be displayed, etc. It is also just interesting.
# Let's just look at a few of the features.
features <- c("Calories", "Protein", "Fat", "Sodium", "Potassium")

# Pairwise Euclidean distances between cereals on the selected features.
dist_mat <- dist(cereal.std[, features], method = "euclidean")

# Single-link hierarchical clustering.
# `members` is left at its default (NULL): that argument gives the number of
# observations per cluster when `d` holds between-cluster dissimilarities;
# it is not a place to pass leaf labels.
hc_single <- hclust(d = dist_mat, method = "single")

# Let's plot it.
plot(hc_single) # makes a horrible mess

# Label each leaf with its brand. The rows of cereal.std were shuffled, so
# the brands are looked up by row name; passing cereal$Brand directly would
# attach brands in the unshuffled order and mislabel the leaves.
plot(
  hc_single,
  labels = as.character(cereal[rownames(cereal.std), "Brand"]),
  hang = -1
) # Much nicer
# install.packages("NbClust")
library(NbClust)

# Ask NbClust to evaluate 2 through 7 clusters using single linkage on
# Euclidean distances. In the recorded run, the majority of indices
# proposed 3 clusters.
num_of_clust <- NbClust(
  data = cereal.std[, features],
  distance = "euclidean",
  min.nc = 2,
  max.nc = 7,
  method = "single"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 11 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Let's see the three clusters!
# Brands are looked up by the row names of cereal.std because its rows were
# shuffled; cereal$Brand in file order would mislabel the leaves.
plot(
  hc_single,
  labels = as.character(cereal[rownames(cereal.std), "Brand"]),
  hang = -1
)

# Draw a box around each of the 3 clusters on the dendrogram.
rect.hclust(hc_single, k = 3)

# cutree() cuts the tree into k groups and returns one cluster id (1 to 3)
# per observation, in the row order of cereal.std.
clust_labels_3 <- cutree(hc_single, k = 3)
# Average Link
# `members` is deliberately omitted. It is meant for cluster sizes, and
# average linkage weights clusters by it, so passing the Brand factor (which
# is coerced to its integer codes) distorts the resulting tree.
hc_avg <- hclust(d = dist_mat, method = "average")

# Brands are looked up by the row names of cereal.std because its rows were
# shuffled; cereal$Brand in file order would mislabel the leaves.
plot(
  hc_avg,
  labels = as.character(cereal[rownames(cereal.std), "Brand"]),
  hang = -1
)

# Box the 3 clusters on the dendrogram.
rect.hclust(hc_avg, k = 3)
# Complete Link
# `members` is omitted: it is meant for cluster sizes, not leaf labels.
hc_cmplt <- hclust(d = dist_mat, method = "complete")

# Brands are looked up by the row names of cereal.std because its rows were
# shuffled; cereal$Brand in file order would mislabel the leaves.
plot(
  hc_cmplt,
  labels = as.character(cereal[rownames(cereal.std), "Brand"]),
  hang = -1
)

# Box the 3 clusters on the dendrogram.
rect.hclust(hc_cmplt, k = 3)