This is a dataset of 43 different cereal brands. I wanted to analyze it further because we already know which group each brand belongs to, and to see whether clustering would be a good technique for identifying the correct manufacturer: General Mills, Kellogg's, or Quaker. The analysis is at the very bottom.
# Load the whitespace-delimited cereal table (no header row) and name its
# eleven columns.
cereal <- read.csv("T11-9.DAT.txt", sep = "", header = FALSE)
colnames(cereal) <- c(
  "Brand", "Manufacturer", "Calories", "Protein", "Fat", "Sodium", "Fiber",
  "Carbohydrates", "Sugar", "Potassium", "Group"
)
# Keep Manufacturer plus the eight nutrition columns; drop Brand and Group.
cereal <- cereal[, 2:10]
# Convert through character first. When read.csv() has read this column as a
# factor, as.numeric() alone returns the integer level codes rather than the
# values. The head() output recorded with this chunk (Carbohydrates 4, 13, 7,
# ...) appears to show such codes, so re-knit to refresh the printed results.
# NOTE(review): if this line now warns "NAs introduced by coercion", the raw
# file contains a non-numeric carbohydrate entry that must be cleaned first.
cereal$Carbohydrates <- as.numeric(as.character(cereal$Carbohydrates))
head(cereal)
## Manufacturer Calories Protein Fat Sodium Fiber Carbohydrates Sugar
## 1 G 110 2 2 180 1.5 4 10
## 2 G 110 6 2 290 2.0 13 1
## 3 G 110 1 1 180 0.0 7 13
## 4 G 110 1 1 180 0.0 7 13
## 5 G 110 1 1 280 0.0 11 9
## 6 G 110 3 1 250 1.5 6 10
## Potassium
## 1 70
## 2 105
## 3 55
## 4 65
## 5 45
## 6 90
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
library(robustHD)
## Loading required package: ggplot2
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
# Linear discriminant analysis of Manufacturer on every other column of
# `cereal`, using equal prior probabilities for the three manufacturers.
set.seed(1234)
z <- lda(
  Manufacturer ~ .,
  data = cereal,
  prior = c(1, 1, 1) / 3
)
# Show priors, group means, discriminant coefficients and proportion of trace.
print(z)
## Call:
## lda(Manufacturer ~ ., data = cereal, prior = c(1, 1, 1)/3)
##
## Prior probabilities of groups:
## G K Q
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Calories Protein Fat Sodium Fiber Carbohydrates Sugar
## G 110.5882 2.352941 1.235294 203.52941 1.294118 9.705882 8.117647
## K 111.0000 2.600000 0.650000 185.50000 2.250000 12.050000 7.950000
## Q 90.0000 2.333333 1.333333 98.33333 1.116667 5.500000 5.000000
## Potassium
## G 85.00000
## K 91.75000
## Q 58.33333
##
## Coefficients of linear discriminants:
## LD1 LD2
## Calories -0.042394995 -0.022385273
## Protein -0.192736441 0.043708862
## Fat 1.030238269 0.230557790
## Sodium -0.002097074 0.008271173
## Fiber -0.938811912 -1.424824908
## Carbohydrates -0.112847256 0.016979986
## Sugar -0.103510958 0.070253284
## Potassium 0.019779960 0.035706831
##
## Proportion of trace:
## LD1 LD2
## 0.8095 0.1905
# Plot the fitted LDA object `z` (dispatches to the plot method for class
# "lda" from MASS); the fit above has two discriminants, LD1 and LD2.
plot(z)
# Reload the raw table for the clustering section, this time keeping Brand so
# it can be used as row names.
cereal <- read.csv("T11-9.DAT.txt", sep = "", header = FALSE)
colnames(cereal) <- c(
  "Brand", "Manufacturer", "Calories", "Protein", "Fat", "Sodium", "Fiber",
  "Carbohydrates", "Sugar", "Potassium", "Group"
)
cereal <- cereal[, 1:10] # drop Group
# Convert through character first: as.numeric() applied directly to a factor
# returns its integer level codes, not the values stored in the file.
# NOTE(review): a "NAs introduced by coercion" warning here means the raw file
# holds a non-numeric entry in that column; clean it before standardizing.
cereal$Carbohydrates <- as.numeric(as.character(cereal$Carbohydrates))
cereal$Fiber <- as.numeric(as.character(cereal$Fiber))
# Standardize the eight nutrition columns (3:10) with robustHD's standardize().
std <- standardize(cereal[3:10])
# Brands become row names, so dendrogram leaves are labelled by brand.
rownames(std) <- cereal$Brand
# Carry the known manufacturer along for later comparison with the clusters.
std$Manufacturer <- cereal$Manufacturer
# From here on `cereal` is the standardized table plus Manufacturer; it has no
# Brand column any more.
cereal <- std
# Average linkage, Euclidean distance ----
# Use only the numeric columns: handing the whole data frame to dist() sends
# the non-numeric Manufacturer column through a coercion to NA (the former
# "NAs introduced by coercion" warning), after which dist() rescales the
# remaining sum to make up for the dropped column.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "euclidean"
) # distance matrix
fit <- hclust(d, method = "average")
# `cereal` has no Brand column here (brands are its row names), so
# cereal$Brand was NULL; label the leaves from the row names explicitly.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Average Linkage, Euclidean") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Single linkage, Euclidean distance ----
# Restrict dist() to numeric columns; the non-numeric Manufacturer column
# otherwise becomes NA ("NAs introduced by coercion") and the Euclidean sums
# are scaled up to compensate for the excluded column.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "euclidean"
) # distance matrix
fit <- hclust(d, method = "single")
# Brands live in the row names (there is no Brand column at this point).
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Single Linkage, Euclidean") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Complete linkage, Euclidean distance ----
# Only numeric columns go into dist(); including Manufacturer triggered
# "NAs introduced by coercion" and a rescaling of the Euclidean distances.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "euclidean"
) # distance matrix
fit <- hclust(d, method = "complete")
# Leaf labels come from the row names, which hold the brand names.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Complete Linkage, Euclidean") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Average linkage, Manhattan distance ----
# Drop non-numeric columns before dist(): Manufacturer was being coerced to
# NA ("NAs introduced by coercion"), and Manhattan sums are then scaled up in
# proportion to the columns left, inflating every distance.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "manhattan"
) # distance matrix
fit <- hclust(d, method = "average")
# cereal$Brand does not exist here; the brand names are the row names.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Average Linkage, Manhattan") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Single linkage, Manhattan distance ----
# Numeric columns only, so the Manufacturer column no longer reaches dist()
# as NA and no compensating rescale of the Manhattan sums takes place.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "manhattan"
) # distance matrix
fit <- hclust(d, method = "single")
# Label leaves with the row names (the brands); there is no Brand column.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Single Linkage, Manhattan") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Complete linkage, Manhattan distance ----
# Select the numeric columns for dist(); with Manufacturer included the call
# warned "NAs introduced by coercion" and returned rescaled distances.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "manhattan"
) # distance matrix
fit <- hclust(d, method = "complete")
# Brand names are stored as row names, not in a Brand column.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Complete Linkage, Manhattan") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Average linkage, maximum (Chebyshev) distance ----
# Pass only numeric columns to dist(). Including Manufacturer turned the whole
# input into text first and then produced "NAs introduced by coercion".
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "maximum"
) # distance matrix
fit <- hclust(d, method = "average")
# cereal$Brand is NULL at this point; the row names carry the brands.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Average Linkage, Maximum") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Single linkage, maximum (Chebyshev) distance ----
# Keep the non-numeric Manufacturer column out of dist() so nothing has to be
# coerced to NA (previously: "NAs introduced by coercion").
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "maximum"
) # distance matrix
fit <- hclust(d, method = "single")
# Use the row names (brands) as leaf labels; no Brand column exists here.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Single Linkage, Maximum") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Complete linkage, maximum (Chebyshev) distance ----
# dist() gets the numeric columns only, which removes the
# "NAs introduced by coercion" warning caused by the Manufacturer column.
d <- dist(
  cereal[vapply(cereal, is.numeric, logical(1))],
  method = "maximum"
) # distance matrix
fit <- hclust(d, method = "complete")
# Leaves are labelled from the row names, where the brand names are kept.
plot(fit, cex = 0.5, labels = rownames(cereal),
     main = "Complete Linkage, Maximum") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")
# Repeat the average-linkage / Manhattan clustering with row 18 left out.
# NOTE(review): the variable name suggests row 18 is the All-Bran cereal;
# confirm against rownames(cereal) before relying on the hard-coded index.
bye_all_bran <- cereal[-18, ]
# Numeric columns only: the Manufacturer column would otherwise be coerced to
# NA ("NAs introduced by coercion") and the Manhattan sums rescaled.
d <- dist(
  bye_all_bran[vapply(bye_all_bran, is.numeric, logical(1))],
  method = "manhattan"
) # distance matrix
fit <- hclust(d, method = "average")
# Take the labels from the reduced table so they match its 42 leaves; labels
# taken from the full `cereal` table would be one element too long.
plot(fit, cex = 0.5, labels = rownames(bye_all_bran),
     main = "Average Linkage, Manhattan") # display dendrogram
groups <- cutree(fit, k = 3) # cut tree into 3 clusters
rect.hclust(fit, k = 3, border = "red")