Variety clustering using car data
# 1. Hierarchical Clustering
cars <- read.delim("cars.tab", stringsAsFactors = FALSE)
head(cars)
## Country Car MPG Weight Drive_Ratio Horsepower
## 1 U.S. Buick Estate Wagon 16.9 4.360 2.73 155
## 2 U.S. Ford Country Squire Wagon 15.5 4.054 2.26 142
## 3 U.S. Chevy Malibu Wagon 19.2 3.605 2.56 125
## 4 U.S. Chrysler LeBaron Wagon 18.5 3.940 2.45 150
## 5 U.S. Chevette 30.0 2.155 3.70 68
## 6 Japan Toyota Corona 27.5 2.560 3.05 95
## Displacement Cylinders
## 1 350 8
## 2 351 8
## 3 267 8
## 4 360 8
## 5 98 4
## 6 134 4
cars.use <- cars[, -c(1, 2)]
medians <- apply(cars.use, 2, median)
mads <- apply(cars.use, 2, mad)
cars.use <- scale(cars.use, center = medians, scale = mads)
cars.dist <- dist(cars.use)
# hclust use a dissimilarity structure as produced by dist.
cars.hclust <- hclust(cars.dist)
plot(cars.hclust, label = cars$Car, main = "Default from hclust")
groups.3 <- cutree(cars.hclust, 3)
groups.3
## [1] 1 1 1 1 2 2 2 2 2 3 2 3 3 3 3 3 1 1 1 1 2 3 2 2 2 2 2 2 3 3 2 2 2 2 2
## [36] 3 2 2
table(groups.3)
## groups.3
## 1 2 3
## 8 20 10
counts <- sapply(2:10, function(ncl) table(cutree(cars.hclust, ncl)))
names(counts) = 2:10
counts
## $`2`
##
## 1 2
## 18 20
##
## $`3`
##
## 1 2 3
## 8 20 10
##
## $`4`
##
## 1 2 3 4
## 8 17 3 10
##
## $`5`
##
## 1 2 3 4 5
## 8 11 6 3 10
##
## $`6`
##
## 1 2 3 4 5 6
## 8 11 6 3 5 5
##
## $`7`
##
## 1 2 3 4 5 6 7
## 8 11 6 3 2 3 5
##
## $`8`
##
## 1 2 3 4 5 6 7 8
## 8 11 6 3 2 3 3 2
##
## $`9`
##
## 1 2 3 4 5 6 7 8 9
## 4 4 11 6 3 2 3 3 2
##
## $`10`
##
## 1 2 3 4 5 6 7 8 9 10
## 4 4 11 4 3 2 3 3 2 2
cars$Car[groups.3 == 1]
## [1] "Buick Estate Wagon" "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon" "Chrysler LeBaron Wagon"
## [5] "Chevy Caprice Classic" "Ford LTD"
## [7] "Mercury Grand Marquis" "Dodge St Regis"
sapply(unique(groups.3), function(g) cars$Car[groups.3 == g])
## [[1]]
## [1] "Buick Estate Wagon" "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon" "Chrysler LeBaron Wagon"
## [5] "Chevy Caprice Classic" "Ford LTD"
## [7] "Mercury Grand Marquis" "Dodge St Regis"
##
## [[2]]
## [1] "Chevette" "Toyota Corona" "Datsun 510"
## [4] "Dodge Omni" "Audi 5000" "Saab 99 GLE"
## [7] "Ford Mustang 4" "Mazda GLC" "Dodge Colt"
## [10] "AMC Spirit" "VW Scirocco" "Honda Accord LX"
## [13] "Buick Skylark" "Pontiac Phoenix" "Plymouth Horizon"
## [16] "Datsun 210" "Fiat Strada" "VW Dasher"
## [19] "BMW 320i" "VW Rabbit"
##
## [[3]]
## [1] "Volvo 240 GL" "Peugeot 694 SL"
## [3] "Buick Century Special" "Mercury Zephyr"
## [5] "Dodge Aspen" "AMC Concord D/L"
## [7] "Ford Mustang Ghia" "Chevy Citation"
## [9] "Olds Omega" "Datsun 810"
groups.4 <- cutree(cars.hclust, 4)
sapply(unique(groups.4), function(g) cars$Car[groups.4 == g])
## [[1]]
## [1] "Buick Estate Wagon" "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon" "Chrysler LeBaron Wagon"
## [5] "Chevy Caprice Classic" "Ford LTD"
## [7] "Mercury Grand Marquis" "Dodge St Regis"
##
## [[2]]
## [1] "Chevette" "Toyota Corona" "Datsun 510"
## [4] "Dodge Omni" "Ford Mustang 4" "Mazda GLC"
## [7] "Dodge Colt" "AMC Spirit" "VW Scirocco"
## [10] "Honda Accord LX" "Buick Skylark" "Pontiac Phoenix"
## [13] "Plymouth Horizon" "Datsun 210" "Fiat Strada"
## [16] "VW Dasher" "VW Rabbit"
##
## [[3]]
## [1] "Audi 5000" "Saab 99 GLE" "BMW 320i"
##
## [[4]]
## [1] "Volvo 240 GL" "Peugeot 694 SL"
## [3] "Buick Century Special" "Mercury Zephyr"
## [5] "Dodge Aspen" "AMC Concord D/L"
## [7] "Ford Mustang Ghia" "Chevy Citation"
## [9] "Olds Omega" "Datsun 810"
table(groups.3, cars$Country)
##
## groups.3 France Germany Italy Japan Sweden U.S.
## 1 0 0 0 0 0 8
## 2 0 5 1 6 1 7
## 3 1 0 0 1 1 7
aggregate(cars.use, list(groups.3), median)
## Group.1 MPG Weight Drive_Ratio Horsepower Displacement Cylinders
## 1 1 -0.7945 1.5051 -0.9134 1.0476 2.4776 4.7214
## 2 2 0.6859 -0.5871 0.5269 -0.6027 -0.5810 -0.6745
## 3 3 -0.4058 0.5246 -0.1686 0.3588 0.3272 2.0235
aggregate(cars[, -c(1, 2)], list(groups.3), median)
## Group.1 MPG Weight Drive_Ratio Horsepower Displacement Cylinders
## 1 1 17.30 3.890 2.430 136.5 334 8
## 2 2 30.25 2.215 3.455 79.0 105 4
## 3 3 20.70 3.105 2.960 112.5 173 6
# 2. PAM: Partitioning Around Medois
library(cluster)
# pam(x) - x is data matrix or data frame, or dissimilarity matrix or
# object
cars.pam <- pam(cars.dist, 3)
names(cars.pam)
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
table(groups.3, cars.pam$clustering)
##
## groups.3 1 2 3
## 1 8 0 0
## 2 0 19 1
## 3 0 0 10
cars$Car[groups.3 != cars.pam$cluster]
## [1] "Audi 5000"
cars$Car[cars.pam$id.med]
## [1] "Dodge St Regis" "Dodge Omni" "Ford Mustang Ghia"
# Range of SC Interpretation 0.71-1.0 A strong structure has been found
# 0.51-0.70 A reasonable structure has been found 0.26-0.50 The structure
# is weak and could be artificial < 0.25 No substantial structure has been
# found
plot(cars.pam)
# if k=3 then result will be same as above
plot(silhouette(cutree(cars.hclust, 4), cars.dist))
# 3. AGNES: Agglomerative Nesting
data(iris)
iris.use <- subset(iris, select = -Species)
d <- dist(iris.use)
library(cluster)
# daisy - Dissimilarity Matrix Calculation
d1 <- daisy(iris.use)
sum(abs(d - d1))
## [1] 0
z <- agnes(d)
plot(z)
table(cutree(z, 3), iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 50 14
## 3 0 0 36
library(lattice)
splom(~iris, group = iris$Species, auto.key = TRUE)
You can also embed plots, for example: