Variety clustering using car data

source : http://statistics.berkeley.edu/classes/s133/Cluster2a.html

# 1. Hierarchical Clustering

cars <- read.delim("cars.tab", stringsAsFactors = FALSE)
head(cars)
##   Country                       Car  MPG Weight Drive_Ratio Horsepower
## 1    U.S.        Buick Estate Wagon 16.9  4.360        2.73        155
## 2    U.S. Ford Country Squire Wagon 15.5  4.054        2.26        142
## 3    U.S.        Chevy Malibu Wagon 19.2  3.605        2.56        125
## 4    U.S.    Chrysler LeBaron Wagon 18.5  3.940        2.45        150
## 5    U.S.                  Chevette 30.0  2.155        3.70         68
## 6   Japan             Toyota Corona 27.5  2.560        3.05         95
##   Displacement Cylinders
## 1          350         8
## 2          351         8
## 3          267         8
## 4          360         8
## 5           98         4
## 6          134         4

cars.use <- cars[, -c(1, 2)]
medians <- apply(cars.use, 2, median)
mads <- apply(cars.use, 2, mad)
cars.use <- scale(cars.use, center = medians, scale = mads)
cars.dist <- dist(cars.use)

# hclust use a dissimilarity structure as produced by dist.
cars.hclust <- hclust(cars.dist)

plot(cars.hclust, label = cars$Car, main = "Default from hclust")

plot of chunk unnamed-chunk-1


groups.3 <- cutree(cars.hclust, 3)
groups.3
##  [1] 1 1 1 1 2 2 2 2 2 3 2 3 3 3 3 3 1 1 1 1 2 3 2 2 2 2 2 2 3 3 2 2 2 2 2
## [36] 3 2 2
table(groups.3)
## groups.3
##  1  2  3 
##  8 20 10

counts <- sapply(2:10, function(ncl) table(cutree(cars.hclust, ncl)))
names(counts) = 2:10
counts
## $`2`
## 
##  1  2 
## 18 20 
## 
## $`3`
## 
##  1  2  3 
##  8 20 10 
## 
## $`4`
## 
##  1  2  3  4 
##  8 17  3 10 
## 
## $`5`
## 
##  1  2  3  4  5 
##  8 11  6  3 10 
## 
## $`6`
## 
##  1  2  3  4  5  6 
##  8 11  6  3  5  5 
## 
## $`7`
## 
##  1  2  3  4  5  6  7 
##  8 11  6  3  2  3  5 
## 
## $`8`
## 
##  1  2  3  4  5  6  7  8 
##  8 11  6  3  2  3  3  2 
## 
## $`9`
## 
##  1  2  3  4  5  6  7  8  9 
##  4  4 11  6  3  2  3  3  2 
## 
## $`10`
## 
##  1  2  3  4  5  6  7  8  9 10 
##  4  4 11  4  3  2  3  3  2  2

cars$Car[groups.3 == 1]
## [1] "Buick Estate Wagon"        "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon"        "Chrysler LeBaron Wagon"   
## [5] "Chevy Caprice Classic"     "Ford LTD"                 
## [7] "Mercury Grand Marquis"     "Dodge St Regis"
sapply(unique(groups.3), function(g) cars$Car[groups.3 == g])
## [[1]]
## [1] "Buick Estate Wagon"        "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon"        "Chrysler LeBaron Wagon"   
## [5] "Chevy Caprice Classic"     "Ford LTD"                 
## [7] "Mercury Grand Marquis"     "Dodge St Regis"           
## 
## [[2]]
##  [1] "Chevette"         "Toyota Corona"    "Datsun 510"      
##  [4] "Dodge Omni"       "Audi 5000"        "Saab 99 GLE"     
##  [7] "Ford Mustang 4"   "Mazda GLC"        "Dodge Colt"      
## [10] "AMC Spirit"       "VW Scirocco"      "Honda Accord LX" 
## [13] "Buick Skylark"    "Pontiac Phoenix"  "Plymouth Horizon"
## [16] "Datsun 210"       "Fiat Strada"      "VW Dasher"       
## [19] "BMW 320i"         "VW Rabbit"       
## 
## [[3]]
##  [1] "Volvo 240 GL"          "Peugeot 694 SL"       
##  [3] "Buick Century Special" "Mercury Zephyr"       
##  [5] "Dodge Aspen"           "AMC Concord D/L"      
##  [7] "Ford Mustang Ghia"     "Chevy Citation"       
##  [9] "Olds Omega"            "Datsun 810"

groups.4 <- cutree(cars.hclust, 4)
sapply(unique(groups.4), function(g) cars$Car[groups.4 == g])
## [[1]]
## [1] "Buick Estate Wagon"        "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon"        "Chrysler LeBaron Wagon"   
## [5] "Chevy Caprice Classic"     "Ford LTD"                 
## [7] "Mercury Grand Marquis"     "Dodge St Regis"           
## 
## [[2]]
##  [1] "Chevette"         "Toyota Corona"    "Datsun 510"      
##  [4] "Dodge Omni"       "Ford Mustang 4"   "Mazda GLC"       
##  [7] "Dodge Colt"       "AMC Spirit"       "VW Scirocco"     
## [10] "Honda Accord LX"  "Buick Skylark"    "Pontiac Phoenix" 
## [13] "Plymouth Horizon" "Datsun 210"       "Fiat Strada"     
## [16] "VW Dasher"        "VW Rabbit"       
## 
## [[3]]
## [1] "Audi 5000"   "Saab 99 GLE" "BMW 320i"   
## 
## [[4]]
##  [1] "Volvo 240 GL"          "Peugeot 694 SL"       
##  [3] "Buick Century Special" "Mercury Zephyr"       
##  [5] "Dodge Aspen"           "AMC Concord D/L"      
##  [7] "Ford Mustang Ghia"     "Chevy Citation"       
##  [9] "Olds Omega"            "Datsun 810"

table(groups.3, cars$Country)
##         
## groups.3 France Germany Italy Japan Sweden U.S.
##        1      0       0     0     0      0    8
##        2      0       5     1     6      1    7
##        3      1       0     0     1      1    7
aggregate(cars.use, list(groups.3), median)
##   Group.1     MPG  Weight Drive_Ratio Horsepower Displacement Cylinders
## 1       1 -0.7945  1.5051     -0.9134     1.0476       2.4776    4.7214
## 2       2  0.6859 -0.5871      0.5269    -0.6027      -0.5810   -0.6745
## 3       3 -0.4058  0.5246     -0.1686     0.3588       0.3272    2.0235
aggregate(cars[, -c(1, 2)], list(groups.3), median)
##   Group.1   MPG Weight Drive_Ratio Horsepower Displacement Cylinders
## 1       1 17.30  3.890       2.430      136.5          334         8
## 2       2 30.25  2.215       3.455       79.0          105         4
## 3       3 20.70  3.105       2.960      112.5          173         6

# 2. PAM: Partitioning Around Medois


library(cluster)

# pam(x) - x is data matrix or data frame, or dissimilarity matrix or
# object
cars.pam <- pam(cars.dist, 3)
names(cars.pam)
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

table(groups.3, cars.pam$clustering)
##         
## groups.3  1  2  3
##        1  8  0  0
##        2  0 19  1
##        3  0  0 10
cars$Car[groups.3 != cars.pam$cluster]
## [1] "Audi 5000"
cars$Car[cars.pam$id.med]
## [1] "Dodge St Regis"    "Dodge Omni"        "Ford Mustang Ghia"

# Range of SC Interpretation 0.71-1.0 A strong structure has been found
# 0.51-0.70 A reasonable structure has been found 0.26-0.50 The structure
# is weak and could be artificial < 0.25 No substantial structure has been
# found
plot(cars.pam)

plot of chunk unnamed-chunk-1

# if k=3 then result will be same as above
plot(silhouette(cutree(cars.hclust, 4), cars.dist))

plot of chunk unnamed-chunk-1


# 3. AGNES: Agglomerative Nesting
data(iris)
iris.use <- subset(iris, select = -Species)
d <- dist(iris.use)
library(cluster)
# daisy - Dissimilarity Matrix Calculation
d1 <- daisy(iris.use)
sum(abs(d - d1))
## [1] 0

z <- agnes(d)
plot(z)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

table(cutree(z, 3), iris$Species)
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         50        14
##   3      0          0        36
library(lattice)
splom(~iris, group = iris$Species, auto.key = TRUE)

plot of chunk unnamed-chunk-1

You can also embed plots, for example: