Clustering with Cars data

kable(cars[1:10,])

Country	Car	MPG	Weight	Drive_Ratio	Horsepower	Displacement	Cylinders
U.S.	Buick Estate Wagon	16.9	4.360	2.73	155	350	8
U.S.	Ford Country Squire Wagon	15.5	4.054	2.26	142	351	8
U.S.	Chevy Malibu Wagon	19.2	3.605	2.56	125	267	8
U.S.	Chrysler LeBaron Wagon	18.5	3.940	2.45	150	360	8
U.S.	Chevette	30.0	2.155	3.70	68	98	4
Japan	Toyota Corona	27.5	2.560	3.05	95	134	4
Japan	Datsun 510	27.2	2.300	3.54	97	119	4
U.S.	Dodge Omni	30.9	2.230	3.37	75	105	4
Germany	Audi 5000	20.3	2.830	3.90	103	131	5
Sweden	Volvo 240 GL	17.0	3.140	3.50	125	163	6

car1 <-cars[,-c(1,2)]
car <- scale(car1,center=TRUE,scale=TRUE)

##
discar<- dist(car)
hcar<-hclust(discar)
ggdendrogram(hcar,size=3,rotate = T,hang=-1,theme_dendro = F,labels = T)+theme_dark()

ggdendrogram(hcar,size=3,rotate = F,hang=-1,theme_dendro = F)

###checking with other distances 
hcar.a<- hclust(discar,method = "average")
hcaralabel<- cutree(hcar.a,3)
hcar.labels<- cutree(hcar,3)
table(hcar.labels,hcaralabel) ####a bit contradiction between 2nd and 4rd cluster

##            hcaralabel
## hcar.labels  1  2  3
##           1  8  0  0
##           2  0 17  2
##           3  0  0 11

car<- data.frame(car)
ggplot(car,aes(car$Weight,car$Drive_Ratio))+geom_point(aes(col=hcar.labels),show.legend = F)+
  scale_color_gradient(low="blue", high="red")##change the colour by the scale

silhouette(hcar.labels,discar)

##       cluster neighbor   sil_width
##  [1,]       1        3  0.64464958
##  [2,]       1        3  0.77106933
##  [3,]       1        3  0.54210767
##  [4,]       1        3  0.74656176
##  [5,]       2        3  0.54585357
##  [6,]       2        3  0.33613000
##  [7,]       2        3  0.32432997
##  [8,]       2        3  0.59482921
##  [9,]       3        2  0.30602432
## [10,]       3        1  0.52111190
## [11,]       3        2  0.22879122
## [12,]       3        1  0.43599839
## [13,]       3        1  0.23127213
## [14,]       3        2  0.33422249
## [15,]       3        1  0.11759924
## [16,]       3        1 -0.03202054
## [17,]       1        3  0.75863102
## [18,]       1        3  0.73053785
## [19,]       1        3  0.78556792
## [20,]       1        3  0.77793710
## [21,]       2        3  0.33006248
## [22,]       3        2  0.40350225
## [23,]       2        3  0.56112055
## [24,]       2        3  0.55642437
## [25,]       2        3  0.40264059
## [26,]       2        3  0.55379182
## [27,]       2        3  0.54934199
## [28,]       2        3  0.26902569
## [29,]       2        3 -0.05152949
## [30,]       2        3 -0.18909017
## [31,]       2        3  0.41418644
## [32,]       2        3  0.60390847
## [33,]       2        3  0.56935797
## [34,]       2        3  0.56130671
## [35,]       2        3  0.53992065
## [36,]       3        2  0.30493117
## [37,]       3        2  0.15460105
## [38,]       2        3  0.55387268
## attr(,"Ordered")
## [1] FALSE
## attr(,"call")
## silhouette.default(x = hcar.labels, dist = discar)
## attr(,"class")
## [1] "silhouette"

plot(silhouette(hcar.labels,discar)) #### overall good, 3 outliars

########found the number of K
gap_stat <- clusGap(car, FUN = kmeans,K.max = 10) ######with clusgap
fviz_gap_stat(gap_stat)

ekm <- eclust(car, "kmeans")  ####with eclust

fviz_gap_stat(ekm$gap_stat)

  #### found the number of K
########################
wss <- (nrow(car)-1)*sum(apply(car,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(car, 
    centers=i)$withinss)
####plot(1:15, wss, type="b", xlab="Number of Clusters",
  ##ylab="Within groups sum of squares")
wss<- data.frame(1:15,wss)
ggplot(data=wss,aes(x=1:15,y=wss))+geom_line()+
  geom_point()

########################## normal ones 
hcarkm2 <- kmeans(car, centers=2)  ###########
plot(car,col=hcarkm2$cluster)

hcarkm3 <-kmeans(car, centers=3)  ###############
plot(car,col=hcarkm3$cluster)

hcarkm5 <-kmeans(car, centers=5)  #########
###

plot(silhouette(hcarkm2$cluster,discar)) ###2 cluster kmeans

fviz_cluster(hcarkm2, car)  ########

fviz_silhouette(silhouette(hcarkm2$cluster,discar))

##   cluster size ave.sil.width
## 1       1   13          0.53
## 2       2   25          0.51

plot(silhouette(hcarkm3$cluster,discar)) ## 3 cluster kmeans

fviz_cluster(hcarkm3, car)

fviz_silhouette(silhouette(hcarkm3$cluster,discar))

##   cluster size ave.sil.width
## 1       1    8          0.72
## 2       2   13          0.25
## 3       3   17          0.51

fviz_silhouette((ekm)) ####### another way

##   cluster size ave.sil.width
## 1       1   17          0.51
## 2       2    8          0.72
## 3       3   13          0.25

plot(silhouette(hcarkm5$cluster,discar)) ## 5 cluster kmeans

fviz_cluster(hcarkm5, car)

fviz_silhouette(silhouette(hcarkm5$cluster,discar))

##   cluster size ave.sil.width
## 1       1    7          0.26
## 2       2    5          0.40
## 3       3    7          0.30
## 4       4    8          0.65
## 5       5   11          0.52

###############
#####################        H methodddddddddddddddddddddddddddd
ehc <- eclust(car, "hclust") # compute hclust
fviz_dend(ehc, rect = F) # dendrogam

fviz_silhouette(ehc) # silhouette plot

##   cluster size ave.sil.width
## 1       1    8          0.66
## 2       2   11          0.45
## 3       3    6          0.37
## 4       4    6          0.35
## 5       5    7          0.28

fviz_cluster(ehc)+theme_cleveland()+geom_smooth() # scatter plot

## `geom_smooth()` using method = 'loess'

### cluster modelling 
ggscatter(car,x = "Weight",y = "Horsepower")+geom_density2d()

cars<- scale(car) # Standardize the data
mcar<- Mclust(car)
summary(mcar)

## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
## 
##  log.likelihood  n df       BIC       ICL
##        23.52866 38 73 -218.4865 -218.5149
## 
## Clustering table:
##  1  2  3 
##  8 17 13

mcar$modelName  ##### Optimal selected model ==> "VEV"

## [1] "VEV"

mcar$G          ##### Optimal number of cluster => 3

## [1] 3

head(mcar$z, 30)              # Probality to belong to a given cluster

##            [,1]          [,2]          [,3]
##  [1,] 1.0000000  0.000000e+00 6.744316e-107
##  [2,] 1.0000000  0.000000e+00  1.034739e-23
##  [3,] 0.9999979  0.000000e+00  2.133299e-06
##  [4,] 1.0000000  0.000000e+00  2.941828e-75
##  [5,] 0.0000000  9.999946e-01  5.430960e-06
##  [6,] 0.0000000  9.999716e-01  2.843821e-05
##  [7,] 0.0000000  9.861104e-01  1.388963e-02
##  [8,] 0.0000000  9.999963e-01  3.685960e-06
##  [9,] 0.0000000 3.729863e-183  1.000000e+00
## [10,] 0.0000000  0.000000e+00  1.000000e+00
## [11,] 0.0000000  2.818544e-14  1.000000e+00
## [12,] 0.0000000  0.000000e+00  1.000000e+00
## [13,] 0.0000000  0.000000e+00  1.000000e+00
## [14,] 0.0000000  0.000000e+00  1.000000e+00
## [15,] 0.0000000  0.000000e+00  1.000000e+00
## [16,] 0.0000000  0.000000e+00  1.000000e+00
## [17,] 1.0000000  0.000000e+00  2.467999e-10
## [18,] 0.9999973  0.000000e+00  2.686976e-06
## [19,] 1.0000000  0.000000e+00  1.257471e-26
## [20,] 1.0000000  0.000000e+00  2.364268e-25
## [21,] 0.0000000  9.999909e-01  9.086168e-06
## [22,] 0.0000000  0.000000e+00  1.000000e+00
## [23,] 0.0000000  1.000000e+00  3.799760e-14
## [24,] 0.0000000  9.998226e-01  1.774055e-04
## [25,] 0.0000000  1.000000e+00  6.401645e-09
## [26,] 0.0000000  9.999999e-01  8.646310e-08
## [27,] 0.0000000  1.000000e+00  2.559689e-21
## [28,] 0.0000000  1.000000e+00  1.020366e-13
## [29,] 0.0000000  0.000000e+00  1.000000e+00
## [30,] 0.0000000  0.000000e+00  1.000000e+00

head(mcar$classification, 30) # Cluster assignement of each observation

##  [1] 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 1 1 1 1 2 3 2 2 2 2 2 2 3 3

# BIC values used for choosing the number of clusters
fviz_mclust(mcar, "BIC", palette = "jco")

# Classification: plot showing the clustering
fviz_mclust(mcar, "classification", geom = "point", 
            pointsize = 1.5, palette = "jco")

# Classification uncertainty
fviz_mclust(mcar, "uncertainty", palette = "jco")

####
###

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##       MPG               Weight         Drive_Ratio      
##  Min.   :-1.41440   Min.   :-1.3410   Min.   :-1.60999  
##  1st Qu.:-0.95238   1st Qu.:-0.9272   1st Qu.:-0.76966  
##  Median :-0.07797   Median :-0.2517   Median :-0.02593  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.85752   3rd Qu.: 0.7740   3rd Qu.: 1.02689  
##  Max.   : 1.91521   Max.   : 2.1179   Max.   : 1.55813  
##    Horsepower        Displacement       Cylinders      
##  Min.   :-1.38918   Min.   :-1.0384   Min.   :-0.8701  
##  1st Qu.:-0.87869   1st Qu.:-0.8134   1st Qu.:-0.8701  
##  Median :-0.06568   Median :-0.3239   Median :-0.5582  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.83242   3rd Qu.: 0.5874   3rd Qu.: 0.3776  
##  Max.   : 2.01412   Max.   : 2.0558   Max.   : 1.6252

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.