kable(cars[1:10,])
| Country | Car | MPG | Weight | Drive_Ratio | Horsepower | Displacement | Cylinders |
|---|---|---|---|---|---|---|---|
| U.S. | Buick Estate Wagon | 16.9 | 4.360 | 2.73 | 155 | 350 | 8 |
| U.S. | Ford Country Squire Wagon | 15.5 | 4.054 | 2.26 | 142 | 351 | 8 |
| U.S. | Chevy Malibu Wagon | 19.2 | 3.605 | 2.56 | 125 | 267 | 8 |
| U.S. | Chrysler LeBaron Wagon | 18.5 | 3.940 | 2.45 | 150 | 360 | 8 |
| U.S. | Chevette | 30.0 | 2.155 | 3.70 | 68 | 98 | 4 |
| Japan | Toyota Corona | 27.5 | 2.560 | 3.05 | 95 | 134 | 4 |
| Japan | Datsun 510 | 27.2 | 2.300 | 3.54 | 97 | 119 | 4 |
| U.S. | Dodge Omni | 30.9 | 2.230 | 3.37 | 75 | 105 | 4 |
| Germany | Audi 5000 | 20.3 | 2.830 | 3.90 | 103 | 131 | 5 |
| Sweden | Volvo 240 GL | 17.0 | 3.140 | 3.50 | 125 | 163 | 6 |
car1 <-cars[,-c(1,2)]
car <- scale(car1,center=TRUE,scale=TRUE)
##
discar<- dist(car)
hcar<-hclust(discar)
ggdendrogram(hcar,size=3,rotate = T,hang=-1,theme_dendro = F,labels = T)+theme_dark()
ggdendrogram(hcar,size=3,rotate = F,hang=-1,theme_dendro = F)
###checking with other distances
hcar.a<- hclust(discar,method = "average")
hcaralabel<- cutree(hcar.a,3)
hcar.labels<- cutree(hcar,3)
table(hcar.labels,hcaralabel) ####a bit contradiction between 2nd and 4rd cluster
## hcaralabel
## hcar.labels 1 2 3
## 1 8 0 0
## 2 0 17 2
## 3 0 0 11
car<- data.frame(car)
ggplot(car,aes(car$Weight,car$Drive_Ratio))+geom_point(aes(col=hcar.labels),show.legend = F)+
scale_color_gradient(low="blue", high="red")##change the colour by the scale
silhouette(hcar.labels,discar)
## cluster neighbor sil_width
## [1,] 1 3 0.64464958
## [2,] 1 3 0.77106933
## [3,] 1 3 0.54210767
## [4,] 1 3 0.74656176
## [5,] 2 3 0.54585357
## [6,] 2 3 0.33613000
## [7,] 2 3 0.32432997
## [8,] 2 3 0.59482921
## [9,] 3 2 0.30602432
## [10,] 3 1 0.52111190
## [11,] 3 2 0.22879122
## [12,] 3 1 0.43599839
## [13,] 3 1 0.23127213
## [14,] 3 2 0.33422249
## [15,] 3 1 0.11759924
## [16,] 3 1 -0.03202054
## [17,] 1 3 0.75863102
## [18,] 1 3 0.73053785
## [19,] 1 3 0.78556792
## [20,] 1 3 0.77793710
## [21,] 2 3 0.33006248
## [22,] 3 2 0.40350225
## [23,] 2 3 0.56112055
## [24,] 2 3 0.55642437
## [25,] 2 3 0.40264059
## [26,] 2 3 0.55379182
## [27,] 2 3 0.54934199
## [28,] 2 3 0.26902569
## [29,] 2 3 -0.05152949
## [30,] 2 3 -0.18909017
## [31,] 2 3 0.41418644
## [32,] 2 3 0.60390847
## [33,] 2 3 0.56935797
## [34,] 2 3 0.56130671
## [35,] 2 3 0.53992065
## [36,] 3 2 0.30493117
## [37,] 3 2 0.15460105
## [38,] 2 3 0.55387268
## attr(,"Ordered")
## [1] FALSE
## attr(,"call")
## silhouette.default(x = hcar.labels, dist = discar)
## attr(,"class")
## [1] "silhouette"
plot(silhouette(hcar.labels,discar)) #### overall good, 3 outliars
########found the number of K
gap_stat <- clusGap(car, FUN = kmeans,K.max = 10) ######with clusgap
fviz_gap_stat(gap_stat)
ekm <- eclust(car, "kmeans") ####with eclust
fviz_gap_stat(ekm$gap_stat)
#### found the number of K
########################
wss <- (nrow(car)-1)*sum(apply(car,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(car,
centers=i)$withinss)
####plot(1:15, wss, type="b", xlab="Number of Clusters",
##ylab="Within groups sum of squares")
wss<- data.frame(1:15,wss)
ggplot(data=wss,aes(x=1:15,y=wss))+geom_line()+
geom_point()
########################## normal ones
hcarkm2 <- kmeans(car, centers=2) ###########
plot(car,col=hcarkm2$cluster)
hcarkm3 <-kmeans(car, centers=3) ###############
plot(car,col=hcarkm3$cluster)
hcarkm5 <-kmeans(car, centers=5) #########
###
plot(silhouette(hcarkm2$cluster,discar)) ###2 cluster kmeans
fviz_cluster(hcarkm2, car) ########
fviz_silhouette(silhouette(hcarkm2$cluster,discar))
## cluster size ave.sil.width
## 1 1 13 0.53
## 2 2 25 0.51
plot(silhouette(hcarkm3$cluster,discar)) ## 3 cluster kmeans
fviz_cluster(hcarkm3, car)
fviz_silhouette(silhouette(hcarkm3$cluster,discar))
## cluster size ave.sil.width
## 1 1 8 0.72
## 2 2 13 0.25
## 3 3 17 0.51
fviz_silhouette((ekm)) ####### another way
## cluster size ave.sil.width
## 1 1 17 0.51
## 2 2 8 0.72
## 3 3 13 0.25
plot(silhouette(hcarkm5$cluster,discar)) ## 5 cluster kmeans
fviz_cluster(hcarkm5, car)
fviz_silhouette(silhouette(hcarkm5$cluster,discar))
## cluster size ave.sil.width
## 1 1 7 0.26
## 2 2 5 0.40
## 3 3 7 0.30
## 4 4 8 0.65
## 5 5 11 0.52
###############
##################### H methodddddddddddddddddddddddddddd
ehc <- eclust(car, "hclust") # compute hclust
fviz_dend(ehc, rect = F) # dendrogam
fviz_silhouette(ehc) # silhouette plot
## cluster size ave.sil.width
## 1 1 8 0.66
## 2 2 11 0.45
## 3 3 6 0.37
## 4 4 6 0.35
## 5 5 7 0.28
fviz_cluster(ehc)+theme_cleveland()+geom_smooth() # scatter plot
## `geom_smooth()` using method = 'loess'
### cluster modelling
ggscatter(car,x = "Weight",y = "Horsepower")+geom_density2d()
cars<- scale(car) # Standardize the data
mcar<- Mclust(car)
summary(mcar)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
## log.likelihood n df BIC ICL
## 23.52866 38 73 -218.4865 -218.5149
##
## Clustering table:
## 1 2 3
## 8 17 13
mcar$modelName ##### Optimal selected model ==> "VEV"
## [1] "VEV"
mcar$G ##### Optimal number of cluster => 3
## [1] 3
head(mcar$z, 30) # Probality to belong to a given cluster
## [,1] [,2] [,3]
## [1,] 1.0000000 0.000000e+00 6.744316e-107
## [2,] 1.0000000 0.000000e+00 1.034739e-23
## [3,] 0.9999979 0.000000e+00 2.133299e-06
## [4,] 1.0000000 0.000000e+00 2.941828e-75
## [5,] 0.0000000 9.999946e-01 5.430960e-06
## [6,] 0.0000000 9.999716e-01 2.843821e-05
## [7,] 0.0000000 9.861104e-01 1.388963e-02
## [8,] 0.0000000 9.999963e-01 3.685960e-06
## [9,] 0.0000000 3.729863e-183 1.000000e+00
## [10,] 0.0000000 0.000000e+00 1.000000e+00
## [11,] 0.0000000 2.818544e-14 1.000000e+00
## [12,] 0.0000000 0.000000e+00 1.000000e+00
## [13,] 0.0000000 0.000000e+00 1.000000e+00
## [14,] 0.0000000 0.000000e+00 1.000000e+00
## [15,] 0.0000000 0.000000e+00 1.000000e+00
## [16,] 0.0000000 0.000000e+00 1.000000e+00
## [17,] 1.0000000 0.000000e+00 2.467999e-10
## [18,] 0.9999973 0.000000e+00 2.686976e-06
## [19,] 1.0000000 0.000000e+00 1.257471e-26
## [20,] 1.0000000 0.000000e+00 2.364268e-25
## [21,] 0.0000000 9.999909e-01 9.086168e-06
## [22,] 0.0000000 0.000000e+00 1.000000e+00
## [23,] 0.0000000 1.000000e+00 3.799760e-14
## [24,] 0.0000000 9.998226e-01 1.774055e-04
## [25,] 0.0000000 1.000000e+00 6.401645e-09
## [26,] 0.0000000 9.999999e-01 8.646310e-08
## [27,] 0.0000000 1.000000e+00 2.559689e-21
## [28,] 0.0000000 1.000000e+00 1.020366e-13
## [29,] 0.0000000 0.000000e+00 1.000000e+00
## [30,] 0.0000000 0.000000e+00 1.000000e+00
head(mcar$classification, 30) # Cluster assignement of each observation
## [1] 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 1 1 1 1 2 3 2 2 2 2 2 2 3 3
# BIC values used for choosing the number of clusters
fviz_mclust(mcar, "BIC", palette = "jco")
# Classification: plot showing the clustering
fviz_mclust(mcar, "classification", geom = "point",
pointsize = 1.5, palette = "jco")
# Classification uncertainty
fviz_mclust(mcar, "uncertainty", palette = "jco")
####
###
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## MPG Weight Drive_Ratio
## Min. :-1.41440 Min. :-1.3410 Min. :-1.60999
## 1st Qu.:-0.95238 1st Qu.:-0.9272 1st Qu.:-0.76966
## Median :-0.07797 Median :-0.2517 Median :-0.02593
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.85752 3rd Qu.: 0.7740 3rd Qu.: 1.02689
## Max. : 1.91521 Max. : 2.1179 Max. : 1.55813
## Horsepower Displacement Cylinders
## Min. :-1.38918 Min. :-1.0384 Min. :-0.8701
## 1st Qu.:-0.87869 1st Qu.:-0.8134 1st Qu.:-0.8701
## Median :-0.06568 Median :-0.3239 Median :-0.5582
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.83242 3rd Qu.: 0.5874 3rd Qu.: 0.3776
## Max. : 2.01412 Max. : 2.0558 Max. : 1.6252
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.