Cluster Analysis in R

Getting Data

library(readr)
# Import the comma-delimited data file as a tibble named `Data_1_`.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the analysis elsewhere.
Data_1_ <- read_csv(file = "C:/Users/LENOVO/Downloads/Data (1).txt")
## Rows: 22 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Company
## dbl (8): Fixed_charge, RoR, Cost, Load, D Demand, Sales, Nuclear, Fuel_Cost
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the imported tibble.
print(Data_1_)
## # A tibble: 22 x 9
##    Company     Fixed_charge   RoR  Cost  Load `D Demand` Sales Nuclear Fuel_Cost
##    <chr>              <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>   <dbl>     <dbl>
##  1 Arizona             1.06   9.2   151  54.4        1.6  9077     0       0.628
##  2 Boston              0.89  10.3   202  57.9        2.2  5088    25.3     1.56 
##  3 Central             1.43  15.4   113  53          3.4  9212     0       1.06 
##  4 Commonweal~         1.02  11.2   168  56          0.3  6423    34.3     0.7  
##  5 Con Ed NY           1.49   8.8   192  51.2        1    3300    15.6     2.04 
##  6 Florida             1.32  13.5   111  60         -2.2 11127    22.5     1.24 
##  7 Hawaiian            1.22  12.2   175  67.6        2.2  7642     0       1.65 
##  8 Idaho               1.1    9.2   245  57          3.3 13082     0       0.309
##  9 Kentucky            1.34  13     168  60.4        7.2  8406     0       0.862
## 10 Madison             1.12  12.4   197  53          2.7  6455    39.2     0.623
## # ... with 12 more rows

#Scatter plot

library(ggplot2)
# Scatter plot of fuel cost (y) against sales (x), one point per company.
# Bare column names are resolved through `data`, so the axes are labelled
# "Sales" and "Fuel_Cost" instead of "Data_1_$Sales" / "Data_1_$Fuel_Cost".
plot(Fuel_Cost ~ Sales, data = Data_1_)
# Annotate every point with its company name, drawn to the right (pos = 4).
with(Data_1_, text(Sales, Fuel_Cost, labels = Company, pos = 4))

#Normalize

# Drop the first column (Company) so only the numeric variables remain.
z <- Data_1_[, -1]
# Per-column mean and standard deviation, used to standardize each variable.
means <- vapply(z, mean, numeric(1))
sds <- vapply(z, sd, numeric(1))
# Centre each column on its mean and divide by its standard deviation.
nor <- scale(z, center = means, scale = sds)

#Calculate distance matrix

# Pairwise distances between the standardized rows (dist() defaults to the
# Euclidean metric).
distance <- dist(nor)

#Hierarchical agglomerative clustering

# Build the tree with hclust()'s default linkage ("complete").
mydata.hclust <- hclust(distance)

# Dendrogram with the default leaf labels.
plot(mydata.hclust)

# Same tree, with the leaves labelled by company name.
plot(mydata.hclust, labels = Data_1_$Company, main = "Default from hclust")

# Negative `hang` makes all labels hang down from height 0.
plot(
  mydata.hclust,
  hang = -1,
  labels = Data_1_$Company,
  main = "Default from hclust"
)

#Hierarchical agglomerative clustering using “average” linkage

# Re-fit the tree with average linkage; this overwrites `mydata.hclust`, so
# later steps that use it operate on the average-linkage tree.
mydata.hclust <- hclust(distance, method = "average")
# Label the leaves with company names instead of bare row indices, and title
# the plot after the linkage method that produced it.
plot(
  mydata.hclust,
  hang = -1,
  labels = Data_1_$Company,
  main = "Average linkage"
)

#Cluster membership

# Cut the tree into three groups; `member` holds one cluster id per row.
member <- cutree(mydata.hclust, k = 3)
# Number of observations assigned to each cluster.
table(member)
## member
##  1  2  3 
## 18  1  3
# Cluster id of every observation, in row order.
print(member)
##  [1] 1 1 1 1 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1 1 1 1

#Characterizing clusters

# Cluster profiles on the standardized scale: mean of each variable per cluster.
aggregate(nor, by = list(member), FUN = mean)
##   Group.1 Fixed_charge        RoR       Cost       Load   D Demand      Sales
## 1       1  -0.01313873  0.1868016 -0.2552757  0.1520422 -0.1253617 -0.2215631
## 2       2   2.03732429 -0.8628882  0.5782326 -1.2950193 -0.7186431 -1.5814284
## 3       3  -0.60027572 -0.8331800  1.3389101 -0.4805802  0.9917178  1.8565214
##      Nuclear   Fuel_Cost
## 1  0.1071944  0.06692555
## 2  0.2143888  1.69263800
## 3 -0.7146294 -0.96576599
# Cluster profiles in the original units (Company column dropped first).
aggregate(Data_1_[, -1], by = list(member), FUN = mean)
##   Group.1 Fixed_charge       RoR     Cost     Load D Demand    Sales Nuclear
## 1       1     1.111667 11.155556 157.6667 57.65556 2.850000  8127.50    13.8
## 2       2     1.490000  8.800000 192.0000 51.20000 1.000000  3300.00    15.6
## 3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.67     0.0
##   Fuel_Cost
## 1 1.1399444
## 2 2.0440000
## 3 0.5656667

#Silhouette Plot

library(cluster)
# Silhouette widths for the three-cluster cut of the tree, computed from the
# same distance matrix the tree was built on.
plot(
  silhouette(cutree(mydata.hclust, 3), distance)
)

#Scree Plot

# Scree (elbow) plot: total within-cluster sum of squares for k = 1..max_k.
# Seed the RNG so the randomly initialised k-means runs are reproducible.
set.seed(123)
# k-means needs fewer centres than observations, so cap the range of k.
max_k <- min(20L, nrow(nor) - 1L)
# Preallocate instead of growing the vector inside the loop.
wss <- numeric(max_k)
# k = 1: the within-group sum of squares is the total sum of squares.
wss[1] <- (nrow(nor) - 1) * sum(apply(nor, 2, var))
for (k in seq_len(max_k)[-1]) {
  # Several random starts per k avoid poor local optima that would make the
  # curve jagged; tot.withinss is the sum of the per-cluster withinss values.
  wss[k] <- kmeans(nor, centers = k, nstart = 25)$tot.withinss
}
plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

#K-means clustering

# Fix the RNG state so the random initial centres are reproducible.
set.seed(123)
# Three-cluster k-means on the standardized data.
kc <- kmeans(nor, centers = 3)
# Show sizes, centres, assignments and sums of squares.
print(kc)
## K-means clustering with 3 clusters of sizes 7, 5, 10
## 
## Cluster means:
##   Fixed_charge         RoR       Cost       Load    D Demand      Sales
## 1  -0.23896065 -0.65917479  0.2556961  0.7992527 -0.05435116 -0.8604593
## 2   0.51980100  1.02655333 -1.2959473 -0.5104679 -0.83409247  0.5120458
## 3  -0.09262805 -0.05185431  0.4689864 -0.3042429  0.45509205  0.3462986
##      Nuclear  Fuel_Cost
## 1 -0.2884040  1.2497562
## 2 -0.4466434 -0.3174391
## 3  0.4252045 -0.7161098
## 
## Clustering vector:
##  [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 34.16481 15.15613 57.53424
##  (between_SS / total_SS =  36.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Partitioning around medoids (PAM) on the standardized data, with a 2-D map
# of the resulting clusters.
ot <- nor
datadistshortset <- dist(ot, method = "euclidean")
# Complete-linkage tree on the same distances (not used further in this block).
hc1 <- hclust(datadistshortset, method = "complete")
# Fit PAM on the data matrix itself. Passing the `dist` object together with
# diss = FALSE made pam() treat the distance matrix as if its rows were the
# observations. pam()'s default metric is Euclidean, which matches
# `datadistshortset`, and fitting on the data keeps it available to clusplot().
pamvshortset <- pam(ot, k = 4, diss = FALSE)
clusplot(
  pamvshortset,
  shade = FALSE,
  labels = 2,
  col.clus = "blue",
  col.p = "red",
  span = FALSE,
  main = "Cluster Mapping",
  cex = 1.2
)

#K-means clustering with factoextra

library(factoextra) 
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Three-cluster k-means, keeping the best of 25 random initialisations.
k2 <- kmeans(x = nor, centers = 3, nstart = 25)
# Show sizes, centres, assignments and sums of squares.
print(k2)
## K-means clustering with 3 clusters of sizes 3, 7, 12
## 
## Cluster means:
##   Fixed_charge        RoR       Cost       Load    D Demand       Sales
## 1   -0.6002757 -0.8331800  1.3389101 -0.4805802  0.99171778  1.85652137
## 2   -0.2389606 -0.6591748  0.2556961  0.7992527 -0.05435116 -0.86045933
## 3    0.2894626  0.5928136 -0.4838836 -0.3460857 -0.21622460  0.03780427
##      Nuclear Fuel_Cost
## 1 -0.7146294 -0.965766
## 2 -0.2884040  1.249756
## 3  0.3468930 -0.487583
## 
## Clustering vector:
##  [1] 3 2 3 3 2 3 2 1 3 3 1 2 3 3 2 1 2 3 3 3 2 3
## 
## Within cluster sum of squares by cluster:
## [1]  9.533522 34.164812 58.012322
##  (between_SS / total_SS =  39.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Inspect the components stored in the fitted k-means object.
str(object = k2)
## List of 9
##  $ cluster     : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
##  $ centers     : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
##  $ totss       : num 168
##  $ withinss    : num [1:3] 9.53 34.16 58.01
##  $ tot.withinss: num 102
##  $ betweenss   : num 66.3
##  $ size        : int [1:3] 3 7 12
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Plot the k-means partition of the standardized data.
fviz_cluster(object = k2, data = nor)

#Optimal Clusters

# Elbow method: within-cluster sum of squares for a range of cluster counts.
fviz_nbclust(x = nor, FUNcluster = kmeans, method = "wss")

#Average Silhouette Method

# Average silhouette width for a range of cluster counts.
fviz_nbclust(x = nor, FUNcluster = kmeans, method = "silhouette")

#Gap Statistic Method

# Gap statistic for k = 1..10 using 50 bootstrap reference samples; nstart is
# forwarded to kmeans() so each fit uses 25 random starts.
gap_stat <- clusGap(
  nor,
  FUNcluster = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)
# Plot the gap statistic against the number of clusters.
fviz_gap_stat(gap_stat)

#Conclusion

K-means clustering is a very simple and fast algorithm and it can efficiently deal with very large data sets.

K-means clustering requires the number of clusters to be specified in advance. Hierarchical clustering is an alternative approach that does not require committing to a particular number of clusters.

Hierarchical clustering has an added advantage over K-means clustering because it has an attractive tree-based representation of the observations (dendrogram).