Cluster Analysis in R

Getting Data

mydata <- read.csv("E:/JORIELYN/MULTIVARIATE ANALYSIS/Data.txt", header=T)
str(mydata)

'data.frame':   22 obs. of  9 variables:
 $ Company     : chr  "Arizona " "Boston " "Central " "Commonwealth" ...
 $ Fixed_charge: num  1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
 $ RoR         : num  9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
 $ Cost        : int  151 202 113 168 192 111 175 245 168 197 ...
 $ Load        : num  54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
 $ D.Demand    : num  1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
 $ Sales       : int  9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
 $ Nuclear     : num  0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
 $ Fuel_Cost   : num  0.628 1.555 1.058 0.7 2.044 ...

head(mydata)

       Company Fixed_charge  RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
1     Arizona          1.06  9.2  151 54.4      1.6  9077     0.0     0.628
2      Boston          0.89 10.3  202 57.9      2.2  5088    25.3     1.555
3     Central          1.43 15.4  113 53.0      3.4  9212     0.0     1.058
4 Commonwealth         1.02 11.2  168 56.0      0.3  6423    34.3     0.700
5    Con Ed NY         1.49  8.8  192 51.2      1.0  3300    15.6     2.044
6     Florida          1.32 13.5  111 60.0     -2.2 11127    22.5     1.241

pairs(mydata[2:9])

Scatter plot

plot(mydata$Fuel_Cost~ mydata$Sales, data = mydata)
with(mydata,text(mydata$Fuel_Cost ~ mydata$Sales, labels=mydata$Company,pos=4))

Normalize

z <- mydata[,-c(1,1)]
means <- apply(z,2,mean)
sds <- apply(z,2,sd)
nor <- scale(z,center=means,scale=sds)

Calculate distance matrix

distance = dist(nor)

Hierarchical agglomerative clustering

mydata.hclust = hclust(distance)
plot(mydata.hclust)

plot(mydata.hclust,labels=mydata$Company,main='Default from hclust')

plot(mydata.hclust,hang=-1, labels=mydata$Company,main='Default from hclust')

Hierarchical agglomerative clustering using “average” linkage

mydata.hclust<-hclust(distance,method="average") 
plot(mydata.hclust,hang=-1)

Cluster membership

member = cutree(mydata.hclust,3)
table(member)

member
 1  2  3 
18  1  3

Characterizing clusters

aggregate(nor,list(member),mean)

  Group.1 Fixed_charge        RoR       Cost       Load   D.Demand      Sales
1       1  -0.01313873  0.1868016 -0.2552757  0.1520422 -0.1253617 -0.2215631
2       2   2.03732429 -0.8628882  0.5782326 -1.2950193 -0.7186431 -1.5814284
3       3  -0.60027572 -0.8331800  1.3389101 -0.4805802  0.9917178  1.8565214
     Nuclear   Fuel_Cost
1  0.1071944  0.06692555
2  0.2143888  1.69263800
3 -0.7146294 -0.96576599

aggregate(mydata[,-c(1,1)],list(member),mean)

  Group.1 Fixed_charge       RoR     Cost     Load D.Demand    Sales Nuclear
1       1     1.111667 11.155556 157.6667 57.65556 2.850000  8127.50    13.8
2       2     1.490000  8.800000 192.0000 51.20000 1.000000  3300.00    15.6
3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.67     0.0
  Fuel_Cost
1 1.1399444
2 2.0440000
3 0.5656667

Silhouette Plot

library(cluster)
plot(silhouette(cutree(mydata.hclust,3), distance))

Scree Plot

wss <- (nrow(nor)-1)*sum(apply(nor,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

K-means clustering

set.seed(123)
kc<-kmeans(nor,3)
kc

K-means clustering with 3 clusters of sizes 7, 5, 10

Cluster means:
  Fixed_charge         RoR       Cost       Load    D.Demand      Sales
1  -0.23896065 -0.65917479  0.2556961  0.7992527 -0.05435116 -0.8604593
2   0.51980100  1.02655333 -1.2959473 -0.5104679 -0.83409247  0.5120458
3  -0.09262805 -0.05185431  0.4689864 -0.3042429  0.45509205  0.3462986
     Nuclear  Fuel_Cost
1 -0.2884040  1.2497562
2 -0.4466434 -0.3174391
3  0.4252045 -0.7161098

Clustering vector:
 [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3

Within cluster sum of squares by cluster:
[1] 34.16481 15.15613 57.53424
 (between_SS / total_SS =  36.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"

ot<-nor
datadistshortset<-dist(ot,method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete" )
pamvshortset <- pam(datadistshortset,4, diss = FALSE)
clusplot(pamvshortset, shade = FALSE,labels=2,col.clus="blue",col.p="red",span=FALSE,main="Cluster Mapping",cex=1.2)

Cluster Analysis in R

library(factoextra) 
k2 <- kmeans(nor, centers = 3, nstart = 25)
str(k2)

List of 9
 $ cluster     : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
 $ centers     : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "1" "2" "3"
  .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
 $ totss       : num 168
 $ withinss    : num [1:3] 9.53 34.16 58.01
 $ tot.withinss: num 102
 $ betweenss   : num 66.3
 $ size        : int [1:3] 3 7 12
 $ iter        : int 2
 $ ifault      : int 0
 - attr(*, "class")= chr "kmeans"

fviz_cluster(k2, data = nor)

Optimal Clusters

fviz_nbclust(nor, kmeans, method = "wss")

Average Silhouette Method

fviz_nbclust(nor, kmeans, method = "silhouette")

Gap Statistic Method

gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

Cluster Analysis in R

JORIELYN S. MANLA

2024-05-18

Getting Data

Scatter plot

Normalize

Calculate distance matrix

Hierarchical agglomerative clustering

Hierarchical agglomerative clustering using “average” linkage

Cluster membership

Characterizing clusters

Silhouette Plot

Scree Plot

K-means clustering

Cluster Analysis in R

Optimal Clusters

Average Silhouette Method

Gap Statistic Method