Getting Data

 Data <- read.csv("D:/School/4th Year College/Second Semester/Multivariate Data Analysis/Activity/Data.txt")
View(Data)
str(Data)
## 'data.frame':    22 obs. of  9 variables:
##  $ Company     : chr  "Arizona " "Boston " "Central " "Commonwealth" ...
##  $ Fixed_charge: num  1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
##  $ RoR         : num  9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
##  $ Cost        : int  151 202 113 168 192 111 175 245 168 197 ...
##  $ Load        : num  54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
##  $ D.Demand    : num  1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
##  $ Sales       : int  9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
##  $ Nuclear     : num  0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
##  $ Fuel_Cost   : num  0.628 1.555 1.058 0.7 2.044 ...
head(Data)
##        Company Fixed_charge  RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
## 1     Arizona          1.06  9.2  151 54.4      1.6  9077     0.0     0.628
## 2      Boston          0.89 10.3  202 57.9      2.2  5088    25.3     1.555
## 3     Central          1.43 15.4  113 53.0      3.4  9212     0.0     1.058
## 4 Commonwealth         1.02 11.2  168 56.0      0.3  6423    34.3     0.700
## 5    Con Ed NY         1.49  8.8  192 51.2      1.0  3300    15.6     2.044
## 6     Florida          1.32 13.5  111 60.0     -2.2 11127    22.5     1.241
pairs(Data[2:9])

Basically, a pairs plot will provide a scatter plot for all possible combinations.

Scatter plot

Now, we visualize the data using a scatterplot.

plot(Data$Fuel_Cost~ Data$Sales, data = Data)
with(Data,text(Data$Fuel_Cost ~ Data$Sales, labels=Data$Company,pos=4))

Normalize

Now, we normalize the data. Normalization is very important in cluster analysis, sometimes we have variables in different scales, need to normalized based on scale function before clustering the data sets.

z <- Data[,-c(1,1)]
means <- apply(z,2,mean)
sds <- apply(z,2,sd)
nor <- scale(z,center=means,scale=sds)

Calculate distance matrix

distance = dist(nor)

Hierarchical agglomerative clustering

Data.hclust = hclust(distance)
plot(Data.hclust)

plot(Data.hclust,labels=Data$Company,main='Default from hclust')

plot(Data.hclust,hang=-1, labels=Data$Company,main='Default from hclust')

Hierarchical agglomerative clustering using “average” linkage

Data.hclust<-hclust(distance,method="average") 
plot(Data.hclust,hang=-1) 

Cluster membership

member = cutree(Data.hclust,3)
table(member)
## member
##  1  2  3 
## 18  1  3

Characterizing clusters

aggregate(nor,list(member),mean)
##   Group.1 Fixed_charge        RoR       Cost       Load   D.Demand      Sales
## 1       1  -0.01313873  0.1868016 -0.2552757  0.1520422 -0.1253617 -0.2215631
## 2       2   2.03732429 -0.8628882  0.5782326 -1.2950193 -0.7186431 -1.5814284
## 3       3  -0.60027572 -0.8331800  1.3389101 -0.4805802  0.9917178  1.8565214
##      Nuclear   Fuel_Cost
## 1  0.1071944  0.06692555
## 2  0.2143888  1.69263800
## 3 -0.7146294 -0.96576599
aggregate(Data[,-c(1,1)],list(member),mean)
##   Group.1 Fixed_charge       RoR     Cost     Load D.Demand    Sales Nuclear
## 1       1     1.111667 11.155556 157.6667 57.65556 2.850000  8127.50    13.8
## 2       2     1.490000  8.800000 192.0000 51.20000 1.000000  3300.00    15.6
## 3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.67     0.0
##   Fuel_Cost
## 1 1.1399444
## 2 2.0440000
## 3 0.5656667

Silhouette Plot

library(cluster)
plot(silhouette(cutree(Data.hclust,3), distance))

If any of the bars in the above plot are on the negative side, we can conclude that the data is an outlier and should be removed from our study.

Scree Plot

The scree plot will help us to see the variations in clusters; for example, if the number of clusters is increased, the within-group sum of squares will decrease.

wss <- (nrow(nor)-1)*sum(apply(nor,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

In this case, the best number of clusters is 3, 4, or 5.

K-means clustering

set.seed(123)
kc<-kmeans(nor,3)
kc
## K-means clustering with 3 clusters of sizes 7, 5, 10
## 
## Cluster means:
##   Fixed_charge         RoR       Cost       Load    D.Demand      Sales
## 1  -0.23896065 -0.65917479  0.2556961  0.7992527 -0.05435116 -0.8604593
## 2   0.51980100  1.02655333 -1.2959473 -0.5104679 -0.83409247  0.5120458
## 3  -0.09262805 -0.05185431  0.4689864 -0.3042429  0.45509205  0.3462986
##      Nuclear  Fuel_Cost
## 1 -0.2884040  1.2497562
## 2 -0.4466434 -0.3174391
## 3  0.4252045 -0.7161098
## 
## Clustering vector:
##  [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 34.16481 15.15613 57.53424
##  (between_SS / total_SS =  36.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
ot<-nor
datadistshortset<-dist(ot,method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete" )
pamvshortset <- pam(datadistshortset,4, diss = FALSE)
clusplot(pamvshortset, shade = FALSE,labels=2,col.clus="blue",col.p="red",span=FALSE,main="Cluster Mapping",cex=1.2)

Cluster Analysis in R

library(factoextra) 
## Warning: package 'factoextra' was built under R version 4.1.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
k2 <- kmeans(nor, centers = 3, nstart = 25)

str(k2)
## List of 9
##  $ cluster     : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
##  $ centers     : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
##  $ totss       : num 168
##  $ withinss    : num [1:3] 9.53 34.16 58.01
##  $ tot.withinss: num 102
##  $ betweenss   : num 66.3
##  $ size        : int [1:3] 3 7 12
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

We can also use fviz_cluster to see our kmeans findings. This creates a better representation of clusters.

fviz_cluster(k2, data = nor)

Optimal Clusters

The following code in R can be used to determine ideal clusters. The results imply that four clusters is the best amount, since it appears to represent the knee bend.

fviz_nbclust(nor, kmeans, method = "wss")

Average Silhouette Method

The average silhouette method assesses the clustering’s quality. It specifies the degree to which each observation fits within its cluster.

The following code can be used to do so.

fviz_nbclust(nor, kmeans, method = "silhouette")

Gap Statistic Method

The gap statistic compares the total intracluster variation for different values of k with their expected values under null reference distribution of the data.

gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

The optimal number of cluster provided in this method is 5.