## Read in the data (CSV text file with a header row)
mydata <- read.csv("C:/Users/63966/Downloads/Data.txt", header = TRUE)
str(mydata)
## 'data.frame': 22 obs. of 9 variables:
## $ Company : chr "Arizona " "Boston " "Central " "Commonwealth" ...
## $ Fixed_charge: num 1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
## $ RoR : num 9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
## $ Cost : int 151 202 113 168 192 111 175 245 168 197 ...
## $ Load : num 54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
## $ D.Demand : num 1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
## $ Sales : int 9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
## $ Nuclear : num 0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
## $ Fuel_Cost : num 0.628 1.555 1.058 0.7 2.044 ...
head(mydata)
## Company Fixed_charge RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
## 1 Arizona 1.06 9.2 151 54.4 1.6 9077 0.0 0.628
## 2 Boston 0.89 10.3 202 57.9 2.2 5088 25.3 1.555
## 3 Central 1.43 15.4 113 53.0 3.4 9212 0.0 1.058
## 4 Commonwealth 1.02 11.2 168 56.0 0.3 6423 34.3 0.700
## 5 Con Ed NY 1.49 8.8 192 51.2 1.0 3300 15.6 2.044
## 6 Florida 1.32 13.5 111 60.0 -2.2 11127 22.5 1.241
pairs(mydata[2:9])
## Scatter plot of Fuel_Cost against Sales, with company labels
plot(Fuel_Cost ~ Sales, data = mydata)
with(mydata, text(Sales, Fuel_Cost, labels = Company, pos = 4))
## Drop the Company column and standardise the numeric variables
z <- mydata[, -1]
means <- apply(z, 2, mean)
sds <- apply(z, 2, sd)
nor <- scale(z, center = means, scale = sds)
## Euclidean distance matrix on the standardised data; hclust defaults to complete linkage
distance <- dist(nor)
mydata.hclust <- hclust(distance)
plot(mydata.hclust)
plot(mydata.hclust,labels=mydata$Company,main='Default from hclust')
plot(mydata.hclust,hang=-1, labels=mydata$Company,main='Default from hclust')
## Re-cluster with average linkage and cut the tree into 3 groups
mydata.hclust <- hclust(distance, method = "average")
plot(mydata.hclust, hang = -1)
member <- cutree(mydata.hclust, 3)
table(member)
## member
## 1 2 3
## 18 1 3
member
## [1] 1 1 1 1 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1 1 1 1
aggregate(nor,list(member),mean)
## Group.1 Fixed_charge RoR Cost Load D.Demand Sales
## 1 1 -0.01313873 0.1868016 -0.2552757 0.1520422 -0.1253617 -0.2215631
## 2 2 2.03732429 -0.8628882 0.5782326 -1.2950193 -0.7186431 -1.5814284
## 3 3 -0.60027572 -0.8331800 1.3389101 -0.4805802 0.9917178 1.8565214
## Nuclear Fuel_Cost
## 1 0.1071944 0.06692555
## 2 0.2143888 1.69263800
## 3 -0.7146294 -0.96576599
aggregate(mydata[, -1], list(member), mean)
## Group.1 Fixed_charge RoR Cost Load D.Demand Sales Nuclear
## 1 1 1.111667 11.155556 157.6667 57.65556 2.850000 8127.50 13.8
## 2 2 1.490000 8.800000 192.0000 51.20000 1.000000 3300.00 15.6
## 3 3 1.003333 8.866667 223.3333 54.83333 6.333333 15504.67 0.0
## Fuel_Cost
## 1 1.1399444
## 2 2.0440000
## 3 0.5656667
library(cluster)
plot(silhouette(cutree(mydata.hclust,3), distance))
A scree plot lets us see how the variability within clusters changes: as we increase the number of clusters, the within-group sum of squares decreases.
## Within-group sum of squares for k = 1 to 20 clusters (elbow method)
wss <- (nrow(nor) - 1) * sum(apply(nor, 2, var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers = i)$withinss)
plot(1:20, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
set.seed(123)
kc <- kmeans(nor, 3)
kc
## K-means clustering with 3 clusters of sizes 7, 5, 10
##
## Cluster means:
## Fixed_charge RoR Cost Load D.Demand Sales
## 1 -0.23896065 -0.65917479 0.2556961 0.7992527 -0.05435116 -0.8604593
## 2 0.51980100 1.02655333 -1.2959473 -0.5104679 -0.83409247 0.5120458
## 3 -0.09262805 -0.05185431 0.4689864 -0.3042429 0.45509205 0.3462986
## Nuclear Fuel_Cost
## 1 -0.2884040 1.2497562
## 2 -0.4466434 -0.3174391
## 3 0.4252045 -0.7161098
##
## Clustering vector:
## [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3
##
## Within cluster sum of squares by cluster:
## [1] 34.16481 15.15613 57.53424
## (between_SS / total_SS = 36.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## PAM (k-medoids) on the Euclidean distance matrix, followed by a cluster plot
ot <- nor
datadistshortset <- dist(ot, method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete")
pamvshortset <- pam(datadistshortset, 4, diss = TRUE)  # diss = TRUE because a dist object is supplied
clusplot(pamvshortset, shade = FALSE, labels = 2, col.clus = "blue", col.p = "red",
         span = FALSE, main = "Cluster Mapping", cex = 1.2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
k2 <- kmeans(nor, centers = 3, nstart = 25)
str(k2)
## List of 9
## $ cluster : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
## $ centers : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
## $ totss : num 168
## $ withinss : num [1:3] 9.53 34.16 58.01
## $ tot.withinss: num 102
## $ betweenss : num 66.3
## $ size : int [1:3] 3 7 12
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = nor)
We can determine the optimal number of clusters in R with the following code. The results suggest that 4 is the optimal number of clusters, as that is where the bend (the "elbow") appears in the curve.
This is the same elbow analysis we carried out above with base R code.
fviz_nbclust(nor, kmeans, method = "wss")
The average silhouette approach measures the quality of a clustering. It determines how well each observation lies within its cluster.
A high average silhouette width indicates a good clustering. The average silhouette method computes the average silhouette of observations for different values of k.
fviz_nbclust(nor, kmeans, method = "silhouette")
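As a cross-check with base functions (a sketch reusing the nor matrix and the distance object computed earlier), the average silhouette width can also be computed directly with cluster::silhouette() over a range of k:
## Average silhouette width for k = 2 to 10, computed manually
set.seed(123)
avg_sil <- sapply(2:10, function(k) {
  km <- kmeans(nor, centers = k, nstart = 25)
  mean(silhouette(km$cluster, distance)[, "sil_width"])
})
plot(2:10, avg_sil, type = "b", xlab = "Number of clusters k", ylab = "Average silhouette width")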
This approach can be used with any type of clustering method (e.g. K-means clustering, hierarchical clustering).
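For example, the same silhouette criterion can be applied to hierarchical clustering by passing factoextra's hcut function (which wraps hclust plus cutree) in place of kmeans; a minimal sketch:
## Silhouette-based choice of k for hierarchical clustering
fviz_nbclust(nor, hcut, method = "silhouette")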
The gap statistic compares the total intra-cluster variation for different values of k with their expected values under a null reference distribution of the data.
gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
K-means clustering is a simple and fast algorithm that can efficiently handle very large data sets.
However, K-means requires the number of clusters to be supplied as an input, whereas hierarchical clustering is an alternative approach that does not require us to commit to a particular number of clusters in advance.
Hierarchical clustering also has the added advantage of producing an attractive tree-based representation of the observations (the dendrogram).
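As a minimal sketch of that difference, reusing the objects fitted above: the hierarchical tree is built once and can be cut at any number of clusters afterwards, while k-means has to be re-run for every candidate k.
## The average-linkage tree (mydata.hclust) is cut at k = 2, 3, 4 without refitting
lapply(2:4, function(k) table(cutree(mydata.hclust, k)))
## k-means is refitted for each candidate k
set.seed(123)
lapply(2:4, function(k) kmeans(nor, centers = k, nstart = 25)$size)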