#Loading the Data

 Data <- read.csv("C:/Users/Swift/Downloads/Data.txt",header = T)
str(Data)
## 'data.frame':    22 obs. of  9 variables:
##  $ Company     : chr  "Arizona " "Boston " "Central " "Commonwealth" ...
##  $ Fixed_charge: num  1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
##  $ RoR         : num  9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
##  $ Cost        : int  151 202 113 168 192 111 175 245 168 197 ...
##  $ Load        : num  54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
##  $ D.Demand    : num  1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
##  $ Sales       : int  9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
##  $ Nuclear     : num  0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
##  $ Fuel_Cost   : num  0.628 1.555 1.058 0.7 2.044 ...
head(Data)
##        Company Fixed_charge  RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
## 1     Arizona          1.06  9.2  151 54.4      1.6  9077     0.0     0.628
## 2      Boston          0.89 10.3  202 57.9      2.2  5088    25.3     1.555
## 3     Central          1.43 15.4  113 53.0      3.4  9212     0.0     1.058
## 4 Commonwealth         1.02 11.2  168 56.0      0.3  6423    34.3     0.700
## 5    Con Ed NY         1.49  8.8  192 51.2      1.0  3300    15.6     2.044
## 6     Florida          1.32 13.5  111 60.0     -2.2 11127    22.5     1.241
pairs(Data[2:9])

#Scatterplot

plot(Data$Fuel_Cost~ Data$Sales, data = Data)
with(Data,text(Data$Fuel_Cost ~ Data$Sales, labels=Data$Company,pos=4))

#Normalize

z <- Data[,-c(1,1)]
means <- apply(z,2,mean)
sds <- apply(z,2,sd)
nor <- scale(z,center=means,scale=sds)

means
## Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
##     1.114091    10.736364   168.181818    56.977273     3.240909  8914.045455 
##      Nuclear    Fuel_Cost 
##    12.000000     1.102727
sds
## Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
##    0.1845112    2.2440494   41.1913495    4.4611478    3.1182503 3549.9840305 
##      Nuclear    Fuel_Cost 
##   16.7919198    0.5560981
nor
##       Fixed_charge         RoR         Cost        Load    D.Demand       Sales
##  [1,]  -0.29315791 -0.68463896 -0.417122002 -0.57771516 -0.52622751  0.04590290
##  [2,]  -1.21451134 -0.19445367  0.821002037  0.20683629 -0.33381191 -1.07776413
##  [3,]   1.71214073  2.07822360 -1.339645796 -0.89153574  0.05101929  0.08393124
##  [4,]  -0.50994695  0.20660702 -0.004413989 -0.21906307 -0.94312798 -0.70170610
##  [5,]   2.03732429 -0.86288816  0.578232617 -1.29501935 -0.71864311 -1.58142837
##  [6,]   1.11597086  1.23153991 -1.388199680  0.67756716 -1.74485965  0.62337028
##  [7,]   0.57399826  0.65223002  0.165524604  2.38116460 -0.33381191 -0.35832428
##  [8,]  -0.07636887 -0.68463896  1.864910540  0.00509449  0.01895002  1.17407698
##  [9,]   1.22436538  1.00872841 -0.004413989  0.76723019  1.26965142 -0.14311204
## [10,]   0.03202565  0.74135462  0.699617327 -0.89153574 -0.17346558 -0.69269198
## [11,]  -1.97327298 -1.44219805  0.116970720 -1.22777208  1.04516655  2.40196983
## [12,]   0.08622291  0.07292013  0.238355430  1.12588228  0.14722709 -0.77748109
## [13,]   0.19461744  0.87504152  0.748171211 -0.73462545  1.01309729 -0.48874740
## [14,]  -0.13056613  0.56310542 -1.752353809 -1.60883993 -0.59036605  0.21379097
## [15,]  -0.83513051 -1.39763576 -0.101521757  1.17071379 -1.07140505 -0.68902999
## [16,]   0.24881470 -0.37270287  2.034849134 -0.21906307  1.91103676  1.99351729
## [17,]  -1.91907572 -1.93238335 -0.781276132  1.10346652  1.84689822 -0.90142531
## [18,]  -0.34735517  0.83047922 -0.441398944 -0.06215278 -0.17346558  0.34534086
## [19,]   0.24881470  0.42941852 -1.558138274 -0.66737818 -1.71279038  1.29379583
## [20,]   0.46560374  0.47398082 -0.489952828  0.65515141  0.08308855 -0.45832473
## [21,]  -0.40155243 -0.95201276  0.869555920  0.90172472  0.08308855 -0.63776215
## [22,]  -0.23896065 -0.64007666  0.141247662 -0.60013092  0.85275095  0.33210137
##          Nuclear   Fuel_Cost
##  [1,] -0.7146294 -0.85367545
##  [2,]  0.7920476  0.81329670
##  [3,] -0.7146294 -0.08043055
##  [4,]  1.3280197 -0.72420189
##  [5,]  0.2143888  1.69263800
##  [6,]  0.6253007  0.24864810
##  [7,] -0.7146294  0.98772637
##  [8,] -0.7146294 -1.42731528
##  [9,] -0.7146294 -0.43288637
## [10,]  1.6198267 -0.86266667
## [11,] -0.7146294 -0.60192130
## [12,] -0.7146294  1.42829614
## [13,]  2.2749037 -1.03529809
## [14,] -0.7146294 -0.92560521
## [15,] -0.6610322  0.53456889
## [16,] -0.7146294 -0.86806140
## [17,] -0.2203441  1.46965575
## [18,] -0.7146294  0.00948165
## [19,] -0.7146294 -0.83928950
## [20,]  1.7329764 -0.72060540
## [21,] -0.7146294  1.82211157
## [22,]  0.8694658  0.36553395
## attr(,"scaled:center")
## Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
##     1.114091    10.736364   168.181818    56.977273     3.240909  8914.045455 
##      Nuclear    Fuel_Cost 
##    12.000000     1.102727 
## attr(,"scaled:scale")
## Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
##    0.1845112    2.2440494   41.1913495    4.4611478    3.1182503 3549.9840305 
##      Nuclear    Fuel_Cost 
##   16.7919198    0.5560981

#Calculate distance matrix

distance = dist(nor)
distance
##           1        2        3        4        5        6        7        8
## 2  3.096154                                                               
## 3  3.679230 4.916465                                                      
## 4  2.462149 2.164213 4.107079                                             
## 5  4.123129 3.852850 4.468735 4.127368                                    
## 6  3.606269 4.218804 2.992760 3.201836 4.600183                           
## 7  3.901898 3.448346 4.217769 3.969367 4.596261 3.352919                  
## 8  2.737407 3.892524 4.990876 3.692949 5.155516 4.913953 4.364509         
## 9  3.253851 3.957125 2.752623 3.753627 4.489900 3.730814 2.796298 3.594824
## 10 3.099116 2.705330 3.934935 1.491427 4.045276 3.829058 4.506512 3.673884
## 11 3.491163 4.792640 5.902882 4.864730 6.460986 6.004557 5.995814 3.462587
## 12 3.223138 2.432568 4.031434 3.498769 3.603863 3.738824 1.660047 4.059770
## 13 3.959637 3.434878 4.385973 2.577003 4.758059 4.554909 5.010221 4.140607
## 14 2.113490 4.323825 2.742000 3.230069 4.818803 3.469268 4.914949 4.335241
## 15 2.593481 2.501195 5.156977 3.190250 4.255251 4.065764 2.930142 3.849872
## 16 4.033051 4.837051 5.264442 4.967244 5.816715 5.842268 5.042444 2.201457
## 17 4.396680 3.623588 6.356548 4.893679 5.628591 6.099456 4.577294 5.426511
## 18 1.877248 2.904409 2.723954 2.651532 4.338150 2.853942 2.949006 3.237409
## 19 2.410434 4.634878 3.179392 3.464171 5.133791 2.581208 4.515428 4.107966
## 20 3.174488 2.997481 3.733274 1.816465 4.385852 2.912401 3.541931 4.094283
## 21 3.453407 2.318451 5.088018 3.884260 3.644137 4.628341 2.675404 3.977130
## 22 2.509287 2.421916 4.109321 2.578463 3.771757 4.026935 4.000096 3.239374
##           9       10       11       12       13       14       15       16
## 2                                                                         
## 3                                                                         
## 4                                                                         
## 5                                                                         
## 6                                                                         
## 7                                                                         
## 8                                                                         
## 9                                                                         
## 10 3.572023                                                               
## 11 5.175240 5.081469                                                      
## 12 2.735861 3.942171 5.208504                                             
## 13 3.658647 1.407032 5.309741 4.496249                                    
## 14 3.816443 3.610272 4.315584 4.335484 4.385649                           
## 15 4.113606 4.264133 4.735659 2.328833 5.103646 4.239522                  
## 16 3.627307 4.531420 3.429962 4.617791 4.406173 5.169314 5.175157         
## 17 4.901037 5.484537 4.751387 3.497555 5.606577 5.558002 3.399659 5.559320
## 18 2.428533 3.070750 3.945595 2.451935 3.780942 2.301050 2.998784 3.973815
## 19 4.109049 4.130120 4.522319 4.414578 5.010864 1.876051 4.030721 5.232256
## 20 2.948021 2.054393 5.352136 3.430937 2.226493 3.744430 3.782111 4.823711
## 21 3.742680 4.361961 4.883977 1.384124 4.937119 4.926966 2.097150 4.568885
## 22 3.208932 2.559945 3.436927 2.995066 2.739910 3.512207 3.352644 3.457129
##          17       18       19       20       21
## 2                                              
## 3                                              
## 4                                              
## 5                                              
## 6                                              
## 7                                              
## 8                                              
## 9                                              
## 10                                             
## 11                                             
## 12                                             
## 13                                             
## 14                                             
## 15                                             
## 16                                             
## 17                                             
## 18 4.426129                                    
## 19 6.089597 2.473696                           
## 20 4.866540 2.922392 3.903723                  
## 21 3.095002 3.185250 4.972551 4.145222         
## 22 3.628061 2.548060 3.967618 2.618050 3.012264

#Hierarchical agglomerative clustering

Data.hclust = hclust(distance)
plot(Data.hclust)

plot(Data.hclust,labels=Data$Company,main='Default from hclust')

plot(Data.hclust,hang=-1, labels=Data$Company,main='Default from hclust')

#Hierarchical agglomerative clustering using “average” linkage

Data.hclust<-hclust(distance,method="average") 
plot(Data.hclust,hang=-1) 

#Cluster membership

member = cutree(Data.hclust,3)
table(member)
## member
##  1  2  3 
## 18  1  3
member
##  [1] 1 1 1 1 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1 1 1 1

#Characterizing clusters

aggregate(nor,list(member),mean)
##   Group.1 Fixed_charge        RoR       Cost       Load   D.Demand      Sales
## 1       1  -0.01313873  0.1868016 -0.2552757  0.1520422 -0.1253617 -0.2215631
## 2       2   2.03732429 -0.8628882  0.5782326 -1.2950193 -0.7186431 -1.5814284
## 3       3  -0.60027572 -0.8331800  1.3389101 -0.4805802  0.9917178  1.8565214
##      Nuclear   Fuel_Cost
## 1  0.1071944  0.06692555
## 2  0.2143888  1.69263800
## 3 -0.7146294 -0.96576599
aggregate(Data[,-c(1,1)],list(member),mean)
##   Group.1 Fixed_charge       RoR     Cost     Load D.Demand    Sales Nuclear
## 1       1     1.111667 11.155556 157.6667 57.65556 2.850000  8127.50    13.8
## 2       2     1.490000  8.800000 192.0000 51.20000 1.000000  3300.00    15.6
## 3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.67     0.0
##   Fuel_Cost
## 1 1.1399444
## 2 2.0440000
## 3 0.5656667

#Silhouette Plot

library(cluster)
plot(silhouette(cutree(Data.hclust,3), distance))

#Scree Plot

wss <- (nrow(nor)-1)*sum(apply(nor,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

#K-means clustering

set.seed(123)
kc<-kmeans(nor,3)
kc
## K-means clustering with 3 clusters of sizes 7, 5, 10
## 
## Cluster means:
##   Fixed_charge         RoR       Cost       Load    D.Demand      Sales
## 1  -0.23896065 -0.65917479  0.2556961  0.7992527 -0.05435116 -0.8604593
## 2   0.51980100  1.02655333 -1.2959473 -0.5104679 -0.83409247  0.5120458
## 3  -0.09262805 -0.05185431  0.4689864 -0.3042429  0.45509205  0.3462986
##      Nuclear  Fuel_Cost
## 1 -0.2884040  1.2497562
## 2 -0.4466434 -0.3174391
## 3  0.4252045 -0.7161098
## 
## Clustering vector:
##  [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 34.16481 15.15613 57.53424
##  (between_SS / total_SS =  36.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
ot<-nor
datadistshortset<-dist(ot,method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete" )
pamvshortset <- pam(datadistshortset,4, diss = FALSE)
clusplot(pamvshortset, shade = FALSE,labels=2,col.clus="blue",col.p="red",span=FALSE,main="Cluster Mapping",cex=1.2)

#Cluster Analysis in R

library(factoextra) 
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
k2 <- kmeans(nor, centers = 3, nstart = 25)
k2
## K-means clustering with 3 clusters of sizes 3, 7, 12
## 
## Cluster means:
##   Fixed_charge        RoR       Cost       Load    D.Demand       Sales
## 1   -0.6002757 -0.8331800  1.3389101 -0.4805802  0.99171778  1.85652137
## 2   -0.2389606 -0.6591748  0.2556961  0.7992527 -0.05435116 -0.86045933
## 3    0.2894626  0.5928136 -0.4838836 -0.3460857 -0.21622460  0.03780427
##      Nuclear Fuel_Cost
## 1 -0.7146294 -0.965766
## 2 -0.2884040  1.249756
## 3  0.3468930 -0.487583
## 
## Clustering vector:
##  [1] 3 2 3 3 2 3 2 1 3 3 1 2 3 3 2 1 2 3 3 3 2 3
## 
## Within cluster sum of squares by cluster:
## [1]  9.533522 34.164812 58.012322
##  (between_SS / total_SS =  39.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
str(k2)
## List of 9
##  $ cluster     : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
##  $ centers     : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
##  $ totss       : num 168
##  $ withinss    : num [1:3] 9.53 34.16 58.01
##  $ tot.withinss: num 102
##  $ betweenss   : num 66.3
##  $ size        : int [1:3] 3 7 12
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = nor)

#Optimal Clusters

fviz_nbclust(nor, kmeans, method = "wss")

#Average Silhouette Method

fviz_nbclust(nor, kmeans, method = "silhouette")

#Gap Statistic Method

gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

Conclusion K-means clustering is a very simple and fast algorithm and it can efficiently deal with very large data sets.

K-means clustering needs to provide a number of clusters as an input, Hierarchical clustering is an alternative approach that does not require that we commit to a particular choice of clusters.

Hierarchical clustering has an added advantage over K-means clustering because it has an attractive tree-based representation of the observations (dendrogram).