Getting Data

mydata <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/CA2/Data.txt")
str(mydata)
'data.frame':   22 obs. of  9 variables:
 $ Company     : chr  "Arizona " "Boston " "Central " "Commonwealth" ...
 $ Fixed_charge: num  1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
 $ RoR         : num  9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
 $ Cost        : int  151 202 113 168 192 111 175 245 168 197 ...
 $ Load        : num  54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
 $ D.Demand    : num  1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
 $ Sales       : int  9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
 $ Nuclear     : num  0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
 $ Fuel_Cost   : num  0.628 1.555 1.058 0.7 2.044 ...
head(mydata)
       Company Fixed_charge  RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
1     Arizona          1.06  9.2  151 54.4      1.6  9077     0.0     0.628
2      Boston          0.89 10.3  202 57.9      2.2  5088    25.3     1.555
3     Central          1.43 15.4  113 53.0      3.4  9212     0.0     1.058
4 Commonwealth         1.02 11.2  168 56.0      0.3  6423    34.3     0.700
5    Con Ed NY         1.49  8.8  192 51.2      1.0  3300    15.6     2.044
6     Florida          1.32 13.5  111 60.0     -2.2 11127    22.5     1.241
pairs(mydata[2:9])

Scatter plot

plot(mydata$Fuel_Cost~ mydata$Sales, data = mydata)
with(mydata,text(mydata$Fuel_Cost ~ mydata$Sales, labels=mydata$Company,pos=4))

## Normalize

z <- mydata[,-c(1,1)]
means <- apply(z,2,mean)
sds <- apply(z,2,sd)
nor <- scale(z,center=means,scale=sds)
nor
      Fixed_charge         RoR         Cost        Load    D.Demand       Sales
 [1,]  -0.29315791 -0.68463896 -0.417122002 -0.57771516 -0.52622751  0.04590290
 [2,]  -1.21451134 -0.19445367  0.821002037  0.20683629 -0.33381191 -1.07776413
 [3,]   1.71214073  2.07822360 -1.339645796 -0.89153574  0.05101929  0.08393124
 [4,]  -0.50994695  0.20660702 -0.004413989 -0.21906307 -0.94312798 -0.70170610
 [5,]   2.03732429 -0.86288816  0.578232617 -1.29501935 -0.71864311 -1.58142837
 [6,]   1.11597086  1.23153991 -1.388199680  0.67756716 -1.74485965  0.62337028
 [7,]   0.57399826  0.65223002  0.165524604  2.38116460 -0.33381191 -0.35832428
 [8,]  -0.07636887 -0.68463896  1.864910540  0.00509449  0.01895002  1.17407698
 [9,]   1.22436538  1.00872841 -0.004413989  0.76723019  1.26965142 -0.14311204
[10,]   0.03202565  0.74135462  0.699617327 -0.89153574 -0.17346558 -0.69269198
[11,]  -1.97327298 -1.44219805  0.116970720 -1.22777208  1.04516655  2.40196983
[12,]   0.08622291  0.07292013  0.238355430  1.12588228  0.14722709 -0.77748109
[13,]   0.19461744  0.87504152  0.748171211 -0.73462545  1.01309729 -0.48874740
[14,]  -0.13056613  0.56310542 -1.752353809 -1.60883993 -0.59036605  0.21379097
[15,]  -0.83513051 -1.39763576 -0.101521757  1.17071379 -1.07140505 -0.68902999
[16,]   0.24881470 -0.37270287  2.034849134 -0.21906307  1.91103676  1.99351729
[17,]  -1.91907572 -1.93238335 -0.781276132  1.10346652  1.84689822 -0.90142531
[18,]  -0.34735517  0.83047922 -0.441398944 -0.06215278 -0.17346558  0.34534086
[19,]   0.24881470  0.42941852 -1.558138274 -0.66737818 -1.71279038  1.29379583
[20,]   0.46560374  0.47398082 -0.489952828  0.65515141  0.08308855 -0.45832473
[21,]  -0.40155243 -0.95201276  0.869555920  0.90172472  0.08308855 -0.63776215
[22,]  -0.23896065 -0.64007666  0.141247662 -0.60013092  0.85275095  0.33210137
         Nuclear   Fuel_Cost
 [1,] -0.7146294 -0.85367545
 [2,]  0.7920476  0.81329670
 [3,] -0.7146294 -0.08043055
 [4,]  1.3280197 -0.72420189
 [5,]  0.2143888  1.69263800
 [6,]  0.6253007  0.24864810
 [7,] -0.7146294  0.98772637
 [8,] -0.7146294 -1.42731528
 [9,] -0.7146294 -0.43288637
[10,]  1.6198267 -0.86266667
[11,] -0.7146294 -0.60192130
[12,] -0.7146294  1.42829614
[13,]  2.2749037 -1.03529809
[14,] -0.7146294 -0.92560521
[15,] -0.6610322  0.53456889
[16,] -0.7146294 -0.86806140
[17,] -0.2203441  1.46965575
[18,] -0.7146294  0.00948165
[19,] -0.7146294 -0.83928950
[20,]  1.7329764 -0.72060540
[21,] -0.7146294  1.82211157
[22,]  0.8694658  0.36553395
attr(,"scaled:center")
Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
    1.114091    10.736364   168.181818    56.977273     3.240909  8914.045455 
     Nuclear    Fuel_Cost 
   12.000000     1.102727 
attr(,"scaled:scale")
Fixed_charge          RoR         Cost         Load     D.Demand        Sales 
   0.1845112    2.2440494   41.1913495    4.4611478    3.1182503 3549.9840305 
     Nuclear    Fuel_Cost 
  16.7919198    0.5560981 

Calculate distance matrix

distance = dist(nor)
distance
          1        2        3        4        5        6        7        8
2  3.096154                                                               
3  3.679230 4.916465                                                      
4  2.462149 2.164213 4.107079                                             
5  4.123129 3.852850 4.468735 4.127368                                    
6  3.606269 4.218804 2.992760 3.201836 4.600183                           
7  3.901898 3.448346 4.217769 3.969367 4.596261 3.352919                  
8  2.737407 3.892524 4.990876 3.692949 5.155516 4.913953 4.364509         
9  3.253851 3.957125 2.752623 3.753627 4.489900 3.730814 2.796298 3.594824
10 3.099116 2.705330 3.934935 1.491427 4.045276 3.829058 4.506512 3.673884
11 3.491163 4.792640 5.902882 4.864730 6.460986 6.004557 5.995814 3.462587
12 3.223138 2.432568 4.031434 3.498769 3.603863 3.738824 1.660047 4.059770
13 3.959637 3.434878 4.385973 2.577003 4.758059 4.554909 5.010221 4.140607
14 2.113490 4.323825 2.742000 3.230069 4.818803 3.469268 4.914949 4.335241
15 2.593481 2.501195 5.156977 3.190250 4.255251 4.065764 2.930142 3.849872
16 4.033051 4.837051 5.264442 4.967244 5.816715 5.842268 5.042444 2.201457
17 4.396680 3.623588 6.356548 4.893679 5.628591 6.099456 4.577294 5.426511
18 1.877248 2.904409 2.723954 2.651532 4.338150 2.853942 2.949006 3.237409
19 2.410434 4.634878 3.179392 3.464171 5.133791 2.581208 4.515428 4.107966
20 3.174488 2.997481 3.733274 1.816465 4.385852 2.912401 3.541931 4.094283
21 3.453407 2.318451 5.088018 3.884260 3.644137 4.628341 2.675404 3.977130
22 2.509287 2.421916 4.109321 2.578463 3.771757 4.026935 4.000096 3.239374
          9       10       11       12       13       14       15       16
2                                                                         
3                                                                         
4                                                                         
5                                                                         
6                                                                         
7                                                                         
8                                                                         
9                                                                         
10 3.572023                                                               
11 5.175240 5.081469                                                      
12 2.735861 3.942171 5.208504                                             
13 3.658647 1.407032 5.309741 4.496249                                    
14 3.816443 3.610272 4.315584 4.335484 4.385649                           
15 4.113606 4.264133 4.735659 2.328833 5.103646 4.239522                  
16 3.627307 4.531420 3.429962 4.617791 4.406173 5.169314 5.175157         
17 4.901037 5.484537 4.751387 3.497555 5.606577 5.558002 3.399659 5.559320
18 2.428533 3.070750 3.945595 2.451935 3.780942 2.301050 2.998784 3.973815
19 4.109049 4.130120 4.522319 4.414578 5.010864 1.876051 4.030721 5.232256
20 2.948021 2.054393 5.352136 3.430937 2.226493 3.744430 3.782111 4.823711
21 3.742680 4.361961 4.883977 1.384124 4.937119 4.926966 2.097150 4.568885
22 3.208932 2.559945 3.436927 2.995066 2.739910 3.512207 3.352644 3.457129
         17       18       19       20       21
2                                              
3                                              
4                                              
5                                              
6                                              
7                                              
8                                              
9                                              
10                                             
11                                             
12                                             
13                                             
14                                             
15                                             
16                                             
17                                             
18 4.426129                                    
19 6.089597 2.473696                           
20 4.866540 2.922392 3.903723                  
21 3.095002 3.185250 4.972551 4.145222         
22 3.628061 2.548060 3.967618 2.618050 3.012264

Hierarchical agglomerative clustering

mydata.hclust = hclust(distance)
plot(mydata.hclust)

plot(mydata.hclust,labels=mydata$Company,main='Default from hclust')

plot(mydata.hclust,hang=-1, labels=mydata$Company,main='Default from hclust')

## Hierarchical agglomerative clustering using “average” linkage

mydata.hclust<-hclust(distance,method="average") 
plot(mydata.hclust,hang=-1) 

## Cluster membership

member = cutree(mydata.hclust,3)
table(member)
member
 1  2  3 
18  1  3 
member
 [1] 1 1 1 1 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1 1 1 1

Characterizing clusters

aggregate(nor,list(member),mean)
  Group.1 Fixed_charge        RoR       Cost       Load   D.Demand      Sales
1       1  -0.01313873  0.1868016 -0.2552757  0.1520422 -0.1253617 -0.2215631
2       2   2.03732429 -0.8628882  0.5782326 -1.2950193 -0.7186431 -1.5814284
3       3  -0.60027572 -0.8331800  1.3389101 -0.4805802  0.9917178  1.8565214
     Nuclear   Fuel_Cost
1  0.1071944  0.06692555
2  0.2143888  1.69263800
3 -0.7146294 -0.96576599
aggregate(mydata[,-c(1,1)],list(member),mean)
  Group.1 Fixed_charge       RoR     Cost     Load D.Demand    Sales Nuclear
1       1     1.111667 11.155556 157.6667 57.65556 2.850000  8127.50    13.8
2       2     1.490000  8.800000 192.0000 51.20000 1.000000  3300.00    15.6
3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.67     0.0
  Fuel_Cost
1 1.1399444
2 2.0440000
3 0.5656667

Silhouette Plot

library(cluster)
plot(silhouette(cutree(mydata.hclust,3), distance))

## Scree Plot

wss <- (nrow(nor)-1)*sum(apply(nor,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

## K-means clustering

set.seed(123)
kc<-kmeans(nor,3)
kc
K-means clustering with 3 clusters of sizes 7, 5, 10

Cluster means:
  Fixed_charge         RoR       Cost       Load    D.Demand      Sales
1  -0.23896065 -0.65917479  0.2556961  0.7992527 -0.05435116 -0.8604593
2   0.51980100  1.02655333 -1.2959473 -0.5104679 -0.83409247  0.5120458
3  -0.09262805 -0.05185431  0.4689864 -0.3042429  0.45509205  0.3462986
     Nuclear  Fuel_Cost
1 -0.2884040  1.2497562
2 -0.4466434 -0.3174391
3  0.4252045 -0.7161098

Clustering vector:
 [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3

Within cluster sum of squares by cluster:
[1] 34.16481 15.15613 57.53424
 (between_SS / total_SS =  36.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
ot<-nor
datadistshortset<-dist(ot,method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete" )
pamvshortset <- pam(datadistshortset,4, diss = FALSE)
clusplot(pamvshortset, shade = FALSE,labels=2,col.clus="blue",col.p="red",span=FALSE,main="Cluster Mapping",cex=1.2)

## Cluster Analysis in R

library(factoextra) 
k2 <- kmeans(nor, centers = 3, nstart = 25)
str(k2)
List of 9
 $ cluster     : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
 $ centers     : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "1" "2" "3"
  .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
 $ totss       : num 168
 $ withinss    : num [1:3] 9.53 34.16 58.01
 $ tot.withinss: num 102
 $ betweenss   : num 66.3
 $ size        : int [1:3] 3 7 12
 $ iter        : int 2
 $ ifault      : int 0
 - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = nor)

## Optimal Clusters

fviz_nbclust(nor, kmeans, method = "wss")

## Average Silhouette Method

fviz_nbclust(nor, kmeans, method = "silhouette")

## Gap Statistic Method

gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)