mydata <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/CA2/Data.txt")
str(mydata)
'data.frame': 22 obs. of 9 variables:
$ Company : chr "Arizona " "Boston " "Central " "Commonwealth" ...
$ Fixed_charge: num 1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
$ RoR : num 9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
$ Cost : int 151 202 113 168 192 111 175 245 168 197 ...
$ Load : num 54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
$ D.Demand : num 1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
$ Sales : int 9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
$ Nuclear : num 0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
$ Fuel_Cost : num 0.628 1.555 1.058 0.7 2.044 ...
head(mydata)
Company Fixed_charge RoR Cost Load D.Demand Sales Nuclear Fuel_Cost
1 Arizona 1.06 9.2 151 54.4 1.6 9077 0.0 0.628
2 Boston 0.89 10.3 202 57.9 2.2 5088 25.3 1.555
3 Central 1.43 15.4 113 53.0 3.4 9212 0.0 1.058
4 Commonwealth 1.02 11.2 168 56.0 0.3 6423 34.3 0.700
5 Con Ed NY 1.49 8.8 192 51.2 1.0 3300 15.6 2.044
6 Florida 1.32 13.5 111 60.0 -2.2 11127 22.5 1.241
pairs(mydata[2:9])
plot(mydata$Fuel_Cost~ mydata$Sales, data = mydata)
with(mydata,text(mydata$Fuel_Cost ~ mydata$Sales, labels=mydata$Company,pos=4))
## Normalize
z <- mydata[,-c(1,1)]
means <- apply(z,2,mean)
sds <- apply(z,2,sd)
nor <- scale(z,center=means,scale=sds)
nor
Fixed_charge RoR Cost Load D.Demand Sales
[1,] -0.29315791 -0.68463896 -0.417122002 -0.57771516 -0.52622751 0.04590290
[2,] -1.21451134 -0.19445367 0.821002037 0.20683629 -0.33381191 -1.07776413
[3,] 1.71214073 2.07822360 -1.339645796 -0.89153574 0.05101929 0.08393124
[4,] -0.50994695 0.20660702 -0.004413989 -0.21906307 -0.94312798 -0.70170610
[5,] 2.03732429 -0.86288816 0.578232617 -1.29501935 -0.71864311 -1.58142837
[6,] 1.11597086 1.23153991 -1.388199680 0.67756716 -1.74485965 0.62337028
[7,] 0.57399826 0.65223002 0.165524604 2.38116460 -0.33381191 -0.35832428
[8,] -0.07636887 -0.68463896 1.864910540 0.00509449 0.01895002 1.17407698
[9,] 1.22436538 1.00872841 -0.004413989 0.76723019 1.26965142 -0.14311204
[10,] 0.03202565 0.74135462 0.699617327 -0.89153574 -0.17346558 -0.69269198
[11,] -1.97327298 -1.44219805 0.116970720 -1.22777208 1.04516655 2.40196983
[12,] 0.08622291 0.07292013 0.238355430 1.12588228 0.14722709 -0.77748109
[13,] 0.19461744 0.87504152 0.748171211 -0.73462545 1.01309729 -0.48874740
[14,] -0.13056613 0.56310542 -1.752353809 -1.60883993 -0.59036605 0.21379097
[15,] -0.83513051 -1.39763576 -0.101521757 1.17071379 -1.07140505 -0.68902999
[16,] 0.24881470 -0.37270287 2.034849134 -0.21906307 1.91103676 1.99351729
[17,] -1.91907572 -1.93238335 -0.781276132 1.10346652 1.84689822 -0.90142531
[18,] -0.34735517 0.83047922 -0.441398944 -0.06215278 -0.17346558 0.34534086
[19,] 0.24881470 0.42941852 -1.558138274 -0.66737818 -1.71279038 1.29379583
[20,] 0.46560374 0.47398082 -0.489952828 0.65515141 0.08308855 -0.45832473
[21,] -0.40155243 -0.95201276 0.869555920 0.90172472 0.08308855 -0.63776215
[22,] -0.23896065 -0.64007666 0.141247662 -0.60013092 0.85275095 0.33210137
Nuclear Fuel_Cost
[1,] -0.7146294 -0.85367545
[2,] 0.7920476 0.81329670
[3,] -0.7146294 -0.08043055
[4,] 1.3280197 -0.72420189
[5,] 0.2143888 1.69263800
[6,] 0.6253007 0.24864810
[7,] -0.7146294 0.98772637
[8,] -0.7146294 -1.42731528
[9,] -0.7146294 -0.43288637
[10,] 1.6198267 -0.86266667
[11,] -0.7146294 -0.60192130
[12,] -0.7146294 1.42829614
[13,] 2.2749037 -1.03529809
[14,] -0.7146294 -0.92560521
[15,] -0.6610322 0.53456889
[16,] -0.7146294 -0.86806140
[17,] -0.2203441 1.46965575
[18,] -0.7146294 0.00948165
[19,] -0.7146294 -0.83928950
[20,] 1.7329764 -0.72060540
[21,] -0.7146294 1.82211157
[22,] 0.8694658 0.36553395
attr(,"scaled:center")
Fixed_charge RoR Cost Load D.Demand Sales
1.114091 10.736364 168.181818 56.977273 3.240909 8914.045455
Nuclear Fuel_Cost
12.000000 1.102727
attr(,"scaled:scale")
Fixed_charge RoR Cost Load D.Demand Sales
0.1845112 2.2440494 41.1913495 4.4611478 3.1182503 3549.9840305
Nuclear Fuel_Cost
16.7919198 0.5560981
distance = dist(nor)
distance
1 2 3 4 5 6 7 8
2 3.096154
3 3.679230 4.916465
4 2.462149 2.164213 4.107079
5 4.123129 3.852850 4.468735 4.127368
6 3.606269 4.218804 2.992760 3.201836 4.600183
7 3.901898 3.448346 4.217769 3.969367 4.596261 3.352919
8 2.737407 3.892524 4.990876 3.692949 5.155516 4.913953 4.364509
9 3.253851 3.957125 2.752623 3.753627 4.489900 3.730814 2.796298 3.594824
10 3.099116 2.705330 3.934935 1.491427 4.045276 3.829058 4.506512 3.673884
11 3.491163 4.792640 5.902882 4.864730 6.460986 6.004557 5.995814 3.462587
12 3.223138 2.432568 4.031434 3.498769 3.603863 3.738824 1.660047 4.059770
13 3.959637 3.434878 4.385973 2.577003 4.758059 4.554909 5.010221 4.140607
14 2.113490 4.323825 2.742000 3.230069 4.818803 3.469268 4.914949 4.335241
15 2.593481 2.501195 5.156977 3.190250 4.255251 4.065764 2.930142 3.849872
16 4.033051 4.837051 5.264442 4.967244 5.816715 5.842268 5.042444 2.201457
17 4.396680 3.623588 6.356548 4.893679 5.628591 6.099456 4.577294 5.426511
18 1.877248 2.904409 2.723954 2.651532 4.338150 2.853942 2.949006 3.237409
19 2.410434 4.634878 3.179392 3.464171 5.133791 2.581208 4.515428 4.107966
20 3.174488 2.997481 3.733274 1.816465 4.385852 2.912401 3.541931 4.094283
21 3.453407 2.318451 5.088018 3.884260 3.644137 4.628341 2.675404 3.977130
22 2.509287 2.421916 4.109321 2.578463 3.771757 4.026935 4.000096 3.239374
9 10 11 12 13 14 15 16
2
3
4
5
6
7
8
9
10 3.572023
11 5.175240 5.081469
12 2.735861 3.942171 5.208504
13 3.658647 1.407032 5.309741 4.496249
14 3.816443 3.610272 4.315584 4.335484 4.385649
15 4.113606 4.264133 4.735659 2.328833 5.103646 4.239522
16 3.627307 4.531420 3.429962 4.617791 4.406173 5.169314 5.175157
17 4.901037 5.484537 4.751387 3.497555 5.606577 5.558002 3.399659 5.559320
18 2.428533 3.070750 3.945595 2.451935 3.780942 2.301050 2.998784 3.973815
19 4.109049 4.130120 4.522319 4.414578 5.010864 1.876051 4.030721 5.232256
20 2.948021 2.054393 5.352136 3.430937 2.226493 3.744430 3.782111 4.823711
21 3.742680 4.361961 4.883977 1.384124 4.937119 4.926966 2.097150 4.568885
22 3.208932 2.559945 3.436927 2.995066 2.739910 3.512207 3.352644 3.457129
17 18 19 20 21
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 4.426129
19 6.089597 2.473696
20 4.866540 2.922392 3.903723
21 3.095002 3.185250 4.972551 4.145222
22 3.628061 2.548060 3.967618 2.618050 3.012264
mydata.hclust = hclust(distance)
plot(mydata.hclust)
plot(mydata.hclust,labels=mydata$Company,main='Default from hclust')
plot(mydata.hclust,hang=-1, labels=mydata$Company,main='Default from hclust')
## Hierarchical agglomerative clustering using “average” linkage
mydata.hclust<-hclust(distance,method="average")
plot(mydata.hclust,hang=-1)
## Cluster membership
member = cutree(mydata.hclust,3)
table(member)
member
1 2 3
18 1 3
member
[1] 1 1 1 1 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1 1 1 1
aggregate(nor,list(member),mean)
Group.1 Fixed_charge RoR Cost Load D.Demand Sales
1 1 -0.01313873 0.1868016 -0.2552757 0.1520422 -0.1253617 -0.2215631
2 2 2.03732429 -0.8628882 0.5782326 -1.2950193 -0.7186431 -1.5814284
3 3 -0.60027572 -0.8331800 1.3389101 -0.4805802 0.9917178 1.8565214
Nuclear Fuel_Cost
1 0.1071944 0.06692555
2 0.2143888 1.69263800
3 -0.7146294 -0.96576599
aggregate(mydata[,-c(1,1)],list(member),mean)
Group.1 Fixed_charge RoR Cost Load D.Demand Sales Nuclear
1 1 1.111667 11.155556 157.6667 57.65556 2.850000 8127.50 13.8
2 2 1.490000 8.800000 192.0000 51.20000 1.000000 3300.00 15.6
3 3 1.003333 8.866667 223.3333 54.83333 6.333333 15504.67 0.0
Fuel_Cost
1 1.1399444
2 2.0440000
3 0.5656667
library(cluster)
plot(silhouette(cutree(mydata.hclust,3), distance))
## Scree Plot
wss <- (nrow(nor)-1)*sum(apply(nor,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
## K-means clustering
set.seed(123)
kc<-kmeans(nor,3)
kc
K-means clustering with 3 clusters of sizes 7, 5, 10
Cluster means:
Fixed_charge RoR Cost Load D.Demand Sales
1 -0.23896065 -0.65917479 0.2556961 0.7992527 -0.05435116 -0.8604593
2 0.51980100 1.02655333 -1.2959473 -0.5104679 -0.83409247 0.5120458
3 -0.09262805 -0.05185431 0.4689864 -0.3042429 0.45509205 0.3462986
Nuclear Fuel_Cost
1 -0.2884040 1.2497562
2 -0.4466434 -0.3174391
3 0.4252045 -0.7161098
Clustering vector:
[1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3
Within cluster sum of squares by cluster:
[1] 34.16481 15.15613 57.53424
(between_SS / total_SS = 36.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
ot<-nor
datadistshortset<-dist(ot,method = "euclidean")
hc1 <- hclust(datadistshortset, method = "complete" )
pamvshortset <- pam(datadistshortset,4, diss = FALSE)
clusplot(pamvshortset, shade = FALSE,labels=2,col.clus="blue",col.p="red",span=FALSE,main="Cluster Mapping",cex=1.2)
## Cluster Analysis in R
library(factoextra)
k2 <- kmeans(nor, centers = 3, nstart = 25)
str(k2)
List of 9
$ cluster : int [1:22] 3 2 3 3 2 3 2 1 3 3 ...
$ centers : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:3] "1" "2" "3"
.. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ...
$ totss : num 168
$ withinss : num [1:3] 9.53 34.16 58.01
$ tot.withinss: num 102
$ betweenss : num 66.3
$ size : int [1:3] 3 7 12
$ iter : int 2
$ ifault : int 0
- attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = nor)
## Optimal Clusters
fviz_nbclust(nor, kmeans, method = "wss")
## Average Silhouette Method
fviz_nbclust(nor, kmeans, method = "silhouette")
## Gap Statistic Method
gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
fviz_gap_stat(gap_stat)