# Import dataset ----
# Ask the user to pick the CSV file and read it, treating the first row as
# column names. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
uti <- read.csv(file.choose(), header = TRUE)
# Open the data viewer only in an interactive session; View() needs a data
# viewer and may fail when the script is run non-interactively.
if (interactive()) {
  View(uti)
}
# Print the column names and types.
str(uti)
## 'data.frame': 22 obs. of 9 variables:
## $ company : Factor w/ 22 levels "Arizona","Boston",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Fixed_charge: num 1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
## $ RoR : num 9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
## $ Cost : int 151 202 113 168 192 111 175 245 168 197 ...
## $ Load : num 54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
## $ D.Demand : num 1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
## $ Sales : int 9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
## $ Nuclear : num 0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
## $ Fuel_cost : num 0.628 1.555 1.058 0.7 2.044 ...
# Scatter plot of Fuel_cost against Sales, one point per row of uti
plot(Fuel_cost ~ Sales, data = uti)
# Write each company's name next to its point (pos = 4: to the right),
# in a reduced font size.
text(Fuel_cost ~ Sales, data = uti, labels = company, pos = 4, cex = 0.6)

# Drop the first column (the categorical 'company' identifier) and keep the
# remaining columns for the clustering steps.
uti1 <- uti[-1]
uti1
## Fixed_charge RoR Cost Load D.Demand Sales Nuclear Fuel_cost
## 1 1.06 9.2 151 54.4 1.6 9077 0.0 0.628
## 2 0.89 10.3 202 57.9 2.2 5088 25.3 1.555
## 3 1.43 15.4 113 53.0 3.4 9212 0.0 1.058
## 4 1.02 11.2 168 56.0 0.3 6423 34.3 0.700
## 5 1.49 8.8 192 51.2 1.0 3300 15.6 2.044
## 6 1.32 13.5 111 60.0 -2.2 11127 22.5 1.241
## 7 1.22 12.2 175 67.6 2.2 7642 0.0 1.652
## 8 1.10 9.2 245 57.0 3.3 13082 0.0 0.309
## 9 1.34 13.0 168 60.4 7.2 8406 0.0 0.862
## 10 1.12 12.4 197 53.0 2.7 6455 39.2 0.623
## 11 0.75 7.5 173 51.5 6.5 17441 0.0 0.768
## 12 1.13 10.9 178 62.0 3.7 6154 0.0 1.897
## 13 1.15 12.7 199 53.7 6.4 7179 50.2 0.527
## 14 1.09 12.0 96 49.8 1.4 9673 0.0 0.588
## 15 0.96 7.6 164 62.2 -0.1 6468 0.9 1.400
## 16 1.16 9.9 252 56.0 9.2 15991 0.0 0.620
## 17 0.76 6.4 136 61.9 9.0 5714 8.3 1.920
## 18 1.05 12.6 150 56.7 2.7 10140 0.0 1.108
## 19 1.16 11.7 104 54.0 -2.1 13507 0.0 0.636
## 20 1.20 11.8 148 59.9 3.5 7287 41.1 0.702
## 21 1.04 8.6 204 61.0 3.5 6650 0.0 2.116
## 22 1.07 9.3 174 54.3 5.9 10093 26.6 1.306
# Normalization of the variables: centre each column on its mean and divide
# by its standard deviation.
uti_mat <- as.matrix(uti1)
m_uti <- apply(uti_mat, MARGIN = 2, FUN = mean)
sd_uti <- apply(uti_mat, MARGIN = 2, FUN = sd)
norm_uti <- scale(uti_mat, center = m_uti, scale = sd_uti)
# Pairwise Euclidean distances (dist()'s default metric) between the
# standardised rows, printed to three significant digits.
dist_uti <- dist(x = norm_uti)
print(dist_uti, digits = 3)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 2 3.10
## 3 3.68 4.92
## 4 2.46 2.16 4.11
## 5 4.12 3.85 4.47 4.13
## 6 3.61 4.22 2.99 3.20 4.60
## 7 3.90 3.45 4.22 3.97 4.60 3.35
## 8 2.74 3.89 4.99 3.69 5.16 4.91 4.36
## 9 3.25 3.96 2.75 3.75 4.49 3.73 2.80 3.59
## 10 3.10 2.71 3.93 1.49 4.05 3.83 4.51 3.67 3.57
## 11 3.49 4.79 5.90 4.86 6.46 6.00 6.00 3.46 5.18 5.08
## 12 3.22 2.43 4.03 3.50 3.60 3.74 1.66 4.06 2.74 3.94 5.21
## 13 3.96 3.43 4.39 2.58 4.76 4.55 5.01 4.14 3.66 1.41 5.31 4.50
## 14 2.11 4.32 2.74 3.23 4.82 3.47 4.91 4.34 3.82 3.61 4.32 4.34 4.39
## 15 2.59 2.50 5.16 3.19 4.26 4.07 2.93 3.85 4.11 4.26 4.74 2.33 5.10 4.24
## 16 4.03 4.84 5.26 4.97 5.82 5.84 5.04 2.20 3.63 4.53 3.43 4.62 4.41 5.17
## 17 4.40 3.62 6.36 4.89 5.63 6.10 4.58 5.43 4.90 5.48 4.75 3.50 5.61 5.56
## 18 1.88 2.90 2.72 2.65 4.34 2.85 2.95 3.24 2.43 3.07 3.95 2.45 3.78 2.30
## 19 2.41 4.63 3.18 3.46 5.13 2.58 4.52 4.11 4.11 4.13 4.52 4.41 5.01 1.88
## 20 3.17 3.00 3.73 1.82 4.39 2.91 3.54 4.09 2.95 2.05 5.35 3.43 2.23 3.74
## 21 3.45 2.32 5.09 3.88 3.64 4.63 2.68 3.98 3.74 4.36 4.88 1.38 4.94 4.93
## 22 2.51 2.42 4.11 2.58 3.77 4.03 4.00 3.24 3.21 2.56 3.44 3.00 2.74 3.51
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 5.18
## 17 3.40 5.56
## 18 3.00 3.97 4.43
## 19 4.03 5.23 6.09 2.47
## 20 3.78 4.82 4.87 2.92 3.90
## 21 2.10 4.57 3.10 3.19 4.97 4.15
## 22 3.35 3.46 3.63 2.55 3.97 2.62 3.01
# Cluster dendrogram with complete linkage (the default method of hclust())
hc.c_uti <- hclust(d = dist_uti)
# Leaves are labelled with the company names.
plot(hc.c_uti, labels = as.character(uti$company), hang = -1)

# Cluster dendrogram with average linkage
hc.a_uti <- hclust(dist_uti, method = "average")
# Label the leaves with the company names, as the complete-linkage dendrogram
# does, so the two trees can be compared leaf by leaf instead of by row index.
plot(hc.a_uti, labels = uti$company, hang = -1)

# Cluster membership: cut each dendrogram into three groups
member.c <- cutree(tree = hc.c_uti, k = 3)
member.a <- cutree(tree = hc.a_uti, k = 3)
# Cross-tabulate the two assignments to see where the linkages agree
table(member.c, member.a)
## member.a
## member.c 1 2 3
## 1 13 1 0
## 2 5 0 0
## 3 0 0 3
# Cluster means of the standardised variables, grouped by the
# complete-linkage membership
aggregate(x = norm_uti, by = list(member.c), FUN = mean)
## Group.1 Fixed_charge RoR Cost Load D.Demand
## 1 1 0.3068832 0.4326015 -0.31481203 -0.3743722 -0.2605107
## 2 2 -0.4991075 -0.7113763 0.07812761 1.3365904 0.1343994
## 3 3 -0.6002757 -0.8331800 1.33891013 -0.4805802 0.9917178
## Sales Nuclear Fuel_cost
## 1 -0.1575387 0.3692252 -0.2389329
## 2 -0.6728046 -0.6050529 1.2484717
## 3 1.8565214 -0.7146294 -0.9657660
# Cluster means in original units (all columns except the first), grouped by
# the complete-linkage membership
aggregate(x = uti[-1], by = list(member.c), FUN = mean)
## Group.1 Fixed_charge RoR Cost Load D.Demand Sales
## 1 1 1.170714 11.707143 155.2143 55.30714 2.428571 8354.786
## 2 2 1.022000 9.140000 171.4000 62.94000 3.660000 6525.600
## 3 3 1.003333 8.866667 223.3333 54.83333 6.333333 15504.667
## Nuclear Fuel_cost
## 1 18.20 0.9698571
## 2 1.84 1.7970000
## 3 0.00 0.5656667
library(cluster)
## Warning: package 'cluster' was built under R version 3.5.3
# Silhouette information for the three-group cut of the complete-linkage
# tree, computed from the same distance matrix, then plotted.
sil_complete <- silhouette(x = cutree(hc.c_uti, 3), dist = dist_uti)
plot(sil_complete)

# Scree plot: total within-cluster sum of squares for k = 1, ..., k_max
# kmeans() starts from randomly chosen centres, so fix the seed to make the
# curve reproducible.
set.seed(123)
# kmeans() cannot use more centres than there are distinct rows, so keep k
# below the row count (at most 20, and never less than 1).
k_max <- max(1, min(20, nrow(norm_uti) - 1))
# Preallocate instead of growing the vector inside the loop.
wss <- numeric(k_max)
# k = 1: total sum of squares around the column means.
wss[1] <- (nrow(norm_uti) - 1) * sum(apply(norm_uti, 2, var))
# k >= 2: keep the best of several random starts so a single unlucky
# initialisation does not distort the curve.
for (i in seq_len(k_max)[-1]) {
  wss[i] <- kmeans(norm_uti, centers = i, nstart = 25)$tot.withinss
}
plot(
  seq_len(k_max), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within group SS",
  cex = 1.5,
  col = rainbow(10)
)

# K-means with three clusters.
# kmeans() starts from randomly chosen centres, so without a fixed seed the
# partition and the cluster numbering can change from run to run. Fix the
# seed and keep the best of several random starts so the fit is reproducible.
# NOTE: any pasted console output following this call came from an earlier
# unseeded run and may differ in cluster numbering or sizes.
set.seed(123)
kc <- kmeans(norm_uti, centers = 3, nstart = 25)
kc
## K-means clustering with 3 clusters of sizes 3, 7, 12
##
## Cluster means:
## Fixed_charge RoR Cost Load D.Demand Sales
## 1 -0.6002757 -0.8331800 1.3389101 -0.4805802 0.99171778 1.85652137
## 2 -0.2389606 -0.6591748 0.2556961 0.7992527 -0.05435116 -0.86045933
## 3 0.2894626 0.5928136 -0.4838836 -0.3460857 -0.21622460 0.03780427
## Nuclear Fuel_cost
## 1 -0.7146294 -0.965766
## 2 -0.2884040 1.249756
## 3 0.3468930 -0.487583
##
## Clustering vector:
## [1] 3 2 3 3 2 3 2 1 3 3 1 2 3 3 2 1 2 3 3 3 2 3
##
## Within cluster sum of squares by cluster:
## [1] 9.533522 34.164812 58.012322
## (between_SS / total_SS = 39.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Sales against D.Demand, each point coloured by its k-means cluster number
plot(Sales ~ D.Demand, data = uti, col = kc$cluster)
