# Import dataset: prompt for a CSV file and read it, treating the first
# row as column names.
uti <- read.csv(file.choose(), header = TRUE)
# View() opens a spreadsheet-style viewer, so only call it when a user
# is present at the console.
if (interactive()) {
  View(uti)
}
# Print a compact summary of the column types
str(uti)
## 'data.frame':    22 obs. of  9 variables:
##  $ company     : Factor w/ 22 levels "Arizona","Boston",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fixed_charge: num  1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ...
##  $ RoR         : num  9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ...
##  $ Cost        : int  151 202 113 168 192 111 175 245 168 197 ...
##  $ Load        : num  54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ...
##  $ D.Demand    : num  1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ...
##  $ Sales       : int  9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ...
##  $ Nuclear     : num  0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ...
##  $ Fuel_cost   : num  0.628 1.555 1.058 0.7 2.044 ...
# Scatter plot of Fuel_cost against Sales, with each point annotated by
# its company name (placed to the right of the point, in small text).
plot(Fuel_cost ~ Sales, data = uti)
text(
  x = uti$Sales,
  y = uti$Fuel_cost,
  labels = uti$company,
  pos = 4,
  cex = 0.6
)

 # Remove categorical variable 'company'
# Keep every column except the first one (company) for the numeric work
numeric_cols <- seq_along(uti)[-1]
uti1 <- uti[, numeric_cols]
uti1
##    Fixed_charge  RoR Cost Load D.Demand Sales Nuclear Fuel_cost
## 1          1.06  9.2  151 54.4      1.6  9077     0.0     0.628
## 2          0.89 10.3  202 57.9      2.2  5088    25.3     1.555
## 3          1.43 15.4  113 53.0      3.4  9212     0.0     1.058
## 4          1.02 11.2  168 56.0      0.3  6423    34.3     0.700
## 5          1.49  8.8  192 51.2      1.0  3300    15.6     2.044
## 6          1.32 13.5  111 60.0     -2.2 11127    22.5     1.241
## 7          1.22 12.2  175 67.6      2.2  7642     0.0     1.652
## 8          1.10  9.2  245 57.0      3.3 13082     0.0     0.309
## 9          1.34 13.0  168 60.4      7.2  8406     0.0     0.862
## 10         1.12 12.4  197 53.0      2.7  6455    39.2     0.623
## 11         0.75  7.5  173 51.5      6.5 17441     0.0     0.768
## 12         1.13 10.9  178 62.0      3.7  6154     0.0     1.897
## 13         1.15 12.7  199 53.7      6.4  7179    50.2     0.527
## 14         1.09 12.0   96 49.8      1.4  9673     0.0     0.588
## 15         0.96  7.6  164 62.2     -0.1  6468     0.9     1.400
## 16         1.16  9.9  252 56.0      9.2 15991     0.0     0.620
## 17         0.76  6.4  136 61.9      9.0  5714     8.3     1.920
## 18         1.05 12.6  150 56.7      2.7 10140     0.0     1.108
## 19         1.16 11.7  104 54.0     -2.1 13507     0.0     0.636
## 20         1.20 11.8  148 59.9      3.5  7287    41.1     0.702
## 21         1.04  8.6  204 61.0      3.5  6650     0.0     2.116
## 22         1.07  9.3  174 54.3      5.9 10093    26.6     1.306
  # Normalization of the variables
# Standardise each column: subtract the column mean and divide by the
# column standard deviation. The data frame is converted to a matrix
# once up front, which is what apply() and scale() do internally.
uti1_mat <- as.matrix(uti1)
m_uti <- apply(uti1_mat, MARGIN = 2, FUN = mean)
sd_uti <- apply(uti1_mat, MARGIN = 2, FUN = sd)
norm_uti <- scale(uti1_mat, center = m_uti, scale = sd_uti)

# Pairwise Euclidean distances between the standardised observations,
# printed to three significant digits.
dist_uti <- dist(norm_uti, method = "euclidean")
print(dist_uti, digits = 3)
##       1    2    3    4    5    6    7    8    9   10   11   12   13   14
## 2  3.10                                                                 
## 3  3.68 4.92                                                            
## 4  2.46 2.16 4.11                                                       
## 5  4.12 3.85 4.47 4.13                                                  
## 6  3.61 4.22 2.99 3.20 4.60                                             
## 7  3.90 3.45 4.22 3.97 4.60 3.35                                        
## 8  2.74 3.89 4.99 3.69 5.16 4.91 4.36                                   
## 9  3.25 3.96 2.75 3.75 4.49 3.73 2.80 3.59                              
## 10 3.10 2.71 3.93 1.49 4.05 3.83 4.51 3.67 3.57                         
## 11 3.49 4.79 5.90 4.86 6.46 6.00 6.00 3.46 5.18 5.08                    
## 12 3.22 2.43 4.03 3.50 3.60 3.74 1.66 4.06 2.74 3.94 5.21               
## 13 3.96 3.43 4.39 2.58 4.76 4.55 5.01 4.14 3.66 1.41 5.31 4.50          
## 14 2.11 4.32 2.74 3.23 4.82 3.47 4.91 4.34 3.82 3.61 4.32 4.34 4.39     
## 15 2.59 2.50 5.16 3.19 4.26 4.07 2.93 3.85 4.11 4.26 4.74 2.33 5.10 4.24
## 16 4.03 4.84 5.26 4.97 5.82 5.84 5.04 2.20 3.63 4.53 3.43 4.62 4.41 5.17
## 17 4.40 3.62 6.36 4.89 5.63 6.10 4.58 5.43 4.90 5.48 4.75 3.50 5.61 5.56
## 18 1.88 2.90 2.72 2.65 4.34 2.85 2.95 3.24 2.43 3.07 3.95 2.45 3.78 2.30
## 19 2.41 4.63 3.18 3.46 5.13 2.58 4.52 4.11 4.11 4.13 4.52 4.41 5.01 1.88
## 20 3.17 3.00 3.73 1.82 4.39 2.91 3.54 4.09 2.95 2.05 5.35 3.43 2.23 3.74
## 21 3.45 2.32 5.09 3.88 3.64 4.63 2.68 3.98 3.74 4.36 4.88 1.38 4.94 4.93
## 22 2.51 2.42 4.11 2.58 3.77 4.03 4.00 3.24 3.21 2.56 3.44 3.00 2.74 3.51
##      15   16   17   18   19   20   21
## 2                                    
## 3                                    
## 4                                    
## 5                                    
## 6                                    
## 7                                    
## 8                                    
## 9                                    
## 10                                   
## 11                                   
## 12                                   
## 13                                   
## 14                                   
## 15                                   
## 16 5.18                              
## 17 3.40 5.56                         
## 18 3.00 3.97 4.43                    
## 19 4.03 5.23 6.09 2.47               
## 20 3.78 4.82 4.87 2.92 3.90          
## 21 2.10 4.57 3.10 3.19 4.97 4.15     
## 22 3.35 3.46 3.63 2.55 3.97 2.62 3.01
# Cluster dendrogram with complete linkage (the hclust() default method).
# hang = -1 draws all leaf labels at the same height.
hc.c_uti <- hclust(dist_uti, method = "complete")
plot(hc.c_uti, labels = uti$company, hang = -1)

# Cluster dendrogram with average linkage
hc.a_uti <- hclust(dist_uti, method = "average")
# Label the leaves with company names, matching the complete-linkage
# dendrogram, so the two trees can be compared directly.
plot(hc.a_uti, labels = uti$company, hang = -1)

# Cut both trees into the same number of clusters and cross-tabulate
# the resulting assignments to see how far the two linkages agree.
n_clusters <- 3
member.c <- cutree(hc.c_uti, k = n_clusters)
member.a <- cutree(hc.a_uti, k = n_clusters)
table(member.c, member.a)
##         member.a
## member.c  1  2  3
##        1 13  1  0
##        2  5  0  0
##        3  0  0  3
# Mean of every standardised variable within each complete-linkage cluster
aggregate(as.data.frame(norm_uti), by = list(member.c), FUN = mean)
##   Group.1 Fixed_charge        RoR        Cost       Load   D.Demand
## 1       1    0.3068832  0.4326015 -0.31481203 -0.3743722 -0.2605107
## 2       2   -0.4991075 -0.7113763  0.07812761  1.3365904  0.1343994
## 3       3   -0.6002757 -0.8331800  1.33891013 -0.4805802  0.9917178
##        Sales    Nuclear  Fuel_cost
## 1 -0.1575387  0.3692252 -0.2389329
## 2 -0.6728046 -0.6050529  1.2484717
## 3  1.8565214 -0.7146294 -0.9657660
# Mean of every variable, in its original units, within each
# complete-linkage cluster (the company column is excluded).
aggregate(uti[, -1], by = list(member.c), FUN = mean)
##   Group.1 Fixed_charge       RoR     Cost     Load D.Demand     Sales
## 1       1     1.170714 11.707143 155.2143 55.30714 2.428571  8354.786
## 2       2     1.022000  9.140000 171.4000 62.94000 3.660000  6525.600
## 3       3     1.003333  8.866667 223.3333 54.83333 6.333333 15504.667
##   Nuclear Fuel_cost
## 1   18.20 0.9698571
## 2    1.84 1.7970000
## 3    0.00 0.5656667
library(cluster)
## Warning: package 'cluster' was built under R version 3.5.3
# Silhouette plot for the three-cluster complete-linkage solution
sil_complete <- silhouette(cutree(hc.c_uti, 3), dist = dist_uti)
plot(sil_complete)

# Scree plot: total within-cluster sum of squares for k = 1, ..., max_k.
# kmeans() starts from randomly chosen centres, so the seed is fixed to
# make the curve reproducible, and several random starts are used per k
# so a single poor start does not distort it.
set.seed(123)
# Never ask for more clusters than the data can support
max_k <- min(20, nrow(norm_uti) - 1)
# With a single cluster the within-group SS is the total sum of squares
total_ss <- (nrow(norm_uti) - 1) * sum(apply(norm_uti, 2, var))
wss <- c(
  total_ss,
  vapply(
    seq_len(max_k)[-1],
    function(k) kmeans(norm_uti, centers = k, nstart = 25)$tot.withinss,
    numeric(1)
  )
)
plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within group SS",
  cex = 1.5,
  col = rainbow(10)
)

# K-means with three clusters on the standardised data.
# kmeans() picks random starting centres, so the seed is fixed to make
# the assignments reproducible, and the best of several random starts
# is kept to avoid a poor local optimum.
set.seed(123)
kc <- kmeans(norm_uti, centers = 3, nstart = 25)
kc
## K-means clustering with 3 clusters of sizes 3, 7, 12
## 
## Cluster means:
##   Fixed_charge        RoR       Cost       Load    D.Demand       Sales
## 1   -0.6002757 -0.8331800  1.3389101 -0.4805802  0.99171778  1.85652137
## 2   -0.2389606 -0.6591748  0.2556961  0.7992527 -0.05435116 -0.86045933
## 3    0.2894626  0.5928136 -0.4838836 -0.3460857 -0.21622460  0.03780427
##      Nuclear Fuel_cost
## 1 -0.7146294 -0.965766
## 2 -0.2884040  1.249756
## 3  0.3468930 -0.487583
## 
## Clustering vector:
##  [1] 3 2 3 3 2 3 2 1 3 3 1 2 3 3 2 1 2 3 3 3 2 3
## 
## Within cluster sum of squares by cluster:
## [1]  9.533522 34.164812 58.012322
##  (between_SS / total_SS =  39.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Sales against D.Demand, with points coloured by k-means cluster
plot(Sales ~ D.Demand, data = uti, col = kc$cluster)