library(NbClust)

# Load the university data set. Column 1 is the university name; the
# remaining columns (2-7) are the measurements used for clustering.
Uni <- read.csv(file = "G:\\Classes\\Clustering\\Universities.csv")

# Standardize the measurement columns so they share a common scale and no
# single large-valued column dominates the Euclidean distances. The
# university name column is excluded before normalizing.
normalized_data <- scale(Uni[, 2:7])

# Pairwise Euclidean distances between universities on the scaled data.
d <- dist(normalized_data, method = "euclidean")

# Agglomerative hierarchical clustering with complete linkage.
fit <- hclust(d, method = "complete")

# Dendrogram with default labels, then again with the leaves aligned at the
# baseline (hang = -1) and labelled by university name.
plot(fit)
plot(fit, hang = -1, labels = Uni$Univ)

# Ask NbClust for the recommended number of clusters. It must be evaluated on
# the same scaled data, distance and linkage as `fit`; otherwise the
# recommendation describes a different clustering than the tree that is cut.
# NOTE(review): the `##` console output recorded below appears to come from an
# earlier run on the unscaled columns with average linkage -- re-run this call
# to refresh it.
NbClust::NbClust(
  data = normalized_data,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "complete"
)
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 5 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 6 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW
## 2 0.8557 22.6061 56.9152 3.6797 246.8007 4.137637e+25 1.053343e+18
## 3 3.6026 64.7859 33.6599 -6.6260 286.8827 1.873438e+25 8.727420e+16
## 4 3.1447 115.0152 17.5355 -3.9594 317.6200 9.739884e+24 1.361317e+16
## 5 4.7803 154.9290 4.9206 -2.6029 345.4356 5.002272e+24 4.030855e+15
## 6 3.6308 147.6500 1.9999 -2.9995 356.8341 4.565812e+24 2.592186e+15
## 7 0.1368 129.1508 11.6288 -3.8738 368.0822 3.962888e+24 2.121860e+15
## 8 1.1905 173.6638 17.1639 -2.6750 410.3209 9.555087e+23 7.853961e+14
## 9 3.1940 289.4329 7.4028 -0.4296 432.8941 4.902346e+23 1.943066e+14
## 10 2.1229 353.5597 4.2152 0.2531 452.0835 2.809082e+23 9.041549e+13
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## 2 2518648052 7337.876 9.4444 0.3585 0.3512 0.6364 0.2844
## 3 724880090 7510.085 32.8153 0.3021 0.4325 0.6010 0.3335
## 4 286514202 8325.391 83.0227 0.3705 0.4871 0.5862 0.2581
## 5 156136639 8565.998 152.3486 0.3848 0.4548 0.5791 36.0010
## 6 125307336 9696.926 189.8308 0.4416 0.3984 0.5742 517.6623
## 7 113374046 9944.095 209.8116 0.4374 0.3383 0.5968 0.1950
## 8 68876734 10505.461 345.3589 0.3518 0.3492 0.6011 0.1739
## 9 34273116 11269.490 694.0482 0.3746 0.3383 0.6325 0.1498
## 10 23431804 12530.450 1015.1670 0.4190 0.3218 0.6561 97.2860
## Pseudot2 Beale Ratkowsky Ball Ptbiserial Frey McClain Dunn
## 2 52.8358 9.2398 0.2071 1259324026 0.7020 2.4227 0.0617 0.4360
## 3 25.9855 7.1410 0.4316 241626697 0.6602 1.0741 0.3725 0.1991
## 4 23.0012 9.8326 0.3824 71628550 0.5892 1.0075 0.5546 0.2250
## 5 -2.9167 -2.8053 0.3454 31227328 0.5343 1.1477 0.6414 0.2688
## 6 0.0000 0.0000 0.3165 20884556 0.5204 2.0072 0.6636 0.3255
## 7 24.7707 13.6144 0.2969 16196292 0.5161 1.3836 0.6735 0.3255
## 8 23.7545 15.2319 0.3050 8609592 0.4514 0.8145 0.8115 0.2343
## 9 11.3471 14.5520 0.2953 3808124 0.3956 0.9311 0.8470 0.3592
## 10 -0.9897 -1.9039 0.2808 2343180 0.3723 1.0724 0.8675 0.4626
## Hubert SDindex Dindex SDbw
## 2 0 3e-04 8628.1767 0.7877
## 3 0 2e-04 4229.0195 0.4280
## 4 0 3e-04 2931.2267 0.3823
## 5 0 4e-04 2174.4337 0.1787
## 6 0 5e-04 1961.7837 0.0860
## 7 0 6e-04 1766.3700 0.0599
## 8 0 6e-04 1318.3472 0.0436
## 9 0 6e-04 942.2935 0.0327
## 10 0 9e-04 788.0180 0.0256
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.5356 18.2064 0e+00
## 3 0.4503 15.8722 0e+00
## 4 0.3506 14.8210 0e+00
## 5 0.1255 20.9054 1e+00
## 6 -0.3211 0.0000 NaN
## 7 0.2864 14.9482 0e+00
## 8 0.2445 15.4517 0e+00
## 9 0.0348 55.4763 1e-04
## 10 -0.0981 -11.1930 1e+00
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot
## Number_clusters 5.0000 10.0000 3.0000 2.0000 8.0000 3.00000e+00
## Value_Index 4.7803 353.5597 23.2552 3.6797 42.2387 1.36475e+25
## TrCovW TraceW Friedman Rubin Cindex DB
## Number_clusters 3.000000e+00 3 10.00 5.0000 3.0000 10.0000
## Value_Index 9.660685e+17 1355402073 1260.96 -31.8436 0.3021 0.3218
## Silhouette Duda PseudoT2 Beale Ratkowsky Ball
## Number_clusters 10.0000 5.000 5.0000 5.0000 3.0000 3
## Value_Index 0.6561 36.001 -2.9167 -2.8053 0.4316 1017697329
## PtBiserial Frey McClain Dunn Hubert SDindex Dindex
## Number_clusters 2.000 7.0000 2.0000 10.0000 0 3e+00 0
## Value_Index 0.702 1.3836 0.0617 0.4626 0 2e-04 0
## SDbw
## Number_clusters 10.0000
## Value_Index 0.0256
##
## $Best.partition
## [1] 1 2 1 1 1 1 1 1 1 2 1 1 3 3 1 3 1 3 3 1 3 1 3 3 1
# Number of clusters to cut the dendrogram into. Defined once so the
# rectangle overlay and the membership assignment always agree.
# (See ?cutree and ?rect.hclust for details on the calls below.)
n_clusters <- 5

# Outline the clusters on the currently open dendrogram plot of `fit`.
rect.hclust(fit, k = n_clusters, border = "red")

# Cut the tree and attach each university's cluster number to the data.
groups <- cutree(fit, k = n_clusters)
membership <- as.matrix(groups) # one-column matrix of cluster numbers
final <- data.frame(Uni, membership)

# The spreadsheet viewer is only useful in an interactive session; skip it
# when the script is run in batch mode.
if (interactive()) {
  View(final)
}

# Persist the labelled data to the working directory.
write.csv(final, file = "final.csv", row.names = FALSE)

# Per-cluster median of every measurement column (name column excluded).
aggregate(Uni[, -1], by = list(final$membership), FUN = median)
## Group.1 SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 1270 80.5 37.5 13 22284 90
## 2 2 1375 91.0 23.0 10 36450 95
## 3 3 1260 62.0 59.0 9 25026 72
## 4 4 1085 40.0 68.0 16 11857 80
## 5 5 1040 38.5 78.5 22 8885 68