# Hclustering_University.R

# Read the universities data set. Column 1 holds the university name; columns
# 2 to 7 hold the continuous measurements that are clustered.
Uni <- read.csv(file = "G:\\Classes\\Clustering\\Universities.csv")

# Standardize the continuous columns so they share a common scale; the
# university name column is excluded before normalizing.
normalized_data <- scale(x = Uni[, 2:7])

# Pairwise Euclidean distances between the standardized rows.
d <- dist(x = normalized_data, method = "euclidean")

# Agglomerative hierarchical clustering with complete linkage.
fit <- hclust(d, method = "complete")

# Dendrogram with the default (row index) leaf labels.
plot(x = fit)

# Dendrogram again, with leaves aligned at the baseline (hang = -1) and
# labelled with the university names.
plot(x = fit, labels = Uni$Univ, hang = -1)

# Estimate a suitable number of clusters (2 to 10) with the NbClust index suite.
# The indices are computed on the same standardized data, Euclidean distance
# and complete linkage that were used to build `fit`, so the recommendation
# applies to the dendrogram that is cut further down. Running the indices on
# the raw columns would let the largest-magnitude column (e.g. Expenses)
# dominate every distance and would evaluate a different partition.
# NOTE: any console output recorded below from the earlier call (raw data,
# average linkage) is stale until the script is re-run.
library(NbClust)
NbClust::NbClust(
  data = normalized_data,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "complete"
)

## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 5 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 6 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

## $All.index
##        KL       CH Hartigan     CCC    Scott      Marriot       TrCovW
## 2  0.8557  22.6061  56.9152  3.6797 246.8007 4.137637e+25 1.053343e+18
## 3  3.6026  64.7859  33.6599 -6.6260 286.8827 1.873438e+25 8.727420e+16
## 4  3.1447 115.0152  17.5355 -3.9594 317.6200 9.739884e+24 1.361317e+16
## 5  4.7803 154.9290   4.9206 -2.6029 345.4356 5.002272e+24 4.030855e+15
## 6  3.6308 147.6500   1.9999 -2.9995 356.8341 4.565812e+24 2.592186e+15
## 7  0.1368 129.1508  11.6288 -3.8738 368.0822 3.962888e+24 2.121860e+15
## 8  1.1905 173.6638  17.1639 -2.6750 410.3209 9.555087e+23 7.853961e+14
## 9  3.1940 289.4329   7.4028 -0.4296 432.8941 4.902346e+23 1.943066e+14
## 10 2.1229 353.5597   4.2152  0.2531 452.0835 2.809082e+23 9.041549e+13
##        TraceW  Friedman     Rubin Cindex     DB Silhouette     Duda
## 2  2518648052  7337.876    9.4444 0.3585 0.3512     0.6364   0.2844
## 3   724880090  7510.085   32.8153 0.3021 0.4325     0.6010   0.3335
## 4   286514202  8325.391   83.0227 0.3705 0.4871     0.5862   0.2581
## 5   156136639  8565.998  152.3486 0.3848 0.4548     0.5791  36.0010
## 6   125307336  9696.926  189.8308 0.4416 0.3984     0.5742 517.6623
## 7   113374046  9944.095  209.8116 0.4374 0.3383     0.5968   0.1950
## 8    68876734 10505.461  345.3589 0.3518 0.3492     0.6011   0.1739
## 9    34273116 11269.490  694.0482 0.3746 0.3383     0.6325   0.1498
## 10   23431804 12530.450 1015.1670 0.4190 0.3218     0.6561  97.2860
##    Pseudot2   Beale Ratkowsky       Ball Ptbiserial   Frey McClain   Dunn
## 2   52.8358  9.2398    0.2071 1259324026     0.7020 2.4227  0.0617 0.4360
## 3   25.9855  7.1410    0.4316  241626697     0.6602 1.0741  0.3725 0.1991
## 4   23.0012  9.8326    0.3824   71628550     0.5892 1.0075  0.5546 0.2250
## 5   -2.9167 -2.8053    0.3454   31227328     0.5343 1.1477  0.6414 0.2688
## 6    0.0000  0.0000    0.3165   20884556     0.5204 2.0072  0.6636 0.3255
## 7   24.7707 13.6144    0.2969   16196292     0.5161 1.3836  0.6735 0.3255
## 8   23.7545 15.2319    0.3050    8609592     0.4514 0.8145  0.8115 0.2343
## 9   11.3471 14.5520    0.2953    3808124     0.3956 0.9311  0.8470 0.3592
## 10  -0.9897 -1.9039    0.2808    2343180     0.3723 1.0724  0.8675 0.4626
##    Hubert SDindex    Dindex   SDbw
## 2       0   3e-04 8628.1767 0.7877
## 3       0   2e-04 4229.0195 0.4280
## 4       0   3e-04 2931.2267 0.3823
## 5       0   4e-04 2174.4337 0.1787
## 6       0   5e-04 1961.7837 0.0860
## 7       0   6e-04 1766.3700 0.0599
## 8       0   6e-04 1318.3472 0.0436
## 9       0   6e-04  942.2935 0.0327
## 10      0   9e-04  788.0180 0.0256
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.5356            18.2064        0e+00
## 3          0.4503            15.8722        0e+00
## 4          0.3506            14.8210        0e+00
## 5          0.1255            20.9054        1e+00
## 6         -0.3211             0.0000          NaN
## 7          0.2864            14.9482        0e+00
## 8          0.2445            15.4517        0e+00
## 9          0.0348            55.4763        1e-04
## 10        -0.0981           -11.1930        1e+00
## 
## $Best.nc
##                     KL       CH Hartigan    CCC   Scott     Marriot
## Number_clusters 5.0000  10.0000   3.0000 2.0000  8.0000 3.00000e+00
## Value_Index     4.7803 353.5597  23.2552 3.6797 42.2387 1.36475e+25
##                       TrCovW     TraceW Friedman    Rubin Cindex      DB
## Number_clusters 3.000000e+00          3    10.00   5.0000 3.0000 10.0000
## Value_Index     9.660685e+17 1355402073  1260.96 -31.8436 0.3021  0.3218
##                 Silhouette   Duda PseudoT2   Beale Ratkowsky       Ball
## Number_clusters    10.0000  5.000   5.0000  5.0000    3.0000          3
## Value_Index         0.6561 36.001  -2.9167 -2.8053    0.4316 1017697329
##                 PtBiserial   Frey McClain    Dunn Hubert SDindex Dindex
## Number_clusters      2.000 7.0000  2.0000 10.0000      0   3e+00      0
## Value_Index          0.702 1.3836  0.0617  0.4626      0   2e-04      0
##                    SDbw
## Number_clusters 10.0000
## Value_Index      0.0256
## 
## $Best.partition
##  [1] 1 2 1 1 1 1 1 1 1 2 1 1 3 3 1 3 1 3 3 1 3 1 3 3 1

# Open the cutree() help page only when someone is at the console; a
# non-interactive run has no use for the help viewer.
if (interactive()) {
  help("cutree")
}

# Number of clusters to cut the dendrogram into (used for both the rectangles
# and the cluster assignment so the two cannot drift apart).
n_clusters <- 5

# rect.hclust() annotates the most recent plot, and NbClust() has drawn its
# own index plots since the dendrogram was shown, so redraw the dendrogram
# first. The layout reset assumes NbClust() may leave a multi-panel layout
# active; it is harmless if it does not.
par(mfrow = c(1, 1))
plot(fit, hang = -1, labels = Uni$Univ)
rect.hclust(fit, k = n_clusters, border = "red")

# Cluster number for every row of the data, in row order.
groups <- cutree(fit, k = n_clusters)

# Attach each row's cluster number to the original data.
membership <- as.matrix(groups) # one-column matrix of cluster numbers
final <- data.frame(Uni, membership)

# View() opens a data viewer, which is only available in an interactive
# session; skip it when the script is run in batch mode.
if (interactive()) {
  View(final)
}

# Save the labelled data set to the working directory, without row names.
write.csv(final, file = "final.csv", row.names = FALSE)

# Per-cluster median of every measurement column (name column excluded).
aggregate(Uni[, -1], by = list(final$membership), median)

##   Group.1  SAT Top10 Accept SFRatio Expenses GradRate
## 1       1 1270  80.5   37.5      13    22284       90
## 2       2 1375  91.0   23.0      10    36450       95
## 3       3 1260  62.0   59.0       9    25026       72
## 4       4 1085  40.0   68.0      16    11857       80
## 5       5 1040  38.5   78.5      22     8885       68

# Hclustering_University.R

# adarsh

# 2019-10-11