# Load the universities dataset from CSV into a data frame.
# NOTE(review): this is an absolute, user-specific Windows path; it will
# presumably need adjusting before the script runs on another machine.
cluster <- read.csv(
  "C:\\Users\\prakruthi\\Desktop\\datascience assignments\\clustering\\Universities.csv"
)
# Open the raw data in the data viewer, then list its column names.
View(cluster)
names(cluster)
## [1] "Univ"     "SAT"      "Top10"    "Accept"   "SFRatio"  "Expenses"
## [7] "GradRate"
# Drop the first column (the university name, per the column listing above)
# so that only the remaining measurement columns are kept.
mydata <- cluster[, -1]
View(mydata)
# Standardize the data: center each column and divide it by its scale
# (the defaults of scale(), spelled out here), then print the result.
normal <- scale(mydata, center = TRUE, scale = TRUE)
normal
##               SAT       Top10      Accept    SFRatio    Expenses
##  [1,]  0.40199420  0.64423491 -0.87188786  0.0688409 -0.32471667
##  [2,]  1.37098850  1.21025599 -0.71981439 -1.6521815  2.50865117
##  [3,] -0.05943165 -0.74508957  1.00368486 -0.9146005 -0.16374483
##  [4,]  0.40199420 -0.02469910 -0.77050555 -0.1770194  0.28575621
##  [5,]  0.12513869  0.33549613 -0.31428516  0.0688409 -0.38294938
##  [6,]  0.67884972  0.64423491 -0.82119670 -0.6687401  0.33095589
##  [7,]  0.44813679  0.69569137 -0.46635862 -0.1770194  0.29095556
##  [8,] -0.10557424 -0.12761203 -0.77050555 -0.1770194 -0.50343562
##  [9,]  1.23256074  0.74714783 -1.27741709 -0.4228798  0.84139330
## [10,]  0.35585162 -0.07615556  0.24331754 -1.4063212  2.17006957
## [11,]  1.04799040  0.90151722 -0.46635862 -0.6687401  0.51868704
## [12,] -0.05943165  0.43840906 -0.01013823 -0.4228798  0.04603157
## [13,] -0.10557424  0.23258321  0.14193523  0.0688409 -0.85033618
## [14,] -1.71133621 -1.98004466  0.75022909  1.2981426 -1.19259198
## [15,]  1.00184782  0.74714783 -1.27741709 -1.1604608  0.19632741
## [16,] -2.41270351 -2.49460928  2.57511065  1.5440030 -1.27016627
## [17,]  0.86342006  0.69569137 -0.97327017 -0.1770194  0.62821999
## [18,] -1.76670731 -1.41402358  1.40921410  3.0191651 -1.29526179
## [19,] -0.24400199  0.95297368  0.04055292  1.0522823 -0.84908833
## [20,]  0.21742386 -0.07615556  0.54746447  0.0688409  0.76201657
## [21,] -0.79771302 -0.59072018  1.45990525  0.8064219 -0.82621120
## [22,]  0.17128128  0.18112675 -0.16221169 -0.4228798  0.01143857
## [23,] -0.38242975  0.02675736  0.24331754  0.3147012 -0.97324878
## [24,] -1.67442214 -1.87713174  1.51059641  0.5605616 -1.07668116
## [25,]  1.00184782  0.95297368 -1.02396132 -0.4228798  1.11792933
##          GradRate
##  [1,]  0.80372917
##  [2,] -0.63150149
##  [3,] -1.62512272
##  [4,]  0.14131502
##  [5,]  0.36211974
##  [6,]  0.91413153
##  [7,]  0.91413153
##  [8,]  0.58292445
##  [9,]  1.13493625
## [10,]  0.03091266
## [11,]  0.47252209
## [12,]  0.25171738
## [13,]  0.80372917
## [14,] -0.74190385
## [15,]  0.91413153
## [16,] -1.95632979
## [17,]  0.69332681
## [18,] -2.17713451
## [19,] -0.96270857
## [20,]  0.03091266
## [21,] -0.18989206
## [22,]  0.36211974
## [23,]  0.58292445
## [24,] -1.73552508
## [25,]  1.02453389
## attr(,"scaled:center")
##      SAT    Top10   Accept  SFRatio Expenses GradRate 
##  1266.44    76.48    39.20    12.72 27388.00    86.72 
## attr(,"scaled:scale")
##          SAT        Top10       Accept      SFRatio     Expenses 
##   108.359771    19.433905    19.727308     4.067350 14424.883165 
##     GradRate 
##     9.057778
# Pairwise Euclidean distances between the rows of the standardized data.
d <- dist(x = normal, method = "euclidean")
d
##            1         2         3         4         5         6         7
## 2  3.7857938                                                            
## 3  3.5430636 4.1848461                                                  
## 4  1.1531123 3.1908062 2.7831649                                        
## 5  0.8254086 3.8460831 2.8112483 0.9853405                              
## 6  1.0321529 2.9846624 3.5437922 1.1695165 1.4203093                    
## 7  0.7879842 3.2655674 3.4195166 1.1003951 1.0375485 0.6520549          
## 8  1.0023943 4.0861104 3.0109877 1.0421418 0.7743085 1.5037735 1.3482156
## 9  1.6050500 2.8206003 4.2390400 1.7026482 2.1590538 0.9460827 1.2992183
## 10 3.2805612 2.0559324 3.1030668 2.4711581 3.0557018 2.5422276 2.6325334
## 11 1.4188069 2.5317720 3.3203915 1.3317046 1.6013507 0.7474162 0.9438530
## 12 1.2968620 3.3913483 2.4973647 1.0656666 0.7583965 1.3511703 1.0445978
## 13 1.3157492 4.4925822 3.0064051 1.7164830 0.8279045 1.9141752 1.5054133
## 14 4.3171337 6.6524863 3.3246017 3.9678543 3.6404851 4.8991316 4.5191196
## 15 1.5263446 2.9392629 3.8980026 1.6688113 2.0042855 0.7635247 1.3939767
## 16 6.3517374 8.0714998 4.2949603 5.9159998 5.5800729 6.8296647 6.4227353
## 17 1.0984372 2.8379459 3.6581824 1.0931552 1.5171256 0.6623822 0.7693644
## 18 5.7162085 7.7424127 4.5372101 5.4290964 5.0591391 6.4025826 5.9297396
## 19 2.3890592 4.6851664 2.9363624 2.4594499 1.8928988 3.0929822 2.6712584
## 20 2.0846640 3.3231017 2.3019346 1.4398677 1.5301082 2.0285523 1.6550634
## 21 3.1913126 5.4826103 2.4975374 3.0079455 2.4295076 3.6936140 3.2173864
## 22 1.1494113 3.3976259 2.5479464 0.8062669 0.6681441 1.1720119 0.9366030
## 23 1.6643553 4.7822435 2.8844732 1.9151260 1.0590191 2.3277861 1.8949020
## 24 4.8579911 6.5384195 2.6773901 4.3438724 4.0727756 5.2769065 4.9465303
## 25 1.6882067 2.5461162 4.1262865 1.7067347 2.1416714 0.9657764 1.2000969
##            8         9        10        11        12        13        14
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9  2.2330469                                                            
## 10 3.1948284 2.7767763                                                  
## 11 1.9444887 1.1485169 2.3247902                                        
## 12 1.1716344 2.1870609 2.4553871 1.4076914                              
## 13 1.0917100 2.6987736 3.4949551 2.1624417 1.1906306                    
## 14 3.5677474 5.5531073 5.2320055 5.0664881 3.8261280 3.4440555          
## 15 1.9532207 1.0305916 2.8537614 1.1065707 1.9578142 2.4747924 5.4430301
## 16 5.6671223 7.5181213 6.5908909 6.8378478 5.5946826 5.3526678 2.3723788
## 17 1.7177675 0.7296074 2.5808399 0.7974530 1.5620122 2.1577457 4.9305912
## 18 5.2560351 6.9474483 6.6425435 6.3807336 5.2827001 4.9864409 2.4062530
## 19 2.4216283 3.6597256 4.1961141 2.9700953 2.1802719 2.1531056 3.3891744
## 20 1.9520641 2.5505847 2.0664907 1.8632703 1.2064514 1.8870687 3.6489650
## 21 2.7086609 4.4104528 4.1228945 3.7104626 2.4964381 2.1037634 1.9872874
## 22 0.9572291 1.9941031 2.4497464 1.3076787 0.3948852 1.1624396 3.8311793
## 23 1.2612810 3.0953932 3.7017207 2.5534614 1.4250197 0.5035883 2.9699570
## 24 4.1198754 5.9996772 5.1479941 5.2889154 4.0570101 3.9403360 1.4611148
## 25 2.3114020 0.4984911 2.4786797 1.0197810 2.0408670 2.6961591 5.5256172
##           15        16        17        18        19        20        21
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9                                                                       
## 10                                                                      
## 11                                                                      
## 12                                                                      
## 13                                                                      
## 14                                                                      
## 15                                                                      
## 16 7.3969180                                                            
## 17 1.1475009 6.8621046                                                  
## 18 6.9859561 2.2737148 6.2609603                                        
## 19 3.5837419 4.9416046 2.9591368 3.9162571                              
## 20 2.6897220 5.2089193 1.9603148 4.9276322 2.4655789                    
## 21 4.3190653 3.3672088 3.7679123 3.2687492 2.3156778 2.2897764          
## 22 1.7711099 5.6758983 1.3972163 5.3640129 2.3411752 1.2193516 2.5852501
## 23 2.8985394 4.8952748 2.5096574 4.5152111 1.9663610 1.9593956 1.7013168
## 24 5.7814732 1.7643346 5.3459976 2.5535840 3.6204225 3.8069831 2.2222077
## 25 1.2297027 7.4131967 0.7056109 6.8636551 3.5608604 2.3451126 4.3077487
##           22        23        24
## 2                               
## 3                               
## 4                               
## 5                               
## 6                               
## 7                               
## 8                               
## 9                               
## 10                              
## 11                              
## 12                              
## 13                              
## 14                              
## 15                              
## 16                              
## 17                              
## 18                              
## 19                              
## 20                              
## 21                              
## 22                              
## 23 1.4343185                    
## 24 4.1222095 3.5137427          
## 25 1.9212796 3.0809361 5.9158514
# Hierarchical clustering on the distance matrix using complete linkage.
# The method is named explicitly so it shows up in the printed call.
hcluster <- hclust(d = d, method = "complete")
hcluster
## 
## Call:
## hclust(d = d, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 25
# Dendrogram of the clustering. A negative `hang` makes every label hang
# down from height 0 instead of from the height of its own leaf.
plot(hcluster, hang = -100)

# Cut the tree into six clusters and print the cluster label of each row.
group <- cutree(tree = hcluster, k = 6)
group
##  [1] 1 2 3 1 1 4 4 1 4 2 4 1 1 5 4 6 4 6 1 1 5 1 1 5 4
# Put each row's cluster label next to its original (unscaled) values and
# open the combined table in the data viewer.
final <- data.frame(group = group, mydata)
View(final)
# Visualize the six-cluster cut on the dendrogram.
# rect.hclust() only adds rectangles to a dendrogram that is already drawn,
# so the plot is produced first as its own statement. It was previously
# passed as a positional argument to rect.hclust(), where it was bound to the
# `which` parameter and only worked because plot() returns NULL.
plot(hcluster, hang = -100)
rect.hclust(hcluster, k = 6, border = "red")

# Mean of every variable within each cluster; these per-cluster profiles are
# what the clusters can be named from.
agg <- aggregate(x = mydata, by = list(final[["group"]]), FUN = mean)
agg
##   Group.1      SAT    Top10   Accept  SFRatio Expenses GradRate
## 1       1 1271.000 81.50000 35.40000 12.90000 23380.00 89.40000
## 2       2 1360.000 87.50000 34.50000  6.50000 61133.00 84.00000
## 3       3 1260.000 62.00000 59.00000  9.00000 25026.00 72.00000
## 4       4 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 5       5 1115.333 47.66667 63.66667 16.33333 12504.00 78.66667
## 6       6 1040.000 38.50000 78.50000 22.00000  8885.00 68.00000