# Read the universities data set from the local CSV file.
cluster <- read.csv(
  file = "C:\\Users\\prakruthi\\Desktop\\datascience assignments\\clustering\\Universities.csv"
)
# Inspect the raw table in the data viewer and list its column names.
View(cluster)
colnames(cluster)
## [1] "Univ" "SAT" "Top10" "Accept" "SFRatio" "Expenses"
## [7] "GradRate"
# Exclude the first column (listed as "Univ" in the column names) so that
# only the remaining columns are kept for the analysis.
mydata <- cluster[, -1L]
View(mydata)
# Standardize every column: subtract its mean and divide by its standard
# deviation, so all variables contribute on a comparable scale.
normal <- scale(mydata, center = TRUE, scale = TRUE)
normal
## SAT Top10 Accept SFRatio Expenses
## [1,] 0.40199420 0.64423491 -0.87188786 0.0688409 -0.32471667
## [2,] 1.37098850 1.21025599 -0.71981439 -1.6521815 2.50865117
## [3,] -0.05943165 -0.74508957 1.00368486 -0.9146005 -0.16374483
## [4,] 0.40199420 -0.02469910 -0.77050555 -0.1770194 0.28575621
## [5,] 0.12513869 0.33549613 -0.31428516 0.0688409 -0.38294938
## [6,] 0.67884972 0.64423491 -0.82119670 -0.6687401 0.33095589
## [7,] 0.44813679 0.69569137 -0.46635862 -0.1770194 0.29095556
## [8,] -0.10557424 -0.12761203 -0.77050555 -0.1770194 -0.50343562
## [9,] 1.23256074 0.74714783 -1.27741709 -0.4228798 0.84139330
## [10,] 0.35585162 -0.07615556 0.24331754 -1.4063212 2.17006957
## [11,] 1.04799040 0.90151722 -0.46635862 -0.6687401 0.51868704
## [12,] -0.05943165 0.43840906 -0.01013823 -0.4228798 0.04603157
## [13,] -0.10557424 0.23258321 0.14193523 0.0688409 -0.85033618
## [14,] -1.71133621 -1.98004466 0.75022909 1.2981426 -1.19259198
## [15,] 1.00184782 0.74714783 -1.27741709 -1.1604608 0.19632741
## [16,] -2.41270351 -2.49460928 2.57511065 1.5440030 -1.27016627
## [17,] 0.86342006 0.69569137 -0.97327017 -0.1770194 0.62821999
## [18,] -1.76670731 -1.41402358 1.40921410 3.0191651 -1.29526179
## [19,] -0.24400199 0.95297368 0.04055292 1.0522823 -0.84908833
## [20,] 0.21742386 -0.07615556 0.54746447 0.0688409 0.76201657
## [21,] -0.79771302 -0.59072018 1.45990525 0.8064219 -0.82621120
## [22,] 0.17128128 0.18112675 -0.16221169 -0.4228798 0.01143857
## [23,] -0.38242975 0.02675736 0.24331754 0.3147012 -0.97324878
## [24,] -1.67442214 -1.87713174 1.51059641 0.5605616 -1.07668116
## [25,] 1.00184782 0.95297368 -1.02396132 -0.4228798 1.11792933
## GradRate
## [1,] 0.80372917
## [2,] -0.63150149
## [3,] -1.62512272
## [4,] 0.14131502
## [5,] 0.36211974
## [6,] 0.91413153
## [7,] 0.91413153
## [8,] 0.58292445
## [9,] 1.13493625
## [10,] 0.03091266
## [11,] 0.47252209
## [12,] 0.25171738
## [13,] 0.80372917
## [14,] -0.74190385
## [15,] 0.91413153
## [16,] -1.95632979
## [17,] 0.69332681
## [18,] -2.17713451
## [19,] -0.96270857
## [20,] 0.03091266
## [21,] -0.18989206
## [22,] 0.36211974
## [23,] 0.58292445
## [24,] -1.73552508
## [25,] 1.02453389
## attr(,"scaled:center")
## SAT Top10 Accept SFRatio Expenses GradRate
## 1266.44 76.48 39.20 12.72 27388.00 86.72
## attr(,"scaled:scale")
## SAT Top10 Accept SFRatio Expenses
## 108.359771 19.433905 19.727308 4.067350 14424.883165
## GradRate
## 9.057778
# Pairwise Euclidean distances between the standardized rows.
d <- dist(x = normal, method = "euclidean")
d
## 1 2 3 4 5 6 7
## 2 3.7857938
## 3 3.5430636 4.1848461
## 4 1.1531123 3.1908062 2.7831649
## 5 0.8254086 3.8460831 2.8112483 0.9853405
## 6 1.0321529 2.9846624 3.5437922 1.1695165 1.4203093
## 7 0.7879842 3.2655674 3.4195166 1.1003951 1.0375485 0.6520549
## 8 1.0023943 4.0861104 3.0109877 1.0421418 0.7743085 1.5037735 1.3482156
## 9 1.6050500 2.8206003 4.2390400 1.7026482 2.1590538 0.9460827 1.2992183
## 10 3.2805612 2.0559324 3.1030668 2.4711581 3.0557018 2.5422276 2.6325334
## 11 1.4188069 2.5317720 3.3203915 1.3317046 1.6013507 0.7474162 0.9438530
## 12 1.2968620 3.3913483 2.4973647 1.0656666 0.7583965 1.3511703 1.0445978
## 13 1.3157492 4.4925822 3.0064051 1.7164830 0.8279045 1.9141752 1.5054133
## 14 4.3171337 6.6524863 3.3246017 3.9678543 3.6404851 4.8991316 4.5191196
## 15 1.5263446 2.9392629 3.8980026 1.6688113 2.0042855 0.7635247 1.3939767
## 16 6.3517374 8.0714998 4.2949603 5.9159998 5.5800729 6.8296647 6.4227353
## 17 1.0984372 2.8379459 3.6581824 1.0931552 1.5171256 0.6623822 0.7693644
## 18 5.7162085 7.7424127 4.5372101 5.4290964 5.0591391 6.4025826 5.9297396
## 19 2.3890592 4.6851664 2.9363624 2.4594499 1.8928988 3.0929822 2.6712584
## 20 2.0846640 3.3231017 2.3019346 1.4398677 1.5301082 2.0285523 1.6550634
## 21 3.1913126 5.4826103 2.4975374 3.0079455 2.4295076 3.6936140 3.2173864
## 22 1.1494113 3.3976259 2.5479464 0.8062669 0.6681441 1.1720119 0.9366030
## 23 1.6643553 4.7822435 2.8844732 1.9151260 1.0590191 2.3277861 1.8949020
## 24 4.8579911 6.5384195 2.6773901 4.3438724 4.0727756 5.2769065 4.9465303
## 25 1.6882067 2.5461162 4.1262865 1.7067347 2.1416714 0.9657764 1.2000969
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 2.2330469
## 10 3.1948284 2.7767763
## 11 1.9444887 1.1485169 2.3247902
## 12 1.1716344 2.1870609 2.4553871 1.4076914
## 13 1.0917100 2.6987736 3.4949551 2.1624417 1.1906306
## 14 3.5677474 5.5531073 5.2320055 5.0664881 3.8261280 3.4440555
## 15 1.9532207 1.0305916 2.8537614 1.1065707 1.9578142 2.4747924 5.4430301
## 16 5.6671223 7.5181213 6.5908909 6.8378478 5.5946826 5.3526678 2.3723788
## 17 1.7177675 0.7296074 2.5808399 0.7974530 1.5620122 2.1577457 4.9305912
## 18 5.2560351 6.9474483 6.6425435 6.3807336 5.2827001 4.9864409 2.4062530
## 19 2.4216283 3.6597256 4.1961141 2.9700953 2.1802719 2.1531056 3.3891744
## 20 1.9520641 2.5505847 2.0664907 1.8632703 1.2064514 1.8870687 3.6489650
## 21 2.7086609 4.4104528 4.1228945 3.7104626 2.4964381 2.1037634 1.9872874
## 22 0.9572291 1.9941031 2.4497464 1.3076787 0.3948852 1.1624396 3.8311793
## 23 1.2612810 3.0953932 3.7017207 2.5534614 1.4250197 0.5035883 2.9699570
## 24 4.1198754 5.9996772 5.1479941 5.2889154 4.0570101 3.9403360 1.4611148
## 25 2.3114020 0.4984911 2.4786797 1.0197810 2.0408670 2.6961591 5.5256172
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 7.3969180
## 17 1.1475009 6.8621046
## 18 6.9859561 2.2737148 6.2609603
## 19 3.5837419 4.9416046 2.9591368 3.9162571
## 20 2.6897220 5.2089193 1.9603148 4.9276322 2.4655789
## 21 4.3190653 3.3672088 3.7679123 3.2687492 2.3156778 2.2897764
## 22 1.7711099 5.6758983 1.3972163 5.3640129 2.3411752 1.2193516 2.5852501
## 23 2.8985394 4.8952748 2.5096574 4.5152111 1.9663610 1.9593956 1.7013168
## 24 5.7814732 1.7643346 5.3459976 2.5535840 3.6204225 3.8069831 2.2222077
## 25 1.2297027 7.4131967 0.7056109 6.8636551 3.5608604 2.3451126 4.3077487
## 22 23 24
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23 1.4343185
## 24 4.1222095 3.5137427
## 25 1.9212796 3.0809361 5.9158514
# Hierarchical clustering of the distance matrix using complete linkage.
hcluster <- hclust(d = d, method = "complete")
hcluster
##
## Call:
## hclust(d = d, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 25
# Draw the dendrogram; a negative `hang` makes the labels hang down from 0.
plot(x = hcluster, hang = -100)

# Cut the tree into six clusters and show the cluster label of every row.
group <- cutree(tree = hcluster, k = 6)
group
## [1] 1 2 3 1 1 4 4 1 4 2 4 1 1 5 4 6 4 6 1 1 5 1 1 5 4
# Put the cluster label in front of the original feature columns.
final <- data.frame(group = group, mydata)
View(final)
# Visualize the six-cluster cut on the dendrogram.
# The tree is drawn explicitly first, because rect.hclust() only adds
# rectangles to the current plot. Passing plot(...) as an unnamed argument
# would bind it to the `which` parameter and draw the tree merely as a side
# effect of argument evaluation.
plot(hcluster, hang = -100)
rect.hclust(hcluster, k = 6, border = "red")

# Per-cluster mean of every variable, used to characterize and name the
# clusters.
agg <- aggregate(x = mydata, by = list(final[["group"]]), FUN = mean)
agg
## Group.1 SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 1271.000 81.50000 35.40000 12.90000 23380.00 89.40000
## 2 2 1360.000 87.50000 34.50000 6.50000 61133.00 84.00000
## 3 3 1260.000 62.00000 59.00000 9.00000 25026.00 72.00000
## 4 4 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 5 5 1115.333 47.66667 63.66667 16.33333 12504.00 78.66667
## 6 6 1040.000 38.50000 78.50000 22.00000 8885.00 68.00000