# Hierarchical clustering

# University data

# The input is a CSV file read with base R, so readxl is not required:
# install.packages("readxl"); library(readxl)
input <- read.csv(
  file = "C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment\\Universities.csv"
)
View(input)  # open the raw data in the interactive viewer
# mydata <- input[1:25, c(1, 2:7)]

# Standardize columns 2-7 of the first 25 rows. Column 1 (the university
# name) is left out because it is not numeric.
normalized_data <- scale(x = input[1:25, 2:7])

# Pairwise Euclidean distances between the standardized rows.
d <- dist(x = normalized_data, method = "euclidean")
head(d)
## [1] 3.7857938 3.5430636 1.1531123 0.8254086 1.0321529 0.7879842
# Agglomerative hierarchical clustering on the distance matrix `d`
# using complete linkage.
fit <- hclust(d, method = "complete")
#?hclust  #help

# Cut the complete-linkage tree into 5 clusters; `groups` holds the
# cluster id assigned to each row of the distance matrix.
groups <- cutree(fit, k = 5)
groups
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  2  3  1  1  2  2  1  2  2  2  1  1  4  2  5  2  5  1  1  4  1  1  4  2
?cutree
## starting httpd help server ... done

# Complete-linkage dendrogram in its own graphics window.
windows()
#plot(fit) #display Dendogram
# NOTE(review): hang = 1 is kept as in the original; hang = -1 is the
# usual choice for aligning all labels at the bottom -- confirm intent.
plot(fit, hang = 1)

# rect.hclust() draws on the currently active device, so it must run
# while the dendrogram of `fit` is still the active plot (before the
# single-linkage plot below is opened). Cutting with k = 5 makes the
# rectangles outline the same 5 clusters stored in `groups`.
rect.hclust(fit, k = 5, border = "blue")

?rect.hclust

# Single-linkage clustering of the same distances, plotted in a
# separate window for comparison.
fit1 <- hclust(d, method = "single")
windows()
plot(fit1, hang = 1)

# Turn the cluster assignments into a one-column data frame and put that
# column in front of the original university data.
clustno <- as.data.frame(groups)
final <- cbind(clustno, input)  # same result as data.frame(clustno, input)
final
##    groups         Univ  SAT Top10 Accept SFRatio Expenses GradRate
## 1       1        Brown 1310    89     22      13    22704       94
## 2       2      CalTech 1415   100     25       6    63575       81
## 3       3          CMU 1260    62     59       9    25026       72
## 4       1     Columbia 1310    76     24      12    31510       88
## 5       1      Cornell 1280    83     33      13    21864       90
## 6       2    Dartmouth 1340    89     23      10    32162       95
## 7       2         Duke 1315    90     30      12    31585       95
## 8       1   Georgetown 1255    74     24      12    20126       92
## 9       2      Harvard 1400    91     14      11    39525       97
## 10      2 JohnsHopkins 1305    75     44       7    58691       87
## 11      2          MIT 1380    94     30      10    34870       91
## 12      1 Northwestern 1260    85     39      11    28052       89
## 13      1    NotreDame 1255    81     42      13    15122       94
## 14      4    PennState 1081    38     54      18    10185       80
## 15      2    Princeton 1375    91     14       8    30220       95
## 16      5       Purdue 1005    28     90      19     9066       69
## 17      2     Stanford 1360    90     20      12    36450       93
## 18      5     TexasA&M 1075    49     67      25     8704       67
## 19      1   UCBerkeley 1240    95     40      17    15140       78
## 20      1     UChicago 1290    75     50      13    38380       87
## 21      4    UMichigan 1180    65     68      16    15470       85
## 22      1        UPenn 1285    80     36      11    27553       90
## 23      1          UVA 1225    77     44      14    13349       92
## 24      4   UWisconsin 1085    40     69      15    11857       71
## 25      2         Yale 1375    95     19      11    43514       96

# Optionally load the xlsx package to write the data in xlsx format; here the clustered data is written to a CSV file instead.

# library(xlsx)  # only needed for xlsx output; base write.csv() is used

# Save the clustered data as CSV in the current working directory and
# show which directory that is.
write.csv(x = final, file = "aug_univ_clust.csv")
getwd()
## [1] "C:/Users/amits/Desktop/sconcept/datascience training/assignment"

# Mean of each numeric variable (columns 3-8 of `final`) per cluster.
aggregate(x = final[3:8], by = list(final[["groups"]]), FUN = mean)
##   Group.1      SAT    Top10   Accept   SFRatio Expenses GradRate
## 1       1 1271.000 81.50000 35.40000 12.900000 23380.00 89.40000
## 2       2 1362.778 90.55556 24.33333  9.666667 41176.89 92.22222
## 3       3 1260.000 62.00000 59.00000  9.000000 25026.00 72.00000
## 4       4 1115.333 47.66667 63.66667 16.333333 12504.00 78.66667
## 5       5 1040.000 38.50000 78.50000 22.000000  8885.00 68.00000