# Hierarchical Clustering Implementation

library(readxl)

## Warning: package 'readxl' was built under R version 3.4.4

# Load the university data set from the Excel workbook.
# NOTE(review): the path is absolute and machine-specific; adjust it for
# the environment the script runs in.
input <- read_excel("E:\\DataScience Yogesh\\R _Codes\\R _Codes\\Clustering\\University_Clustering.xlsx")
#rm(University_Clustering)
#View(input)

# Keep column 1 and columns 3 to 8 of the sheet; column 2 is dropped.
mydata <- input[, c(1, 3:8)]

# View() opens a GUI data viewer, so only call it from an interactive
# session; this keeps the script usable in batch mode (e.g. Rscript).
if (interactive()) {
  View(mydata)
}

# Standardise columns 2 to 7 of mydata; the first column (the university
# name) is excluded before normalizing the data.
normalized_data <- scale(mydata[, 2:7])

d <- dist(normalized_data, method = "euclidean") # Distance matrix
class(d)

## [1] "dist"

# Hierarchical clustering of the distance matrix `d` with complete linkage.
fit <- hclust(d, method = "complete")

# Dendrogram; a negative `hang` makes the labels hang down from 0.
# windows()
plot(fit, hang = -1)

# Alternative linkage methods, kept disabled for experimentation:
# fit1 <- hclust(d, method = "single")
# fit2 <- hclust(d, method = "average")
# fit3 <- hclust(d, method = "centroid")
#
# Matching dendrograms (call windows() first to open a new device):
# plot(fit1, hang = -1)
# plot(fit2, hang = -1)
# plot(fit3, hang = -1)

# Dendrogram again, this time with the default `hang`.
plot(fit)

# windows()
# plot(fit, hang = -1)

# Number of clusters, shared by the cluster assignment and the dendrogram
# highlight below so the two always describe the same partition.
n_clusters <- 6L

# Cut the tree into n_clusters groups; `groups` holds one integer cluster
# id per observation.
groups <- cutree(fit, k = n_clusters)
groups

##  [1] 1 2 3 1 1 4 4 1 4 2 4 1 1 5 4 6 4 6 1 1 5 1 1 5 4

str(groups)

##  int [1:25] 1 2 3 1 1 4 4 1 4 2 ...

#?cutree
#windows()
plot(fit, hang = -1)

# Outline the clusters on the dendrogram. This previously used k = 2, which
# did not match the k = 6 partition stored in `groups`.
rect.hclust(fit, k = n_clusters, border = "red")

# Attach the cluster ids to the original data: the first column of `final`
# is `groups`, followed by every column of `input`.
clustno <- as.data.frame(groups)
final <- data.frame(clustno, input)

#View(final)

# Per-cluster mean of columns 4 to 9 of `final` (SAT through GradRate in
# the output below).
x <- aggregate(
  final[, 4:9],
  by = list(final[["groups"]]),
  FUN = mean
)
x

##   Group.1      SAT    Top10   Accept  SFRatio Expenses GradRate
## 1       1 1271.000 81.50000 35.40000 12.90000 23380.00 89.40000
## 2       2 1360.000 87.50000 34.50000  6.50000 61133.00 84.00000
## 3       3 1260.000 62.00000 59.00000  9.00000 25026.00 72.00000
## 4       4 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 5       5 1115.333 47.66667 63.66667 16.33333 12504.00 78.66667
## 6       6 1040.000 38.50000 78.50000 22.00000  8885.00 68.00000

#write.csv(x,file="apr23a.csv")
#getwd()