# Hierarchical Clustering Implementation
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.4
# Load the university data set from the Excel workbook.
# NOTE(review): this is an absolute, machine-specific Windows path; point it
# at the local copy of University_Clustering.xlsx before running elsewhere.
input <- read_excel("E:\\DataScience Yogesh\\R _Codes\\R _Codes\\Clustering\\University_Clustering.xlsx")
#rm(University_Clustering)
#View(input)

# Keep column 1 (the university name) together with columns 3 to 8.
mydata <- input[, c(1, 3:8)]

# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session; skip it when the script runs via Rscript or knitr.
if (interactive()) {
  View(mydata)
}

# Standardize the six measurement columns so that each one contributes on a
# comparable scale to the distances; the university name column (column 1 of
# mydata) is excluded.
normalized_data <- scale(mydata[, 2:7])

# Pairwise Euclidean distances between the standardized rows.
d <- dist(normalized_data, method = "euclidean") # Distance matrix
class(d)
## [1] "dist"

# Agglomerative hierarchical clustering with complete linkage.
fit <- hclust(d, method = "complete")
#windows()
# hang = -1 draws every leaf label down at the same baseline.
plot(fit, hang = -1)

# Alternative linkage methods tried during exploration (kept for reference):
#fit1 <- hclust(d,method="single")
#windows()
#plot(fit1,hang=-1)
#fit2 <- hclust(d,method="average")
#windows()
#plot(fit2,hang=-1)
#fit3 <- hclust(d,method="centroid")
#windows()
#plot(fit3,hang=-1)

plot(fit) # Display Dendrogram

# Number of clusters to extract from the tree. The same value is used both for
# the cluster assignment and for the rectangles drawn on the dendrogram, so
# the picture always matches the groups analysed afterwards.
n_clusters <- 6

# Cut the tree into n_clusters groups; the result is an integer vector with
# one cluster id per observation.
groups <- cutree(fit, k = n_clusters)
groups
## [1] 1 2 3 1 1 4 4 1 4 2 4 1 1 5 4 6 4 6 1 1 5 1 1 5 4
str(groups)
## int [1:25] 1 2 3 1 1 4 4 1 4 2 ...
#?cutree
#windows()

# Redraw the dendrogram with aligned leaves and outline each cluster in red.
plot(fit, hang = -1)
rect.hclust(fit, k = n_clusters, border = "red")

# Put the cluster id of every observation in a one-column data frame and
# prepend it to the original data, so `groups` becomes the first column.
clustno <- data.frame(groups = groups)
final <- data.frame(clustno, input)
#View(final)

# Per-cluster mean of columns 4 to 9 of `final` (the six measurement columns
# shown in the output below). The unnamed `by` list yields the "Group.1" key.
x <- aggregate(
  x = final[4:9],
  by = list(final[["groups"]]),
  FUN = mean
)
x
## Group.1 SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 1271.000 81.50000 35.40000 12.90000 23380.00 89.40000
## 2 2 1360.000 87.50000 34.50000 6.50000 61133.00 84.00000
## 3 3 1260.000 62.00000 59.00000 9.00000 25026.00 72.00000
## 4 4 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 5 5 1115.333 47.66667 63.66667 16.33333 12504.00 78.66667
## 6 6 1040.000 38.50000 78.50000 22.00000 8885.00 68.00000
#write.csv(x,file="apr23a.csv")
#getwd()