# Load the Universities data, standardize its numeric columns and build the
# distance matrix used by the hierarchical clustering further down.
# Only base R is used here; readxl would be needed only to read an Excel
# workbook instead of the CSV:
# install.packages("readxl"); library(readxl)
input <- read.csv("C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment\\Universities.csv")

# View() opens a GUI data viewer, so only call it in an interactive session;
# batch runs (e.g. Rscript) skip it instead of trying to open a viewer.
if (interactive()) {
  View(input)
}

# mydata <- input[1:25, c(1, 2:7)]
# Standardize (center and scale) columns 2:7 of the first 25 rows; column 1,
# the university name, is excluded because it is not numeric.
normalized_data <- scale(input[1:25, 2:7])

# Pairwise Euclidean distances between the standardized rows.
d <- dist(normalized_data, method = "euclidean")
head(d)
## [1] 3.7857938 3.5430636 1.1531123 0.8254086 1.0321529 0.7879842
# Hierarchical clustering of the distance matrix `d` with two linkage
# methods, plus a cut of the complete-linkage tree into clusters.

# Number of clusters; shared by cutree() and rect.hclust() so the boxes drawn
# on the dendrogram always match the cluster labels stored in `groups`.
n_clusters <- 5

# Complete-linkage tree.
fit <- hclust(d, method = "complete")
# ?hclust # help
dev.new() # opens a new graphics device on any platform (windows() is Windows-only)
# plot(fit) # display dendrogram
# NOTE(review): hang = 1 is kept from the original; hang = -1 is the usual
# choice to align all labels at the bottom of the plot -- confirm the intent.
plot(fit, hang = 1)

# Cut the complete-linkage tree and outline the resulting clusters.
# rect.hclust() draws on the current plot, so it must run while the
# dendrogram of `fit` is still the active plot (before `fit1` is plotted).
groups <- cutree(fit, k = n_clusters)
rect.hclust(fit, k = n_clusters, border = "blue")
groups
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 2 3 1 1 2 2 1 2 2 2 1 1 4 2 5 2 5 1 1 4 1 1 4 2

# Single-linkage tree, plotted on its own device for comparison.
fit1 <- hclust(d, method = "single")
dev.new()
plot(fit1, hang = 1)

# Help pages (run interactively when needed):
# ?cutree
# ?rect.hclust
# Attach the cluster labels to the original records, save the labelled table
# and summarize each cluster by the mean of its numeric columns.
clustno <- as.data.frame(groups)

# `groups` has one label per clustered row (the first rows of `input`), so
# bind it only to those rows; binding to all of `input` would fail or recycle
# the labels if the CSV ever held more rows than were clustered.
final <- data.frame(clustno, input[seq_along(groups), ]) # or cbind(clustno, ...)
final
## groups Univ SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 Brown 1310 89 22 13 22704 94
## 2 2 CalTech 1415 100 25 6 63575 81
## 3 3 CMU 1260 62 59 9 25026 72
## 4 1 Columbia 1310 76 24 12 31510 88
## 5 1 Cornell 1280 83 33 13 21864 90
## 6 2 Dartmouth 1340 89 23 10 32162 95
## 7 2 Duke 1315 90 30 12 31585 95
## 8 1 Georgetown 1255 74 24 12 20126 92
## 9 2 Harvard 1400 91 14 11 39525 97
## 10 2 JohnsHopkins 1305 75 44 7 58691 87
## 11 2 MIT 1380 94 30 10 34870 91
## 12 1 Northwestern 1260 85 39 11 28052 89
## 13 1 NotreDame 1255 81 42 13 15122 94
## 14 4 PennState 1081 38 54 18 10185 80
## 15 2 Princeton 1375 91 14 8 30220 95
## 16 5 Purdue 1005 28 90 19 9066 69
## 17 2 Stanford 1360 90 20 12 36450 93
## 18 5 TexasA&M 1075 49 67 25 8704 67
## 19 1 UCBerkeley 1240 95 40 17 15140 78
## 20 1 UChicago 1290 75 50 13 38380 87
## 21 4 UMichigan 1180 65 68 16 15470 85
## 22 1 UPenn 1285 80 36 11 27553 90
## 23 1 UVA 1225 77 44 14 13349 92
## 24 4 UWisconsin 1085 40 69 15 11857 71
## 25 2 Yale 1375 95 19 11 43514 96

# library(xlsx)
# Write the labelled table to the current working directory; getwd() prints
# that directory so the output file can be located.
write.csv(final, file = "aug_univ_clust.csv")
getwd()
## [1] "C:/Users/amits/Desktop/sconcept/datascience training/assignment"

# Per-cluster mean of the numeric columns (columns 3:8 of `final`; column 1 is
# the cluster label and column 2 the university name).
aggregate(final[, 3:8], by = list(final$groups), FUN = mean)
## Group.1 SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 1271.000 81.50000 35.40000 12.900000 23380.00 89.40000
## 2 2 1362.778 90.55556 24.33333 9.666667 41176.89 92.22222
## 3 3 1260.000 62.00000 59.00000 9.000000 25026.00 72.00000
## 4 4 1115.333 47.66667 63.66667 16.333333 12504.00 78.66667
## 5 5 1040.000 38.50000 78.50000 22.000000 8885.00 68.00000