# Clustering ----

# Hierarchical Clustering ----

# Load package 'readxl' (for .xlsx input); the CSV below is read with base R's read.csv()
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.4
# Read the universities data set. The first column holds the university
# name; columns 2-7 are the numeric measures used for clustering.
input <- read.csv("E:\\Excelr DS\\R _Codes\\Clustering\\Universities.csv")
#View(input)
mydata <- input
#View(mydata)

# Standardise every measure (centre and scale) so that no single column
# dominates the Euclidean distance purely because of its units.
normalized_data <- scale(mydata[, 2:7]) # Excluding the university name
# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script runs in batch mode.
if (interactive()) {
  View(normalized_data)
}

# Calculate Distance
d <- dist(normalized_data, method = "euclidean") # Distance matrix
# Explicit print() so the matrix is shown under source() as well as at the
# console.
print(d)
##            1         2         3         4         5         6         7
## 2  3.7857938                                                            
## 3  3.5430636 4.1848461                                                  
## 4  1.1531123 3.1908062 2.7831649                                        
## 5  0.8254086 3.8460831 2.8112483 0.9853405                              
## 6  1.0321529 2.9846624 3.5437922 1.1695165 1.4203093                    
## 7  0.7879842 3.2655674 3.4195166 1.1003951 1.0375485 0.6520549          
## 8  1.0023943 4.0861104 3.0109877 1.0421418 0.7743085 1.5037735 1.3482156
## 9  1.6050500 2.8206003 4.2390400 1.7026482 2.1590538 0.9460827 1.2992183
## 10 3.2805612 2.0559324 3.1030668 2.4711581 3.0557018 2.5422276 2.6325334
## 11 1.4188069 2.5317720 3.3203915 1.3317046 1.6013507 0.7474162 0.9438530
## 12 1.2968620 3.3913483 2.4973647 1.0656666 0.7583965 1.3511703 1.0445978
## 13 1.3157492 4.4925822 3.0064051 1.7164830 0.8279045 1.9141752 1.5054133
## 14 4.3171337 6.6524863 3.3246017 3.9678543 3.6404851 4.8991316 4.5191196
## 15 1.5263446 2.9392629 3.8980026 1.6688113 2.0042855 0.7635247 1.3939767
## 16 6.3517374 8.0714998 4.2949603 5.9159998 5.5800729 6.8296647 6.4227353
## 17 1.0984372 2.8379459 3.6581824 1.0931552 1.5171256 0.6623822 0.7693644
## 18 5.7162085 7.7424127 4.5372101 5.4290964 5.0591391 6.4025826 5.9297396
## 19 2.3890592 4.6851664 2.9363624 2.4594499 1.8928988 3.0929822 2.6712584
## 20 2.0846640 3.3231017 2.3019346 1.4398677 1.5301082 2.0285523 1.6550634
## 21 3.1913126 5.4826103 2.4975374 3.0079455 2.4295076 3.6936140 3.2173864
## 22 1.1494113 3.3976259 2.5479464 0.8062669 0.6681441 1.1720119 0.9366030
## 23 1.6643553 4.7822435 2.8844732 1.9151260 1.0590191 2.3277861 1.8949020
## 24 4.8579911 6.5384195 2.6773901 4.3438724 4.0727756 5.2769065 4.9465303
## 25 1.6882067 2.5461162 4.1262865 1.7067347 2.1416714 0.9657764 1.2000969
##            8         9        10        11        12        13        14
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9  2.2330469                                                            
## 10 3.1948284 2.7767763                                                  
## 11 1.9444887 1.1485169 2.3247902                                        
## 12 1.1716344 2.1870609 2.4553871 1.4076914                              
## 13 1.0917100 2.6987736 3.4949551 2.1624417 1.1906306                    
## 14 3.5677474 5.5531073 5.2320055 5.0664881 3.8261280 3.4440555          
## 15 1.9532207 1.0305916 2.8537614 1.1065707 1.9578142 2.4747924 5.4430301
## 16 5.6671223 7.5181213 6.5908909 6.8378478 5.5946826 5.3526678 2.3723788
## 17 1.7177675 0.7296074 2.5808399 0.7974530 1.5620122 2.1577457 4.9305912
## 18 5.2560351 6.9474483 6.6425435 6.3807336 5.2827001 4.9864409 2.4062530
## 19 2.4216283 3.6597256 4.1961141 2.9700953 2.1802719 2.1531056 3.3891744
## 20 1.9520641 2.5505847 2.0664907 1.8632703 1.2064514 1.8870687 3.6489650
## 21 2.7086609 4.4104528 4.1228945 3.7104626 2.4964381 2.1037634 1.9872874
## 22 0.9572291 1.9941031 2.4497464 1.3076787 0.3948852 1.1624396 3.8311793
## 23 1.2612810 3.0953932 3.7017207 2.5534614 1.4250197 0.5035883 2.9699570
## 24 4.1198754 5.9996772 5.1479941 5.2889154 4.0570101 3.9403360 1.4611148
## 25 2.3114020 0.4984911 2.4786797 1.0197810 2.0408670 2.6961591 5.5256172
##           15        16        17        18        19        20        21
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9                                                                       
## 10                                                                      
## 11                                                                      
## 12                                                                      
## 13                                                                      
## 14                                                                      
## 15                                                                      
## 16 7.3969180                                                            
## 17 1.1475009 6.8621046                                                  
## 18 6.9859561 2.2737148 6.2609603                                        
## 19 3.5837419 4.9416046 2.9591368 3.9162571                              
## 20 2.6897220 5.2089193 1.9603148 4.9276322 2.4655789                    
## 21 4.3190653 3.3672088 3.7679123 3.2687492 2.3156778 2.2897764          
## 22 1.7711099 5.6758983 1.3972163 5.3640129 2.3411752 1.2193516 2.5852501
## 23 2.8985394 4.8952748 2.5096574 4.5152111 1.9663610 1.9593956 1.7013168
## 24 5.7814732 1.7643346 5.3459976 2.5535840 3.6204225 3.8069831 2.2222077
## 25 1.2297027 7.4131967 0.7056109 6.8636551 3.5608604 2.3451126 4.3077487
##           22        23        24
## 2                               
## 3                               
## 4                               
## 5                               
## 6                               
## 7                               
## 8                               
## 9                               
## 10                              
## 11                              
## 12                              
## 13                              
## 14                              
## 15                              
## 16                              
## 17                              
## 18                              
## 19                              
## 20                              
## 21                              
## 22                              
## 23 1.4343185                    
## 24 4.1222095 3.5137427          
## 25 1.9212796 3.0809361 5.9158514
# Inspect the structure (class, size, attributes) of the distance object.
str(d)
## Class 'dist'  atomic [1:300] 3.786 3.543 1.153 0.825 1.032 ...
##   ..- attr(*, "Size")= int 25
##   ..- attr(*, "Diag")= logi FALSE
##   ..- attr(*, "Upper")= logi FALSE
##   ..- attr(*, "method")= chr "euclidean"
##   ..- attr(*, "call")= language dist(x = normalized_data, method = "euclidean")
# Agglomerative hierarchical clustering of the distance matrix `d`, once per
# linkage method so the resulting dendrograms can be compared.
fit <- hclust(d, method = "complete")
fit1 <- hclust(d, method = "single")
fit2 <- hclust(d, method = "average")
# Centroid linkage is meant to be used with *squared* Euclidean distances
# (see the example in ?hclust), hence d^2 rather than d.
fit3 <- hclust(d^2, method = "centroid")

#  Plot Dendrogram
plot(fit)

# plot() is called for its side effect only; its return value carries
# nothing useful, so it is not stored.
plot(fit, hang = -1) # Complete Linkage

#plot(fit1,hang=-1)  # Single Linkage
#plot(fit2,hang=-1)  # Average Linkage
#plot(fit3,hang=-1)  # Centroid Linkage

# Cut the complete-linkage tree into 4 clusters; `groups` holds one cluster
# id per row of the input data.
groups <- cutree(fit, k = 4)
print(groups)
##  [1] 1 2 3 1 1 2 2 1 2 2 2 1 1 3 2 4 2 4 1 1 3 1 1 3 2
str(groups)
##  int [1:25] 1 2 3 1 1 2 2 1 2 2 ...
#?cutree
# Draw the complete-linkage dendrogram with all leaves on the baseline, then
# outline the k = 4 clusters in red. rect.hclust() draws on the current plot,
# so the dendrogram must be plotted first. The earlier form passed the plot()
# call as a positional argument to rect.hclust(), which only worked by
# accident of lazy evaluation.
plot(fit, hang = -1)
rect.hclust(fit, k = 4, border = "red")

#?rect.hclust
#class(clustno)
str(groups)
##  int [1:25] 1 2 3 1 1 2 2 1 2 2 ...
# One-column data frame of cluster ids; the column is named `groups`.
clustno <- as.data.frame(groups)
# View() is only meaningful in an interactive session.
if (interactive()) {
  View(clustno)
}

# Attach the cluster id to the original records (cluster id becomes the
# first column). A single cbind() is enough; the previous data.frame() result
# was immediately overwritten.
final <- cbind(clustno, mydata)
if (interactive()) {
  View(final)
}

# Mean of each numeric measure per cluster. Columns 3:8 of `final` are the
# measures: column 1 is the cluster id and column 2 the university name.
aggregate(final[, 3:8], by = list(final$groups), FUN = mean)
##   Group.1      SAT    Top10   Accept   SFRatio Expenses GradRate
## 1       1 1271.000 81.50000 35.40000 12.900000 23380.00 89.40000
## 2       2 1362.778 90.55556 24.33333  9.666667 41176.89 92.22222
## 3       3 1151.500 51.25000 62.50000 14.500000 15634.50 77.00000
## 4       4 1040.000 38.50000 78.50000 22.000000  8885.00 68.00000
# Write the clustered data to a CSV file (base R write.csv(); no extra package needed)
#write.csv(final,file="E:/Excelr DS/R _Codes/Clustering/final1July.csv")
#getwd()
#setwd()



#setwd("E:\\Excelr DS\\April")
# Explore data.table::setcolorder() for repositioning the columns in R
# (requires installing the package data.table)
#getwd()