Import Data from Excel File
library(readxl) # use for read_excel function
# Location of the Excel workbook holding the university data set.
xlsx_path <- "C:\\Users\\Pawan Srivastav\\Downloads\\University_Clustering.xlsx"
# Read the workbook into `input` and display it.
input <- read_excel(path = xlsx_path)
input
## # A tibble: 25 x 8
## Univ State SAT Top10 Accept SFRatio Expenses GradRate
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brown RI 1310 89 22 13 22704 94
## 2 CalTech CA 1415 100 25 6 63575 81
## 3 CMU PA 1260 62 59 9 25026 72
## 4 Columbia NY 1310 76 24 12 31510 88
## 5 Cornell NY 1280 83 33 13 21864 90
## 6 Dartmouth NH 1340 89 23 10 32162 95
## 7 Duke NC 1315 90 30 12 31585 95
## 8 Georgetown DC 1255 74 24 12 20126 92
## 9 Harvard MA 1400 91 14 11 39525 97
## 10 JohnsHopkins MD 1305 75 44 7 58691 87
## # ... with 15 more rows
Extracting the numeric columns from the main data
# Keep columns 3 through 8 of `input` (SAT ... GradRate in the printout above);
# columns 1 and 2 are the Univ and State labels.
mydata <- input[3:8]
mydata
## # A tibble: 25 x 6
## SAT Top10 Accept SFRatio Expenses GradRate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1310 89 22 13 22704 94
## 2 1415 100 25 6 63575 81
## 3 1260 62 59 9 25026 72
## 4 1310 76 24 12 31510 88
## 5 1280 83 33 13 21864 90
## 6 1340 89 23 10 32162 95
## 7 1315 90 30 12 31585 95
## 8 1255 74 24 12 20126 92
## 9 1400 91 14 11 39525 97
## 10 1305 75 44 7 58691 87
## # ... with 15 more rows
Normalizing the input
# Standardise every column: subtract the column mean and divide by the column
# standard deviation (both are kept as attributes on the result).
normalizedinput <- scale(mydata, center = TRUE, scale = TRUE)
normalizedinput
## SAT Top10 Accept SFRatio Expenses
## [1,] 0.40199420 0.64423491 -0.87188786 0.0688409 -0.32471667
## [2,] 1.37098850 1.21025599 -0.71981439 -1.6521815 2.50865117
## [3,] -0.05943165 -0.74508957 1.00368486 -0.9146005 -0.16374483
## [4,] 0.40199420 -0.02469910 -0.77050555 -0.1770194 0.28575621
## [5,] 0.12513869 0.33549613 -0.31428516 0.0688409 -0.38294938
## [6,] 0.67884972 0.64423491 -0.82119670 -0.6687401 0.33095589
## [7,] 0.44813679 0.69569137 -0.46635862 -0.1770194 0.29095556
## [8,] -0.10557424 -0.12761203 -0.77050555 -0.1770194 -0.50343562
## [9,] 1.23256074 0.74714783 -1.27741709 -0.4228798 0.84139330
## [10,] 0.35585162 -0.07615556 0.24331754 -1.4063212 2.17006957
## [11,] 1.04799040 0.90151722 -0.46635862 -0.6687401 0.51868704
## [12,] -0.05943165 0.43840906 -0.01013823 -0.4228798 0.04603157
## [13,] -0.10557424 0.23258321 0.14193523 0.0688409 -0.85033618
## [14,] -1.71133621 -1.98004466 0.75022909 1.2981426 -1.19259198
## [15,] 1.00184782 0.74714783 -1.27741709 -1.1604608 0.19632741
## [16,] -2.41270351 -2.49460928 2.57511065 1.5440030 -1.27016627
## [17,] 0.86342006 0.69569137 -0.97327017 -0.1770194 0.62821999
## [18,] -1.76670731 -1.41402358 1.40921410 3.0191651 -1.29526179
## [19,] -0.24400199 0.95297368 0.04055292 1.0522823 -0.84908833
## [20,] 0.21742386 -0.07615556 0.54746447 0.0688409 0.76201657
## [21,] -0.79771302 -0.59072018 1.45990525 0.8064219 -0.82621120
## [22,] 0.17128128 0.18112675 -0.16221169 -0.4228798 0.01143857
## [23,] -0.38242975 0.02675736 0.24331754 0.3147012 -0.97324878
## [24,] -1.67442214 -1.87713174 1.51059641 0.5605616 -1.07668116
## [25,] 1.00184782 0.95297368 -1.02396132 -0.4228798 1.11792933
## GradRate
## [1,] 0.80372917
## [2,] -0.63150149
## [3,] -1.62512272
## [4,] 0.14131502
## [5,] 0.36211974
## [6,] 0.91413153
## [7,] 0.91413153
## [8,] 0.58292445
## [9,] 1.13493625
## [10,] 0.03091266
## [11,] 0.47252209
## [12,] 0.25171738
## [13,] 0.80372917
## [14,] -0.74190385
## [15,] 0.91413153
## [16,] -1.95632979
## [17,] 0.69332681
## [18,] -2.17713451
## [19,] -0.96270857
## [20,] 0.03091266
## [21,] -0.18989206
## [22,] 0.36211974
## [23,] 0.58292445
## [24,] -1.73552508
## [25,] 1.02453389
## attr(,"scaled:center")
## SAT Top10 Accept SFRatio Expenses GradRate
## 1266.44 76.48 39.20 12.72 27388.00 86.72
## attr(,"scaled:scale")
## SAT Top10 Accept SFRatio Expenses
## 108.359771 19.433905 19.727308 4.067350 14424.883165
## GradRate
## 9.057778
Creating K-means clustering on normalized data
# k-means picks its starting centroids at random, so fix the RNG state to make
# the cluster labels and centres the same on every run.
set.seed(123)
# Partition the standardised data into 5 clusters. nstart = 25 runs the
# algorithm from 25 random starts and keeps the solution with the smallest
# total within-cluster sum of squares, instead of trusting a single start.
fit_Kmeans <- kmeans(normalizedinput, centers = 5, nstart = 25)
# Inspect the structure of the fitted object (cluster, centers, withinss, ...).
str(fit_Kmeans)
## List of 9
## $ cluster : int [1:25] 3 5 4 3 3 1 1 3 1 5 ...
## $ centers : num [1:5, 1:6] 0.8964 -1.8913 0.0739 -0.367 0.8634 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:6] "SAT" "Top10" "Accept" "SFRatio" ...
## $ totss : num 144
## $ withinss : num [1:5] 2.83 7.09 6.63 6.74 2.11
## $ tot.withinss: num 25.4
## $ betweenss : num 119
## $ size : int [1:5] 7 4 9 3 2
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Show the centroid of every cluster (one row per cluster, one column per variable)
fit_Kmeans[["centers"]]
## SAT Top10 Accept SFRatio Expenses GradRate
## 1 0.89637905 0.7692006 -0.9008542 -0.52824852 0.5606384 0.8668162
## 2 -1.89129229 -1.9414523 1.5612876 1.60546806 -1.2086753 -1.6527233
## 3 0.07386915 0.1811267 -0.2185352 -0.06774818 -0.2143826 0.4357213
## 4 -0.36704889 -0.1276120 0.8347143 0.31470124 -0.6130148 -0.9259078
## 5 0.86342006 0.5670502 -0.2382484 -1.52925136 2.3393604 -0.3002944
Merging cluster output with data
# Put the cluster label in front of the original data; data.frame() names that
# first column `fit_Kmeans.cluster` after the expression it was built from.
finaloutput <- data.frame(fit_Kmeans$cluster, input)
# Mean of each numeric variable (columns 4 to 9, SAT ... GradRate) per cluster.
cluster_groups <- list(finaloutput$fit_Kmeans.cluster)
aggregate(finaloutput[4:9], by = cluster_groups, FUN = mean)
## Group.1 SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 2 2 1061.500 38.75000 70.00000 19.25000 9953.00 71.75000
## 3 3 1274.444 80.00000 34.88889 12.44444 24295.56 90.66667
## 4 4 1226.667 74.00000 55.66667 14.00000 18545.33 78.33333
## 5 5 1360.000 87.50000 34.50000 6.50000 61133.00 84.00000
Another way of doing clustering
library(cluster)
# Generate random two-column test data: 5000 points drawn around (0, 0) and
# 5000 points drawn around (50, 50), each coordinate with standard deviation 8.
n_per_group <- 5000
group_near_origin <- cbind(rnorm(n_per_group, 0, 8), rnorm(n_per_group, 0, 8))
group_near_fifty <- cbind(rnorm(n_per_group, 50, 8), rnorm(n_per_group, 50, 8))
# Stack the two groups into a single 10000 x 2 matrix.
xds <- rbind(group_near_origin, group_near_fifty)
# CLARA (Clustering LARge Applications): a medoid-based method that clusters
# sub-samples of the data rather than the full data set.
xcl <- clara(xds, k = 5) # ask for 5 clusters
clusplot(xcl) # plot the resulting clusters
str(xcl) # inspect the fitted object (sample, medoids, clustering, ...)
## List of 10
## $ sample : int [1:50] 138 153 357 575 681 928 1118 1325 1634 2015 ...
## $ medoids : num [1:5, 1:2] 4.97 -5.78 -7.03 51.13 43.99 ...
## $ i.med : int [1:5] 2142 3115 2747 6852 8605
## $ clustering: int [1:10000] 1 1 2 1 1 3 2 1 1 3 ...
## $ objective : num 7.72
## $ clusinfo : num [1:5, 1:4] 2356 1772 872 3134 1866 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "size" "max_diss" "av_diss" "isolation"
## $ diss :Classes 'dissimilarity', 'dist' atomic [1:1225] 14.7 12.4 13.6 23.7 15.4 ...
## .. ..- attr(*, "Size")= int 50
## .. ..- attr(*, "Metric")= chr "euclidean"
## $ call : language clara(x = xds, k = 5)
## $ silinfo :List of 3
## ..$ widths : num [1:50, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:50] "1325" "2623" "4311" "3468" ...
## .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
## ..$ clus.avg.widths: num [1:5] 0.416 0.379 0.306 0.249 0.491
## ..$ avg.width : num 0.348
## $ data : num [1:10000, 1:2] 11.72 12.2 -6.46 2.03 8.37 ...
## - attr(*, "class")= chr [1:2] "clara" "partition"
# PAM (Partitioning Around Medoids) on the same data, asking for 2 clusters
xpm <- pam(xds, k = 2)
clusplot(xpm) # plot the resulting clusters