K-means clustering on the Universities data set

Import Data from Excel File

library(readxl) # supplies read_excel()

# Load the workbook into a tibble, then print it to preview the first rows.
input <- read_excel(
  path = "C:\\Users\\Pawan Srivastav\\Downloads\\University_Clustering.xlsx"
)
input

## # A tibble: 25 x 8
##    Univ         State   SAT Top10 Accept SFRatio Expenses GradRate
##    <chr>        <chr> <dbl> <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
##  1 Brown        RI     1310    89     22      13    22704       94
##  2 CalTech      CA     1415   100     25       6    63575       81
##  3 CMU          PA     1260    62     59       9    25026       72
##  4 Columbia     NY     1310    76     24      12    31510       88
##  5 Cornell      NY     1280    83     33      13    21864       90
##  6 Dartmouth    NH     1340    89     23      10    32162       95
##  7 Duke         NC     1315    90     30      12    31585       95
##  8 Georgetown   DC     1255    74     24      12    20126       92
##  9 Harvard      MA     1400    91     14      11    39525       97
## 10 JohnsHopkins MD     1305    75     44       7    58691       87
## # ... with 15 more rows

Extracting the numeric columns from the main data

  # Keep columns 3 through 8 (SAT ... GradRate); the first two columns
  # (Univ, State) are character labels and cannot be used for clustering.
  mydata <- input[3:8]
  mydata

## # A tibble: 25 x 6
##      SAT Top10 Accept SFRatio Expenses GradRate
##    <dbl> <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
##  1  1310    89     22      13    22704       94
##  2  1415   100     25       6    63575       81
##  3  1260    62     59       9    25026       72
##  4  1310    76     24      12    31510       88
##  5  1280    83     33      13    21864       90
##  6  1340    89     23      10    32162       95
##  7  1315    90     30      12    31585       95
##  8  1255    74     24      12    20126       92
##  9  1400    91     14      11    39525       97
## 10  1305    75     44       7    58691       87
## # ... with 15 more rows

Normalizing the input

  # Standardize every column to mean 0 and standard deviation 1 so that
  # large-valued variables such as Expenses do not dominate the distances.
  normalizedinput <- scale(mydata, center = TRUE, scale = TRUE)
  normalizedinput

##               SAT       Top10      Accept    SFRatio    Expenses
##  [1,]  0.40199420  0.64423491 -0.87188786  0.0688409 -0.32471667
##  [2,]  1.37098850  1.21025599 -0.71981439 -1.6521815  2.50865117
##  [3,] -0.05943165 -0.74508957  1.00368486 -0.9146005 -0.16374483
##  [4,]  0.40199420 -0.02469910 -0.77050555 -0.1770194  0.28575621
##  [5,]  0.12513869  0.33549613 -0.31428516  0.0688409 -0.38294938
##  [6,]  0.67884972  0.64423491 -0.82119670 -0.6687401  0.33095589
##  [7,]  0.44813679  0.69569137 -0.46635862 -0.1770194  0.29095556
##  [8,] -0.10557424 -0.12761203 -0.77050555 -0.1770194 -0.50343562
##  [9,]  1.23256074  0.74714783 -1.27741709 -0.4228798  0.84139330
## [10,]  0.35585162 -0.07615556  0.24331754 -1.4063212  2.17006957
## [11,]  1.04799040  0.90151722 -0.46635862 -0.6687401  0.51868704
## [12,] -0.05943165  0.43840906 -0.01013823 -0.4228798  0.04603157
## [13,] -0.10557424  0.23258321  0.14193523  0.0688409 -0.85033618
## [14,] -1.71133621 -1.98004466  0.75022909  1.2981426 -1.19259198
## [15,]  1.00184782  0.74714783 -1.27741709 -1.1604608  0.19632741
## [16,] -2.41270351 -2.49460928  2.57511065  1.5440030 -1.27016627
## [17,]  0.86342006  0.69569137 -0.97327017 -0.1770194  0.62821999
## [18,] -1.76670731 -1.41402358  1.40921410  3.0191651 -1.29526179
## [19,] -0.24400199  0.95297368  0.04055292  1.0522823 -0.84908833
## [20,]  0.21742386 -0.07615556  0.54746447  0.0688409  0.76201657
## [21,] -0.79771302 -0.59072018  1.45990525  0.8064219 -0.82621120
## [22,]  0.17128128  0.18112675 -0.16221169 -0.4228798  0.01143857
## [23,] -0.38242975  0.02675736  0.24331754  0.3147012 -0.97324878
## [24,] -1.67442214 -1.87713174  1.51059641  0.5605616 -1.07668116
## [25,]  1.00184782  0.95297368 -1.02396132 -0.4228798  1.11792933
##          GradRate
##  [1,]  0.80372917
##  [2,] -0.63150149
##  [3,] -1.62512272
##  [4,]  0.14131502
##  [5,]  0.36211974
##  [6,]  0.91413153
##  [7,]  0.91413153
##  [8,]  0.58292445
##  [9,]  1.13493625
## [10,]  0.03091266
## [11,]  0.47252209
## [12,]  0.25171738
## [13,]  0.80372917
## [14,] -0.74190385
## [15,]  0.91413153
## [16,] -1.95632979
## [17,]  0.69332681
## [18,] -2.17713451
## [19,] -0.96270857
## [20,]  0.03091266
## [21,] -0.18989206
## [22,]  0.36211974
## [23,]  0.58292445
## [24,] -1.73552508
## [25,]  1.02453389
## attr(,"scaled:center")
##      SAT    Top10   Accept  SFRatio Expenses GradRate 
##  1266.44    76.48    39.20    12.72 27388.00    86.72 
## attr(,"scaled:scale")
##          SAT        Top10       Accept      SFRatio     Expenses 
##   108.359771    19.433905    19.727308     4.067350 14424.883165 
##     GradRate 
##     9.057778

Creating K-means clustering on normalized data

# kmeans() starts from randomly chosen centers, so fix the RNG seed to get
# the same cluster labels and sizes on every run.
set.seed(123)

# Partition the standardized data into 5 clusters. nstart = 25 repeats the
# algorithm from 25 random initialisations and keeps the solution with the
# lowest total within-cluster sum of squares, instead of trusting a single
# (possibly poor) random start.
# NOTE: cluster numbering and membership may differ from output recorded
# with an unseeded, single-start run.
fit_Kmeans <- kmeans(normalizedinput, centers = 5, nstart = 25)

# Inspect the components of the fitted object (cluster, centers, withinss, ...).
str(fit_Kmeans)

## List of 9
##  $ cluster     : int [1:25] 3 5 4 3 3 1 1 3 1 5 ...
##  $ centers     : num [1:5, 1:6] 0.8964 -1.8913 0.0739 -0.367 0.8634 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:6] "SAT" "Top10" "Accept" "SFRatio" ...
##  $ totss       : num 144
##  $ withinss    : num [1:5] 2.83 7.09 6.63 6.74 2.11
##  $ tot.withinss: num 25.4
##  $ betweenss   : num 119
##  $ size        : int [1:5] 7 4 9 3 2
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

# Show the centroid of every cluster: one row per cluster, one column per
# (standardized) variable.
fit_Kmeans[["centers"]]

##           SAT      Top10     Accept     SFRatio   Expenses   GradRate
## 1  0.89637905  0.7692006 -0.9008542 -0.52824852  0.5606384  0.8668162
## 2 -1.89129229 -1.9414523  1.5612876  1.60546806 -1.2086753 -1.6527233
## 3  0.07386915  0.1811267 -0.2185352 -0.06774818 -0.2143826  0.4357213
## 4 -0.36704889 -0.1276120  0.8347143  0.31470124 -0.6130148 -0.9259078
## 5  0.86342006  0.5670502 -0.2382484 -1.52925136  2.3393604 -0.3002944

Merging cluster output with data

# Put the cluster label of each university in front of the original columns;
# data.frame() names that first column "fit_Kmeans.cluster".
finaloutput <- data.frame(fit_Kmeans$cluster, input)

# Average the six numeric variables (columns 4 to 9 after the merge) within
# each cluster, on the original unscaled values.
aggregate(
  finaloutput[, 4:9],
  by = list(finaloutput[["fit_Kmeans.cluster"]]),
  FUN = mean
)

##   Group.1      SAT    Top10   Accept  SFRatio Expenses GradRate
## 1       1 1363.571 91.42857 21.42857 10.57143 35475.14 94.57143
## 2       2 1061.500 38.75000 70.00000 19.25000  9953.00 71.75000
## 3       3 1274.444 80.00000 34.88889 12.44444 24295.56 90.66667
## 4       4 1226.667 74.00000 55.66667 14.00000 18545.33 78.33333
## 5       5 1360.000 87.50000 34.50000  6.50000 61133.00 84.00000

Another way of doing clustering

library(cluster)

# Seed the RNG so the simulated points (and everything derived from them)
# are identical on every run.
set.seed(123)

# Simulate 10,000 two-dimensional points: 5,000 drawn around (0, 0) and
# 5,000 drawn around (50, 50), every coordinate with standard deviation 8.
xds <- rbind(
  cbind(rnorm(5000, 0, 8), rnorm(5000, 0, 8)),
  cbind(rnorm(5000, 50, 8), rnorm(5000, 50, 8))
)

# Using CLARA
# clara() clusters around medoids (actual observations, not centroids) and
# works on sub-samples of the data, which makes it usable on large data sets.
xcl <- clara(xds, 5)
clusplot(xcl) # plot the clusters

# Inspect the components of the fitted object (medoids, clustering, ...).
str(xcl)

## List of 10
##  $ sample    : int [1:50] 138 153 357 575 681 928 1118 1325 1634 2015 ...
##  $ medoids   : num [1:5, 1:2] 4.97 -5.78 -7.03 51.13 43.99 ...
##  $ i.med     : int [1:5] 2142 3115 2747 6852 8605
##  $ clustering: int [1:10000] 1 1 2 1 1 3 2 1 1 3 ...
##  $ objective : num 7.72
##  $ clusinfo  : num [1:5, 1:4] 2356 1772 872 3134 1866 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "size" "max_diss" "av_diss" "isolation"
##  $ diss      :Classes 'dissimilarity', 'dist'  atomic [1:1225] 14.7 12.4 13.6 23.7 15.4 ...
##   .. ..- attr(*, "Size")= int 50
##   .. ..- attr(*, "Metric")= chr "euclidean"
##  $ call      : language clara(x = xds, k = 5)
##  $ silinfo   :List of 3
##   ..$ widths         : num [1:50, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:50] "1325" "2623" "4311" "3468" ...
##   .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
##   ..$ clus.avg.widths: num [1:5] 0.416 0.379 0.306 0.249 0.491
##   ..$ avg.width      : num 0.348
##  $ data      : num [1:10000, 1:2] 11.72 12.2 -6.46 2.03 8.37 ...
##  - attr(*, "class")= chr [1:2] "clara" "partition"

# Using partitioning around medoids (PAM)
xpm <- pam(xds, k = 2) # split the points into 2 clusters around medoids
clusplot(xpm) # plot the clusters

K-Means Clustering

Pawan Srivastav

1 July 2018

K-means clustering on the Universities data set