#install.packages("plyr")

library(plyr)

## Warning: package 'plyr' was built under R version 3.4.4

# Draw 50 values from the uniform distribution on [0, 1] and show them.
x <- runif(n = 50, min = 0, max = 1)
x

##  [1] 0.98224476 0.29106727 0.16483506 0.62310420 0.64822102 0.85617045
##  [7] 0.74314934 0.01933727 0.65393362 0.18729862 0.30674777 0.05214781
## [13] 0.25877840 0.50530777 0.93096327 0.01381117 0.58484422 0.17566185
## [19] 0.17434614 0.69030130 0.60069277 0.42446900 0.74523872 0.36871371
## [25] 0.67644568 0.13528992 0.47430907 0.02713500 0.58910406 0.73019511
## [31] 0.24092859 0.10746193 0.45397323 0.40662543 0.17946205 0.14583983
## [37] 0.24534250 0.28650813 0.46164647 0.62343491 0.99870845 0.05203136
## [43] 0.70025910 0.69759472 0.78617928 0.85304906 0.33823324 0.43609671
## [49] 0.27682144 0.53135109

# Second, independent draw of 50 uniform [0, 1] values; shown after creation.
y <- runif(n = 50, min = 0, max = 1)
y

##  [1] 0.16001853 0.87061343 0.24640378 0.69304879 0.24799944 0.97624316
##  [7] 0.37053385 0.63442439 0.24435480 0.40121268 0.41736505 0.95799630
## [13] 0.45093549 0.94841379 0.85492245 0.96775682 0.08560128 0.87469003
## [19] 0.46669907 0.23788290 0.07764530 0.42105337 0.13176325 0.93667116
## [25] 0.04508907 0.78211701 0.32533923 0.03956951 0.17916764 0.33936523
## [31] 0.46270558 0.11466888 0.44084835 0.98199056 0.60572076 0.13190243
## [37] 0.26949958 0.18311904 0.19249030 0.38289937 0.18485075 0.91774454
## [43] 0.67203555 0.78387077 0.98185197 0.72310703 0.69809134 0.84617392
## [49] 0.93855278 0.79281112

# Bind the two random vectors column-wise into a two-column matrix
# (columns "x" and "y").
data <- cbind(x, y)

# View() opens a data viewer and needs an interactive session, so it is
# skipped when the script is run in batch mode.
if (interactive()) {
  View(data)
}

# Scatter plot of the raw points.
plot(data)

# Same axes with no symbols drawn, then label every point with its row index.
# cbind() of plain vectors carries no row names (rownames(data) is NULL), so
# the row numbers are passed explicitly as labels instead of relying on the
# defaults of text().
plot(data, type = "n")
text(data, labels = seq_len(nrow(data)))

# k-means clustering with 4 centres. Starting centres are chosen at random,
# so cluster numbering and sizes can differ from run to run.
km <- kmeans(data, 4)
str(km)

## List of 9
##  $ cluster     : int [1:50] 4 1 3 2 4 2 4 1 4 3 ...
##  $ centers     : num [1:4, 1:2] 0.211 0.72 0.264 0.713 0.847 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:2] "x" "y"
##  $ totss       : num 8.71
##  $ withinss    : num [1:4] 0.473 0.295 0.562 0.364
##  $ tot.withinss: num 1.69
##  $ betweenss   : num 7.02
##  $ size        : int [1:4] 13 9 15 13
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

# Cluster assignment (1-4) for each of the rows of `data`.
km[["cluster"]]

##  [1] 4 1 3 2 4 2 4 1 4 3 3 1 3 2 2 1 4 1 3 4 4 3 4 1 4 1 3 3 4 4 3 3 3 1 1
## [36] 3 3 3 3 4 4 1 2 2 2 2 1 1 1 2

#install.packages("animation")
#library(animation)

#km1 <- kmeans.ani(data, 4)
#km$centers

#elbow curve & k ~ sqrt(n/2) to decide the k value

#wss = (nrow(normalized_data)-1)*sum(apply(normalized_data, 2, var))         # Determine number of clusters by scree-plot 
#for (i in 2:8) wss[i] = sum(kmeans(normalized_data, centers=i)$withinss)
#plot(1:8, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")   # Look for an "elbow" in the scree plot #
#title(sub = "K-Means Clustering Scree-Plot")

# selecting K for kmeans clustering using kselection
#install.packages("kselection")
#library(kselection)
#k <- kselection(iris[,-5], parallel = TRUE, k_threshold = 0.9, max_centers=12)
#?kselection

#install.packages("doParallel")
#library(doParallel)
#registerDoParallel(cores=4)
#k <- kselection(iris[,-5], parallel = TRUE, k_threshold = 0.9, max_centers=12)
#k

# Location of the universities data set. Defaults to the original hard-coded
# path; set the UNIVERSITIES_CSV environment variable to use a copy stored
# elsewhere without editing this script.
universities_path <- Sys.getenv(
  "UNIVERSITIES_CSV",
  unset = "E:\\Excelr DS\\R _Codes\\Clustering\\Universities.csv"
)
# Fail early with a clear message rather than read.csv()'s generic
# "cannot open file" error.
if (!file.exists(universities_path)) {
  stop("Universities data file not found: ", universities_path, call. = FALSE)
}

input <- read.csv(universities_path)
mydata <- input
# Standardise columns 2-7 (the first column is left out) so that each
# variable has mean 0 and unit standard deviation before clustering.
normalized_data <- scale(mydata[, 2:7])

fit <- kmeans(normalized_data, 5) # 5 cluster solution

#plot(fit)
str(fit)

## List of 9
##  $ cluster     : int [1:25] 1 2 5 1 1 3 3 1 3 2 ...
##  $ centers     : num [1:5, 1:6] 0.0739 0.8634 0.8964 -1.8913 -0.367 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:6] "SAT" "Top10" "Accept" "SFRatio" ...
##  $ totss       : num 144
##  $ withinss    : num [1:5] 6.63 2.11 2.83 7.09 6.74
##  $ tot.withinss: num 25.4
##  $ betweenss   : num 119
##  $ size        : int [1:5] 9 2 7 4 3
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

# Cluster label assigned to each row of the universities data.
fit[["cluster"]]

# Prepend the cluster membership (column "fit.cluster") to the original data
# and display the combined table.
final2 <- data.frame(fit.cluster = fit$cluster, mydata)
final2

##    fit.cluster         Univ  SAT Top10 Accept SFRatio Expenses GradRate
## 1            1        Brown 1310    89     22      13    22704       94
## 2            2      CalTech 1415   100     25       6    63575       81
## 3            5          CMU 1260    62     59       9    25026       72
## 4            1     Columbia 1310    76     24      12    31510       88
## 5            1      Cornell 1280    83     33      13    21864       90
## 6            3    Dartmouth 1340    89     23      10    32162       95
## 7            3         Duke 1315    90     30      12    31585       95
## 8            1   Georgetown 1255    74     24      12    20126       92
## 9            3      Harvard 1400    91     14      11    39525       97
## 10           2 JohnsHopkins 1305    75     44       7    58691       87
## 11           3          MIT 1380    94     30      10    34870       91
## 12           1 Northwestern 1260    85     39      11    28052       89
## 13           1    NotreDame 1255    81     42      13    15122       94
## 14           4    PennState 1081    38     54      18    10185       80
## 15           3    Princeton 1375    91     14       8    30220       95
## 16           4       Purdue 1005    28     90      19     9066       69
## 17           3     Stanford 1360    90     20      12    36450       93
## 18           4     TexasA&M 1075    49     67      25     8704       67
## 19           5   UCBerkeley 1240    95     40      17    15140       78
## 20           1     UChicago 1290    75     50      13    38380       87
## 21           5    UMichigan 1180    65     68      16    15470       85
## 22           1        UPenn 1285    80     36      11    27553       90
## 23           1          UVA 1225    77     44      14    13349       92
## 24           4   UWisconsin 1085    40     69      15    11857       71
## 25           3         Yale 1375    95     19      11    43514       96

# Per-cluster mean of columns 2-7 of the data; note this reuses the name `x`.
x <- aggregate(
  x = mydata[, 2:7],
  by = list(fit$cluster),
  FUN = mean
)

# k clustering alternative for large dataset - Clustering Large Applications (Clara)
#install.packages("cluster")
library(cluster)

## Warning: package 'cluster' was built under R version 3.4.4

# Synthetic 2-D data: 5000 points scattered around (0, 0) stacked on top of
# 5000 points scattered around (50, 50), both with standard deviation 8.
xds <- rbind(
  cbind(rnorm(n = 5000, mean = 0, sd = 8), rnorm(n = 5000, mean = 0, sd = 8)),
  cbind(rnorm(n = 5000, mean = 50, sd = 8), rnorm(n = 5000, mean = 50, sd = 8))
)

# CLARA clustering into 5 clusters, followed by a cluster plot of the result.
xcl <- clara(x = xds, k = 5)
clusplot(xcl)

str(xcl)

## List of 10
##  $ sample    : int [1:50] 138 153 357 575 681 928 1118 1325 1634 2015 ...
##  $ medoids   : num [1:5, 1:2] 4.21 -8.8 51.8 57.39 43.38 ...
##  $ i.med     : int [1:5] 3944 2015 6852 8605 7426
##  $ clustering: int [1:10000] 1 1 2 1 2 1 1 1 2 1 ...
##  $ objective : num 7.65
##  $ clusinfo  : num [1:5, 1:4] 3134 1866 1964 1240 1796 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "size" "max_diss" "av_diss" "isolation"
##  $ diss      :Classes 'dissimilarity', 'dist'  atomic [1:1225] 9.67 12.85 24.78 7.82 22.4 ...
##   .. ..- attr(*, "Size")= int 50
##   .. ..- attr(*, "Metric")= chr "euclidean"
##  $ call      : language clara(x = xds, k = 5)
##  $ silinfo   :List of 3
##   ..$ widths         : num [1:50, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:50] "4311" "3944" "3468" "928" ...
##   .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
##   ..$ clus.avg.widths: num [1:5] 0.339 0.373 0.282 0.709 0.347
##   ..$ avg.width      : num 0.39
##  $ data      : num [1:10000, 1:2] 14.055 13.279 -0.592 4.268 -8.111 ...
##  - attr(*, "class")= chr [1:2] "clara" "partition"

# Keep the CLARA cluster labels as a one-column data frame (column "y");
# note this reuses the name `y`.
y <- data.frame(y = xcl$clustering)

# Partitioning around medoids (PAM) with 2 clusters on the same data,
# followed by a cluster plot of the result.
xpm <- pam(x = xds, k = 2)
clusplot(xpm)