#install.packages("plyr")
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.4
# Fix the RNG seed so that this vector, and every later random draw in the
# script, is the same on each run. The console output pasted below came from
# an unseeded run and will not match these values.
set.seed(42)
x <- runif(50) # 50 draws from Uniform(0, 1)
x
## [1] 0.98224476 0.29106727 0.16483506 0.62310420 0.64822102 0.85617045
## [7] 0.74314934 0.01933727 0.65393362 0.18729862 0.30674777 0.05214781
## [13] 0.25877840 0.50530777 0.93096327 0.01381117 0.58484422 0.17566185
## [19] 0.17434614 0.69030130 0.60069277 0.42446900 0.74523872 0.36871371
## [25] 0.67644568 0.13528992 0.47430907 0.02713500 0.58910406 0.73019511
## [31] 0.24092859 0.10746193 0.45397323 0.40662543 0.17946205 0.14583983
## [37] 0.24534250 0.28650813 0.46164647 0.62343491 0.99870845 0.05203136
## [43] 0.70025910 0.69759472 0.78617928 0.85304906 0.33823324 0.43609671
## [49] 0.27682144 0.53135109
# Second coordinate: another 50 draws from Uniform(0, 1).
y <- runif(n = 50)
y
## [1] 0.16001853 0.87061343 0.24640378 0.69304879 0.24799944 0.97624316
## [7] 0.37053385 0.63442439 0.24435480 0.40121268 0.41736505 0.95799630
## [13] 0.45093549 0.94841379 0.85492245 0.96775682 0.08560128 0.87469003
## [19] 0.46669907 0.23788290 0.07764530 0.42105337 0.13176325 0.93667116
## [25] 0.04508907 0.78211701 0.32533923 0.03956951 0.17916764 0.33936523
## [31] 0.46270558 0.11466888 0.44084835 0.98199056 0.60572076 0.13190243
## [37] 0.26949958 0.18311904 0.19249030 0.38289937 0.18485075 0.91774454
## [43] 0.67203555 0.78387077 0.98185197 0.72310703 0.69809134 0.84617392
## [49] 0.93855278 0.79281112
# Bind the two random vectors column-wise into a matrix with columns "x", "y".
data <- cbind(x, y)
# View() opens a data viewer window, so only call it when a user is present;
# this keeps the script runnable in batch mode (e.g. via Rscript).
if (interactive()) {
  View(data)
}
# Scatter plot of the points.
plot(data)

# Set up an empty canvas with the same axes, then write each point's row
# index at its position. cbind() of two plain vectors produces no row names,
# so rownames(data) is NULL and cannot be used as labels; the row index is
# passed explicitly instead of relying on text()'s default.
plot(data, type = "n")
text(data, labels = seq_len(nrow(data)))

# K-means clustering of the points into 4 clusters.
km <- kmeans(x = data, centers = 4)
# Compact summary of the components of the fitted object.
str(km)
## List of 9
## $ cluster : int [1:50] 4 1 3 2 4 2 4 1 4 3 ...
## $ centers : num [1:4, 1:2] 0.211 0.72 0.264 0.713 0.847 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:2] "x" "y"
## $ totss : num 8.71
## $ withinss : num [1:4] 0.473 0.295 0.562 0.364
## $ tot.withinss: num 1.69
## $ betweenss : num 7.02
## $ size : int [1:4] 13 9 15 13
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Cluster label assigned to each point, in row order.
km[["cluster"]]
## [1] 4 1 3 2 4 2 4 1 4 3 3 1 3 2 2 1 4 1 3 4 4 3 4 1 4 1 3 3 4 4 3 3 3 1 1
## [36] 3 3 3 3 4 4 1 2 2 2 2 1 1 1 2
#install.packages("animation")
#library(animation)
#km1 <- kmeans.ani(data, 4)
#km$centers
# Choosing k: look for the elbow in the scree plot below, or use the rule of thumb k ~ sqrt(n/2)
#wss = (nrow(normalized_data)-1)*sum(apply(normalized_data, 2, var)) # Determine number of clusters by scree-plot
#for (i in 2:8) wss[i] = sum(kmeans(normalized_data, centers=i)$withinss)
#plot(1:8, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares") # Look for an "elbow" in the scree plot #
#title(sub = "K-Means Clustering Scree-Plot")
# selecting K for kmeans clustering using kselection
#install.packages("kselection")
#library(kselection)
#k <- kselection(iris[,-5], parallel = TRUE, k_threshold = 0.9, max_centers=12)
#?kselection
#install.packages("doParallel")
#library(doParallel)
#registerDoParallel(cores=4)
#k <- kselection(iris[,-5], parallel = TRUE, k_threshold = 0.9, max_centers=12)
#k
# Load the universities data set from its CSV file.
input <- read.csv(file = "E:\\Excelr DS\\R _Codes\\Clustering\\Universities.csv")
# Working copy; `input` keeps the data exactly as it was read.
mydata <- input
# Standardise columns 2 to 7 (column 1 is left out) so that every variable
# contributes on the same scale to the distance computation.
normalized_data <- scale(x = mydata[, 2:7])
# K-means with 5 clusters on the standardised columns.
fit <- kmeans(x = normalized_data, centers = 5)
#plot(fit)
str(fit)
## List of 9
## $ cluster : int [1:25] 1 2 5 1 1 3 3 1 3 2 ...
## $ centers : num [1:5, 1:6] 0.0739 0.8634 0.8964 -1.8913 -0.367 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:6] "SAT" "Top10" "Accept" "SFRatio" ...
## $ totss : num 144
## $ withinss : num [1:5] 6.63 2.11 2.83 7.09 6.74
## $ tot.withinss: num 25.4
## $ betweenss : num 119
## $ size : int [1:5] 9 2 7 4 3
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Cluster label of each row of the data, in row order.
fit[["cluster"]]
## [1] 1 2 5 1 1 3 3 1 3 2 3 1 1 4 3 4 3 4 5 1 5 1 1 4 3
# Put the cluster membership in front of the original columns. The column is
# named `fit.cluster` explicitly, which is the name data.frame() derived from
# the expression `fit$cluster` before.
final2 <- data.frame(fit.cluster = fit[["cluster"]], mydata)
final2
## fit.cluster Univ SAT Top10 Accept SFRatio Expenses GradRate
## 1 1 Brown 1310 89 22 13 22704 94
## 2 2 CalTech 1415 100 25 6 63575 81
## 3 5 CMU 1260 62 59 9 25026 72
## 4 1 Columbia 1310 76 24 12 31510 88
## 5 1 Cornell 1280 83 33 13 21864 90
## 6 3 Dartmouth 1340 89 23 10 32162 95
## 7 3 Duke 1315 90 30 12 31585 95
## 8 1 Georgetown 1255 74 24 12 20126 92
## 9 3 Harvard 1400 91 14 11 39525 97
## 10 2 JohnsHopkins 1305 75 44 7 58691 87
## 11 3 MIT 1380 94 30 10 34870 91
## 12 1 Northwestern 1260 85 39 11 28052 89
## 13 1 NotreDame 1255 81 42 13 15122 94
## 14 4 PennState 1081 38 54 18 10185 80
## 15 3 Princeton 1375 91 14 8 30220 95
## 16 4 Purdue 1005 28 90 19 9066 69
## 17 3 Stanford 1360 90 20 12 36450 93
## 18 4 TexasA&M 1075 49 67 25 8704 67
## 19 5 UCBerkeley 1240 95 40 17 15140 78
## 20 1 UChicago 1290 75 50 13 38380 87
## 21 5 UMichigan 1180 65 68 16 15470 85
## 22 1 UPenn 1285 80 36 11 27553 90
## 23 1 UVA 1225 77 44 14 13349 92
## 24 4 UWisconsin 1085 40 69 15 11857 71
## 25 3 Yale 1375 95 19 11 43514 96
# Mean of columns 2 to 7 within each k-means cluster.
# Note: this reassigns `x`, which held the random vector generated earlier.
x <- aggregate(x = mydata[, 2:7], by = list(fit[["cluster"]]), FUN = mean)
# Clustering alternative for large data sets: Clustering LARge Applications (CLARA)
#install.packages("cluster")
library(cluster)
## Warning: package 'cluster' was built under R version 3.4.4
# Simulate two groups of 5000 two-dimensional normal points (sd 8), one
# centred at (0, 0) and one at (50, 50), stacked into a single matrix.
xds <- rbind(
  cbind(rnorm(5000, mean = 0, sd = 8), rnorm(5000, mean = 0, sd = 8)),
  cbind(rnorm(5000, mean = 50, sd = 8), rnorm(5000, mean = 50, sd = 8))
)
# Run CLARA on the simulated data, asking for 5 clusters.
xcl <- clara(x = xds, k = 5)
# Plot the resulting partition.
clusplot(xcl)

# Compact summary of the components of the fitted object.
str(xcl)
## List of 10
## $ sample : int [1:50] 138 153 357 575 681 928 1118 1325 1634 2015 ...
## $ medoids : num [1:5, 1:2] 4.21 -8.8 51.8 57.39 43.38 ...
## $ i.med : int [1:5] 3944 2015 6852 8605 7426
## $ clustering: int [1:10000] 1 1 2 1 2 1 1 1 2 1 ...
## $ objective : num 7.65
## $ clusinfo : num [1:5, 1:4] 3134 1866 1964 1240 1796 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "size" "max_diss" "av_diss" "isolation"
## $ diss :Classes 'dissimilarity', 'dist' atomic [1:1225] 9.67 12.85 24.78 7.82 22.4 ...
## .. ..- attr(*, "Size")= int 50
## .. ..- attr(*, "Metric")= chr "euclidean"
## $ call : language clara(x = xds, k = 5)
## $ silinfo :List of 3
## ..$ widths : num [1:50, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:50] "4311" "3944" "3468" "928" ...
## .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
## ..$ clus.avg.widths: num [1:5] 0.339 0.373 0.282 0.709 0.347
## ..$ avg.width : num 0.39
## $ data : num [1:10000, 1:2] 14.055 13.279 -0.592 4.268 -8.111 ...
## - attr(*, "class")= chr [1:2] "clara" "partition"
# Keep the CLARA cluster assignment as a one-column data frame (column `y`).
# Note: this reassigns `y`, which held the random vector generated earlier.
y <- data.frame(y = xcl[["clustering"]])
# Partitioning Around Medoids (PAM) on the same simulated data, 2 clusters.
xpm <- pam(x = xds, k = 2)
# Plot the resulting partition.
clusplot(xpm)
