#UnSupervised Learning

#K-Means Algorithm

#1. Specify the number of clusters (k) to be created (chosen by the analyst).
#2. Randomly select k objects from the dataset as the initial cluster centers (means).
#3. Assign each observation to its closest centroid, based on the Euclidean distance between the object and the centroid.
#4. For each of the k clusters, update the cluster centroid by
# calculating the new mean values of all the data points in the cluster.
# The centroid of the k-th cluster is a vector of length p containing the means of all variables for the observations
# in the k-th cluster; p is the number of variables.
#5. Iteratively minimize the total within-cluster sum of squares. That is, iterate steps 3 and 4
# until the cluster assignments stop changing or the maximum number of iterations is reached.
# By default, R's kmeans() uses 10 as the maximum number of iterations.

#Import the QS Indonesia data
#Download the QS data from: https://drive.google.com/file/d/1SQvdBX7ADhSB1pB5Q9QDczEAcJzADxUr/view?usp=sharing

# Load the QS Indonesia table. The file is semicolon-separated and has a
# header row.
# NOTE(review): the path below is machine-specific; point `data_path` at the
# location of your local copy of the CSV.
data_path <- "D:/DATA ANALYSIS/Data QS di Indonesia.csv"
# Use the TRUE constant rather than `T`, which is an ordinary variable that
# can be reassigned.
data <- read.csv(file = data_path, header = TRUE, sep = ";")
# Inspect column names and types before selecting the numeric indicators.
str(data)
## 'data.frame':    26 obs. of  13 variables:
##  $ No                            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ X2025.Rank                    : chr  "206" "239" "256" "308" ...
##  $ X2024.Rank                    : chr  "237" "263" "281" "345" ...
##  $ Institution.Name              : chr  "Universitas Indonesia" "Gadjah Mada University" "Bandung Institute of Technology (ITB)" "Airlangga University" ...
##  $ Academic.Reputation           : num  53.8 54.4 43.2 37.1 23.5 17.3 19 19.2 19.2 12.5 ...
##  $ Employer.Reputation           : num  79.2 73.5 72.7 79.1 46.9 48.9 41.6 41.7 42.4 21.8 ...
##  $ Faculty.Student               : num  55.7 57.6 55.8 57 71.4 47.8 41.4 19.3 11.5 19.9 ...
##  $ Citations.per.Faculty         : num  2.1 1.7 2.6 1.5 1.8 1.6 1.4 1.5 1.4 2.1 ...
##  $ International.Faculty         : num  88.3 40.3 94.4 58 53.2 47.5 34.8 27.9 13.7 34.7 ...
##  $ International.Students        : num  7.1 4.1 3.8 6.7 3.6 5.3 2.2 3 1.4 4.3 ...
##  $ International.Research.Network: num  37.2 38.1 26.3 40.3 19.1 12 19.6 27.4 19.6 10.7 ...
##  $ Employment.Outcomes           : num  77.3 69 49.2 18.9 33.9 13 33.4 14.6 13.1 12.9 ...
##  $ Sustainability                : num  32.7 26.8 37.6 16.3 35.1 16.1 21.3 9.7 9 4.1 ...
# Keep the nine numeric indicator columns (5-13) and label each row with the
# institution name held in column 4.
indicators <- data[, 5:13]
row.names(indicators) <- data[[4]]
# Standardize every column; scale() returns a numeric matrix that records the
# column centers and scales as attributes.
df <- scale(indicators)
# Count missing values in the standardized matrix.
sum(is.na(df))
## [1] 0
# Show the structure of the standardized matrix and its attributes.
str(df)
##  num [1:26, 1:9] 2.571 2.613 1.836 1.412 0.468 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:26] "Universitas Indonesia" "Gadjah Mada University" "Bandung Institute of Technology (ITB)" "Airlangga University" ...
##   ..$ : chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
##  - attr(*, "scaled:center")= Named num [1:9] 16.75 24.77 23.12 1.54 26.87 ...
##   ..- attr(*, "names")= chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
##  - attr(*, "scaled:scale")= Named num [1:9] 14.408 27.173 21.28 0.331 26.285 ...
##   ..- attr(*, "names")= chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
# Preview the first five standardized rows
head(df, 5L)
##                                                       Academic.Reputation
## Universitas Indonesia                                           2.5712897
## Gadjah Mada University                                          2.6129344
## Bandung Institute of Technology (ITB)                           1.8355677
## Airlangga University                                            1.4121805
## IPB University (a.k.a. Bogor Agricultural University)           0.4682353
##                                                       Employer.Reputation
## Universitas Indonesia                                           2.0031556
## Gadjah Mada University                                          1.7933848
## Bandung Institute of Technology (ITB)                           1.7639432
## Airlangga University                                            1.9994754
## IPB University (a.k.a. Bogor Agricultural University)           0.8144543
##                                                       Faculty.Student
## Universitas Indonesia                                        1.531234
## Gadjah Mada University                                       1.620520
## Bandung Institute of Technology (ITB)                        1.535933
## Airlangga University                                         1.592324
## IPB University (a.k.a. Bogor Agricultural University)        2.269017
##                                                       Citations.per.Faculty
## Universitas Indonesia                                             1.6835081
## Gadjah Mada University                                            0.4760264
## Bandung Institute of Technology (ITB)                             3.1928603
## Airlangga University                                             -0.1277144
## IPB University (a.k.a. Bogor Agricultural University)             0.7778969
##                                                       International.Faculty
## Universitas Indonesia                                             2.3370884
## Gadjah Mada University                                            0.5109637
## Bandung Institute of Technology (ITB)                             2.5691585
## Airlangga University                                              1.1843472
## IPB University (a.k.a. Bogor Agricultural University)             1.0017348
##                                                       International.Students
## Universitas Indonesia                                              2.2851660
## Gadjah Mada University                                             0.6908641
## Bandung Institute of Technology (ITB)                              0.5314340
## Airlangga University                                               2.0725924
## IPB University (a.k.a. Bogor Agricultural University)              0.4251472
##                                                       International.Research.Network
## Universitas Indonesia                                                      2.0436339
## Gadjah Mada University                                                     2.1284079
## Bandung Institute of Technology (ITB)                                      1.0169261
## Airlangga University                                                       2.3356334
## IPB University (a.k.a. Bogor Agricultural University)                      0.3387339
##                                                       Employment.Outcomes
## Universitas Indonesia                                           2.9778796
## Gadjah Mada University                                          2.5750043
## Bandung Institute of Technology (ITB)                           1.6139282
## Airlangga University                                            0.1431906
## IPB University (a.k.a. Bogor Agricultural University)           0.8712785
##                                                       Sustainability
## Universitas Indonesia                                      1.9774282
## Gadjah Mada University                                     1.4801536
## Bandung Institute of Technology (ITB)                      2.3904189
## Airlangga University                                       0.5951735
## IPB University (a.k.a. Bogor Agricultural University)      2.1797094
#loading package for clustering
#install.packages("factoextra")  # run once
library(factoextra)
## Loading required package: ggplot2
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
############ Number of cluster ############

library(NbClust)
# Elbow method: plot the total within-cluster sum of squares against k.
# Seed the RNG and use several random starts (nstart is forwarded to kmeans(),
# as in the gap-statistic call further down) so the curve is stable between
# runs instead of depending on one random initialization.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss", nstart = 25) +
  geom_vline(xintercept = 4, linetype = 2) +  # dashed marker at k = 4
  labs(subtitle = "Elbow method")

# Silhouette method: plot the average silhouette width against k.
# Seed the RNG and use several random starts (nstart is forwarded to kmeans())
# so the result does not depend on a single random initialization.
set.seed(123)
fviz_nbclust(df, kmeans, method = "silhouette", nstart = 25) +
  labs(subtitle = "Silhouette method")

# Gap statistic
# nboot = 50 bootstrap samples keeps the run short; use nboot = 500 for a
# real analysis. Pass verbose = FALSE to silence the progress output.
set.seed(123)
gap_plot <- fviz_nbclust(
  df,
  FUNcluster = kmeans,
  method = "gap_stat",
  nboot = 50,
  nstart = 25
)
gap_plot + labs(subtitle = "Gap statistic method")

# Run the NbClust indices with k-means on Euclidean distances for
# k = 2..10 and tally which k each index prefers.
nb <- NbClust(
  data = df,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 7 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Summarize the NbClust votes: how many indices proposed each number of
# clusters.
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 11 proposed  2 as the best number of clusters
## * 7 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 2 proposed  6 as the best number of clusters
## * 1 proposed  7 as the best number of clusters
## * 2 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# Fit k-means with k = 4 (the value marked on the elbow plot above; note that
# NbClust's majority rule suggested 2). The fixed seed plus 25 random starts
# makes the fit repeatable.
set.seed(123)
km.res <- kmeans(df, centers = 4, nstart = 25)

# Show the fitted model: sizes, centers, assignments and sums of squares
print(km.res)
## K-means clustering with 4 clusters of sizes 2, 3, 16, 5
## 
## Cluster means:
##   Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1          2.20342872           1.8835494       1.5335833             2.4381842
## 2          1.49778339           1.5357715       1.8272867             0.3754030
## 3         -0.52559677          -0.6205897      -0.6345348            -0.4484517
## 4         -0.09813186           0.3110043       0.3207060             0.2345301
##   International.Faculty International.Students International.Research.Network
## 1             2.4531234              1.4083000                      1.5302800
## 2             0.8990152              1.0628679                      1.6009250
## 3            -0.5847111             -0.6177920                     -0.5042968
## 4             0.3504169              0.7758936                      0.0410828
##   Employment.Outcomes Sustainability
## 1         2.295903891      2.1839236
## 2         1.196491140      1.4183455
## 3        -0.510875020     -0.5732110
## 4        -0.001456176      0.1096986
## 
## Clustering vector:
##                                 Universitas Indonesia 
##                                                     1 
##                                Gadjah Mada University 
##                                                     2 
##                 Bandung Institute of Technology (ITB) 
##                                                     1 
##                                  Airlangga University 
##                                                     2 
## IPB University (a.k.a. Bogor Agricultural University) 
##                                                     2 
##    Institut Teknologi Sepuluh Nopember (ITS Surabaya) 
##                                                     4 
##                               Universitas Padjadjaran 
##                                                     4 
##                                 Diponegoro University 
##                                                     4 
##                                Universitas Brawijaya  
##                                                     3 
##                     Bina Nusantara University (BINUS) 
##                                                     4 
##                                     Telkom University 
##                                                     4 
##                                Universitas Hasanuddin 
##                                                     3 
##                             Universitas Sebelas Maret 
##                                                     3 
##                 Atma Jaya Catholic University Jakarta 
##                                                     3 
##                           Universitas Islam Indonesia 
##                                                     3 
##                   Universitas Muhammadiyah Yogyakarta 
##                                                     3 
##                      Universitas Pendidikan Indonesia 
##                                                     3 
##                            Universitas Sumatera Utara 
##                                                     3 
##                           Yogyakarta State University 
##                                                     3 
##                            State University of Malang 
##                                                     3 
##                                    Udayana University 
##                                                     3 
##                                   Universitas Andalas 
##                                                     3 
##                             Universitas Kristen Petra 
##                                                     3 
##                    Universitas Muhammadiyah Surakarta 
##                                                     3 
##                                 University of Lampung 
##                                                     3 
##                                 University of Mataram 
##                                                     3 
## 
## Within cluster sum of squares by cluster:
## [1]  4.545581 12.425822 20.725456 14.724307
##  (between_SS / total_SS =  76.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Mean of each variable within each cluster.
# NOTE(review): `df` is the standardized matrix, so these are z-score means
# and duplicate km.res$centers; pass data[5:13] instead to get means in the
# original units.
aggregate(df, by=list(km.res$cluster),mean)
##   Group.1 Academic.Reputation Employer.Reputation Faculty.Student
## 1       1          2.20342872           1.8835494       1.5335833
## 2       2          1.49778339           1.5357715       1.8272867
## 3       3         -0.52559677          -0.6205897      -0.6345348
## 4       4         -0.09813186           0.3110043       0.3207060
##   Citations.per.Faculty International.Faculty International.Students
## 1             2.4381842             2.4531234              1.4083000
## 2             0.3754030             0.8990152              1.0628679
## 3            -0.4484517            -0.5847111             -0.6177920
## 4             0.2345301             0.3504169              0.7758936
##   International.Research.Network Employment.Outcomes Sustainability
## 1                      1.5302800         2.295903891      2.1839236
## 2                      1.6009250         1.196491140      1.4183455
## 3                     -0.5042968        -0.510875020     -0.5732110
## 4                      0.0410828        -0.001456176      0.1096986
# Append each institution's cluster label as an extra column, then preview
dd <- cbind(df, cluster = km.res$cluster)
head(dd)
##                                                       Academic.Reputation
## Universitas Indonesia                                           2.5712897
## Gadjah Mada University                                          2.6129344
## Bandung Institute of Technology (ITB)                           1.8355677
## Airlangga University                                            1.4121805
## IPB University (a.k.a. Bogor Agricultural University)           0.4682353
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)              0.0379073
##                                                       Employer.Reputation
## Universitas Indonesia                                           2.0031556
## Gadjah Mada University                                          1.7933848
## Bandung Institute of Technology (ITB)                           1.7639432
## Airlangga University                                            1.9994754
## IPB University (a.k.a. Bogor Agricultural University)           0.8144543
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)              0.8880581
##                                                       Faculty.Student
## Universitas Indonesia                                        1.531234
## Gadjah Mada University                                       1.620520
## Bandung Institute of Technology (ITB)                        1.535933
## Airlangga University                                         1.592324
## IPB University (a.k.a. Bogor Agricultural University)        2.269017
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)           1.159993
##                                                       Citations.per.Faculty
## Universitas Indonesia                                             1.6835081
## Gadjah Mada University                                            0.4760264
## Bandung Institute of Technology (ITB)                             3.1928603
## Airlangga University                                             -0.1277144
## IPB University (a.k.a. Bogor Agricultural University)             0.7778969
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)                0.1741560
##                                                       International.Faculty
## Universitas Indonesia                                             2.3370884
## Gadjah Mada University                                            0.5109637
## Bandung Institute of Technology (ITB)                             2.5691585
## Airlangga University                                              1.1843472
## IPB University (a.k.a. Bogor Agricultural University)             1.0017348
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)                0.7848824
##                                                       International.Students
## Universitas Indonesia                                              2.2851660
## Gadjah Mada University                                             0.6908641
## Bandung Institute of Technology (ITB)                              0.5314340
## Airlangga University                                               2.0725924
## IPB University (a.k.a. Bogor Agricultural University)              0.4251472
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)                 1.3285849
##                                                       International.Research.Network
## Universitas Indonesia                                                      2.0436339
## Gadjah Mada University                                                     2.1284079
## Bandung Institute of Technology (ITB)                                      1.0169261
## Airlangga University                                                       2.3356334
## IPB University (a.k.a. Bogor Agricultural University)                      0.3387339
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)                        -0.3300391
##                                                       Employment.Outcomes
## Universitas Indonesia                                           2.9778796
## Gadjah Mada University                                          2.5750043
## Bandung Institute of Technology (ITB)                           1.6139282
## Airlangga University                                            0.1431906
## IPB University (a.k.a. Bogor Agricultural University)           0.8712785
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)             -0.1431906
##                                                       Sustainability cluster
## Universitas Indonesia                                      1.9774282       1
## Gadjah Mada University                                     1.4801536       2
## Bandung Institute of Technology (ITB)                      2.3904189       1
## Airlangga University                                       0.5951735       2
## IPB University (a.k.a. Bogor Agricultural University)      2.1797094       2
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)         0.5783167       4
# Cluster assignment of every institution
km.res[["cluster"]]
##                                 Universitas Indonesia 
##                                                     1 
##                                Gadjah Mada University 
##                                                     2 
##                 Bandung Institute of Technology (ITB) 
##                                                     1 
##                                  Airlangga University 
##                                                     2 
## IPB University (a.k.a. Bogor Agricultural University) 
##                                                     2 
##    Institut Teknologi Sepuluh Nopember (ITS Surabaya) 
##                                                     4 
##                               Universitas Padjadjaran 
##                                                     4 
##                                 Diponegoro University 
##                                                     4 
##                                Universitas Brawijaya  
##                                                     3 
##                     Bina Nusantara University (BINUS) 
##                                                     4 
##                                     Telkom University 
##                                                     4 
##                                Universitas Hasanuddin 
##                                                     3 
##                             Universitas Sebelas Maret 
##                                                     3 
##                 Atma Jaya Catholic University Jakarta 
##                                                     3 
##                           Universitas Islam Indonesia 
##                                                     3 
##                   Universitas Muhammadiyah Yogyakarta 
##                                                     3 
##                      Universitas Pendidikan Indonesia 
##                                                     3 
##                            Universitas Sumatera Utara 
##                                                     3 
##                           Yogyakarta State University 
##                                                     3 
##                            State University of Malang 
##                                                     3 
##                                    Udayana University 
##                                                     3 
##                                   Universitas Andalas 
##                                                     3 
##                             Universitas Kristen Petra 
##                                                     3 
##                    Universitas Muhammadiyah Surakarta 
##                                                     3 
##                                 University of Lampung 
##                                                     3 
##                                 University of Mataram 
##                                                     3
# First four assignments only
head(km.res[["cluster"]], n = 4)
##                 Universitas Indonesia                Gadjah Mada University 
##                                     1                                     2 
## Bandung Institute of Technology (ITB)                  Airlangga University 
##                                     1                                     2
# Number of institutions in each cluster
km.res[["size"]]
## [1]  2  3 16  5
# Cluster centroids (on the standardized scale used for fitting)
km.res[["centers"]]
##   Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1          2.20342872           1.8835494       1.5335833             2.4381842
## 2          1.49778339           1.5357715       1.8272867             0.3754030
## 3         -0.52559677          -0.6205897      -0.6345348            -0.4484517
## 4         -0.09813186           0.3110043       0.3207060             0.2345301
##   International.Faculty International.Students International.Research.Network
## 1             2.4531234              1.4083000                      1.5302800
## 2             0.8990152              1.0628679                      1.6009250
## 3            -0.5847111             -0.6177920                     -0.5042968
## 4             0.3504169              0.7758936                      0.0410828
##   Employment.Outcomes Sustainability
## 1         2.295903891      2.1839236
## 2         1.196491140      1.4183455
## 3        -0.510875020     -0.5732110
## 4        -0.001456176      0.1096986
# Total within-cluster sum of squares
km.res[["tot.withinss"]]
## [1] 52.42117
############ visualizing k-means clusters ##############

# Plot the k-means partition of the standardized data.
fviz_cluster(
  km.res,
  data = df,
  palette = "jco",
  ellipse.type = "euclid",   # "euclid" outline type (see ggplot2::stat_ellipse)
  star.plot = TRUE,          # add segments from centroids to items
  repel = TRUE,              # avoid label overplotting (slow)
  ggtheme = theme_minimal()
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

######## Compute the mean of every feature in every cluster ########
# Convert the standardized centroids back to the original measurement units
# using the scaling attributes that scale() stored on df.
col_scale <- attr(df, "scaled:scale")
col_center <- attr(df, "scaled:center")
# Multiply each column by its scale, then add its center back.
centers_original <- sweep(km.res$centers, 2, col_scale, "*")
centers_original <- sweep(centers_original, 2, col_center, "+")
centers_original
##   Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1            48.50000            75.95000         55.7500              2.350000
## 2            38.33333            66.50000         62.0000              1.666667
## 3             9.18125             7.90625          9.6125              1.393750
## 4            15.34000            33.22000         29.9400              1.620000
##   International.Faculty International.Students International.Research.Network
## 1                 91.35                 5.4500                          31.75
## 2                 50.50                 4.8000                          32.50
## 3                 11.50                 1.6375                          10.15
## 4                 36.08                 4.2600                          15.94
##   Employment.Outcomes Sustainability
## 1              63.250       35.15000
## 2              40.600       26.06667
## 3               5.425        2.43750
## 4              15.920       10.54000
# Visualize the means of each cluster

library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Put the original-scale centroids in a data frame and tag each row with its
# cluster number
centers <- as.data.frame(centers_original)
centers$cluster <- factor(seq_len(nrow(centers)))

# Reshape to long format: one row per (cluster, variable) pair
centers_long <- pivot_longer(
  centers,
  cols = -cluster,
  names_to = "variable",
  values_to = "value"
)

# Grouped bar chart of the centroid values, bars dodged by cluster
ggplot(centers_long, aes(x = variable, y = value, fill = cluster)) +
  geom_col(position = "dodge") +
  labs(
    title = "Centroid Value per Variable",
    x = "Variable",
    y = "Center Value"
  ) +
  theme_minimal()

##########################################
############# Fuzzy K-Means ##############

#install.packages("cluster")  # run once
library(cluster)
# Fuzzy clustering of the standardized data into two clusters
res.fanny <- fanny(df, k = 2)

# Dunn's partition coefficient (raw and normalized)
res.fanny[["coeff"]]
## dunn_coeff normalized 
##  0.6349999  0.2699999
# Cluster label assigned to the first six institutions
head(res.fanny[["clustering"]])
##                                 Universitas Indonesia 
##                                                     1 
##                                Gadjah Mada University 
##                                                     1 
##                 Bandung Institute of Technology (ITB) 
##                                                     1 
##                                  Airlangga University 
##                                                     1 
## IPB University (a.k.a. Bogor Agricultural University) 
##                                                     1 
##    Institut Teknologi Sepuluh Nopember (ITS Surabaya) 
##                                                     1
library(factoextra)
# Scatter plot of the fuzzy partition with "norm"-type ellipses
fviz_cluster(
  res.fanny,
  ellipse.type = "norm",
  repel = TRUE,
  palette = "jco",
  ggtheme = theme_minimal(),
  legend = "right"
)

# Silhouette plot; also prints each cluster's size and average width
fviz_silhouette(
  res.fanny,
  palette = "jco",
  ggtheme = theme_minimal()
)
##   cluster size ave.sil.width
## 1       1    8          0.19
## 2       2   18          0.64

#################################
############ DBScan #############
# Compute DBSCAN using fpc package
#install.packages("fpc")  # run once

library("fpc")
# NOTE(review): eps = 0.15 looks very small for nine standardized variables;
# confirm that it yields any non-noise cluster (e.g. inspect db$cluster).
set.seed(123)
db <- fpc::dbscan(df, eps = 0.15, MinPts = 5)

# Draw the DBSCAN result as points only: no ellipses, no cluster centers,
# and no re-standardization (df is already scaled).
fviz_cluster(
  db,
  data = df,
  stand = FALSE,
  ellipse = FALSE,
  show.clust.cent = FALSE,
  geom = "point",
  palette = "jco",
  ggtheme = theme_classic()
)