#UnSupervised Learning
#K-Means Algorithm
#1. Specify the number of clusters (k) to be created (chosen by the analyst).
#2. Randomly select k objects from the dataset as the initial cluster centers (means).
#3. Assign each observation to its closest centroid, based on the Euclidean distance between the object and the centroid.
#4. For each of the k clusters, update the cluster centroid by
# calculating the new mean values of all the data points in the cluster.
# The centroid of the k-th cluster is a vector of length p containing the means of all variables for the observations
# in the k-th cluster; p is the number of variables.
#5. Iteratively minimize the total within-cluster sum of squares. That is, iterate steps 3 and 4
# until the cluster assignments stop changing or the maximum number of iterations is reached.
# By default, R's kmeans() uses 10 as the maximum number of iterations.
# Import the QS Indonesia data set.
# Download the QS data from:
# https://drive.google.com/file/d/1SQvdBX7ADhSB1pB5Q9QDczEAcJzADxUr/view?usp=sharing
# The CSV location can be overridden with the QS_DATA_PATH environment
# variable; when it is not set, the original hard-coded location is used.
data_path <- Sys.getenv(
  "QS_DATA_PATH",
  unset = "D:/DATA ANALYSIS/Data QS di Indonesia.csv"
)
# Fail early with a message that names the missing file.
if (!file.exists(data_path)) {
  stop("QS data file not found: ", data_path, call. = FALSE)
}
# The file is semicolon-delimited with a header row. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
data <- read.csv(file = data_path, header = TRUE, sep = ";")
# Show the column types of the imported data frame.
str(data)
## 'data.frame': 26 obs. of 13 variables:
## $ No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X2025.Rank : chr "206" "239" "256" "308" ...
## $ X2024.Rank : chr "237" "263" "281" "345" ...
## $ Institution.Name : chr "Universitas Indonesia" "Gadjah Mada University" "Bandung Institute of Technology (ITB)" "Airlangga University" ...
## $ Academic.Reputation : num 53.8 54.4 43.2 37.1 23.5 17.3 19 19.2 19.2 12.5 ...
## $ Employer.Reputation : num 79.2 73.5 72.7 79.1 46.9 48.9 41.6 41.7 42.4 21.8 ...
## $ Faculty.Student : num 55.7 57.6 55.8 57 71.4 47.8 41.4 19.3 11.5 19.9 ...
## $ Citations.per.Faculty : num 2.1 1.7 2.6 1.5 1.8 1.6 1.4 1.5 1.4 2.1 ...
## $ International.Faculty : num 88.3 40.3 94.4 58 53.2 47.5 34.8 27.9 13.7 34.7 ...
## $ International.Students : num 7.1 4.1 3.8 6.7 3.6 5.3 2.2 3 1.4 4.3 ...
## $ International.Research.Network: num 37.2 38.1 26.3 40.3 19.1 12 19.6 27.4 19.6 10.7 ...
## $ Employment.Outcomes : num 77.3 69 49.2 18.9 33.9 13 33.4 14.6 13.1 12.9 ...
## $ Sustainability : num 32.7 26.8 37.6 16.3 35.1 16.1 21.3 9.7 9 4.1 ...
# Keep only columns 5 through 13 (the numeric QS indicators).
df <- data[, 5:13]
# Label every row with the value stored in column 4 (the institution name).
row.names(df) <- data[[4]]
# Standardize each column; scale() returns a numeric matrix that carries the
# "scaled:center" and "scaled:scale" attributes used later in this script.
df <- scale(df)
# Count the missing values in the standardized matrix.
sum(is.na(df))
## [1] 0
# Inspect the structure of the standardized matrix.
str(df)
## num [1:26, 1:9] 2.571 2.613 1.836 1.412 0.468 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:26] "Universitas Indonesia" "Gadjah Mada University" "Bandung Institute of Technology (ITB)" "Airlangga University" ...
## ..$ : chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
## - attr(*, "scaled:center")= Named num [1:9] 16.75 24.77 23.12 1.54 26.87 ...
## ..- attr(*, "names")= chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
## - attr(*, "scaled:scale")= Named num [1:9] 14.408 27.173 21.28 0.331 26.285 ...
## ..- attr(*, "names")= chr [1:9] "Academic.Reputation" "Employer.Reputation" "Faculty.Student" "Citations.per.Faculty" ...
# Preview the first five rows of the standardized matrix
head(df, n = 5L)
## Academic.Reputation
## Universitas Indonesia 2.5712897
## Gadjah Mada University 2.6129344
## Bandung Institute of Technology (ITB) 1.8355677
## Airlangga University 1.4121805
## IPB University (a.k.a. Bogor Agricultural University) 0.4682353
## Employer.Reputation
## Universitas Indonesia 2.0031556
## Gadjah Mada University 1.7933848
## Bandung Institute of Technology (ITB) 1.7639432
## Airlangga University 1.9994754
## IPB University (a.k.a. Bogor Agricultural University) 0.8144543
## Faculty.Student
## Universitas Indonesia 1.531234
## Gadjah Mada University 1.620520
## Bandung Institute of Technology (ITB) 1.535933
## Airlangga University 1.592324
## IPB University (a.k.a. Bogor Agricultural University) 2.269017
## Citations.per.Faculty
## Universitas Indonesia 1.6835081
## Gadjah Mada University 0.4760264
## Bandung Institute of Technology (ITB) 3.1928603
## Airlangga University -0.1277144
## IPB University (a.k.a. Bogor Agricultural University) 0.7778969
## International.Faculty
## Universitas Indonesia 2.3370884
## Gadjah Mada University 0.5109637
## Bandung Institute of Technology (ITB) 2.5691585
## Airlangga University 1.1843472
## IPB University (a.k.a. Bogor Agricultural University) 1.0017348
## International.Students
## Universitas Indonesia 2.2851660
## Gadjah Mada University 0.6908641
## Bandung Institute of Technology (ITB) 0.5314340
## Airlangga University 2.0725924
## IPB University (a.k.a. Bogor Agricultural University) 0.4251472
## International.Research.Network
## Universitas Indonesia 2.0436339
## Gadjah Mada University 2.1284079
## Bandung Institute of Technology (ITB) 1.0169261
## Airlangga University 2.3356334
## IPB University (a.k.a. Bogor Agricultural University) 0.3387339
## Employment.Outcomes
## Universitas Indonesia 2.9778796
## Gadjah Mada University 2.5750043
## Bandung Institute of Technology (ITB) 1.6139282
## Airlangga University 0.1431906
## IPB University (a.k.a. Bogor Agricultural University) 0.8712785
## Sustainability
## Universitas Indonesia 1.9774282
## Gadjah Mada University 1.4801536
## Bandung Institute of Technology (ITB) 2.3904189
## Airlangga University 0.5951735
## IPB University (a.k.a. Bogor Agricultural University) 2.1797094
#loading package for clustering
#install.packages("factoextra") *run once
library(factoextra)
## Loading required package: ggplot2
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
############ Number of cluster ############
library(NbClust)
# Elbow method: total within-cluster sum of squares for each candidate k.
# kmeans() starts from random centers, so the seed is fixed and nstart = 25
# (forwarded by fviz_nbclust() to kmeans()) keeps the best of 25 starts.
# Without both, the curve can change from one run to the next.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss", nstart = 25) +
  geom_vline(xintercept = 4, linetype = 2) +  # dashed reference line at k = 4
  labs(subtitle = "Elbow method")

# Silhouette method: average silhouette width for each candidate k.
# kmeans() starts from random centers, so the seed is fixed and nstart = 25
# (forwarded by fviz_nbclust() to kmeans()) keeps the best of 25 starts,
# making the curve reproducible.
set.seed(123)
fviz_nbclust(df, kmeans, method = "silhouette", nstart = 25) +
  labs(subtitle = "Silhouette method")

# Gap statistic
# nboot = 50 bootstrap samples keeps the run fast;
# nboot = 500 is the recommended value for a real analysis.
# Pass verbose = FALSE to hide the computing progress.
set.seed(123)
fviz_nbclust(
  df,
  FUNcluster = kmeans,
  method = "gap_stat",
  nboot = 50,
  nstart = 25
) +
  labs(subtitle = "Gap statistic method")

# Summary of the number of clusters proposed by the NbClust indices
# (k-means, Euclidean distance, candidate k from 2 to 10)
nb <- NbClust(
  data = df,
  distance = "euclidean",
  method = "kmeans",
  min.nc = 2,
  max.nc = 10
)
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Summarize how many indices proposed each number of clusters
fviz_nbclust(x = nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 11 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

# Run k-means with k = 4 centers, keeping the best of 25 random starts
set.seed(123)
km.res <- kmeans(df, centers = 4, nstart = 25)
# Print the fitted clustering
print(km.res)
## K-means clustering with 4 clusters of sizes 2, 3, 16, 5
##
## Cluster means:
## Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1 2.20342872 1.8835494 1.5335833 2.4381842
## 2 1.49778339 1.5357715 1.8272867 0.3754030
## 3 -0.52559677 -0.6205897 -0.6345348 -0.4484517
## 4 -0.09813186 0.3110043 0.3207060 0.2345301
## International.Faculty International.Students International.Research.Network
## 1 2.4531234 1.4083000 1.5302800
## 2 0.8990152 1.0628679 1.6009250
## 3 -0.5847111 -0.6177920 -0.5042968
## 4 0.3504169 0.7758936 0.0410828
## Employment.Outcomes Sustainability
## 1 2.295903891 2.1839236
## 2 1.196491140 1.4183455
## 3 -0.510875020 -0.5732110
## 4 -0.001456176 0.1096986
##
## Clustering vector:
## Universitas Indonesia
## 1
## Gadjah Mada University
## 2
## Bandung Institute of Technology (ITB)
## 1
## Airlangga University
## 2
## IPB University (a.k.a. Bogor Agricultural University)
## 2
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)
## 4
## Universitas Padjadjaran
## 4
## Diponegoro University
## 4
## Universitas Brawijaya
## 3
## Bina Nusantara University (BINUS)
## 4
## Telkom University
## 4
## Universitas Hasanuddin
## 3
## Universitas Sebelas Maret
## 3
## Atma Jaya Catholic University Jakarta
## 3
## Universitas Islam Indonesia
## 3
## Universitas Muhammadiyah Yogyakarta
## 3
## Universitas Pendidikan Indonesia
## 3
## Universitas Sumatera Utara
## 3
## Yogyakarta State University
## 3
## State University of Malang
## 3
## Udayana University
## 3
## Universitas Andalas
## 3
## Universitas Kristen Petra
## 3
## Universitas Muhammadiyah Surakarta
## 3
## University of Lampung
## 3
## University of Mataram
## 3
##
## Within cluster sum of squares by cluster:
## [1] 4.545581 12.425822 20.725456 14.724307
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Per-cluster mean of every indicator. df is the standardized matrix, so
# these means are in standardized units, not the original units.
aggregate(df, by = list(km.res$cluster), FUN = mean)
## Group.1 Academic.Reputation Employer.Reputation Faculty.Student
## 1 1 2.20342872 1.8835494 1.5335833
## 2 2 1.49778339 1.5357715 1.8272867
## 3 3 -0.52559677 -0.6205897 -0.6345348
## 4 4 -0.09813186 0.3110043 0.3207060
## Citations.per.Faculty International.Faculty International.Students
## 1 2.4381842 2.4531234 1.4083000
## 2 0.3754030 0.8990152 1.0628679
## 3 -0.4484517 -0.5847111 -0.6177920
## 4 0.2345301 0.3504169 0.7758936
## International.Research.Network Employment.Outcomes Sustainability
## 1 1.5302800 2.295903891 2.1839236
## 2 1.6009250 1.196491140 1.4183455
## 3 -0.5042968 -0.510875020 -0.5732110
## 4 0.0410828 -0.001456176 0.1096986
# Append the cluster assignment as an extra column and preview the result
dd <- cbind(df, cluster = km.res[["cluster"]])
head(dd)
## Academic.Reputation
## Universitas Indonesia 2.5712897
## Gadjah Mada University 2.6129344
## Bandung Institute of Technology (ITB) 1.8355677
## Airlangga University 1.4121805
## IPB University (a.k.a. Bogor Agricultural University) 0.4682353
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 0.0379073
## Employer.Reputation
## Universitas Indonesia 2.0031556
## Gadjah Mada University 1.7933848
## Bandung Institute of Technology (ITB) 1.7639432
## Airlangga University 1.9994754
## IPB University (a.k.a. Bogor Agricultural University) 0.8144543
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 0.8880581
## Faculty.Student
## Universitas Indonesia 1.531234
## Gadjah Mada University 1.620520
## Bandung Institute of Technology (ITB) 1.535933
## Airlangga University 1.592324
## IPB University (a.k.a. Bogor Agricultural University) 2.269017
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 1.159993
## Citations.per.Faculty
## Universitas Indonesia 1.6835081
## Gadjah Mada University 0.4760264
## Bandung Institute of Technology (ITB) 3.1928603
## Airlangga University -0.1277144
## IPB University (a.k.a. Bogor Agricultural University) 0.7778969
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 0.1741560
## International.Faculty
## Universitas Indonesia 2.3370884
## Gadjah Mada University 0.5109637
## Bandung Institute of Technology (ITB) 2.5691585
## Airlangga University 1.1843472
## IPB University (a.k.a. Bogor Agricultural University) 1.0017348
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 0.7848824
## International.Students
## Universitas Indonesia 2.2851660
## Gadjah Mada University 0.6908641
## Bandung Institute of Technology (ITB) 0.5314340
## Airlangga University 2.0725924
## IPB University (a.k.a. Bogor Agricultural University) 0.4251472
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 1.3285849
## International.Research.Network
## Universitas Indonesia 2.0436339
## Gadjah Mada University 2.1284079
## Bandung Institute of Technology (ITB) 1.0169261
## Airlangga University 2.3356334
## IPB University (a.k.a. Bogor Agricultural University) 0.3387339
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) -0.3300391
## Employment.Outcomes
## Universitas Indonesia 2.9778796
## Gadjah Mada University 2.5750043
## Bandung Institute of Technology (ITB) 1.6139282
## Airlangga University 0.1431906
## IPB University (a.k.a. Bogor Agricultural University) 0.8712785
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) -0.1431906
## Sustainability cluster
## Universitas Indonesia 1.9774282 1
## Gadjah Mada University 1.4801536 2
## Bandung Institute of Technology (ITB) 2.3904189 1
## Airlangga University 0.5951735 2
## IPB University (a.k.a. Bogor Agricultural University) 2.1797094 2
## Institut Teknologi Sepuluh Nopember (ITS Surabaya) 0.5783167 4
# Cluster number assigned to each observation
km.res[["cluster"]]
## Universitas Indonesia
## 1
## Gadjah Mada University
## 2
## Bandung Institute of Technology (ITB)
## 1
## Airlangga University
## 2
## IPB University (a.k.a. Bogor Agricultural University)
## 2
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)
## 4
## Universitas Padjadjaran
## 4
## Diponegoro University
## 4
## Universitas Brawijaya
## 3
## Bina Nusantara University (BINUS)
## 4
## Telkom University
## 4
## Universitas Hasanuddin
## 3
## Universitas Sebelas Maret
## 3
## Atma Jaya Catholic University Jakarta
## 3
## Universitas Islam Indonesia
## 3
## Universitas Muhammadiyah Yogyakarta
## 3
## Universitas Pendidikan Indonesia
## 3
## Universitas Sumatera Utara
## 3
## Yogyakarta State University
## 3
## State University of Malang
## 3
## Udayana University
## 3
## Universitas Andalas
## 3
## Universitas Kristen Petra
## 3
## Universitas Muhammadiyah Surakarta
## 3
## University of Lampung
## 3
## University of Mataram
## 3
# Cluster number of the first four observations
head(km.res[["cluster"]], n = 4)
## Universitas Indonesia Gadjah Mada University
## 1 2
## Bandung Institute of Technology (ITB) Airlangga University
## 1 2
# Number of observations in each cluster
km.res[["size"]]
## [1] 2 3 16 5
# Cluster centers (means) in standardized units
km.res[["centers"]]
## Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1 2.20342872 1.8835494 1.5335833 2.4381842
## 2 1.49778339 1.5357715 1.8272867 0.3754030
## 3 -0.52559677 -0.6205897 -0.6345348 -0.4484517
## 4 -0.09813186 0.3110043 0.3207060 0.2345301
## International.Faculty International.Students International.Research.Network
## 1 2.4531234 1.4083000 1.5302800
## 2 0.8990152 1.0628679 1.6009250
## 3 -0.5847111 -0.6177920 -0.5042968
## 4 0.3504169 0.7758936 0.0410828
## Employment.Outcomes Sustainability
## 1 2.295903891 2.1839236
## 2 1.196491140 1.4183455
## 3 -0.510875020 -0.5732110
## 4 -0.001456176 0.1096986
# Total within-cluster sum of squares
km.res[["tot.withinss"]]
## [1] 52.42117
############ visualizing k-means clusters ##############
fviz_cluster(
  km.res,
  data = df,
  palette = "jco",
  ellipse.type = "euclid",  # Euclidean-distance circle around each cluster
  star.plot = TRUE,         # add segments from centroids to items
  repel = TRUE,             # avoid label overplotting (slow)
  ggtheme = theme_minimal()
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

######## Compute the mean of every feature in every cluster ########
# Convert the centroids from standardized units back to the original units:
# multiply each column by its scaling factor, then add back its center.
centers_original <- sweep(
  sweep(km.res$centers, 2, attr(df, "scaled:scale"), "*"),
  2, attr(df, "scaled:center"), "+"
)
centers_original
## Academic.Reputation Employer.Reputation Faculty.Student Citations.per.Faculty
## 1 48.50000 75.95000 55.7500 2.350000
## 2 38.33333 66.50000 62.0000 1.666667
## 3 9.18125 7.90625 9.6125 1.393750
## 4 15.34000 33.22000 29.9400 1.620000
## International.Faculty International.Students International.Research.Network
## 1 91.35 5.4500 31.75
## 2 50.50 4.8000 32.50
## 3 11.50 1.6375 10.15
## 4 36.08 4.2600 15.94
## Employment.Outcomes Sustainability
## 1 63.250 35.15000
## 2 40.600 26.06667
## 3 5.425 2.43750
## 4 15.920 10.54000
# Visualize the means of each cluster
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Collect the original-scale centroids in a data frame, one row per cluster
centers <- as.data.frame(centers_original)
# seq_len() instead of 1:nrow() so a zero-row input cannot yield c(1, 0)
centers$cluster <- factor(seq_len(nrow(centers)))
# Reshape to long format: one row per (cluster, variable) pair
centers_long <- centers %>%
  pivot_longer(-cluster, names_to = "variable", values_to = "value")
# Grouped bar chart of the centroid value of each variable, by cluster;
# geom_col() plots the values as given (same as geom_bar(stat = "identity"))
ggplot(centers_long, aes(x = variable, y = value, fill = cluster)) +
  geom_col(position = "dodge") +
  labs(
    title = "Centroid Value per Variable",
    x = "Variable",
    y = "Center Value"
  ) +
  theme_minimal()

##########################################
############# Fuzzy K-Means ##############
#install.packages("cluster") *run once
library(cluster)
# Fuzzy clustering of the standardized data into two clusters
res.fanny <- fanny(df, k = 2)
# Dunn's partition coefficient and its normalized version
res.fanny[["coeff"]]
## dunn_coeff normalized
## 0.6349999 0.2699999
# Cluster assigned to the first six observations
head(res.fanny[["clustering"]])
## Universitas Indonesia
## 1
## Gadjah Mada University
## 1
## Bandung Institute of Technology (ITB)
## 1
## Airlangga University
## 1
## IPB University (a.k.a. Bogor Agricultural University)
## 1
## Institut Teknologi Sepuluh Nopember (ITS Surabaya)
## 1
library(factoextra)
# Cluster plot of the fuzzy clustering with "norm"-type ellipses
fviz_cluster(
  res.fanny,
  ellipse.type = "norm",
  repel = TRUE,
  palette = "jco",
  ggtheme = theme_minimal(),
  legend = "right"
)

# Silhouette plot of the fuzzy clustering
fviz_silhouette(
  res.fanny,
  palette = "jco",
  ggtheme = theme_minimal()
)
## cluster size ave.sil.width
## 1 1 8 0.19
## 2 2 18 0.64

#################################
############ DBScan #############
# Density-based clustering with dbscan() from the fpc package
#install.packages("fpc") *run once
library("fpc")
set.seed(123)
# NOTE(review): eps = 0.15 looks very small for nine standardized
# variables; confirm it does not label every observation as noise.
db <- fpc::dbscan(data = df, eps = 0.15, MinPts = 5)
# Plot the result as points only, without ellipses or cluster centers;
# stand = FALSE because df was already standardized earlier in the script.
fviz_cluster(
  db,
  data = df,
  stand = FALSE,
  ellipse = FALSE,
  show.clust.cent = FALSE,
  geom = "point",
  palette = "jco",
  ggtheme = theme_classic()
)
