Library

library(readxl)

## Warning: package 'readxl' was built under R version 4.0.5

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 4.0.5

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.0.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.0.5

library(ggplot2)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(mvnormtest)

## Warning: package 'mvnormtest' was built under R version 4.0.3

library(tidyverse)  # data manipulation

## Warning: package 'tidyverse' was built under R version 4.0.5

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.6     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'purrr' was built under R version 4.0.5

## Warning: package 'stringr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()    masks ggplot2::%+%()
## x psych::alpha()  masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization

## Warning: package 'factoextra' was built under R version 4.0.5

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dendextend) # for comparing two dendrograms

## 
## ---------------------
## Welcome to dendextend version 1.16.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(stats)
library(fpc)

## Warning: package 'fpc' was built under R version 4.0.5

library(clustertend)

## Warning: package 'clustertend' was built under R version 4.0.5

## Package `clustertend` is deprecated.  Use package `hopkins` instead.

library(cluster)
library(ClusterR)

## Warning: package 'ClusterR' was built under R version 4.0.5

## Loading required package: gtools

## Warning: package 'gtools' was built under R version 4.0.5

## 
## Attaching package: 'gtools'

## The following object is masked from 'package:psych':
## 
##     logit

#import data
data1<- read_excel("D:/My Documents/Document to Go to/EFFORT/COLLEGE/ACADEMIC/SEMESTER 7/Kerja Praktik/Paper/Clustering/cluster_data_fix.xlsx")
data <- data1[,c(3:18)]
colnames(data1)

##  [1] "ID"                                  
##  [2] "kabupaten_kota"                      
##  [3] "volume_usaha"                        
##  [4] "shu"                                 
##  [5] "pertanian_kehutanan_dan_perikanan"   
##  [6] "pertambangan_dan_penggalian_"        
##  [7] "industri_pengolahan"                 
##  [8] "konstruksi"                          
##  [9] "penyediaan_akomodasi_dan_makan_minum"
## [10] "lainnya"                             
## [11] "mikro__kuk_1"                        
## [12] "kecil__kuk_2"                        
## [13] "menengah__kuk_3"                     
## [14] "besar__kuk_4"                        
## [15] "S1/lebih"                            
## [16] "kurang dari S1"                      
## [17] "21_PDRB"                             
## [18] "proporsi_SDM"

data2 <- scale(data)
view(data)

Checking the assumption

There are two assumptions before we conduct a clustering analysis:

1. The data sample have been adequate (assumption of adequacy.
2. There is correlation between variables (multicolinearity).

To check the adequity, we can use KMO (Kaiser Meyer Olkin). If the KMO value is greater than 0.5, it is assumed that the sample represents the population so that the next stage of the assumption test can be carried out.

#adequity
KMO(cor(data2)) #KMO test must be accepted (do not reject H0 or p-value > 0.05)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(data2))
## Overall MSA =  0.53
## MSA for each item = 
##                         volume_usaha                                  shu 
##                                 0.57                                 0.77 
##    pertanian_kehutanan_dan_perikanan         pertambangan_dan_penggalian_ 
##                                 0.14                                 0.35 
##                  industri_pengolahan                           konstruksi 
##                                 0.19                                 0.50 
## penyediaan_akomodasi_dan_makan_minum                              lainnya 
##                                 0.15                                 0.47 
##                         mikro__kuk_1                         kecil__kuk_2 
##                                 0.45                                 0.49 
##                      menengah__kuk_3                         besar__kuk_4 
##                                 0.90                                 0.75 
##                             S1/lebih                       kurang dari S1 
##                                 0.78                                 0.68 
##                              21_PDRB                         proporsi_SDM 
##                                 0.94                                 0.51

The second assumption that should be checked is multicolinearity. To conduct clustering analysis, there must be correlation between variables. Multicolinearity test obtained using Bartlett Sphericity with the hypothesis written below:

H0: There is no correlation between variables H1: There is correlation between variables

We can continue to cluster the data when the p-value < 0.05 so we can reject H0 because there is correlation between variables.

#bartlett
cortest.bartlett(cor(data),38) #must be rejected

## $chisq
## [1] 758.9948
## 
## $p.value
## [1] 1.953445e-93
## 
## $df
## [1] 120

K-Means Clustering

Before we cluster the data using K-Means method, we should define the number of k or how many clusters should we form.

We can obtain the optimum cluster depending to the data condition based on the graphic (elbow, silhouette, and gap statistic method) and we can validate the result numerically using Calinski-Harabaz formula.

Graphically

fviz_nbclust(scale(data), kmeans, method="wss")

fviz_nbclust(scale(data), kmeans, method= "silhouette")

fviz_nbclust(scale(data), kmeans, method= "gap_stat")

Numerically

After we get the graphic result, we can validate it numerically using this Calinski-Harabaz (CH) function. We can conclude that the total cluster will be more optimum when the index of Calinski-Harabaz is greater.

# k = 2
km <- kmeans(data2,2)
round(calinhara(data2,km$cluster),digits=2)

## [1] 17.18

# k = 3
km <- kmeans(data2,3)
round(calinhara(data2,km$cluster),digits=2)

## [1] 12.13

We can see that the CH index from k = 2 is greater than k = 3. So, we can conclude that the optimum total cluster should be formed is 2.

Cluster Distribution

fviz_cluster(km, scale(data), repel = FALSE, geom = 
"point",show.clust.cent = TRUE,
ggtheme = theme_minimal(),
main = "",xlab="",ylab="",
alpha = 0)+scale_fill_manual(values = c("darkgreen", 
"brown","red"))

The two clusters formed are not overlapping so it indicates that the clusters tend to be well formed.

Obtained the member and characteristic for each member of the 2 clusters

# member of cluster
hasil=data.frame(data1[,2], km$cluster)
hasil

##      kabupaten_kota km.cluster
## 1           Pacitan          3
## 2          Ponorogo          3
## 3        Trenggalek          3
## 4       Tulungagung          2
## 5            Blitar          2
## 6            Kediri          2
## 7            Malang          1
## 8          Lumajang          3
## 9            Jember          1
## 10       Banyuwangi          2
## 11        Bondowoso          3
## 12        Situbondo          3
## 13      Probolinggo          3
## 14         Pasuruan          1
## 15         Sidoarjo          1
## 16        Mojokerto          2
## 17          Jombang          2
## 18          Nganjuk          3
## 19           Madiun          3
## 20          Magetan          2
## 21            Ngawi          3
## 22       Bojonegoro          2
## 23            Tuban          2
## 24         Lamongan          2
## 25           Gresik          2
## 26        Bangkalan          3
## 27          Sampang          3
## 28        Pamekasan          3
## 29          Sumenep          2
## 30      Kota Kediri          3
## 31      Kota Blitar          3
## 32      Kota Malang          1
## 33 Kota Probolinggo          3
## 34    Kota Pasuruan          3
## 35   Kota Mojokerto          3
## 36      Kota Madiun          3
## 37    Kota Surabaya          1
## 38        Kota Batu          3

head(hasil)

##   kabupaten_kota km.cluster
## 1        Pacitan          3
## 2       Ponorogo          3
## 3     Trenggalek          3
## 4    Tulungagung          2
## 5         Blitar          2
## 6         Kediri          2

write.csv(hasil, "group_kmeans.csv")

When we run the code above, we will obtain the members of cluster 1 and cluster 2 and to know the characteristic for each cluster, we can run the code below.

#characteristic of cluster
summ <- data%>%
 mutate(cluster=hasil$km.cluster)%>%
 group_by(cluster)%>%
 summarise_all("median")
final <- data.frame(summ)
write.csv(final,"kmeans_characteristic.csv")

After we know the members of each cluster, we can make e thematic map to visualize the result in an easier way.

The cluster result

Clustering analysis of cooperative’s contribution for each district/city in East Java

Andrea Ernest & Wahidatul Wardah A.M.

2023-03-11