Mapping and Clustering of Regional Welfare Characteristics in Java

Lecturer

Prof. Kismiantini, S.Si., M.Si., Ph.D.

Group B Members

Elsya Anggraini — 23031030013
Qolbu Salim — 23031030020
Inneda Berta Anggraini — 23031030022
Widha Zhariin Muslimah — 23031030023
Syifa Bima Pratama — 23031030033
Okta Mianda Br Sihotang — 23031030046

Library

library(psych)

## Warning: package 'psych' was built under R version 4.5.2

library(corrplot)

## corrplot 0.95 loaded

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidyverse)

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.2
## ✔ purrr     1.1.0     ✔ tibble    3.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(scatterplot3d)

## Warning: package 'scatterplot3d' was built under R version 4.5.2

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(sf)

## Warning: package 'sf' was built under R version 4.5.2

## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE

library(stringr)
library(RColorBrewer)
library(tibble)

Data

dat <- read.csv("D:/stat/IKK.csv", dec = ",")
dat <- dat[,2:7]
dat_mds <- dat[,2:6]
head(dat)

str(dat)

## 'data.frame':    119 obs. of  6 variables:
##  $ Wilayah: chr  " Pandeglang" " Lebak" " Tangerang" " Serang" ...
##  $ IKK    : num  1.21 1.35 1.06 0.56 0.72 0.42 0.77 0.32 2.32 2.11 ...
##  $ IPM    : num  65.8 64.7 73 67.8 78.9 ...
##  $ TPT    : num  9.24 8.55 7.88 10.61 7.16 ...
##  $ PPK    : int  8827 8854 12427 10916 14909 13185 13709 15997 10511 16002 ...
##  $ RLS    : num  7.13 6.59 8.92 7.78 10.84 ...

Summary Statistics

describe(dat)

summary(dat)

##    Wilayah               IKK             IPM             TPT        
##  Length:119         Min.   :0.320   Min.   :63.39   Min.   : 1.360  
##  Class :character   1st Qu.:0.930   1st Qu.:69.78   1st Qu.: 4.495  
##  Mode  :character   Median :1.330   Median :72.97   Median : 5.920  
##                     Mean   :1.425   Mean   :73.71   Mean   : 6.124  
##                     3rd Qu.:1.735   3rd Qu.:76.77   3rd Qu.: 7.830  
##                     Max.   :3.720   Max.   :87.69   Max.   :10.780  
##       PPK             RLS        
##  Min.   : 4173   Min.   : 6.120  
##  1st Qu.: 9974   1st Qu.: 7.755  
##  Median :12379   Median : 9.620  
##  Mean   :12546   Mean   :10.184  
##  3rd Qu.:15105   3rd Qu.:12.710  
##  Max.   :24221   Max.   :15.760

Exploratory Data Analysis

a) Histogram

dat_long <- dat %>% 
  pivot_longer(
    cols = c(IKK, IPM, TPT, PPK, RLS),
    names_to = "Variable",
    values_to = "Value"
  )

ggplot(dat_long, aes(x = Value, fill = Variable)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.9) +
  facet_wrap(~ Variable, scales = "free") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Distribution of Variables",
    x = "Value",
    y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    strip.text = element_text(face = "bold"),
    legend.position = "none"
  )

b) Boxplot

dat_long <- dat %>%
  pivot_longer(
    cols = c(IKK, IPM, TPT, PPK, RLS),
    names_to = "Variable",
    values_to = "Value"
  )

ggplot(dat_long, aes(x = Variable, y = Value, fill = Variable)) +
  geom_boxplot(alpha = 0.9, color = "black") +
  facet_wrap(~ Variable, scales = "free") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Boxplot of Variables",
    x = "Variable",
    y = "Value"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

c) Correlation

corr <- cor(dat[,-1])
corrplot(corr, method="number", type="lower", tl.col = "deepskyblue4")

Standardization

data_scaled <- scale(dat_mds)
head(data_scaled)

##             IKK        IPM       TPT        PPK         RLS
## [1,] -0.3177316 -1.4805002 1.3674590 -0.9910274 -1.14343261
## [2,] -0.1103909 -1.6931774 1.0646813 -0.9838324 -1.34563686
## [3,] -0.5398823 -0.1385636 0.7706799 -0.0316934 -0.47316297
## [4,] -1.2803848 -1.1210193 1.9686261 -0.4343472 -0.90003861
## [5,] -1.0434240  0.9775211 0.4547380  0.6297141  0.24578546
## [6,] -1.4877255  0.0458821 0.8672177  0.1702997  0.05855931

Method 1 : Multidimensional Scaling

Distance Matrix (D)

matriks_jarak <- as.matrix(dist(data_scaled))

B Matrix

A <- matriks_jarak^2
n <- nrow(matriks_jarak)
I <- diag(n)
J <- matrix(rep(1, n), nrow=n, ncol=n)
V <- I - (1/n) * J
 
aa <- V %*% A
BB <- aa %*% V          
B <- (-1/2) * BB

Eigen Values and Eigen Vector from B Matrix

eigen <- eigen(B)
nilai_eigen <- eigen$values
vektor_eigen <- eigen$vectors

Cummulative Variance from Eigen Values

cumulative_variance <- cumsum(nilai_eigen) / sum(nilai_eigen)

Coordinate Object

fit <- cmdscale(matriks_jarak, k = 3)
dat$MDS1 <- fit[, 1]
dat$MDS2 <- fit[, 2]
dat$MDS3 <- if (ncol(fit) >= 3) fit[, 3] else NA

Matrix Disparities

disparities <- as.matrix(dist(fit))
for (i in 1:n) {
  for (j in 1:n) {
    disparities[i, j] <- sqrt(sum((fit[i, ] - fit[j, ])^2))
    }
  }

STRESS for Each Dimensions

max_dim <- 5  
stress_values <- numeric(max_dim)
for (k in 1:max_dim) {
  fit_k <- cmdscale(matriks_jarak, k = k)  # MDS for dimensions k
  disparities_k <- as.matrix(dist(fit_k))  # 
stress_values[k] <- sqrt(sum((matriks_jarak - disparities_k)^2) / sum(matriks_jarak^2))  # Value of stress
}
stress_values

## [1] 4.810098e-01 2.701668e-01 1.332582e-01 4.216117e-02 7.910995e-16

Graph of STRESS

plot(1:max_dim, stress_values, type = "b", pch = 19, col = "blue",
     xlab = "Dimensi", ylab = "Nilai STRESS",
     main = "Grafik STRESS setiap Dimensi",
     ylim = c(min(stress_values) - 0.01, max(stress_values) + 0.01))
abline(h = 0.01, col = "red", lty = 2)

Visualization

2D

ggplot(dat, aes(x = MDS1, y = MDS2, label = Wilayah)) +
  geom_point(size = 3, color = "blue") +
  geom_text(vjust = -1, size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "MDS 2D Welfare Visualization",
       x = "Dimensi 1",
       y = "Dimensi 2") +
  theme_minimal()

stress_2d <- sqrt(sum((matriks_jarak - as.matrix(dist(fit[, 1:2])))^2) / sum(matriks_jarak^2))
stress_2d

## [1] 0.2701668

3D

plot_ly(dat, x = ~MDS1, y = ~MDS2, z = ~MDS3, 
        type = "scatter3d", mode = "markers+text",
        text = ~Wilayah) %>%
  layout(title = "MDS 3D Welfare Visualization",
         scene = list(xaxis = list(title = "Dimensi 1"),
                      yaxis = list(title = "Dimensi 2"),
                      zaxis = list(title = "Dimensi 3")))

stress_3d <- sqrt(sum((matriks_jarak - as.matrix(dist(fit[, 1:3])))^2) / sum(matriks_jarak^2))
stress_3d

## [1] 0.1332582

Method 2 : K means Clustering

Data for K-Means (Coordinate MDS 3 dimension)

data_kmeans_mds <- fit[, 1:3]
colnames(data_kmeans_mds) <- c("MDS1", "MDS2", "MDS3")

Find the Optimal Number of Clusters (k)

#Elbow Method
fviz_nbclust(data_kmeans_mds, kmeans, method = "wss") +
  geom_vline(xintercept = 5, linetype=2) +
  labs(subtitle="Elbow Method")

#Silhouette Method
fviz_nbclust(data_kmeans_mds, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method")

Get the Optimal Number of Clusters (k) = 5

set.seed(2025)
k_opt <- 5
kmeans_mds <- kmeans(data_kmeans_mds, centers = k_opt,  nstart = 25)
kmeans_mds$centers

##          MDS1         MDS2       MDS3
## 1  0.71898761 -1.468218434  0.1263411
## 2  0.09762834  0.429710178 -1.1160768
## 3  1.87027041  0.850923370 -0.1085968
## 4 -2.37875976 -0.009646127 -0.0361262
## 5 -0.03983325  0.779586895  1.0269064

Input the Cluster Result into Dataset

dat$Cluster_MDS <- as.factor(kmeans_mds$cluster)
head(dat$Cluster_MDS)

## [1] 1 1 1 1 4 1
## Levels: 1 2 3 4 5

Summary

table(dat$Cluster_MDS)

## 
##  1  2  3  4  5 
## 30 24 17 23 25

Visualization

ggplot

ggplot(dat, aes(x = MDS1, y = MDS2, color = Cluster_MDS, label = Wilayah)) +
  geom_point(size = 3) +
  geom_text(vjust = -1, size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(title = paste("Clustering Based Coordinate MDS 3D (k =", k_opt, ")"),
       x = "Dimensi 1",
       y = "Dimensi 2",
       color = "Cluster") +
  theme_minimal() +
  theme(legend.position = "right")

plotly

plot_ly(dat, x = ~MDS1, y = ~MDS2, z = ~MDS3,
        type = "scatter3d", mode = "markers+text",
        text = ~Wilayah,
        color = ~Cluster_MDS) %>%
  layout(
    title = paste("Cluster MDS 3D (k =", k_opt, ")"),
    scene = list(
      xaxis = list(title = "Dimensi 1"),
      yaxis = list(title = "Dimensi 2"),
      zaxis = list(title = "Dimensi 3")
    )
  )

regions by cluster

cluster_list_mds <- split(dat$Wilayah, dat$Cluster_MDS)

for (k in names(cluster_list_mds)) {
  cat("Cluster", k, ":\n")
  print(cluster_list_mds[[k]])
  cat("\n")
}

## Cluster 1 :
##  [1] " Pandeglang"      " Lebak"           " Tangerang"       " Serang"         
##  [5] "Cilegon"          "Kota Serang"      "Kepulauan Seribu" "Bogor"           
##  [9] "Sukabumi"         "Cianjur"          "Bandung"          "Garut"           
## [13] "Kuningan"         "Cirebon"          "Sumedang"         "Indramayu"       
## [17] "Subang"           "Purwakarta"       "Karawang"         "Bekasi"          
## [21] "Bandung Barat"    "Kota Bogor"       "Kota Sukabumi"    "Kota Cirebon"    
## [25] "Cimahi"           "Banjar"           "Cilacap"          "Batang"          
## [29] "Tegal"            "Brebes"          
## 
## Cluster 2 :
##  [1] "Bantul"          "Banyumas"        "Purbalingga"     "Purworejo"      
##  [5] "Magelang"        "Boyolali"        "Klaten"          "Sukoharjo"      
##  [9] "Wonogiri"        "Karanganyar"     "Sragen"          "Grobogan"       
## [13] "Blora"           "Rembang"         "Pati"            "Kudus"          
## [17] "Jepara"          "Demak"           "Semarang"        "Temanggung"     
## [21] "Kendal"          "Pekalongan"      "Kota Pekalongan" "Kota Tegal"     
## 
## Cluster 3 :
##  [1] "Kulonprogo"       "Gunungkidul"      "Tasikmalaya"      "Ciamis"          
##  [5] "Majalengka"       "Pangandaran"      "Kota Tasikmalaya" "Banjarnegara"    
##  [9] "Kebumen"          "Wonosobo"         "Pemalang"         "Probolinggo"     
## [13] "Tuban"            "Bangkalan"        "Sampang"          "Pamekasan"       
## [17] "Sumenep"         
## 
## Cluster 4 :
##  [1] "Kota Tangerang"    "Tangerang Selatan" "Sleman"           
##  [4] "Yogyakarta"        "Jakarta Selatan"   "Jakarta Timur"    
##  [7] "Jakarta Pusat"     "Jakarta Barat"     "Jakarta Utara"    
## [10] "Kota Bandung"      "Kota Bekasi"       "Depok"            
## [13] "Kota Magelang"     "Surakarta"         "Salatiga"         
## [16] "Kota Semarang"     "Sidoarjo"          "Kota Blitar"      
## [19] "Kota Malang"       "Kota Mojokerto"    "Kota Madiun"      
## [22] "Surabaya"          "Batu"             
## 
## Cluster 5 :
##  [1] "Pacitan"          "Ponorogo"         "Trenggalek"       "Tulungagung"     
##  [5] "Blitar"           "Kediri"           "Malang"           "Lumajang"        
##  [9] "Jember"           "Banyuwangi"       "Bondowoso"        "Situbondo"       
## [13] "Pasuruan"         "Mojokerto"        "Jombang"          "Nganjuk"         
## [17] "Madiun"           "Magetan"          "Ngawi"            "Bojonegoro"      
## [21] "Lamongan"         "Gresik"           "Kota Kediri"      "Kota Probolinggo"
## [25] "Kota Pasuruan"

clusters summary

dat$Cluster_MDS <- as.factor(kmeans_mds$cluster)
head(dat)

aggregate(dat[, c("IKK", "IPM", "TPT", "PPK", "RLS")], by = list(Cluster = dat$Cluster_MDS), FUN = mean)

mapping using GADM 2

indo <- st_read("D:/stat/gadm41_IDN_2.shp")

## Reading layer `gadm41_IDN_2' from data source `D:\stat\gadm41_IDN_2.shp' using driver `ESRI Shapefile'
## Simple feature collection with 502 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 95.00971 ymin: -11.00761 xmax: 141.0194 ymax: 6.076941
## Geodetic CRS:  WGS 84

jawa <- indo %>% 
  filter(NAME_1 %in% c("Banten","Jakarta Raya","Jawa Barat",
                       "Jawa Tengah","Jawa Timur","DI Yogyakarta"))

jawa <- jawa %>% 
  mutate(
    Wilayah_raw = NAME_2,
    Wilayah = NAME_2,
    Wilayah = str_replace(Wilayah, "Kabupaten ", "Kab "),
    Wilayah = str_replace(Wilayah, "Kota ", "Kota "),
    Wilayah = str_trim(Wilayah)
  )

dat <- dat %>% 
  mutate(
    Wilayah = str_replace(Wilayah, "Kab ", "Kab "),
    Wilayah = str_replace(Wilayah, "Kota ", "Kota "),
    Wilayah = str_trim(Wilayah)
  )

map_cluster <- jawa %>% left_join(dat, by = "Wilayah")

ggplot(map_cluster) +
  geom_sf(aes(fill = as.factor(Cluster_MDS)), 
          color = "white", 
          size = 0.2) +
  scale_fill_brewer(palette = "Set2", na.value = "grey80") +
  labs(
    title = "Cluster Map Kabupaten/Kota in Java and Madura ",
    subtitle = "Based on MDS Result",
    fill = "Cluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    legend.position = "right"
  )

Mapping and Clustering of Regional Welfare Characteristics in Java

Widha Zhariin

29 November 2025

Lecturer

Group B Members

Library

Data

Summary Statistics

Exploratory Data Analysis

a) Histogram

b) Boxplot

c) Correlation

Standardization

Method 1 : Multidimensional Scaling

Distance Matrix (D)

B Matrix

Eigen Values and Eigen Vector from B Matrix

Cummulative Variance from Eigen Values

Coordinate Object

Matrix Disparities

STRESS for Each Dimensions

Graph of STRESS

Visualization

2D

3D

Method 2 : K means Clustering

Data for K-Means (Coordinate MDS 3 dimension)

Find the Optimal Number of Clusters (k)

Get the Optimal Number of Clusters (k) = 5

Input the Cluster Result into Dataset

Summary

Visualization

ggplot

plotly

regions by cluster

clusters summary

mapping using GADM 2