Data Enthusiast | Statistics and Data Science | IPB University
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(readxl)
library(factoextra)
library(cluster)
library(ggplot2)
library(tidyr)
library(ggiraphExtra)
library(spdep)## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
setwd("C:/Users/Deden Ahmad Rabani/OneDrive - apps.ipb.ac.id/Semester 4/Gammaphics")
# Membaca file Excel
datak <- read_excel("ikp.xlsx", sheet = "IKP22")
# Menampilkan beberapa baris pertama dari data
head(datak)## # A tibble: 6 × 5
## PROVINSI IK IA IP IKP
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ACEH 80.2 69.3 66.2 70.2
## 2 SUMATERA UTARA 80.2 75.0 64.4 71.2
## 3 SUMATERA BARAT 87.5 84.2 71.8 79.4
## 4 RIAU 38.8 86.5 69.8 67.6
## 5 JAMBI 58.5 82 67.0 69.5
## 6 SUMATERA SELATAN 81.6 74.1 61.3 69.6
## tibble [34 × 5] (S3: tbl_df/tbl/data.frame)
## $ PROVINSI: chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
## $ IK : num [1:34] 80.2 80.2 87.5 38.8 58.5 ...
## $ IA : num [1:34] 69.3 75 84.2 86.5 82 ...
## $ IP : num [1:34] 66.2 64.4 71.8 69.8 67 ...
## $ IKP : num [1:34] 70.2 71.2 79.5 67.6 69.5 ...
## tibble [34 × 5] (S3: tbl_df/tbl/data.frame)
## $ PROVINSI: chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
## $ IK : num [1:34] 80.2 80.2 87.5 38.8 58.5 ...
## $ IA : num [1:34] 69.3 75 84.2 86.5 82 ...
## $ IP : num [1:34] 66.2 64.4 71.8 69.8 67 ...
## $ IKP : num [1:34] 70.2 71.2 79.5 67.6 69.5 ...
## # A tibble: 34 × 3
## IK IA IP
## <dbl> <dbl> <dbl>
## 1 80.2 69.3 66.2
## 2 80.2 75.0 64.4
## 3 87.5 84.2 71.8
## 4 38.8 86.5 69.8
## 5 58.5 82 67.0
## 6 81.6 74.1 61.3
## 7 69.9 75.2 61.5
## 8 94.9 79.6 67.7
## 9 36.4 92.5 77.5
## 10 5.83 88.9 72.5
## # ℹ 24 more rows
Bisa Subjektif (Tentuin mau berapa klaster) atau sesuai teori optimal:
Hint: Lihat k yang nilai silhouette paling tinggi
# Load required libraries
library(cluster)
library(ggplot2)
# Compute distance matrix
dist_matrix <- dist(data)
# Initialize silhouette width vector
sil_width <- c(0) # set silhouette width for k = 1 as 0
# Loop over several k values
for(k in 2:10) {
# Perform k-means clustering
cluster <- kmeans(data, centers=k)
# Compute silhouette widths
sil <- silhouette(cluster$cluster, dist_matrix)
# Compute and store the average silhouette width
sil_width[k] <- mean(sil[, 3])
}
# Create a data frame for plotting
sil_df <- data.frame(
k = 1:10,
silhouette_width = sil_width[1:10]
)
# Create a ggplot
ggplot(sil_df, aes(x=k, y=silhouette_width)) +
geom_line(color = "darkblue", size = 1.2) +
geom_point(color = "red", size = 4) +
labs(title = "Silhouette Method to Determine Optimal k",
x = "Number of Clusters",
y = "Average Silhouette Width") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, color = "darkblue", size = 14, face = "bold.italic"),
axis.title.x = element_text(color = "darkblue", size = 12, face = "bold"),
axis.title.y = element_text(color = "darkblue", size = 12, face = "bold"),
axis.text.x = element_text(color = "black", size = 10),
axis.text.y = element_text(color = "black", size = 10)
) + scale_x_continuous(breaks = 1:10)## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Hint: Lihat k yang nilai WSS paling membuat sudut tajam
# Menghitung matriks jarak
d <- dist(data)
# Menghitung jumlah kluster k optimal menggunakan metode elbow
wss <- c(NA)
for(i in 1:10) {
km.res <- kmeans(data, centers = i)
wss[i] <- km.res$tot.withinss
}
# Membuat dataframe untuk plotting
df <- data.frame(
k = 1:10,
wss = wss
)
# Membuat plot dengan ggplot2
ggplot(df, aes(x = k, y = wss)) +
geom_line(color = "darkblue", size = 1.2) +
geom_point(color = "red", size = 4) +
labs(title = "Elbow Method to Determine Optimal k",
x = "Number of Clusters",
y = "Total Within Sum of Squares") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, color = "darkblue", size = 14, face = "bold.italic"),
axis.title.x = element_text(color = "darkblue", size = 12, face = "bold"),
axis.title.y = element_text(color = "darkblue", size = 12, face = "bold"),
axis.text.x = element_text(color = "black", size = 10),
axis.text.y = element_text(color = "black", size = 10)
) + scale_x_continuous(breaks = 1:10)set.seed(123)
# Melakukan klustering k-means dengan k = 3 pada data yang telah distandarisasi
km.res <- eclust(data, FUNcluster = "kmeans", k = 4, graph = FALSE)
# Menampilkan hasil klustering
print(km.res)## K-means clustering with 4 clusters of sizes 5, 11, 11, 7
##
## Cluster means:
## IK IA IP
## 1 9.96400 74.44800 62.55400
## 2 89.12364 82.51909 73.65727
## 3 77.58000 77.19000 65.02182
## 4 46.45857 85.83857 70.83571
##
## Clustering vector:
## [1] 3 3 2 4 4 3 3 2 4 1 1 2 2 2 2 3 2 2 3 3 4 2 4 4 3 3 2 3 2 3 4 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2325.3017 808.2114 1092.8916 1002.1799
## (between_SS / total_SS = 83.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "silinfo"
## [11] "nbclust" "data"
set.seed(123)
factoextra::fviz_cluster(km.res, data = data,
geom = "point",
ellipse.type = "convex",
pointsize = 2,
palette = "jco",
ggtheme = theme_minimal())# Ambil data centeroids
centers <- km.res$centers
# Konvert data centeroids ke data frame
centers_df <- as.data.frame(centers)
# Tambahkan Kolom Kluster
centers_df$cluster <- rownames(centers_df)
# Konversi Data Wide ke Long
library(tidyr)
centers_long <- gather(centers_df, key = "variable", value = "value", -cluster)## IK IA IP cluster
## 1 9.96400 74.44800 62.55400 1
## 2 89.12364 82.51909 73.65727 2
## 3 77.58000 77.19000 65.02182 3
## 4 46.45857 85.83857 70.83571 4
Dari hasil diatas, bisa dilihat variabel apa yang tinggi/rendah disetiap klasternya. Visualisasinya kayak dibawah. (Paling Rekomendasi: Radar Chart)
# Plot the cluster profiles
library(ggplot2)
ggplot(centers_long, aes(x = cluster, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Cluster", y = "Mean Value", fill = "Variable") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Profil Klaster Demografi di Indonesia")library(ggplot2)
# Melt the data for easier plotting
melted_data <- reshape2::melt(centers_df, id.vars = "cluster")
# Plotting
ggplot(melted_data, aes(x = factor(cluster), y = value, fill = factor(cluster))) +
geom_bar(stat = "identity") +
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
labs(
title = "Perbandingan Cluster",
x = "Cluster",
y = "Nilai",
fill = "Cluster"
) +
theme(legend.position = "none") +
scale_fill_brewer(palette = "Set1")# Add the cluster assignments to your data
data$Cluster <- km.res$cluster
# Load the 'ggiraphExtra' library for the ggRadar function
library(ggiraphExtra)
# Radar Plot
old_par <- par()
par(oma = c(10, 4, 4, 8))
ggRadar(
data = data,
mapping = aes(colours = Cluster),
) +
theme_light() +
theme(
text = element_text(size = 10), # Mengubah ukuran font global
title = element_text(size = 12), # Mengubah ukuran font judul
axis.text = element_text(size = 10), # Mengubah ukuran font label sumbu
legend.text = element_text(size = 8) # Mengubah ukuran font legenda
)## Warning in par(old_par): graphical parameter "cin" cannot be set
## Warning in par(old_par): graphical parameter "cra" cannot be set
## Warning in par(old_par): graphical parameter "csi" cannot be set
## Warning in par(old_par): graphical parameter "cxy" cannot be set
## Warning in par(old_par): graphical parameter "din" cannot be set
## Warning in par(old_par): graphical parameter "page" cannot be set
data_cluster <- data
data_cluster$cluster <- km.res$cluster #Ambil Angka Cluster
data_cluster <- cbind(datak[,1],data_cluster) #Ambil Provinsi sama Kode BPS
data_cluster## PROVINSI IK IA IP Cluster cluster
## 1 ACEH 80.17 69.34 66.25 3 3
## 2 SUMATERA UTARA 80.19 75.02 64.44 3 3
## 3 SUMATERA BARAT 87.53 84.23 71.84 2 2
## 4 RIAU 38.78 86.50 69.81 4 4
## 5 JAMBI 58.48 82.00 67.04 4 4
## 6 SUMATERA SELATAN 81.57 74.11 61.29 3 3
## 7 BENGKULU 69.92 75.18 61.53 3 3
## 8 LAMPUNG 94.94 79.62 67.72 2 2
## 9 KEPULAUAN BANGKA BELITUNG 36.38 92.47 77.54 4 4
## 10 KEPULAUAN RIAU 5.83 88.90 72.53 1 1
## 11 DKI JAKARTA 0.00 83.22 81.12 1 1
## 12 JAWA BARAT 84.59 79.11 73.71 2 2
## 13 JAWA TENGAH 88.88 81.47 80.69 2 2
## 14 DAERAH LSTIMEWA YOGYAKARTA 84.20 79.66 79.97 2 2
## 15 JAWA TIMUR 90.64 79.42 74.27 2 2
## 16 BANTEN 82.81 83.99 62.79 3 3
## 17 BALI 79.08 94.00 82.08 2 2
## 18 NUSA TENGGARA BARAT 92.85 76.24 67.81 2 2
## 19 NUSA TENGGARA TIMUR 86.25 61.62 60.55 3 3
## 20 KALIMANTAN BARAT 76.21 86.33 56.29 3 3
## 21 KALIMANTAN TENGAH 49.87 91.07 67.55 4 4
## 22 KALIMANTAN SELATAN 88.69 90.68 69.01 2 2
## 23 KALIMANTAN TIMUR 52.33 90.27 79.43 4 4
## 24 KALIMANTAN UTARA 43.50 89.99 72.21 4 4
## 25 SULAWESI UTARA 70.37 81.42 71.42 3 3
## 26 SULAWESI TENGAH 77.98 80.39 70.88 3 3
## 27 SULAWESI SELATAN 94.71 83.52 71.22 2 2
## 28 SULAWESI TENGGARA 71.80 78.24 74.38 3 3
## 29 GORONTALO 94.25 79.76 71.91 2 2
## 30 SULAWESI BARAT 76.11 83.45 65.42 3 3
## 31 MALUKU 45.87 68.57 62.27 4 4
## 32 MALUKU UTARA 14.50 88.12 60.61 1 1
## 33 PAPUA BARAT 11.81 62.11 56.17 1 1
## 34 PAPUA 17.68 49.89 42.34 1 1
## 'data.frame': 34 obs. of 6 variables:
## $ PROVINSI: chr "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
## $ IK : num 80.2 80.2 87.5 38.8 58.5 ...
## $ IA : num 69.3 75 84.2 86.5 82 ...
## $ IP : num 66.2 64.4 71.8 69.8 67 ...
## $ Cluster : int 3 3 2 4 4 3 3 2 4 1 ...
## $ cluster : int 3 3 2 4 4 3 3 2 4 1 ...
library(spdep)
library(sf)
map_indonesia <- st_read("C:/Users/Deden Ahmad Rabani/OneDrive - apps.ipb.ac.id/Semester 4/Gammaphics/Batas Provinsi/BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL.shp")## Reading layer `BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL' from data source
## `C:\Users\Deden Ahmad Rabani\OneDrive - apps.ipb.ac.id\Semester 4\Gammaphics\Batas Provinsi\BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 34 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XYZ
## Bounding box: xmin: 95.01079 ymin: -11.00762 xmax: 141.0194 ymax: 6.07693
## z_range: zmin: 2.65e-05 zmax: 2.65e-05
## Geodetic CRS: WGS 84
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Data_Indonesia <- map_indonesia %>%
inner_join(data_cluster, by = c("PROVINSI" = "PROVINSI"))
Data_Indonesia## Simple feature collection with 33 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XYZ
## Bounding box: xmin: 95.01079 ymin: -11.00762 xmax: 141.0194 ymax: 6.07693
## z_range: zmin: 2.65e-05 zmax: 2.65e-05
## Geodetic CRS: WGS 84
## First 10 features:
## OBJECTID PROVINSI Shape_Leng Shape_Area IK IA IP Cluster cluster
## 1 1 ACEH 27.455786 4.62543602 80.17 69.34 66.25 3 3
## 2 2 BALI 6.026646 0.45871702 79.08 94.00 82.08 2 2
## 3 3 BANTEN 9.282228 0.76491115 82.81 83.99 62.79 3 3
## 4 4 BENGKULU 11.706367 1.63012892 69.92 75.18 61.53 3 3
## 5 6 DKI JAKARTA 3.123689 0.05342567 0.00 83.22 81.12 1 1
## 6 7 GORONTALO 11.013237 0.97760000 94.25 79.76 71.91 2 2
## 7 8 JAMBI 11.835072 3.97771124 58.48 82.00 67.04 4 4
## 8 9 JAWA BARAT 11.614950 3.03278484 84.59 79.11 73.71 2 2
## 9 10 JAWA TENGAH 15.456349 2.81983779 88.88 81.47 80.69 2 2
## 10 11 JAWA TIMUR 33.571284 3.93831998 90.64 79.42 74.27 2 2
## geometry
## 1 MULTIPOLYGON Z (((97.39178 ...
## 2 MULTIPOLYGON Z (((115.1251 ...
## 3 MULTIPOLYGON Z (((105.5498 ...
## 4 MULTIPOLYGON Z (((102.3862 ...
## 5 MULTIPOLYGON Z (((106.8768 ...
## 6 MULTIPOLYGON Z (((121.4254 ...
## 7 MULTIPOLYGON Z (((104.4071 ...
## 8 MULTIPOLYGON Z (((108.685 -...
## 9 MULTIPOLYGON Z (((108.8835 ...
## 10 MULTIPOLYGON Z (((114.2038 ...
cluster_full <- ggplot() +
geom_sf(data = Data_Indonesia, aes(fill = factor(`cluster`))) +
geom_sf(data = Data_Indonesia, fill = NA, color = "black") + # Tambahkan baris ini
scale_fill_manual(values = c("#CC415E", "#E9CE2C", "#66D7D1", "#2F41DD"), name = "Keterangan") +
theme_minimal() +
theme(
legend.text = element_text(size = 10),
legend.title = element_text(size = 10, face = "bold"),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
panel.background = element_blank(),
panel.grid = element_blank()
) +
scale_x_continuous(labels = function(x) paste0(x, "°")) +
scale_y_continuous(labels = function(y) paste0(y, "°"))
cluster_full