Load Library
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)
library(pROC)
library(recipes)
library(themis)
set.seed(123)
Import Data
kemiskinan <- read_excel("D:/Users/user/Downloads/uas_datmin.xlsx")
cat("====== DIMENSI DATA ======\n")
## ====== DIMENSI DATA ======
## [1] 514 13
cat("\n====== STRUKTUR DATA ======\n")
##
## ====== STRUKTUR DATA ======
## tibble [514 × 13] (S3: tbl_df/tbl/data.frame)
## $ Provinsi : chr [1:514] "ACEH" "ACEH" "ACEH" "ACEH" ...
## $ Kab/Kota : chr [1:514] "Simeulue" "Aceh Singkil" "Aceh Selatan" "Aceh Tenggara" ...
## $ Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen) : num [1:514] 19 20.4 13.2 13.4 14.4 ...
## $ Rata-rata Lama Sekolah Penduduk 15+ (Tahun) : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
## $ Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun) : num [1:514] 7148 8776 8180 8030 8577 ...
## $ Indeks Pembangunan Manusia : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
## $ Umur Harapan Hidup (Tahun) : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
## $ Persentase rumah tangga yang memiliki akses terhadap sanitasi layak : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
## $ Persentase rumah tangga yang memiliki akses terhadap air minum layak: num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
## $ Tingkat Pengangguran Terbuka : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
## $ Tingkat Partisipasi Angkatan Kerja : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
## $ PDRB atas Dasar Harga Konstan menurut Pengeluaran (Rupiah) : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
## $ Klasifikasi Kemiskinan : num [1:514] 0 1 0 0 0 0 0 0 0 0 ...
cat("\n====== 6 BARIS PERTAMA ======\n")
##
## ====== 6 BARIS PERTAMA ======
## # A tibble: 6 × 13
## Provinsi `Kab/Kota` Persentase Penduduk Miskin (P0…¹ Rata-rata Lama Sekol…²
## <chr> <chr> <dbl> <dbl>
## 1 ACEH Simeulue 19.0 9.48
## 2 ACEH Aceh Singkil 20.4 8.68
## 3 ACEH Aceh Selatan 13.2 8.88
## 4 ACEH Aceh Tenggara 13.4 9.67
## 5 ACEH Aceh Timur 14.4 8.21
## 6 ACEH Aceh Tengah 15.3 9.86
## # ℹ abbreviated names:
## # ¹​`Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen)`,
## # ²​`Rata-rata Lama Sekolah Penduduk 15+ (Tahun)`
## # ℹ 9 more variables:
## # `Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun)` <dbl>,
## # `Indeks Pembangunan Manusia` <dbl>, `Umur Harapan Hidup (Tahun)` <dbl>,
## # `Persentase rumah tangga yang memiliki akses terhadap sanitasi layak` <dbl>, …
cat("\n====== NAMA KOLOM ======\n")
##
## ====== NAMA KOLOM ======
## [1] "Provinsi"
## [2] "Kab/Kota"
## [3] "Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen)"
## [4] "Rata-rata Lama Sekolah Penduduk 15+ (Tahun)"
## [5] "Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun)"
## [6] "Indeks Pembangunan Manusia"
## [7] "Umur Harapan Hidup (Tahun)"
## [8] "Persentase rumah tangga yang memiliki akses terhadap sanitasi layak"
## [9] "Persentase rumah tangga yang memiliki akses terhadap air minum layak"
## [10] "Tingkat Pengangguran Terbuka"
## [11] "Tingkat Partisipasi Angkatan Kerja"
## [12] "PDRB atas Dasar Harga Konstan menurut Pengeluaran (Rupiah)"
## [13] "Klasifikasi Kemiskinan"
Rename Variabel
names(kemiskinan) <- c(
"provinsi",
"kab_kota",
"persentase_miskin",
"rata_lama_sekolah",
"pengeluaran_perkapita",
"ipm",
"umur_harapan_hidup",
"akses_sanitasi",
"akses_air_minum",
"tpt",
"tpak",
"pdrb",
"klasifikasi_kemiskinan"
)
cat("====== NAMA KOLOM SETELAH RENAME ======\n")
## ====== NAMA KOLOM SETELAH RENAME ======
## [1] "provinsi" "kab_kota" "persentase_miskin"
## [4] "rata_lama_sekolah" "pengeluaran_perkapita" "ipm"
## [7] "umur_harapan_hidup" "akses_sanitasi" "akses_air_minum"
## [10] "tpt" "tpak" "pdrb"
## [13] "klasifikasi_kemiskinan"
Konversi Tipe Data
variabel_numerik <- c(
"persentase_miskin",
"rata_lama_sekolah",
"pengeluaran_perkapita",
"ipm",
"umur_harapan_hidup",
"akses_sanitasi",
"akses_air_minum",
"tpt",
"tpak",
"pdrb"
)
# Ubah koma desimal ke titik lalu konversi ke numeric
kemiskinan <- kemiskinan %>%
mutate(
across(
all_of(variabel_numerik),
~ as.numeric(gsub(",", ".", as.character(.)))
)
)
# Konversi target ke factor berlabel
kemiskinan$klasifikasi_kemiskinan <- factor(
kemiskinan$klasifikasi_kemiskinan,
levels = c(0, 1),
labels = c("Tidak_Miskin", "Miskin")
)
cat("====== STRUKTUR SETELAH KONVERSI ======\n")
## ====== STRUKTUR SETELAH KONVERSI ======
## tibble [514 × 13] (S3: tbl_df/tbl/data.frame)
## $ provinsi : chr [1:514] "ACEH" "ACEH" "ACEH" "ACEH" ...
## $ kab_kota : chr [1:514] "Simeulue" "Aceh Singkil" "Aceh Selatan" "Aceh Tenggara" ...
## $ persentase_miskin : num [1:514] 19 20.4 13.2 13.4 14.4 ...
## $ rata_lama_sekolah : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
## $ pengeluaran_perkapita : num [1:514] 7148 8776 8180 8030 8577 ...
## $ ipm : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
## $ umur_harapan_hidup : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
## $ akses_sanitasi : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
## $ akses_air_minum : num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
## $ tpt : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
## $ tpak : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
## $ pdrb : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
## $ klasifikasi_kemiskinan: Factor w/ 2 levels "Tidak_Miskin",..: 1 2 1 1 1 1 1 1 1 1 ...
cat("\n====== RINGKASAN STATISTIK ======\n")
##
## ====== RINGKASAN STATISTIK ======
## provinsi kab_kota persentase_miskin rata_lama_sekolah
## Length :514 Length :514 Min. : 2.38 Min. : 1.420
## N.unique : 34 N.unique :514 1st Qu.: 7.15 1st Qu.: 7.510
## N.blank : 0 N.blank : 0 Median :10.46 Median : 8.305
## Min.nchar: 4 Min.nchar: 4 Mean :12.27 Mean : 8.437
## Max.nchar: 20 Max.nchar: 26 3rd Qu.:14.89 3rd Qu.: 9.338
## Max. :41.66 Max. :12.830
## pengeluaran_perkapita ipm umur_harapan_hidup akses_sanitasi
## Min. : 3976 Min. :32.84 Min. :55.43 Min. : 0.00
## 1st Qu.: 8574 1st Qu.:66.64 1st Qu.:67.39 1st Qu.:70.22
## Median :10196 Median :69.61 Median :69.97 Median :81.80
## Mean :10325 Mean :69.93 Mean :69.66 Mean :77.20
## 3rd Qu.:11719 3rd Qu.:73.11 3rd Qu.:72.04 3rd Qu.:89.88
## Max. :23888 Max. :87.18 Max. :77.73 Max. :99.97
## akses_air_minum tpt tpak pdrb
## Min. : 0.00 Min. : 0.000 Min. :56.39 Min. : 147485
## 1st Qu.: 79.04 1st Qu.: 3.180 1st Qu.:65.07 1st Qu.: 3654292
## Median : 89.80 Median : 4.565 Median :68.95 Median : 8814926
## Mean : 85.14 Mean : 5.059 Mean :69.46 Mean : 21964077
## 3rd Qu.: 96.40 3rd Qu.: 6.530 3rd Qu.:72.34 3rd Qu.: 19735101
## Max. :100.00 Max. :13.370 Max. :97.93 Max. :460081046
## klasifikasi_kemiskinan
## Tidak_Miskin:452
## Miskin : 62
##
##
##
##
Exploratory Data
Analysis (EDA)
Distribusi Kelas
Target
tabel_target <- table(kemiskinan$klasifikasi_kemiskinan)
print(tabel_target)
##
## Tidak_Miskin Miskin
## 452 62
## Proporsi:
print(round(prop.table(tabel_target) * 100, 2))
##
## Tidak_Miskin Miskin
## 87.94 12.06
ggplot(kemiskinan, aes(x = klasifikasi_kemiskinan, fill = klasifikasi_kemiskinan)) +
geom_bar() +
geom_text(
stat = "count",
aes(label = after_stat(count)),
vjust = -0.4,
size = 4.5
) +
scale_fill_manual(values = c("Tidak_Miskin" = "#2196F3", "Miskin" = "#F44336")) +
labs(
title = "Distribusi Kelas Target",
subtitle = "Klasifikasi Kemiskinan Kabupaten/Kota",
x = "Klasifikasi",
y = "Jumlah",
fill = "Kategori"
) +
theme_minimal(base_size = 13)

Distribusi per
Provinsi
distribusi_provinsi <- kemiskinan %>%
count(provinsi, klasifikasi_kemiskinan) %>%
group_by(provinsi) %>%
mutate(pct = round(n / sum(n) * 100, 1))
cat("====== 10 PROVINSI DENGAN PROPORSI MISKIN TERTINGGI ======\n")
## ====== 10 PROVINSI DENGAN PROPORSI MISKIN TERTINGGI ======
distribusi_provinsi %>%
filter(klasifikasi_kemiskinan == "Miskin") %>%
arrange(desc(pct)) %>%
head(10) %>%
print()
## # A tibble: 10 × 4
## # Groups: provinsi [10]
## provinsi klasifikasi_kemiskinan n pct
## <chr> <fct> <int> <dbl>
## 1 PAPUA Miskin 23 79.3
## 2 PAPUA BARAT Miskin 9 69.2
## 3 MALUKU Miskin 7 63.6
## 4 NUSA TENGGARA TIMUR Miskin 14 63.6
## 5 NUSA TENGGARA BARAT Miskin 1 10
## 6 RIAU Miskin 1 8.3
## 7 JAWA TIMUR Miskin 3 7.9
## 8 SUMATERA UTARA Miskin 2 6.1
## 9 SUMATERA SELATAN Miskin 1 5.9
## 10 ACEH Miskin 1 4.3
Boxplot Variabel
Numerik berdasarkan Kelas
kemiskinan_long <- kemiskinan %>%
select(all_of(variabel_numerik), klasifikasi_kemiskinan) %>%
pivot_longer(
cols = all_of(variabel_numerik),
names_to = "variabel",
values_to = "nilai"
)
ggplot(kemiskinan_long, aes(x = klasifikasi_kemiskinan, y = nilai, fill = klasifikasi_kemiskinan)) +
geom_boxplot(outlier.size = 1, alpha = 0.7) +
scale_fill_manual(values = c("Tidak_Miskin" = "#2196F3", "Miskin" = "#F44336")) +
facet_wrap(~ variabel, scales = "free_y", ncol = 3) +
labs(
title = "Distribusi Variabel Prediktor berdasarkan Kelas",
x = NULL,
y = NULL,
fill = "Kategori"
) +
theme_minimal(base_size = 11) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))

Matriks Korelasi
matriks_korelasi <- cor(
kemiskinan %>% select(all_of(variabel_numerik)),
use = "complete.obs"
)
cat("====== MATRIKS KORELASI ======\n")
## ====== MATRIKS KORELASI ======
print(round(matriks_korelasi, 2))
## persentase_miskin rata_lama_sekolah pengeluaran_perkapita
## persentase_miskin 1.00 -0.54 -0.64
## rata_lama_sekolah -0.54 1.00 0.67
## pengeluaran_perkapita -0.64 0.67 1.00
## ipm -0.71 0.87 0.87
## umur_harapan_hidup -0.54 0.42 0.57
## akses_sanitasi -0.56 0.61 0.57
## akses_air_minum -0.37 0.46 0.45
## tpt -0.40 0.52 0.55
## tpak 0.46 -0.50 -0.40
## pdrb -0.24 0.27 0.51
## ipm umur_harapan_hidup akses_sanitasi akses_air_minum
## persentase_miskin -0.71 -0.54 -0.56 -0.37
## rata_lama_sekolah 0.87 0.42 0.61 0.46
## pengeluaran_perkapita 0.87 0.57 0.57 0.45
## ipm 1.00 0.71 0.70 0.55
## umur_harapan_hidup 0.71 1.00 0.46 0.41
## akses_sanitasi 0.70 0.46 1.00 0.54
## akses_air_minum 0.55 0.41 0.54 1.00
## tpt 0.58 0.42 0.31 0.41
## tpak -0.51 -0.23 -0.44 -0.40
## pdrb 0.37 0.32 0.19 0.23
## tpt tpak pdrb
## persentase_miskin -0.40 0.46 -0.24
## rata_lama_sekolah 0.52 -0.50 0.27
## pengeluaran_perkapita 0.55 -0.40 0.51
## ipm 0.58 -0.51 0.37
## umur_harapan_hidup 0.42 -0.23 0.32
## akses_sanitasi 0.31 -0.44 0.19
## akses_air_minum 0.41 -0.40 0.23
## tpt 1.00 -0.58 0.39
## tpak -0.58 1.00 -0.21
## pdrb 0.39 -0.21 1.00
Penanganan Missing
Value
mv <- sapply(kemiskinan, function(x) sum(is.na(x)))
print(mv)
## provinsi kab_kota persentase_miskin
## 0 0 0
## rata_lama_sekolah pengeluaran_perkapita ipm
## 0 0 0
## umur_harapan_hidup akses_sanitasi akses_air_minum
## 0 0 0
## tpt tpak pdrb
## 0 0 0
## klasifikasi_kemiskinan
## 0
cat("Total missing value:", sum(mv), "\n")
## Total missing value: 0
kemiskinan_clean <- kemiskinan %>% drop_na()
cat("\nJumlah baris sebelum drop_na :", nrow(kemiskinan), "\n")
##
## Jumlah baris sebelum drop_na : 514
cat("Jumlah baris setelah drop_na :", nrow(kemiskinan_clean), "\n")
## Jumlah baris setelah drop_na : 514
cat("Baris dihapus :", nrow(kemiskinan) - nrow(kemiskinan_clean), "\n")
## Baris dihapus : 0
Deteksi &
Penanganan Outlier (IQR Method)
outlier_summary <- kemiskinan_clean %>%
select(all_of(variabel_numerik)) %>%
summarise(
across(
everything(),
list(
Q1 = ~ quantile(., 0.25, na.rm = TRUE),
Q3 = ~ quantile(., 0.75, na.rm = TRUE),
IQR = ~ IQR(., na.rm = TRUE),
n_out = ~ sum(
. < quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE) |
. > quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE),
na.rm = TRUE
)
)
)
)
outlier_n <- outlier_summary %>%
select(ends_with("_n_out")) %>%
pivot_longer(everything(), names_to = "variabel", values_to = "n_outlier") %>%
mutate(variabel = gsub("_n_out", "", variabel))
cat("====== RINGKASAN OUTLIER (IQR Method) ======\n")
## ====== RINGKASAN OUTLIER (IQR Method) ======
## # A tibble: 10 × 2
## variabel n_outlier
## <chr> <int>
## 1 persentase_miskin 36
## 2 rata_lama_sekolah 14
## 3 pengeluaran_perkapita 15
## 4 ipm 27
## 5 umur_harapan_hidup 8
## 6 akses_sanitasi 25
## 7 akses_air_minum 23
## 8 tpt 12
## 9 tpak 20
## 10 pdrb 56
Persiapan Data
Model
data_model <- kemiskinan_clean %>%
select(-provinsi, -kab_kota)
cat("====== STRUKTUR DATA MODEL ======\n")
## ====== STRUKTUR DATA MODEL ======
## tibble [514 × 11] (S3: tbl_df/tbl/data.frame)
## $ persentase_miskin : num [1:514] 19 20.4 13.2 13.4 14.4 ...
## $ rata_lama_sekolah : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
## $ pengeluaran_perkapita : num [1:514] 7148 8776 8180 8030 8577 ...
## $ ipm : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
## $ umur_harapan_hidup : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
## $ akses_sanitasi : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
## $ akses_air_minum : num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
## $ tpt : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
## $ tpak : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
## $ pdrb : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
## $ klasifikasi_kemiskinan: Factor w/ 2 levels "Tidak_Miskin",..: 1 2 1 1 1 1 1 1 1 1 ...
Split Data Train - Test
(80:20)
set.seed(123)
index_train <- createDataPartition(
y = data_model$klasifikasi_kemiskinan,
p = 0.80,
list = FALSE
)
data_train <- data_model[index_train, ]
data_test <- data_model[-index_train, ]
cat(">> Data Train:\n")
## >> Data Train:
print(round(prop.table(table(data_train$klasifikasi_kemiskinan)) * 100, 2))
##
## Tidak_Miskin Miskin
## 87.86 12.14
## >> Data Test:
print(round(prop.table(table(data_test$klasifikasi_kemiskinan)) * 100, 2))
##
## Tidak_Miskin Miskin
## 88.24 11.76
cat("\nUkuran data train:", nrow(data_train), "| data test:", nrow(data_test), "\n")
##
## Ukuran data train: 412 | data test: 102
SMOTE
recipe_smote <- recipe(klasifikasi_kemiskinan ~ ., data = data_train) %>%
step_smote(klasifikasi_kemiskinan, over_ratio = 1, seed = 123)
prep_smote <- prep(recipe_smote)
train_smote <- bake(prep_smote, new_data = NULL)
cat("====== DISTRIBUSI SETELAH SMOTE ======\n")
## ====== DISTRIBUSI SETELAH SMOTE ======
print(table(train_smote$klasifikasi_kemiskinan))
##
## Tidak_Miskin Miskin
## 362 362
print(round(prop.table(table(train_smote$klasifikasi_kemiskinan)) * 100, 2))
##
## Tidak_Miskin Miskin
## 50 50
Model Decision Tree
Awal
model_dt <- rpart(
klasifikasi_kemiskinan ~ .,
data = train_smote,
method = "class",
control = rpart.control(cp = 0.01, maxdepth = 5, minsplit = 20, minbucket = 7)
)
cat("====== RINGKASAN MODEL ======\n")
## ====== RINGKASAN MODEL ======
## Call:
## rpart(formula = klasifikasi_kemiskinan ~ ., data = train_smote,
## method = "class", control = rpart.control(cp = 0.01, maxdepth = 5,
## minsplit = 20, minbucket = 7))
## n= 724
##
## CP nsplit rel error xerror xstd
## 1 0.91712707 0 1.00000000 1.07734807 0.03705337
## 2 0.01012891 1 0.08287293 0.10220994 0.01636822
## 3 0.01000000 5 0.03867403 0.09116022 0.01550309
##
## Variable importance
## persentase_miskin ipm pengeluaran_perkapita
## 24 18 15
## pdrb umur_harapan_hidup akses_sanitasi
## 15 15 11
## akses_air_minum
## 1
##
## Node number 1: 724 observations, complexity param=0.9171271
## predicted class=Tidak_Miskin expected loss=0.5 P(node) =1
## class counts: 362 362
## probabilities: 0.500 0.500
## left son=2 (378 obs) right son=3 (346 obs)
## Primary splits:
## persentase_miskin < 18.82007 to the left, improve=305.0822, (0 missing)
## ipm < 65.71 to the right, improve=180.9541, (0 missing)
## umur_harapan_hidup < 68.075 to the right, improve=163.0172, (0 missing)
## pengeluaran_perkapita < 9659.706 to the right, improve=141.4260, (0 missing)
## pdrb < 3345765 to the right, improve=137.5035, (0 missing)
## Surrogate splits:
## ipm < 65.71 to the right, agree=0.872, adj=0.731, (0 split)
## pdrb < 4271089 to the right, agree=0.829, adj=0.642, (0 split)
## pengeluaran_perkapita < 9058.5 to the right, agree=0.826, adj=0.636, (0 split)
## umur_harapan_hidup < 68.075 to the right, agree=0.812, adj=0.607, (0 split)
## akses_sanitasi < 66.30282 to the right, agree=0.735, adj=0.445, (0 split)
##
## Node number 2: 378 observations, complexity param=0.01012891
## predicted class=Tidak_Miskin expected loss=0.06084656 P(node) =0.5220994
## class counts: 355 23
## probabilities: 0.939 0.061
## left son=4 (290 obs) right son=5 (88 obs)
## Primary splits:
## umur_harapan_hidup < 67.64468 to the right, improve=7.251372, (0 missing)
## persentase_miskin < 15.45402 to the left, improve=4.848785, (0 missing)
## akses_air_minum < 81.63133 to the right, improve=4.770171, (0 missing)
## pengeluaran_perkapita < 10207 to the right, improve=3.536388, (0 missing)
## akses_sanitasi < 82.135 to the right, improve=2.889230, (0 missing)
## Surrogate splits:
## ipm < 65.865 to the right, agree=0.831, adj=0.273, (0 split)
## pengeluaran_perkapita < 6677.5 to the right, agree=0.783, adj=0.068, (0 split)
## pdrb < 1664476 to the right, agree=0.783, adj=0.068, (0 split)
## persentase_miskin < 16.315 to the left, agree=0.775, adj=0.034, (0 split)
##
## Node number 3: 346 observations
## predicted class=Miskin expected loss=0.02023121 P(node) =0.4779006
## class counts: 7 339
## probabilities: 0.020 0.980
##
## Node number 4: 290 observations
## predicted class=Tidak_Miskin expected loss=0.006896552 P(node) =0.4005525
## class counts: 288 2
## probabilities: 0.993 0.007
##
## Node number 5: 88 observations, complexity param=0.01012891
## predicted class=Tidak_Miskin expected loss=0.2386364 P(node) =0.121547
## class counts: 67 21
## probabilities: 0.761 0.239
## left son=10 (42 obs) right son=11 (46 obs)
## Primary splits:
## akses_air_minum < 81.63133 to the right, improve=9.151186, (0 missing)
## pdrb < 9661515 to the left, improve=5.962022, (0 missing)
## ipm < 69.64993 to the left, improve=5.150429, (0 missing)
## persentase_miskin < 17.39 to the left, improve=4.802914, (0 missing)
## pengeluaran_perkapita < 10274 to the right, improve=3.758523, (0 missing)
## Surrogate splits:
## akses_sanitasi < 68.27 to the right, agree=0.670, adj=0.310, (0 split)
## pengeluaran_perkapita < 9182.622 to the left, agree=0.648, adj=0.262, (0 split)
## ipm < 67.08 to the left, agree=0.648, adj=0.262, (0 split)
## umur_harapan_hidup < 66.86456 to the left, agree=0.625, adj=0.214, (0 split)
## tpak < 66.1197 to the left, agree=0.625, adj=0.214, (0 split)
##
## Node number 10: 42 observations
## predicted class=Tidak_Miskin expected loss=0 P(node) =0.05801105
## class counts: 42 0
## probabilities: 1.000 0.000
##
## Node number 11: 46 observations, complexity param=0.01012891
## predicted class=Tidak_Miskin expected loss=0.4565217 P(node) =0.06353591
## class counts: 25 21
## probabilities: 0.543 0.457
## left son=22 (33 obs) right son=23 (13 obs)
## Primary splits:
## persentase_miskin < 15.40402 to the left, improve=7.889024, (0 missing)
## pdrb < 9661515 to the left, improve=6.182148, (0 missing)
## pengeluaran_perkapita < 10295 to the right, improve=6.026087, (0 missing)
## akses_sanitasi < 77.88508 to the left, improve=4.610401, (0 missing)
## akses_air_minum < 70.08 to the left, improve=4.522165, (0 missing)
## Surrogate splits:
## akses_air_minum < 78.08788 to the left, agree=0.804, adj=0.308, (0 split)
## umur_harapan_hidup < 67.36145 to the left, agree=0.783, adj=0.231, (0 split)
## pengeluaran_perkapita < 8031 to the right, agree=0.761, adj=0.154, (0 split)
## pdrb < 2781274 to the right, agree=0.761, adj=0.154, (0 split)
## ipm < 63.975 to the right, agree=0.739, adj=0.077, (0 split)
##
## Node number 22: 33 observations, complexity param=0.01012891
## predicted class=Tidak_Miskin expected loss=0.2727273 P(node) =0.04558011
## class counts: 24 9
## probabilities: 0.727 0.273
## left son=44 (24 obs) right son=45 (9 obs)
## Primary splits:
## akses_sanitasi < 77.88508 to the left, improve=6.313131, (0 missing)
## ipm < 69.30573 to the left, improve=5.890909, (0 missing)
## pdrb < 9661515 to the left, improve=5.890909, (0 missing)
## rata_lama_sekolah < 8.395519 to the left, improve=3.208556, (0 missing)
## umur_harapan_hidup < 66.20588 to the left, improve=3.190909, (0 missing)
## Surrogate splits:
## tpt < 2.728599 to the right, agree=0.848, adj=0.444, (0 split)
##
## Node number 23: 13 observations
## predicted class=Miskin expected loss=0.07692308 P(node) =0.0179558
## class counts: 1 12
## probabilities: 0.077 0.923
##
## Node number 44: 24 observations
## predicted class=Tidak_Miskin expected loss=0.08333333 P(node) =0.03314917
## class counts: 22 2
## probabilities: 0.917 0.083
##
## Node number 45: 9 observations
## predicted class=Miskin expected loss=0.2222222 P(node) =0.01243094
## class counts: 2 7
## probabilities: 0.222 0.778
cat("\n====== TABEL CP ======\n")
##
## ====== TABEL CP ======
##
## Classification tree:
## rpart(formula = klasifikasi_kemiskinan ~ ., data = train_smote,
## method = "class", control = rpart.control(cp = 0.01, maxdepth = 5,
## minsplit = 20, minbucket = 7))
##
## Variables actually used in tree construction:
## [1] akses_air_minum akses_sanitasi persentase_miskin umur_harapan_hidup
##
## Root node error: 362/724 = 0.5
##
## n= 724
##
## CP nsplit rel error xerror xstd
## 1 0.917127 0 1.000000 1.07735 0.037053
## 2 0.010129 1 0.082873 0.10221 0.016368
## 3 0.010000 5 0.038674 0.09116 0.015503
Pruning Decision
Tree
cp_optimal <- model_dt$cptable[which.min(model_dt$cptable[, "xerror"]), "CP"]
cat("CP Optimal:", cp_optimal, "\n")
## CP Optimal: 0.01
model_dt_pruned <- prune(model_dt, cp = cp_optimal)
cat("\n====== PERBANDINGAN JUMLAH SPLIT ======\n")
##
## ====== PERBANDINGAN JUMLAH SPLIT ======
cat("Model awal :", model_dt$cptable[nrow(model_dt$cptable), "nsplit"], "split\n")
## Model awal : 5 split
cat("Model pruned :", model_dt_pruned$cptable[nrow(model_dt_pruned$cptable), "nsplit"], "split\n")
## Model pruned : 5 split
Visualisasi Tree
rpart.plot(
model_dt,
type = 2,
extra = 104,
fallen.leaves = TRUE,
box.palette = c("#F44336", "#2196F3"),
shadow.col = "gray70",
main = "Decision Tree Klasifikasi Kemiskinan"
)

rpart.plot(
model_dt_pruned,
type = 2,
extra = 104,
fallen.leaves = TRUE,
box.palette = c("#F44336", "#2196F3"),
shadow.col = "gray70",
main = "Decision Tree - Setelah Pruning"
)

plotcp(model_dt)
title("Grafik Complexity Parameter (CP)")

Grid Search
set.seed(123)
ctrl_grid <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
grid_cp <- expand.grid(cp = seq(0.001, 0.05, by = 0.005))
model_grid <- train(
klasifikasi_kemiskinan ~ .,
data = train_smote,
method = "rpart",
metric = "ROC",
tuneGrid = grid_cp,
trControl = ctrl_grid,
parms = list(split = "gini"),
control = rpart.control(maxdepth = 5, minsplit = 20, minbucket = 7)
)
cat("====== HASIL GRID SEARCH ======\n")
## ====== HASIL GRID SEARCH ======
## CART
##
## 724 samples
## 10 predictor
## 2 classes: 'Tidak_Miskin', 'Miskin'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 580, 579, 579, 579, 579
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.001 0.9685069 0.9392314 0.9641172
## 0.006 0.9661149 0.9392314 0.9613394
## 0.011 0.9486301 0.9501903 0.9364536
## 0.016 0.9446918 0.9612253 0.9281583
## 0.021 0.9446918 0.9612253 0.9281583
## 0.026 0.9446918 0.9612253 0.9281583
## 0.031 0.9446918 0.9612253 0.9281583
## 0.036 0.9446918 0.9612253 0.9281583
## 0.041 0.9446918 0.9612253 0.9281583
## 0.046 0.9446918 0.9612253 0.9281583
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001.
cat("\n====== PARAMETER TERBAIK ======\n")
##
## ====== PARAMETER TERBAIK ======
print(model_grid$bestTune)
## cp
## 1 0.001
cat("\n====== SELURUH HASIL ======\n")
##
## ====== SELURUH HASIL ======
print(model_grid$results)
## cp ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.001 0.9685069 0.9392314 0.9641172 0.01485216 0.02509382 0.02073890
## 2 0.006 0.9661149 0.9392314 0.9613394 0.01905503 0.02509382 0.02030757
## 3 0.011 0.9486301 0.9501903 0.9364536 0.02325840 0.02536243 0.04422831
## 4 0.016 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 5 0.021 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 6 0.026 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 7 0.031 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 8 0.036 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 9 0.041 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054
## 10 0.046 0.9446918 0.9612253 0.9281583 0.01616676 0.02680029 0.03827054

Random Search
set.seed(123)
ctrl_random <- trainControl(
method = "cv",
number = 5,
search = "random",
classProbs = TRUE,
summaryFunction = twoClassSummary
)
model_random <- train(
klasifikasi_kemiskinan ~ .,
data = train_smote,
method = "rpart",
metric = "ROC",
tuneLength = 20,
trControl = ctrl_random,
parms = list(split = "gini"),
control = rpart.control(
maxdepth = sample(3:10, 1),
minsplit = sample(seq(5, 30, 5), 1),
minbucket = sample(seq(2, 10, 2), 1)
)
)
cat("====== HASIL RANDOM SEARCH ======\n")
## ====== HASIL RANDOM SEARCH ======
## CART
##
## 724 samples
## 10 predictor
## 2 classes: 'Tidak_Miskin', 'Miskin'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 580, 579, 579, 579, 579
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.00000000 0.9656007 0.9584475 0.9473744
## 0.01012891 0.9446918 0.9612253 0.9281583
## 0.91712707 0.6724125 0.7889650 0.5558600
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
cat("\n====== PARAMETER TERBAIK ======\n")
##
## ====== PARAMETER TERBAIK ======
print(model_random$bestTune)
## cp
## 1 0

Evaluasi Model
Fungsi Evaluasi
tampilkan_evaluasi <- function(label, pred_kelas, pred_prob, aktual) {
cat("\n", strrep("=", 50), "\n")
cat(" EVALUASI:", label, "\n")
cat(strrep("=", 50), "\n")
cm <- confusionMatrix(data = pred_kelas, reference = aktual, positive = "Miskin")
print(cm)
roc_obj <- roc(as.numeric(aktual == "Miskin"), pred_prob[, "Miskin"], quiet = TRUE)
cat("AUC:", round(auc(roc_obj), 4), "\n")
invisible(list(cm = cm, roc = roc_obj))
}
Model Awal
pred_awal <- predict(model_dt, newdata = data_test, type = "class")
prob_awal <- predict(model_dt, newdata = data_test, type = "prob")
eval_awal <- tampilkan_evaluasi("Model Awal", pred_awal, prob_awal, data_test$klasifikasi_kemiskinan)
##
## ==================================================
## EVALUASI: Model Awal
## ==================================================
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Miskin Miskin
## Tidak_Miskin 82 0
## Miskin 8 12
##
## Accuracy : 0.9216
## 95% CI : (0.8513, 0.9655)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 0.13893
##
## Kappa : 0.7069
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.9111
## Pos Pred Value : 0.6000
## Neg Pred Value : 1.0000
## Prevalence : 0.1176
## Detection Rate : 0.1176
## Detection Prevalence : 0.1961
## Balanced Accuracy : 0.9556
##
## 'Positive' Class : Miskin
##
## AUC: 0.9667
Model Pruned
pred_pruned <- predict(model_dt_pruned, newdata = data_test, type = "class")
prob_pruned <- predict(model_dt_pruned, newdata = data_test, type = "prob")
eval_pruned <- tampilkan_evaluasi("Model Pruned", pred_pruned, prob_pruned, data_test$klasifikasi_kemiskinan)
##
## ==================================================
## EVALUASI: Model Pruned
## ==================================================
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Miskin Miskin
## Tidak_Miskin 82 0
## Miskin 8 12
##
## Accuracy : 0.9216
## 95% CI : (0.8513, 0.9655)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 0.13893
##
## Kappa : 0.7069
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.9111
## Pos Pred Value : 0.6000
## Neg Pred Value : 1.0000
## Prevalence : 0.1176
## Detection Rate : 0.1176
## Detection Prevalence : 0.1961
## Balanced Accuracy : 0.9556
##
## 'Positive' Class : Miskin
##
## AUC: 0.9667
Grid Search
pred_grid <- predict(model_grid, newdata = data_test)
prob_grid <- predict(model_grid, newdata = data_test, type = "prob")
eval_grid <- tampilkan_evaluasi("Model Grid Search", pred_grid, prob_grid, data_test$klasifikasi_kemiskinan)
##
## ==================================================
## EVALUASI: Model Grid Search
## ==================================================
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Miskin Miskin
## Tidak_Miskin 82 0
## Miskin 8 12
##
## Accuracy : 0.9216
## 95% CI : (0.8513, 0.9655)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 0.13893
##
## Kappa : 0.7069
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.9111
## Pos Pred Value : 0.6000
## Neg Pred Value : 1.0000
## Prevalence : 0.1176
## Detection Rate : 0.1176
## Detection Prevalence : 0.1961
## Balanced Accuracy : 0.9556
##
## 'Positive' Class : Miskin
##
## AUC: 0.9667
Random Search
pred_random <- predict(model_random, newdata = data_test)
prob_random <- predict(model_random, newdata = data_test, type = "prob")
eval_random <- tampilkan_evaluasi("Model Random Search", pred_random, prob_random, data_test$klasifikasi_kemiskinan)
##
## ==================================================
## EVALUASI: Model Random Search
## ==================================================
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Miskin Miskin
## Tidak_Miskin 89 0
## Miskin 1 12
##
## Accuracy : 0.9902
## 95% CI : (0.9466, 0.9998)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 4.167e-05
##
## Kappa : 0.9544
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9889
## Pos Pred Value : 0.9231
## Neg Pred Value : 1.0000
## Prevalence : 0.1176
## Detection Rate : 0.1176
## Detection Prevalence : 0.1275
## Balanced Accuracy : 0.9944
##
## 'Positive' Class : Miskin
##
## AUC: 0.9944
Perbandingan
Metrik
tabel_metrik <- data.frame(
Model = c("Sebelum Pruning", "Setelah Pruning", "Grid Search", "Random Search"),
Accuracy = c(
round(eval_awal$cm$overall["Accuracy"] * 100, 2),
round(eval_pruned$cm$overall["Accuracy"] * 100, 2),
round(eval_grid$cm$overall["Accuracy"] * 100, 2),
round(eval_random$cm$overall["Accuracy"] * 100, 2)
),
Sensitivity = c(
round(eval_awal$cm$byClass["Sensitivity"] * 100, 2),
round(eval_pruned$cm$byClass["Sensitivity"] * 100, 2),
round(eval_grid$cm$byClass["Sensitivity"] * 100, 2),
round(eval_random$cm$byClass["Sensitivity"] * 100, 2)
),
Specificity = c(
round(eval_awal$cm$byClass["Specificity"] * 100, 2),
round(eval_pruned$cm$byClass["Specificity"] * 100, 2),
round(eval_grid$cm$byClass["Specificity"] * 100, 2),
round(eval_random$cm$byClass["Specificity"] * 100, 2)
),
Precision = c(
round(eval_awal$cm$byClass["Pos Pred Value"] * 100, 2),
round(eval_pruned$cm$byClass["Pos Pred Value"] * 100, 2),
round(eval_grid$cm$byClass["Pos Pred Value"] * 100, 2),
round(eval_random$cm$byClass["Pos Pred Value"] * 100, 2)
),
F1 = c(
round(eval_awal$cm$byClass["F1"] * 100, 2),
round(eval_pruned$cm$byClass["F1"] * 100, 2),
round(eval_grid$cm$byClass["F1"] * 100, 2),
round(eval_random$cm$byClass["F1"] * 100, 2)
),
AUC = c(
round(auc(eval_awal$roc), 4),
round(auc(eval_pruned$roc), 4),
round(auc(eval_grid$roc), 4),
round(auc(eval_random$roc), 4)
)
)
print(tabel_metrik)
## Model Accuracy Sensitivity Specificity Precision F1 AUC
## 1 Sebelum Pruning 92.16 100 91.11 60.00 75 0.9667
## 2 Setelah Pruning 92.16 100 91.11 60.00 75 0.9667
## 3 Grid Search 92.16 100 91.11 60.00 75 0.9667
## 4 Random Search 99.02 100 98.89 92.31 96 0.9944
tabel_metrik_long <- tabel_metrik %>%
pivot_longer(cols = -Model, names_to = "Metrik", values_to = "Nilai")
ggplot(tabel_metrik_long, aes(x = Metrik, y = Nilai, fill = Model)) +
geom_bar(
stat = "identity",
position = position_dodge(width = 0.8),
width = 0.68,
color = "white"
) +
geom_text(
aes(label = round(Nilai, 2)),
position = position_dodge(width = 0.8),
hjust = -0.2,
size = 3
) +
coord_flip() +
scale_fill_manual(values = c(
"Grid Search" = "#F8766D",
"Random Search" = "#7CAE00",
"Sebelum Pruning" = "#00BFC4",
"Setelah Pruning" = "#C77CFF"
)) +
scale_y_continuous(limits = c(0, 120), expand = expansion(mult = c(0, 0.05))) +
labs(
title = "Perbandingan Metrik Evaluasi",
subtitle = "Decision Tree: Grid Search vs Random Search vs Pruning",
x = NULL,
y = "Nilai (%)",
fill = "Model"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
plot.subtitle = element_text(size = 11, hjust = 0.5),
axis.text.y = element_text(size = 11, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9),
panel.grid.minor = element_blank()
)

Variable
Importance
importance_df <- data.frame(
variabel = names(model_dt_pruned$variable.importance),
importance = model_dt_pruned$variable.importance,
row.names = NULL
) %>%
arrange(desc(importance)) %>%
mutate(
pct = round(importance / sum(importance) * 100, 2),
pct_kum = cumsum(pct)
)
print(importance_df)
## variabel importance pct pct_kum
## 1 persentase_miskin 313.218409 24.20 24.20
## 2 ipm 228.061555 17.62 41.82
## 3 pengeluaran_perkapita 198.087735 15.30 57.12
## 4 pdrb 197.454477 15.26 72.38
## 5 umur_harapan_hidup 196.198368 15.16 87.54
## 6 akses_sanitasi 144.933663 11.20 98.74
## 7 akses_air_minum 11.578578 0.89 99.63
## 8 tpt 2.805836 0.22 99.85
## 9 tpak 1.960968 0.15 100.00
ggplot(importance_df, aes(x = reorder(variabel, importance), y = importance, fill = importance)) +
geom_col(show.legend = FALSE, width = 0.7) +
geom_text(aes(label = paste0(pct, "%")), hjust = -0.1, size = 3.5) +
coord_flip() +
labs(
title = "Variable Importance",
x = "Variabel",
y = "Importance Score"
) +
theme_minimal(base_size = 12)

Prediksi Data
Baru
data_baru <- data.frame(
persentase_miskin = c(25.5, 8.0, 40.0),
rata_lama_sekolah = c(6.5, 10.2, 4.5),
pengeluaran_perkapita = c(6500, 14000, 5000),
ipm = c(58.0, 78.0, 48.0),
umur_harapan_hidup = c(62.0, 72.0, 58.0),
akses_sanitasi = c(50.0, 95.0, 10.0),
akses_air_minum = c(70.0, 98.0, 40.0),
tpt = c(3.0, 9.0, 1.5),
tpak = c(75.0, 63.0, 88.0),
pdrb = c(2000000, 50000000, 800000)
)
prediksi_baru <- predict(model_dt_pruned, newdata = data_baru, type = "class")
prob_baru <- predict(model_dt_pruned, newdata = data_baru, type = "prob")
data_baru$prediksi <- prediksi_baru
data_baru$prob_miskin <- round(prob_baru[, "Miskin"] * 100, 2)
cat("====== HASIL PREDIKSI DATA BARU ======\n")
## ====== HASIL PREDIKSI DATA BARU ======
print(data_baru %>% select(persentase_miskin, ipm, prediksi, prob_miskin))
## persentase_miskin ipm prediksi prob_miskin
## 1 25.5 58 Miskin 97.98
## 2 8.0 78 Tidak_Miskin 0.69
## 3 40.0 48 Miskin 97.98
Interpretasi
Hasil
cat(strrep("=", 60), "\n")
## ============================================================
cat(" INTERPRETASI HASIL ANALISIS\n")
## INTERPRETASI HASIL ANALISIS
cat(strrep("=", 60), "\n")
## ============================================================
cat("
METODE YANG DIGUNAKAN
* Decision Tree
* SMOTE
* Pruning
* Grid Search
* Random Search
* Cross Validation
KESIMPULAN
* Model mampu melakukan klasifikasi kemiskinan
dengan performa yang sangat baik.
* Variabel paling penting:
1. Persentase penduduk miskin
2. IPM
3. Pengeluaran per kapita
* AUC tinggi menunjukkan model memiliki
kemampuan diskriminasi yang sangat baik.
")
##
## METODE YANG DIGUNAKAN
## * Decision Tree
## * SMOTE
## * Pruning
## * Grid Search
## * Random Search
## * Cross Validation
##
## KESIMPULAN
## * Model mampu melakukan klasifikasi kemiskinan
## dengan performa yang sangat baik.
## * Variabel paling penting:
## 1. Persentase penduduk miskin
## 2. IPM
## 3. Pengeluaran per kapita
## * AUC tinggi menunjukkan model memiliki
## kemampuan diskriminasi yang sangat baik.
cat(strrep("=", 60), "\n")
## ============================================================
cat(" ANALISIS SELESAI\n")
## ANALISIS SELESAI
cat(strrep("=", 60), "\n")
## ============================================================