1 Load Library

library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)
library(pROC)
library(recipes)
library(themis)

set.seed(123)

2 Import Data

kemiskinan <- read_excel("D:/Users/user/Downloads/uas_datmin.xlsx")
cat("====== DIMENSI DATA ======\n")
## ====== DIMENSI DATA ======
dim(kemiskinan)
## [1] 514  13
cat("\n====== STRUKTUR DATA ======\n")
## 
## ====== STRUKTUR DATA ======
str(kemiskinan)
## tibble [514 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi                                                            : chr [1:514] "ACEH" "ACEH" "ACEH" "ACEH" ...
##  $ Kab/Kota                                                            : chr [1:514] "Simeulue" "Aceh Singkil" "Aceh Selatan" "Aceh Tenggara" ...
##  $ Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen)     : num [1:514] 19 20.4 13.2 13.4 14.4 ...
##  $ Rata-rata Lama Sekolah Penduduk 15+ (Tahun)                         : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
##  $ Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun)        : num [1:514] 7148 8776 8180 8030 8577 ...
##  $ Indeks Pembangunan Manusia                                          : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
##  $ Umur Harapan Hidup (Tahun)                                          : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
##  $ Persentase rumah tangga yang memiliki akses terhadap sanitasi layak : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
##  $ Persentase rumah tangga yang memiliki akses terhadap air minum layak: num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
##  $ Tingkat Pengangguran Terbuka                                        : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
##  $ Tingkat Partisipasi Angkatan Kerja                                  : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
##  $ PDRB atas Dasar Harga Konstan menurut Pengeluaran (Rupiah)          : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
##  $ Klasifikasi Kemiskinan                                              : num [1:514] 0 1 0 0 0 0 0 0 0 0 ...
cat("\n====== 6 BARIS PERTAMA ======\n")
## 
## ====== 6 BARIS PERTAMA ======
head(kemiskinan)
## # A tibble: 6 × 13
##   Provinsi `Kab/Kota`    Persentase Penduduk Miskin (P0…¹ Rata-rata Lama Sekol…²
##   <chr>    <chr>                                    <dbl>                  <dbl>
## 1 ACEH     Simeulue                                  19.0                   9.48
## 2 ACEH     Aceh Singkil                              20.4                   8.68
## 3 ACEH     Aceh Selatan                              13.2                   8.88
## 4 ACEH     Aceh Tenggara                             13.4                   9.67
## 5 ACEH     Aceh Timur                                14.4                   8.21
## 6 ACEH     Aceh Tengah                               15.3                   9.86
## # ℹ abbreviated names:
## #   ¹​`Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen)`,
## #   ²​`Rata-rata Lama Sekolah Penduduk 15+ (Tahun)`
## # ℹ 9 more variables:
## #   `Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun)` <dbl>,
## #   `Indeks Pembangunan Manusia` <dbl>, `Umur Harapan Hidup (Tahun)` <dbl>,
## #   `Persentase rumah tangga yang memiliki akses terhadap sanitasi layak` <dbl>, …
cat("\n====== NAMA KOLOM ======\n")
## 
## ====== NAMA KOLOM ======
names(kemiskinan)
##  [1] "Provinsi"                                                            
##  [2] "Kab/Kota"                                                            
##  [3] "Persentase Penduduk Miskin (P0) Menurut Kabupaten/Kota (Persen)"     
##  [4] "Rata-rata Lama Sekolah Penduduk 15+ (Tahun)"                         
##  [5] "Pengeluaran per Kapita Disesuaikan (Ribu Rupiah/Orang/Tahun)"        
##  [6] "Indeks Pembangunan Manusia"                                          
##  [7] "Umur Harapan Hidup (Tahun)"                                          
##  [8] "Persentase rumah tangga yang memiliki akses terhadap sanitasi layak" 
##  [9] "Persentase rumah tangga yang memiliki akses terhadap air minum layak"
## [10] "Tingkat Pengangguran Terbuka"                                        
## [11] "Tingkat Partisipasi Angkatan Kerja"                                  
## [12] "PDRB atas Dasar Harga Konstan menurut Pengeluaran (Rupiah)"          
## [13] "Klasifikasi Kemiskinan"

3 Rename Variabel

names(kemiskinan) <- c(
  "provinsi",
  "kab_kota",
  "persentase_miskin",
  "rata_lama_sekolah",
  "pengeluaran_perkapita",
  "ipm",
  "umur_harapan_hidup",
  "akses_sanitasi",
  "akses_air_minum",
  "tpt",
  "tpak",
  "pdrb",
  "klasifikasi_kemiskinan"
)

cat("====== NAMA KOLOM SETELAH RENAME ======\n")
## ====== NAMA KOLOM SETELAH RENAME ======
names(kemiskinan)
##  [1] "provinsi"               "kab_kota"               "persentase_miskin"     
##  [4] "rata_lama_sekolah"      "pengeluaran_perkapita"  "ipm"                   
##  [7] "umur_harapan_hidup"     "akses_sanitasi"         "akses_air_minum"       
## [10] "tpt"                    "tpak"                   "pdrb"                  
## [13] "klasifikasi_kemiskinan"

4 Konversi Tipe Data

variabel_numerik <- c(
  "persentase_miskin",
  "rata_lama_sekolah",
  "pengeluaran_perkapita",
  "ipm",
  "umur_harapan_hidup",
  "akses_sanitasi",
  "akses_air_minum",
  "tpt",
  "tpak",
  "pdrb"
)

# Ubah koma desimal ke titik lalu konversi ke numeric
kemiskinan <- kemiskinan %>%
  mutate(
    across(
      all_of(variabel_numerik),
      ~ as.numeric(gsub(",", ".", as.character(.)))
    )
  )

# Konversi target ke factor berlabel
kemiskinan$klasifikasi_kemiskinan <- factor(
  kemiskinan$klasifikasi_kemiskinan,
  levels = c(0, 1),
  labels = c("Tidak_Miskin", "Miskin")
)

cat("====== STRUKTUR SETELAH KONVERSI ======\n")
## ====== STRUKTUR SETELAH KONVERSI ======
str(kemiskinan)
## tibble [514 × 13] (S3: tbl_df/tbl/data.frame)
##  $ provinsi              : chr [1:514] "ACEH" "ACEH" "ACEH" "ACEH" ...
##  $ kab_kota              : chr [1:514] "Simeulue" "Aceh Singkil" "Aceh Selatan" "Aceh Tenggara" ...
##  $ persentase_miskin     : num [1:514] 19 20.4 13.2 13.4 14.4 ...
##  $ rata_lama_sekolah     : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
##  $ pengeluaran_perkapita : num [1:514] 7148 8776 8180 8030 8577 ...
##  $ ipm                   : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
##  $ umur_harapan_hidup    : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
##  $ akses_sanitasi        : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
##  $ akses_air_minum       : num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
##  $ tpt                   : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
##  $ tpak                  : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
##  $ pdrb                  : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
##  $ klasifikasi_kemiskinan: Factor w/ 2 levels "Tidak_Miskin",..: 1 2 1 1 1 1 1 1 1 1 ...
cat("\n====== RINGKASAN STATISTIK ======\n")
## 
## ====== RINGKASAN STATISTIK ======
summary(kemiskinan)
##       provinsi        kab_kota   persentase_miskin rata_lama_sekolah
##  Length   :514   Length   :514   Min.   : 2.38     Min.   : 1.420   
##  N.unique : 34   N.unique :514   1st Qu.: 7.15     1st Qu.: 7.510   
##  N.blank  :  0   N.blank  :  0   Median :10.46     Median : 8.305   
##  Min.nchar:  4   Min.nchar:  4   Mean   :12.27     Mean   : 8.437   
##  Max.nchar: 20   Max.nchar: 26   3rd Qu.:14.89     3rd Qu.: 9.338   
##                                  Max.   :41.66     Max.   :12.830   
##  pengeluaran_perkapita      ipm        umur_harapan_hidup akses_sanitasi 
##  Min.   : 3976         Min.   :32.84   Min.   :55.43      Min.   : 0.00  
##  1st Qu.: 8574         1st Qu.:66.64   1st Qu.:67.39      1st Qu.:70.22  
##  Median :10196         Median :69.61   Median :69.97      Median :81.80  
##  Mean   :10325         Mean   :69.93   Mean   :69.66      Mean   :77.20  
##  3rd Qu.:11719         3rd Qu.:73.11   3rd Qu.:72.04      3rd Qu.:89.88  
##  Max.   :23888         Max.   :87.18   Max.   :77.73      Max.   :99.97  
##  akses_air_minum       tpt              tpak            pdrb          
##  Min.   :  0.00   Min.   : 0.000   Min.   :56.39   Min.   :   147485  
##  1st Qu.: 79.04   1st Qu.: 3.180   1st Qu.:65.07   1st Qu.:  3654292  
##  Median : 89.80   Median : 4.565   Median :68.95   Median :  8814926  
##  Mean   : 85.14   Mean   : 5.059   Mean   :69.46   Mean   : 21964077  
##  3rd Qu.: 96.40   3rd Qu.: 6.530   3rd Qu.:72.34   3rd Qu.: 19735101  
##  Max.   :100.00   Max.   :13.370   Max.   :97.93   Max.   :460081046  
##   klasifikasi_kemiskinan
##  Tidak_Miskin:452       
##  Miskin      : 62       
##                         
##                         
##                         
## 

5 Exploratory Data Analysis (EDA)

5.1 Distribusi Kelas Target

tabel_target <- table(kemiskinan$klasifikasi_kemiskinan)
print(tabel_target)
## 
## Tidak_Miskin       Miskin 
##          452           62
cat("Proporsi:\n")
## Proporsi:
print(round(prop.table(tabel_target) * 100, 2))
## 
## Tidak_Miskin       Miskin 
##        87.94        12.06
ggplot(kemiskinan, aes(x = klasifikasi_kemiskinan, fill = klasifikasi_kemiskinan)) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.4,
    size = 4.5
  ) +
  scale_fill_manual(values = c("Tidak_Miskin" = "#2196F3", "Miskin" = "#F44336")) +
  labs(
    title    = "Distribusi Kelas Target",
    subtitle = "Klasifikasi Kemiskinan Kabupaten/Kota",
    x        = "Klasifikasi",
    y        = "Jumlah",
    fill     = "Kategori"
  ) +
  theme_minimal(base_size = 13)

5.2 Distribusi per Provinsi

distribusi_provinsi <- kemiskinan %>%
  count(provinsi, klasifikasi_kemiskinan) %>%
  group_by(provinsi) %>%
  mutate(pct = round(n / sum(n) * 100, 1))

cat("====== 10 PROVINSI DENGAN PROPORSI MISKIN TERTINGGI ======\n")
## ====== 10 PROVINSI DENGAN PROPORSI MISKIN TERTINGGI ======
distribusi_provinsi %>%
  filter(klasifikasi_kemiskinan == "Miskin") %>%
  arrange(desc(pct)) %>%
  head(10) %>%
  print()
## # A tibble: 10 × 4
## # Groups:   provinsi [10]
##    provinsi            klasifikasi_kemiskinan     n   pct
##    <chr>               <fct>                  <int> <dbl>
##  1 PAPUA               Miskin                    23  79.3
##  2 PAPUA BARAT         Miskin                     9  69.2
##  3 MALUKU              Miskin                     7  63.6
##  4 NUSA TENGGARA TIMUR Miskin                    14  63.6
##  5 NUSA TENGGARA BARAT Miskin                     1  10  
##  6 RIAU                Miskin                     1   8.3
##  7 JAWA TIMUR          Miskin                     3   7.9
##  8 SUMATERA UTARA      Miskin                     2   6.1
##  9 SUMATERA SELATAN    Miskin                     1   5.9
## 10 ACEH                Miskin                     1   4.3

5.3 Boxplot Variabel Numerik berdasarkan Kelas

kemiskinan_long <- kemiskinan %>%
  select(all_of(variabel_numerik), klasifikasi_kemiskinan) %>%
  pivot_longer(
    cols      = all_of(variabel_numerik),
    names_to  = "variabel",
    values_to = "nilai"
  )

ggplot(kemiskinan_long, aes(x = klasifikasi_kemiskinan, y = nilai, fill = klasifikasi_kemiskinan)) +
  geom_boxplot(outlier.size = 1, alpha = 0.7) +
  scale_fill_manual(values = c("Tidak_Miskin" = "#2196F3", "Miskin" = "#F44336")) +
  facet_wrap(~ variabel, scales = "free_y", ncol = 3) +
  labs(
    title = "Distribusi Variabel Prediktor berdasarkan Kelas",
    x     = NULL,
    y     = NULL,
    fill  = "Kategori"
  ) +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

5.4 Matriks Korelasi

matriks_korelasi <- cor(
  kemiskinan %>% select(all_of(variabel_numerik)),
  use = "complete.obs"
)

cat("====== MATRIKS KORELASI ======\n")
## ====== MATRIKS KORELASI ======
print(round(matriks_korelasi, 2))
##                       persentase_miskin rata_lama_sekolah pengeluaran_perkapita
## persentase_miskin                  1.00             -0.54                 -0.64
## rata_lama_sekolah                 -0.54              1.00                  0.67
## pengeluaran_perkapita             -0.64              0.67                  1.00
## ipm                               -0.71              0.87                  0.87
## umur_harapan_hidup                -0.54              0.42                  0.57
## akses_sanitasi                    -0.56              0.61                  0.57
## akses_air_minum                   -0.37              0.46                  0.45
## tpt                               -0.40              0.52                  0.55
## tpak                               0.46             -0.50                 -0.40
## pdrb                              -0.24              0.27                  0.51
##                         ipm umur_harapan_hidup akses_sanitasi akses_air_minum
## persentase_miskin     -0.71              -0.54          -0.56           -0.37
## rata_lama_sekolah      0.87               0.42           0.61            0.46
## pengeluaran_perkapita  0.87               0.57           0.57            0.45
## ipm                    1.00               0.71           0.70            0.55
## umur_harapan_hidup     0.71               1.00           0.46            0.41
## akses_sanitasi         0.70               0.46           1.00            0.54
## akses_air_minum        0.55               0.41           0.54            1.00
## tpt                    0.58               0.42           0.31            0.41
## tpak                  -0.51              -0.23          -0.44           -0.40
## pdrb                   0.37               0.32           0.19            0.23
##                         tpt  tpak  pdrb
## persentase_miskin     -0.40  0.46 -0.24
## rata_lama_sekolah      0.52 -0.50  0.27
## pengeluaran_perkapita  0.55 -0.40  0.51
## ipm                    0.58 -0.51  0.37
## umur_harapan_hidup     0.42 -0.23  0.32
## akses_sanitasi         0.31 -0.44  0.19
## akses_air_minum        0.41 -0.40  0.23
## tpt                    1.00 -0.58  0.39
## tpak                  -0.58  1.00 -0.21
## pdrb                   0.39 -0.21  1.00

6 Penanganan Missing Value

mv <- sapply(kemiskinan, function(x) sum(is.na(x)))
print(mv)
##               provinsi               kab_kota      persentase_miskin 
##                      0                      0                      0 
##      rata_lama_sekolah  pengeluaran_perkapita                    ipm 
##                      0                      0                      0 
##     umur_harapan_hidup         akses_sanitasi        akses_air_minum 
##                      0                      0                      0 
##                    tpt                   tpak                   pdrb 
##                      0                      0                      0 
## klasifikasi_kemiskinan 
##                      0
cat("Total missing value:", sum(mv), "\n")
## Total missing value: 0
kemiskinan_clean <- kemiskinan %>% drop_na()

cat("\nJumlah baris sebelum drop_na :", nrow(kemiskinan), "\n")
## 
## Jumlah baris sebelum drop_na : 514
cat("Jumlah baris setelah  drop_na :", nrow(kemiskinan_clean), "\n")
## Jumlah baris setelah  drop_na : 514
cat("Baris dihapus                 :", nrow(kemiskinan) - nrow(kemiskinan_clean), "\n")
## Baris dihapus                 : 0

7 Deteksi & Penanganan Outlier (IQR Method)

outlier_summary <- kemiskinan_clean %>%
  select(all_of(variabel_numerik)) %>%
  summarise(
    across(
      everything(),
      list(
        Q1    = ~ quantile(., 0.25, na.rm = TRUE),
        Q3    = ~ quantile(., 0.75, na.rm = TRUE),
        IQR   = ~ IQR(., na.rm = TRUE),
        n_out = ~ sum(
          . < quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE) |
          . > quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE),
          na.rm = TRUE
        )
      )
    )
  )

outlier_n <- outlier_summary %>%
  select(ends_with("_n_out")) %>%
  pivot_longer(everything(), names_to = "variabel", values_to = "n_outlier") %>%
  mutate(variabel = gsub("_n_out", "", variabel))

cat("====== RINGKASAN OUTLIER (IQR Method) ======\n")
## ====== RINGKASAN OUTLIER (IQR Method) ======
print(outlier_n)
## # A tibble: 10 × 2
##    variabel              n_outlier
##    <chr>                     <int>
##  1 persentase_miskin            36
##  2 rata_lama_sekolah            14
##  3 pengeluaran_perkapita        15
##  4 ipm                          27
##  5 umur_harapan_hidup            8
##  6 akses_sanitasi               25
##  7 akses_air_minum              23
##  8 tpt                          12
##  9 tpak                         20
## 10 pdrb                         56

8 Persiapan Data Model

data_model <- kemiskinan_clean %>%
  select(-provinsi, -kab_kota)

cat("====== STRUKTUR DATA MODEL ======\n")
## ====== STRUKTUR DATA MODEL ======
str(data_model)
## tibble [514 × 11] (S3: tbl_df/tbl/data.frame)
##  $ persentase_miskin     : num [1:514] 19 20.4 13.2 13.4 14.4 ...
##  $ rata_lama_sekolah     : num [1:514] 9.48 8.68 8.88 9.67 8.21 ...
##  $ pengeluaran_perkapita : num [1:514] 7148 8776 8180 8030 8577 ...
##  $ ipm                   : num [1:514] 66.4 69.2 67.4 69.4 67.8 ...
##  $ umur_harapan_hidup    : num [1:514] 65.3 67.4 64.4 68.2 68.7 ...
##  $ akses_sanitasi        : num [1:514] 71.6 69.6 62.5 62.7 66.8 ...
##  $ akses_air_minum       : num [1:514] 87.5 78.6 79.7 86.7 83.2 ...
##  $ tpt                   : num [1:514] 5.71 8.36 6.46 6.43 7.13 2.61 7.09 7.7 7.28 4.32 ...
##  $ tpak                  : num [1:514] 71.2 62.9 60.9 69.6 59.5 ...
##  $ pdrb                  : num [1:514] 1648096 1780419 4345784 3487157 8433526 ...
##  $ klasifikasi_kemiskinan: Factor w/ 2 levels "Tidak_Miskin",..: 1 2 1 1 1 1 1 1 1 1 ...

9 Split Data Train - Test (80:20)

set.seed(123)

index_train <- createDataPartition(
  y    = data_model$klasifikasi_kemiskinan,
  p    = 0.80,
  list = FALSE
)

data_train <- data_model[index_train, ]
data_test  <- data_model[-index_train, ]

cat(">> Data Train:\n")
## >> Data Train:
print(round(prop.table(table(data_train$klasifikasi_kemiskinan)) * 100, 2))
## 
## Tidak_Miskin       Miskin 
##        87.86        12.14
cat(">> Data Test:\n")
## >> Data Test:
print(round(prop.table(table(data_test$klasifikasi_kemiskinan)) * 100, 2))
## 
## Tidak_Miskin       Miskin 
##        88.24        11.76
cat("\nUkuran data train:", nrow(data_train), "| data test:", nrow(data_test), "\n")
## 
## Ukuran data train: 412 | data test: 102

10 SMOTE

recipe_smote <- recipe(klasifikasi_kemiskinan ~ ., data = data_train) %>%
  step_smote(klasifikasi_kemiskinan, over_ratio = 1, seed = 123)

prep_smote  <- prep(recipe_smote)
train_smote <- bake(prep_smote, new_data = NULL)

cat("====== DISTRIBUSI SETELAH SMOTE ======\n")
## ====== DISTRIBUSI SETELAH SMOTE ======
print(table(train_smote$klasifikasi_kemiskinan))
## 
## Tidak_Miskin       Miskin 
##          362          362
print(round(prop.table(table(train_smote$klasifikasi_kemiskinan)) * 100, 2))
## 
## Tidak_Miskin       Miskin 
##           50           50

11 Model Decision Tree Awal

model_dt <- rpart(
  klasifikasi_kemiskinan ~ .,
  data    = train_smote,
  method  = "class",
  control = rpart.control(cp = 0.01, maxdepth = 5, minsplit = 20, minbucket = 7)
)

cat("====== RINGKASAN MODEL ======\n")
## ====== RINGKASAN MODEL ======
summary(model_dt)
## Call:
## rpart(formula = klasifikasi_kemiskinan ~ ., data = train_smote, 
##     method = "class", control = rpart.control(cp = 0.01, maxdepth = 5, 
##         minsplit = 20, minbucket = 7))
##   n= 724 
## 
##           CP nsplit  rel error     xerror       xstd
## 1 0.91712707      0 1.00000000 1.07734807 0.03705337
## 2 0.01012891      1 0.08287293 0.10220994 0.01636822
## 3 0.01000000      5 0.03867403 0.09116022 0.01550309
## 
## Variable importance
##     persentase_miskin                   ipm pengeluaran_perkapita 
##                    24                    18                    15 
##                  pdrb    umur_harapan_hidup        akses_sanitasi 
##                    15                    15                    11 
##       akses_air_minum 
##                     1 
## 
## Node number 1: 724 observations,    complexity param=0.9171271
##   predicted class=Tidak_Miskin  expected loss=0.5  P(node) =1
##     class counts:   362   362
##    probabilities: 0.500 0.500 
##   left son=2 (378 obs) right son=3 (346 obs)
##   Primary splits:
##       persentase_miskin     < 18.82007 to the left,  improve=305.0822, (0 missing)
##       ipm                   < 65.71    to the right, improve=180.9541, (0 missing)
##       umur_harapan_hidup    < 68.075   to the right, improve=163.0172, (0 missing)
##       pengeluaran_perkapita < 9659.706 to the right, improve=141.4260, (0 missing)
##       pdrb                  < 3345765  to the right, improve=137.5035, (0 missing)
##   Surrogate splits:
##       ipm                   < 65.71    to the right, agree=0.872, adj=0.731, (0 split)
##       pdrb                  < 4271089  to the right, agree=0.829, adj=0.642, (0 split)
##       pengeluaran_perkapita < 9058.5   to the right, agree=0.826, adj=0.636, (0 split)
##       umur_harapan_hidup    < 68.075   to the right, agree=0.812, adj=0.607, (0 split)
##       akses_sanitasi        < 66.30282 to the right, agree=0.735, adj=0.445, (0 split)
## 
## Node number 2: 378 observations,    complexity param=0.01012891
##   predicted class=Tidak_Miskin  expected loss=0.06084656  P(node) =0.5220994
##     class counts:   355    23
##    probabilities: 0.939 0.061 
##   left son=4 (290 obs) right son=5 (88 obs)
##   Primary splits:
##       umur_harapan_hidup    < 67.64468 to the right, improve=7.251372, (0 missing)
##       persentase_miskin     < 15.45402 to the left,  improve=4.848785, (0 missing)
##       akses_air_minum       < 81.63133 to the right, improve=4.770171, (0 missing)
##       pengeluaran_perkapita < 10207    to the right, improve=3.536388, (0 missing)
##       akses_sanitasi        < 82.135   to the right, improve=2.889230, (0 missing)
##   Surrogate splits:
##       ipm                   < 65.865   to the right, agree=0.831, adj=0.273, (0 split)
##       pengeluaran_perkapita < 6677.5   to the right, agree=0.783, adj=0.068, (0 split)
##       pdrb                  < 1664476  to the right, agree=0.783, adj=0.068, (0 split)
##       persentase_miskin     < 16.315   to the left,  agree=0.775, adj=0.034, (0 split)
## 
## Node number 3: 346 observations
##   predicted class=Miskin        expected loss=0.02023121  P(node) =0.4779006
##     class counts:     7   339
##    probabilities: 0.020 0.980 
## 
## Node number 4: 290 observations
##   predicted class=Tidak_Miskin  expected loss=0.006896552  P(node) =0.4005525
##     class counts:   288     2
##    probabilities: 0.993 0.007 
## 
## Node number 5: 88 observations,    complexity param=0.01012891
##   predicted class=Tidak_Miskin  expected loss=0.2386364  P(node) =0.121547
##     class counts:    67    21
##    probabilities: 0.761 0.239 
##   left son=10 (42 obs) right son=11 (46 obs)
##   Primary splits:
##       akses_air_minum       < 81.63133 to the right, improve=9.151186, (0 missing)
##       pdrb                  < 9661515  to the left,  improve=5.962022, (0 missing)
##       ipm                   < 69.64993 to the left,  improve=5.150429, (0 missing)
##       persentase_miskin     < 17.39    to the left,  improve=4.802914, (0 missing)
##       pengeluaran_perkapita < 10274    to the right, improve=3.758523, (0 missing)
##   Surrogate splits:
##       akses_sanitasi        < 68.27    to the right, agree=0.670, adj=0.310, (0 split)
##       pengeluaran_perkapita < 9182.622 to the left,  agree=0.648, adj=0.262, (0 split)
##       ipm                   < 67.08    to the left,  agree=0.648, adj=0.262, (0 split)
##       umur_harapan_hidup    < 66.86456 to the left,  agree=0.625, adj=0.214, (0 split)
##       tpak                  < 66.1197  to the left,  agree=0.625, adj=0.214, (0 split)
## 
## Node number 10: 42 observations
##   predicted class=Tidak_Miskin  expected loss=0  P(node) =0.05801105
##     class counts:    42     0
##    probabilities: 1.000 0.000 
## 
## Node number 11: 46 observations,    complexity param=0.01012891
##   predicted class=Tidak_Miskin  expected loss=0.4565217  P(node) =0.06353591
##     class counts:    25    21
##    probabilities: 0.543 0.457 
##   left son=22 (33 obs) right son=23 (13 obs)
##   Primary splits:
##       persentase_miskin     < 15.40402 to the left,  improve=7.889024, (0 missing)
##       pdrb                  < 9661515  to the left,  improve=6.182148, (0 missing)
##       pengeluaran_perkapita < 10295    to the right, improve=6.026087, (0 missing)
##       akses_sanitasi        < 77.88508 to the left,  improve=4.610401, (0 missing)
##       akses_air_minum       < 70.08    to the left,  improve=4.522165, (0 missing)
##   Surrogate splits:
##       akses_air_minum       < 78.08788 to the left,  agree=0.804, adj=0.308, (0 split)
##       umur_harapan_hidup    < 67.36145 to the left,  agree=0.783, adj=0.231, (0 split)
##       pengeluaran_perkapita < 8031     to the right, agree=0.761, adj=0.154, (0 split)
##       pdrb                  < 2781274  to the right, agree=0.761, adj=0.154, (0 split)
##       ipm                   < 63.975   to the right, agree=0.739, adj=0.077, (0 split)
## 
## Node number 22: 33 observations,    complexity param=0.01012891
##   predicted class=Tidak_Miskin  expected loss=0.2727273  P(node) =0.04558011
##     class counts:    24     9
##    probabilities: 0.727 0.273 
##   left son=44 (24 obs) right son=45 (9 obs)
##   Primary splits:
##       akses_sanitasi     < 77.88508 to the left,  improve=6.313131, (0 missing)
##       ipm                < 69.30573 to the left,  improve=5.890909, (0 missing)
##       pdrb               < 9661515  to the left,  improve=5.890909, (0 missing)
##       rata_lama_sekolah  < 8.395519 to the left,  improve=3.208556, (0 missing)
##       umur_harapan_hidup < 66.20588 to the left,  improve=3.190909, (0 missing)
##   Surrogate splits:
##       tpt < 2.728599 to the right, agree=0.848, adj=0.444, (0 split)
## 
## Node number 23: 13 observations
##   predicted class=Miskin        expected loss=0.07692308  P(node) =0.0179558
##     class counts:     1    12
##    probabilities: 0.077 0.923 
## 
## Node number 44: 24 observations
##   predicted class=Tidak_Miskin  expected loss=0.08333333  P(node) =0.03314917
##     class counts:    22     2
##    probabilities: 0.917 0.083 
## 
## Node number 45: 9 observations
##   predicted class=Miskin        expected loss=0.2222222  P(node) =0.01243094
##     class counts:     2     7
##    probabilities: 0.222 0.778
cat("\n====== TABEL CP ======\n")
## 
## ====== TABEL CP ======
printcp(model_dt)
## 
## Classification tree:
## rpart(formula = klasifikasi_kemiskinan ~ ., data = train_smote, 
##     method = "class", control = rpart.control(cp = 0.01, maxdepth = 5, 
##         minsplit = 20, minbucket = 7))
## 
## Variables actually used in tree construction:
## [1] akses_air_minum    akses_sanitasi     persentase_miskin  umur_harapan_hidup
## 
## Root node error: 362/724 = 0.5
## 
## n= 724 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.917127      0  1.000000 1.07735 0.037053
## 2 0.010129      1  0.082873 0.10221 0.016368
## 3 0.010000      5  0.038674 0.09116 0.015503

12 Pruning Decision Tree

cp_optimal <- model_dt$cptable[which.min(model_dt$cptable[, "xerror"]), "CP"]
cat("CP Optimal:", cp_optimal, "\n")
## CP Optimal: 0.01
model_dt_pruned <- prune(model_dt, cp = cp_optimal)

cat("\n====== PERBANDINGAN JUMLAH SPLIT ======\n")
## 
## ====== PERBANDINGAN JUMLAH SPLIT ======
cat("Model awal   :", model_dt$cptable[nrow(model_dt$cptable), "nsplit"], "split\n")
## Model awal   : 5 split
cat("Model pruned :", model_dt_pruned$cptable[nrow(model_dt_pruned$cptable), "nsplit"], "split\n")
## Model pruned : 5 split

13 Visualisasi Tree

rpart.plot(
  model_dt,
  type          = 2,
  extra         = 104,
  fallen.leaves = TRUE,
  box.palette   = c("#F44336", "#2196F3"),
  shadow.col    = "gray70",
  main          = "Decision Tree Klasifikasi Kemiskinan"
)

rpart.plot(
  model_dt_pruned,
  type          = 2,
  extra         = 104,
  fallen.leaves = TRUE,
  box.palette   = c("#F44336", "#2196F3"),
  shadow.col    = "gray70",
  main          = "Decision Tree - Setelah Pruning"
)

plotcp(model_dt)
title("Grafik Complexity Parameter (CP)")


16 Evaluasi Model

16.1 Fungsi Evaluasi

tampilkan_evaluasi <- function(label, pred_kelas, pred_prob, aktual) {
  cat("\n", strrep("=", 50), "\n")
  cat("  EVALUASI:", label, "\n")
  cat(strrep("=", 50), "\n")

  cm <- confusionMatrix(data = pred_kelas, reference = aktual, positive = "Miskin")
  print(cm)

  roc_obj <- roc(as.numeric(aktual == "Miskin"), pred_prob[, "Miskin"], quiet = TRUE)
  cat("AUC:", round(auc(roc_obj), 4), "\n")

  invisible(list(cm = cm, roc = roc_obj))
}

16.2 Model Awal

pred_awal <- predict(model_dt, newdata = data_test, type = "class")
prob_awal <- predict(model_dt, newdata = data_test, type = "prob")

eval_awal <- tampilkan_evaluasi("Model Awal", pred_awal, prob_awal, data_test$klasifikasi_kemiskinan)
## 
##  ================================================== 
##   EVALUASI: Model Awal 
## ================================================== 
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Tidak_Miskin Miskin
##   Tidak_Miskin           82      0
##   Miskin                  8     12
##                                           
##                Accuracy : 0.9216          
##                  95% CI : (0.8513, 0.9655)
##     No Information Rate : 0.8824          
##     P-Value [Acc > NIR] : 0.13893         
##                                           
##                   Kappa : 0.7069          
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1176          
##          Detection Rate : 0.1176          
##    Detection Prevalence : 0.1961          
##       Balanced Accuracy : 0.9556          
##                                           
##        'Positive' Class : Miskin          
##                                           
## AUC: 0.9667

16.3 Model Pruned

pred_pruned <- predict(model_dt_pruned, newdata = data_test, type = "class")
prob_pruned <- predict(model_dt_pruned, newdata = data_test, type = "prob")

eval_pruned <- tampilkan_evaluasi("Model Pruned", pred_pruned, prob_pruned, data_test$klasifikasi_kemiskinan)
## 
##  ================================================== 
##   EVALUASI: Model Pruned 
## ================================================== 
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Tidak_Miskin Miskin
##   Tidak_Miskin           82      0
##   Miskin                  8     12
##                                           
##                Accuracy : 0.9216          
##                  95% CI : (0.8513, 0.9655)
##     No Information Rate : 0.8824          
##     P-Value [Acc > NIR] : 0.13893         
##                                           
##                   Kappa : 0.7069          
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1176          
##          Detection Rate : 0.1176          
##    Detection Prevalence : 0.1961          
##       Balanced Accuracy : 0.9556          
##                                           
##        'Positive' Class : Miskin          
##                                           
## AUC: 0.9667

16.4 Grid Search

pred_grid <- predict(model_grid, newdata = data_test)
prob_grid <- predict(model_grid, newdata = data_test, type = "prob")

eval_grid <- tampilkan_evaluasi("Model Grid Search", pred_grid, prob_grid, data_test$klasifikasi_kemiskinan)
## 
##  ================================================== 
##   EVALUASI: Model Grid Search 
## ================================================== 
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Tidak_Miskin Miskin
##   Tidak_Miskin           82      0
##   Miskin                  8     12
##                                           
##                Accuracy : 0.9216          
##                  95% CI : (0.8513, 0.9655)
##     No Information Rate : 0.8824          
##     P-Value [Acc > NIR] : 0.13893         
##                                           
##                   Kappa : 0.7069          
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1176          
##          Detection Rate : 0.1176          
##    Detection Prevalence : 0.1961          
##       Balanced Accuracy : 0.9556          
##                                           
##        'Positive' Class : Miskin          
##                                           
## AUC: 0.9667

16.5 Random Search

pred_random <- predict(model_random, newdata = data_test)
prob_random <- predict(model_random, newdata = data_test, type = "prob")

eval_random <- tampilkan_evaluasi("Model Random Search", pred_random, prob_random, data_test$klasifikasi_kemiskinan)
## 
##  ================================================== 
##   EVALUASI: Model Random Search 
## ================================================== 
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Tidak_Miskin Miskin
##   Tidak_Miskin           89      0
##   Miskin                  1     12
##                                           
##                Accuracy : 0.9902          
##                  95% CI : (0.9466, 0.9998)
##     No Information Rate : 0.8824          
##     P-Value [Acc > NIR] : 4.167e-05       
##                                           
##                   Kappa : 0.9544          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9889          
##          Pos Pred Value : 0.9231          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1176          
##          Detection Rate : 0.1176          
##    Detection Prevalence : 0.1275          
##       Balanced Accuracy : 0.9944          
##                                           
##        'Positive' Class : Miskin          
##                                           
## AUC: 0.9944

17 Perbandingan Metrik

tabel_metrik <- data.frame(
  Model = c("Sebelum Pruning", "Setelah Pruning", "Grid Search", "Random Search"),

  Accuracy = c(
    round(eval_awal$cm$overall["Accuracy"]   * 100, 2),
    round(eval_pruned$cm$overall["Accuracy"] * 100, 2),
    round(eval_grid$cm$overall["Accuracy"]   * 100, 2),
    round(eval_random$cm$overall["Accuracy"] * 100, 2)
  ),

  Sensitivity = c(
    round(eval_awal$cm$byClass["Sensitivity"]   * 100, 2),
    round(eval_pruned$cm$byClass["Sensitivity"] * 100, 2),
    round(eval_grid$cm$byClass["Sensitivity"]   * 100, 2),
    round(eval_random$cm$byClass["Sensitivity"] * 100, 2)
  ),

  Specificity = c(
    round(eval_awal$cm$byClass["Specificity"]   * 100, 2),
    round(eval_pruned$cm$byClass["Specificity"] * 100, 2),
    round(eval_grid$cm$byClass["Specificity"]   * 100, 2),
    round(eval_random$cm$byClass["Specificity"] * 100, 2)
  ),

  Precision = c(
    round(eval_awal$cm$byClass["Pos Pred Value"]   * 100, 2),
    round(eval_pruned$cm$byClass["Pos Pred Value"] * 100, 2),
    round(eval_grid$cm$byClass["Pos Pred Value"]   * 100, 2),
    round(eval_random$cm$byClass["Pos Pred Value"] * 100, 2)
  ),

  F1 = c(
    round(eval_awal$cm$byClass["F1"]   * 100, 2),
    round(eval_pruned$cm$byClass["F1"] * 100, 2),
    round(eval_grid$cm$byClass["F1"]   * 100, 2),
    round(eval_random$cm$byClass["F1"] * 100, 2)
  ),

  AUC = c(
    round(auc(eval_awal$roc),   4),
    round(auc(eval_pruned$roc), 4),
    round(auc(eval_grid$roc),   4),
    round(auc(eval_random$roc), 4)
  )
)

print(tabel_metrik)
##             Model Accuracy Sensitivity Specificity Precision F1    AUC
## 1 Sebelum Pruning    92.16         100       91.11     60.00 75 0.9667
## 2 Setelah Pruning    92.16         100       91.11     60.00 75 0.9667
## 3     Grid Search    92.16         100       91.11     60.00 75 0.9667
## 4   Random Search    99.02         100       98.89     92.31 96 0.9944
tabel_metrik_long <- tabel_metrik %>%
  pivot_longer(cols = -Model, names_to = "Metrik", values_to = "Nilai")

ggplot(tabel_metrik_long, aes(x = Metrik, y = Nilai, fill = Model)) +
  geom_bar(
    stat     = "identity",
    position = position_dodge(width = 0.8),
    width    = 0.68,
    color    = "white"
  ) +
  geom_text(
    aes(label = round(Nilai, 2)),
    position = position_dodge(width = 0.8),
    hjust    = -0.2,
    size     = 3
  ) +
  coord_flip() +
  scale_fill_manual(values = c(
    "Grid Search"     = "#F8766D",
    "Random Search"   = "#7CAE00",
    "Sebelum Pruning" = "#00BFC4",
    "Setelah Pruning" = "#C77CFF"
  )) +
  scale_y_continuous(limits = c(0, 120), expand = expansion(mult = c(0, 0.05))) +
  labs(
    title    = "Perbandingan Metrik Evaluasi",
    subtitle = "Decision Tree: Grid Search vs Random Search vs Pruning",
    x        = NULL,
    y        = "Nilai (%)",
    fill     = "Model"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title      = element_text(face = "bold", size = 18, hjust = 0.5),
    plot.subtitle   = element_text(size = 11, hjust = 0.5),
    axis.text.y     = element_text(size = 11, face = "bold"),
    axis.text.x     = element_text(size = 10),
    axis.title.x    = element_text(size = 12, face = "bold"),
    legend.position = "bottom",
    legend.title    = element_text(size = 10, face = "bold"),
    legend.text     = element_text(size = 9),
    panel.grid.minor = element_blank()
  )


18 Variable Importance

importance_df <- data.frame(
  variabel   = names(model_dt_pruned$variable.importance),
  importance = model_dt_pruned$variable.importance,
  row.names  = NULL
) %>%
  arrange(desc(importance)) %>%
  mutate(
    pct     = round(importance / sum(importance) * 100, 2),
    pct_kum = cumsum(pct)
  )

print(importance_df)
##                variabel importance   pct pct_kum
## 1     persentase_miskin 313.218409 24.20   24.20
## 2                   ipm 228.061555 17.62   41.82
## 3 pengeluaran_perkapita 198.087735 15.30   57.12
## 4                  pdrb 197.454477 15.26   72.38
## 5    umur_harapan_hidup 196.198368 15.16   87.54
## 6        akses_sanitasi 144.933663 11.20   98.74
## 7       akses_air_minum  11.578578  0.89   99.63
## 8                   tpt   2.805836  0.22   99.85
## 9                  tpak   1.960968  0.15  100.00
ggplot(importance_df, aes(x = reorder(variabel, importance), y = importance, fill = importance)) +
  geom_col(show.legend = FALSE, width = 0.7) +
  geom_text(aes(label = paste0(pct, "%")), hjust = -0.1, size = 3.5) +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x     = "Variabel",
    y     = "Importance Score"
  ) +
  theme_minimal(base_size = 12)


19 Prediksi Data Baru

data_baru <- data.frame(
  persentase_miskin     = c(25.5, 8.0, 40.0),
  rata_lama_sekolah     = c(6.5, 10.2, 4.5),
  pengeluaran_perkapita = c(6500, 14000, 5000),
  ipm                   = c(58.0, 78.0, 48.0),
  umur_harapan_hidup    = c(62.0, 72.0, 58.0),
  akses_sanitasi        = c(50.0, 95.0, 10.0),
  akses_air_minum       = c(70.0, 98.0, 40.0),
  tpt                   = c(3.0, 9.0, 1.5),
  tpak                  = c(75.0, 63.0, 88.0),
  pdrb                  = c(2000000, 50000000, 800000)
)

prediksi_baru <- predict(model_dt_pruned, newdata = data_baru, type = "class")
prob_baru     <- predict(model_dt_pruned, newdata = data_baru, type = "prob")

data_baru$prediksi    <- prediksi_baru
data_baru$prob_miskin <- round(prob_baru[, "Miskin"] * 100, 2)

cat("====== HASIL PREDIKSI DATA BARU ======\n")
## ====== HASIL PREDIKSI DATA BARU ======
print(data_baru %>% select(persentase_miskin, ipm, prediksi, prob_miskin))
##   persentase_miskin ipm     prediksi prob_miskin
## 1              25.5  58       Miskin       97.98
## 2               8.0  78 Tidak_Miskin        0.69
## 3              40.0  48       Miskin       97.98

20 Interpretasi Hasil

cat(strrep("=", 60), "\n")
## ============================================================
cat("  INTERPRETASI HASIL ANALISIS\n")
##   INTERPRETASI HASIL ANALISIS
cat(strrep("=", 60), "\n")
## ============================================================
cat("
METODE YANG DIGUNAKAN
* Decision Tree
* SMOTE
* Pruning
* Grid Search
* Random Search
* Cross Validation

KESIMPULAN
* Model mampu melakukan klasifikasi kemiskinan
  dengan performa yang sangat baik.
* Variabel paling penting:
  1. Persentase penduduk miskin
  2. IPM
  3. Pengeluaran per kapita
* AUC tinggi menunjukkan model memiliki
  kemampuan diskriminasi yang sangat baik.
")
## 
## METODE YANG DIGUNAKAN
## * Decision Tree
## * SMOTE
## * Pruning
## * Grid Search
## * Random Search
## * Cross Validation
## 
## KESIMPULAN
## * Model mampu melakukan klasifikasi kemiskinan
##   dengan performa yang sangat baik.
## * Variabel paling penting:
##   1. Persentase penduduk miskin
##   2. IPM
##   3. Pengeluaran per kapita
## * AUC tinggi menunjukkan model memiliki
##   kemampuan diskriminasi yang sangat baik.
cat(strrep("=", 60), "\n")
## ============================================================
cat("  ANALISIS SELESAI\n")
##   ANALISIS SELESAI
cat(strrep("=", 60), "\n")
## ============================================================