UJI KOMPETENSI SERTIFIKASI SAINS DATA
LSP Politeknik Statistika STIS — 2026 Skema: Data Scientist (DS) | Studi Kasus: Penanganan Stunting Balita
Unit Kompetensi: J.62DMI00.001.1 — Menentukan Objektif Bisnis
Elemen: El.1 Permasalahan & Objektif | El.2 Terminologi & Batasan | El.3 Risiko & Mitigasi | El.4 Biaya & Keuntungan
Dimensi: TS | KUK: 1.1–1.3, 2.1–2.2
(Jawaban naratif — IK 01 tidak memerlukan kode R)
Dimensi: TS | KUK: 3.1–3.3, 4.1–4.3
(Jawaban naratif — IK 02 tidak memerlukan kode R)
Unit Kompetensi: J.62DMI00.002.1
Elemen: El.1 Tujuan Teknis | El.2 Kriteria Kesuksesan Teknis
Dimensi: JRES | KUK: 1.1–1.2, 2.1
# ── Muat Dataset Utama ───────────────────────────────────────────────────────
library(readr)
library(tidyverse)
data_stunting_raw <- read_csv("C:/Users/User/Downloads/data_stunting_raw.csv")
str(data_stunting_raw)#> spc_tbl_ [500 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ id_balita : chr [1:500] "BAL0001" "BAL0002" "BAL0003" "BAL0004" ...
#> $ kab_kota : chr [1:500] "KAB_02" "KAB_07" "KAB_27" "KAB_14" ...
#> $ jenis_kelamin : chr [1:500] "L" "L" "L" "L" ...
#> $ usia_bulan : num [1:500] 33 24 10 27 55 29 21 32 26 23 ...
#> $ bb_lahir : num [1:500] 2.16 2.95 3.03 3 3.07 2.76 3.11 3.08 2.52 3.71 ...
#> $ tb_sekarang : num [1:500] 78.4 65.2 61.1 74.6 90.3 73.1 79.7 70.8 65.2 72 ...
#> $ bb_sekarang : num [1:500] 9.39 10.61 8.45 11.99 12.78 ...
#> $ pend_ibu : chr [1:500] "SD" "SMP" "SD" "SMA" ...
#> $ pendapatan : num [1:500] 2828197 1981942 779264 7148622 7289472 ...
#> $ akses_air : chr [1:500] "Tidak" "Ya" "Ya" "Ya" ...
#> $ sanitasi : chr [1:500] "Baik" "Baik" "Kurang" "Kurang" ...
#> $ asi_eksklusif : chr [1:500] "Tidak" "Ya" "Ya" "Ya" ...
#> $ imunisasi : chr [1:500] "Lengkap" "Tidak Lengkap" "Lengkap" "Lengkap" ...
#> $ diare_3bln : num [1:500] 0 2 0 1 0 0 1 0 0 0 ...
#> $ kunjungan_posyandu: num [1:500] 11 11 1 5 7 5 4 7 8 3 ...
#> $ zscore_tbu : num [1:500] 0.246 -2.549 -2.2 -0.189 1.257 ...
#> $ status_stunting : chr [1:500] "Normal" "Stunting" "Stunting" "Normal" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. id_balita = col_character(),
#> .. kab_kota = col_character(),
#> .. jenis_kelamin = col_character(),
#> .. usia_bulan = col_double(),
#> .. bb_lahir = col_double(),
#> .. tb_sekarang = col_double(),
#> .. bb_sekarang = col_double(),
#> .. pend_ibu = col_character(),
#> .. pendapatan = col_double(),
#> .. akses_air = col_character(),
#> .. sanitasi = col_character(),
#> .. asi_eksklusif = col_character(),
#> .. imunisasi = col_character(),
#> .. diare_3bln = col_double(),
#> .. kunjungan_posyandu = col_double(),
#> .. zscore_tbu = col_double(),
#> .. status_stunting = col_character()
#> .. )
#> - attr(*, "problems")=<externalptr>
# ── Distribusi Frekuensi & Persentase ────────────────────────────────────────
library(knitr)
library(kableExtra)
tabel_target <- data_stunting_raw %>%
count(status_stunting) %>%
mutate(persentase = round(n / sum(n) * 100, 2))
kable(tabel_target,
caption = "Distribusi Frekuensi Variabel Target: status_stunting",
col.names = c("Status Stunting", "Frekuensi (n)", "Persentase (%)"),
align = "lrr") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, position = "center") %>%
row_spec(which(tabel_target$status_stunting == "Stunting"),
background = "#FFEBEE", color = "#B71C1C", bold = TRUE)| Status Stunting | Frekuensi (n) | Persentase (%) |
|---|---|---|
| Normal | 371 | 74.2 |
| Stunting | 129 | 25.8 |
# ── Rasio Imbalance ──────────────────────────────────────────────────────────
pct_minoritas <- tabel_target %>%
filter(status_stunting == "Stunting") %>% pull(persentase)
cat("Persentase kelas minoritas (Stunting) :", pct_minoritas, "%\n")#> Persentase kelas minoritas (Stunting) : 25.8 %
#> Persentase kelas mayoritas (Normal) : 74.2 %
cat("Rasio imbalance (Normal : Stunting) :",
round((100 - pct_minoritas) / pct_minoritas, 2), ": 1\n")#> Rasio imbalance (Normal : Stunting) : 2.88 : 1
# ── Bar Chart Distribusi Target ───────────────────────────────────────────────
ggplot(data_stunting_raw, aes(x = status_stunting, fill = status_stunting)) +
geom_bar(width = 0.55, colour = "white", linewidth = 0.4) +
geom_text(stat = "count",
aes(label = paste0(..count.., "\n(",
round(..count.. / nrow(data_stunting_raw) * 100, 1), "%)")),
vjust = -0.4, size = 4.2, fontface = "bold") +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING),
guide = guide_legend(title.position = "top")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Distribusi Variabel Target: Status Stunting",
subtitle = "Dataset data_stunting_raw.csv — 500 Balita, 50 Kab/Kota",
x = "Status Stunting",
y = "Jumlah Balita",
fill = "Status",
caption = "Sumber: data_stunting_raw.csv") +
annotate("text", x = 1.5, y = 430,
label = paste0("Rasio Imbalance\n",
round((100-pct_minoritas)/pct_minoritas, 1), " : 1"),
colour = COL_ACCENT, size = 3.5, fontface = "italic")Figure 2.1: Distribusi Variabel Target Status Stunting (n = 500 balita)
Unit Kompetensi: J.62DMI00.005.1
Elemen: El.1 Tipe & Relasi Data | El.2 Karakteristik Data | El.3 Laporan Telaah
Dimensi: TMS | KUK: 1.1–1.3
# ── Struktur Data ─────────────────────────────────────────────────────────────
library(corrplot)
str(data_stunting_raw)#> spc_tbl_ [500 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ id_balita : chr [1:500] "BAL0001" "BAL0002" "BAL0003" "BAL0004" ...
#> $ kab_kota : chr [1:500] "KAB_02" "KAB_07" "KAB_27" "KAB_14" ...
#> $ jenis_kelamin : chr [1:500] "L" "L" "L" "L" ...
#> $ usia_bulan : num [1:500] 33 24 10 27 55 29 21 32 26 23 ...
#> $ bb_lahir : num [1:500] 2.16 2.95 3.03 3 3.07 2.76 3.11 3.08 2.52 3.71 ...
#> $ tb_sekarang : num [1:500] 78.4 65.2 61.1 74.6 90.3 73.1 79.7 70.8 65.2 72 ...
#> $ bb_sekarang : num [1:500] 9.39 10.61 8.45 11.99 12.78 ...
#> $ pend_ibu : chr [1:500] "SD" "SMP" "SD" "SMA" ...
#> $ pendapatan : num [1:500] 2828197 1981942 779264 7148622 7289472 ...
#> $ akses_air : chr [1:500] "Tidak" "Ya" "Ya" "Ya" ...
#> $ sanitasi : chr [1:500] "Baik" "Baik" "Kurang" "Kurang" ...
#> $ asi_eksklusif : chr [1:500] "Tidak" "Ya" "Ya" "Ya" ...
#> $ imunisasi : chr [1:500] "Lengkap" "Tidak Lengkap" "Lengkap" "Lengkap" ...
#> $ diare_3bln : num [1:500] 0 2 0 1 0 0 1 0 0 0 ...
#> $ kunjungan_posyandu: num [1:500] 11 11 1 5 7 5 4 7 8 3 ...
#> $ zscore_tbu : num [1:500] 0.246 -2.549 -2.2 -0.189 1.257 ...
#> $ status_stunting : chr [1:500] "Normal" "Stunting" "Stunting" "Normal" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. id_balita = col_character(),
#> .. kab_kota = col_character(),
#> .. jenis_kelamin = col_character(),
#> .. usia_bulan = col_double(),
#> .. bb_lahir = col_double(),
#> .. tb_sekarang = col_double(),
#> .. bb_sekarang = col_double(),
#> .. pend_ibu = col_character(),
#> .. pendapatan = col_double(),
#> .. akses_air = col_character(),
#> .. sanitasi = col_character(),
#> .. asi_eksklusif = col_character(),
#> .. imunisasi = col_character(),
#> .. diare_3bln = col_double(),
#> .. kunjungan_posyandu = col_double(),
#> .. zscore_tbu = col_double(),
#> .. status_stunting = col_character()
#> .. )
#> - attr(*, "problems")=<externalptr>
#> # A tibble: 6 × 17
#> id_balita kab_kota jenis_kelamin usia_bulan bb_lahir tb_sekarang bb_sekarang
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 BAL0001 KAB_02 L 33 2.16 78.4 9.39
#> 2 BAL0002 KAB_07 L 24 2.95 65.2 10.6
#> 3 BAL0003 KAB_27 L 10 3.03 61.1 8.45
#> 4 BAL0004 KAB_14 L 27 3 74.6 12.0
#> 5 BAL0005 KAB_14 P 55 3.07 90.3 12.8
#> 6 BAL0006 KAB_30 P 29 2.76 73.1 35
#> # ℹ 10 more variables: pend_ibu <chr>, pendapatan <dbl>, akses_air <chr>,
#> # sanitasi <chr>, asi_eksklusif <chr>, imunisasi <chr>, diare_3bln <dbl>,
#> # kunjungan_posyandu <dbl>, zscore_tbu <dbl>, status_stunting <chr>
# ── Matriks Korelasi Variabel Numerik ─────────────────────────────────────────
numerik_vars <- data_stunting_raw %>%
select(usia_bulan, bb_lahir, tb_sekarang, bb_sekarang,
pendapatan, diare_3bln, kunjungan_posyandu, zscore_tbu)
cor_matrix <- cor(numerik_vars, use = "complete.obs")
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
addCoef.col = "#37474F",
tl.col = "#1A237E",
tl.srt = 45,
tl.cex = 0.85,
number.cex = 0.72,
col = colorRampPalette(c(COL_STUNTING, "white", COL_NORMAL))(200),
cl.cex = 0.75,
title = "Matriks Korelasi Variabel Numerik",
mar = c(0, 0, 2, 0))Figure 3.1: Matriks Korelasi Variabel Numerik — Dataset Stunting
# ── Pasangan Korelasi Tertinggi ───────────────────────────────────────────────
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) %>%
mutate(abs_cor = abs(Freq)) %>%
arrange(desc(abs_cor)) %>%
filter(!duplicated(abs_cor))
kable(head(cor_df, 5),
caption = "5 Pasangan Variabel dengan Korelasi Tertinggi",
col.names = c("Variabel 1", "Variabel 2", "Korelasi", "|Korelasi|"),
digits = 3, align = "llrr") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
column_spec(3:4, bold = TRUE)| Variabel 1 | Variabel 2 | Korelasi | |Korelasi| |
|---|---|---|---|
| zscore_tbu | tb_sekarang | 0.880 | 0.880 |
| bb_sekarang | usia_bulan | 0.493 | 0.493 |
| tb_sekarang | usia_bulan | 0.470 | 0.470 |
| bb_sekarang | tb_sekarang | 0.261 | 0.261 |
| bb_sekarang | bb_lahir | -0.091 | 0.091 |
Dimensi: TMS + CMS | KUK: 2.1–2.3
# ── Statistik Deskriptif Lengkap ──────────────────────────────────────────────
library(e1071)
stat_desk <- data_stunting_raw %>%
select(usia_bulan, bb_lahir, tb_sekarang, bb_sekarang,
pendapatan, diare_3bln, kunjungan_posyandu, zscore_tbu) %>%
summarise(across(everything(), list(
Min = ~min(., na.rm = TRUE),
Maks = ~max(., na.rm = TRUE),
Mean = ~mean(., na.rm = TRUE),
Median = ~median(., na.rm = TRUE),
SD = ~sd(., na.rm = TRUE),
P25 = ~quantile(., 0.25, na.rm = TRUE),
P75 = ~quantile(., 0.75, na.rm = TRUE),
Skew = ~skewness(., na.rm = TRUE)
), .names = "{.col}__{.fn}")) %>%
pivot_longer(everything(),
names_to = c("Variabel", "Statistik"),
names_sep = "__") %>%
pivot_wider(names_from = Statistik, values_from = value)
kable(stat_desk,
caption = "Statistik Deskriptif Variabel Numerik",
digits = 2,
align = c("l", rep("r", 8))) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE) %>%
column_spec(1, bold = TRUE, background = "#E8EAF6") %>%
add_header_above(c(" " = 1, "Ukuran Pemusatan & Sebaran" = 7, "Bentuk" = 1))| Variabel | Min | Maks | Mean | Median | SD | P25 | P75 | Skew |
|---|---|---|---|---|---|---|---|---|
| usia_bulan | 6.00 | 59.00 | 32.96 | 32.50 | 15.77 | 19.00 | 48.00 | 0.04 |
| bb_lahir | 1.54 | 4.43 | 3.10 | 3.10 | 0.51 | 2.76 | 3.45 | -0.07 |
| tb_sekarang | 5.00 | 200.00 | 74.93 | 74.70 | 12.47 | 68.60 | 81.00 | 2.45 |
| bb_sekarang | 0.50 | 50.00 | 12.10 | 12.02 | 3.17 | 10.37 | 13.77 | 4.04 |
| pendapatan | 500000.00 | 14881706.00 | 3063535.35 | 2650597.50 | 1959533.69 | 1697066.25 | 3879258.00 | 1.94 |
| diare_3bln | 0.00 | 5.00 | 0.99 | 0.50 | 1.32 | 0.00 | 2.00 | 1.42 |
| kunjungan_posyandu | 1.00 | 12.00 | 6.62 | 7.00 | 3.56 | 3.00 | 10.00 | -0.05 |
| zscore_tbu | -20.29 | 36.51 | -0.74 | -0.77 | 3.13 | -2.02 | 0.61 | 3.69 |
# ── Visualisasi 1: Histogram Z-Score TB/U ─────────────────────────────────────
ggplot(data_stunting_raw, aes(x = zscore_tbu, fill = status_stunting)) +
geom_histogram(bins = 40, alpha = 0.78, position = "identity",
colour = "white", linewidth = 0.2) +
geom_vline(xintercept = -2, linetype = "dashed",
colour = "#E65100", linewidth = 1) +
annotate("rect", xmin = -Inf, xmax = -2,
ymin = -Inf, ymax = Inf,
fill = "#FFCCBC", alpha = 0.15) +
annotate("label", x = -3.2, y = 30,
label = "Zona\nStunting\n(< -2 SD)",
fill = "#FFF3E0", colour = "#E65100",
size = 3, label.size = 0.3) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
scale_x_continuous(breaks = seq(-6, 4, 1)) +
labs(title = "Distribusi Z-Score TB/U berdasarkan Status Stunting",
subtitle = "Garis putus-putus: cut-off WHO −2 SD",
x = "Z-Score TB/U",
y = "Frekuensi",
fill = "Status Stunting",
caption = "Sumber: data_stunting_raw.csv")Figure 3.2: Distribusi Z-Score TB/U berdasarkan Status Stunting
# ── Visualisasi 2: Boxplot Tinggi & Berat Badan ───────────────────────────────
library(patchwork)
p1 <- ggplot(data_stunting_raw,
aes(x = status_stunting, y = tb_sekarang,
fill = status_stunting)) +
geom_boxplot(outlier.colour = "#FF6F00", outlier.shape = 21,
outlier.size = 2.5, outlier.fill = "#FFB300",
notch = FALSE, width = 0.5, linewidth = 0.5) +
stat_summary(fun = mean, geom = "point", shape = 23,
size = 3, fill = "white", colour = COL_ACCENT) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
labs(title = "Tinggi Badan (cm)",
x = "Status",
y = "Tinggi Badan (cm)") +
theme(legend.position = "none")
p2 <- ggplot(data_stunting_raw,
aes(x = status_stunting, y = bb_sekarang,
fill = status_stunting)) +
geom_boxplot(outlier.colour = "#FF6F00", outlier.shape = 21,
outlier.size = 2.5, outlier.fill = "#FFB300",
notch = FALSE, width = 0.5, linewidth = 0.5) +
stat_summary(fun = mean, geom = "point", shape = 23,
size = 3, fill = "white", colour = COL_ACCENT) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
labs(title = "Berat Badan (kg)",
x = "Status",
y = "Berat Badan (kg)") +
theme(legend.position = "none")
(p1 | p2) +
plot_annotation(
title = "Distribusi Antropometri per Status Stunting",
subtitle = "Berlian putih = nilai rata-rata; titik oranye = outlier",
theme = theme(plot.title = element_text(face = "bold", colour = "#1A237E"),
plot.subtitle = element_text(colour = "#546E7A"))
)Figure 3.3: Boxplot Tinggi dan Berat Badan per Status Stunting
# ── Visualisasi 3: Scatter Plot Z-Score vs Tinggi Badan ───────────────────────
ggplot(data_stunting_raw,
aes(x = tb_sekarang, y = zscore_tbu, colour = status_stunting)) +
geom_point(alpha = 0.55, size = 2, shape = 16) +
geom_smooth(aes(group = status_stunting),
method = "lm", se = TRUE, linewidth = 0.9, linetype = "solid") +
geom_hline(yintercept = -2, linetype = "dashed",
colour = "#E65100", linewidth = 0.9) +
annotate("label", x = 110, y = -1.7,
label = "Cut-off −2 SD", fill = "#FFF3E0",
colour = "#E65100", size = 3, label.size = 0.3) +
scale_colour_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
labs(title = "Scatter Plot: Tinggi Badan vs Z-Score TB/U",
subtitle = "Garis regresi per kelompok status stunting",
x = "Tinggi Badan Sekarang (cm)",
y = "Z-Score TB/U",
colour = "Status Stunting",
caption = "Sumber: data_stunting_raw.csv")Figure 3.4: Scatter Plot Tinggi Badan vs Z-Score TB/U
# ── Nilai Skewness ─────────────────────────────────────────────────────────────
skew_df <- data_stunting_raw %>%
select(usia_bulan, bb_lahir, tb_sekarang, bb_sekarang,
pendapatan, diare_3bln, kunjungan_posyandu, zscore_tbu) %>%
summarise(across(everything(), ~round(skewness(., na.rm = TRUE), 3))) %>%
pivot_longer(everything(), names_to = "Variabel", values_to = "Skewness") %>%
mutate(Kategori = case_when(
abs(Skewness) < 0.5 ~ "Simetris",
abs(Skewness) < 1 ~ "Skewed Sedang",
TRUE ~ "Skewed Berat"
))
kable(skew_df,
caption = "Nilai Skewness Variabel Numerik",
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
column_spec(3, color = ifelse(abs(skew_df$Skewness) >= 1, COL_STUNTING, "black"),
bold = ifelse(abs(skew_df$Skewness) >= 1, TRUE, FALSE))| Variabel | Skewness | Kategori |
|---|---|---|
| usia_bulan | 0.045 | Simetris |
| bb_lahir | -0.073 | Simetris |
| tb_sekarang | 2.450 | Skewed Berat |
| bb_sekarang | 4.039 | Skewed Berat |
| pendapatan | 1.939 | Skewed Berat |
| diare_3bln | 1.422 | Skewed Berat |
| kunjungan_posyandu | -0.049 | Simetris |
| zscore_tbu | 3.695 | Skewed Berat |
Dimensi: CMS + JRES | KUK: 3.1–3.2
# ── Tabel Proporsi per Variabel Kategorik ─────────────────────────────────────
buat_tabel_proporsi <- function(data, var_prediktor) {
data %>%
group_by(across(all_of(c(var_prediktor, "status_stunting")))) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(across(all_of(var_prediktor))) %>%
mutate(proporsi = round(n / sum(n) * 100, 1)) %>%
ungroup()
}
var_kategorik <- c("pend_ibu", "akses_air", "sanitasi",
"asi_eksklusif", "imunisasi")
for (v in var_kategorik) {
cat("\n=== Tabel Proporsi:", v, "vs status_stunting ===\n")
print(buat_tabel_proporsi(data_stunting_raw, v))
}#>
#> === Tabel Proporsi: pend_ibu vs status_stunting ===
#> # A tibble: 12 × 4
#> pend_ibu status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 PT Normal 37 84.1
#> 2 PT Stunting 7 15.9
#> 3 SD Normal 75 68.2
#> 4 SD Stunting 35 31.8
#> 5 SMA Normal 107 69.9
#> 6 SMA Stunting 46 30.1
#> 7 SMP Normal 123 80.4
#> 8 SMP Stunting 30 19.6
#> 9 Tidak Sekolah Normal 19 76
#> 10 Tidak Sekolah Stunting 6 24
#> 11 <NA> Normal 10 66.7
#> 12 <NA> Stunting 5 33.3
#>
#> === Tabel Proporsi: akses_air vs status_stunting ===
#> # A tibble: 4 × 4
#> akses_air status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 Tidak Normal 101 78.3
#> 2 Tidak Stunting 28 21.7
#> 3 Ya Normal 270 72.8
#> 4 Ya Stunting 101 27.2
#>
#> === Tabel Proporsi: sanitasi vs status_stunting ===
#> # A tibble: 4 × 4
#> sanitasi status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 Baik Normal 220 74.3
#> 2 Baik Stunting 76 25.7
#> 3 Kurang Normal 151 74
#> 4 Kurang Stunting 53 26
#>
#> === Tabel Proporsi: asi_eksklusif vs status_stunting ===
#> # A tibble: 4 × 4
#> asi_eksklusif status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 Tidak Normal 166 73.1
#> 2 Tidak Stunting 61 26.9
#> 3 Ya Normal 205 75.1
#> 4 Ya Stunting 68 24.9
#>
#> === Tabel Proporsi: imunisasi vs status_stunting ===
#> # A tibble: 4 × 4
#> imunisasi status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 Lengkap Normal 254 72.8
#> 2 Lengkap Stunting 95 27.2
#> 3 Tidak Lengkap Normal 117 77.5
#> 4 Tidak Lengkap Stunting 34 22.5
# ── Visualisasi 1: Pendidikan Ibu ─────────────────────────────────────────────
data_stunting_raw %>%
filter(!is.na(pend_ibu)) %>%
mutate(pend_ibu = factor(pend_ibu,
levels = c("Tidak Sekolah", "SD", "SMP", "SMA", "PT"))) %>%
ggplot(aes(x = pend_ibu, fill = status_stunting)) +
geom_bar(position = "fill", colour = "white", linewidth = 0.3) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Proporsi Status Stunting per Pendidikan Ibu",
subtitle = "Semakin rendah pendidikan ibu → proporsi stunting cenderung lebih tinggi",
x = "Tingkat Pendidikan Ibu",
y = "Proporsi (%)",
fill = "Status Stunting",
caption = "Sumber: data_stunting_raw.csv") +
theme(axis.text.x = element_text(angle = 15, hjust = 1))Figure 3.5: Proporsi Status Stunting per Tingkat Pendidikan Ibu
# ── Visualisasi 2: Panel 4 Variabel Kategorik ─────────────────────────────────
data_stunting_raw %>%
select(akses_air, sanitasi, asi_eksklusif, imunisasi, status_stunting) %>%
pivot_longer(-status_stunting, names_to = "variabel", values_to = "kategori") %>%
mutate(variabel = recode(variabel,
akses_air = "Akses Air Bersih",
sanitasi = "Sanitasi",
asi_eksklusif = "ASI Eksklusif",
imunisasi = "Imunisasi")) %>%
ggplot(aes(x = kategori, fill = status_stunting)) +
geom_bar(position = "fill", colour = "white", linewidth = 0.3) +
facet_wrap(~variabel, scales = "free_x", nrow = 2) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Proporsi Status Stunting per Variabel Kategorik",
subtitle = "Faktor akses layanan dan perilaku kesehatan",
x = NULL,
y = "Proporsi (%)",
fill = "Status Stunting",
caption = "Sumber: data_stunting_raw.csv") +
theme(axis.text.x = element_text(angle = 25, hjust = 1))Figure 3.6: Proporsi Status Stunting per Variabel Kategorik (Panel)
Unit Kompetensi: J.62DMI00.006.1
Elemen: El.1 Pengecekan Kelengkapan | El.2 Rekomendasi Kelengkapan
Dimensi: TMS + CMS | KUK: 1.1–1.2
# ── IK 07a: Identifikasi Outlier Berdasarkan Batas WHO ───────────────────────
batas_who <- data.frame(
Variabel = c("tb_sekarang", "bb_sekarang", "bb_lahir"),
Batas_Bawah = c(55, 4, 1.5),
Batas_Atas = c(120, 25, 5.0)
)
kable(batas_who,
caption = "Batas Wajar WHO untuk Balita Usia 6–59 Bulan",
col.names = c("Variabel", "Batas Bawah", "Batas Atas"),
align = "lrr") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE)| Variabel | Batas Bawah | Batas Atas |
|---|---|---|
| tb_sekarang | 55.0 | 120 |
| bb_sekarang | 4.0 | 25 |
| bb_lahir | 1.5 | 5 |
# ── Identifikasi outlier per variabel ────────────────────────────────────────
cat("\n=== tb_sekarang: Nilai di luar batas WHO (55-120 cm) ===\n")#>
#> === tb_sekarang: Nilai di luar batas WHO (55-120 cm) ===
tb_outlier <- data_stunting_raw %>%
filter(tb_sekarang < 55 | tb_sekarang > 120) %>%
select(id_balita, usia_bulan, tb_sekarang, status_stunting)
print(tb_outlier)#> # A tibble: 12 × 4
#> id_balita usia_bulan tb_sekarang status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0078 11 53.2 Stunting
#> 2 BAL0079 7 52.1 Stunting
#> 3 BAL0129 51 20 Stunting
#> 4 BAL0206 13 50.2 Stunting
#> 5 BAL0209 17 54.9 Stunting
#> 6 BAL0254 7 54.8 Stunting
#> 7 BAL0301 29 5 Stunting
#> 8 BAL0341 11 15 Stunting
#> 9 BAL0351 16 53 Stunting
#> 10 BAL0390 54 180 Normal
#> 11 BAL0467 19 200 Normal
#> 12 BAL0477 13 52.3 Stunting
#> Jumlah outlier tb_sekarang: 12
#>
#> === bb_sekarang: Nilai di luar batas WHO (4-25 kg) ===
bb_outlier <- data_stunting_raw %>%
filter(bb_sekarang < 4 | bb_sekarang > 25) %>%
select(id_balita, usia_bulan, bb_sekarang, status_stunting)
print(bb_outlier)#> # A tibble: 3 × 4
#> id_balita usia_bulan bb_sekarang status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0006 29 35 Normal
#> 2 BAL0028 52 0.5 Normal
#> 3 BAL0212 56 50 Normal
#> Jumlah outlier bb_sekarang: 3
#>
#> === bb_lahir: Nilai di luar batas WHO (1.5-5.0 kg) ===
bblahir_outlier <- data_stunting_raw %>%
filter(bb_lahir < 1.5 | bb_lahir > 5.0) %>%
select(id_balita, usia_bulan, bb_lahir, status_stunting)
print(bblahir_outlier)#> # A tibble: 0 × 4
#> # ℹ 4 variables: id_balita <chr>, usia_bulan <dbl>, bb_lahir <dbl>,
#> # status_stunting <chr>
#> Jumlah outlier bb_lahir: 0
#>
#> === RINGKASAN OUTLIER ===
cat("tb_sekarang:", nrow(tb_outlier), "outlier dari 500 observasi (",
round(nrow(tb_outlier)/500*100,1), "%)\n")#> tb_sekarang: 12 outlier dari 500 observasi ( 2.4 %)
cat("bb_sekarang:", nrow(bb_outlier), "outlier dari 500 observasi (",
round(nrow(bb_outlier)/500*100,1), "%)\n")#> bb_sekarang: 3 outlier dari 500 observasi ( 0.6 %)
cat("bb_lahir :", nrow(bblahir_outlier), "outlier dari 500 observasi (",
round(nrow(bblahir_outlier)/500*100,1), "%)\n")#> bb_lahir : 0 outlier dari 500 observasi ( 0 %)
#>
#> === Batas outlier SOAL IK09 untuk perbandingan ===
#> tb_sekarang soal: 30-130 cm
#> bb_sekarang soal: 2-20 kg
#> bb_lahir soal: 0.5-5.5 kg
tb_soal <- data_stunting_raw %>% filter(tb_sekarang < 30 | tb_sekarang > 130)
bb_soal <- data_stunting_raw %>% filter(bb_sekarang < 2 | bb_sekarang > 20)
bbl_soal <- data_stunting_raw %>% filter(bb_lahir < 0.5 | bb_lahir > 5.5)
cat("Outlier tb_sekarang (batas soal):", nrow(tb_soal), "\n")#> Outlier tb_sekarang (batas soal): 5
#> Outlier bb_sekarang (batas soal): 3
#> Outlier bb_lahir (batas soal): 0
#>
#> === IK 07b: Kecukupan Data ===
n_obs <- nrow(data_stunting_raw)
n_prediktor <- 15
min_obs <- n_prediktor * 10
rasio <- n_obs / n_prediktor
cat("Jumlah observasi :", n_obs, "\n")#> Jumlah observasi : 500
#> Jumlah variabel prediktor: 15
#> Minimum obs (10x pred) : 150
#> Rasio observasi/prediktor: 33.33
#> Kecukupan data: CUKUP ✓
#>
#> === Missing Values per Variabel ===
data_stunting_raw %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variabel", values_to = "Missing") %>%
filter(Missing > 0) %>%
mutate(Persen = round(Missing/500*100, 1)) %>%
print()#> # A tibble: 3 × 3
#> Variabel Missing Persen
#> <chr> <int> <dbl>
#> 1 bb_lahir 20 4
#> 2 pend_ibu 15 3
#> 3 kunjungan_posyandu 10 2
# ── IK 07a — Identifikasi Outlier per Variabel (Batas WHO ±3SD) ──────────────
outlier_tb <- data_stunting_raw %>%
filter(tb_sekarang < 61.2 | tb_sekarang > 124.0) %>%
select(id_balita, usia_bulan, tb_sekarang, status_stunting)
cat("=== Outlier tb_sekarang (WHO: 61.2–124.0 cm) ===\n")#> === Outlier tb_sekarang (WHO: 61.2–124.0 cm) ===
#> Jumlah outlier: 42
#> # A tibble: 42 × 4
#> id_balita usia_bulan tb_sekarang status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0003 10 61.1 Stunting
#> 2 BAL0023 6 61 Normal
#> 3 BAL0040 18 60.8 Stunting
#> 4 BAL0060 12 60.5 Stunting
#> 5 BAL0071 24 56.2 Stunting
#> 6 BAL0078 11 53.2 Stunting
#> 7 BAL0079 7 52.1 Stunting
#> 8 BAL0085 11 58.1 Stunting
#> 9 BAL0102 16 58.9 Stunting
#> 10 BAL0109 16 58.7 Stunting
#> # ℹ 32 more rows
outlier_bb <- data_stunting_raw %>%
filter(bb_sekarang < 5.5 | bb_sekarang > 24.0) %>%
select(id_balita, usia_bulan, bb_sekarang, status_stunting)
cat("\n=== Outlier bb_sekarang (WHO: 5.5–24.0 kg) ===\n")#>
#> === Outlier bb_sekarang (WHO: 5.5–24.0 kg) ===
#> Jumlah outlier: 3
#> # A tibble: 3 × 4
#> id_balita usia_bulan bb_sekarang status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0006 29 35 Normal
#> 2 BAL0028 52 0.5 Normal
#> 3 BAL0212 56 50 Normal
outlier_bbl <- data_stunting_raw %>%
filter(!is.na(bb_lahir)) %>%
filter(bb_lahir < 2.0 | bb_lahir > 4.5) %>%
select(id_balita, usia_bulan, bb_lahir, status_stunting)
cat("\n=== Outlier bb_lahir (WHO: 2.0–4.5 kg) ===\n")#>
#> === Outlier bb_lahir (WHO: 2.0–4.5 kg) ===
#> Jumlah outlier: 9
#> # A tibble: 9 × 4
#> id_balita usia_bulan bb_lahir status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0013 28 1.72 Normal
#> 2 BAL0132 9 1.95 Normal
#> 3 BAL0155 21 1.83 Normal
#> 4 BAL0156 16 1.85 Normal
#> 5 BAL0249 57 1.54 Normal
#> 6 BAL0326 45 1.7 Stunting
#> 7 BAL0358 42 1.98 Normal
#> 8 BAL0370 37 1.66 Stunting
#> 9 BAL0494 9 1.78 Normal
outlier_zscore <- data_stunting_raw %>%
filter(zscore_tbu < -6 | zscore_tbu > 6) %>%
select(id_balita, usia_bulan, zscore_tbu, status_stunting)
cat("\n=== Outlier zscore_tbu (WHO: -6 s/d +6) ===\n")#>
#> === Outlier zscore_tbu (WHO: -6 s/d +6) ===
#> Jumlah outlier: 5
#> # A tibble: 5 × 4
#> id_balita usia_bulan zscore_tbu status_stunting
#> <chr> <dbl> <dbl> <chr>
#> 1 BAL0129 51 -18.4 Stunting
#> 2 BAL0301 29 -20.3 Stunting
#> 3 BAL0341 11 -15.5 Stunting
#> 4 BAL0390 54 27.0 Normal
#> 5 BAL0467 19 36.5 Normal
#>
#> === Ringkasan Statistik Variabel Antropometri ===
#> tb_sekarang bb_sekarang bb_lahir zscore_tbu
#> Min. : 5.00 Min. : 0.50 Min. :1.540 Min. :-20.2914
#> 1st Qu.: 68.60 1st Qu.:10.37 1st Qu.:2.760 1st Qu.: -2.0200
#> Median : 74.70 Median :12.02 Median :3.100 Median : -0.7685
#> Mean : 74.93 Mean :12.10 Mean :3.102 Mean : -0.7399
#> 3rd Qu.: 81.00 3rd Qu.:13.77 3rd Qu.:3.450 3rd Qu.: 0.6128
#> Max. :200.00 Max. :50.00 Max. :4.430 Max. : 36.5086
#> NA's :20
#>
#> === REKAP TOTAL NILAI TIDAK VALID (Batas WHO) ===
#> tb_sekarang : 42 data tidak valid
#> bb_sekarang : 3 data tidak valid
#> bb_lahir : 9 data tidak valid
#> zscore_tbu : 5 data tidak valid
cat("TOTAL :", nrow(outlier_tb) + nrow(outlier_bb) +
nrow(outlier_bbl) + nrow(outlier_zscore),
"data tidak valid\n")#> TOTAL : 59 data tidak valid
# ── IK 07b — Rule of Thumb Kecukupan Data ────────────────────────────────────
n_observasi <- nrow(data_stunting_raw)
n_prediktor <- 14
min_obs <- n_prediktor * 10
rasio <- round(n_observasi / n_prediktor, 2)
cat("\n=== Penilaian Kecukupan Data (Rule of Thumb: 10 obs/prediktor) ===\n")#>
#> === Penilaian Kecukupan Data (Rule of Thumb: 10 obs/prediktor) ===
#> Jumlah observasi : 500
#> Jumlah variabel prediktor : 14
#> Minimum observasi (10x) : 140
#> Rasio obs/prediktor : 35.71
if (n_observasi >= min_obs) {
cat("Kesimpulan : CUKUP ✓ (rasio", rasio, ">= 10)\n")
} else {
cat("Kesimpulan : TIDAK CUKUP ✗ (rasio", rasio, "< 10)\n")
}#> Kesimpulan : CUKUP ✓ (rasio 35.71 >= 10)
#>
#> === Missing Values per Variabel ===
data_stunting_raw %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(),
names_to = "Variabel",
values_to = "Jumlah_NA") %>%
filter(Jumlah_NA > 0) %>%
print()#> # A tibble: 3 × 2
#> Variabel Jumlah_NA
#> <chr> <int>
#> 1 bb_lahir 20
#> 2 pend_ibu 15
#> 3 kunjungan_posyandu 10
Unit Kompetensi: J.62DMI00.007.1
Elemen: El.1 Kriteria & Teknik Pemilihan | El.2 Attributes & Records
Dimensi: TMS | KUK: 1.1–1.2, 2.1–2.2
# ── IK 08a: Ringkasan variabel & keputusan penggunaan ─────────────────────────
cat("=== Dimensi Dataset ===\n")#> === Dimensi Dataset ===
#> Jumlah baris (observasi): 500
#> Jumlah kolom (variabel) : 17
#>
#> === Nama Variabel ===
#> [1] "id_balita" "kab_kota" "jenis_kelamin"
#> [4] "usia_bulan" "bb_lahir" "tb_sekarang"
#> [7] "bb_sekarang" "pend_ibu" "pendapatan"
#> [10] "akses_air" "sanitasi" "asi_eksklusif"
#> [13] "imunisasi" "diare_3bln" "kunjungan_posyandu"
#> [16] "zscore_tbu" "status_stunting"
#>
#> === Tipe Data per Variabel ===
#> Rows: 500
#> Columns: 17
#> $ id_balita <chr> "BAL0001", "BAL0002", "BAL0003", "BAL0004", "BAL000…
#> $ kab_kota <chr> "KAB_02", "KAB_07", "KAB_27", "KAB_14", "KAB_14", "…
#> $ jenis_kelamin <chr> "L", "L", "L", "L", "P", "P", "L", "P", "L", "P", "…
#> $ usia_bulan <dbl> 33, 24, 10, 27, 55, 29, 21, 32, 26, 23, 17, 35, 28,…
#> $ bb_lahir <dbl> 2.16, 2.95, 3.03, 3.00, 3.07, 2.76, 3.11, 3.08, 2.5…
#> $ tb_sekarang <dbl> 78.4, 65.2, 61.1, 74.6, 90.3, 73.1, 79.7, 70.8, 65.…
#> $ bb_sekarang <dbl> 9.39, 10.61, 8.45, 11.99, 12.78, 35.00, 12.21, 10.9…
#> $ pend_ibu <chr> "SD", "SMP", "SD", "SMA", "SD", "SD", "SD", "SMA", …
#> $ pendapatan <dbl> 2828197, 1981942, 779264, 7148622, 7289472, 3748342…
#> $ akses_air <chr> "Tidak", "Ya", "Ya", "Ya", "Tidak", "Ya", "Ya", "Ya…
#> $ sanitasi <chr> "Baik", "Baik", "Kurang", "Kurang", "Baik", "Baik",…
#> $ asi_eksklusif <chr> "Tidak", "Ya", "Ya", "Ya", "Tidak", "Ya", "Ya", "Ya…
#> $ imunisasi <chr> "Lengkap", "Tidak Lengkap", "Lengkap", "Lengkap", "…
#> $ diare_3bln <dbl> 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
#> $ kunjungan_posyandu <dbl> 11, 11, 1, 5, 7, 5, 4, 7, 8, 3, 1, 5, 7, 12, 9, 8, …
#> $ zscore_tbu <dbl> 0.2457, -2.5486, -2.2000, -0.1886, 1.2571, -0.8343,…
#> $ status_stunting <chr> "Normal", "Stunting", "Stunting", "Normal", "Normal…
#>
#> === Distribusi Target: status_stunting ===
data_stunting_raw %>%
count(status_stunting) %>%
mutate(persen = round(n/sum(n)*100, 2)) %>%
print()#> # A tibble: 2 × 3
#> status_stunting n persen
#> <chr> <int> <dbl>
#> 1 Normal 371 74.2
#> 2 Stunting 129 25.8
#>
#> === Proporsi Stunting per Jenis Kelamin ===
data_stunting_raw %>%
group_by(jenis_kelamin, status_stunting) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(jenis_kelamin) %>%
mutate(proporsi = round(n/sum(n)*100, 2)) %>%
print()#> # A tibble: 4 × 4
#> # Groups: jenis_kelamin [2]
#> jenis_kelamin status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 L Normal 184 69.4
#> 2 L Stunting 81 30.6
#> 3 P Normal 187 79.6
#> 4 P Stunting 48 20.4
#>
#> === Proporsi Stunting per Kelompok Usia ===
data_stunting_raw %>%
mutate(kelompok_usia = case_when(
usia_bulan >= 6 & usia_bulan <= 11 ~ "6-11 bln",
usia_bulan >= 12 & usia_bulan <= 23 ~ "12-23 bln",
usia_bulan >= 24 & usia_bulan <= 35 ~ "24-35 bln",
usia_bulan >= 36 & usia_bulan <= 47 ~ "36-47 bln",
usia_bulan >= 48 & usia_bulan <= 59 ~ "48-59 bln"
)) %>%
group_by(kelompok_usia, status_stunting) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(kelompok_usia) %>%
mutate(proporsi = round(n/sum(n)*100, 2)) %>%
arrange(kelompok_usia) %>%
print()#> # A tibble: 10 × 4
#> # Groups: kelompok_usia [5]
#> kelompok_usia status_stunting n proporsi
#> <chr> <chr> <int> <dbl>
#> 1 12-23 bln Normal 90 77.6
#> 2 12-23 bln Stunting 26 22.4
#> 3 24-35 bln Normal 87 73.7
#> 4 24-35 bln Stunting 31 26.3
#> 5 36-47 bln Normal 66 72.5
#> 6 36-47 bln Stunting 25 27.5
#> 7 48-59 bln Normal 95 74.8
#> 8 48-59 bln Stunting 32 25.2
#> 9 6-11 bln Normal 33 68.8
#> 10 6-11 bln Stunting 15 31.2
#>
#> === Simulasi Stratified Split 80:20 ===
#> Training set (80%):
#> - Target Normal : ekspektasi 297 observasi
#> - Target Stunting: ekspektasi 103 observasi
#> Test set (20%):
#> - Target Normal : ekspektasi 74 observasi
#> - Target Stunting: ekspektasi 26 observasi
#>
#> === Missing Values per Variabel Prediktor ===
data_stunting_raw %>%
select(-id_balita, -kab_kota, -status_stunting) %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(),
names_to = "Variabel",
values_to = "Jumlah_NA") %>%
print()#> # A tibble: 14 × 2
#> Variabel Jumlah_NA
#> <chr> <int>
#> 1 jenis_kelamin 0
#> 2 usia_bulan 0
#> 3 bb_lahir 20
#> 4 tb_sekarang 0
#> 5 bb_sekarang 0
#> 6 pend_ibu 15
#> 7 pendapatan 0
#> 8 akses_air 0
#> 9 sanitasi 0
#> 10 asi_eksklusif 0
#> 11 imunisasi 0
#> 12 diare_3bln 0
#> 13 kunjungan_posyandu 10
#> 14 zscore_tbu 0
Unit Kompetensi: J.62DMI00.008.1
Elemen: El.1 Pembersihan Data Kotor | El.2 Laporan & Rekomendasi
Dimensi: TMS + CMS | KUK: 1.1–1.2
# ── IK 09a: RINGKASAN MASALAH DATA SEBELUM CLEANING ──────────────────────────
cat("=== KONDISI DATA SEBELUM CLEANING ===\n")#> === KONDISI DATA SEBELUM CLEANING ===
#> Jumlah baris: 500
#>
#> --- Missing Values ---
data_stunting_raw %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variabel", values_to = "NA_Count") %>%
filter(NA_Count > 0) %>%
mutate(Persen = round(NA_Count / 500 * 100, 1)) %>%
print()#> # A tibble: 3 × 3
#> Variabel NA_Count Persen
#> <chr> <int> <dbl>
#> 1 bb_lahir 20 4
#> 2 pend_ibu 15 3
#> 3 kunjungan_posyandu 10 2
#>
#> --- Outlier Berdasarkan Batas Soal ---
n_tb <- sum(data_stunting_raw$tb_sekarang < 30 | data_stunting_raw$tb_sekarang > 130, na.rm = TRUE)
n_bb <- sum(data_stunting_raw$bb_sekarang < 2 | data_stunting_raw$bb_sekarang > 20, na.rm = TRUE)
n_bbl <- sum(data_stunting_raw$bb_lahir < 0.5| data_stunting_raw$bb_lahir > 5.5, na.rm = TRUE)
cat("Outlier tb_sekarang (batas 30-130 cm) :", n_tb, "\n")#> Outlier tb_sekarang (batas 30-130 cm) : 5
#> Outlier bb_sekarang (batas 2-20 kg) : 3
#> Outlier bb_lahir (batas 0.5-5.5 kg): 0
#>
#> --- Data Duplikat ---
#> Jumlah baris duplikat: 0
# ── LANGKAH 1 — Hapus Outlier Ekstrem ────────────────────────────────────────
data_step1 <- data_stunting_raw %>%
filter(
tb_sekarang >= 30 & tb_sekarang <= 130,
bb_sekarang >= 2 & bb_sekarang <= 20,
(bb_lahir >= 0.5 & bb_lahir <= 5.5) | is.na(bb_lahir)
)
cat("\nSetelah hapus outlier antropometri:", nrow(data_step1), "baris tersisa\n")#>
#> Setelah hapus outlier antropometri: 492 baris tersisa
#> Baris yang dihapus: 8
# ── LANGKAH 2 — Imputasi Missing Values ──────────────────────────────────────
data_step2 <- data_step1 %>%
mutate(
kelompok_usia = case_when(
usia_bulan <= 12 ~ "0-12",
usia_bulan <= 24 ~ "13-24",
usia_bulan <= 36 ~ "25-36",
usia_bulan <= 48 ~ "37-48",
TRUE ~ "49-59"
)
)
cat("\n--- Median bb_lahir per Kelompok Usia ---\n")#>
#> --- Median bb_lahir per Kelompok Usia ---
median_bb_lahir <- data_step2 %>%
group_by(kelompok_usia) %>%
summarise(median_bb = median(bb_lahir, na.rm = TRUE))
print(median_bb_lahir)#> # A tibble: 5 × 2
#> kelompok_usia median_bb
#> <chr> <dbl>
#> 1 0-12 3.08
#> 2 13-24 3.10
#> 3 25-36 3.08
#> 4 37-48 3.1
#> 5 49-59 3.13
#>
#> --- Median kunjungan_posyandu per Kelompok Usia ---
median_kunjungan <- data_step2 %>%
group_by(kelompok_usia) %>%
summarise(median_kunjungan = median(kunjungan_posyandu, na.rm = TRUE))
print(median_kunjungan)#> # A tibble: 5 × 2
#> kelompok_usia median_kunjungan
#> <chr> <dbl>
#> 1 0-12 7
#> 2 13-24 6
#> 3 25-36 7
#> 4 37-48 7
#> 5 49-59 7
#>
#> --- Modus pend_ibu ---
modus_pend_ibu <- data_step2 %>%
filter(!is.na(pend_ibu)) %>%
count(pend_ibu, sort = TRUE) %>%
slice(1) %>%
pull(pend_ibu)
cat("Modus pend_ibu:", modus_pend_ibu, "\n")#> Modus pend_ibu: SMP
data_step2 <- data_step2 %>%
left_join(median_bb_lahir, by = "kelompok_usia") %>%
left_join(median_kunjungan, by = "kelompok_usia") %>%
mutate(
bb_lahir = ifelse(is.na(bb_lahir), median_bb, bb_lahir),
kunjungan_posyandu = ifelse(is.na(kunjungan_posyandu), median_kunjungan, kunjungan_posyandu),
pend_ibu = ifelse(is.na(pend_ibu), modus_pend_ibu, pend_ibu)
) %>%
select(-median_bb, -median_kunjungan)
cat("\n--- Missing Values Setelah Imputasi ---\n")#>
#> --- Missing Values Setelah Imputasi ---
data_step2 %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variabel", values_to = "NA_Count") %>%
filter(NA_Count > 0) %>%
print()#> # A tibble: 0 × 2
#> # ℹ 2 variables: Variabel <chr>, NA_Count <int>
#> (Jika tidak ada output = tidak ada missing values tersisa)
# ── LANGKAH 3 — Hapus Duplikat ────────────────────────────────────────────────
data_step3 <- data_step2 %>% distinct()
cat("\nSetelah hapus duplikat:", nrow(data_step3), "baris tersisa\n")#>
#> Setelah hapus duplikat: 492 baris tersisa
#> Duplikat yang dihapus: 0
#>
#> === SUMMARY SEBELUM CLEANING ===
data_stunting_raw %>%
select(tb_sekarang, bb_sekarang, bb_lahir, kunjungan_posyandu) %>%
summary() %>% print()#> tb_sekarang bb_sekarang bb_lahir kunjungan_posyandu
#> Min. : 5.00 Min. : 0.50 Min. :1.540 Min. : 1.000
#> 1st Qu.: 68.60 1st Qu.:10.37 1st Qu.:2.760 1st Qu.: 3.000
#> Median : 74.70 Median :12.02 Median :3.100 Median : 7.000
#> Mean : 74.93 Mean :12.10 Mean :3.102 Mean : 6.618
#> 3rd Qu.: 81.00 3rd Qu.:13.77 3rd Qu.:3.450 3rd Qu.:10.000
#> Max. :200.00 Max. :50.00 Max. :4.430 Max. :12.000
#> NA's :20 NA's :10
#>
#> === SUMMARY SESUDAH CLEANING ===
data_step3 %>%
select(tb_sekarang, bb_sekarang, bb_lahir, kunjungan_posyandu) %>%
summary() %>% print()#> tb_sekarang bb_sekarang bb_lahir kunjungan_posyandu
#> Min. : 50.20 Min. : 5.88 Min. :1.540 Min. : 1.00
#> 1st Qu.: 68.67 1st Qu.:10.37 1st Qu.:2.788 1st Qu.: 3.00
#> Median : 74.70 Median :12.01 Median :3.100 Median : 7.00
#> Mean : 74.80 Mean :12.00 Mean :3.105 Mean : 6.62
#> 3rd Qu.: 80.90 3rd Qu.:13.66 3rd Qu.:3.440 3rd Qu.:10.00
#> Max. :104.10 Max. :19.34 Max. :4.430 Max. :12.00
# ── Boxplot Sebelum vs Sesudah Cleaning ──────────────────────────────────────
library(patchwork)
p1a <- ggplot(data_stunting_raw, aes(y = tb_sekarang)) +
geom_boxplot(fill = "#FFCDD2", outlier.colour = COL_STUNTING,
colour = COL_STUNTING, width = 0.45) +
labs(title = "Tinggi Badan\n(Sebelum Cleaning)", y = "cm") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p1b <- ggplot(data_step3, aes(y = tb_sekarang)) +
geom_boxplot(fill = "#C8E6C9", outlier.colour = "#2E7D32",
colour = "#2E7D32", width = 0.45) +
labs(title = "Tinggi Badan\n(Sesudah Cleaning)", y = "cm") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2a <- ggplot(data_stunting_raw, aes(y = bb_sekarang)) +
geom_boxplot(fill = "#FFCDD2", outlier.colour = COL_STUNTING,
colour = COL_STUNTING, width = 0.45) +
labs(title = "Berat Badan\n(Sebelum Cleaning)", y = "kg") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2b <- ggplot(data_step3, aes(y = bb_sekarang)) +
geom_boxplot(fill = "#C8E6C9", outlier.colour = "#2E7D32",
colour = "#2E7D32", width = 0.45) +
labs(title = "Berat Badan\n(Sesudah Cleaning)", y = "kg") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
(p1a | p1b | p2a | p2b) +
plot_annotation(
title = "Perbandingan Distribusi Sebelum dan Sesudah Cleaning",
subtitle = "Merah = sebelum cleaning | Hijau = sesudah cleaning",
theme = theme(plot.title = element_text(face = "bold", colour = "#1A237E"),
plot.subtitle = element_text(colour = "#546E7A"))
)Figure 6.1: Perbandingan Distribusi Variabel Antropometri Sebelum dan Sesudah Cleaning
# ── Simpan data bersih ────────────────────────────────────────────────────────
data_stunting_clean <- data_step3 %>% select(-kelompok_usia)
write_csv(data_stunting_clean, "C:/Users/User/Downloads/data_stunting_clean.csv")
cat("\nFile data_stunting_clean.csv berhasil disimpan!\n")#>
#> File data_stunting_clean.csv berhasil disimpan!
#> Dimensi final: 492 baris x 17 kolom
Dimensi: TRS + JRES | KUK: 2.2–2.3
# ── IK 10a: Prevalensi Sebelum vs Sesudah Cleaning ───────────────────────────
prev_sebelum <- data_stunting_raw %>%
count(status_stunting) %>%
mutate(persen = round(n / sum(n) * 100, 2),
dataset = "Sebelum Cleaning")
prev_sesudah <- data_stunting_clean %>%
count(status_stunting) %>%
mutate(persen = round(n / sum(n) * 100, 2),
dataset = "Sesudah Cleaning")
cat("=== PREVALENSI STUNTING SEBELUM CLEANING ===\n"); print(prev_sebelum)#> === PREVALENSI STUNTING SEBELUM CLEANING ===
#> # A tibble: 2 × 4
#> status_stunting n persen dataset
#> <chr> <int> <dbl> <chr>
#> 1 Normal 371 74.2 Sebelum Cleaning
#> 2 Stunting 129 25.8 Sebelum Cleaning
#>
#> === PREVALENSI STUNTING SESUDAH CLEANING ===
#> # A tibble: 2 × 4
#> status_stunting n persen dataset
#> <chr> <int> <dbl> <chr>
#> 1 Normal 366 74.4 Sesudah Cleaning
#> 2 Stunting 126 25.6 Sesudah Cleaning
perbandingan <- bind_rows(prev_sebelum, prev_sesudah) %>%
select(dataset, status_stunting, n, persen)
kable(perbandingan,
caption = "Perbandingan Prevalensi Stunting Sebelum dan Sesudah Cleaning",
col.names = c("Dataset", "Status", "n", "%"),
align = "llrr") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
collapse_rows(columns = 1, valign = "middle")| Dataset | Status | n | % |
|---|---|---|---|
| Sebelum Cleaning | Normal | 371 | 74.20 |
| Stunting | 129 | 25.80 | |
| Sesudah Cleaning | Normal | 366 | 74.39 |
| Stunting | 126 | 25.61 |
pct_sblm <- prev_sebelum %>% filter(status_stunting == "Stunting") %>% pull(persen)
pct_ssdh <- prev_sesudah %>% filter(status_stunting == "Stunting") %>% pull(persen)
cat("\nPrevalensi Stunting sebelum:", pct_sblm, "%\n")#>
#> Prevalensi Stunting sebelum: 25.8 %
#> Prevalensi Stunting sesudah: 25.61 %
#> Selisih : -0.19 poin persentase
bind_rows(prev_sebelum, prev_sesudah) %>%
ggplot(aes(x = dataset, y = persen, fill = status_stunting)) +
geom_bar(stat = "identity", position = "fill",
colour = "white", linewidth = 0.4, width = 0.5) +
geom_text(aes(label = paste0(persen, "%")),
position = position_fill(vjust = 0.5),
colour = "white", size = 4.2, fontface = "bold") +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Perbandingan Prevalensi Stunting",
subtitle = "Sebelum vs Sesudah Pembersihan Data",
x = NULL,
y = "Proporsi (%)",
fill = "Status Stunting",
caption = "Sumber: data_stunting_raw.csv & data_stunting_clean.csv")Figure 6.2: Perbandingan Prevalensi Stunting Sebelum dan Sesudah Cleaning
# Status stunting data outlier yang dihapus
cat("\n=== STATUS STUNTING PADA DATA YANG DIHAPUS (OUTLIER) ===\n")#>
#> === STATUS STUNTING PADA DATA YANG DIHAPUS (OUTLIER) ===
data_dihapus <- data_stunting_raw %>%
filter(
tb_sekarang < 30 | tb_sekarang > 130 |
bb_sekarang < 2 | bb_sekarang > 20 |
(!is.na(bb_lahir) & (bb_lahir < 0.5 | bb_lahir > 5.5))
)
cat("Jumlah data outlier yang dihapus:", nrow(data_dihapus), "\n")#> Jumlah data outlier yang dihapus: 8
#> # A tibble: 2 × 3
#> status_stunting n persen
#> <chr> <int> <dbl>
#> 1 Normal 5 62.5
#> 2 Stunting 3 37.5
Unit Kompetensi: J.62DMI00.009.1
Elemen: El.1 Teknik Transformasi | El.2 Transformasi Data | El.3 Dokumentasi
Dimensi: TMS + CMS | KUK: 1.1–1.2, 2.1
# ── Muat data clean ───────────────────────────────────────────────────────────
data_stunting_clean <- read_csv("C:/Users/User/Downloads/data_stunting_clean.csv")
cat("Dimensi data clean:", dim(data_stunting_clean), "\n")#> Dimensi data clean: 492 17
# ── Ordinal Encoding: pend_ibu & Binary Encoding ─────────────────────────────
data_encoded <- data_stunting_clean %>%
mutate(
pend_ibu_enc = case_when(
pend_ibu == "Tidak Sekolah" ~ 0,
pend_ibu == "SD" ~ 1,
pend_ibu == "SMP" ~ 2,
pend_ibu == "SMA" ~ 3,
pend_ibu == "PT" ~ 4,
TRUE ~ NA_real_
),
jenis_kelamin_enc = ifelse(jenis_kelamin == "L", 1, 0),
akses_air_enc = ifelse(akses_air == "Ya", 1, 0),
sanitasi_enc = ifelse(sanitasi == "Baik", 1, 0),
asi_eksklusif_enc = ifelse(asi_eksklusif == "Ya", 1, 0),
imunisasi_enc = ifelse(imunisasi == "Lengkap", 1, 0)
)
cat("\n=== Hasil Encoding pend_ibu ===\n")#>
#> === Hasil Encoding pend_ibu ===
#> # A tibble: 5 × 3
#> pend_ibu pend_ibu_enc n
#> <chr> <dbl> <int>
#> 1 Tidak Sekolah 0 25
#> 2 SD 1 109
#> 3 SMP 2 165
#> 4 SMA 3 149
#> 5 PT 4 44
#>
#> === Hasil Encoding Variabel Binary ===
data_encoded %>%
select(jenis_kelamin, jenis_kelamin_enc,
akses_air, akses_air_enc,
sanitasi, sanitasi_enc,
asi_eksklusif, asi_eksklusif_enc,
imunisasi, imunisasi_enc) %>%
distinct() %>%
arrange(jenis_kelamin_enc) %>%
print(n = 20)#> # A tibble: 32 × 10
#> jenis_kelamin jenis_kelamin_enc akses_air akses_air_enc sanitasi sanitasi_enc
#> <chr> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 P 0 Tidak 0 Baik 1
#> 2 P 0 Ya 1 Kurang 0
#> 3 P 0 Ya 1 Baik 1
#> 4 P 0 Ya 1 Baik 1
#> 5 P 0 Tidak 0 Kurang 0
#> 6 P 0 Tidak 0 Baik 1
#> 7 P 0 Ya 1 Kurang 0
#> 8 P 0 Tidak 0 Baik 1
#> 9 P 0 Ya 1 Kurang 0
#> 10 P 0 Ya 1 Baik 1
#> 11 P 0 Tidak 0 Kurang 0
#> 12 P 0 Tidak 0 Kurang 0
#> 13 P 0 Tidak 0 Baik 1
#> 14 P 0 Tidak 0 Kurang 0
#> 15 P 0 Ya 1 Baik 1
#> 16 P 0 Ya 1 Kurang 0
#> 17 L 1 Tidak 0 Baik 1
#> 18 L 1 Ya 1 Baik 1
#> 19 L 1 Ya 1 Kurang 0
#> 20 L 1 Tidak 0 Kurang 0
#> # ℹ 12 more rows
#> # ℹ 4 more variables: asi_eksklusif <chr>, asi_eksklusif_enc <dbl>,
#> # imunisasi <chr>, imunisasi_enc <dbl>
# ── Buat 2 Fitur Baru ─────────────────────────────────────────────────────────
data_encoded <- data_encoded %>%
mutate(
bmi = bb_sekarang / (tb_sekarang / 100)^2,
rasio_bb = bb_sekarang / bb_lahir
)
cat("\n=== Ringkasan Fitur Baru ===\n")#>
#> === Ringkasan Fitur Baru ===
data_encoded %>%
select(bmi, rasio_bb, status_stunting) %>%
group_by(status_stunting) %>%
summarise(
n = n(),
BMI_mean = round(mean(bmi, na.rm=TRUE), 3),
BMI_median = round(median(bmi, na.rm=TRUE), 3),
BMI_sd = round(sd(bmi, na.rm=TRUE), 3),
RasioBB_mean = round(mean(rasio_bb, na.rm=TRUE), 3),
RasioBB_median = round(median(rasio_bb, na.rm=TRUE), 3),
RasioBB_sd = round(sd(rasio_bb, na.rm=TRUE), 3)
) %>% print()#> # A tibble: 2 × 8
#> status_stunting n BMI_mean BMI_median BMI_sd RasioBB_mean RasioBB_median
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Normal 366 20.0 19.6 4.02 3.95 3.85
#> 2 Stunting 126 27.9 27.1 5.97 4.11 3.86
#> # ℹ 1 more variable: RasioBB_sd <dbl>
#>
#> === Uji Mann-Whitney: BMI vs status_stunting ===
#>
#> Wilcoxon rank sum test with continuity correction
#>
#> data: bmi by status_stunting
#> W = 5824, p-value < 2.2e-16
#> alternative hypothesis: true location shift is not equal to 0
#>
#> === Uji Mann-Whitney: rasio_bb vs status_stunting ===
#>
#> Wilcoxon rank sum test with continuity correction
#>
#> data: rasio_bb by status_stunting
#> W = 22114, p-value = 0.4928
#> alternative hypothesis: true location shift is not equal to 0
library(patchwork)
p_bmi <- ggplot(data_encoded,
aes(x = status_stunting, y = bmi, fill = status_stunting)) +
geom_violin(alpha = 0.35, trim = FALSE, colour = NA) +
geom_boxplot(width = 0.25, outlier.colour = "#FF6F00",
outlier.shape = 21, outlier.size = 2,
outlier.fill = "#FFB300", linewidth = 0.5) +
stat_summary(fun = mean, geom = "point", shape = 23,
size = 3.5, fill = "white", colour = COL_ACCENT) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
labs(title = "Indeks Massa Tubuh (BMI)",
x = "Status Stunting",
y = "BMI (kg/m²)") +
theme(legend.position = "none")
p_rasio <- ggplot(data_encoded,
aes(x = status_stunting, y = rasio_bb, fill = status_stunting)) +
geom_violin(alpha = 0.35, trim = FALSE, colour = NA) +
geom_boxplot(width = 0.25, outlier.colour = "#FF6F00",
outlier.shape = 21, outlier.size = 2,
outlier.fill = "#FFB300", linewidth = 0.5) +
stat_summary(fun = mean, geom = "point", shape = 23,
size = 3.5, fill = "white", colour = COL_ACCENT) +
scale_fill_manual(values = c("Normal" = COL_NORMAL, "Stunting" = COL_STUNTING)) +
labs(title = "Rasio BB Sekarang / BB Lahir",
x = "Status Stunting",
y = "Rasio BB") +
theme(legend.position = "none")
(p_bmi | p_rasio) +
plot_annotation(
title = "Distribusi Fitur Rekayasa per Status Stunting",
subtitle = "Violin + Boxplot | Berlian putih = rata-rata",
theme = theme(plot.title = element_text(face = "bold", colour = "#1A237E"),
plot.subtitle = element_text(colour = "#546E7A"))
)Figure 7.1: Distribusi Fitur Rekayasa Baru per Status Stunting
# ── Fungsi Min-Max & Normalisasi ──────────────────────────────────────────────
minmax <- function(x) {
(x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))
}
vars_normalisasi <- c("usia_bulan", "bb_lahir", "tb_sekarang", "bb_sekarang",
"pendapatan", "diare_3bln", "kunjungan_posyandu",
"zscore_tbu", "bmi", "rasio_bb", "pend_ibu_enc")
data_normalized <- data_encoded %>%
mutate(across(all_of(vars_normalisasi), minmax, .names = "{.col}_norm"))
cat("\n=== Verifikasi Range Setelah Min-Max Normalisasi ===\n")#>
#> === Verifikasi Range Setelah Min-Max Normalisasi ===
data_normalized %>%
select(ends_with("_norm")) %>%
summarise(across(everything(), list(
Min = ~round(min(., na.rm=TRUE), 3),
Max = ~round(max(., na.rm=TRUE), 3)
))) %>%
pivot_longer(everything(),
names_to = c("Variabel","Stat"),
names_sep = "_(?=[^_]+$)") %>%
pivot_wider(names_from = Stat, values_from = value) %>%
print(n = 15)#> # A tibble: 11 × 3
#> Variabel Min Max
#> <chr> <dbl> <dbl>
#> 1 usia_bulan_norm 0 1
#> 2 bb_lahir_norm 0 1
#> 3 tb_sekarang_norm 0 1
#> 4 bb_sekarang_norm 0 1
#> 5 pendapatan_norm 0 1
#> 6 diare_3bln_norm 0 1
#> 7 kunjungan_posyandu_norm 0 1
#> 8 zscore_tbu_norm 0 1
#> 9 bmi_norm 0 1
#> 10 rasio_bb_norm 0 1
#> 11 pend_ibu_enc_norm 0 1
Dimensi: TRS + JRES | KUK: 2.3–2.4
# ── Deteksi Multikolinearitas dengan VIF ──────────────────────────────────────
library(car)
fitur_model <- data_encoded %>%
select(usia_bulan, bb_lahir, tb_sekarang, bb_sekarang,
pendapatan, diare_3bln, kunjungan_posyandu,
bmi, rasio_bb, pend_ibu_enc,
jenis_kelamin_enc, akses_air_enc, sanitasi_enc,
asi_eksklusif_enc, imunisasi_enc,
status_stunting) %>%
mutate(status_stunting_num = ifelse(status_stunting == "Stunting", 1, 0)) %>%
drop_na()
model_vif <- lm(status_stunting_num ~ usia_bulan + bb_lahir + tb_sekarang +
bb_sekarang + pendapatan + diare_3bln + kunjungan_posyandu +
bmi + rasio_bb + pend_ibu_enc +
jenis_kelamin_enc + akses_air_enc + sanitasi_enc +
asi_eksklusif_enc + imunisasi_enc,
data = fitur_model)
vif_result <- vif(model_vif)
vif_df <- data.frame(Fitur = names(vif_result),
VIF = round(vif_result, 3)) %>%
arrange(desc(VIF))
kable(vif_df,
caption = "Nilai VIF per Fitur Prediktor",
col.names = c("Fitur", "VIF"),
align = "lr") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
row_spec(which(vif_df$VIF > 10),
background = "#FFEBEE", color = COL_STUNTING, bold = TRUE) %>%
row_spec(which(vif_df$VIF <= 5 & vif_df$VIF > 0),
background = "#E8F5E9")| Fitur | VIF | |
|---|---|---|
| bb_sekarang | bb_sekarang | 21.610 |
| rasio_bb | rasio_bb | 20.178 |
| tb_sekarang | tb_sekarang | 18.240 |
| bmi | bmi | 17.548 |
| bb_lahir | bb_lahir | 8.717 |
| usia_bulan | usia_bulan | 2.390 |
| akses_air_enc | akses_air_enc | 1.042 |
| asi_eksklusif_enc | asi_eksklusif_enc | 1.039 |
| diare_3bln | diare_3bln | 1.038 |
| pend_ibu_enc | pend_ibu_enc | 1.035 |
| imunisasi_enc | imunisasi_enc | 1.029 |
| kunjungan_posyandu | kunjungan_posyandu | 1.026 |
| sanitasi_enc | sanitasi_enc | 1.026 |
| jenis_kelamin_enc | jenis_kelamin_enc | 1.023 |
| pendapatan | pendapatan | 1.021 |
#>
#> === Pasangan korelasi r > 0.9 ===
cor_fitur <- cor(fitur_model %>%
select(usia_bulan, bb_lahir, tb_sekarang, bb_sekarang,
pendapatan, diare_3bln, kunjungan_posyandu,
bmi, rasio_bb),
use = "complete.obs")
cor_melt <- as.data.frame(as.table(cor_fitur)) %>%
filter(Var1 != Var2, abs(Freq) > 0.9) %>%
arrange(desc(abs(Freq)))
print(cor_melt)#> [1] Var1 Var2 Freq
#> <0 rows> (or 0-length row.names)
# ── Dataset Final untuk Modeling ─────────────────────────────────────────────
data_stunting_model <- data_encoded %>%
select(
usia_bulan, jenis_kelamin_enc,
bb_lahir, tb_sekarang, bb_sekarang,
pend_ibu_enc, pendapatan,
akses_air_enc, sanitasi_enc, kunjungan_posyandu,
asi_eksklusif_enc, imunisasi_enc, diare_3bln,
bmi, rasio_bb,
status_stunting
) %>%
drop_na()
cat("\n=== Dataset Final untuk Modeling ===\n")#>
#> === Dataset Final untuk Modeling ===
#> Dimensi: 492 16
#> Variabel: usia_bulan jenis_kelamin_enc bb_lahir tb_sekarang bb_sekarang pend_ibu_enc pendapatan akses_air_enc sanitasi_enc kunjungan_posyandu asi_eksklusif_enc imunisasi_enc diare_3bln bmi rasio_bb status_stunting
#>
#> Missing values:
#> usia_bulan jenis_kelamin_enc bb_lahir tb_sekarang
#> 0 0 0 0
#> bb_sekarang pend_ibu_enc pendapatan akses_air_enc
#> 0 0 0 0
#> sanitasi_enc kunjungan_posyandu asi_eksklusif_enc imunisasi_enc
#> 0 0 0 0
#> diare_3bln bmi rasio_bb status_stunting
#> 0 0 0 0
write_csv(data_stunting_model, "C:/Users/User/Downloads/data_stunting_model.csv")
cat("\nFile data_stunting_model.csv berhasil disimpan!\n")#>
#> File data_stunting_model.csv berhasil disimpan!
#>
#> Head 5 baris:
#> # A tibble: 5 × 16
#> usia_bulan jenis_kelamin_enc bb_lahir tb_sekarang bb_sekarang pend_ibu_enc
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 33 1 2.16 78.4 9.39 1
#> 2 24 1 2.95 65.2 10.6 2
#> 3 10 1 3.03 61.1 8.45 1
#> 4 27 1 3 74.6 12.0 3
#> 5 55 0 3.07 90.3 12.8 1
#> # ℹ 10 more variables: pendapatan <dbl>, akses_air_enc <dbl>,
#> # sanitasi_enc <dbl>, kunjungan_posyandu <dbl>, asi_eksklusif_enc <dbl>,
#> # imunisasi_enc <dbl>, diare_3bln <dbl>, bmi <dbl>, rasio_bb <dbl>,
#> # status_stunting <chr>
Unit Kompetensi: J.62DMI00.012.1
Elemen: El.1 Identifikasi Teknik | El.2 Penentuan Teknik | El.3 Skenario Pengujian
Dimensi: TMS | KUK: 1.1–1.2, 2.1–2.2
(Jawaban naratif — IK 13 tidak memerlukan kode R)
Dimensi: TMS + JRES | KUK: 3.1–3.2
# ── Muat data model ──────────────────────────────────────────────────────────
library(caret)
data_stunting_model <- read_csv("C:/Users/User/Downloads/data_stunting_model.csv")
data_stunting_model <- data_stunting_model %>%
mutate(status_stunting = factor(status_stunting,
levels = c("Normal", "Stunting")))
cat("Dimensi data:", dim(data_stunting_model), "\n")#> Dimensi data: 492 16
#> Distribusi target:
#>
#> Normal Stunting
#> 366 126
#>
#> Normal Stunting
#> 0.7439024 0.2560976
# ── SKENARIO 1: Percentage Split 80:20 ───────────────────────────────────────
set.seed(42)
split_index <- createDataPartition(
data_stunting_model$status_stunting,
p = 0.8,
list = FALSE
)
train_set <- data_stunting_model[split_index, ]
test_set <- data_stunting_model[-split_index, ]
cat("\n=== SKENARIO 1: Percentage Split 80:20 ===\n")#>
#> === SKENARIO 1: Percentage Split 80:20 ===
#> Training set: 394 observasi
#> Test set : 98 observasi
#>
#> Distribusi kelas — Training set:
#>
#> Normal Stunting
#> 293 101
#>
#> Normal Stunting
#> 74.4 25.6
#>
#> Distribusi kelas — Test set:
#>
#> Normal Stunting
#> 73 25
#>
#> Normal Stunting
#> 74.5 25.5
# ── SKENARIO 2: Stratified 5-Fold Cross Validation ───────────────────────────
set.seed(42)
cv_folds <- createFolds(
data_stunting_model$status_stunting,
k = 5,
list = TRUE,
returnTrain = FALSE
)
cat("\n=== SKENARIO 2: Stratified 5-Fold Cross Validation ===\n")#>
#> === SKENARIO 2: Stratified 5-Fold Cross Validation ===
#> Jumlah fold: 5
for (i in 1:5) {
fold_test <- data_stunting_model[cv_folds[[i]], ]
fold_train <- data_stunting_model[-cv_folds[[i]], ]
cat(sprintf("Fold %d | Train: %d | Test: %d | Stunting di test: %d (%.1f%%)\n",
i,
nrow(fold_train),
nrow(fold_test),
sum(fold_test$status_stunting == "Stunting"),
mean(fold_test$status_stunting == "Stunting") * 100))
}#> Fold 1 | Train: 392 | Test: 100 | Stunting di test: 26 (26.0%)
#> Fold 2 | Train: 394 | Test: 98 | Stunting di test: 25 (25.5%)
#> Fold 3 | Train: 394 | Test: 98 | Stunting di test: 25 (25.5%)
#> Fold 4 | Train: 394 | Test: 98 | Stunting di test: 25 (25.5%)
#> Fold 5 | Train: 394 | Test: 98 | Stunting di test: 25 (25.5%)
ctrl_cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final",
sampling = NULL
)
cat("\n=== trainControl (untuk UK-9) sudah disiapkan ===\n")#>
#> === trainControl (untuk UK-9) sudah disiapkan ===
#> Method: 5-fold CV | ClassProbs: TRUE | Summary: ROC
saveRDS(train_set, "C:/Users/User/Downloads/train_set.rds")
saveRDS(test_set, "C:/Users/User/Downloads/test_set.rds")
saveRDS(ctrl_cv, "C:/Users/User/Downloads/ctrl_cv.rds")
cat("\nFile train_set.rds, test_set.rds, ctrl_cv.rds tersimpan!\n")#>
#> File train_set.rds, test_set.rds, ctrl_cv.rds tersimpan!
Unit Kompetensi: J.62DMI00.013.1
Elemen: El.1 Parameter Model | El.2 Tools & Eksekusi
Dimensi: TMS + CMS | KUK: 1.1, 2.1–2.2
# ── Muat data & set factor ────────────────────────────────────────────────────
data_stunting_model <- read_csv("C:/Users/User/Downloads/data_stunting_model.csv") %>%
mutate(status_stunting = factor(status_stunting, levels = c("Normal", "Stunting")))
train_set <- readRDS("C:/Users/User/Downloads/train_set.rds") %>%
mutate(status_stunting = factor(status_stunting, levels = c("Normal", "Stunting")))
test_set <- readRDS("C:/Users/User/Downloads/test_set.rds") %>%
mutate(status_stunting = factor(status_stunting, levels = c("Normal", "Stunting")))
cat("Train:", nrow(train_set), "| Test:", nrow(test_set), "\n")#> Train: 394 | Test: 98
# ── Bangun Logistic Regression ────────────────────────────────────────────────
set.seed(42)
model_lr <- glm(status_stunting ~ .,
data = train_set,
family = binomial(link = "logit"))
cat("\n=== RINGKASAN MODEL LOGISTIC REGRESSION ===\n")#>
#> === RINGKASAN MODEL LOGISTIC REGRESSION ===
#>
#> Call:
#> glm(formula = status_stunting ~ ., family = binomial(link = "logit"),
#> data = train_set)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 4.391e+03 7.279e+05 0.006 0.995
#> usia_bulan 2.218e+01 2.098e+03 0.011 0.992
#> jenis_kelamin_enc 7.199e+00 1.173e+04 0.001 1.000
#> bb_lahir 1.048e+01 6.031e+04 0.000 1.000
#> tb_sekarang -7.283e+01 1.032e+04 -0.007 0.994
#> bb_sekarang 5.256e+01 3.143e+04 0.002 0.999
#> pend_ibu_enc -1.713e+01 9.155e+03 -0.002 0.999
#> pendapatan -5.541e-06 1.653e-03 -0.003 0.997
#> akses_air_enc -2.897e+01 4.421e+04 -0.001 0.999
#> sanitasi_enc 1.688e+01 2.199e+04 0.001 0.999
#> kunjungan_posyandu -2.637e+00 4.162e+03 -0.001 0.999
#> asi_eksklusif_enc 1.924e+01 1.387e+04 0.001 0.999
#> imunisasi_enc 1.093e+01 1.171e+04 0.001 0.999
#> diare_3bln 4.910e-01 4.859e+03 0.000 1.000
#> bmi -2.184e+01 1.233e+04 -0.002 0.999
#> rasio_bb -1.120e+01 4.165e+04 0.000 1.000
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 4.4853e+02 on 393 degrees of freedom
#> Residual deviance: 2.9908e-07 on 378 degrees of freedom
#> AIC: 32
#>
#> Number of Fisher Scoring iterations: 25
# ── Odds Ratio ────────────────────────────────────────────────────────────────
cat("\n=== ODDS RATIO (exp(coef)) ===\n")#>
#> === ODDS RATIO (exp(coef)) ===
or_df <- data.frame(
Variabel = names(coef(model_lr)),
Koefisien = round(coef(model_lr), 4),
OddsRatio = round(exp(coef(model_lr)), 4),
CI_Lower = round(exp(confint(model_lr))[,1], 4),
CI_Upper = round(exp(confint(model_lr))[,2], 4)
)
print(or_df)#> Variabel Koefisien OddsRatio CI_Lower
#> (Intercept) (Intercept) 4391.2463 Inf 0.000
#> usia_bulan usia_bulan 22.1843 4.310624e+09 0.001
#> jenis_kelamin_enc jenis_kelamin_enc 7.1988 1.337831e+03 0.000
#> bb_lahir bb_lahir 10.4839 3.573497e+04 0.000
#> tb_sekarang tb_sekarang -72.8332 0.000000e+00 0.000
#> bb_sekarang bb_sekarang 52.5554 6.675928e+22 0.000
#> pend_ibu_enc pend_ibu_enc -17.1328 0.000000e+00 0.000
#> pendapatan pendapatan 0.0000 1.000000e+00 NA
#> akses_air_enc akses_air_enc -28.9656 0.000000e+00 0.000
#> sanitasi_enc sanitasi_enc 16.8814 2.145304e+07 0.000
#> kunjungan_posyandu kunjungan_posyandu -2.6374 7.150000e-02 NA
#> asi_eksklusif_enc asi_eksklusif_enc 19.2392 2.267191e+08 0.000
#> imunisasi_enc imunisasi_enc 10.9326 5.596998e+04 0.000
#> diare_3bln diare_3bln 0.4910 1.633900e+00 0.000
#> bmi bmi -21.8438 0.000000e+00 0.000
#> rasio_bb rasio_bb -11.2021 0.000000e+00 NA
#> CI_Upper
#> (Intercept) Inf
#> usia_bulan 3.928817e+21
#> jenis_kelamin_enc 1.139606e+251
#> bb_lahir NA
#> tb_sekarang 5.028605e+88
#> bb_sekarang Inf
#> pend_ibu_enc 1.686495e+251
#> pendapatan 1.000200e+00
#> akses_air_enc Inf
#> sanitasi_enc Inf
#> kunjungan_posyandu 1.124387e+128
#> asi_eksklusif_enc NA
#> imunisasi_enc Inf
#> diare_3bln 1.418001e+149
#> bmi NA
#> rasio_bb Inf
#>
#> === VARIABEL SIGNIFIKAN (p < 0.05) ===
coef_summary <- summary(model_lr)$coefficients
sig_vars <- coef_summary[coef_summary[,4] < 0.05, ]
print(round(sig_vars, 4))#> Estimate Std. Error z value Pr(>|z|)
# ── Decision Tree dengan Tuning cp ────────────────────────────────────────────
library(rpart)
library(rpart.plot)
set.seed(42)
ctrl_dt <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
cp_grid <- expand.grid(cp = seq(0.001, 0.05, by = 0.001))
model_dt_cv <- train(
status_stunting ~ .,
data = train_set,
method = "rpart",
trControl = ctrl_dt,
tuneGrid = cp_grid,
metric = "ROC"
)
cat("\n=== HASIL TUNING cp (5-Fold CV) ===\n")#>
#> === HASIL TUNING cp (5-Fold CV) ===
#> cp ROC Sens Spec
#> 1 0.001 0.8495003 0.9147282 0.7119048
#> 2 0.002 0.8495003 0.9147282 0.7119048
#> 3 0.003 0.8495003 0.9147282 0.7119048
#> 4 0.004 0.8495003 0.9147282 0.7119048
#> 5 0.005 0.8495003 0.9147282 0.7119048
#> 6 0.006 0.8495003 0.9147282 0.7119048
#> 7 0.007 0.8495003 0.9147282 0.7119048
#> 8 0.008 0.8495003 0.9147282 0.7119048
#> 9 0.009 0.8495003 0.9147282 0.7119048
#> 10 0.010 0.8495003 0.9147282 0.7119048
#> 11 0.011 0.8495003 0.9147282 0.7119048
#> 12 0.012 0.8495003 0.9147282 0.7119048
#> 13 0.013 0.8439730 0.9216248 0.7023810
#> 14 0.014 0.8439730 0.9216248 0.7023810
#> 15 0.015 0.8439730 0.9216248 0.7023810
#> 16 0.016 0.8439730 0.9216248 0.7023810
#> 17 0.017 0.8439730 0.9216248 0.7023810
#> 18 0.018 0.8439730 0.9216248 0.7023810
#> 19 0.019 0.8439730 0.9216248 0.7023810
#> 20 0.020 0.8439730 0.9216248 0.7023810
#> 21 0.021 0.8439730 0.9216248 0.7023810
#> 22 0.022 0.8439730 0.9216248 0.7023810
#> 23 0.023 0.8439730 0.9216248 0.7023810
#> 24 0.024 0.8439730 0.9216248 0.7023810
#> 25 0.025 0.8310069 0.9419638 0.6823810
#> 26 0.026 0.8310069 0.9419638 0.6823810
#> 27 0.027 0.8310069 0.9419638 0.6823810
#> 28 0.028 0.8310069 0.9419638 0.6823810
#> 29 0.029 0.8310069 0.9419638 0.6823810
#> 30 0.030 0.8310069 0.9419638 0.6823810
#> 31 0.031 0.8310069 0.9419638 0.6823810
#> 32 0.032 0.8310069 0.9419638 0.6823810
#> 33 0.033 0.8310069 0.9419638 0.6823810
#> 34 0.034 0.8310069 0.9419638 0.6823810
#> 35 0.035 0.8310069 0.9419638 0.6823810
#> 36 0.036 0.8310069 0.9419638 0.6823810
#> 37 0.037 0.8310069 0.9419638 0.6823810
#> 38 0.038 0.8323629 0.9453536 0.6723810
#> 39 0.039 0.8323629 0.9453536 0.6723810
#> 40 0.040 0.8323629 0.9453536 0.6723810
#> 41 0.041 0.8323629 0.9453536 0.6723810
#> 42 0.042 0.8323629 0.9453536 0.6723810
#> 43 0.043 0.8323629 0.9453536 0.6723810
#> 44 0.044 0.8277866 0.9284044 0.6723810
#> 45 0.045 0.8277866 0.9284044 0.6723810
#> 46 0.046 0.8277866 0.9284044 0.6723810
#> 47 0.047 0.8277866 0.9284044 0.6723810
#> 48 0.048 0.8277866 0.9284044 0.6723810
#> 49 0.049 0.8277866 0.9284044 0.6723810
#> 50 0.050 0.8277866 0.9284044 0.6723810
#>
#> === cp OPTIMAL (best) ===
#> cp
#> 12 0.012
# 1-SE Rule
results_dt <- model_dt_cv$results
best_roc <- max(results_dt$ROC)
se_roc <- sd(results_dt$ROC) / sqrt(5)
threshold <- best_roc - se_roc
cp_1se <- results_dt$cp[results_dt$ROC >= threshold][1]
cat("\nBest ROC :", round(best_roc, 4), "\n")#>
#> Best ROC : 0.8495
#> 1-SE threshold : 0.8457
#> cp optimal (1-SE rule): 0.001
set.seed(42)
model_dt_final <- rpart(
status_stunting ~ .,
data = train_set,
method = "class",
control = rpart.control(cp = cp_1se)
)
cat("\n=== RINGKASAN DECISION TREE FINAL ===\n")#>
#> === RINGKASAN DECISION TREE FINAL ===
#>
#> Classification tree:
#> rpart(formula = status_stunting ~ ., data = train_set, method = "class",
#> control = rpart.control(cp = cp_1se))
#>
#> Variables actually used in tree construction:
#> [1] bmi rasio_bb tb_sekarang usia_bulan
#>
#> Root node error: 101/394 = 0.25635
#>
#> n= 394
#>
#> CP nsplit rel error xerror xstd
#> 1 0.356436 0 1.00000 1.00000 0.085807
#> 2 0.089109 1 0.64356 0.69307 0.075119
#> 3 0.054455 2 0.55446 0.58416 0.070126
#> 4 0.019802 5 0.37624 0.54455 0.068110
#> 5 0.009901 7 0.33663 0.53465 0.067588
#> 6 0.001000 8 0.32673 0.54455 0.068110
#> bmi tb_sekarang bb_sekarang rasio_bb
#> 61.0341758 41.1727881 13.8814786 12.4942513
#> usia_bulan bb_lahir pendapatan pend_ibu_enc
#> 10.3824263 3.4376910 1.4257306 0.8554017
#> kunjungan_posyandu
#> 0.7136781
rpart.plot(model_dt_final,
type = 4,
extra = 104,
under = TRUE,
fallen.leaves = TRUE,
branch.lty = 3,
shadow.col = "gray70",
col = COL_ACCENT,
main = paste0("Decision Tree Prediksi Stunting (cp = ", cp_1se, ")"),
cex = 0.78,
box.palette = c(COL_NORMAL, COL_STUNTING))Figure 9.1: Decision Tree Prediksi Stunting (cp Optimal)
# ── Random Forest dengan Tuning mtry ─────────────────────────────────────────
library(randomForest)
set.seed(42)
ctrl_rf <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
mtry_grid <- expand.grid(mtry = c(2, 3, 4, 5, 6, 7, 8))
model_rf_cv <- train(
status_stunting ~ .,
data = train_set,
method = "rf",
trControl = ctrl_rf,
tuneGrid = mtry_grid,
metric = "ROC",
ntree = 200
)
cat("\n=== HASIL TUNING mtry (5-Fold CV) ===\n")#>
#> === HASIL TUNING mtry (5-Fold CV) ===
#> mtry ROC Sens Spec
#> 1 2 0.9215732 0.9794857 0.6028571
#> 2 3 0.9382779 0.9692577 0.6223810
#> 3 4 0.9401904 0.9657510 0.6623810
#> 4 5 0.9470196 0.9589714 0.7023810
#> 5 6 0.9543966 0.9590298 0.7219048
#> 6 7 0.9521226 0.9589129 0.7319048
#> 7 8 0.9527794 0.9657510 0.7409524
#>
#> === mtry OPTIMAL ===
#> mtry
#> 5 6
Figure 9.2: Tuning Hyperparameter mtry — Random Forest (5-Fold CV)
#>
#> === FEATURE IMPORTANCE ===
#> rf variable importance
#>
#> Overall
#> bmi 100.00000
#> tb_sekarang 90.73389
#> usia_bulan 42.80157
#> bb_sekarang 18.06126
#> rasio_bb 12.82976
#> pendapatan 11.61447
#> kunjungan_posyandu 8.52009
#> bb_lahir 7.81536
#> diare_3bln 5.81946
#> pend_ibu_enc 3.90669
#> asi_eksklusif_enc 0.78026
#> jenis_kelamin_enc 0.29173
#> sanitasi_enc 0.19062
#> imunisasi_enc 0.02374
#> akses_air_enc 0.00000
# ── Feature Importance yang lebih cantik ─────────────────────────────────────
imp_df <- imp_rf$importance %>%
as.data.frame() %>%
rownames_to_column("Fitur") %>%
rename(Importance = Overall) %>%
arrange(Importance)
ggplot(imp_df, aes(x = reorder(Fitur, Importance), y = Importance)) +
geom_col(aes(fill = Importance), colour = "white", width = 0.7) +
geom_text(aes(label = round(Importance, 1)), hjust = -0.1, size = 3.2) +
scale_fill_gradient(low = "#BBDEFB", high = COL_NORMAL) +
coord_flip() +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Feature Importance — Random Forest",
subtitle = paste0("mtry optimal = ", model_rf_cv$bestTune$mtry),
x = NULL,
y = "Importance Score (ROC)",
fill = "Importance") +
theme(legend.position = "none")Figure 9.3: Feature Importance — Random Forest
# ── Penanganan Class Imbalance dengan ROSE ────────────────────────────────────
library(ROSE)
set.seed(42)
cat("Distribusi kelas training (sebelum ROSE):\n")#> Distribusi kelas training (sebelum ROSE):
#>
#> Normal Stunting
#> 293 101
train_rose <- ROSE(status_stunting ~ ., data = train_set, seed = 42)$data
cat("\nDistribusi kelas training (sesudah ROSE):\n")#>
#> Distribusi kelas training (sesudah ROSE):
#>
#> Normal Stunting
#> 203 191
# RF tanpa ROSE
pred_rf_no <- predict(model_rf_cv, newdata = test_set)
cm_no <- confusionMatrix(pred_rf_no, test_set$status_stunting, positive = "Stunting")
# RF dengan ROSE
set.seed(42)
model_rf_rose <- train(
status_stunting ~ .,
data = train_rose,
method = "rf",
trControl = trainControl(method = "cv", number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary),
tuneGrid = expand.grid(mtry = model_rf_cv$bestTune$mtry),
metric = "ROC",
ntree = 200
)
pred_rf_rose <- predict(model_rf_rose, newdata = test_set)
cm_rose <- confusionMatrix(pred_rf_rose, test_set$status_stunting, positive = "Stunting")
cat("\n=== CONFUSION MATRIX — RF TANPA ROSE ===\n"); print(cm_no)#>
#> === CONFUSION MATRIX — RF TANPA ROSE ===
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Normal Stunting
#> Normal 72 5
#> Stunting 1 20
#>
#> Accuracy : 0.9388
#> 95% CI : (0.8715, 0.9772)
#> No Information Rate : 0.7449
#> P-Value [Acc > NIR] : 6.059e-07
#>
#> Kappa : 0.83
#>
#> Mcnemar's Test P-Value : 0.2207
#>
#> Sensitivity : 0.8000
#> Specificity : 0.9863
#> Pos Pred Value : 0.9524
#> Neg Pred Value : 0.9351
#> Prevalence : 0.2551
#> Detection Rate : 0.2041
#> Detection Prevalence : 0.2143
#> Balanced Accuracy : 0.8932
#>
#> 'Positive' Class : Stunting
#>
#>
#> === CONFUSION MATRIX — RF DENGAN ROSE ===
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Normal Stunting
#> Normal 66 7
#> Stunting 7 18
#>
#> Accuracy : 0.8571
#> 95% CI : (0.7719, 0.9196)
#> No Information Rate : 0.7449
#> P-Value [Acc > NIR] : 0.005306
#>
#> Kappa : 0.6241
#>
#> Mcnemar's Test P-Value : 1.000000
#>
#> Sensitivity : 0.7200
#> Specificity : 0.9041
#> Pos Pred Value : 0.7200
#> Neg Pred Value : 0.9041
#> Prevalence : 0.2551
#> Detection Rate : 0.1837
#> Detection Prevalence : 0.2551
#> Balanced Accuracy : 0.8121
#>
#> 'Positive' Class : Stunting
#>
#>
#> === PERBANDINGAN SENSITIVITY (Recall) ===
#> Tanpa ROSE: 0.8
#> Dengan ROSE: 0.72
#>
#> === PERBANDINGAN SPECIFICITY ===
#> Tanpa ROSE: 0.9863
#> Dengan ROSE: 0.9041
Unit Kompetensi: J.62DMI00.014.1
Elemen: El.1 Pengujian dengan Data Riil | El.2 Penilaian Hasil
Dimensi: TMS + CMS | KUK: 1.2, 2.1–2.2
# ── Load dan persiapkan data ──────────────────────────────────────────────────
library(pROC)
test_set <- readRDS("C:/Users/User/Downloads/test_set.rds") %>%
mutate(status_stunting = factor(status_stunting, levels = c("Normal","Stunting")))
train_set <- readRDS("C:/Users/User/Downloads/train_set.rds") %>%
mutate(status_stunting = factor(status_stunting, levels = c("Normal","Stunting")))
# ── MODEL 1: Logistic Regression ──────────────────────────────────────────────
set.seed(42)
model_lr <- glm(status_stunting ~ ., data = train_set,
family = binomial(link = "logit"))
pred_lr_class <- predict(model_lr, newdata = test_set, type = "response")
pred_lr_label <- factor(ifelse(pred_lr_class >= 0.5, "Stunting", "Normal"),
levels = c("Normal", "Stunting"))
cm_lr <- confusionMatrix(pred_lr_label, test_set$status_stunting, positive = "Stunting")
roc_lr <- roc(test_set$status_stunting, pred_lr_class,
levels = c("Normal","Stunting"), quiet = TRUE)
cat("=== CONFUSION MATRIX — Logistic Regression ===\n"); print(cm_lr)#> === CONFUSION MATRIX — Logistic Regression ===
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Normal Stunting
#> Normal 69 0
#> Stunting 4 25
#>
#> Accuracy : 0.9592
#> 95% CI : (0.8988, 0.9888)
#> No Information Rate : 0.7449
#> P-Value [Acc > NIR] : 1.647e-08
#>
#> Kappa : 0.898
#>
#> Mcnemar's Test P-Value : 0.1336
#>
#> Sensitivity : 1.0000
#> Specificity : 0.9452
#> Pos Pred Value : 0.8621
#> Neg Pred Value : 1.0000
#> Prevalence : 0.2551
#> Detection Rate : 0.2551
#> Detection Prevalence : 0.2959
#> Balanced Accuracy : 0.9726
#>
#> 'Positive' Class : Stunting
#>
#> AUC-ROC LR: 0.9852
# ── MODEL 2: Decision Tree ────────────────────────────────────────────────────
set.seed(42)
model_dt_final <- rpart(status_stunting ~ ., data = train_set,
method = "class",
control = rpart.control(cp = 0.001))
pred_dt_prob <- predict(model_dt_final, newdata = test_set, type = "prob")[, "Stunting"]
pred_dt_label <- predict(model_dt_final, newdata = test_set, type = "class")
cm_dt <- confusionMatrix(pred_dt_label, test_set$status_stunting, positive = "Stunting")
roc_dt <- roc(test_set$status_stunting, pred_dt_prob,
levels = c("Normal","Stunting"), quiet = TRUE)
cat("\n=== CONFUSION MATRIX — Decision Tree ===\n"); print(cm_dt)#>
#> === CONFUSION MATRIX — Decision Tree ===
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Normal Stunting
#> Normal 68 7
#> Stunting 5 18
#>
#> Accuracy : 0.8776
#> 95% CI : (0.7959, 0.9351)
#> No Information Rate : 0.7449
#> P-Value [Acc > NIR] : 0.0009983
#>
#> Kappa : 0.6691
#>
#> Mcnemar's Test P-Value : 0.7728300
#>
#> Sensitivity : 0.7200
#> Specificity : 0.9315
#> Pos Pred Value : 0.7826
#> Neg Pred Value : 0.9067
#> Prevalence : 0.2551
#> Detection Rate : 0.1837
#> Detection Prevalence : 0.2347
#> Balanced Accuracy : 0.8258
#>
#> 'Positive' Class : Stunting
#>
#> AUC-ROC DT: 0.9099
# ── MODEL 3: Random Forest ────────────────────────────────────────────────────
set.seed(42)
model_rf_final <- randomForest(status_stunting ~ ., data = train_set,
mtry = 6, ntree = 200)
pred_rf_prob <- predict(model_rf_final, newdata = test_set, type = "prob")[, "Stunting"]
pred_rf_label <- predict(model_rf_final, newdata = test_set, type = "class")
cm_rf <- confusionMatrix(pred_rf_label, test_set$status_stunting, positive = "Stunting")
roc_rf <- roc(test_set$status_stunting, pred_rf_prob,
levels = c("Normal","Stunting"), quiet = TRUE)
cat("\n=== CONFUSION MATRIX — Random Forest ===\n"); print(cm_rf)#>
#> === CONFUSION MATRIX — Random Forest ===
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Normal Stunting
#> Normal 73 5
#> Stunting 0 20
#>
#> Accuracy : 0.949
#> 95% CI : (0.8849, 0.9832)
#> No Information Rate : 0.7449
#> P-Value [Acc > NIR] : 1.099e-07
#>
#> Kappa : 0.8563
#>
#> Mcnemar's Test P-Value : 0.07364
#>
#> Sensitivity : 0.8000
#> Specificity : 1.0000
#> Pos Pred Value : 1.0000
#> Neg Pred Value : 0.9359
#> Prevalence : 0.2551
#> Detection Rate : 0.2041
#> Detection Prevalence : 0.2041
#> Balanced Accuracy : 0.9000
#>
#> 'Positive' Class : Stunting
#>
#> AUC-ROC RF: 0.9816
# ── Rekap Perbandingan Metrik ─────────────────────────────────────────────────
rekap <- data.frame(
Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
Accuracy = c(round(cm_lr$overall["Accuracy"], 4),
round(cm_dt$overall["Accuracy"], 4),
round(cm_rf$overall["Accuracy"], 4)),
Sensitivity = c(round(cm_lr$byClass["Sensitivity"], 4),
round(cm_dt$byClass["Sensitivity"], 4),
round(cm_rf$byClass["Sensitivity"], 4)),
Specificity = c(round(cm_lr$byClass["Specificity"], 4),
round(cm_dt$byClass["Specificity"], 4),
round(cm_rf$byClass["Specificity"], 4)),
Precision = c(round(cm_lr$byClass["Precision"], 4),
round(cm_dt$byClass["Precision"], 4),
round(cm_rf$byClass["Precision"], 4)),
F1_Score = c(round(cm_lr$byClass["F1"], 4),
round(cm_dt$byClass["F1"], 4),
round(cm_rf$byClass["F1"], 4)),
AUC_ROC = c(round(auc(roc_lr), 4),
round(auc(roc_dt), 4),
round(auc(roc_rf), 4)),
Kappa = c(round(cm_lr$overall["Kappa"], 4),
round(cm_dt$overall["Kappa"], 4),
round(cm_rf$overall["Kappa"], 4))
)
kable(rekap,
caption = "Rekap Perbandingan Metrik Evaluasi — Ketiga Model",
align = c("l", rep("r", 7))) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE) %>%
column_spec(1, bold = TRUE, background = "#E8EAF6") %>%
column_spec(3, background = "#E8F5E9",
color = ifelse(rekap$Sensitivity == max(rekap$Sensitivity),
"#1B5E20", "black"),
bold = ifelse(rekap$Sensitivity == max(rekap$Sensitivity), TRUE, FALSE)) %>%
column_spec(7, background = "#E3F2FD",
color = ifelse(rekap$AUC_ROC == max(rekap$AUC_ROC),
"#0D47A1", "black"),
bold = ifelse(rekap$AUC_ROC == max(rekap$AUC_ROC), TRUE, FALSE)) %>%
add_header_above(c(" " = 1, "Performa Model" = 7))| Model | Accuracy | Sensitivity | Specificity | Precision | F1_Score | AUC_ROC | Kappa |
|---|---|---|---|---|---|---|---|
| Logistic Regression | 0.9592 | 1.00 | 0.9452 | 0.8621 | 0.9259 | 0.9852 | 0.8980 |
| Decision Tree | 0.8776 | 0.72 | 0.9315 | 0.7826 | 0.7500 | 0.9099 | 0.6691 |
| Random Forest | 0.9490 | 0.80 | 1.0000 | 1.0000 | 0.8889 | 0.9816 | 0.8563 |
# ── ROC Curve ─────────────────────────────────────────────────────────────────
# Build label strings first
lbl_lr <- paste0("Logistic Regression (AUC = ", round(auc(roc_lr), 3), ")")
lbl_dt <- paste0("Decision Tree (AUC = ", round(auc(roc_dt), 3), ")")
lbl_rf <- paste0("Random Forest (AUC = ", round(auc(roc_rf), 3), ")")
roc_data <- bind_rows(
data.frame(fpr = 1 - roc_lr$specificities,
tpr = roc_lr$sensitivities,
Model = lbl_lr),
data.frame(fpr = 1 - roc_dt$specificities,
tpr = roc_dt$sensitivities,
Model = lbl_dt),
data.frame(fpr = 1 - roc_rf$specificities,
tpr = roc_rf$sensitivities,
Model = lbl_rf)
)
colour_vals <- c("#1565C0", "#2E7D32", "#C62828")
linetype_vals <- c("solid", "dashed", "dotdash")
names(colour_vals) <- c(lbl_lr, lbl_dt, lbl_rf)
names(linetype_vals) <- c(lbl_lr, lbl_dt, lbl_rf)
ggplot(roc_data, aes(x = fpr, y = tpr, colour = Model, linetype = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed", colour = "gray60", linewidth = 0.7) +
annotate("text", x = 0.62, y = 0.38,
label = "Garis Acak (AUC = 0.5)",
colour = "gray50", size = 3, fontface = "italic") +
scale_colour_manual(values = colour_vals) +
scale_linetype_manual(values = linetype_vals) +
coord_equal() +
labs(title = "ROC Curve — Perbandingan Ketiga Model",
subtitle = "Prediksi Risiko Stunting Balita",
x = "1 − Specificity (False Positive Rate)",
y = "Sensitivity (True Positive Rate)",
colour = "Model",
linetype = "Model",
caption = "Sumber: data_stunting_model.csv (test set 20%)") +
theme(legend.position = "bottom",
legend.direction = "vertical",
legend.key.width = unit(2, "cm"))Figure 10.1: ROC Curve — Perbandingan Ketiga Model Klasifikasi Stunting
#>
#> === DETAIL CM — Logistic Regression ===
#> Reference
#> Prediction Normal Stunting
#> Normal 69 0
#> Stunting 4 25
#>
#> === DETAIL CM — Decision Tree ===
#> Reference
#> Prediction Normal Stunting
#> Normal 68 7
#> Stunting 5 18
#>
#> === DETAIL CM — Random Forest ===
#> Reference
#> Prediction Normal Stunting
#> Normal 73 5
#> Stunting 0 20
Unit Kompetensi: J.62DMI00.015.1
Elemen: El.1 Kesesuaian Proses | El.2 Kualitas Proses
Dimensi: TS | KUK: 1.1–1.2, 2.1–2.2
(Jawaban naratif — IK 18 disajikan dalam tabel checklist Word/Material)
Dimensi: TMS + JRES | KUK: 1.1–2.2
(Laporan akhir disusun dalam dokumen Word terpisah sesuai format TNR 12pt single space)
Dokumen ini dibuat menggunakan R Markdown. Diekspor ke format Word dan Material HTML.
Tanggal: 29 April 2026 | LSP Politeknik Statistika STIS 2026