required_packages <- c("dplyr", "ggplot2", "broom", "knitr", "scales",
                       "readxl", "ResourceSelection", "DescTools", "car",
                       "tidyr", "kableExtra")

missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) install.packages(missing_packages)
invisible(lapply(required_packages, library, character.only = TRUE))

1 Pendahuluan

Latar Belakang: Pencemaran lahan industri merupakan salah satu permasalahan lingkungan yang serius, terutama di negara berkembang yang mengalami urbanisasi pesat seperti China. Jumlah lahan terkontaminasi di China meningkat hingga 325% antara tahun 1990 dan 2018, dengan 43.676 lokasi terkontaminasi teridentifikasi dari 83.498 lahan perusahaan.

Tujuan penelitian:

  1. Mengidentifikasi faktor risiko yang secara signifikan memengaruhi kontaminasi lahan industri.
  2. Membangun model prediktif berbasis regresi logistik biner yang parsimonious dengan validasi asumsi komprehensif.
  3. Menginterpretasikan besar dan arah pengaruh masing-masing faktor melalui nilai Odds Ratio (OR).

2 Data dan Prapemrosesan

2.1 Sumber Data

Dataset yang digunakan adalah Urban Contaminated Sites along China’s Urbanization yang dipublikasikan oleh Li (2023) melalui Mendeley Data (DOI: 10.17632/r4y2vcpfmx.1). Dataset ini mencakup 2.005 lokasi industri dengan 15 variabel (1 respon + 14 prediktor).

2.2 Baca Data

# Ganti path sesuai lokasi file kamu
raw_contamination <- readxl::read_excel("Input dataset.xlsx")
names(raw_contamination) <- trimws(names(raw_contamination))

ringkasan_data <- data.frame(
  Keterangan = c("Jumlah observasi (lokasi industri)", "Jumlah variabel"),
  Nilai      = c(nrow(raw_contamination), ncol(raw_contamination))
)
knitr::kable(ringkasan_data,
             caption = "Ukuran Dataset Kontaminasi Lahan Perkotaan China") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Ukuran Dataset Kontaminasi Lahan Perkotaan China
Keterangan Nilai
Jumlah observasi (lokasi industri) 2005
Jumlah variabel 16

2.3 Kamus Variabel

kamus_variabel <- data.frame(
  `Kolom Asli` = c(
    "Presence of contamination", "Duration", "Starting time",
    "Industry class", "Violations", "Precipitation (mm)",
    "Temperature", "Impervious surfaces", "Scale",
    "Pollutant mobility", "Pollutant volatility",
    "Persistent organic pollutants",
    "Soil texture (Soil clay content in percentage)",
    "Soil erosion", "Wind speed (m/s)"
  ),
  `Variabel Analisis` = c(
    "status_kontaminasi", "durasi_operasi", "tahun_mulai",
    "jenis_industri", "jumlah_pelanggaran", "curah_hujan",
    "suhu", "permukaan_kedap", "skala_industri",
    "mobilitas_polutan", "volatilitas_polutan",
    "polutan_organik_persisten",
    "tekstur_tanah", "erosi_tanah", "kecepatan_angin"
  ),
  `Keterangan` = c(
    "Status kontaminasi lahan (RESPON Y)", "Lama operasi industri (tahun)",
    "Tahun mulai beroperasi", "Jenis/kelas industri",
    "Jumlah pelanggaran lingkungan", "Total curah hujan tahunan (mm)",
    "Rata-rata suhu tahunan (°C)", "Ada/tidaknya permukaan kedap air",
    "Skala ukuran industri", "Tingkat mobilitas polutan dalam tanah",
    "Tingkat volatilitas polutan", "Ada/tidaknya polutan organik persisten",
    "Kandungan liat tanah (%)", "Tingkat erosi tanah",
    "Rata-rata kecepatan angin (m/s)"
  ),
  `Tipe` = c(
    "Respon biner", "Numerik", "Numerik",
    "Kategorik (banyak level)", "Numerik", "Numerik",
    "Numerik", "Kategorik (biner)", "Kategorik (ordinal)",
    "Numerik", "Numerik", "Kategorik (biner)",
    "Numerik", "Kategorik (ordinal)", "Numerik"
  ),
  check.names = FALSE
)
knitr::kable(kamus_variabel,
             caption = "Kamus Variabel Dataset Kontaminasi Lahan") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)
Kamus Variabel Dataset Kontaminasi Lahan
Kolom Asli Variabel Analisis Keterangan Tipe
Presence of contamination status_kontaminasi Status kontaminasi lahan (RESPON Y) Respon biner
Duration durasi_operasi Lama operasi industri (tahun) Numerik
Starting time tahun_mulai Tahun mulai beroperasi Numerik
Industry class jenis_industri Jenis/kelas industri Kategorik (banyak level)
Violations jumlah_pelanggaran Jumlah pelanggaran lingkungan Numerik
Precipitation (mm) curah_hujan Total curah hujan tahunan (mm) Numerik
Temperature suhu Rata-rata suhu tahunan (°C) Numerik
Impervious surfaces permukaan_kedap Ada/tidaknya permukaan kedap air Kategorik (biner)
Scale skala_industri Skala ukuran industri Kategorik (ordinal)
Pollutant mobility mobilitas_polutan Tingkat mobilitas polutan dalam tanah Numerik
Pollutant volatility volatilitas_polutan Tingkat volatilitas polutan Numerik
Persistent organic pollutants polutan_organik_persisten Ada/tidaknya polutan organik persisten Kategorik (biner)
Soil texture (Soil clay content in percentage) tekstur_tanah Kandungan liat tanah (%) Numerik
Soil erosion erosi_tanah Tingkat erosi tanah Kategorik (ordinal)
Wind speed (m/s) kecepatan_angin Rata-rata kecepatan angin (m/s) Numerik

2.4 Preprocessing & Rekoding

Catatan preprocessing jenis_industri: Penggabungan dilakukan dengan ambang batas frekuensi minimum n = 31. - 18 jenis industri dipertahankan sebagai kategori tersendiri (n ≥ 31) - 42 jenis industri dengan n < 31 digabung ke “Industri Lainnya” (n = 427) - Baseline (referensi): “Industri Lainnya” — ditetapkan eksplisit via relevel()

collapse_rare <- function(x, min_n = 31, other_label = "Industri Lainnya") {
  x_chr <- as.character(x)
  keep  <- names(which(table(x_chr) >= min_n))
  factor(ifelse(x_chr %in% keep, x_chr, other_label))
}

erosi_levels <- c("Slight", "Mild", "Moderate", "Strong", "Very strong", "Severe")

contamination <- raw_contamination %>%
  transmute(
    status_kontaminasi = factor(`Presence of contamination`, levels = c("No", "Yes")),
    kontaminasi_biner  = as.integer(`Presence of contamination` == "Yes"),
    durasi_operasi         = as.numeric(Duration),
    tahun_mulai            = as.numeric(`Starting time`),
    jumlah_pelanggaran     = as.numeric(Violations),
    curah_hujan            = as.numeric(`Precipitation (mm)`),
    suhu                   = as.numeric(`Temperature (℃)`),
    mobilitas_polutan      = as.numeric(`Pollutant mobility`),
    volatilitas_polutan    = as.numeric(`Pollutant volatility`),
    tekstur_tanah          = as.numeric(`Soil texture (Soil clay content in percentage)`),
    kecepatan_angin        = as.numeric(`Wind speed (m/s)`),
    log_mobilitas_polutan  = log(as.numeric(`Pollutant mobility`) + 1),
    permukaan_kedap = factor(`Impervious surfaces`,
                             levels = c("No","Yes"), labels = c("Tidak","Ya")),
    polutan_organik_persisten = factor(`Persistent organic pollutants`,
                                       levels = c("No","Yes"), labels = c("Tidak","Ya")),
    skala_industri = factor(Scale,
                            levels = c("Micro","Small","Medium","Large","Extra-large"),
                            labels = c("Mikro","Kecil","Menengah","Besar","Sangat Besar")),
    erosi_tanah = factor(`Soil erosion`, levels = erosi_levels,
                         labels = c("Ringan","Agak Ringan","Sedang",
                                    "Kuat","Sangat Kuat","Parah")),
    jenis_industri = {
      x_chr <- as.character(`Industry class`)
      freq  <- table(x_chr)
      keep  <- names(freq[freq >= 31])
      factor(ifelse(x_chr %in% keep, x_chr, "Industri Lainnya"))
    }
  ) %>%
  droplevels() %>%
  mutate(jenis_industri = relevel(jenis_industri, ref = "Industri Lainnya"))

2.4.1 Verifikasi Preprocessing jenis_industri

freq_industri <- sort(table(contamination$jenis_industri), decreasing = TRUE)
freq_df <- as.data.frame(freq_industri)
names(freq_df) <- c("Jenis Industri", "Frekuensi")
knitr::kable(freq_df,
             caption = "Frekuensi Jenis Industri Setelah Penggabungan") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Frekuensi Jenis Industri Setelah Penggabungan
Jenis Industri Frekuensi
Industri Lainnya 427
Metal surface treatment and heat treatment processing 209
Organic chemical raw material manufacturing 134
Cotton printing and dyeing processing 133
Chemical reagent and additive manufacturing 129
Pesticide manufacturing 126
Leather tanning and processing 121
Coking 104
Crude oil processing and petroleum product manufacturing 100
Iron ore mining and dressing 84
Hazardous waste treatment 67
Lead-zinc smelting 52
Petroleum extraction 52
Iron smelting 50
Steel smelting 49
Chemical pharmaceutical raw material manufacturing 48
Lead-zinc ore mining and dressing 47
Gold ore mining and dressing 40
Copper smelting 33
cat("Baseline jenis_industri:", levels(contamination$jenis_industri)[1])
## Baseline jenis_industri: Industri Lainnya

3 Eksplorasi Data

3.1 Distribusi Variabel Respon

distribusi_y <- contamination %>%
  count(status_kontaminasi, name = "Jumlah") %>%
  mutate(Proporsi = scales::percent(Jumlah / sum(Jumlah), accuracy = 0.1)) %>%
  rename("Status Kontaminasi" = status_kontaminasi)

knitr::kable(distribusi_y,
             caption = "Distribusi Kelas Respon (Y)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Distribusi Kelas Respon (Y)
Status Kontaminasi Jumlah Proporsi
No 801 40.0%
Yes 1204 60.0%
ggplot(contamination, aes(x = status_kontaminasi, fill = status_kontaminasi)) +
  geom_bar(width = 0.62, color = "white", linewidth = 0.8) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
  labs(title = "Distribusi Status Kontaminasi Lahan",
       x = NULL, y = "Jumlah Lokasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

3.2 Statistik Deskriptif per Kelas

ringkasan_numerik <- contamination %>%
  group_by(status_kontaminasi) %>%
  summarise(
    Jumlah                       = n(),
    `Rata-rata Durasi (th)`      = round(mean(durasi_operasi,      na.rm = TRUE), 2),
    `Tahun Mulai (rata-rata)`    = round(mean(tahun_mulai,         na.rm = TRUE), 2),
    `Rata-rata Pelanggaran`      = round(mean(jumlah_pelanggaran,  na.rm = TRUE), 2),
    `Median Curah Hujan (mm)`    = round(median(curah_hujan,       na.rm = TRUE), 2),
    `Rata-rata Suhu (°C)`        = round(mean(suhu,                na.rm = TRUE), 2),
    `Rata-rata Mobilitas Pol.`   = round(mean(mobilitas_polutan,   na.rm = TRUE), 2),
    `Rata-rata Volatilitas Pol.` = round(mean(volatilitas_polutan, na.rm = TRUE), 2),
    `Rata-rata Tekstur Tanah`    = round(mean(tekstur_tanah,       na.rm = TRUE), 2),
    `Rata-rata Kec. Angin (m/s)` = round(mean(kecepatan_angin,     na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  rename("Status Kontaminasi" = status_kontaminasi)

ringkasan_numerik %>%
  tidyr::pivot_longer(-`Status Kontaminasi`, names_to = "Statistik", values_to = "Nilai") %>%
  tidyr::pivot_wider(names_from = `Status Kontaminasi`, values_from = Nilai) %>%
  knitr::kable(caption = "Ringkasan Numerik per Kelas Respon") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Ringkasan Numerik per Kelas Respon
Statistik No Yes
Jumlah 801.00 1204.00
Rata-rata Durasi (th) 13.36 24.92
Tahun Mulai (rata-rata) 2004.07 1992.11
Rata-rata Pelanggaran 1.00 2.34
Median Curah Hujan (mm) 8584.68 12222.13
Rata-rata Suhu (°C) 13.55 14.26
Rata-rata Mobilitas Pol. 0.59 0.60
Rata-rata Volatilitas Pol. 0.57 0.64
Rata-rata Tekstur Tanah 26.11 26.14
Rata-rata Kec. Angin (m/s) 21.49 22.13

3.3 Visualisasi Eksplorasi

# Scatter: durasi vs curah hujan
ggplot(contamination, aes(x = durasi_operasi, y = curah_hujan,
                          color = status_kontaminasi)) +
  geom_point(alpha = 0.45, size = 1.8) +
  scale_color_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
  scale_y_continuous(labels = scales::comma) +
  labs(title    = "Durasi Operasi vs Curah Hujan",
       subtitle = "Warna = status kontaminasi lahan",
       x = "Durasi operasi (tahun)", y = "Curah hujan tahunan (mm)",
       color = "Status Kontaminasi") +
  theme_minimal(base_size = 12)

# Proporsi kontaminasi per skala industri
ggplot(contamination, aes(x = skala_industri, fill = status_kontaminasi)) +
  geom_bar(position = "fill", width = 0.7, color = "white") +
  scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Proporsi Kontaminasi per Skala Industri",
       x = "Skala Industri", y = "Proporsi", fill = "Status Kontaminasi") +
  theme_minimal(base_size = 12)

3.4 Contoh 8 Data Pertama

contamination %>%
  select(status_kontaminasi, jenis_industri, durasi_operasi,
         tahun_mulai, skala_industri, permukaan_kedap, erosi_tanah,
         curah_hujan, suhu) %>%
  head(8) %>%
  knitr::kable(caption = "Contoh 8 Data Pertama") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Contoh 8 Data Pertama
status_kontaminasi jenis_industri durasi_operasi tahun_mulai skala_industri permukaan_kedap erosi_tanah curah_hujan suhu
Yes Petroleum extraction 21 1992 Mikro Tidak Ringan 13775.359 19.7
No Petroleum extraction 16 2000 Mikro Tidak Ringan 18130.318 21.5
Yes Petroleum extraction 44 1978 Mikro Tidak Ringan 7784.797 14.3
No Petroleum extraction 23 1979 Mikro Ya Ringan 4900.000 13.5
No Petroleum extraction 3 2011 Besar Tidak Kuat 4841.117 9.8
Yes Petroleum extraction 36 1986 Mikro Ya Ringan 5840.898 13.4
Yes Petroleum extraction 42 1980 Menengah Ya Ringan 5885.018 13.4
Yes Petroleum extraction 46 1976 Besar Ya Ringan 4303.062 7.2

4 Split Data Training & Testing

set.seed(42)

stratified_split <- function(y, prop = 0.8) {
  idx_by_class <- split(seq_along(y), y)
  unlist(lapply(idx_by_class,
                function(idx) sample(idx, floor(length(idx) * prop))),
         use.names = FALSE)
}

train_id   <- stratified_split(contamination$kontaminasi_biner, 0.8)
train_data <- contamination[train_id, ]
test_data  <- contamination[-train_id, ]

bind_rows(
  train_data %>% count(status_kontaminasi) %>% mutate(Data = "Training"),
  test_data  %>% count(status_kontaminasi) %>% mutate(Data = "Testing")
) %>%
  group_by(Data) %>%
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) %>%
  ungroup() %>%
  rename("Status Kontaminasi" = status_kontaminasi, Jumlah = n) %>%
  knitr::kable(caption = "Distribusi Kelas Training dan Testing") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Distribusi Kelas Training dan Testing
Status Kontaminasi Jumlah Data Proporsi
No 640 Training 39.9%
Yes 963 Training 60.1%
No 161 Testing 40.0%
Yes 241 Testing 60.0%


5 Pembangunan Model Secara Iteratif

Pengembangan model dilakukan secara iteratif melalui tiga tahap untuk memperbaiki pelanggaran asumsi dan menghasilkan model yang parsimonious.

5.1 Model v1 — Full Model Awal (15 Variabel)

5.1.1 Estimasi Model v1

contamination_fit1 <- glm(
  kontaminasi_biner ~
    durasi_operasi + tahun_mulai + jumlah_pelanggaran +
    curah_hujan + suhu + kecepatan_angin +
    tekstur_tanah + mobilitas_polutan + volatilitas_polutan +
    permukaan_kedap + polutan_organik_persisten +
    skala_industri + erosi_tanah + jenis_industri,
  data   = train_data,
  family = binomial(link = "logit")
)

data.frame(
  Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
                 "Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
  Nilai = c(nobs(contamination_fit1), length(contamination_fit1$coefficients) - 1,
            round(contamination_fit1$null.deviance, 3),
            round(contamination_fit1$deviance, 3),
            contamination_fit1$df.residual, round(AIC(contamination_fit1), 3))
) %>%
  knitr::kable(caption = "Ringkasan Kecocokan Model v1 (Full — 15 variabel)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Ringkasan Kecocokan Model v1 (Full — 15 variabel)
Keterangan Nilai
Jumlah observasi training 1603.000
Jumlah variabel prediktor 38.000
Null deviance 2156.699
Residual deviance 1411.060
Derajat bebas residual 1564.000
AIC 1489.060

5.1.2 Uji Multikolinearitas (VIF) — Model v1

vif_v1 <- car::vif(contamination_fit1)

if (is.matrix(vif_v1)) {
  vif_df1 <- as.data.frame(vif_v1)
  vif_df1$`GVIF^(1/(2Df))` <- vif_v1[, 3]
  vif_df1$Keterangan <- ifelse(vif_v1[, 3] > 5, "⚠️ Bermasalah", "✅ Aman")
} else {
  vif_df1 <- data.frame(VIF = vif_v1,
                        Keterangan = ifelse(vif_v1 > 5, "⚠️ Bermasalah", "✅ Aman"))
}
knitr::kable(vif_df1, caption = "Hasil Uji VIF — Model v1") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Hasil Uji VIF — Model v1
GVIF Df GVIF^(1/(2*Df)) GVIF^(1/(2Df)) Keterangan
durasi_operasi 1.992835 1 1.411678 1.411678 ✅ Aman |
tahun_mulai 2.036131 1 1.426931 1.426931 ✅ Aman |
jumlah_pelanggaran 1.106675 1 1.051986 1.051986 ✅ Aman |
curah_hujan 3.027219 1 1.739891 1.739891 ✅ Aman |
suhu 3.387764 1 1.840588 1.840588 ✅ Aman |
kecepatan_angin 1.464081 1 1.209992 1.209992 ✅ Aman |
tekstur_tanah 1.574873 1 1.254939 1.254939 ✅ Aman |
mobilitas_polutan 2.716299 1 1.648120 1.648120 ✅ Aman |
volatilitas_polutan 85.554369 1 9.249560 9.249560 ⚠️ Bermasalah
permukaan_kedap 1.196102 1 1.093664 1.093664 ✅ Aman |
polutan_organik_persisten 15.095518 1 3.885295 3.885295 ✅ Aman |
skala_industri 1.663734 4 1.065701 1.065701 ✅ Aman |
erosi_tanah 1.352301 5 1.030641 1.030641 ✅ Aman |
jenis_industri 4636.502745 18 1.264266 1.264266 ✅ Aman |

Temuan v1 — VIF: volatilitas_polutan memiliki GVIF^(1/(2Df)) = 9,25 > 5. Keputusan: Hapus volatilitas_polutan dari model → model v2.

5.1.3 Uji Linearitas Logit (Box-Tidwell) — Model v1

train_bt1 <- train_data %>%
  mutate(
    dur_log   = durasi_operasi      * log(durasi_operasi      + 1),
    hujan_log = curah_hujan         * log(curah_hujan         + 1),
    suhu_log  = suhu                * log(abs(suhu)           + 1),
    angin_log = kecepatan_angin     * log(kecepatan_angin     + 1),
    tanah_log = tekstur_tanah       * log(tekstur_tanah       + 1),
    mob_log   = mobilitas_polutan   * log(mobilitas_polutan   + 1),
    vol_log   = volatilitas_polutan * log(volatilitas_polutan + 1)
  )

bt_fit1 <- glm(
  kontaminasi_biner ~
    durasi_operasi    + dur_log   + curah_hujan     + hujan_log +
    suhu              + suhu_log  + kecepatan_angin + angin_log +
    tekstur_tanah     + tanah_log + mobilitas_polutan + mob_log +
    volatilitas_polutan + vol_log,
  data = train_bt1, family = binomial
)

broom::tidy(bt_fit1) %>%
  filter(grepl("_log$", term)) %>%
  transmute(
    `Variabel (interaksi log)` = term,
    Koefisien  = round(estimate, 4),
    `p-value`  = signif(p.value, 3),
    Kesimpulan = ifelse(p.value < 0.05, "⚠️ Asumsi DILANGGAR", "✅ Asumsi terpenuhi")
  ) %>%
  knitr::kable(caption = "Uji Box-Tidwell v1: Linearitas Logit Variabel Kontinu") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Uji Box-Tidwell v1: Linearitas Logit Variabel Kontinu
Variabel (interaksi log) Koefisien p-value Kesimpulan
dur_log 0.0144 0.63200 ✅ Asumsi terpenuhi |
hujan_log 0.0000 0.60900 ✅ Asumsi terpenuhi |
suhu_log 0.0506 0.19700 ✅ Asumsi terpenuhi |
angin_log 0.0228 0.21000 ✅ Asumsi terpenuhi |
tanah_log 0.0101 0.76200 ✅ Asumsi terpenuhi |
mob_log -199.1905 0.00433 ⚠️ Asumsi DILANGGAR
vol_log -0.6576 0.06170 ✅ Asumsi terpenuhi |

Temuan v1 — Box-Tidwell: mobilitas_polutan melanggar asumsi linearitas logit (p = 0,004 < 0,05). Keputusan: Transformasi log(mobilitas_polutan + 1) → dipakai sebagai log_mobilitas_polutan pada v2 & v3.

5.1.4 Uji Signifikansi Per Variabel (LRT) — Model v1

lrt_v1 <- car::Anova(contamination_fit1, type = "II", test = "LR")
print(lrt_v1)
## Analysis of Deviance Table (Type II tests)
## 
## Response: kontaminasi_biner
##                           LR Chisq Df Pr(>Chisq)    
## durasi_operasi              88.443  1  < 2.2e-16 ***
## tahun_mulai                  8.100  1   0.004428 ** 
## jumlah_pelanggaran         149.124  1  < 2.2e-16 ***
## curah_hujan                 23.646  1  1.158e-06 ***
## suhu                         0.168  1   0.682219    
## kecepatan_angin              3.576  1   0.058637 .  
## tekstur_tanah                8.383  1   0.003788 ** 
## mobilitas_polutan            5.270  1   0.021696 *  
## volatilitas_polutan          0.016  1   0.899957    
## permukaan_kedap              0.325  1   0.568672    
## polutan_organik_persisten    8.165  1   0.004270 ** 
## skala_industri               1.459  4   0.833867    
## erosi_tanah                  3.628  5   0.604182    
## jenis_industri              97.302 18  6.894e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.1.5 Prediksi & Evaluasi — Model v1

p_train_v1 <- predict(contamination_fit1, newdata = train_data, type = "response")
p_test_v1  <- predict(contamination_fit1, newdata = test_data,  type = "response")

roc_test_v1 <- roc_points(test_data$kontaminasi_biner, p_test_v1) %>%
  mutate(data = "Testing v1")
auc_test_v1 <- auc_value(roc_test_v1)
met_v1_def  <- classification_metrics(test_data$kontaminasi_biner, p_test_v1, 0.5)

confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v1, 0.5) %>%
  knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v1") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Confusion Matrix Testing (Threshold = 0,50) — v1
Prediksi: Terkontaminasi Prediksi: Bersih Sum
Aktual: Terkontaminasi 192 49 241
Aktual: Bersih 52 109 161
Sum 244 158 402
format_metrics_indonesia(met_v1_def) %>%
  mutate(across(where(is.numeric), round, 3)) %>%
  knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v1") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Metrik Evaluasi Testing (Threshold = 0,50) — v1
Threshold Akurasi Error Rate Sensitivity Specificity Presisi F1-Score Balanced Accuracy
0.5 0.749 0.251 0.797 0.677 0.787 0.792 0.737
# Kurva ROC v1
roc_train_v1 <- roc_points(train_data$kontaminasi_biner, p_train_v1) %>%
  mutate(data = "Training v1")

optimal_v1 <- roc_train_v1 %>%
  filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)
threshold_opt_v1 <- optimal_v1$threshold[1]

ggplot(bind_rows(roc_train_v1, roc_test_v1), aes(x = fpr, y = sensitivity, color = data)) +
  geom_path(linewidth = 1.1) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
  geom_point(data = optimal_v1, aes(x = fpr, y = sensitivity),
             inherit.aes = FALSE, color = "#854d0e", fill = "#eab308",
             shape = 21, size = 4, stroke = 1.2) +
  coord_equal() +
  scale_color_manual(values = c("Training v1" = "#2d6a2d", "Testing v1" = "#eab308")) +
  labs(title    = "Kurva ROC — Model v1 (Full — 15 variabel)",
       subtitle = paste0("AUC Training = ", round(auc_value(roc_train_v1), 3),
                         " | AUC Testing = ", round(auc_test_v1, 3),
                         " | Threshold Optimal = ", round(threshold_opt_v1, 3)),
       x = "False Positive Rate (1 - Specificity)",
       y = "Sensitivity (True Positive Rate)", color = "Data") +
  theme_minimal(base_size = 12) + theme(legend.position = "bottom")

data.frame(Data = c("Training v1","Testing v1"),
           AUC  = round(c(auc_value(roc_train_v1), auc_test_v1), 3)) %>%
  knitr::kable(caption = "Nilai AUC — Model v1") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Nilai AUC — Model v1
Data AUC
Training v1 0.863
Testing v1 0.841

5.2 Model v2 — Full Model Terkoreksi (13 Variabel)

Perubahan dari v1:

  • [DIHAPUS] volatilitas_polutan → VIF = 9,25 (multikolinearitas berat)
  • [DITRANSFORMASI] mobilitas_polutanlog(x+1) = log_mobilitas_polutan

5.2.1 Estimasi Model v2

contamination_fit2 <- glm(
  kontaminasi_biner ~
    durasi_operasi + tahun_mulai + jumlah_pelanggaran +
    curah_hujan + suhu + kecepatan_angin +
    tekstur_tanah + log_mobilitas_polutan +
    permukaan_kedap + polutan_organik_persisten +
    skala_industri + erosi_tanah + jenis_industri,
  data   = train_data,
  family = binomial(link = "logit")
)

data.frame(
  Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
                 "Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
  Nilai = c(nobs(contamination_fit2), length(contamination_fit2$coefficients) - 1,
            round(contamination_fit2$null.deviance, 3),
            round(contamination_fit2$deviance, 3),
            contamination_fit2$df.residual, round(AIC(contamination_fit2), 3))
) %>%
  knitr::kable(caption = "Ringkasan Kecocokan Model v2 (13 variabel)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Ringkasan Kecocokan Model v2 (13 variabel)
Keterangan Nilai
Jumlah observasi training 1603.000
Jumlah variabel prediktor 37.000
Null deviance 2156.699
Residual deviance 1411.100
Derajat bebas residual 1565.000
AIC 1487.100

5.2.2 Uji Multikolinearitas (VIF) — Model v2

vif_v2 <- car::vif(contamination_fit2)

if (is.matrix(vif_v2)) {
  vif_df2 <- as.data.frame(vif_v2)
  vif_df2$Keterangan <- ifelse(vif_v2[, 3] > 2.5, "⚠️ Tinggi", "✅ Aman")
} else {
  vif_df2 <- data.frame(VIF = vif_v2,
                        Keterangan = ifelse(vif_v2 > 2.5, "⚠️ Tinggi", "✅ Aman"))
}
knitr::kable(vif_df2, caption = "Hasil Uji VIF — Model v2") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Hasil Uji VIF — Model v2
GVIF Df GVIF^(1/(2*Df)) Keterangan
durasi_operasi 1.993069 1 1.411761 ✅ Aman |
tahun_mulai 2.031962 1 1.425469 ✅ Aman |
jumlah_pelanggaran 1.106356 1 1.051835 ✅ Aman |
curah_hujan 3.025701 1 1.739454 ✅ Aman |
suhu 3.386560 1 1.840261 ✅ Aman |
kecepatan_angin 1.463668 1 1.209822 ✅ Aman |
tekstur_tanah 1.574540 1 1.254807 ✅ Aman |
log_mobilitas_polutan 2.486152 1 1.576754 ✅ Aman |
permukaan_kedap 1.195124 1 1.093217 ✅ Aman |
polutan_organik_persisten 15.074709 1 3.882616 ⚠️ Tinggi
skala_industri 1.657428 4 1.065196 ✅ Aman |
erosi_tanah 1.339993 5 1.029699 ✅ Aman |
jenis_industri 73.529238 18 1.126798 ✅ Aman |

✅ Setelah penghapusan volatilitas_polutan, tidak ada lagi variabel dengan GVIF^(1/(2Df)) > 5. Nilai VIF tertinggi adalah polutan_organik_persisten (3,88) — dipertahankan karena relevansi ekologisnya.

5.2.3 Uji Linearitas Logit (Box-Tidwell) — Model v2

train_bt2 <- train_data %>%
  mutate(
    dur_log   = durasi_operasi  * log(durasi_operasi  + 1),
    hujan_log = curah_hujan     * log(curah_hujan     + 1),
    suhu_log  = suhu            * log(abs(suhu)       + 1),
    angin_log = kecepatan_angin * log(kecepatan_angin + 1),
    tanah_log = tekstur_tanah   * log(tekstur_tanah   + 1)
  )

bt_fit2 <- glm(
  kontaminasi_biner ~
    durasi_operasi  + dur_log  + curah_hujan     + hujan_log +
    suhu            + suhu_log + kecepatan_angin + angin_log +
    tekstur_tanah   + tanah_log,
  data = train_bt2, family = binomial
)

broom::tidy(bt_fit2) %>%
  filter(grepl("_log$", term)) %>%
  transmute(
    `Variabel (interaksi log)` = term,
    Koefisien  = round(estimate, 4),
    `p-value`  = signif(p.value, 3),
    Kesimpulan = ifelse(p.value < 0.05, "⚠️ Asumsi DILANGGAR", "✅ Asumsi terpenuhi")
  ) %>%
  knitr::kable(caption = "Uji Box-Tidwell v2: Linearitas Logit (Variabel Tersisa)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Uji Box-Tidwell v2: Linearitas Logit (Variabel Tersisa)
Variabel (interaksi log) Koefisien p-value Kesimpulan
dur_log 0.0168 0.580 ✅ Asumsi terpenuhi |
hujan_log 0.0000 0.328 ✅ Asumsi terpenuhi |
suhu_log 0.0402 0.298 ✅ Asumsi terpenuhi |
angin_log 0.0254 0.154 ✅ Asumsi terpenuhi |
tanah_log 0.0043 0.896 ✅ Asumsi terpenuhi |

✅ Semua p-value > 0,05 → asumsi linearitas logit terpenuhi untuk semua variabel numerik di v2.

5.2.4 Uji Signifikansi Per Variabel (LRT) — Model v2

lrt_v2 <- car::Anova(contamination_fit2, type = "II", test = "LR")
print(lrt_v2)
## Analysis of Deviance Table (Type II tests)
## 
## Response: kontaminasi_biner
##                           LR Chisq Df Pr(>Chisq)    
## durasi_operasi              88.458  1  < 2.2e-16 ***
## tahun_mulai                  8.084  1   0.004465 ** 
## jumlah_pelanggaran         149.222  1  < 2.2e-16 ***
## curah_hujan                 23.631  1  1.167e-06 ***
## suhu                         0.167  1   0.683200    
## kecepatan_angin              3.583  1   0.058369 .  
## tekstur_tanah                8.372  1   0.003810 ** 
## log_mobilitas_polutan        5.916  1   0.015002 *  
## permukaan_kedap              0.330  1   0.565742    
## polutan_organik_persisten    8.157  1   0.004290 ** 
## skala_industri               1.454  4   0.834749    
## erosi_tanah                  3.623  5   0.604933    
## jenis_industri              98.496 18  4.174e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.2.5 Koefisien & Odds Ratio — Model v2

coef_v2 <- broom::tidy(contamination_fit2) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    odds_ratio   = exp(estimate),
    ci_low       = exp(estimate - 1.96 * std.error),
    ci_high      = exp(estimate + 1.96 * std.error),
    Signifikansi = case_when(
      p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
      p.value < 0.05  ~ "*",  p.value < 0.1  ~ ".",
      TRUE            ~ "tidak signifikan ⚠"
    )
  ) %>%
  arrange(p.value) %>%
  transmute(`Variabel/Level` = term,
            `Odds Ratio`     = round(odds_ratio, 3),
            `CI 95%`         = paste0(round(ci_low, 3), " - ", round(ci_high, 3)),
            `p-value`        = signif(p.value, 3),
            Signifikansi     = Signifikansi)

knitr::kable(coef_v2, caption = "Koefisien & Odds Ratio Lengkap — Model v2 (diurutkan p-value)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Koefisien & Odds Ratio Lengkap — Model v2 (diurutkan p-value)
Variabel/Level Odds Ratio CI 95% p-value Signifikansi
jumlah_pelanggaran 1.554 1.425 - 1.695 0.00e+00 ***
durasi_operasi 1.142 1.11 - 1.176 0.00e+00 ***
jenis_industriHazardous waste treatment 0.071 0.025 - 0.205 1.00e-06 ***
curah_hujan 1.000 1 - 1 1.50e-06 ***
jenis_industriOrganic chemical raw material manufacturing 12.325 3.744 - 40.577 3.61e-05 ***
jenis_industriCotton printing and dyeing processing 0.320 0.18 - 0.569 1.05e-04 ***
jenis_industriPesticide manufacturing 7.945 2.438 - 25.893 5.85e-04 ***
jenis_industriCoking 7.838 2.303 - 26.675 9.85e-04 ***
jenis_industriSteel smelting 9.378 2.25 - 39.077 2.11e-03 **
jenis_industriCrude oil processing and petroleum product manufacturing 2.967 1.466 - 6.002 2.49e-03 **
tekstur_tanah 0.969 0.949 - 0.99 4.04e-03 **
tahun_mulai 0.965 0.941 - 0.989 4.75e-03 **
polutan_organik_persistenYa 0.209 0.068 - 0.639 6.08e-03 **
jenis_industriIron smelting 6.139 1.599 - 23.562 8.18e-03 **
log_mobilitas_polutan 0.476 0.262 - 0.867 1.53e-02
jenis_industriCopper smelting 4.384 1.101 - 17.453 3.60e-02
jenis_industriPetroleum extraction 0.336 0.119 - 0.948 3.92e-02
jenis_industriLead-zinc smelting 2.246 1.013 - 4.982 4.64e-02
kecepatan_angin 1.016 0.999 - 1.033 6.05e-02 .
erosi_tanahSangat Kuat 3.872 0.655 - 22.875 1.35e-01 tidak signifikan ⚠
jenis_industriChemical reagent and additive manufacturing 0.651 0.347 - 1.22 1.80e-01 tidak signifikan ⚠
jenis_industriLeather tanning and processing 1.524 0.803 - 2.892 1.97e-01 tidak signifikan ⚠
jenis_industriIron ore mining and dressing 0.627 0.294 - 1.339 2.28e-01 tidak signifikan ⚠
erosi_tanahAgak Ringan 1.297 0.792 - 2.124 3.02e-01 tidak signifikan ⚠
skala_industriKecil 1.220 0.828 - 1.796 3.14e-01 tidak signifikan ⚠
jenis_industriLead-zinc ore mining and dressing 1.380 0.533 - 3.572 5.06e-01 tidak signifikan ⚠
permukaan_kedapYa 0.920 0.691 - 1.224 5.66e-01 tidak signifikan ⚠
jenis_industriChemical pharmaceutical raw material manufacturing 0.793 0.31 - 2.031 6.29e-01 tidak signifikan ⚠
suhu 0.990 0.941 - 1.041 6.83e-01 tidak signifikan ⚠
jenis_industriGold ore mining and dressing 1.135 0.411 - 3.133 8.06e-01 tidak signifikan ⚠
erosi_tanahSedang 0.961 0.508 - 1.816 9.01e-01 tidak signifikan ⚠
skala_industriMenengah 1.026 0.67 - 1.572 9.05e-01 tidak signifikan ⚠
erosi_tanahKuat 1.062 0.365 - 3.095 9.12e-01 tidak signifikan ⚠
skala_industriSangat Besar 0.983 0.612 - 1.58 9.44e-01 tidak signifikan ⚠
skala_industriBesar 0.994 0.659 - 1.5 9.77e-01 tidak signifikan ⚠
erosi_tanahParah 1222.199 0 - 3.2740385073678e+279 9.83e-01 tidak signifikan ⚠
jenis_industriMetal surface treatment and heat treatment processing 1.000 0.594 - 1.683 9.99e-01 tidak signifikan ⚠

5.2.6 Variabel Tidak Signifikan — Kandidat Penghapusan

tidak_sig_v2 <- broom::tidy(contamination_fit2) %>%
  filter(term != "(Intercept)") %>%
  mutate(variabel_utama = case_when(
    grepl("^suhu$",           term) ~ "suhu",
    grepl("^permukaan_kedap", term) ~ "permukaan_kedap",
    grepl("^skala_industri",  term) ~ "skala_industri",
    grepl("^erosi_tanah",     term) ~ "erosi_tanah",
    TRUE                            ~ term
  )) %>%
  group_by(variabel_utama) %>%
  summarise(
    `p-value min` = round(min(p.value), 4),
    `p-value max` = round(max(p.value), 4),
    `Semua level tidak signifikan?` = ifelse(all(p.value >= 0.05), "Ya ⚠️", "Tidak"),
    .groups = "drop"
  ) %>%
  filter(`Semua level tidak signifikan?` == "Ya ⚠️") %>%
  rename(Variabel = variabel_utama)

knitr::kable(tidak_sig_v2,
             caption = "Variabel dengan Semua Level Tidak Signifikan di Model v2") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Variabel dengan Semua Level Tidak Signifikan di Model v2
Variabel p-value min p-value max Semua level tidak signifikan?
erosi_tanah 0.1352 0.9825 Ya ⚠️
jenis_industriChemical pharmaceutical raw material manufacturing 0.6286 0.6286 Ya ⚠️
jenis_industriChemical reagent and additive manufacturing 0.1804 0.1804 Ya ⚠️
jenis_industriGold ore mining and dressing 0.8063 0.8063 Ya ⚠️
jenis_industriIron ore mining and dressing 0.2280 0.2280 Ya ⚠️
jenis_industriLead-zinc ore mining and dressing 0.5064 0.5064 Ya ⚠️
jenis_industriLeather tanning and processing 0.1973 0.1973 Ya ⚠️
jenis_industriMetal surface treatment and heat treatment processing 0.9991 0.9991 Ya ⚠️
kecepatan_angin 0.0605 0.0605 Ya ⚠️
permukaan_kedap 0.5659 0.5659 Ya ⚠️
skala_industri 0.3140 0.9767 Ya ⚠️
suhu 0.6831 0.6831 Ya ⚠️

Keempat variabel di atas (suhu, permukaan_kedap, skala_industri, erosi_tanah) dieliminasi pada model v3. Catatan: erosi_tanah level ‘Parah’ menunjukkan gejala quasi-complete separation.

5.2.7 Prediksi & Evaluasi — Model v2

p_train_v2 <- predict(contamination_fit2, newdata = train_data, type = "response")
p_test_v2  <- predict(contamination_fit2, newdata = test_data,  type = "response")

confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v2, 0.5) %>%
  knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v2") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Confusion Matrix Testing (Threshold = 0,50) — v2
Prediksi: Terkontaminasi Prediksi: Bersih Sum
Aktual: Terkontaminasi 192 49 241
Aktual: Bersih 52 109 161
Sum 244 158 402
met_v2_def <- classification_metrics(test_data$kontaminasi_biner, p_test_v2, 0.5)
format_metrics_indonesia(met_v2_def) %>%
  mutate(across(where(is.numeric), round, 3)) %>%
  knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v2") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Metrik Evaluasi Testing (Threshold = 0,50) — v2
Threshold Akurasi Error Rate Sensitivity Specificity Presisi F1-Score Balanced Accuracy
0.5 0.749 0.251 0.797 0.677 0.787 0.792 0.737
roc_train_v2 <- roc_points(train_data$kontaminasi_biner, p_train_v2) %>% mutate(data = "Training v2")
roc_test_v2  <- roc_points(test_data$kontaminasi_biner,  p_test_v2)  %>% mutate(data = "Testing v2")
auc_train_v2 <- auc_value(roc_train_v2); auc_test_v2 <- auc_value(roc_test_v2)

data.frame(Data = c("Training v2","Testing v2"),
           AUC  = round(c(auc_train_v2, auc_test_v2), 3)) %>%
  knitr::kable(caption = "Nilai AUC — Model v2") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Nilai AUC — Model v2
Data AUC
Training v2 0.863
Testing v2 0.841
ggplot(bind_rows(roc_train_v2, roc_test_v2), aes(x = fpr, y = sensitivity, color = data)) +
  geom_path(linewidth = 1.1) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
  coord_equal() +
  scale_color_manual(values = c("Training v2" = "#2d6a2d", "Testing v2" = "#eab308")) +
  labs(title    = "Kurva ROC — Model v2 (Full Diperbaiki — 13 variabel)",
       subtitle = paste0("AUC Training = ", round(auc_train_v2, 3),
                         " | AUC Testing = ", round(auc_test_v2, 3)),
       x = "False Positive Rate (1 - Specificity)",
       y = "Sensitivity (True Positive Rate)", color = "Data") +
  theme_minimal(base_size = 12) + theme(legend.position = "bottom")

5.2.8 Uji Hosmer-Lemeshow & Nagelkerke R² — Model v2

cat("--- UJI HOSMER-LEMESHOW — MODEL v2 ---\n")
## --- UJI HOSMER-LEMESHOW — MODEL v2 ---
print(ResourceSelection::hoslem.test(train_data$kontaminasi_biner, fitted(contamination_fit2), g = 10))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  train_data$kontaminasi_biner, fitted(contamination_fit2)
## X-squared = 18.758, df = 8, p-value = 0.01621
cat("(p = 0.016 → indikasi misfit → akan diperbaiki di v3)\n\n")
## (p = 0.016 → indikasi misfit → akan diperbaiki di v3)
cat("Nagelkerke R² v2:",
    round(DescTools::PseudoR2(contamination_fit2, which = "Nagelkerke"), 4))
## Nagelkerke R² v2: 0.5029

5.3 Model v3 — Reduced Model / Model Final ✅ (9 Variabel)

Perubahan dari v2:

  • [DIHAPUS] suhu (p = 0,683)
  • [DIHAPUS] permukaan_kedap (p = 0,566)
  • [DIHAPUS] skala_industri (semua level p > 0,31)
  • [DIHAPUS] erosi_tanah (semua level p > 0,13 + quasi-complete separation)
  • [DIPERTAHANKAN] kecepatan_angin (p = 0,058 borderline, relevan substantif)

5.3.1 Estimasi Model v3

contamination_fit3 <- glm(
  kontaminasi_biner ~
    durasi_operasi + tahun_mulai + jumlah_pelanggaran +
    curah_hujan + kecepatan_angin + tekstur_tanah +
    log_mobilitas_polutan + polutan_organik_persisten +
    jenis_industri,
  data   = train_data,
  family = binomial(link = "logit")
)

data.frame(
  Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
                 "Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
  Nilai = c(nobs(contamination_fit3), length(contamination_fit3$coefficients) - 1,
            round(contamination_fit3$null.deviance, 3),
            round(contamination_fit3$deviance, 3),
            contamination_fit3$df.residual, round(AIC(contamination_fit3), 3))
) %>%
  knitr::kable(caption = "Ringkasan Kecocokan Model v3 (Reduced / Final)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Ringkasan Kecocokan Model v3 (Reduced / Final)
Keterangan Nilai
Jumlah observasi training 1603.000
Jumlah variabel prediktor 26.000
Null deviance 2156.699
Residual deviance 1416.731
Derajat bebas residual 1576.000
AIC 1470.731

5.3.2 Perbandingan AIC: v2 vs v3 & Uji LRT

aic_v2 <- AIC(contamination_fit2); aic_v3 <- AIC(contamination_fit3)
selisih_aic <- round(abs(aic_v2 - aic_v3), 3)

data.frame(
  Model = c("v2 — Full Model (13 variabel)", "v3 — Reduced Model (9 variabel)"),
  `Jumlah Variabel` = c(13, 9),
  `Residual Deviance` = c(round(contamination_fit2$deviance, 3),
                           round(contamination_fit3$deviance, 3)),
  `Df Residual` = c(contamination_fit2$df.residual, contamination_fit3$df.residual),
  AIC = c(round(aic_v2, 3), round(aic_v3, 3)),
  check.names = FALSE
) %>%
  knitr::kable(caption = "Perbandingan Full Model (v2) vs Reduced Model (v3)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Perbandingan Full Model (v2) vs Reduced Model (v3)
Model Jumlah Variabel Residual Deviance Df Residual AIC
v2 — Full Model (13 variabel) 13 1411.100 1565 1487.100
v3 — Reduced Model (9 variabel) 9 1416.731 1576 1470.731
cat("Selisih AIC (v2 - v3):", selisih_aic,
    "→ ΔAIc > 10 → bukti kuat memilih v3\n\n")
## Selisih AIC (v2 - v3): 16.369 → ΔAIc > 10 → bukti kuat memilih v3
# LRT v3 vs v2
cat("--- LIKELIHOOD RATIO TEST (LRT): v3 vs v2 ---\n")
## --- LIKELIHOOD RATIO TEST (LRT): v3 vs v2 ---
lrt_result <- anova(contamination_fit3, contamination_fit2, test = "Chisq")
print(lrt_result)
## Analysis of Deviance Table
## 
## Model 1: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran + 
##     curah_hujan + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan + 
##     polutan_organik_persisten + jenis_industri
## Model 2: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran + 
##     curah_hujan + suhu + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan + 
##     permukaan_kedap + polutan_organik_persisten + skala_industri + 
##     erosi_tanah + jenis_industri
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1576     1416.7                     
## 2      1565     1411.1 11   5.6305   0.8968
lrt_pval <- lrt_result$`Pr(>Chi)`[2]
cat("\n✅ p-value =", round(lrt_pval, 4), "> 0.05\n")
## 
## ✅ p-value = 0.8968 > 0.05
cat("→ Variabel yang dihapus TIDAK signifikan secara bersama-sama.\n")
## → Variabel yang dihapus TIDAK signifikan secara bersama-sama.
cat("→ Reduced model (v3) VALID digunakan sebagai MODEL FINAL.\n")
## → Reduced model (v3) VALID digunakan sebagai MODEL FINAL.

5.3.3 Uji Multikolinearitas (VIF) — Model v3

vif_v3 <- car::vif(contamination_fit3)

if (is.matrix(vif_v3)) {
  vif_df3 <- as.data.frame(vif_v3)
  vif_df3$Keterangan <- ifelse(vif_v3[, 3] > 2.5, "⚠️ Tinggi", "✅ Aman")
} else {
  vif_df3 <- data.frame(VIF = vif_v3,
                        Keterangan = ifelse(vif_v3 > 2.5, "⚠️ Tinggi", "✅ Aman"))
}
knitr::kable(vif_df3, caption = "Hasil Uji VIF — Model v3") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Hasil Uji VIF — Model v3
GVIF Df GVIF^(1/(2*Df)) Keterangan
durasi_operasi 1.835681 1 1.354873 ✅ Aman |
tahun_mulai 1.766605 1 1.329137 ✅ Aman |
jumlah_pelanggaran 1.091027 1 1.044523 ✅ Aman |
curah_hujan 1.606868 1 1.267623 ✅ Aman |
kecepatan_angin 1.227614 1 1.107977 ✅ Aman |
tekstur_tanah 1.498299 1 1.224050 ✅ Aman |
log_mobilitas_polutan 2.447461 1 1.564436 ✅ Aman |
polutan_organik_persisten 14.992907 1 3.872067 ⚠️ Tinggi
jenis_industri 43.772412 18 1.110680 ✅ Aman |

5.3.4 Uji Signifikansi Per Variabel (LRT) — Model v3

lrt_v3 <- car::Anova(contamination_fit3, type = "II", test = "LR")
print(lrt_v3)
## Analysis of Deviance Table (Type II tests)
## 
## Response: kontaminasi_biner
##                           LR Chisq Df Pr(>Chisq)    
## durasi_operasi              94.050  1  < 2.2e-16 ***
## tahun_mulai                  9.948  1   0.001611 ** 
## jumlah_pelanggaran         153.105  1  < 2.2e-16 ***
## curah_hujan                 40.480  1  1.987e-10 ***
## kecepatan_angin              6.802  1   0.009103 ** 
## tekstur_tanah                9.661  1   0.001882 ** 
## log_mobilitas_polutan        6.588  1   0.010266 *  
## polutan_organik_persisten    8.556  1   0.003444 ** 
## jenis_industri             101.796 18  1.037e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.3.5 Koefisien, Odds Ratio & p-value — Model v3 (Final)

coef_v3 <- broom::tidy(contamination_fit3) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    odds_ratio   = exp(estimate),
    ci_low       = exp(estimate - 1.96 * std.error),
    ci_high      = exp(estimate + 1.96 * std.error),
    Signifikansi = case_when(
      p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
      p.value < 0.05  ~ "*",  p.value < 0.1  ~ ".",
      TRUE            ~ "-"
    )
  ) %>%
  arrange(p.value) %>%
  transmute(`Variabel/Level`      = term,
            `Odds Ratio`          = round(odds_ratio, 3),
            `CI 95% (Low - High)` = paste0(round(ci_low, 3), " - ", round(ci_high, 3)),
            `p-value`             = signif(p.value, 3),
            Signifikansi          = Signifikansi)

knitr::kable(coef_v3, caption = "Koefisien & Odds Ratio — Model v3 (Reduced / Final)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Koefisien & Odds Ratio — Model v3 (Reduced / Final)
Variabel/Level Odds Ratio CI 95% (Low - High) p-value Signifikansi
jumlah_pelanggaran 1.554 1.426 - 1.694 0.00e+00 ***
durasi_operasi 1.140 1.108 - 1.172 0.00e+00 ***
curah_hujan 1.000 1 - 1 0.00e+00 ***
jenis_industriHazardous waste treatment 0.070 0.024 - 0.201 8.00e-07 ***
jenis_industriCotton printing and dyeing processing 0.309 0.177 - 0.539 3.57e-05 ***
jenis_industriOrganic chemical raw material manufacturing 12.039 3.681 - 39.375 3.87e-05 ***
jenis_industriPesticide manufacturing 7.875 2.417 - 25.654 6.15e-04 ***
jenis_industriCoking 8.188 2.426 - 27.642 7.05e-04 ***
tahun_mulai 0.964 0.942 - 0.986 1.77e-03 **
tekstur_tanah 0.968 0.948 - 0.988 2.03e-03 **
jenis_industriSteel smelting 9.369 2.261 - 38.815 2.04e-03 **
jenis_industriCrude oil processing and petroleum product manufacturing 2.856 1.418 - 5.749 3.29e-03 **
polutan_organik_persistenYa 0.203 0.067 - 0.619 5.05e-03 **
jenis_industriIron smelting 6.510 1.703 - 24.878 6.17e-03 **
kecepatan_angin 1.020 1.005 - 1.035 1.00e-02
log_mobilitas_polutan 0.461 0.255 - 0.834 1.05e-02
jenis_industriCopper smelting 4.600 1.17 - 18.092 2.89e-02
jenis_industriPetroleum extraction 0.333 0.123 - 0.902 3.05e-02
jenis_industriLead-zinc smelting 2.356 1.075 - 5.165 3.23e-02
jenis_industriChemical reagent and additive manufacturing 0.640 0.345 - 1.187 1.57e-01
jenis_industriLeather tanning and processing 1.461 0.781 - 2.735 2.36e-01
jenis_industriIron ore mining and dressing 0.644 0.308 - 1.349 2.44e-01
jenis_industriLead-zinc ore mining and dressing 1.427 0.555 - 3.671 4.61e-01
jenis_industriChemical pharmaceutical raw material manufacturing 0.761 0.303 - 1.91 5.60e-01
jenis_industriMetal surface treatment and heat treatment processing 0.925 0.568 - 1.509 7.56e-01
jenis_industriGold ore mining and dressing 1.163 0.431 - 3.141 7.66e-01

5.3.6 Prediksi & Confusion Matrix — Model v3

p_train_v3 <- predict(contamination_fit3, newdata = train_data, type = "response")
p_test_v3  <- predict(contamination_fit3, newdata = test_data,  type = "response")

# Preview 8 data pertama
head(data.frame("Status Aktual" = as.character(test_data$status_kontaminasi),
                "Peluang Terkontaminasi" = round(p_test_v3, 4),
                check.names = FALSE), 8) %>%
  knitr::kable(caption = "Contoh Peluang Prediksi v3 (8 Data Pertama)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Contoh Peluang Prediksi v3 (8 Data Pertama)
Status Aktual Peluang Terkontaminasi
No 0.2102
No 0.2175
Yes 0.4757
No 0.2041
No 0.2732
No 0.1773
Yes 0.9722
Yes 0.9799
# Confusion Matrix threshold 0.50
confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v3, 0.5) %>%
  knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v3") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Confusion Matrix Testing (Threshold = 0,50) — v3
Prediksi: Terkontaminasi Prediksi: Bersih Sum
Aktual: Terkontaminasi 192 49 241
Aktual: Bersih 47 114 161
Sum 239 163 402
met_v3_def <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, 0.5)
format_metrics_indonesia(met_v3_def) %>%
  mutate(across(where(is.numeric), \(x) round(x, 3))) %>%
  knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v3") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Metrik Evaluasi Testing (Threshold = 0,50) — v3
Threshold Akurasi Error Rate Sensitivity Specificity Presisi F1-Score Balanced Accuracy
0.5 0.761 0.239 0.797 0.708 0.803 0.8 0.752

6 Evaluasi Kinerja Model v3

6.1 ROC & AUC — Model v3 (Final)

roc_train_v3 <- roc_points(train_data$kontaminasi_biner, p_train_v3) %>% mutate(data = "Training v3")
roc_test_v3  <- roc_points(test_data$kontaminasi_biner,  p_test_v3)  %>% mutate(data = "Testing v3")
auc_train_v3 <- auc_value(roc_train_v3); auc_test_v3 <- auc_value(roc_test_v3)

optimal_v3 <- roc_train_v3 %>%
  filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)
threshold_opt_v3 <- optimal_v3$threshold[1]

data.frame(Data = c("Training v3","Testing v3"),
           AUC  = round(c(auc_train_v3, auc_test_v3), 3)) %>%
  knitr::kable(caption = "Nilai AUC — Model v3 (Final)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Nilai AUC — Model v3 (Final)
Data AUC
Training v3 0.862
Testing v3 0.851
cat("Threshold optimal v3 (Youden):", round(threshold_opt_v3, 4))
## Threshold optimal v3 (Youden): 0.6406
# Kurva ROC v3
ggplot(bind_rows(roc_train_v3, roc_test_v3), aes(x = fpr, y = sensitivity, color = data)) +
  geom_path(linewidth = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
  geom_point(data = optimal_v3, aes(x = fpr, y = sensitivity),
             inherit.aes = FALSE, color = "#854d0e", fill = "#eab308",
             shape = 21, size = 4.5, stroke = 1.3) +
  coord_equal() +
  scale_color_manual(values = c("Training v3" = "#2d6a2d", "Testing v3" = "#eab308")) +
  labs(title    = "Kurva ROC Model v3 (Reduced / Final)",
       subtitle = paste0("AUC Training = ", round(auc_train_v3, 3),
                         " | AUC Testing = ", round(auc_test_v3, 3),
                         " | Threshold Optimal = ", round(threshold_opt_v3, 3)),
       x = "False Positive Rate (1 - Specificity)",
       y = "Sensitivity (True Positive Rate)", color = "Data") +
  theme_minimal(base_size = 12) + theme(legend.position = "bottom")

6.2 Threshold Default vs Threshold Optimal — Model v3

met_v3_opt <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, threshold_opt_v3)

confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v3, threshold_opt_v3) %>%
  knitr::kable(caption = paste0("Confusion Matrix Testing (Threshold Optimal = ",
                                 round(threshold_opt_v3, 3), ") — v3")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Confusion Matrix Testing (Threshold Optimal = 0.641) — v3
Prediksi: Terkontaminasi Prediksi: Bersih Sum
Aktual: Terkontaminasi 161 80 241
Aktual: Bersih 29 132 161
Sum 190 212 402
bind_rows(
  met_v3_def %>% mutate(aturan = "Threshold 0.50"),
  met_v3_opt %>% mutate(aturan = paste0("Threshold Optimal (", round(threshold_opt_v3, 3), ")"))
) %>%
  format_metrics_indonesia() %>%
  bind_cols(`Aturan Klasifikasi` = c("Threshold 0.50",
                                      paste0("Threshold Optimal (", round(threshold_opt_v3, 3), ")")), .) %>%
  select(`Aturan Klasifikasi`, everything()) %>%
  mutate(across(where(is.numeric), round, 3)) %>%
  knitr::kable(caption = "Perbandingan Metrik: Default vs Optimal — v3 (Final)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Perbandingan Metrik: Default vs Optimal — v3 (Final)
Aturan Klasifikasi Threshold Akurasi Error Rate Sensitivity Specificity Presisi F1-Score Balanced Accuracy
Threshold 0.50 0.500 0.761 0.239 0.797 0.708 0.803 0.800 0.752
Threshold Optimal (0.641) 0.641 0.729 0.271 0.668 0.820 0.847 0.747 0.744

6.3 Distribusi Peluang Prediksi — Model v3

test_data %>%
  mutate(peluang_kontaminasi = p_test_v3) %>%
  ggplot(aes(x = peluang_kontaminasi, fill = status_kontaminasi)) +
  geom_density(alpha = 0.55, color = "white", linewidth = 0.7) +
  geom_vline(xintercept = threshold_opt_v3,
             color = "#2d6a2d", linewidth = 1.2, linetype = "dashed") +
  annotate("text", x = threshold_opt_v3 + 0.02, y = Inf,
           label = paste0("Threshold = ", round(threshold_opt_v3, 3)),
           vjust = 1.6, hjust = 0, color = "#2d6a2d", size = 3.5) +
  scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
  labs(title    = "Distribusi Peluang Prediksi Kontaminasi Lahan (Data Testing) — v3 Final",
       subtitle = "Garis putus-putus = threshold optimal ROC",
       x = "Peluang prediksi terkontaminasi", y = "Kepadatan",
       fill = "Status Aktual") +
  theme_minimal(base_size = 12)

6.4 Kecocokan Model Final (Goodness-of-Fit)

cat("--- UJI HOSMER-LEMESHOW — MODEL v3 ---\n")
## --- UJI HOSMER-LEMESHOW — MODEL v3 ---
print(ResourceSelection::hoslem.test(train_data$kontaminasi_biner, fitted(contamination_fit3), g = 10))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  train_data$kontaminasi_biner, fitted(contamination_fit3)
## X-squared = 9.5847, df = 8, p-value = 0.2954
cat("\n--- OMNIBUS TEST (ANOVA LRT vs NULL) — MODEL v3 ---\n")
## 
## --- OMNIBUS TEST (ANOVA LRT vs NULL) — MODEL v3 ---
null_fit <- glm(kontaminasi_biner ~ 1, data = train_data, family = binomial)
print(anova(null_fit, contamination_fit3, test = "Chisq"))
## Analysis of Deviance Table
## 
## Model 1: kontaminasi_biner ~ 1
## Model 2: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran + 
##     curah_hujan + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan + 
##     polutan_organik_persisten + jenis_industri
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1602     2156.7                          
## 2      1576     1416.7 26   739.97 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- NAGELKERKE PSEUDO R² — MODEL v3 ---\n")
## 
## --- NAGELKERKE PSEUDO R² — MODEL v3 ---
cat("Nagelkerke R² v3:", round(DescTools::PseudoR2(contamination_fit3, which = "Nagelkerke"), 4))
## Nagelkerke R² v3: 0.4999

Hasil Evaluasi Goodness-of-Fit:

  • Hosmer-Lemeshow: p = 0,295 > 0,05 → model fit dengan data ✅
  • Omnibus LRT: χ² = 739,97, p < 0,001 → model secara keseluruhan signifikan ✅
  • Nagelkerke R²: 0,499 → model menjelaskan ~50% variasi kontaminasi ✅

7 Perbandingan Akhir: v1 vs v2 vs v3

7.1 Tabel Perbandingan Lengkap

roc_test_v2_plot <- roc_points(test_data$kontaminasi_biner, p_test_v2) %>% mutate(data = "Testing v2")

met_v1_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v1, 0.5)
met_v2_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v2, 0.5)
met_v3_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, 0.5)

data.frame(
  Metrik = c("Jumlah Variabel","AIC","Nagelkerke R²","AUC Training","AUC Testing",
             "Akurasi (threshold 0.5)","Sensitivity (threshold 0.5)",
             "Specificity (threshold 0.5)","Balanced Accuracy","F1-Score",
             "Hosmer-Lemeshow p-value"),
  `v1 — Full (15 var)` = c(
    length(contamination_fit1$coefficients) - 1, round(AIC(contamination_fit1), 3),
    round(DescTools::PseudoR2(contamination_fit1, which = "Nagelkerke"), 4),
    round(auc_value(roc_points(train_data$kontaminasi_biner, p_train_v1)), 3),
    round(auc_test_v1, 3), round(met_v1_full$accuracy, 3),
    round(met_v1_full$sensitivity, 3), round(met_v1_full$specificity, 3),
    round(met_v1_full$balanced_accuracy, 3), round(met_v1_full$f1_score, 3),
    round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
                                          fitted(contamination_fit1), g = 10)$p.value, 4)
  ),
  `v2 — Full diperbaiki (13 var)` = c(
    length(contamination_fit2$coefficients) - 1, round(AIC(contamination_fit2), 3),
    round(DescTools::PseudoR2(contamination_fit2, which = "Nagelkerke"), 4),
    round(auc_train_v2, 3), round(auc_test_v2, 3),
    round(met_v2_full$accuracy, 3), round(met_v2_full$sensitivity, 3),
    round(met_v2_full$specificity, 3), round(met_v2_full$balanced_accuracy, 3),
    round(met_v2_full$f1_score, 3),
    round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
                                          fitted(contamination_fit2), g = 10)$p.value, 4)
  ),
  `v3 — Reduced / Final ✅ (9 var)` = c(
    length(contamination_fit3$coefficients) - 1, round(AIC(contamination_fit3), 3),
    round(DescTools::PseudoR2(contamination_fit3, which = "Nagelkerke"), 4),
    round(auc_train_v3, 3), round(auc_test_v3, 3),
    round(met_v3_full$accuracy, 3), round(met_v3_full$sensitivity, 3),
    round(met_v3_full$specificity, 3), round(met_v3_full$balanced_accuracy, 3),
    round(met_v3_full$f1_score, 3),
    round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
                                          fitted(contamination_fit3), g = 10)$p.value, 4)
  ),
  check.names = FALSE
) %>%
  knitr::kable(caption = "Perbandingan Performa Lengkap: v1 vs v2 vs v3") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Perbandingan Performa Lengkap: v1 vs v2 vs v3
Metrik v1 — Full (15 var) v2 — Full diperbaiki (13 var) v3 — Reduced / Final ✅ (9 var)|
Jumlah Variabel 38.0000 37.0000 26.0000
AIC 1489.0600 1487.1000 1470.7310
Nagelkerke R² 0.5029 0.5029 0.4999
AUC Training 0.8630 0.8630 0.8620
AUC Testing 0.8410 0.8410 0.8510
Akurasi (threshold 0.5) 0.7490 0.7490 0.7610
Sensitivity (threshold 0.5) 0.7970 0.7970 0.7970
Specificity (threshold 0.5) 0.6770 0.6770 0.7080
Balanced Accuracy 0.7370 0.7370 0.7520
F1-Score 0.7920 0.7920 0.8000
Hosmer-Lemeshow p-value 0.0185 0.0162 0.2954

7.2 Perbandingan Kurva ROC: v1 vs v2 vs v3

roc_compare_all <- bind_rows(
  roc_test_v1      %>% mutate(data = "Testing v1 (Full — 15 var)"),
  roc_test_v2_plot %>% mutate(data = "Testing v2 (Full diperbaiki — 13 var)"),
  roc_test_v3      %>% mutate(data = "Testing v3 (Reduced — 9 var)")
)

ggplot(roc_compare_all, aes(x = fpr, y = sensitivity, color = data)) +
  geom_path(linewidth = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
  coord_equal() +
  scale_color_manual(values = c(
    "Testing v1 (Full — 15 var)"            = "#86c287",
    "Testing v2 (Full diperbaiki — 13 var)" = "#eab308",
    "Testing v3 (Reduced — 9 var)"          = "#2d6a2d"
  )) +
  labs(title    = "Perbandingan Kurva ROC: v1 vs v2 vs v3 (Data Testing)",
       subtitle = paste0("AUC v1 = ", round(auc_test_v1, 3),
                         " | AUC v2 = ", round(auc_test_v2, 3),
                         " | AUC v3 = ", round(auc_test_v3, 3)),
       x = "False Positive Rate (1 - Specificity)",
       y = "Sensitivity (True Positive Rate)", color = "Model") +
  theme_minimal(base_size = 12) + theme(legend.position = "bottom")


8 Hasil & Interpretasi Model Final

8.1 Persamaan Model Final (v3)

\[ \widehat{\text{logit}}(\pi_i) = 71{,}15 + 0{,}131\,(\text{durasi\_operasi}) - 0{,}037\,(\text{tahun\_mulai}) \] \[ + 0{,}441\,(\text{jumlah\_pelanggaran}) + 0{,}0000914\,(\text{curah\_hujan}) + 0{,}020\,(\text{kecepatan\_angin}) \] \[ - 0{,}033\,(\text{tekstur\_tanah}) - 0{,}775\,(\text{log\_mobilitas\_polutan}) \] \[ - 1{,}595\,(\text{polutan\_organik\_persisten} = \text{Ya}) + \beta_9\,(\text{jenis\_industri}) \]

8.2 Odds Ratio — Karakteristik Operasional Industri

Jumlah Pelanggaran Regulasi Lingkungan (OR = 1,554; CI 95%: 1,426–1,694; p < 0,001)

Prediktor paling kuat dalam model. Setiap tambahan satu pelanggaran regulasi lingkungan meningkatkan peluang terjadinya kontaminasi sebesar 55,4%, dengan asumsi variabel lain tetap konstan. Hal ini menunjukkan bahwa efektivitas penegakan hukum lingkungan memiliki peran penting dalam menurunkan risiko kontaminasi.

Durasi Operasi (OR = 1,140; p < 0,001)

Setiap tambahan satu tahun masa operasi industri meningkatkan peluang kontaminasi sebesar 14,0%. Hal ini berkaitan dengan akumulasi polutan dalam jangka panjang di area industri.

Tahun Mulai Beroperasi (OR = 0,964; p = 0,002)

Industri yang mulai beroperasi pada tahun yang lebih baru cenderung memiliki peluang kontaminasi yang lebih rendah sebesar 3,6% untuk setiap kenaikan satu tahun. Pola ini sejalan dengan peningkatan standar pengelolaan lingkungan dan pengetatan regulasi pada periode yang lebih baru.

Jenis Industri (baseline: Industri Lainnya)

Sektor yang melibatkan proses kimia intensif dan peleburan logam memiliki risiko kontaminasi tertinggi:

Jenis Industri OR CI 95% p-value
Organic chemical raw material mfg. 12,039 3,681–39,375 < 0,001 ***
Steel smelting 9,369 2,261–38,815 0,002 **
Coking 8,188 2,426–27,642 < 0,001 ***
Pesticide manufacturing 7,875 2,417–25,654 < 0,001 ***
Iron smelting 6,510 1,703–24,878 0,006 **
Copper smelting 4,600 1,17–18,092 0,029 *
Lead-zinc smelting 2,356 1,075–5,165 0,032 *
Crude oil processing & petroleum mfg. 2,856 1,418–5,749 0,003 **

Sebaliknya, sektor berikut menunjukkan risiko lebih rendah dibanding Industri Lainnya:

Jenis Industri OR Interpretasi
Hazardous waste treatment 0,070 ~93% lebih rendah — kemungkinan karena standar pengawasan lebih ketat pada fasilitas pengelolaan limbah berbahaya
Cotton printing & dyeing 0,309 Risiko lebih rendah secara signifikan (p < 0,001)
Petroleum extraction 0,333 Risiko lebih rendah secara signifikan (p = 0,031)

8.3 Odds Ratio — Kondisi Lingkungan Biofisik

Curah Hujan (OR ≈ 1,000 per 1 mm; p < 0,001)

Meskipun nilai OR per satuan milimeter sangat kecil, pengaruhnya tetap penting karena curah hujan memiliki rentang nilai yang luas. Secara kumulatif, curah hujan tinggi dapat meningkatkan proses pencucian polutan ke lapisan tanah lebih dalam serta memperluas perpindahan kontaminan ke area sekitar.

Kecepatan Angin (OR = 1,020; p = 0,010)

Setiap kenaikan satu satuan kecepatan angin meningkatkan peluang kontaminasi. Hal ini diduga berkaitan dengan proses penyebaran partikel dan kontaminan melalui udara.

Tekstur Tanah — Kandungan Liat (OR = 0,968; p = 0,002)

Setiap peningkatan kandungan liat sebesar 1% menurunkan peluang kontaminasi sebesar 3,2%. Hubungan protektif ini berkaitan dengan karakteristik fisik tanah berliat dalam membatasi pergerakan polutan.

Log Mobilitas Polutan (OR = 0,461; p = 0,011)

Menunjukkan hubungan negatif dengan probabilitas kontaminasi yang terdeteksi. Hal ini tidak berarti polutan bermobilitas tinggi tidak berbahaya — sebaliknya, polutan dengan mobilitas tinggi cenderung berpindah dari lapisan permukaan ke lapisan lebih dalam atau terbawa ke badan air, sehingga konsentrasinya pada titik sampling permukaan menjadi lebih rendah. Hasil ini lebih tepat dipahami sebagai indikasi perbedaan perilaku migrasi dan deteksi polutan.

Polutan Organik Persisten / POPs (OR = 0,203; p = 0,005)

Juga menunjukkan hubungan negatif yang serupa. POPs memiliki afinitas kuat terhadap bahan organik tanah sehingga lebih banyak terakumulasi dalam fase padat dan tidak selalu terdeteksi secara optimal melalui metode pemantauan konvensional (sampel tanah permukaan). Temuan ini mengindikasikan bahwa sistem pemantauan yang hanya mengandalkan sampel permukaan berpotensi mengabaikan sebagian risiko kontaminasi sesungguhnya.


9 Kesimpulan

Model Final: v3 — Reduced Model (9 variabel)

Dipilih berdasarkan:

Kriteria Nilai Keterangan
AIC 1.470,731 Lebih rendah 16,4 poin dari v2 (ΔAIC > 10)
LRT v3 vs v2 p = 0,897 Variabel yang dihapus tidak signifikan bersama ✅
Hosmer-Lemeshow p = 0,295 Model fit baik dengan data ✅
AUC Testing 0,851 Kemampuan diskriminasi baik (0,80–0,90) ✅
Akurasi 76,1% Threshold default 0,50
Balanced Accuracy 75,2% Threshold default 0,50
Nagelkerke R² 0,499 Model menjelaskan ~50% variasi kontaminasi
Parsimoni 9 variabel Lebih efisien dari v2 (13 var) & v1 (15 var)

Sembilan faktor risiko signifikan:

  1. jumlah_pelanggaran — prediktor terkuat (OR = 1,554; p < 0,001)
  2. durasi_operasi (OR = 1,140; p < 0,001)
  3. tahun_mulai (OR = 0,964; p = 0,002)
  4. jenis_industri — beberapa level signifikan, tertinggi: Organic chemical mfg. (OR = 12,039)
  5. curah_hujan (p < 0,001)
  6. kecepatan_angin (OR = 1,020; p = 0,010)
  7. tekstur_tanah — efek protektif (OR = 0,968; p = 0,002)
  8. log_mobilitas_polutan (OR = 0,461; p = 0,011)
  9. polutan_organik_persisten (OR = 0,203; p = 0,005)

Implikasi praktis: Prioritas pemantauan dan remediasi difokuskan pada industri kimia dan metalurgi (organic chemical manufacturing, steel smelting, coking, pesticide manufacturing) dengan riwayat pelanggaran lingkungan tinggi dan telah beroperasi lama. Hubungan negatif pada mobilitas polutan dan POPs mengindikasikan bahwa metode pemantauan berbasis sampel tanah permukaan saja berpotensi mengabaikan sebagian risiko kontaminasi — sistem pemantauan yang mengakomodasi karakteristik migrasi polutan perlu dikembangkan.


Dataset: Urban Contaminated Sites along China’s Urbanization — Li, K. (2023). Mendeley Data. DOI: 10.17632/r4y2vcpfmx.1