required_packages <- c("dplyr", "ggplot2", "broom", "knitr", "scales",
"readxl", "ResourceSelection", "DescTools", "car",
"tidyr", "kableExtra")
missing_packages <- required_packages[
!vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) install.packages(missing_packages)
invisible(lapply(required_packages, library, character.only = TRUE))Latar Belakang: Pencemaran lahan industri merupakan salah satu permasalahan lingkungan yang serius, terutama di negara berkembang yang mengalami urbanisasi pesat seperti China. Jumlah lahan terkontaminasi di China meningkat hingga 325% antara tahun 1990 dan 2018, dengan 43.676 lokasi terkontaminasi teridentifikasi dari 83.498 lahan perusahaan.
Tujuan penelitian:
- Mengidentifikasi faktor risiko yang secara signifikan memengaruhi kontaminasi lahan industri.
- Membangun model prediktif berbasis regresi logistik biner yang parsimonious dengan validasi asumsi komprehensif.
- Menginterpretasikan besar dan arah pengaruh masing-masing faktor melalui nilai Odds Ratio (OR).
Dataset yang digunakan adalah Urban Contaminated Sites along
China’s Urbanization yang dipublikasikan oleh Li (2023) melalui
Mendeley Data (DOI: 10.17632/r4y2vcpfmx.1). Dataset ini
mencakup 2.005 lokasi industri dengan 15
variabel (1 respon + 14 prediktor).
# Ganti path sesuai lokasi file kamu
raw_contamination <- readxl::read_excel("Input dataset.xlsx")
names(raw_contamination) <- trimws(names(raw_contamination))
ringkasan_data <- data.frame(
Keterangan = c("Jumlah observasi (lokasi industri)", "Jumlah variabel"),
Nilai = c(nrow(raw_contamination), ncol(raw_contamination))
)
knitr::kable(ringkasan_data,
caption = "Ukuran Dataset Kontaminasi Lahan Perkotaan China") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Keterangan | Nilai |
|---|---|
| Jumlah observasi (lokasi industri) | 2005 |
| Jumlah variabel | 16 |
kamus_variabel <- data.frame(
`Kolom Asli` = c(
"Presence of contamination", "Duration", "Starting time",
"Industry class", "Violations", "Precipitation (mm)",
"Temperature", "Impervious surfaces", "Scale",
"Pollutant mobility", "Pollutant volatility",
"Persistent organic pollutants",
"Soil texture (Soil clay content in percentage)",
"Soil erosion", "Wind speed (m/s)"
),
`Variabel Analisis` = c(
"status_kontaminasi", "durasi_operasi", "tahun_mulai",
"jenis_industri", "jumlah_pelanggaran", "curah_hujan",
"suhu", "permukaan_kedap", "skala_industri",
"mobilitas_polutan", "volatilitas_polutan",
"polutan_organik_persisten",
"tekstur_tanah", "erosi_tanah", "kecepatan_angin"
),
`Keterangan` = c(
"Status kontaminasi lahan (RESPON Y)", "Lama operasi industri (tahun)",
"Tahun mulai beroperasi", "Jenis/kelas industri",
"Jumlah pelanggaran lingkungan", "Total curah hujan tahunan (mm)",
"Rata-rata suhu tahunan (°C)", "Ada/tidaknya permukaan kedap air",
"Skala ukuran industri", "Tingkat mobilitas polutan dalam tanah",
"Tingkat volatilitas polutan", "Ada/tidaknya polutan organik persisten",
"Kandungan liat tanah (%)", "Tingkat erosi tanah",
"Rata-rata kecepatan angin (m/s)"
),
`Tipe` = c(
"Respon biner", "Numerik", "Numerik",
"Kategorik (banyak level)", "Numerik", "Numerik",
"Numerik", "Kategorik (biner)", "Kategorik (ordinal)",
"Numerik", "Numerik", "Kategorik (biner)",
"Numerik", "Kategorik (ordinal)", "Numerik"
),
check.names = FALSE
)
knitr::kable(kamus_variabel,
caption = "Kamus Variabel Dataset Kontaminasi Lahan") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Kolom Asli | Variabel Analisis | Keterangan | Tipe |
|---|---|---|---|
| Presence of contamination | status_kontaminasi | Status kontaminasi lahan (RESPON Y) | Respon biner |
| Duration | durasi_operasi | Lama operasi industri (tahun) | Numerik |
| Starting time | tahun_mulai | Tahun mulai beroperasi | Numerik |
| Industry class | jenis_industri | Jenis/kelas industri | Kategorik (banyak level) |
| Violations | jumlah_pelanggaran | Jumlah pelanggaran lingkungan | Numerik |
| Precipitation (mm) | curah_hujan | Total curah hujan tahunan (mm) | Numerik |
| Temperature | suhu | Rata-rata suhu tahunan (°C) | Numerik |
| Impervious surfaces | permukaan_kedap | Ada/tidaknya permukaan kedap air | Kategorik (biner) |
| Scale | skala_industri | Skala ukuran industri | Kategorik (ordinal) |
| Pollutant mobility | mobilitas_polutan | Tingkat mobilitas polutan dalam tanah | Numerik |
| Pollutant volatility | volatilitas_polutan | Tingkat volatilitas polutan | Numerik |
| Persistent organic pollutants | polutan_organik_persisten | Ada/tidaknya polutan organik persisten | Kategorik (biner) |
| Soil texture (Soil clay content in percentage) | tekstur_tanah | Kandungan liat tanah (%) | Numerik |
| Soil erosion | erosi_tanah | Tingkat erosi tanah | Kategorik (ordinal) |
| Wind speed (m/s) | kecepatan_angin | Rata-rata kecepatan angin (m/s) | Numerik |
Catatan preprocessing
jenis_industri: Penggabungan dilakukan dengan ambang batas frekuensi minimum n = 31. - 18 jenis industri dipertahankan sebagai kategori tersendiri (n ≥ 31) - 42 jenis industri dengan n < 31 digabung ke “Industri Lainnya” (n = 427) - Baseline (referensi): “Industri Lainnya” — ditetapkan eksplisit viarelevel()
collapse_rare <- function(x, min_n = 31, other_label = "Industri Lainnya") {
x_chr <- as.character(x)
keep <- names(which(table(x_chr) >= min_n))
factor(ifelse(x_chr %in% keep, x_chr, other_label))
}
erosi_levels <- c("Slight", "Mild", "Moderate", "Strong", "Very strong", "Severe")
contamination <- raw_contamination %>%
transmute(
status_kontaminasi = factor(`Presence of contamination`, levels = c("No", "Yes")),
kontaminasi_biner = as.integer(`Presence of contamination` == "Yes"),
durasi_operasi = as.numeric(Duration),
tahun_mulai = as.numeric(`Starting time`),
jumlah_pelanggaran = as.numeric(Violations),
curah_hujan = as.numeric(`Precipitation (mm)`),
suhu = as.numeric(`Temperature (℃)`),
mobilitas_polutan = as.numeric(`Pollutant mobility`),
volatilitas_polutan = as.numeric(`Pollutant volatility`),
tekstur_tanah = as.numeric(`Soil texture (Soil clay content in percentage)`),
kecepatan_angin = as.numeric(`Wind speed (m/s)`),
log_mobilitas_polutan = log(as.numeric(`Pollutant mobility`) + 1),
permukaan_kedap = factor(`Impervious surfaces`,
levels = c("No","Yes"), labels = c("Tidak","Ya")),
polutan_organik_persisten = factor(`Persistent organic pollutants`,
levels = c("No","Yes"), labels = c("Tidak","Ya")),
skala_industri = factor(Scale,
levels = c("Micro","Small","Medium","Large","Extra-large"),
labels = c("Mikro","Kecil","Menengah","Besar","Sangat Besar")),
erosi_tanah = factor(`Soil erosion`, levels = erosi_levels,
labels = c("Ringan","Agak Ringan","Sedang",
"Kuat","Sangat Kuat","Parah")),
jenis_industri = {
x_chr <- as.character(`Industry class`)
freq <- table(x_chr)
keep <- names(freq[freq >= 31])
factor(ifelse(x_chr %in% keep, x_chr, "Industri Lainnya"))
}
) %>%
droplevels() %>%
mutate(jenis_industri = relevel(jenis_industri, ref = "Industri Lainnya"))jenis_industrifreq_industri <- sort(table(contamination$jenis_industri), decreasing = TRUE)
freq_df <- as.data.frame(freq_industri)
names(freq_df) <- c("Jenis Industri", "Frekuensi")
knitr::kable(freq_df,
caption = "Frekuensi Jenis Industri Setelah Penggabungan") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Jenis Industri | Frekuensi |
|---|---|
| Industri Lainnya | 427 |
| Metal surface treatment and heat treatment processing | 209 |
| Organic chemical raw material manufacturing | 134 |
| Cotton printing and dyeing processing | 133 |
| Chemical reagent and additive manufacturing | 129 |
| Pesticide manufacturing | 126 |
| Leather tanning and processing | 121 |
| Coking | 104 |
| Crude oil processing and petroleum product manufacturing | 100 |
| Iron ore mining and dressing | 84 |
| Hazardous waste treatment | 67 |
| Lead-zinc smelting | 52 |
| Petroleum extraction | 52 |
| Iron smelting | 50 |
| Steel smelting | 49 |
| Chemical pharmaceutical raw material manufacturing | 48 |
| Lead-zinc ore mining and dressing | 47 |
| Gold ore mining and dressing | 40 |
| Copper smelting | 33 |
## Baseline jenis_industri: Industri Lainnya
distribusi_y <- contamination %>%
count(status_kontaminasi, name = "Jumlah") %>%
mutate(Proporsi = scales::percent(Jumlah / sum(Jumlah), accuracy = 0.1)) %>%
rename("Status Kontaminasi" = status_kontaminasi)
knitr::kable(distribusi_y,
caption = "Distribusi Kelas Respon (Y)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Status Kontaminasi | Jumlah | Proporsi |
|---|---|---|
| No | 801 | 40.0% |
| Yes | 1204 | 60.0% |
ggplot(contamination, aes(x = status_kontaminasi, fill = status_kontaminasi)) +
geom_bar(width = 0.62, color = "white", linewidth = 0.8) +
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
labs(title = "Distribusi Status Kontaminasi Lahan",
x = NULL, y = "Jumlah Lokasi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")ringkasan_numerik <- contamination %>%
group_by(status_kontaminasi) %>%
summarise(
Jumlah = n(),
`Rata-rata Durasi (th)` = round(mean(durasi_operasi, na.rm = TRUE), 2),
`Tahun Mulai (rata-rata)` = round(mean(tahun_mulai, na.rm = TRUE), 2),
`Rata-rata Pelanggaran` = round(mean(jumlah_pelanggaran, na.rm = TRUE), 2),
`Median Curah Hujan (mm)` = round(median(curah_hujan, na.rm = TRUE), 2),
`Rata-rata Suhu (°C)` = round(mean(suhu, na.rm = TRUE), 2),
`Rata-rata Mobilitas Pol.` = round(mean(mobilitas_polutan, na.rm = TRUE), 2),
`Rata-rata Volatilitas Pol.` = round(mean(volatilitas_polutan, na.rm = TRUE), 2),
`Rata-rata Tekstur Tanah` = round(mean(tekstur_tanah, na.rm = TRUE), 2),
`Rata-rata Kec. Angin (m/s)` = round(mean(kecepatan_angin, na.rm = TRUE), 2),
.groups = "drop"
) %>%
rename("Status Kontaminasi" = status_kontaminasi)
ringkasan_numerik %>%
tidyr::pivot_longer(-`Status Kontaminasi`, names_to = "Statistik", values_to = "Nilai") %>%
tidyr::pivot_wider(names_from = `Status Kontaminasi`, values_from = Nilai) %>%
knitr::kable(caption = "Ringkasan Numerik per Kelas Respon") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Statistik | No | Yes |
|---|---|---|
| Jumlah | 801.00 | 1204.00 |
| Rata-rata Durasi (th) | 13.36 | 24.92 |
| Tahun Mulai (rata-rata) | 2004.07 | 1992.11 |
| Rata-rata Pelanggaran | 1.00 | 2.34 |
| Median Curah Hujan (mm) | 8584.68 | 12222.13 |
| Rata-rata Suhu (°C) | 13.55 | 14.26 |
| Rata-rata Mobilitas Pol. | 0.59 | 0.60 |
| Rata-rata Volatilitas Pol. | 0.57 | 0.64 |
| Rata-rata Tekstur Tanah | 26.11 | 26.14 |
| Rata-rata Kec. Angin (m/s) | 21.49 | 22.13 |
# Scatter: durasi vs curah hujan
ggplot(contamination, aes(x = durasi_operasi, y = curah_hujan,
color = status_kontaminasi)) +
geom_point(alpha = 0.45, size = 1.8) +
scale_color_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
scale_y_continuous(labels = scales::comma) +
labs(title = "Durasi Operasi vs Curah Hujan",
subtitle = "Warna = status kontaminasi lahan",
x = "Durasi operasi (tahun)", y = "Curah hujan tahunan (mm)",
color = "Status Kontaminasi") +
theme_minimal(base_size = 12)# Proporsi kontaminasi per skala industri
ggplot(contamination, aes(x = skala_industri, fill = status_kontaminasi)) +
geom_bar(position = "fill", width = 0.7, color = "white") +
scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proporsi Kontaminasi per Skala Industri",
x = "Skala Industri", y = "Proporsi", fill = "Status Kontaminasi") +
theme_minimal(base_size = 12)contamination %>%
select(status_kontaminasi, jenis_industri, durasi_operasi,
tahun_mulai, skala_industri, permukaan_kedap, erosi_tanah,
curah_hujan, suhu) %>%
head(8) %>%
knitr::kable(caption = "Contoh 8 Data Pertama") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| status_kontaminasi | jenis_industri | durasi_operasi | tahun_mulai | skala_industri | permukaan_kedap | erosi_tanah | curah_hujan | suhu |
|---|---|---|---|---|---|---|---|---|
| Yes | Petroleum extraction | 21 | 1992 | Mikro | Tidak | Ringan | 13775.359 | 19.7 |
| No | Petroleum extraction | 16 | 2000 | Mikro | Tidak | Ringan | 18130.318 | 21.5 |
| Yes | Petroleum extraction | 44 | 1978 | Mikro | Tidak | Ringan | 7784.797 | 14.3 |
| No | Petroleum extraction | 23 | 1979 | Mikro | Ya | Ringan | 4900.000 | 13.5 |
| No | Petroleum extraction | 3 | 2011 | Besar | Tidak | Kuat | 4841.117 | 9.8 |
| Yes | Petroleum extraction | 36 | 1986 | Mikro | Ya | Ringan | 5840.898 | 13.4 |
| Yes | Petroleum extraction | 42 | 1980 | Menengah | Ya | Ringan | 5885.018 | 13.4 |
| Yes | Petroleum extraction | 46 | 1976 | Besar | Ya | Ringan | 4303.062 | 7.2 |
set.seed(42)
stratified_split <- function(y, prop = 0.8) {
idx_by_class <- split(seq_along(y), y)
unlist(lapply(idx_by_class,
function(idx) sample(idx, floor(length(idx) * prop))),
use.names = FALSE)
}
train_id <- stratified_split(contamination$kontaminasi_biner, 0.8)
train_data <- contamination[train_id, ]
test_data <- contamination[-train_id, ]
bind_rows(
train_data %>% count(status_kontaminasi) %>% mutate(Data = "Training"),
test_data %>% count(status_kontaminasi) %>% mutate(Data = "Testing")
) %>%
group_by(Data) %>%
mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) %>%
ungroup() %>%
rename("Status Kontaminasi" = status_kontaminasi, Jumlah = n) %>%
knitr::kable(caption = "Distribusi Kelas Training dan Testing") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Status Kontaminasi | Jumlah | Data | Proporsi |
|---|---|---|---|
| No | 640 | Training | 39.9% |
| Yes | 963 | Training | 60.1% |
| No | 161 | Testing | 40.0% |
| Yes | 241 | Testing | 60.0% |
Pengembangan model dilakukan secara iteratif melalui tiga tahap untuk memperbaiki pelanggaran asumsi dan menghasilkan model yang parsimonious.
contamination_fit1 <- glm(
kontaminasi_biner ~
durasi_operasi + tahun_mulai + jumlah_pelanggaran +
curah_hujan + suhu + kecepatan_angin +
tekstur_tanah + mobilitas_polutan + volatilitas_polutan +
permukaan_kedap + polutan_organik_persisten +
skala_industri + erosi_tanah + jenis_industri,
data = train_data,
family = binomial(link = "logit")
)
data.frame(
Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
"Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
Nilai = c(nobs(contamination_fit1), length(contamination_fit1$coefficients) - 1,
round(contamination_fit1$null.deviance, 3),
round(contamination_fit1$deviance, 3),
contamination_fit1$df.residual, round(AIC(contamination_fit1), 3))
) %>%
knitr::kable(caption = "Ringkasan Kecocokan Model v1 (Full — 15 variabel)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Keterangan | Nilai |
|---|---|
| Jumlah observasi training | 1603.000 |
| Jumlah variabel prediktor | 38.000 |
| Null deviance | 2156.699 |
| Residual deviance | 1411.060 |
| Derajat bebas residual | 1564.000 |
| AIC | 1489.060 |
vif_v1 <- car::vif(contamination_fit1)
if (is.matrix(vif_v1)) {
vif_df1 <- as.data.frame(vif_v1)
vif_df1$`GVIF^(1/(2Df))` <- vif_v1[, 3]
vif_df1$Keterangan <- ifelse(vif_v1[, 3] > 5, "⚠️ Bermasalah", "✅ Aman")
} else {
vif_df1 <- data.frame(VIF = vif_v1,
Keterangan = ifelse(vif_v1 > 5, "⚠️ Bermasalah", "✅ Aman"))
}
knitr::kable(vif_df1, caption = "Hasil Uji VIF — Model v1") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| GVIF | Df | GVIF^(1/(2*Df)) | GVIF^(1/(2Df)) | Keterangan | |
|---|---|---|---|---|---|
| durasi_operasi | 1.992835 | 1 | 1.411678 | 1.411678 | ✅ Aman | |
| tahun_mulai | 2.036131 | 1 | 1.426931 | 1.426931 | ✅ Aman | |
| jumlah_pelanggaran | 1.106675 | 1 | 1.051986 | 1.051986 | ✅ Aman | |
| curah_hujan | 3.027219 | 1 | 1.739891 | 1.739891 | ✅ Aman | |
| suhu | 3.387764 | 1 | 1.840588 | 1.840588 | ✅ Aman | |
| kecepatan_angin | 1.464081 | 1 | 1.209992 | 1.209992 | ✅ Aman | |
| tekstur_tanah | 1.574873 | 1 | 1.254939 | 1.254939 | ✅ Aman | |
| mobilitas_polutan | 2.716299 | 1 | 1.648120 | 1.648120 | ✅ Aman | |
| volatilitas_polutan | 85.554369 | 1 | 9.249560 | 9.249560 | ⚠️ Bermasalah |
| permukaan_kedap | 1.196102 | 1 | 1.093664 | 1.093664 | ✅ Aman | |
| polutan_organik_persisten | 15.095518 | 1 | 3.885295 | 3.885295 | ✅ Aman | |
| skala_industri | 1.663734 | 4 | 1.065701 | 1.065701 | ✅ Aman | |
| erosi_tanah | 1.352301 | 5 | 1.030641 | 1.030641 | ✅ Aman | |
| jenis_industri | 4636.502745 | 18 | 1.264266 | 1.264266 | ✅ Aman | |
Temuan v1 — VIF:
volatilitas_polutanmemiliki GVIF^(1/(2Df)) = 9,25 > 5. Keputusan: Hapusvolatilitas_polutandari model → model v2.
train_bt1 <- train_data %>%
mutate(
dur_log = durasi_operasi * log(durasi_operasi + 1),
hujan_log = curah_hujan * log(curah_hujan + 1),
suhu_log = suhu * log(abs(suhu) + 1),
angin_log = kecepatan_angin * log(kecepatan_angin + 1),
tanah_log = tekstur_tanah * log(tekstur_tanah + 1),
mob_log = mobilitas_polutan * log(mobilitas_polutan + 1),
vol_log = volatilitas_polutan * log(volatilitas_polutan + 1)
)
bt_fit1 <- glm(
kontaminasi_biner ~
durasi_operasi + dur_log + curah_hujan + hujan_log +
suhu + suhu_log + kecepatan_angin + angin_log +
tekstur_tanah + tanah_log + mobilitas_polutan + mob_log +
volatilitas_polutan + vol_log,
data = train_bt1, family = binomial
)
broom::tidy(bt_fit1) %>%
filter(grepl("_log$", term)) %>%
transmute(
`Variabel (interaksi log)` = term,
Koefisien = round(estimate, 4),
`p-value` = signif(p.value, 3),
Kesimpulan = ifelse(p.value < 0.05, "⚠️ Asumsi DILANGGAR", "✅ Asumsi terpenuhi")
) %>%
knitr::kable(caption = "Uji Box-Tidwell v1: Linearitas Logit Variabel Kontinu") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Variabel (interaksi log) | Koefisien | p-value | Kesimpulan |
|---|---|---|---|
| dur_log | 0.0144 | 0.63200 | ✅ Asumsi terpenuhi | |
| hujan_log | 0.0000 | 0.60900 | ✅ Asumsi terpenuhi | |
| suhu_log | 0.0506 | 0.19700 | ✅ Asumsi terpenuhi | |
| angin_log | 0.0228 | 0.21000 | ✅ Asumsi terpenuhi | |
| tanah_log | 0.0101 | 0.76200 | ✅ Asumsi terpenuhi | |
| mob_log | -199.1905 | 0.00433 | ⚠️ Asumsi DILANGGAR |
| vol_log | -0.6576 | 0.06170 | ✅ Asumsi terpenuhi | |
Temuan v1 — Box-Tidwell:
mobilitas_polutanmelanggar asumsi linearitas logit (p = 0,004 < 0,05). Keputusan: Transformasilog(mobilitas_polutan + 1)→ dipakai sebagailog_mobilitas_polutanpada v2 & v3.
## Analysis of Deviance Table (Type II tests)
##
## Response: kontaminasi_biner
## LR Chisq Df Pr(>Chisq)
## durasi_operasi 88.443 1 < 2.2e-16 ***
## tahun_mulai 8.100 1 0.004428 **
## jumlah_pelanggaran 149.124 1 < 2.2e-16 ***
## curah_hujan 23.646 1 1.158e-06 ***
## suhu 0.168 1 0.682219
## kecepatan_angin 3.576 1 0.058637 .
## tekstur_tanah 8.383 1 0.003788 **
## mobilitas_polutan 5.270 1 0.021696 *
## volatilitas_polutan 0.016 1 0.899957
## permukaan_kedap 0.325 1 0.568672
## polutan_organik_persisten 8.165 1 0.004270 **
## skala_industri 1.459 4 0.833867
## erosi_tanah 3.628 5 0.604182
## jenis_industri 97.302 18 6.894e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p_train_v1 <- predict(contamination_fit1, newdata = train_data, type = "response")
p_test_v1 <- predict(contamination_fit1, newdata = test_data, type = "response")
roc_test_v1 <- roc_points(test_data$kontaminasi_biner, p_test_v1) %>%
mutate(data = "Testing v1")
auc_test_v1 <- auc_value(roc_test_v1)
met_v1_def <- classification_metrics(test_data$kontaminasi_biner, p_test_v1, 0.5)
confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v1, 0.5) %>%
knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v1") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Prediksi: Terkontaminasi | Prediksi: Bersih | Sum | |
|---|---|---|---|
| Aktual: Terkontaminasi | 192 | 49 | 241 |
| Aktual: Bersih | 52 | 109 | 161 |
| Sum | 244 | 158 | 402 |
format_metrics_indonesia(met_v1_def) %>%
mutate(across(where(is.numeric), round, 3)) %>%
knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v1") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Threshold | Akurasi | Error Rate | Sensitivity | Specificity | Presisi | F1-Score | Balanced Accuracy |
|---|---|---|---|---|---|---|---|
| 0.5 | 0.749 | 0.251 | 0.797 | 0.677 | 0.787 | 0.792 | 0.737 |
# Kurva ROC v1
roc_train_v1 <- roc_points(train_data$kontaminasi_biner, p_train_v1) %>%
mutate(data = "Training v1")
optimal_v1 <- roc_train_v1 %>%
filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)
threshold_opt_v1 <- optimal_v1$threshold[1]
ggplot(bind_rows(roc_train_v1, roc_test_v1), aes(x = fpr, y = sensitivity, color = data)) +
geom_path(linewidth = 1.1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
geom_point(data = optimal_v1, aes(x = fpr, y = sensitivity),
inherit.aes = FALSE, color = "#854d0e", fill = "#eab308",
shape = 21, size = 4, stroke = 1.2) +
coord_equal() +
scale_color_manual(values = c("Training v1" = "#2d6a2d", "Testing v1" = "#eab308")) +
labs(title = "Kurva ROC — Model v1 (Full — 15 variabel)",
subtitle = paste0("AUC Training = ", round(auc_value(roc_train_v1), 3),
" | AUC Testing = ", round(auc_test_v1, 3),
" | Threshold Optimal = ", round(threshold_opt_v1, 3)),
x = "False Positive Rate (1 - Specificity)",
y = "Sensitivity (True Positive Rate)", color = "Data") +
theme_minimal(base_size = 12) + theme(legend.position = "bottom")data.frame(Data = c("Training v1","Testing v1"),
AUC = round(c(auc_value(roc_train_v1), auc_test_v1), 3)) %>%
knitr::kable(caption = "Nilai AUC — Model v1") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Data | AUC |
|---|---|
| Training v1 | 0.863 |
| Testing v1 | 0.841 |
Perubahan dari v1:
- [DIHAPUS]
volatilitas_polutan→ VIF = 9,25 (multikolinearitas berat)- [DITRANSFORMASI]
mobilitas_polutan→log(x+1)=log_mobilitas_polutan
contamination_fit2 <- glm(
kontaminasi_biner ~
durasi_operasi + tahun_mulai + jumlah_pelanggaran +
curah_hujan + suhu + kecepatan_angin +
tekstur_tanah + log_mobilitas_polutan +
permukaan_kedap + polutan_organik_persisten +
skala_industri + erosi_tanah + jenis_industri,
data = train_data,
family = binomial(link = "logit")
)
data.frame(
Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
"Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
Nilai = c(nobs(contamination_fit2), length(contamination_fit2$coefficients) - 1,
round(contamination_fit2$null.deviance, 3),
round(contamination_fit2$deviance, 3),
contamination_fit2$df.residual, round(AIC(contamination_fit2), 3))
) %>%
knitr::kable(caption = "Ringkasan Kecocokan Model v2 (13 variabel)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Keterangan | Nilai |
|---|---|
| Jumlah observasi training | 1603.000 |
| Jumlah variabel prediktor | 37.000 |
| Null deviance | 2156.699 |
| Residual deviance | 1411.100 |
| Derajat bebas residual | 1565.000 |
| AIC | 1487.100 |
vif_v2 <- car::vif(contamination_fit2)
if (is.matrix(vif_v2)) {
vif_df2 <- as.data.frame(vif_v2)
vif_df2$Keterangan <- ifelse(vif_v2[, 3] > 2.5, "⚠️ Tinggi", "✅ Aman")
} else {
vif_df2 <- data.frame(VIF = vif_v2,
Keterangan = ifelse(vif_v2 > 2.5, "⚠️ Tinggi", "✅ Aman"))
}
knitr::kable(vif_df2, caption = "Hasil Uji VIF — Model v2") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| GVIF | Df | GVIF^(1/(2*Df)) | Keterangan | |
|---|---|---|---|---|
| durasi_operasi | 1.993069 | 1 | 1.411761 | ✅ Aman | |
| tahun_mulai | 2.031962 | 1 | 1.425469 | ✅ Aman | |
| jumlah_pelanggaran | 1.106356 | 1 | 1.051835 | ✅ Aman | |
| curah_hujan | 3.025701 | 1 | 1.739454 | ✅ Aman | |
| suhu | 3.386560 | 1 | 1.840261 | ✅ Aman | |
| kecepatan_angin | 1.463668 | 1 | 1.209822 | ✅ Aman | |
| tekstur_tanah | 1.574540 | 1 | 1.254807 | ✅ Aman | |
| log_mobilitas_polutan | 2.486152 | 1 | 1.576754 | ✅ Aman | |
| permukaan_kedap | 1.195124 | 1 | 1.093217 | ✅ Aman | |
| polutan_organik_persisten | 15.074709 | 1 | 3.882616 | ⚠️ Tinggi |
| skala_industri | 1.657428 | 4 | 1.065196 | ✅ Aman | |
| erosi_tanah | 1.339993 | 5 | 1.029699 | ✅ Aman | |
| jenis_industri | 73.529238 | 18 | 1.126798 | ✅ Aman | |
✅ Setelah penghapusan
volatilitas_polutan, tidak ada lagi variabel dengan GVIF^(1/(2Df)) > 5. Nilai VIF tertinggi adalahpolutan_organik_persisten(3,88) — dipertahankan karena relevansi ekologisnya.
train_bt2 <- train_data %>%
mutate(
dur_log = durasi_operasi * log(durasi_operasi + 1),
hujan_log = curah_hujan * log(curah_hujan + 1),
suhu_log = suhu * log(abs(suhu) + 1),
angin_log = kecepatan_angin * log(kecepatan_angin + 1),
tanah_log = tekstur_tanah * log(tekstur_tanah + 1)
)
bt_fit2 <- glm(
kontaminasi_biner ~
durasi_operasi + dur_log + curah_hujan + hujan_log +
suhu + suhu_log + kecepatan_angin + angin_log +
tekstur_tanah + tanah_log,
data = train_bt2, family = binomial
)
broom::tidy(bt_fit2) %>%
filter(grepl("_log$", term)) %>%
transmute(
`Variabel (interaksi log)` = term,
Koefisien = round(estimate, 4),
`p-value` = signif(p.value, 3),
Kesimpulan = ifelse(p.value < 0.05, "⚠️ Asumsi DILANGGAR", "✅ Asumsi terpenuhi")
) %>%
knitr::kable(caption = "Uji Box-Tidwell v2: Linearitas Logit (Variabel Tersisa)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Variabel (interaksi log) | Koefisien | p-value | Kesimpulan |
|---|---|---|---|
| dur_log | 0.0168 | 0.580 | ✅ Asumsi terpenuhi | |
| hujan_log | 0.0000 | 0.328 | ✅ Asumsi terpenuhi | |
| suhu_log | 0.0402 | 0.298 | ✅ Asumsi terpenuhi | |
| angin_log | 0.0254 | 0.154 | ✅ Asumsi terpenuhi | |
| tanah_log | 0.0043 | 0.896 | ✅ Asumsi terpenuhi | |
✅ Semua p-value > 0,05 → asumsi linearitas logit terpenuhi untuk semua variabel numerik di v2.
## Analysis of Deviance Table (Type II tests)
##
## Response: kontaminasi_biner
## LR Chisq Df Pr(>Chisq)
## durasi_operasi 88.458 1 < 2.2e-16 ***
## tahun_mulai 8.084 1 0.004465 **
## jumlah_pelanggaran 149.222 1 < 2.2e-16 ***
## curah_hujan 23.631 1 1.167e-06 ***
## suhu 0.167 1 0.683200
## kecepatan_angin 3.583 1 0.058369 .
## tekstur_tanah 8.372 1 0.003810 **
## log_mobilitas_polutan 5.916 1 0.015002 *
## permukaan_kedap 0.330 1 0.565742
## polutan_organik_persisten 8.157 1 0.004290 **
## skala_industri 1.454 4 0.834749
## erosi_tanah 3.623 5 0.604933
## jenis_industri 98.496 18 4.174e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef_v2 <- broom::tidy(contamination_fit2) %>%
filter(term != "(Intercept)") %>%
mutate(
odds_ratio = exp(estimate),
ci_low = exp(estimate - 1.96 * std.error),
ci_high = exp(estimate + 1.96 * std.error),
Signifikansi = case_when(
p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", p.value < 0.1 ~ ".",
TRUE ~ "tidak signifikan ⚠"
)
) %>%
arrange(p.value) %>%
transmute(`Variabel/Level` = term,
`Odds Ratio` = round(odds_ratio, 3),
`CI 95%` = paste0(round(ci_low, 3), " - ", round(ci_high, 3)),
`p-value` = signif(p.value, 3),
Signifikansi = Signifikansi)
knitr::kable(coef_v2, caption = "Koefisien & Odds Ratio Lengkap — Model v2 (diurutkan p-value)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Variabel/Level | Odds Ratio | CI 95% | p-value | Signifikansi |
|---|---|---|---|---|
| jumlah_pelanggaran | 1.554 | 1.425 - 1.695 | 0.00e+00 | *** |
| durasi_operasi | 1.142 | 1.11 - 1.176 | 0.00e+00 | *** |
| jenis_industriHazardous waste treatment | 0.071 | 0.025 - 0.205 | 1.00e-06 | *** |
| curah_hujan | 1.000 | 1 - 1 | 1.50e-06 | *** |
| jenis_industriOrganic chemical raw material manufacturing | 12.325 | 3.744 - 40.577 | 3.61e-05 | *** |
| jenis_industriCotton printing and dyeing processing | 0.320 | 0.18 - 0.569 | 1.05e-04 | *** |
| jenis_industriPesticide manufacturing | 7.945 | 2.438 - 25.893 | 5.85e-04 | *** |
| jenis_industriCoking | 7.838 | 2.303 - 26.675 | 9.85e-04 | *** |
| jenis_industriSteel smelting | 9.378 | 2.25 - 39.077 | 2.11e-03 | ** |
| jenis_industriCrude oil processing and petroleum product manufacturing | 2.967 | 1.466 - 6.002 | 2.49e-03 | ** |
| tekstur_tanah | 0.969 | 0.949 - 0.99 | 4.04e-03 | ** |
| tahun_mulai | 0.965 | 0.941 - 0.989 | 4.75e-03 | ** |
| polutan_organik_persistenYa | 0.209 | 0.068 - 0.639 | 6.08e-03 | ** |
| jenis_industriIron smelting | 6.139 | 1.599 - 23.562 | 8.18e-03 | ** |
| log_mobilitas_polutan | 0.476 | 0.262 - 0.867 | 1.53e-02 |
|
| jenis_industriCopper smelting | 4.384 | 1.101 - 17.453 | 3.60e-02 |
|
| jenis_industriPetroleum extraction | 0.336 | 0.119 - 0.948 | 3.92e-02 |
|
| jenis_industriLead-zinc smelting | 2.246 | 1.013 - 4.982 | 4.64e-02 |
|
| kecepatan_angin | 1.016 | 0.999 - 1.033 | 6.05e-02 | . |
| erosi_tanahSangat Kuat | 3.872 | 0.655 - 22.875 | 1.35e-01 | tidak signifikan ⚠ |
| jenis_industriChemical reagent and additive manufacturing | 0.651 | 0.347 - 1.22 | 1.80e-01 | tidak signifikan ⚠ |
| jenis_industriLeather tanning and processing | 1.524 | 0.803 - 2.892 | 1.97e-01 | tidak signifikan ⚠ |
| jenis_industriIron ore mining and dressing | 0.627 | 0.294 - 1.339 | 2.28e-01 | tidak signifikan ⚠ |
| erosi_tanahAgak Ringan | 1.297 | 0.792 - 2.124 | 3.02e-01 | tidak signifikan ⚠ |
| skala_industriKecil | 1.220 | 0.828 - 1.796 | 3.14e-01 | tidak signifikan ⚠ |
| jenis_industriLead-zinc ore mining and dressing | 1.380 | 0.533 - 3.572 | 5.06e-01 | tidak signifikan ⚠ |
| permukaan_kedapYa | 0.920 | 0.691 - 1.224 | 5.66e-01 | tidak signifikan ⚠ |
| jenis_industriChemical pharmaceutical raw material manufacturing | 0.793 | 0.31 - 2.031 | 6.29e-01 | tidak signifikan ⚠ |
| suhu | 0.990 | 0.941 - 1.041 | 6.83e-01 | tidak signifikan ⚠ |
| jenis_industriGold ore mining and dressing | 1.135 | 0.411 - 3.133 | 8.06e-01 | tidak signifikan ⚠ |
| erosi_tanahSedang | 0.961 | 0.508 - 1.816 | 9.01e-01 | tidak signifikan ⚠ |
| skala_industriMenengah | 1.026 | 0.67 - 1.572 | 9.05e-01 | tidak signifikan ⚠ |
| erosi_tanahKuat | 1.062 | 0.365 - 3.095 | 9.12e-01 | tidak signifikan ⚠ |
| skala_industriSangat Besar | 0.983 | 0.612 - 1.58 | 9.44e-01 | tidak signifikan ⚠ |
| skala_industriBesar | 0.994 | 0.659 - 1.5 | 9.77e-01 | tidak signifikan ⚠ |
| erosi_tanahParah | 1222.199 | 0 - 3.2740385073678e+279 | 9.83e-01 | tidak signifikan ⚠ |
| jenis_industriMetal surface treatment and heat treatment processing | 1.000 | 0.594 - 1.683 | 9.99e-01 | tidak signifikan ⚠ |
tidak_sig_v2 <- broom::tidy(contamination_fit2) %>%
filter(term != "(Intercept)") %>%
mutate(variabel_utama = case_when(
grepl("^suhu$", term) ~ "suhu",
grepl("^permukaan_kedap", term) ~ "permukaan_kedap",
grepl("^skala_industri", term) ~ "skala_industri",
grepl("^erosi_tanah", term) ~ "erosi_tanah",
TRUE ~ term
)) %>%
group_by(variabel_utama) %>%
summarise(
`p-value min` = round(min(p.value), 4),
`p-value max` = round(max(p.value), 4),
`Semua level tidak signifikan?` = ifelse(all(p.value >= 0.05), "Ya ⚠️", "Tidak"),
.groups = "drop"
) %>%
filter(`Semua level tidak signifikan?` == "Ya ⚠️") %>%
rename(Variabel = variabel_utama)
knitr::kable(tidak_sig_v2,
caption = "Variabel dengan Semua Level Tidak Signifikan di Model v2") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Variabel | p-value min | p-value max | Semua level tidak signifikan? |
|---|---|---|---|
| erosi_tanah | 0.1352 | 0.9825 | Ya ⚠️ |
| jenis_industriChemical pharmaceutical raw material manufacturing | 0.6286 | 0.6286 | Ya ⚠️ |
| jenis_industriChemical reagent and additive manufacturing | 0.1804 | 0.1804 | Ya ⚠️ |
| jenis_industriGold ore mining and dressing | 0.8063 | 0.8063 | Ya ⚠️ |
| jenis_industriIron ore mining and dressing | 0.2280 | 0.2280 | Ya ⚠️ |
| jenis_industriLead-zinc ore mining and dressing | 0.5064 | 0.5064 | Ya ⚠️ |
| jenis_industriLeather tanning and processing | 0.1973 | 0.1973 | Ya ⚠️ |
| jenis_industriMetal surface treatment and heat treatment processing | 0.9991 | 0.9991 | Ya ⚠️ |
| kecepatan_angin | 0.0605 | 0.0605 | Ya ⚠️ |
| permukaan_kedap | 0.5659 | 0.5659 | Ya ⚠️ |
| skala_industri | 0.3140 | 0.9767 | Ya ⚠️ |
| suhu | 0.6831 | 0.6831 | Ya ⚠️ |
Keempat variabel di atas (suhu, permukaan_kedap, skala_industri, erosi_tanah) dieliminasi pada model v3. Catatan:
erosi_tanahlevel ‘Parah’ menunjukkan gejala quasi-complete separation.
p_train_v2 <- predict(contamination_fit2, newdata = train_data, type = "response")
p_test_v2 <- predict(contamination_fit2, newdata = test_data, type = "response")
confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v2, 0.5) %>%
knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v2") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Prediksi: Terkontaminasi | Prediksi: Bersih | Sum | |
|---|---|---|---|
| Aktual: Terkontaminasi | 192 | 49 | 241 |
| Aktual: Bersih | 52 | 109 | 161 |
| Sum | 244 | 158 | 402 |
met_v2_def <- classification_metrics(test_data$kontaminasi_biner, p_test_v2, 0.5)
format_metrics_indonesia(met_v2_def) %>%
mutate(across(where(is.numeric), round, 3)) %>%
knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v2") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Threshold | Akurasi | Error Rate | Sensitivity | Specificity | Presisi | F1-Score | Balanced Accuracy |
|---|---|---|---|---|---|---|---|
| 0.5 | 0.749 | 0.251 | 0.797 | 0.677 | 0.787 | 0.792 | 0.737 |
roc_train_v2 <- roc_points(train_data$kontaminasi_biner, p_train_v2) %>% mutate(data = "Training v2")
roc_test_v2 <- roc_points(test_data$kontaminasi_biner, p_test_v2) %>% mutate(data = "Testing v2")
auc_train_v2 <- auc_value(roc_train_v2); auc_test_v2 <- auc_value(roc_test_v2)
data.frame(Data = c("Training v2","Testing v2"),
AUC = round(c(auc_train_v2, auc_test_v2), 3)) %>%
knitr::kable(caption = "Nilai AUC — Model v2") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Data | AUC |
|---|---|
| Training v2 | 0.863 |
| Testing v2 | 0.841 |
ggplot(bind_rows(roc_train_v2, roc_test_v2), aes(x = fpr, y = sensitivity, color = data)) +
geom_path(linewidth = 1.1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
coord_equal() +
scale_color_manual(values = c("Training v2" = "#2d6a2d", "Testing v2" = "#eab308")) +
labs(title = "Kurva ROC — Model v2 (Full Diperbaiki — 13 variabel)",
subtitle = paste0("AUC Training = ", round(auc_train_v2, 3),
" | AUC Testing = ", round(auc_test_v2, 3)),
x = "False Positive Rate (1 - Specificity)",
y = "Sensitivity (True Positive Rate)", color = "Data") +
theme_minimal(base_size = 12) + theme(legend.position = "bottom")## --- UJI HOSMER-LEMESHOW — MODEL v2 ---
print(ResourceSelection::hoslem.test(train_data$kontaminasi_biner, fitted(contamination_fit2), g = 10))##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: train_data$kontaminasi_biner, fitted(contamination_fit2)
## X-squared = 18.758, df = 8, p-value = 0.01621
## (p = 0.016 → indikasi misfit → akan diperbaiki di v3)
## Nagelkerke R² v2: 0.5029
Perubahan dari v2:
- [DIHAPUS]
suhu(p = 0,683)- [DIHAPUS]
permukaan_kedap(p = 0,566)- [DIHAPUS]
skala_industri(semua level p > 0,31)- [DIHAPUS]
erosi_tanah(semua level p > 0,13 + quasi-complete separation)- [DIPERTAHANKAN]
kecepatan_angin(p = 0,058 borderline, relevan substantif)
contamination_fit3 <- glm(
kontaminasi_biner ~
durasi_operasi + tahun_mulai + jumlah_pelanggaran +
curah_hujan + kecepatan_angin + tekstur_tanah +
log_mobilitas_polutan + polutan_organik_persisten +
jenis_industri,
data = train_data,
family = binomial(link = "logit")
)
data.frame(
Keterangan = c("Jumlah observasi training", "Jumlah variabel prediktor",
"Null deviance", "Residual deviance", "Derajat bebas residual", "AIC"),
Nilai = c(nobs(contamination_fit3), length(contamination_fit3$coefficients) - 1,
round(contamination_fit3$null.deviance, 3),
round(contamination_fit3$deviance, 3),
contamination_fit3$df.residual, round(AIC(contamination_fit3), 3))
) %>%
knitr::kable(caption = "Ringkasan Kecocokan Model v3 (Reduced / Final)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Keterangan | Nilai |
|---|---|
| Jumlah observasi training | 1603.000 |
| Jumlah variabel prediktor | 26.000 |
| Null deviance | 2156.699 |
| Residual deviance | 1416.731 |
| Derajat bebas residual | 1576.000 |
| AIC | 1470.731 |
aic_v2 <- AIC(contamination_fit2); aic_v3 <- AIC(contamination_fit3)
selisih_aic <- round(abs(aic_v2 - aic_v3), 3)
data.frame(
Model = c("v2 — Full Model (13 variabel)", "v3 — Reduced Model (9 variabel)"),
`Jumlah Variabel` = c(13, 9),
`Residual Deviance` = c(round(contamination_fit2$deviance, 3),
round(contamination_fit3$deviance, 3)),
`Df Residual` = c(contamination_fit2$df.residual, contamination_fit3$df.residual),
AIC = c(round(aic_v2, 3), round(aic_v3, 3)),
check.names = FALSE
) %>%
knitr::kable(caption = "Perbandingan Full Model (v2) vs Reduced Model (v3)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Model | Jumlah Variabel | Residual Deviance | Df Residual | AIC |
|---|---|---|---|---|
| v2 — Full Model (13 variabel) | 13 | 1411.100 | 1565 | 1487.100 |
| v3 — Reduced Model (9 variabel) | 9 | 1416.731 | 1576 | 1470.731 |
## Selisih AIC (v2 - v3): 16.369 → ΔAIc > 10 → bukti kuat memilih v3
## --- LIKELIHOOD RATIO TEST (LRT): v3 vs v2 ---
## Analysis of Deviance Table
##
## Model 1: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran +
## curah_hujan + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan +
## polutan_organik_persisten + jenis_industri
## Model 2: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran +
## curah_hujan + suhu + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan +
## permukaan_kedap + polutan_organik_persisten + skala_industri +
## erosi_tanah + jenis_industri
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1576 1416.7
## 2 1565 1411.1 11 5.6305 0.8968
##
## ✅ p-value = 0.8968 > 0.05
## → Variabel yang dihapus TIDAK signifikan secara bersama-sama.
## → Reduced model (v3) VALID digunakan sebagai MODEL FINAL.
vif_v3 <- car::vif(contamination_fit3)
if (is.matrix(vif_v3)) {
vif_df3 <- as.data.frame(vif_v3)
vif_df3$Keterangan <- ifelse(vif_v3[, 3] > 2.5, "⚠️ Tinggi", "✅ Aman")
} else {
vif_df3 <- data.frame(VIF = vif_v3,
Keterangan = ifelse(vif_v3 > 2.5, "⚠️ Tinggi", "✅ Aman"))
}
knitr::kable(vif_df3, caption = "Hasil Uji VIF — Model v3") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| GVIF | Df | GVIF^(1/(2*Df)) | Keterangan | |
|---|---|---|---|---|
| durasi_operasi | 1.835681 | 1 | 1.354873 | ✅ Aman | |
| tahun_mulai | 1.766605 | 1 | 1.329137 | ✅ Aman | |
| jumlah_pelanggaran | 1.091027 | 1 | 1.044523 | ✅ Aman | |
| curah_hujan | 1.606868 | 1 | 1.267623 | ✅ Aman | |
| kecepatan_angin | 1.227614 | 1 | 1.107977 | ✅ Aman | |
| tekstur_tanah | 1.498299 | 1 | 1.224050 | ✅ Aman | |
| log_mobilitas_polutan | 2.447461 | 1 | 1.564436 | ✅ Aman | |
| polutan_organik_persisten | 14.992907 | 1 | 3.872067 | ⚠️ Tinggi |
| jenis_industri | 43.772412 | 18 | 1.110680 | ✅ Aman | |
## Analysis of Deviance Table (Type II tests)
##
## Response: kontaminasi_biner
## LR Chisq Df Pr(>Chisq)
## durasi_operasi 94.050 1 < 2.2e-16 ***
## tahun_mulai 9.948 1 0.001611 **
## jumlah_pelanggaran 153.105 1 < 2.2e-16 ***
## curah_hujan 40.480 1 1.987e-10 ***
## kecepatan_angin 6.802 1 0.009103 **
## tekstur_tanah 9.661 1 0.001882 **
## log_mobilitas_polutan 6.588 1 0.010266 *
## polutan_organik_persisten 8.556 1 0.003444 **
## jenis_industri 101.796 18 1.037e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef_v3 <- broom::tidy(contamination_fit3) %>%
filter(term != "(Intercept)") %>%
mutate(
odds_ratio = exp(estimate),
ci_low = exp(estimate - 1.96 * std.error),
ci_high = exp(estimate + 1.96 * std.error),
Signifikansi = case_when(
p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", p.value < 0.1 ~ ".",
TRUE ~ "-"
)
) %>%
arrange(p.value) %>%
transmute(`Variabel/Level` = term,
`Odds Ratio` = round(odds_ratio, 3),
`CI 95% (Low - High)` = paste0(round(ci_low, 3), " - ", round(ci_high, 3)),
`p-value` = signif(p.value, 3),
Signifikansi = Signifikansi)
knitr::kable(coef_v3, caption = "Koefisien & Odds Ratio — Model v3 (Reduced / Final)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Variabel/Level | Odds Ratio | CI 95% (Low - High) | p-value | Signifikansi |
|---|---|---|---|---|
| jumlah_pelanggaran | 1.554 | 1.426 - 1.694 | 0.00e+00 | *** |
| durasi_operasi | 1.140 | 1.108 - 1.172 | 0.00e+00 | *** |
| curah_hujan | 1.000 | 1 - 1 | 0.00e+00 | *** |
| jenis_industriHazardous waste treatment | 0.070 | 0.024 - 0.201 | 8.00e-07 | *** |
| jenis_industriCotton printing and dyeing processing | 0.309 | 0.177 - 0.539 | 3.57e-05 | *** |
| jenis_industriOrganic chemical raw material manufacturing | 12.039 | 3.681 - 39.375 | 3.87e-05 | *** |
| jenis_industriPesticide manufacturing | 7.875 | 2.417 - 25.654 | 6.15e-04 | *** |
| jenis_industriCoking | 8.188 | 2.426 - 27.642 | 7.05e-04 | *** |
| tahun_mulai | 0.964 | 0.942 - 0.986 | 1.77e-03 | ** |
| tekstur_tanah | 0.968 | 0.948 - 0.988 | 2.03e-03 | ** |
| jenis_industriSteel smelting | 9.369 | 2.261 - 38.815 | 2.04e-03 | ** |
| jenis_industriCrude oil processing and petroleum product manufacturing | 2.856 | 1.418 - 5.749 | 3.29e-03 | ** |
| polutan_organik_persistenYa | 0.203 | 0.067 - 0.619 | 5.05e-03 | ** |
| jenis_industriIron smelting | 6.510 | 1.703 - 24.878 | 6.17e-03 | ** |
| kecepatan_angin | 1.020 | 1.005 - 1.035 | 1.00e-02 |
|
| log_mobilitas_polutan | 0.461 | 0.255 - 0.834 | 1.05e-02 |
|
| jenis_industriCopper smelting | 4.600 | 1.17 - 18.092 | 2.89e-02 |
|
| jenis_industriPetroleum extraction | 0.333 | 0.123 - 0.902 | 3.05e-02 |
|
| jenis_industriLead-zinc smelting | 2.356 | 1.075 - 5.165 | 3.23e-02 |
|
| jenis_industriChemical reagent and additive manufacturing | 0.640 | 0.345 - 1.187 | 1.57e-01 |
|
| jenis_industriLeather tanning and processing | 1.461 | 0.781 - 2.735 | 2.36e-01 |
|
| jenis_industriIron ore mining and dressing | 0.644 | 0.308 - 1.349 | 2.44e-01 |
|
| jenis_industriLead-zinc ore mining and dressing | 1.427 | 0.555 - 3.671 | 4.61e-01 |
|
| jenis_industriChemical pharmaceutical raw material manufacturing | 0.761 | 0.303 - 1.91 | 5.60e-01 |
|
| jenis_industriMetal surface treatment and heat treatment processing | 0.925 | 0.568 - 1.509 | 7.56e-01 |
|
| jenis_industriGold ore mining and dressing | 1.163 | 0.431 - 3.141 | 7.66e-01 |
|
p_train_v3 <- predict(contamination_fit3, newdata = train_data, type = "response")
p_test_v3 <- predict(contamination_fit3, newdata = test_data, type = "response")
# Preview 8 data pertama
head(data.frame("Status Aktual" = as.character(test_data$status_kontaminasi),
"Peluang Terkontaminasi" = round(p_test_v3, 4),
check.names = FALSE), 8) %>%
knitr::kable(caption = "Contoh Peluang Prediksi v3 (8 Data Pertama)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Status Aktual | Peluang Terkontaminasi |
|---|---|
| No | 0.2102 |
| No | 0.2175 |
| Yes | 0.4757 |
| No | 0.2041 |
| No | 0.2732 |
| No | 0.1773 |
| Yes | 0.9722 |
| Yes | 0.9799 |
# Confusion Matrix threshold 0.50
confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v3, 0.5) %>%
knitr::kable(caption = "Confusion Matrix Testing (Threshold = 0,50) — v3") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Prediksi: Terkontaminasi | Prediksi: Bersih | Sum | |
|---|---|---|---|
| Aktual: Terkontaminasi | 192 | 49 | 241 |
| Aktual: Bersih | 47 | 114 | 161 |
| Sum | 239 | 163 | 402 |
met_v3_def <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, 0.5)
format_metrics_indonesia(met_v3_def) %>%
mutate(across(where(is.numeric), \(x) round(x, 3))) %>%
knitr::kable(caption = "Metrik Evaluasi Testing (Threshold = 0,50) — v3") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Threshold | Akurasi | Error Rate | Sensitivity | Specificity | Presisi | F1-Score | Balanced Accuracy |
|---|---|---|---|---|---|---|---|
| 0.5 | 0.761 | 0.239 | 0.797 | 0.708 | 0.803 | 0.8 | 0.752 |
roc_train_v3 <- roc_points(train_data$kontaminasi_biner, p_train_v3) %>% mutate(data = "Training v3")
roc_test_v3 <- roc_points(test_data$kontaminasi_biner, p_test_v3) %>% mutate(data = "Testing v3")
auc_train_v3 <- auc_value(roc_train_v3); auc_test_v3 <- auc_value(roc_test_v3)
optimal_v3 <- roc_train_v3 %>%
filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)
threshold_opt_v3 <- optimal_v3$threshold[1]
data.frame(Data = c("Training v3","Testing v3"),
AUC = round(c(auc_train_v3, auc_test_v3), 3)) %>%
knitr::kable(caption = "Nilai AUC — Model v3 (Final)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Data | AUC |
|---|---|
| Training v3 | 0.862 |
| Testing v3 | 0.851 |
## Threshold optimal v3 (Youden): 0.6406
# Kurva ROC v3
ggplot(bind_rows(roc_train_v3, roc_test_v3), aes(x = fpr, y = sensitivity, color = data)) +
geom_path(linewidth = 1.2) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
geom_point(data = optimal_v3, aes(x = fpr, y = sensitivity),
inherit.aes = FALSE, color = "#854d0e", fill = "#eab308",
shape = 21, size = 4.5, stroke = 1.3) +
coord_equal() +
scale_color_manual(values = c("Training v3" = "#2d6a2d", "Testing v3" = "#eab308")) +
labs(title = "Kurva ROC Model v3 (Reduced / Final)",
subtitle = paste0("AUC Training = ", round(auc_train_v3, 3),
" | AUC Testing = ", round(auc_test_v3, 3),
" | Threshold Optimal = ", round(threshold_opt_v3, 3)),
x = "False Positive Rate (1 - Specificity)",
y = "Sensitivity (True Positive Rate)", color = "Data") +
theme_minimal(base_size = 12) + theme(legend.position = "bottom")met_v3_opt <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, threshold_opt_v3)
confusion_matrix_tabel(test_data$kontaminasi_biner, p_test_v3, threshold_opt_v3) %>%
knitr::kable(caption = paste0("Confusion Matrix Testing (Threshold Optimal = ",
round(threshold_opt_v3, 3), ") — v3")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Prediksi: Terkontaminasi | Prediksi: Bersih | Sum | |
|---|---|---|---|
| Aktual: Terkontaminasi | 161 | 80 | 241 |
| Aktual: Bersih | 29 | 132 | 161 |
| Sum | 190 | 212 | 402 |
bind_rows(
met_v3_def %>% mutate(aturan = "Threshold 0.50"),
met_v3_opt %>% mutate(aturan = paste0("Threshold Optimal (", round(threshold_opt_v3, 3), ")"))
) %>%
format_metrics_indonesia() %>%
bind_cols(`Aturan Klasifikasi` = c("Threshold 0.50",
paste0("Threshold Optimal (", round(threshold_opt_v3, 3), ")")), .) %>%
select(`Aturan Klasifikasi`, everything()) %>%
mutate(across(where(is.numeric), round, 3)) %>%
knitr::kable(caption = "Perbandingan Metrik: Default vs Optimal — v3 (Final)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Aturan Klasifikasi | Threshold | Akurasi | Error Rate | Sensitivity | Specificity | Presisi | F1-Score | Balanced Accuracy |
|---|---|---|---|---|---|---|---|---|
| Threshold 0.50 | 0.500 | 0.761 | 0.239 | 0.797 | 0.708 | 0.803 | 0.800 | 0.752 |
| Threshold Optimal (0.641) | 0.641 | 0.729 | 0.271 | 0.668 | 0.820 | 0.847 | 0.747 | 0.744 |
test_data %>%
mutate(peluang_kontaminasi = p_test_v3) %>%
ggplot(aes(x = peluang_kontaminasi, fill = status_kontaminasi)) +
geom_density(alpha = 0.55, color = "white", linewidth = 0.7) +
geom_vline(xintercept = threshold_opt_v3,
color = "#2d6a2d", linewidth = 1.2, linetype = "dashed") +
annotate("text", x = threshold_opt_v3 + 0.02, y = Inf,
label = paste0("Threshold = ", round(threshold_opt_v3, 3)),
vjust = 1.6, hjust = 0, color = "#2d6a2d", size = 3.5) +
scale_fill_manual(values = c("No" = "#2d6a2d", "Yes" = "#eab308")) +
labs(title = "Distribusi Peluang Prediksi Kontaminasi Lahan (Data Testing) — v3 Final",
subtitle = "Garis putus-putus = threshold optimal ROC",
x = "Peluang prediksi terkontaminasi", y = "Kepadatan",
fill = "Status Aktual") +
theme_minimal(base_size = 12)## --- UJI HOSMER-LEMESHOW — MODEL v3 ---
print(ResourceSelection::hoslem.test(train_data$kontaminasi_biner, fitted(contamination_fit3), g = 10))##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: train_data$kontaminasi_biner, fitted(contamination_fit3)
## X-squared = 9.5847, df = 8, p-value = 0.2954
##
## --- OMNIBUS TEST (ANOVA LRT vs NULL) — MODEL v3 ---
null_fit <- glm(kontaminasi_biner ~ 1, data = train_data, family = binomial)
print(anova(null_fit, contamination_fit3, test = "Chisq"))## Analysis of Deviance Table
##
## Model 1: kontaminasi_biner ~ 1
## Model 2: kontaminasi_biner ~ durasi_operasi + tahun_mulai + jumlah_pelanggaran +
## curah_hujan + kecepatan_angin + tekstur_tanah + log_mobilitas_polutan +
## polutan_organik_persisten + jenis_industri
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1602 2156.7
## 2 1576 1416.7 26 739.97 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## --- NAGELKERKE PSEUDO R² — MODEL v3 ---
## Nagelkerke R² v3: 0.4999
Hasil Evaluasi Goodness-of-Fit:
- Hosmer-Lemeshow: p = 0,295 > 0,05 → model fit dengan data ✅
- Omnibus LRT: χ² = 739,97, p < 0,001 → model secara keseluruhan signifikan ✅
- Nagelkerke R²: 0,499 → model menjelaskan ~50% variasi kontaminasi ✅
roc_test_v2_plot <- roc_points(test_data$kontaminasi_biner, p_test_v2) %>% mutate(data = "Testing v2")
met_v1_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v1, 0.5)
met_v2_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v2, 0.5)
met_v3_full <- classification_metrics(test_data$kontaminasi_biner, p_test_v3, 0.5)
data.frame(
Metrik = c("Jumlah Variabel","AIC","Nagelkerke R²","AUC Training","AUC Testing",
"Akurasi (threshold 0.5)","Sensitivity (threshold 0.5)",
"Specificity (threshold 0.5)","Balanced Accuracy","F1-Score",
"Hosmer-Lemeshow p-value"),
`v1 — Full (15 var)` = c(
length(contamination_fit1$coefficients) - 1, round(AIC(contamination_fit1), 3),
round(DescTools::PseudoR2(contamination_fit1, which = "Nagelkerke"), 4),
round(auc_value(roc_points(train_data$kontaminasi_biner, p_train_v1)), 3),
round(auc_test_v1, 3), round(met_v1_full$accuracy, 3),
round(met_v1_full$sensitivity, 3), round(met_v1_full$specificity, 3),
round(met_v1_full$balanced_accuracy, 3), round(met_v1_full$f1_score, 3),
round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
fitted(contamination_fit1), g = 10)$p.value, 4)
),
`v2 — Full diperbaiki (13 var)` = c(
length(contamination_fit2$coefficients) - 1, round(AIC(contamination_fit2), 3),
round(DescTools::PseudoR2(contamination_fit2, which = "Nagelkerke"), 4),
round(auc_train_v2, 3), round(auc_test_v2, 3),
round(met_v2_full$accuracy, 3), round(met_v2_full$sensitivity, 3),
round(met_v2_full$specificity, 3), round(met_v2_full$balanced_accuracy, 3),
round(met_v2_full$f1_score, 3),
round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
fitted(contamination_fit2), g = 10)$p.value, 4)
),
`v3 — Reduced / Final ✅ (9 var)` = c(
length(contamination_fit3$coefficients) - 1, round(AIC(contamination_fit3), 3),
round(DescTools::PseudoR2(contamination_fit3, which = "Nagelkerke"), 4),
round(auc_train_v3, 3), round(auc_test_v3, 3),
round(met_v3_full$accuracy, 3), round(met_v3_full$sensitivity, 3),
round(met_v3_full$specificity, 3), round(met_v3_full$balanced_accuracy, 3),
round(met_v3_full$f1_score, 3),
round(ResourceSelection::hoslem.test(train_data$kontaminasi_biner,
fitted(contamination_fit3), g = 10)$p.value, 4)
),
check.names = FALSE
) %>%
knitr::kable(caption = "Perbandingan Performa Lengkap: v1 vs v2 vs v3") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Metrik | v1 — Full (15 var) | v2 — Full diperbaiki (13 var) | v3 — Reduced / Final ✅ (9 var)| |
|---|---|---|---|
| Jumlah Variabel | 38.0000 | 37.0000 | 26.0000 |
| AIC | 1489.0600 | 1487.1000 | 1470.7310 |
| Nagelkerke R² | 0.5029 | 0.5029 | 0.4999 |
| AUC Training | 0.8630 | 0.8630 | 0.8620 |
| AUC Testing | 0.8410 | 0.8410 | 0.8510 |
| Akurasi (threshold 0.5) | 0.7490 | 0.7490 | 0.7610 |
| Sensitivity (threshold 0.5) | 0.7970 | 0.7970 | 0.7970 |
| Specificity (threshold 0.5) | 0.6770 | 0.6770 | 0.7080 |
| Balanced Accuracy | 0.7370 | 0.7370 | 0.7520 |
| F1-Score | 0.7920 | 0.7920 | 0.8000 |
| Hosmer-Lemeshow p-value | 0.0185 | 0.0162 | 0.2954 |
roc_compare_all <- bind_rows(
roc_test_v1 %>% mutate(data = "Testing v1 (Full — 15 var)"),
roc_test_v2_plot %>% mutate(data = "Testing v2 (Full diperbaiki — 13 var)"),
roc_test_v3 %>% mutate(data = "Testing v3 (Reduced — 9 var)")
)
ggplot(roc_compare_all, aes(x = fpr, y = sensitivity, color = data)) +
geom_path(linewidth = 1.2) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
coord_equal() +
scale_color_manual(values = c(
"Testing v1 (Full — 15 var)" = "#86c287",
"Testing v2 (Full diperbaiki — 13 var)" = "#eab308",
"Testing v3 (Reduced — 9 var)" = "#2d6a2d"
)) +
labs(title = "Perbandingan Kurva ROC: v1 vs v2 vs v3 (Data Testing)",
subtitle = paste0("AUC v1 = ", round(auc_test_v1, 3),
" | AUC v2 = ", round(auc_test_v2, 3),
" | AUC v3 = ", round(auc_test_v3, 3)),
x = "False Positive Rate (1 - Specificity)",
y = "Sensitivity (True Positive Rate)", color = "Model") +
theme_minimal(base_size = 12) + theme(legend.position = "bottom")\[ \widehat{\text{logit}}(\pi_i) = 71{,}15 + 0{,}131\,(\text{durasi\_operasi}) - 0{,}037\,(\text{tahun\_mulai}) \] \[ + 0{,}441\,(\text{jumlah\_pelanggaran}) + 0{,}0000914\,(\text{curah\_hujan}) + 0{,}020\,(\text{kecepatan\_angin}) \] \[ - 0{,}033\,(\text{tekstur\_tanah}) - 0{,}775\,(\text{log\_mobilitas\_polutan}) \] \[ - 1{,}595\,(\text{polutan\_organik\_persisten} = \text{Ya}) + \beta_9\,(\text{jenis\_industri}) \]
Jumlah Pelanggaran Regulasi Lingkungan (OR = 1,554; CI 95%: 1,426–1,694; p < 0,001)
Prediktor paling kuat dalam model. Setiap tambahan satu pelanggaran regulasi lingkungan meningkatkan peluang terjadinya kontaminasi sebesar 55,4%, dengan asumsi variabel lain tetap konstan. Hal ini menunjukkan bahwa efektivitas penegakan hukum lingkungan memiliki peran penting dalam menurunkan risiko kontaminasi.
Durasi Operasi (OR = 1,140; p < 0,001)
Setiap tambahan satu tahun masa operasi industri meningkatkan peluang kontaminasi sebesar 14,0%. Hal ini berkaitan dengan akumulasi polutan dalam jangka panjang di area industri.
Tahun Mulai Beroperasi (OR = 0,964; p = 0,002)
Industri yang mulai beroperasi pada tahun yang lebih baru cenderung memiliki peluang kontaminasi yang lebih rendah sebesar 3,6% untuk setiap kenaikan satu tahun. Pola ini sejalan dengan peningkatan standar pengelolaan lingkungan dan pengetatan regulasi pada periode yang lebih baru.
Jenis Industri (baseline: Industri Lainnya)
Sektor yang melibatkan proses kimia intensif dan peleburan logam memiliki risiko kontaminasi tertinggi:
Jenis Industri OR CI 95% p-value Organic chemical raw material mfg. 12,039 3,681–39,375 < 0,001 *** Steel smelting 9,369 2,261–38,815 0,002 ** Coking 8,188 2,426–27,642 < 0,001 *** Pesticide manufacturing 7,875 2,417–25,654 < 0,001 *** Iron smelting 6,510 1,703–24,878 0,006 ** Copper smelting 4,600 1,17–18,092 0,029 * Lead-zinc smelting 2,356 1,075–5,165 0,032 * Crude oil processing & petroleum mfg. 2,856 1,418–5,749 0,003 ** Sebaliknya, sektor berikut menunjukkan risiko lebih rendah dibanding Industri Lainnya:
Jenis Industri OR Interpretasi Hazardous waste treatment 0,070 ~93% lebih rendah — kemungkinan karena standar pengawasan lebih ketat pada fasilitas pengelolaan limbah berbahaya Cotton printing & dyeing 0,309 Risiko lebih rendah secara signifikan (p < 0,001) Petroleum extraction 0,333 Risiko lebih rendah secara signifikan (p = 0,031)
Curah Hujan (OR ≈ 1,000 per 1 mm; p < 0,001)
Meskipun nilai OR per satuan milimeter sangat kecil, pengaruhnya tetap penting karena curah hujan memiliki rentang nilai yang luas. Secara kumulatif, curah hujan tinggi dapat meningkatkan proses pencucian polutan ke lapisan tanah lebih dalam serta memperluas perpindahan kontaminan ke area sekitar.
Kecepatan Angin (OR = 1,020; p = 0,010)
Setiap kenaikan satu satuan kecepatan angin meningkatkan peluang kontaminasi. Hal ini diduga berkaitan dengan proses penyebaran partikel dan kontaminan melalui udara.
Tekstur Tanah — Kandungan Liat (OR = 0,968; p = 0,002)
Setiap peningkatan kandungan liat sebesar 1% menurunkan peluang kontaminasi sebesar 3,2%. Hubungan protektif ini berkaitan dengan karakteristik fisik tanah berliat dalam membatasi pergerakan polutan.
Log Mobilitas Polutan (OR = 0,461; p = 0,011)
Menunjukkan hubungan negatif dengan probabilitas kontaminasi yang terdeteksi. Hal ini tidak berarti polutan bermobilitas tinggi tidak berbahaya — sebaliknya, polutan dengan mobilitas tinggi cenderung berpindah dari lapisan permukaan ke lapisan lebih dalam atau terbawa ke badan air, sehingga konsentrasinya pada titik sampling permukaan menjadi lebih rendah. Hasil ini lebih tepat dipahami sebagai indikasi perbedaan perilaku migrasi dan deteksi polutan.
Polutan Organik Persisten / POPs (OR = 0,203; p = 0,005)
Juga menunjukkan hubungan negatif yang serupa. POPs memiliki afinitas kuat terhadap bahan organik tanah sehingga lebih banyak terakumulasi dalam fase padat dan tidak selalu terdeteksi secara optimal melalui metode pemantauan konvensional (sampel tanah permukaan). Temuan ini mengindikasikan bahwa sistem pemantauan yang hanya mengandalkan sampel permukaan berpotensi mengabaikan sebagian risiko kontaminasi sesungguhnya.
Model Final: v3 — Reduced Model (9 variabel)
Dipilih berdasarkan:
Kriteria Nilai Keterangan AIC 1.470,731 Lebih rendah 16,4 poin dari v2 (ΔAIC > 10) LRT v3 vs v2 p = 0,897 Variabel yang dihapus tidak signifikan bersama ✅ Hosmer-Lemeshow p = 0,295 Model fit baik dengan data ✅ AUC Testing 0,851 Kemampuan diskriminasi baik (0,80–0,90) ✅ Akurasi 76,1% Threshold default 0,50 Balanced Accuracy 75,2% Threshold default 0,50 Nagelkerke R² 0,499 Model menjelaskan ~50% variasi kontaminasi Parsimoni 9 variabel Lebih efisien dari v2 (13 var) & v1 (15 var)
Sembilan faktor risiko signifikan:
jumlah_pelanggaran — prediktor terkuat (OR = 1,554; p
< 0,001)durasi_operasi (OR = 1,140; p < 0,001)tahun_mulai (OR = 0,964; p = 0,002)jenis_industri — beberapa level signifikan, tertinggi:
Organic chemical mfg. (OR = 12,039)curah_hujan (p < 0,001)kecepatan_angin (OR = 1,020; p = 0,010)tekstur_tanah — efek protektif (OR = 0,968; p =
0,002)log_mobilitas_polutan (OR = 0,461; p = 0,011)polutan_organik_persisten (OR = 0,203; p = 0,005)Implikasi praktis: Prioritas pemantauan dan remediasi difokuskan pada industri kimia dan metalurgi (organic chemical manufacturing, steel smelting, coking, pesticide manufacturing) dengan riwayat pelanggaran lingkungan tinggi dan telah beroperasi lama. Hubungan negatif pada mobilitas polutan dan POPs mengindikasikan bahwa metode pemantauan berbasis sampel tanah permukaan saja berpotensi mengabaikan sebagian risiko kontaminasi — sistem pemantauan yang mengakomodasi karakteristik migrasi polutan perlu dikembangkan.
Dataset: Urban Contaminated Sites along China’s Urbanization — Li, K. (2023). Mendeley Data. DOI: 10.17632/r4y2vcpfmx.1