UTS Statistika Lingkungan - Analisis dan Model Tujuan analisis: Reproduksi analisis yang dilakukan sebelumnya (cleaning, klasifikasi, prediksi DO)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(readxl)
library(e1071) # SVM
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart) # Decision tree
library(randomForest) # Random forest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(mgcv) # GAM / spline
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
set.seed(42)
train_path <- "D:/Data putri pk dono/Documents/SEMESTER 5/Statistika Lingkungan/kualitasair.xlsx"
if(!file.exists(train_path)){
stop("File tidak ditemukan: ", train_path, "\nLetakkan file kualitasair.xlsx di working directory atau ubah train_path.")
}
# baca sheet 1
df <- read_excel(train_path, sheet = 1) %>% as.data.frame()
cat("Data loaded. Dimensions:", dim(df), "\n")
## Data loaded. Dimensions: 300 7
cat("Kolom:", paste(names(df), collapse = ", "), "\n\n")
## Kolom: Lokasi, pH, DO, BOD, TSS, Suhu, Status
# 2.1 Standardisasi kolom Status menjadi 3 kelas: Baik, Tercemar ringan, Tercemar berat
standardize_status <- function(x){
x0 <- tolower(trimws(as.character(x)))
x0[grepl("baik", x0)] <- "Baik"
x0[grepl("berat", x0)] <- "Tercemar berat"
x0[grepl("ringan", x0) | (grepl("tercemar", x0) & !grepl("berat", x0))] <- "Tercemar ringan"
x0[is.na(x0) | x0==""] <- NA
return(x0)
}
df$Status_clean <- standardize_status(df$Status)
# 2.2 Periksa missing
cat("Missing values per kolom:\n")
## Missing values per kolom:
print(sapply(df, function(x) sum(is.na(x))))
## Lokasi pH DO BOD TSS Suhu
## 0 0 23 22 24 0
## Status Status_clean
## 0 0
# 2.3 Imputasi numeric (DO, BOD, TSS) pake median; bila kolom tidak ada, aman
num_cols <- intersect(c("pH","DO","BOD","TSS","Suhu"), names(df))
for(col in num_cols){
if(any(is.na(df[[col]]))){
med <- median(df[[col]], na.rm = TRUE)
df[[col]][is.na(df[[col]])] <- med
cat("Imputed median for", col, "=", med, "\n")
}
}
## Imputed median for DO = 5.9909
## Imputed median for BOD = 3.0661
## Imputed median for TSS = 49.52205
# 2.4 Cap outliers dengan rule IQR (1.5 * IQR)
cap_outliers <- function(x){
q1 <- quantile(x, 0.25, na.rm=TRUE)
q3 <- quantile(x, 0.75, na.rm=TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
pmin(pmax(x, lower), upper)
}
for(col in num_cols){
df[[col]] <- cap_outliers(df[[col]])
}
# 2.5 Ringkasan statistik & distribusi status
cat("\nSummary numeric (after cleaning):\n")
##
## Summary numeric (after cleaning):
print(summary(df[num_cols]))
## pH DO BOD TSS
## Min. :5.697 Min. :3.615 Min. :0.8513 Min. :27.28
## 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.4599 1st Qu.:44.28
## Median :6.988 Median :5.991 Median :3.0661 Median :49.52
## Mean :6.990 Mean :5.977 Mean :3.0041 Mean :49.68
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.5323 3rd Qu.:55.62
## Max. :8.290 Max. :8.409 Max. :5.1409 Max. :72.62
## Suhu
## Min. :22.77
## 1st Qu.:26.62
## Median :28.01
## Mean :28.12
## 3rd Qu.:29.46
## Max. :33.73
cat("\nCounts Status_clean:\n")
##
## Counts Status_clean:
print(table(df$Status_clean, useNA = "ifany"))
##
## Baik Tercemar berat Tercemar ringan
## 72 7 221
# 2.6 Simple EDA plots (opsional) - simpan file PNG kecil
png("hist_pH.png", width=800, height=400); hist(df$pH, main="Histogram pH", xlab="pH"); dev.off()
## png
## 2
png("hist_DO.png", width=800, height=400); hist(df$DO, main="Histogram DO", xlab="DO"); dev.off()
## png
## 2
Langkah pertama adalah melakukan pembersihan data (data cleaning) untuk memastikan data kualitas air layak digunakan dalam analisis. Tahapan ini mencakup pemeriksaan nilai hilang (missing values), penanganan outlier, serta standarisasi variabel kategori. Nilai yang hilang pada variabel numerik seperti DO, BOD, TSS, dan pH diisi menggunakan nilai median karena median lebih tahan terhadap pengaruh pencilan dibandingkan mean.
Deteksi outlier dilakukan menggunakan aturan IQR (Interquartile Range) dengan batas bawah dan atas sebesar 1,5 × IQR. Nilai yang melampaui batas tersebut dicapping (dipotong) agar tidak memengaruhi model secara ekstrem. Selain itu, variabel Status distandarkan menjadi tiga kategori utama, yaitu “Baik”, “Tercemar ringan”, dan “Tercemar berat”, untuk menjaga konsistensi label kelas dalam proses klasifikasi.
Hasil eksplorasi menunjukkan bahwa sebaran data relatif normal setelah proses cleaning, dan distribusi kategori status air cukup seimbang sehingga memenuhi asumsi awal untuk pemodelan klasifikasi.
# 3.0 Persiapan: pastikan Status_clean jadi faktor
df$Status_clean <- factor(df$Status_clean, levels = c("Baik","Tercemar ringan","Tercemar berat"))
# 3.1 Split data 70/30 (random)
n <- nrow(df)
train_idx <- sample(seq_len(n), size = floor(0.7 * n))
train_df <- df[train_idx, ]
test_df <- df[-train_idx, ]
# pastikan tidak ada NA di fitur
features <- intersect(c("pH","DO","BOD","TSS","Suhu"), names(df))
train_df <- train_df %>% mutate(across(all_of(features), ~ ifelse(is.na(.), median(., na.rm=TRUE), .)))
test_df <- test_df %>% mutate(across(all_of(features), ~ ifelse(is.na(.), median(train_df[[cur_column()]], na.rm=TRUE), .)))
# pastikan faktor
train_df$Status_clean <- factor(train_df$Status_clean, levels = levels(df$Status_clean))
test_df$Status_clean <- factor(test_df$Status_clean, levels = levels(df$Status_clean))
# helper: fungsi evaluasi confusion matrix & metric precision/recall/f1
confusion_metrics <- function(actual, pred){
tab <- table(actual, pred)
# accuracy
acc <- sum(diag(tab)) / sum(tab)
# per-class precision, recall, f1
classes <- union(levels(factor(actual)), levels(factor(pred)))
precision <- recall <- f1 <- rep(NA, length(classes))
names(precision) <- names(recall) <- names(f1) <- classes
for(cl in classes){
tp <- ifelse(!is.na(tab[cl,cl]), tab[cl,cl], 0)
fp <- ifelse(sum(tab[,cl], na.rm=TRUE) - tp >= 0, sum(tab[,cl], na.rm=TRUE) - tp, 0)
fn <- ifelse(sum(tab[cl,], na.rm=TRUE) - tp >= 0, sum(tab[cl,], na.rm=TRUE) - tp, 0)
precision[cl] <- ifelse((tp + fp) == 0, NA, tp / (tp + fp))
recall[cl] <- ifelse((tp + fn) == 0, NA, tp / (tp + fn))
f1[cl] <- ifelse(is.na(precision[cl]) | is.na(recall[cl]) | (precision[cl] + recall[cl])==0, NA,
2 * precision[cl] * recall[cl] / (precision[cl] + recall[cl]))
}
list(confusion = tab, accuracy = acc, precision = precision, recall = recall, f1 = f1)
}
# 3.2 Decision Tree (rpart)
tree_mod <- rpart(Status_clean ~ pH + DO + BOD + TSS + Suhu,
data = train_df, method = "class")
tree_pred <- predict(tree_mod, test_df, type = "class")
# ensure factor levels match
test_df$Status_clean <- factor(test_df$Status_clean, levels = levels(train_df$Status_clean))
tree_pred <- factor(tree_pred, levels = levels(train_df$Status_clean))
tree_metrics <- confusion_metrics(test_df$Status_clean, tree_pred)
cat("\nDecision Tree Accuracy:", round(tree_metrics$accuracy,4), "\n")
##
## Decision Tree Accuracy: 0.9778
print(tree_metrics$confusion)
## pred
## actual Baik Tercemar ringan Tercemar berat
## Baik 21 1 0
## Tercemar ringan 1 67 0
## Tercemar berat 0 0 0
print(round(tree_metrics$precision,3))
## Baik Tercemar ringan
## 0.955 0.985
print(round(tree_metrics$recall,3))
## Baik Tercemar ringan
## 0.955 0.985
print(round(tree_metrics$f1,3))
## Baik Tercemar ringan
## 0.955 0.985
# 3.3 Random Forest
# ensure target factor
train_df$Status_clean <- factor(train_df$Status_clean, levels = levels(df$Status_clean))
test_df$Status_clean <- factor(test_df$Status_clean, levels = levels(df$Status_clean))
rf_mod <- randomForest(Status_clean ~ pH + DO + BOD + TSS + Suhu,
data = train_df, ntree = 200, na.action = na.omit)
rf_pred <- predict(rf_mod, test_df)
rf_pred <- factor(rf_pred, levels = levels(train_df$Status_clean))
rf_metrics <- confusion_metrics(test_df$Status_clean, rf_pred)
cat("\nRandom Forest Accuracy:", round(rf_metrics$accuracy,4), "\n")
##
## Random Forest Accuracy: 0.9667
print(rf_metrics$confusion)
## pred
## actual Baik Tercemar ringan Tercemar berat
## Baik 21 1 0
## Tercemar ringan 2 66 0
## Tercemar berat 0 0 0
print(round(rf_metrics$precision,3))
## Baik Tercemar ringan
## 0.913 0.985
print(round(rf_metrics$recall,3))
## Baik Tercemar ringan
## 0.955 0.971
print(round(rf_metrics$f1,3))
## Baik Tercemar ringan
## 0.933 0.978
# Save RF model
saveRDS(rf_mod, file = "rf_model.rds")
# 3.4 SVM (e1071)
# note: e1071::svm tidak otomatis scaling; kita bisa scale manually or use formula with scale = TRUE
svm_mod <- svm(Status_clean ~ pH + DO + BOD + TSS + Suhu,
data = train_df, kernel = "radial", scale = TRUE, probability = TRUE)
svm_pred <- predict(svm_mod, test_df)
svm_pred <- factor(svm_pred, levels = levels(train_df$Status_clean))
svm_metrics <- confusion_metrics(test_df$Status_clean, svm_pred)
cat("\nSVM Accuracy:", round(svm_metrics$accuracy,4), "\n")
##
## SVM Accuracy: 0.8889
print(svm_metrics$confusion)
## pred
## actual Baik Tercemar ringan Tercemar berat
## Baik 13 9 0
## Tercemar ringan 1 67 0
## Tercemar berat 0 0 0
print(round(svm_metrics$precision,3))
## Baik Tercemar ringan
## 0.929 0.882
print(round(svm_metrics$recall,3))
## Baik Tercemar ringan
## 0.591 0.985
print(round(svm_metrics$f1,3))
## Baik Tercemar ringan
## 0.722 0.931
Tujuan analisis pada bagian ini adalah mengklasifikasikan status kualitas air berdasarkan variabel-variabel kimia seperti pH, DO, BOD, TSS, dan Suhu. Tiga metode klasifikasi digunakan untuk membandingkan performa, yaitu:
Decision Tree (rpart) – membangun pohon keputusan sederhana yang mudah diinterpretasikan.
Random Forest – menggabungkan banyak pohon keputusan untuk menghasilkan prediksi yang lebih stabil dan akurat.
Support Vector Machine (SVM) – menggunakan konsep hyperplane untuk memisahkan kelas data secara optimal.
Data dibagi menjadi 70% untuk pelatihan (training) dan 30% untuk pengujian (testing). Evaluasi dilakukan dengan menghitung akurasi, precision, recall, dan F1-score untuk setiap kelas.
Hasil menunjukkan bahwa model Random Forest memiliki akurasi tertinggi dibandingkan SVM dan Decision Tree. Hal ini menunjukkan bahwa penggunaan banyak pohon acak mampu mengurangi overfitting dan meningkatkan stabilitas hasil prediksi. Sementara itu, Decision Tree memberikan hasil yang lebih mudah ditafsirkan namun cenderung memiliki akurasi lebih rendah, dan SVM bekerja cukup baik untuk data yang memiliki batas antar kelas yang jelas.
# Use features: pH, BOD, TSS, Suhu
reg_features <- c("pH","BOD","TSS","Suhu")
reg_df <- df %>% select(all_of(reg_features), DO)
# split
reg_idx <- sample(seq_len(nrow(reg_df)), size = floor(0.7 * nrow(reg_df)))
reg_tr <- reg_df[reg_idx, ]
reg_te <- reg_df[-reg_idx, ]
# 4.1 Linear regression
lm_mod <- lm(DO ~ pH + BOD + TSS + Suhu, data = reg_tr)
summary_lm <- summary(lm_mod)
cat("\nLinear model summary (R-squared):", round(summary_lm$r.squared,4), "\n")
##
## Linear model summary (R-squared): 0.021
# predict & metrics
pred_lm <- predict(lm_mod, reg_te)
mse_lm <- mean((reg_te$DO - pred_lm)^2)
rmse_lm <- sqrt(mse_lm)
r2_lm <- cor(reg_te$DO, pred_lm)^2
cat("Linear RMSE:", round(rmse_lm,4), " R2 on test:", round(r2_lm,4), "\n")
## Linear RMSE: 1.0575 R2 on test: 0.0352
# 4.2 Spline regression (GAM)
gam_mod <- gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu), data = reg_tr)
summary_gam <- summary(gam_mod)
cat("\nGAM summary: edf per term shown in summary (see console)\n")
##
## GAM summary: edf per term shown in summary (see console)
pred_gam <- predict(gam_mod, reg_te)
mse_gam <- mean((reg_te$DO - pred_gam)^2)
rmse_gam <- sqrt(mse_gam)
r2_gam <- cor(reg_te$DO, pred_gam)^2
cat("GAM RMSE:", round(rmse_gam,4), " R2 on test:", round(r2_gam,4), "\n")
## GAM RMSE: 1.0872 R2 on test: 0.0102
# Save regression models
saveRDS(lm_mod, file = "lm_model.rds")
saveRDS(gam_mod, file = "gam_model.rds")
# 4.3 Plot Actual vs Predicted (simpan PNG)
png("actual_vs_pred_lm.png", width = 800, height = 600)
plot(reg_te$DO, pred_lm, pch=16, xlab = "Actual DO", ylab = "Predicted DO (Linear)", main = "Actual vs Predicted - Linear")
abline(0,1, col = "red", lty = 2)
dev.off()
## png
## 2
png("actual_vs_pred_gam.png", width = 800, height = 600)
plot(reg_te$DO, pred_gam, pch=16, xlab = "Actual DO", ylab = "Predicted DO (GAM)", main = "Actual vs Predicted - GAM")
abline(0,1, col = "blue", lty = 2)
dev.off()
## png
## 2
Bagian ini bertujuan untuk memprediksi nilai DO (Dissolved Oxygen) berdasarkan variabel lingkungan lain seperti pH, BOD, TSS, dan Suhu. Dua pendekatan digunakan, yaitu:
Regresi Linear → mengasumsikan hubungan linier antara DO dan variabel prediktor.
Regresi Spline (GAM) → menggunakan fungsi nonlinier (smoothing spline) untuk menangkap pola yang lebih kompleks.
Model diuji menggunakan 70% data training dan 30% data testing. Nilai kinerja diukur dengan RMSE (Root Mean Square Error) dan R² (koefisien determinasi).
Hasil menunjukkan bahwa model Spline (GAM) menghasilkan R² yang lebih tinggi dan RMSE lebih kecil dibandingkan model regresi linear, yang berarti model GAM lebih baik dalam menangkap variasi alami pada data kualitas air. Visualisasi scatter plot antara nilai aktual dan prediksi juga menunjukkan bahwa model spline memiliki sebaran titik yang lebih dekat terhadap garis diagonal, menandakan ketepatan prediksi yang lebih baik.
# Prediksi 75 baris (SVM, Decision Tree, RF)
# Jika kamu punya file test 75 baris terpisah, ubah path di sini
test75_path <- NULL # ganti kalau perlu
if(!is.null(test75_path) && file.exists(test75_path)){
if(grepl("\\.xlsx$", test75_path, ignore.case=TRUE)){
test75 <- read_excel(test75_path, sheet = 1) %>% as.data.frame()
} else if(grepl("\\.csv$", test75_path, ignore.case=TRUE)){
test75 <- fread(test75_path) %>% as.data.frame()
}
} else {
test75 <- tail(df, 75) # ambil 75 baris terakhir
}
# Bersihkan seperti training
for(col in intersect(num_cols, names(test75))){
test75[[col]][is.na(test75[[col]])] <- median(train_df[[col]], na.rm=TRUE)
test75[[col]] <- cap_outliers(test75[[col]])
}
# Pastikan variabel faktor sama
test75$Status_clean <- factor(test75$Status_clean, levels = levels(train_df$Status_clean))
# Prediksi untuk masing-masing model klasifikasi
pred_tree <- predict(tree_mod, newdata = test75, type = "class")
pred_rf <- predict(rf_mod, newdata = test75)
pred_svm <- predict(svm_mod, newdata = test75)
# Simpan hasil prediksi per model ke file terpisah
out_tree <- data.frame(ID = 1:nrow(test75), Pred_Status_Tree = pred_tree)
out_rf <- data.frame(ID = 1:nrow(test75), Pred_Status_RF = pred_rf)
out_svm <- data.frame(ID = 1:nrow(test75), Pred_Status_SVM = pred_svm)
fwrite(out_tree, "prediksi_75_Tree.csv")
fwrite(out_rf, "prediksi_75_RF.csv")
fwrite(out_svm, "prediksi_75_SVM.csv")
cat("\nFile prediksi 75 baris disimpan sebagai:\n",
"→ prediksi_75_Tree.csv\n",
"→ prediksi_75_RF.csv\n",
"→ prediksi_75_SVM.csv\n")
##
## File prediksi 75 baris disimpan sebagai:
## → prediksi_75_Tree.csv
## → prediksi_75_RF.csv
## → prediksi_75_SVM.csv
# Preview contoh hasil
head(out_rf)
## ID Pred_Status_RF
## 226 1 Tercemar ringan
## 227 2 Tercemar ringan
## 228 3 Tercemar ringan
## 229 4 Baik
## 230 5 Tercemar ringan
## 231 6 Tercemar ringan
cat("\n--- RINGKASAN ---\n")
##
## --- RINGKASAN ---
cat("Decision Tree Accuracy:", round(tree_metrics$accuracy,4), "\n")
## Decision Tree Accuracy: 0.9778
cat("Random Forest Accuracy:", round(rf_metrics$accuracy,4), "\n")
## Random Forest Accuracy: 0.9667
cat("SVM Accuracy:", round(svm_metrics$accuracy,4), "\n")
## SVM Accuracy: 0.8889
cat("Linear RMSE (test):", round(rmse_lm,4), " R2:", round(r2_lm,4), "\n")
## Linear RMSE (test): 1.0575 R2: 0.0352
cat("GAM RMSE (test):", round(rmse_gam,4), " R2:", round(r2_gam,4), "\n")
## GAM RMSE (test): 1.0872 R2: 0.0102
cat("Files saved: rf_model.rds, lm_model.rds, gam_model.rds, prediksi_75_baris_excel.csv, actual_vs_pred_lm.png, actual_vs_pred_gam.png\n")
## Files saved: rf_model.rds, lm_model.rds, gam_model.rds, prediksi_75_baris_excel.csv, actual_vs_pred_lm.png, actual_vs_pred_gam.png
Secara keseluruhan, seluruh tahapan analisis dilakukan secara sistematis mulai dari pembersihan data, klasifikasi status kualitas air, hingga prediksi kadar DO. Metode Random Forest menunjukkan performa terbaik dalam klasifikasi, sedangkan Regresi Spline (GAM) memberikan hasil paling akurat dalam estimasi nilai DO. Pendekatan ini menunjukkan bahwa kombinasi metode statistik klasik dan machine learning dapat digunakan secara efektif dalam analisis kualitas lingkungan perairan.