library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(splines)
library(Metrics)
train <- read_csv("D:\\UNNES\\Semester 5\\StatLing\\kualitasair.xlsx - Training.csv")
## Rows: 300 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lokasi, Status
## dbl (5): pH, DO, BOD, TSS, Suhu
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("D:\\UNNES\\Semester 5\\StatLing\\kualitasair.xlsx - Testing.csv")
## Rows: 75 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Lokasi
## dbl (5): pH, DO, BOD, TSS, Suhu
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colSums(is.na(train))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
num_to_impute <- c("DO", "BOD", "TSS")
# Impute missing numeric values with the column median
train_clean <- train %>%
  mutate(across(all_of(num_to_impute),
                ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
colSums(is.na(select(train_clean, all_of(num_to_impute))))
## DO BOD TSS
## 0 0 0
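A side note on the imputation idiom: `ifelse()` silently drops vector attributes, while `dplyr::coalesce()` is a drop-in alternative that preserves the column type. A minimal sketch of the same step (the name `train_clean2` is hypothetical):

# Equivalent median imputation with coalesce(), which keeps column attributes
train_clean2 <- train %>%
  mutate(across(all_of(num_to_impute),
                ~ coalesce(., median(., na.rm = TRUE))))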
table(train_clean$Status)
##
## baik Baik BAIK Tercemar berat tercemar ringan
## 1 70 1 7 1
## Tercemar ringan Tercemar Ringan
## 219 1
# Normalize whitespace and capitalization, then map to canonical labels
train_clean <- train_clean %>%
  mutate(Status = str_trim(Status),
         Status = str_squish(Status),
         Status = str_to_lower(Status),
         Status = str_to_title(Status)) %>%
  mutate(Status = recode(Status,
                         "Tercemar Ringan" = "Tercemar ringan",
                         "Tercemar Berat" = "Tercemar berat"))
# Check the final result
table(train_clean$Status)
##
## Baik Tercemar berat Tercemar ringan
## 72 7 221
# Function to remove outliers using the IQR rule
remove_outliers <- function(df, cols) {
  for (col in cols) {
    Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
    Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
    IQR_val <- Q3 - Q1
    lower <- Q1 - 1.5 * IQR_val
    upper <- Q3 + 1.5 * IQR_val
    df <- df %>%
      filter(.data[[col]] >= lower & .data[[col]] <= upper)
  }
  return(df)
}
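A quick illustration of the rule on a hypothetical toy column: with x = (2, 3, 4, 5, 6, 100), Q1 = 3.25 and Q3 = 5.75, so the bounds are [-0.5, 9.5] and the value 100 is removed.

# Toy check (hypothetical data): 100 lies above Q3 + 1.5*IQR and drops out
demo_df <- data.frame(x = c(2, 3, 4, 5, 6, 100))
remove_outliers(demo_df, "x")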
# Apply to the numeric columns
numeric_cols <- c("pH", "DO", "BOD", "TSS", "Suhu")
train_clean <- remove_outliers(train_clean, numeric_cols)
# Compare row counts before & after
cat("Before:", nrow(train), "rows\n")
## Before: 300 rows
cat("After:", nrow(train_clean), "rows\n")
## After: 281 rows
summary(train_clean)
## Lokasi pH DO BOD
## Length:281 Min. :5.780 Min. :3.811 Min. :0.8993
## Class :character 1st Qu.:6.667 1st Qu.:5.406 1st Qu.:2.4449
## Mode :character Median :6.985 Median :5.991 Median :3.0661
## Mean :6.995 Mean :5.962 Mean :2.9905
## 3rd Qu.:7.318 3rd Qu.:6.598 3rd Qu.:3.5280
## Max. :8.230 Max. :8.242 Max. :5.0988
## TSS Suhu Status
## Min. :27.73 Min. :22.77 Length:281
## 1st Qu.:44.30 1st Qu.:26.80 Class :character
## Median :49.52 Median :28.02 Mode :character
## Mean :49.76 Mean :28.12
## 3rd Qu.:55.73 3rd Qu.:29.52
## Max. :72.41 Max. :33.25
colSums(is.na(test))
## Lokasi pH DO BOD TSS Suhu
## 0 0 8 9 7 0
num_to_impute <- c("DO", "BOD", "TSS")
test_clean <- test %>%
  mutate(across(all_of(num_to_impute),
                ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
colSums(is.na(select(test_clean, all_of(num_to_impute))))
## DO BOD TSS
## 0 0 0
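One caveat with this step: imputing with the test set's own medians leaks test information into preprocessing. Using the training medians is the more conventional choice; a minimal sketch (the name `test_clean_alt` is hypothetical):

# Alternative: impute test NAs with TRAINING medians to avoid leakage
train_medians <- sapply(train[num_to_impute], median, na.rm = TRUE)
test_clean_alt <- test %>%
  mutate(across(all_of(num_to_impute),
                ~ ifelse(is.na(.), train_medians[[cur_column()]], .)))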
summary(test_clean)
## Lokasi pH DO BOD
## Length:75 Min. : 1.500 Min. :3.807 Min. :0.957
## Class :character 1st Qu.: 6.615 1st Qu.:5.354 1st Qu.:2.574
## Mode :character Median : 6.965 Median :5.644 Median :3.062
## Mean : 6.946 Mean :5.801 Mean :3.035
## 3rd Qu.: 7.349 3rd Qu.:6.253 3rd Qu.:3.378
## Max. :10.500 Max. :7.730 Max. :5.082
## TSS Suhu
## Min. :24.06 Min. :23.39
## 1st Qu.:43.35 1st Qu.:26.79
## Median :49.57 Median :28.28
## Mean :49.15 Mean :29.04
## 3rd Qu.:55.62 3rd Qu.:29.88
## Max. :74.09 Max. :85.00
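The summary shows the test set keeps its extreme values (pH down to 1.5, Suhu up to 85), since test rows cannot simply be dropped. A sketch that merely flags values outside the training-based IQR bounds, using a hypothetical helper:

# Flag (not remove) test values outside the training-based IQR bounds
flag_outliers <- function(test_df, train_df, cols) {
  for (col in cols) {
    q <- quantile(train_df[[col]], c(0.25, 0.75), na.rm = TRUE)
    iqr <- q[2] - q[1]
    test_df[[paste0(col, "_out")]] <-
      test_df[[col]] < q[1] - 1.5 * iqr | test_df[[col]] > q[2] + 1.5 * iqr
  }
  test_df
}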
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(e1071)        # for SVM (called by caret)
library(rpart)        # for the decision tree
library(randomForest) # for the random forest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# ----------------------------
# 1) Load data
df_train_raw <- train_clean
df_test_raw <- test_clean
# ----------------------------
# 2) Data & feature preparation
# ----------------------------
# Drop the non-numeric 'Lokasi' column from the features; label = Status (factor)
df_train <- df_train_raw %>%
  mutate(Status = as.factor(Status)) %>%
  select(-Lokasi)
# Keep Lokasi for the prediction output later
lokasi_test <- df_test_raw$Lokasi
df_test <- df_test_raw %>% select(-Lokasi)
# Make sure the feature columns match in order & name
stopifnot(all(colnames(df_test) == setdiff(colnames(df_train), "Status")))
# ----------------------------
# 3) Train/Validation split
# ----------------------------
set.seed(123)
idx <- createDataPartition(df_train$Status, p = 0.8, list = FALSE)
train_set <- df_train[idx, ]
valid_set <- df_train[-idx, ]
# Training control: 5-fold CV, Accuracy metric
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = FALSE,
  summaryFunction = defaultSummary
)
# Preprocessing: median imputation + standardization
pp_steps <- c("medianImpute", "center", "scale")
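With only 7 "Tercemar berat" rows in the cleaned training data, a single 5-fold split puts at most one or two of them in each fold, so fold-level accuracies are noisy. Repeated CV is one way to stabilize the estimate; an optional variant of the same control object (the name `ctrl_rep` is hypothetical):

# Optional: 5-fold CV repeated 3 times for a steadier accuracy estimate
ctrl_rep <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  classProbs = FALSE,
  summaryFunction = defaultSummary
)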
# ----------------------------
# 4) Model 1: SVM (RBF)
# ----------------------------
set.seed(123)
svm_fit <- train(
  Status ~ .,
  data = train_set,
  method = "svmRadial",
  preProcess = pp_steps,
  metric = "Accuracy",
  trControl = ctrl,
  tuneLength = 7   # automatic grid for C & sigma
)
# Evaluate on the validation set
svm_pred_val <- predict(svm_fit, newdata = valid_set)
svm_cm <- confusionMatrix(svm_pred_val, valid_set$Status)
# ----------------------------
# 5) Model 2: Decision Tree
# ----------------------------
set.seed(123)
dt_fit <- train(
  Status ~ .,
  data = train_set,
  method = "rpart",
  preProcess = pp_steps,
  metric = "Accuracy",
  trControl = ctrl,
  tuneLength = 10   # cp grid
)
dt_pred_val <- predict(dt_fit, newdata = valid_set)
dt_cm <- confusionMatrix(dt_pred_val, valid_set$Status)
# ----------------------------
# 6) Model 3: Random Forest
# ----------------------------
set.seed(123)
rf_fit <- train(
  Status ~ .,
  data = train_set,
  method = "rf",
  preProcess = pp_steps,
  metric = "Accuracy",
  trControl = ctrl,
  tuneLength = 7   # mtry grid
)
## note: only 4 unique complexity parameters in default grid. Truncating the grid to 4 .
rf_pred_val <- predict(rf_fit, newdata = valid_set)
rf_cm <- confusionMatrix(rf_pred_val, valid_set$Status)
# ----------------------------
# 7) Accuracy summary & confusion matrices
# ----------------------------
cat("\n=== VALIDATION ACCURACY ===\n")
##
## === VALIDATION ACCURACY ===
acc_table <- data.frame(
  Model = c("SVM (RBF)", "Decision Tree", "Random Forest"),
  Accuracy = c(svm_cm$overall["Accuracy"],
               dt_cm$overall["Accuracy"],
               rf_cm$overall["Accuracy"])
)
print(acc_table)
## Model Accuracy
## 1 SVM (RBF) 0.9090909
## 2 Decision Tree 0.9636364
## 3 Random Forest 0.9636364
cat("\n=== CONFUSION MATRIX: SVM ===\n"); print(svm_cm$table)
##
## === CONFUSION MATRIX: SVM ===
## Reference
## Prediction Baik Tercemar berat Tercemar ringan
## Baik 10 0 1
## Tercemar berat 0 0 0
## Tercemar ringan 4 0 40
cat("\n=== CONFUSION MATRIX: Decision Tree ===\n"); print(dt_cm$table)
##
## === CONFUSION MATRIX: Decision Tree ===
## Reference
## Prediction Baik Tercemar berat Tercemar ringan
## Baik 12 0 0
## Tercemar berat 0 0 0
## Tercemar ringan 2 0 41
cat("\n=== CONFUSION MATRIX: Random Forest ===\n"); print(rf_cm$table)
##
## === CONFUSION MATRIX: Random Forest ===
## Reference
## Prediction Baik Tercemar berat Tercemar ringan
## Baik 12 0 0
## Tercemar berat 0 0 0
## Tercemar ringan 2 0 41
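Accuracy alone hides the class structure: "Tercemar berat" never appears in this validation split, so its row in every matrix is uninformative. caret's confusion matrix object carries per-class statistics that make this explicit, for example:

# Per-class statistics (one row per class; NA where the class is absent)
round(rf_cm$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 3)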
# ----------------------------
# 8) (Optional) Majority-vote ensemble on the validation set
# ----------------------------
ens_val <- data.frame(
  svm = svm_pred_val,
  dt = dt_pred_val,
  rf = rf_pred_val
)
maj_vote <- apply(ens_val, 1, function(x) names(sort(table(x), decreasing = TRUE))[1])
# Align the factor levels with the reference to avoid caret's level-order warning
ens_cm <- confusionMatrix(factor(maj_vote, levels = levels(valid_set$Status)),
                          valid_set$Status)
cat("\n=== VALIDATION (Ensemble Majority Vote) ===\n")
##
## === VALIDATION (Ensemble Majority Vote) ===
print(ens_cm$overall["Accuracy"])
## Accuracy
## 0.9636364
print(ens_cm$table)
## Reference
## Prediction Baik Tercemar berat Tercemar ringan
## Baik 12 0 0
## Tercemar berat 0 0 0
## Tercemar ringan 2 0 41
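One detail of the majority vote: `names(sort(table(x), ...))[1]` breaks a 1-1-1 tie alphabetically, which is arbitrary. A sketch of an explicit tie-break that falls back to the best single model (assumed here to be the tree, per the validation table; `maj_vote_tb` is a hypothetical name):

# Majority vote with an explicit tie-break instead of the alphabetical default
maj_vote_tb <- apply(ens_val, 1, function(x) {
  tab <- sort(table(x), decreasing = TRUE)
  if (tab[1] > 1) names(tab)[1] else as.character(x["dt"])
})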
# ----------------------------
# 9) Retrain the BEST model on the FULL training set
#    (selected automatically by the highest validation accuracy)
# ----------------------------
best_name <- acc_table$Model[which.max(acc_table$Accuracy)]
cat("\nBest model (validation): ", best_name, "\n")
##
## Best model (validation): Decision Tree
set.seed(123)
if (best_name == "SVM (RBF)") {
  best_fit <- train(Status ~ ., data = df_train, method = "svmRadial",
                    preProcess = pp_steps, metric = "Accuracy",
                    trControl = ctrl, tuneLength = 7)
} else if (best_name == "Decision Tree") {
  best_fit <- train(Status ~ ., data = df_train, method = "rpart",
                    preProcess = pp_steps, metric = "Accuracy",
                    trControl = ctrl, tuneLength = 10)
} else {
  best_fit <- train(Status ~ ., data = df_train, method = "rf",
                    preProcess = pp_steps, metric = "Accuracy",
                    trControl = ctrl, tuneLength = 7)
}
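The if/else chain above works, but a lookup list keeps the settings in one place. An equivalent sketch with the same tuning choices (`spec` and `best_fit2` are hypothetical names):

# Same retraining expressed as a lookup table keyed by the model name
spec <- list("SVM (RBF)"     = list(method = "svmRadial", tuneLength = 7),
             "Decision Tree" = list(method = "rpart",     tuneLength = 10),
             "Random Forest" = list(method = "rf",        tuneLength = 7))[[best_name]]
set.seed(123)
best_fit2 <- train(Status ~ ., data = df_train, method = spec$method,
                   preProcess = pp_steps, metric = "Accuracy",
                   trControl = ctrl, tuneLength = spec$tuneLength)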
# ----------------------------
# 10) Predictions for the TEST file
# ----------------------------
pred_test_svm <- predict(svm_fit, newdata = df_test)
pred_test_dt <- predict(dt_fit, newdata = df_test)
pred_test_rf <- predict(rf_fit, newdata = df_test)
# Majority vote on the test data
ens_test <- data.frame(svm = pred_test_svm, dt = pred_test_dt, rf = pred_test_rf)
pred_test_mv <- apply(ens_test, 1, function(x) names(sort(table(x), decreasing = TRUE))[1])
# Predictions from the best model (retrained on the full training set)
pred_test_best <- predict(best_fit, newdata = df_test)
# Combine & save
out_pred <- data.frame(
  Lokasi = lokasi_test,
  Pred_SVM = pred_test_svm,
  Pred_DTree = pred_test_dt,
  Pred_RF = pred_test_rf,
  Pred_MajorityVote = pred_test_mv,
  Pred_BestModel = pred_test_best
)
# Save to CSV
write.csv(out_pred, "Prediksi_Status_Testing.csv", row.names = FALSE)
cat("\nPredictions saved as: Prediksi_Status_Testing.csv\n")
##
## Predictions saved as: Prediksi_Status_Testing.csv
# Read the predictions back in
df <- read.csv("D:\\UNNES\\Semester 5\\StatLing\\Prediksi_Status_Testing.csv", stringsAsFactors = FALSE)
# Map the status labels to numeric codes
mapping <- c("Baik" = 1,
             "Tercemar ringan" = 2,
             "Tercemar berat" = 3)
# Encode the prediction columns
cols_to_encode <- c("Pred_SVM", "Pred_DTree", "Pred_RF",
                    "Pred_MajorityVote", "Pred_BestModel")
for (col in cols_to_encode) {
  df[[col]] <- unname(mapping[df[[col]]])  # unname() drops the lookup names
}
# Inspect the result
head(df)
## Lokasi Pred_SVM Pred_DTree Pred_RF Pred_MajorityVote Pred_BestModel
## 1 S301 2 2 2 2 2
## 2 S302 2 2 2 2 2
## 3 S303 1 1 1 1 1
## 4 S304 2 2 2 2 2
## 5 S305 1 2 2 2 2
## 6 S306 2 2 2 2 2
# Save to CSV
write.csv(df, "Prediksi_Status_Testing_Encoded.csv", row.names = FALSE)
cat("\nEncoded predictions saved as: Prediksi_Status_Testing_Encoded.csv\n")
##
## Encoded predictions saved as: Prediksi_Status_Testing_Encoded.csv
Based on the model evaluation, all three classification algorithms achieve high accuracy. Decision Tree and Random Forest score highest on validation, at about 96.36%, meaning both classify water quality status very reliably on this split. SVM (RBF) comes in slightly lower at about 90.91%, which is still high and usable. The gap suggests that the tree-based models, and in particular Random Forest with its ensemble approach, capture the patterns in this data better than SVM does here. One caveat is visible in the confusion matrices: the validation split contains no "Tercemar berat" samples, so these accuracies say nothing about the minority class. Overall, Decision Tree and Random Forest are the stronger models for predicting water quality status from pH, DO, BOD, TSS, and Suhu.
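The comparison above also rests on a single 20% split. caret's `resamples()` pools the cross-validation folds of all three fits (the same folds were used, since each `train()` call followed `set.seed(123)` with the same `ctrl`), which gives a distribution of accuracies rather than one number. A minimal sketch:

# Fold-level accuracy distribution across the three tuned models
res <- resamples(list(SVM = svm_fit, DT = dt_fit, RF = rf_fit))
summary(res)$statistics$Accuracy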
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(dplyr)
library(broom)
df <- train_clean
# Keep only the required columns (base R, to stay clear of masking conflicts)
needed <- c("DO","pH","BOD","TSS","Suhu")
stopifnot(all(needed %in% names(df)))
df_model <- df[, needed, drop = FALSE]
# Check that there are no NAs
stopifnot(sum(is.na(df_model)) == 0)
# ---------- 1) Split train/valid (80/20) ----------
set.seed(123)
n <- nrow(df_model)
idx <- sample(seq_len(n), size = floor(0.8 * n))
train <- df_model[idx, ]
valid <- df_model[-idx, ]
# ---------- 2) Metrics ----------
metrics_reg <- function(actual, pred) {
  mse  <- mean((actual - pred)^2)
  rmse <- sqrt(mse)
  r2   <- 1 - sum((actual - pred)^2) / sum((actual - mean(actual))^2)
  data.frame(R2 = r2, MSE = mse, RMSE = rmse)
}
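A quick sanity check of the helper on hypothetical toy vectors: perfect predictions give R2 = 1 and RMSE = 0, while always predicting the mean gives R2 = 0.

# Sanity checks for metrics_reg()
metrics_reg(c(1, 2, 3), c(1, 2, 3))   # R2 = 1, MSE = 0, RMSE = 0
metrics_reg(c(1, 2, 3), rep(2, 3))    # predicting the mean: R2 = 0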
# ---------- 3) Linear Regression ----------
lm_fit <- lm(DO ~ pH + BOD + TSS + Suhu, data = train)
pred_lm_val <- predict(lm_fit, newdata = valid)
perf_lm <- metrics_reg(valid$DO, pred_lm_val)
# Standardized coefficients (for relative influence)
lm_std_fit <- lm(scale(DO) ~ scale(pH) + scale(BOD) + scale(TSS) + scale(Suhu),
                 data = train)
coef_std <- broom::tidy(lm_std_fit) |>
  dplyr::filter(term != "(Intercept)") |>
  dplyr::mutate(abs_est = abs(estimate)) |>
  dplyr::arrange(dplyr::desc(abs_est)) |>
  dplyr::select(term, estimate)
# ---------- 4) GAM (Spline) ----------
# Prefer a smaller k when the sample is small; method = "REML" is stable
gam_fit <- mgcv::gam(
  DO ~ s(pH, k = 5) +
       s(BOD, k = 5) +
       s(TSS, k = 5) +
       s(Suhu, k = 5),
  data = train,
  method = "REML"
)
pred_gam_val <- predict(gam_fit, newdata = valid)
perf_gam <- metrics_reg(valid$DO, pred_gam_val)
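Before interpreting the smooths it is worth checking whether k = 5 is adequate; mgcv's built-in diagnostic reports a k-index per smooth (values well below 1 with small p-values suggest k is too low):

# Basis-dimension and residual diagnostics for the fitted GAM
mgcv::gam.check(gam_fit)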
# ---------- 5) Performance summary ----------
perf_all <- dplyr::bind_rows(
  cbind(Model = "Linear Regression", perf_lm),
  cbind(Model = "GAM (Spline)", perf_gam)
)
print(perf_all)
## Model R2 MSE RMSE
## 1 Linear Regression -0.08056432 1.074190 1.036431
## 2 GAM (Spline) -0.23745073 1.230151 1.109122
# ---------- 6) Predicted vs actual visualization ----------
plot_pred <- function(actual, pred, title_txt) {
  dfp <- data.frame(Aktual = actual, Prediksi = pred)
  ggplot(dfp, aes(x = Aktual, y = Prediksi)) +
    geom_point(size = 2) +
    geom_abline(intercept = 0, slope = 1, linetype = 2) +
    labs(title = title_txt, x = "Actual DO", y = "Predicted DO") +
    theme_minimal(base_size = 12)
}
print(plot_pred(valid$DO, pred_lm_val, "Predicted vs Actual — Linear Regression"))
print(plot_pred(valid$DO, pred_gam_val, "Predicted vs Actual — GAM (Spline)"))
# ---------- 7) Spline significance (without dropterm) ----------
cat("\n=== Significance of each spline (GAM) ===\n")
##
## === Significance of each spline (GAM) ===
print(summary(gam_fit)$s.table) # edf, F, p-value per s(var)
## edf Ref.df F p-value
## s(pH) 1.000004 1.000007 0.2057911 0.65054206
## s(BOD) 3.022369 3.536373 2.1834858 0.08130118
## s(TSS) 1.000094 1.000189 3.3182811 0.06987425
## s(Suhu) 1.000162 1.000325 0.4598065 0.49840284
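The edf values (≈1 for pH, TSS, and Suhu) indicate those smooths collapsed to straight lines, while s(BOD) kept some curvature; plotting the smooths makes this visible:

# Plot all four fitted smooths with confidence bands on one page
plot(gam_fit, pages = 1, shade = TRUE)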
# ---------- 8) Contribution of each variable (nested model ANOVA) ----------
# Refit the model REMOVING one spline at a time, then compare against the
# full model (F-test; lower p-value = larger contribution)
drop1_gam <- function(full_fit, data) {
  terms <- c("s(pH, k = 5)", "s(BOD, k = 5)", "s(TSS, k = 5)", "s(Suhu, k = 5)")
  form_full <- formula(full_fit)
  res <- lapply(terms, function(trm) {
    # Build the reduced formula by deleting the single component trm
    # (the nested gsub calls escape the parentheses in trm)
    form_red_chr <- gsub(paste0("\\+\\s*", gsub("\\(", "\\\\(", gsub("\\)", "\\\\)", trm))),
                         "", deparse(form_full))
    form_red_chr <- gsub(paste0("\\s*", gsub("\\(", "\\\\(", gsub("\\)", "\\\\)", trm)), "\\s*\\+"),
                         "", form_red_chr)
    form_red_chr <- gsub("\\s+\\+", " +", form_red_chr)
    form_red_chr <- gsub("\\+\\s*$", "", form_red_chr)
    form_red <- as.formula(form_red_chr)
    fit_red <- mgcv::gam(form_red, data = data, method = "REML")
    # F-test between the reduced and full fits
    a <- anova(fit_red, full_fit, test = "F")
    # The comparison row (second model = full) carries the p-value in the last column
    data.frame(
      Removed  = trm,
      F_stat   = a$F[2],
      p_value  = a$`Pr(>F)`[2],
      AIC_red  = AIC(fit_red),
      AIC_full = AIC(full_fit)
    )
  })
  dplyr::bind_rows(res) |>
    dplyr::arrange(p_value)
}
cat("\n=== Contribution (nested ANOVA) — dropping one spline at a time ===\n")
##
## === Contribution (nested ANOVA) — dropping one spline at a time ===
contrib_tbl <- drop1_gam(gam_fit, train)
print(contrib_tbl)
## Removed F_stat p_value AIC_red AIC_full
## 1 s(BOD, k = 5) 2.69088266 0.03151960 585.8415 581.9404
## 2 s(TSS, k = 5) 3.69050911 0.04923948 583.9666 581.9404
## 3 s(Suhu, k = 5) 0.56205213 0.45660720 580.4924 581.9404
## 4 s(pH, k = 5) 0.01209172 0.90739906 580.0218 581.9404
# ---------- 9) Additional summary output ----------
cat("\n=== Influence ranking (linear; standardized coefficients) ===\n")
##
## === Influence ranking (linear; standardized coefficients) ===
print(coef_std |>
        dplyr::rename(Variable = term, Std_Coefficient = estimate))
## # A tibble: 4 × 2
##   Variable    Std_Coefficient
##   <chr>                 <dbl>
## 1 scale(TSS)           0.110
## 2 scale(Suhu)         -0.0350
## 3 scale(pH)            0.0287
## 4 scale(BOD)          -0.0199
Based on the standardized linear regression coefficients, TSS (Total Suspended Solids) has the largest influence on DO of all the predictors, with a positive coefficient of 0.11. That is, increases in TSS tend to be accompanied by increases in DO, although ecologically this relationship can be complex. pH also contributes positively to DO, though its effect is relatively small (0.029). Suhu and BOD carry negative coefficients, roughly -0.035 and -0.020 respectively, indicating that rising temperature or BOD tends to lower DO, with effects weaker than that of TSS.
In short, TSS appears to be the most dominant factor behind the variation of DO in this data, while pH, temperature, and BOD also play a role, but with relatively smaller effects. Note, however, that both models have negative validation R² (the mean of DO predicts better than either fit), so all of these effects are weak and are best read as descriptive rather than predictive.
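Given the negative validation R², these standardized effects come with wide uncertainty, and it is worth checking their confidence intervals before reading much into the ranking. A quick check under the usual lm normal-error approximation:

# Approximate 95% confidence intervals for the standardized coefficients
confint(lm_std_fit)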