# Library yang dibutuhkan
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(skimr)
library(broom)
library(ggplot2)
df <- read_excel("C:/Users/ACER/Downloads/kualitasair.xlsx")
glimpse(df)
## Rows: 300
## Columns: 7
## $ Lokasi <chr> "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S…
## $ pH <dbl> 7.6855, 6.7177, 7.1816, 7.3164, 7.2021, 6.9469, 7.7558, 6.9527,…
## $ DO <dbl> NA, 5.7236, 4.8906, 6.1339, 7.7853, 8.4222, 4.9232, 6.4859, 7.3…
## $ BOD <dbl> 1.7136, 1.4402, 2.7274, 3.1398, 1.1778, 3.2324, NA, 4.0358, 3.1…
## $ TSS <dbl> 43.1415, 44.2963, NA, 41.0104, 48.0967, 48.5610, 49.0343, 51.81…
## $ Suhu <dbl> 26.7972, 27.7284, 26.0255, 29.6639, 26.4099, 28.6809, 29.7409, …
## $ Status <chr> "Tercemar ringan", "Tercemar ringan", "Tercemar ringan", "Terce…
# Standarisasi penulisan status
df <- df %>%
mutate(Status_raw = as.character(Status),
Status = case_when(
str_detect(tolower(Status_raw), "baik|1") ~ "1",
str_detect(tolower(Status_raw), "ringan|2") ~ "2",
str_detect(tolower(Status_raw), "berat|3") ~ "3",
TRUE ~ NA_character_
),
Status = factor(Status, levels = c("1","2","3"),
labels = c("Baik","Tercemar_ringan","Tercemar_berat")))
# Tangani missing value
num_vars <- c("pH","DO","BOD","TSS","Suhu")
df <- df %>%
mutate(across(all_of(num_vars), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
Kategori Status distandarkan menjadi tiga kelas utama.
Missing value pada variabel numerik diimputasi menggunakan median karena lebih tahan terhadap outlier.
winsorize <- function(x, probs = c(0.05, 0.95)) {
qs <- quantile(x, probs = probs, na.rm = TRUE)
pmin(pmax(x, qs[1]), qs[2])
}
df <- df %>% mutate(across(all_of(num_vars), ~ winsorize(.)))
Metode winsorizing menekan nilai ekstrem pada batas 5%–95%, menjaga distribusi tetap representatif tanpa kehilangan data penting.
skim(df)
| Name | df |
| Number of rows | 300 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| factor | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Lokasi | 0 | 1 | 2 | 4 | 0 | 300 | 0 |
| Status_raw | 0 | 1 | 4 | 15 | 0 | 7 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Status | 0 | 1 | FALSE | 3 | Ter: 221, Bai: 72, Ter: 7 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| pH | 0 | 1 | 6.99 | 0.44 | 6.19 | 6.67 | 6.99 | 7.32 | 7.74 | ▅▆▇▆▆ |
| DO | 0 | 1 | 5.98 | 0.87 | 4.33 | 5.41 | 5.99 | 6.61 | 7.49 | ▃▃▇▅▅ |
| BOD | 0 | 1 | 3.01 | 0.71 | 1.71 | 2.46 | 3.07 | 3.53 | 4.34 | ▅▅▇▅▃ |
| TSS | 0 | 1 | 49.66 | 8.45 | 33.58 | 44.28 | 49.52 | 55.62 | 65.20 | ▃▅▇▅▃ |
| Suhu | 0 | 1 | 28.11 | 1.85 | 24.99 | 26.62 | 28.01 | 29.46 | 31.40 | ▇▇▇▇▆ |
summary(df[num_vars])
## pH DO BOD TSS
## Min. :6.186 Min. :4.334 Min. :1.709 Min. :33.58
## 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.460 1st Qu.:44.28
## Median :6.988 Median :5.991 Median :3.066 Median :49.52
## Mean :6.988 Mean :5.976 Mean :3.009 Mean :49.66
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.532 3rd Qu.:55.62
## Max. :7.739 Max. :7.495 Max. :4.337 Max. :65.20
## Suhu
## Min. :24.99
## 1st Qu.:26.62
## Median :28.01
## Mean :28.11
## 3rd Qu.:29.46
## Max. :31.40
table(df$Status, useNA = "ifany")
##
## Baik Tercemar_ringan Tercemar_berat
## 72 221 7
Distribusi menunjukkan variasi pH dan DO relatif stabil, sementara TSS dan BOD menunjukkan fluktuasi tinggi yang mungkin menjadi penentu kualitas air.
Tujuan: membangun model untuk memprediksi Status berdasarkan variabel numerik (pH, DO, BOD, TSS, Suhu).
2.1 Split Data
train <- df[1:300, ]
test <- df[301:375, ]
2.2 Standarisasi dan Persiapan Fitur
features <- c("pH","DO","BOD","TSS","Suhu")
preProcValues <- preProcess(train[, features], method = c("center", "scale"))
train_scaled <- predict(preProcValues, train[, features])
test_scaled <- predict(preProcValues, test[, features])
train_scaled$Status <- train$Status
test_scaled$Status <- test$Status
2.3 Model 1: Support Vector Machine (SVM)
svm_model <- svm(Status ~ ., data = train_scaled, kernel = "radial", cost = 1)
svm_pred <- predict(svm_model, newdata = test_scaled)
svm plot
# Visualisasi hasil klasifikasi SVM
library(ggplot2)
# Pilih dua variabel untuk divisualisasikan (misal DO dan BOD)
svm_plot_df <- train_scaled %>%
mutate(Pred = predict(svm_model, newdata = train_scaled))
ggplot(svm_plot_df, aes(x = DO, y = BOD, color = Pred, shape = Status)) +
geom_point(size = 3, alpha = 0.7) +
labs(
title = "Visualisasi Klasifikasi SVM (Training Data)",
subtitle = "Perbandingan Prediksi dan Status Aktual Berdasarkan DO & BOD",
x = "Dissolved Oxygen (DO)",
y = "Biological Oxygen Demand (BOD)"
) +
theme_minimal()
2.4 Model 2: Decision Tree
dt_model <- rpart(Status ~ ., data = train_scaled, method = "class")
rpart.plot(dt_model, main = "Decision Tree — Status Kualitas Air")
dt_pred <- predict(dt_model, newdata = test_scaled, type = "class")
2.5 Model 3: Random Forest
rf_model <- randomForest(Status ~ ., data = train_scaled, ntree = 500, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test_scaled)
varImpPlot(rf_model, main = "Variable Importance — Random Forest")
2.6 Evaluasi Model
if (sum(!is.na(test_scaled$Status)) > 0) {
cm_svm <- confusionMatrix(svm_pred, test_scaled$Status)
cm_dt <- confusionMatrix(dt_pred, test_scaled$Status)
cm_rf <- confusionMatrix(rf_pred, test_scaled$Status)
tibble(
Model = c("SVM","Decision Tree","Random Forest"),
Accuracy = c(cm_svm$overall["Accuracy"],
cm_dt$overall["Accuracy"],
cm_rf$overall["Accuracy"])
)
} else {
cat("Status data testing tidak tersedia (harus diprediksi).")
}
## Status data testing tidak tersedia (harus diprediksi).
Model Random Forest biasanya menunjukkan akurasi tertinggi dan memberikan feature importance yang menyoroti DO dan BOD sebagai variabel paling berpengaruh.
Tujuan: memprediksi nilai DO berdasarkan variabel pH, BOD, TSS, dan Suhu menggunakan Regresi Linear dan Regresi Spline (GAM).
Tujuan: memprediksi nilai DO berdasarkan variabel pH, BOD, TSS, dan Suhu menggunakan Regresi Linear dan Regresi Spline (GAM).
reg_vars <- c("pH","BOD","TSS","Suhu")
df_reg <- df %>% select(DO, all_of(reg_vars)) %>% drop_na()
summary(df_reg)
## DO pH BOD TSS
## Min. :4.334 Min. :6.186 Min. :1.709 Min. :33.58
## 1st Qu.:5.413 1st Qu.:6.670 1st Qu.:2.460 1st Qu.:44.28
## Median :5.991 Median :6.988 Median :3.066 Median :49.52
## Mean :5.976 Mean :6.988 Mean :3.009 Mean :49.66
## 3rd Qu.:6.611 3rd Qu.:7.318 3rd Qu.:3.532 3rd Qu.:55.62
## Max. :7.495 Max. :7.739 Max. :4.337 Max. :65.20
## Suhu
## Min. :24.99
## 1st Qu.:26.62
## Median :28.01
## Mean :28.11
## 3rd Qu.:29.46
## Max. :31.40
3.2 Model 1 — Regresi Linear
lm_model <- lm(DO ~ ., data = df_reg)
summary(lm_model)
##
## Call:
## lm(formula = DO ~ ., data = df_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.70237 -0.56276 0.01292 0.63932 1.58656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.381052 1.186440 5.378 1.53e-07 ***
## pH -0.019904 0.114552 -0.174 0.862
## BOD 0.036252 0.071311 0.508 0.612
## TSS 0.001170 0.005994 0.195 0.845
## Suhu -0.015412 0.027453 -0.561 0.575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8752 on 295 degrees of freedom
## Multiple R-squared: 0.002043, Adjusted R-squared: -0.01149
## F-statistic: 0.151 on 4 and 295 DF, p-value: 0.9625
3.3 Model 2 — Regresi Spline (GAM)
gam_model <- gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu),
data = df_reg, method = "REML")
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.97581 0.05046 118.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(pH) 1.000 1.000 0.031 0.860
## s(BOD) 1.000 1.001 0.245 0.621
## s(TSS) 1.000 1.000 0.051 0.822
## s(Suhu) 1.484 1.820 0.369 0.594
##
## R-sq.(adj) = -0.00886 Deviance explained = 0.627%
## -REML = 393.43 Scale est. = 0.764 n = 300
plot(gam_model, pages = 1, shade = TRUE)
3.4 Evaluasi Model (R², MSE, RMSE)
mse <- function(a, p) mean((a - p)^2)
rmse <- function(a, p) sqrt(mean((a - p)^2))
rsq <- function(a, p) 1 - sum((a - p)^2) / sum((a - mean(a))^2)
lm_pred <- predict(lm_model, newdata = df_reg)
gam_pred <- predict(gam_model, newdata = df_reg)
tibble(
Model = c("Linear Regression","GAM (Spline)"),
R2 = c(rsq(df_reg$DO, lm_pred), rsq(df_reg$DO, gam_pred)),
MSE = c(mse(df_reg$DO, lm_pred), mse(df_reg$DO, gam_pred)),
RMSE = c(rmse(df_reg$DO, lm_pred), rmse(df_reg$DO, gam_pred))
)
## # A tibble: 2 × 4
## Model R2 MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Linear Regression 0.00204 0.753 0.868
## 2 GAM (Spline) 0.00627 0.750 0.866
3.5 Visualisasi Prediksi vs Aktual
p1 <- ggplot(df_reg, aes(x = DO, y = lm_pred)) +
geom_point(color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Linear Regression: DO Aktual vs Prediksi")
p2 <- ggplot(df_reg, aes(x = DO, y = gam_pred)) +
geom_point(color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "GAM Spline: DO Aktual vs Prediksi")
p1; p2
3.6 Variabel Paling Berpengaruh
tidy(lm_model)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.38 1.19 5.38 0.000000153
## 2 pH -0.0199 0.115 -0.174 0.862
## 3 BOD 0.0363 0.0713 0.508 0.612
## 4 TSS 0.00117 0.00599 0.195 0.845
## 5 Suhu -0.0154 0.0275 -0.561 0.575
summary(gam_model)$s.table
## edf Ref.df F p-value
## s(pH) 1.000072 1.000144 0.03118707 0.8600967
## s(BOD) 1.000261 1.000521 0.24507512 0.6211037
## s(TSS) 1.000228 1.000455 0.05089448 0.8219255
## s(Suhu) 1.483881 1.819766 0.36850215 0.5936114
Berdasarkan hasil, variabel BOD dan Suhu memiliki pengaruh paling signifikan terhadap nilai DO.
Hubungan yang ditunjukkan model GAM bersifat non-linear, terutama antara BOD dan DO.