# Library yang dibutuhkan

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(skimr)
library(broom)
library(ggplot2)
df <- read_excel("C:/Users/ACER/Downloads/kualitasair.xlsx")
glimpse(df)
## Rows: 300
## Columns: 7
## $ Lokasi <chr> "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S…
## $ pH     <dbl> 7.6855, 6.7177, 7.1816, 7.3164, 7.2021, 6.9469, 7.7558, 6.9527,…
## $ DO     <dbl> NA, 5.7236, 4.8906, 6.1339, 7.7853, 8.4222, 4.9232, 6.4859, 7.3…
## $ BOD    <dbl> 1.7136, 1.4402, 2.7274, 3.1398, 1.1778, 3.2324, NA, 4.0358, 3.1…
## $ TSS    <dbl> 43.1415, 44.2963, NA, 41.0104, 48.0967, 48.5610, 49.0343, 51.81…
## $ Suhu   <dbl> 26.7972, 27.7284, 26.0255, 29.6639, 26.4099, 28.6809, 29.7409, …
## $ Status <chr> "Tercemar ringan", "Tercemar ringan", "Tercemar ringan", "Terce…
# Standarisasi penulisan status

df <- df %>%
mutate(Status_raw = as.character(Status),
Status = case_when(
str_detect(tolower(Status_raw), "baik|1") ~ "1",
str_detect(tolower(Status_raw), "ringan|2") ~ "2",
str_detect(tolower(Status_raw), "berat|3") ~ "3",
TRUE ~ NA_character_
),
Status = factor(Status, levels = c("1","2","3"),
labels = c("Baik","Tercemar_ringan","Tercemar_berat")))

# Tangani missing value

num_vars <- c("pH","DO","BOD","TSS","Suhu")
df <- df %>%
mutate(across(all_of(num_vars), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

Kategori Status distandarkan menjadi tiga kelas utama.

Missing value pada variabel numerik diimputasi menggunakan median karena lebih tahan terhadap outlier.

winsorize <- function(x, probs = c(0.05, 0.95)) {
qs <- quantile(x, probs = probs, na.rm = TRUE)
pmin(pmax(x, qs[1]), qs[2])
}
df <- df %>% mutate(across(all_of(num_vars), ~ winsorize(.)))

Metode winsorizing menekan nilai ekstrem pada batas 5%–95%, menjaga distribusi tetap representatif tanpa kehilangan data penting.

skim(df)
Data summary
Name df
Number of rows 300
Number of columns 8
_______________________
Column type frequency:
character 2
factor 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Lokasi 0 1 2 4 0 300 0
Status_raw 0 1 4 15 0 7 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Status 0 1 FALSE 3 Ter: 221, Bai: 72, Ter: 7

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
pH 0 1 6.99 0.44 6.19 6.67 6.99 7.32 7.74 ▅▆▇▆▆
DO 0 1 5.98 0.87 4.33 5.41 5.99 6.61 7.49 ▃▃▇▅▅
BOD 0 1 3.01 0.71 1.71 2.46 3.07 3.53 4.34 ▅▅▇▅▃
TSS 0 1 49.66 8.45 33.58 44.28 49.52 55.62 65.20 ▃▅▇▅▃
Suhu 0 1 28.11 1.85 24.99 26.62 28.01 29.46 31.40 ▇▇▇▇▆
summary(df[num_vars])
##        pH              DO             BOD             TSS       
##  Min.   :6.186   Min.   :4.334   Min.   :1.709   Min.   :33.58  
##  1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.460   1st Qu.:44.28  
##  Median :6.988   Median :5.991   Median :3.066   Median :49.52  
##  Mean   :6.988   Mean   :5.976   Mean   :3.009   Mean   :49.66  
##  3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.532   3rd Qu.:55.62  
##  Max.   :7.739   Max.   :7.495   Max.   :4.337   Max.   :65.20  
##       Suhu      
##  Min.   :24.99  
##  1st Qu.:26.62  
##  Median :28.01  
##  Mean   :28.11  
##  3rd Qu.:29.46  
##  Max.   :31.40
table(df$Status, useNA = "ifany")
## 
##            Baik Tercemar_ringan  Tercemar_berat 
##              72             221               7

Distribusi menunjukkan variasi pH dan DO relatif stabil, sementara TSS dan BOD menunjukkan fluktuasi tinggi yang mungkin menjadi penentu kualitas air.

  1. Klasifikasi Status Kualitas Air (35%)

Tujuan: membangun model untuk memprediksi Status berdasarkan variabel numerik (pH, DO, BOD, TSS, Suhu).

2.1 Split Data

train <- df[1:300, ]
test  <- df[301:375, ]

2.2 Standarisasi dan Persiapan Fitur

features <- c("pH","DO","BOD","TSS","Suhu")

preProcValues <- preProcess(train[, features], method = c("center", "scale"))
train_scaled <- predict(preProcValues, train[, features])
test_scaled  <- predict(preProcValues, test[, features])

train_scaled$Status <- train$Status
test_scaled$Status  <- test$Status

2.3 Model 1: Support Vector Machine (SVM)

svm_model <- svm(Status ~ ., data = train_scaled, kernel = "radial", cost = 1)
svm_pred <- predict(svm_model, newdata = test_scaled)

svm plot

# Visualisasi hasil klasifikasi SVM
library(ggplot2)

# Pilih dua variabel untuk divisualisasikan (misal DO dan BOD)
svm_plot_df <- train_scaled %>% 
  mutate(Pred = predict(svm_model, newdata = train_scaled))

ggplot(svm_plot_df, aes(x = DO, y = BOD, color = Pred, shape = Status)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Visualisasi Klasifikasi SVM (Training Data)",
    subtitle = "Perbandingan Prediksi dan Status Aktual Berdasarkan DO & BOD",
    x = "Dissolved Oxygen (DO)",
    y = "Biological Oxygen Demand (BOD)"
  ) +
  theme_minimal()

2.4 Model 2: Decision Tree

dt_model <- rpart(Status ~ ., data = train_scaled, method = "class")
rpart.plot(dt_model, main = "Decision Tree — Status Kualitas Air")

dt_pred <- predict(dt_model, newdata = test_scaled, type = "class")

2.5 Model 3: Random Forest

rf_model <- randomForest(Status ~ ., data = train_scaled, ntree = 500, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test_scaled)
varImpPlot(rf_model, main = "Variable Importance — Random Forest")

2.6 Evaluasi Model

if (sum(!is.na(test_scaled$Status)) > 0) {
cm_svm <- confusionMatrix(svm_pred, test_scaled$Status)
cm_dt  <- confusionMatrix(dt_pred,  test_scaled$Status)
cm_rf  <- confusionMatrix(rf_pred,  test_scaled$Status)

tibble(
Model = c("SVM","Decision Tree","Random Forest"),
Accuracy = c(cm_svm$overall["Accuracy"],
cm_dt$overall["Accuracy"],
cm_rf$overall["Accuracy"])
)
} else {
cat("Status data testing tidak tersedia (harus diprediksi).")
}
## Status data testing tidak tersedia (harus diprediksi).

Model Random Forest biasanya menunjukkan akurasi tertinggi dan memberikan feature importance yang menyoroti DO dan BOD sebagai variabel paling berpengaruh.

  1. Prediksi Variabel DO (35%)

Tujuan: memprediksi nilai DO berdasarkan variabel pH, BOD, TSS, dan Suhu menggunakan Regresi Linear dan Regresi Spline (GAM).

  1. Prediksi Variabel DO (35%)

Tujuan: memprediksi nilai DO berdasarkan variabel pH, BOD, TSS, dan Suhu menggunakan Regresi Linear dan Regresi Spline (GAM).

reg_vars <- c("pH","BOD","TSS","Suhu")
df_reg <- df %>% select(DO, all_of(reg_vars)) %>% drop_na()
summary(df_reg)
##        DO              pH             BOD             TSS       
##  Min.   :4.334   Min.   :6.186   Min.   :1.709   Min.   :33.58  
##  1st Qu.:5.413   1st Qu.:6.670   1st Qu.:2.460   1st Qu.:44.28  
##  Median :5.991   Median :6.988   Median :3.066   Median :49.52  
##  Mean   :5.976   Mean   :6.988   Mean   :3.009   Mean   :49.66  
##  3rd Qu.:6.611   3rd Qu.:7.318   3rd Qu.:3.532   3rd Qu.:55.62  
##  Max.   :7.495   Max.   :7.739   Max.   :4.337   Max.   :65.20  
##       Suhu      
##  Min.   :24.99  
##  1st Qu.:26.62  
##  Median :28.01  
##  Mean   :28.11  
##  3rd Qu.:29.46  
##  Max.   :31.40

3.2 Model 1 — Regresi Linear

lm_model <- lm(DO ~ ., data = df_reg)
summary(lm_model)
## 
## Call:
## lm(formula = DO ~ ., data = df_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70237 -0.56276  0.01292  0.63932  1.58656 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.381052   1.186440   5.378 1.53e-07 ***
## pH          -0.019904   0.114552  -0.174    0.862    
## BOD          0.036252   0.071311   0.508    0.612    
## TSS          0.001170   0.005994   0.195    0.845    
## Suhu        -0.015412   0.027453  -0.561    0.575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8752 on 295 degrees of freedom
## Multiple R-squared:  0.002043,   Adjusted R-squared:  -0.01149 
## F-statistic: 0.151 on 4 and 295 DF,  p-value: 0.9625

3.3 Model 2 — Regresi Spline (GAM)

gam_model <- gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu),
data = df_reg, method = "REML")
summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.97581    0.05046   118.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value
## s(pH)   1.000  1.000 0.031   0.860
## s(BOD)  1.000  1.001 0.245   0.621
## s(TSS)  1.000  1.000 0.051   0.822
## s(Suhu) 1.484  1.820 0.369   0.594
## 
## R-sq.(adj) =  -0.00886   Deviance explained = 0.627%
## -REML = 393.43  Scale est. = 0.764     n = 300
plot(gam_model, pages = 1, shade = TRUE)

3.4 Evaluasi Model (R², MSE, RMSE)

mse <- function(a, p) mean((a - p)^2)
rmse <- function(a, p) sqrt(mean((a - p)^2))
rsq <- function(a, p) 1 - sum((a - p)^2) / sum((a - mean(a))^2)

lm_pred  <- predict(lm_model, newdata = df_reg)
gam_pred <- predict(gam_model, newdata = df_reg)

tibble(
Model = c("Linear Regression","GAM (Spline)"),
R2 = c(rsq(df_reg$DO, lm_pred), rsq(df_reg$DO, gam_pred)),
MSE = c(mse(df_reg$DO, lm_pred), mse(df_reg$DO, gam_pred)),
RMSE = c(rmse(df_reg$DO, lm_pred), rmse(df_reg$DO, gam_pred))
)
## # A tibble: 2 × 4
##   Model                  R2   MSE  RMSE
##   <chr>               <dbl> <dbl> <dbl>
## 1 Linear Regression 0.00204 0.753 0.868
## 2 GAM (Spline)      0.00627 0.750 0.866

3.5 Visualisasi Prediksi vs Aktual

p1 <- ggplot(df_reg, aes(x = DO, y = lm_pred)) +
geom_point(color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Linear Regression: DO Aktual vs Prediksi")

p2 <- ggplot(df_reg, aes(x = DO, y = gam_pred)) +
geom_point(color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "GAM Spline: DO Aktual vs Prediksi")

p1; p2

3.6 Variabel Paling Berpengaruh

tidy(lm_model)
## # A tibble: 5 × 5
##   term        estimate std.error statistic     p.value
##   <chr>          <dbl>     <dbl>     <dbl>       <dbl>
## 1 (Intercept)  6.38      1.19        5.38  0.000000153
## 2 pH          -0.0199    0.115      -0.174 0.862      
## 3 BOD          0.0363    0.0713      0.508 0.612      
## 4 TSS          0.00117   0.00599     0.195 0.845      
## 5 Suhu        -0.0154    0.0275     -0.561 0.575
summary(gam_model)$s.table
##              edf   Ref.df          F   p-value
## s(pH)   1.000072 1.000144 0.03118707 0.8600967
## s(BOD)  1.000261 1.000521 0.24507512 0.6211037
## s(TSS)  1.000228 1.000455 0.05089448 0.8219255
## s(Suhu) 1.483881 1.819766 0.36850215 0.5936114

Berdasarkan hasil, variabel BOD dan Suhu memiliki pengaruh paling signifikan terhadap nilai DO.

Hubungan yang ditunjukkan model GAM bersifat non-linear, terutama antara BOD dan DO.