library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(splines)

# Load the water-quality workbook into `data`, then print its structure.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where the file exists at exactly this location.
data <- read_excel("C:/Users/Ziglar/Downloads/kualitasair.xlsx")

str(data)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
##  $ pH    : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
##  $ DO    : num [1:300] NA 5.72 4.89 6.13 7.79 ...
##  $ BOD   : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
##  $ TSS   : num [1:300] 43.1 44.3 NA 41 48.1 ...
##  $ Suhu  : num [1:300] 26.8 27.7 26 29.7 26.4 ...
##  $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
# Per-column summary of the raw data; the "NA's" rows show which columns
# contain missing values.
summary(data)
##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.503   Min.   :2.982   Min.   :0.3026  
##  Class :character   1st Qu.:6.670   1st Qu.:5.375   1st Qu.:2.3573  
##  Mode  :character   Median :6.988   Median :5.991   Median :3.0661  
##                     Mean   :6.989   Mean   :5.976   Mean   :3.0005  
##                     3rd Qu.:7.318   3rd Qu.:6.688   3rd Qu.:3.5781  
##                     Max.   :8.351   Max.   :9.229   Max.   :5.7962  
##                                     NA's   :23      NA's   :22      
##       TSS             Suhu          Status         
##  Min.   :24.65   Min.   :22.77   Length:300        
##  1st Qu.:43.73   1st Qu.:26.62   Class :character  
##  Median :49.52   Median :28.01   Mode  :character  
##  Mean   :49.70   Mean   :28.31                     
##  3rd Qu.:56.44   3rd Qu.:29.46                     
##  Max.   :76.34   Max.   :90.00                     
##  NA's   :24
# Ensure every package used by this analysis is installed, then attach them.
required <- c(
  "readxl", "dplyr", "ggplot2", "caret", "e1071", "rpart", "randomForest",
  "splines", "Metrics", "knitr", "kableExtra"
)
# TRUE for each required package that is already installed.
installed <- required %in% installed.packages()[, "Package"]
if (!all(installed)) {
  install.packages(required[!installed])
}
# library() returns the list of attached packages; without invisible() the
# lapply() result is auto-printed once per package and floods the report.
invisible(lapply(required, library, character.only = TRUE))
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[2]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[3]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[4]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[5]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[6]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[7]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[8]]
##  [1] "splines"      "randomForest" "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "dplyr"        "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[9]]
##  [1] "Metrics"      "splines"      "randomForest" "rpart"        "e1071"       
##  [6] "caret"        "lattice"      "ggplot2"      "dplyr"        "readxl"      
## [11] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [16] "methods"      "base"        
## 
## [[10]]
##  [1] "knitr"        "Metrics"      "splines"      "randomForest" "rpart"       
##  [6] "e1071"        "caret"        "lattice"      "ggplot2"      "dplyr"       
## [11] "readxl"       "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "kableExtra"   "knitr"        "Metrics"      "splines"      "randomForest"
##  [6] "rpart"        "e1071"        "caret"        "lattice"      "ggplot2"     
## [11] "dplyr"        "readxl"       "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"
# Re-reads the same workbook that was already loaded into `data` earlier in
# this script (same path), overwriting it with an identical copy, then prints
# its structure again.
# NOTE(review): absolute, machine-specific path.
data <- readxl::read_excel("C:/Users/Ziglar/Downloads/kualitasair.xlsx")

str(data)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
##  $ pH    : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
##  $ DO    : num [1:300] NA 5.72 4.89 6.13 7.79 ...
##  $ BOD   : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
##  $ TSS   : num [1:300] 43.1 44.3 NA 41 48.1 ...
##  $ Suhu  : num [1:300] 26.8 27.7 26 29.7 26.4 ...
##  $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
# Per-column summary of the freshly loaded data (missing values not yet handled).
summary(data)
##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.503   Min.   :2.982   Min.   :0.3026  
##  Class :character   1st Qu.:6.670   1st Qu.:5.375   1st Qu.:2.3573  
##  Mode  :character   Median :6.988   Median :5.991   Median :3.0661  
##                     Mean   :6.989   Mean   :5.976   Mean   :3.0005  
##                     3rd Qu.:7.318   3rd Qu.:6.688   3rd Qu.:3.5781  
##                     Max.   :8.351   Max.   :9.229   Max.   :5.7962  
##                                     NA's   :23      NA's   :22      
##       TSS             Suhu          Status         
##  Min.   :24.65   Min.   :22.77   Length:300        
##  1st Qu.:43.73   1st Qu.:26.62   Class :character  
##  Median :49.52   Median :28.01   Mode  :character  
##  Mean   :49.70   Mean   :28.31                     
##  3rd Qu.:56.44   3rd Qu.:29.46                     
##  Max.   :76.34   Max.   :90.00                     
##  NA's   :24
# Show the first six rows.
head(data)
## # A tibble: 6 × 7
##   Lokasi    pH    DO   BOD   TSS  Suhu Status         
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
## 1 S1      7.69 NA     1.71  43.1  26.8 Tercemar ringan
## 2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar ringan
## 3 S3      7.18  4.89  2.73  NA    26.0 Tercemar ringan
## 4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar ringan
## 5 S5      7.20  7.79  1.18  48.1  26.4 baik           
## 6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar ringan
# Numeric water-quality columns that receive mean imputation.
num_vars <- c("pH", "DO", "BOD", "TSS", "Suhu")
# Replace every NA with the mean of the observed values of its own column.
data[num_vars] <- lapply(data[num_vars], function(column) {
  replace(column, is.na(column), mean(column, na.rm = TRUE))
})
# Count the remaining NA values per column.
colSums(is.na(data))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0      0      0      0      0      0
#' Cap (winsorise) a numeric vector at two quantiles
#'
#' Values below the lower quantile are raised to it and values above the upper
#' quantile are lowered to it. No element is removed and `NA`s stay `NA`.
#'
#' @param x A numeric vector.
#' @param probs Length-2 numeric vector holding the lower and upper
#'   probabilities (each in [0, 1], lower <= upper). Defaults to the 5th and
#'   95th percentiles.
#' @return A numeric vector of the same length as `x`.
cap_outliers <- function(x, probs = c(0.05, 0.95)) {
  stopifnot(
    is.numeric(x),
    is.numeric(probs), length(probs) == 2L, !anyNA(probs),
    probs[1] >= 0, probs[2] <= 1, probs[1] <= probs[2]
  )
  # Nothing to cap; also avoids computing quantiles of an empty vector.
  if (length(x) == 0L) {
    return(x)
  }
  # names = FALSE keeps the "5%"/"95%" labels out of the result.
  bounds <- quantile(x, probs = probs, na.rm = TRUE, names = FALSE)
  pmin(pmax(x, bounds[1]), bounds[2])
}
# Winsorise every column listed in num_vars with cap_outliers().
data <- mutate(data, across(all_of(num_vars), cap_outliers))

# Normalise the raw labels: force character, strip surrounding whitespace,
# then lower-case.
data$Status <- tolower(trimws(as.character(data$Status)))

# Map numeric codes and lower-cased labels onto the three canonical names.
data$Status <- dplyr::recode(
  data$Status,
  "1" = "Baik",
  "2" = "Tercemar ringan",
  "3" = "Tercemar berat",
  "baik" = "Baik",
  "tercemar ringan" = "Tercemar ringan",
  "tercemar berat" = "Tercemar berat"
)

# Fix the level order; any label that is not one of these levels becomes NA.
data$Status <- factor(
  data$Status,
  levels = c("Baik", "Tercemar ringan", "Tercemar berat")
)
# Class counts after cleaning.
table(data$Status)
## 
##            Baik Tercemar ringan  Tercemar berat 
##              72             221               7
# Descriptive statistics of the cleaned numeric columns.
summary_stats <- summary(select(data, all_of(num_vars)))
print(summary_stats)
##        pH              DO             BOD             TSS       
##  Min.   :6.186   Min.   :4.334   Min.   :1.709   Min.   :33.58  
##  1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.460   1st Qu.:44.28  
##  Median :6.988   Median :5.976   Median :3.001   Median :49.70  
##  Mean   :6.988   Mean   :5.975   Mean   :3.004   Mean   :49.67  
##  3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.532   3rd Qu.:55.62  
##  Max.   :7.739   Max.   :7.495   Max.   :4.337   Max.   :65.20  
##       Suhu      
##  Min.   :24.99  
##  1st Qu.:26.62  
##  Median :28.01  
##  Mean   :28.11  
##  3rd Qu.:29.46  
##  Max.   :31.40
# Render the summary of the cleaned data (all columns) as a captioned table.
knitr::kable(summary(data), caption = "Ringkasan Data Setelah Pembersihan")
Ringkasan Data Setelah Pembersihan
Lokasi pH DO BOD TSS Suhu Status
Length:300 Min. :6.186 Min. :4.334 Min. :1.709 Min. :33.58 Min. :24.99 Baik : 72
Class :character 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.460 1st Qu.:44.28 1st Qu.:26.62 Tercemar ringan:221
Mode :character Median :6.988 Median :5.976 Median :3.001 Median :49.70 Median :28.01 Tercemar berat : 7
NA Mean :6.988 Mean :5.975 Mean :3.004 Mean :49.67 Mean :28.11 NA
NA 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.532 3rd Qu.:55.62 3rd Qu.:29.46 NA
NA Max. :7.739 Max. :7.495 Max. :4.337 Max. :65.20 Max. :31.40 NA

Data Cleaning dan Eksplorasi

Penjelasan Analisis

Pada tahap ini dilakukan proses pembersihan data (data cleaning) untuk memastikan kualitas dataset sebelum digunakan dalam analisis lanjutan.

Langkah-langkah yang dilakukan meliputi:

  1. Identifikasi Missing Value

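Pemeriksaan nilai hilang dilakukan dengan summary(data) sebelum imputasi dan colSums(is.na(data)) sesudahnya. Nilai hilang hanya ditemukan pada variabel DO (23 nilai), BOD (22 nilai), dan TSS (24 nilai); pH dan Suhu tidak memiliki nilai hilang. Semua nilai hilang diganti dengan nilai rata-rata (mean) masing-masing variabel, sehingga rata-rata tiap variabel tidak berubah, dan colSums(is.na(data)) setelah imputasi bernilai 0 untuk semua kolom.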

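  2. Penanganan Outlier

Outlier terlihat dari ringkasan data, misalnya nilai maksimum Suhu sebesar 90,00 yang jauh di atas kuartil ketiga (29,46). Nilai ekstrem dikoreksi menggunakan metode capping pada kuantil ke-5 dan ke-95 untuk semua variabel numerik: nilai di bawah kuantil ke-5 dinaikkan ke batas tersebut dan nilai di atas kuantil ke-95 diturunkan ke batas tersebut. Dengan demikian tidak ada data yang dihapus, tetapi nilai ekstrem tetap dikendalikan agar tidak memengaruhi hasil model secara berlebihan.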

  3. Standarisasi Penulisan Kategori Status

Kolom Status awalnya memiliki variasi penulisan seperti angka “1”, “2”, “3” maupun huruf “baik”, “Baik”. Semua distandarisasi menjadi tiga kategori:

-Baik

-Tercemar ringan

-Tercemar berat

  4. Statistik Deskriptif Setelah Pembersihan

Setelah data bersih, dilakukan analisis deskriptif terhadap variabel numerik. Hasilnya menunjukkan:

-Nilai pH rata-rata berada di kisaran 6–7, menunjukkan kondisi air relatif netral.

-Nilai DO (Dissolved Oxygen) setelah capping berkisar 4,3–7,5 mg/L dengan rata-rata sekitar 6,0 mg/L, yang menunjukkan kualitas air cukup baik.

-Nilai BOD dan TSS bervariasi antar lokasi, menandakan adanya perbedaan tingkat pencemaran.

-Suhu air setelah capping berada di rentang 25–31°C, sesuai karakteristik perairan tropis.

# Reproducible 80/20 split, stratified on Status by createDataPartition().
# NOTE(review): the imputation means and capping quantiles above were computed
# on the full data set before this split, so the test rows influenced them.
set.seed(12345)
train_idx <- caret::createDataPartition(y = data$Status, p = 0.8, list = FALSE)
trainData <- data[train_idx, ]
testData <- data[-train_idx, ]
# Row counts of the two partitions.
nrow(trainData)
nrow(testData)
## [1] 241
## [1] 59
# Classification tree on the five numeric measurements.
tree_model <- rpart::rpart(
  Status ~ pH + DO + BOD + TSS + Suhu,
  data = trainData,
  method = "class"
)
# Predicted class labels for the held-out rows.
pred_tree <- predict(tree_model, newdata = testData, type = "class")
# Confusion matrix and per-class statistics against the true labels.
cm_tree <- caret::confusionMatrix(data = pred_tree, reference = testData$Status)
cm_tree
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              12               1              0
##   Tercemar ringan    2              43              1
##   Tercemar berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8354, 0.9812)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.000233        
##                                           
##                   Kappa : 0.8149          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.8571                 0.9773               0.00000
## Specificity               0.9778                 0.8000               1.00000
## Pos Pred Value            0.9231                 0.9348                   NaN
## Neg Pred Value            0.9565                 0.9231               0.98305
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.2034                 0.7288               0.00000
## Detection Prevalence      0.2203                 0.7797               0.00000
## Balanced Accuracy         0.9175                 0.8886               0.50000
# Random forest with 500 trees; the seed is reset so the fit is reproducible.
set.seed(12345)
rf_model <- randomForest::randomForest(
  Status ~ pH + DO + BOD + TSS + Suhu,
  data = trainData,
  ntree = 500
)
# Predicted class labels for the held-out rows.
pred_rf <- predict(rf_model, newdata = testData)
# Confusion matrix and per-class statistics against the true labels.
cm_rf <- caret::confusionMatrix(data = pred_rf, reference = testData$Status)
cm_rf
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              12               2              0
##   Tercemar ringan    2              42              1
##   Tercemar berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9153          
##                  95% CI : (0.8132, 0.9719)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.0009347       
##                                           
##                   Kappa : 0.7739          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.8571                 0.9545               0.00000
## Specificity               0.9556                 0.8000               1.00000
## Pos Pred Value            0.8571                 0.9333                   NaN
## Neg Pred Value            0.9556                 0.8571               0.98305
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.2034                 0.7119               0.00000
## Detection Prevalence      0.2373                 0.7627               0.00000
## Balanced Accuracy         0.9063                 0.8773               0.50000
# Support vector machine with e1071's default settings; class-probability
# estimates are switched off.
svm_model <- e1071::svm(
  Status ~ pH + DO + BOD + TSS + Suhu,
  data = trainData,
  probability = FALSE
)
# Predicted class labels for the held-out rows.
pred_svm <- predict(svm_model, newdata = testData)
# Confusion matrix and per-class statistics against the true labels.
cm_svm <- caret::confusionMatrix(data = pred_svm, reference = testData$Status)
cm_svm
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              10               3              0
##   Tercemar ringan    4              41              1
##   Tercemar berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8644          
##                  95% CI : (0.7502, 0.9396)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.02096         
##                                           
##                   Kappa : 0.6298          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.7143                 0.9318               0.00000
## Specificity               0.9333                 0.6667               1.00000
## Pos Pred Value            0.7692                 0.8913                   NaN
## Neg Pred Value            0.9130                 0.7692               0.98305
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.1695                 0.6949               0.00000
## Detection Prevalence      0.2203                 0.7797               0.00000
## Balanced Accuracy         0.8238                 0.7992               0.50000

Klasifikasi Status Kualitas Air

Penjelasan Analisis

Bagian ini bertujuan untuk mengklasifikasikan status kualitas air berdasarkan parameter numerik (pH, DO, BOD, TSS, Suhu). Dataset dibagi menjadi dua bagian, yaitu data training (241 baris, sekitar 80%) dan data testing (59 baris, sekitar 20%) secara acak menggunakan fungsi createDataPartition(), yang melakukan pembagian terstratifikasi sehingga proporsi setiap kelas pada data training dan data testing tetap serupa.

Tiga metode klasifikasi diterapkan dan dibandingkan:

  1. Support Vector Machine (SVM)

Model SVM dibangun menggunakan kernel radial (kernel bawaan e1071::svm) untuk memisahkan kelas-kelas pada data yang tidak linier. Pada data testing model ini mencapai akurasi 86,44% (Kappa 0,63), terendah di antara ketiga model. SVM paling baik mengenali kategori “Tercemar ringan” (sensitivitas 0,93), sedangkan sensitivitas untuk kategori “Baik” hanya 0,71.

  2. Decision Tree

Model Decision Tree menyusun aturan keputusan berdasarkan nilai ambang pada setiap variabel prediktor. Contoh pola yang ditemukan:

- Jika DO tinggi dan BOD rendah → kemungkinan “Baik”

- Jika TSS tinggi dan pH rendah → kemungkinan “Tercemar berat”

Model ini mudah diinterpretasi namun cenderung lebih sensitif terhadap variasi data. Pada data testing, Decision Tree mencapai akurasi 93,22% (Kappa 0,81), tertinggi di antara ketiga model, tetapi tidak menghasilkan satu pun prediksi “Tercemar berat”.

  3. Random Forest

Model Random Forest menggabungkan banyak pohon keputusan (ensemble learning, di sini 500 pohon) untuk memperoleh hasil yang lebih stabil. Hasil evaluasi menunjukkan akurasi 91,53% (Kappa 0,77) pada data testing, sedikit di bawah Decision Tree (93,22%) dan di atas SVM (86,44%). Kelebihan Random Forest adalah kemampuannya mengurangi overfitting dan memberikan prediksi yang lebih konsisten.

Evaluasi Model

Evaluasi dilakukan menggunakan confusion matrix dan akurasi (Accuracy). Berdasarkan hasil evaluasi:

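-Decision Tree memiliki akurasi tertinggi, yaitu 93,22% (55 dari 59 data testing diklasifikasikan dengan benar).

-Random Forest berada sedikit di bawahnya dengan akurasi 91,53% (54 dari 59).

-SVM memiliki akurasi terendah, yaitu 86,44% (51 dari 59).

-Ketiga model tidak pernah memprediksi kelas “Tercemar berat” (sensitivitas 0), kemungkinan karena kelas ini hanya memiliki 7 observasi di seluruh data.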

Kesimpulan Klasifikasi

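Berdasarkan akurasi pada data testing, model terbaik untuk klasifikasi status kualitas air adalah Decision Tree (93,22%), disusul Random Forest (91,53%) dan SVM (86,44%). Selisih Decision Tree dan Random Forest hanya satu observasi dari 59 data testing dan selang kepercayaan 95% keduanya saling tumpang-tindih, sehingga keunggulan tersebut belum dapat dianggap meyakinkan. Selain itu, tidak ada model yang mampu mengenali kelas “Tercemar berat”.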

# Ordinary least squares: DO explained by the four other measurements.
# NOTE(review): DO itself was mean-imputed and capped during cleaning, so part
# of the response consists of filled-in values.
lm_model <- lm(
  formula = DO ~ pH + BOD + TSS + Suhu,
  data = trainData
)
summary(lm_model)
## 
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = trainData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.69919 -0.54988 -0.02979  0.66405  1.65449 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.753856   1.323395   5.103 6.86e-07 ***
## pH           0.012334   0.129587   0.095    0.924    
## BOD          0.008738   0.083078   0.105    0.916    
## TSS          0.002394   0.006831   0.350    0.726    
## Suhu        -0.036150   0.030911  -1.169    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8918 on 236 degrees of freedom
## Multiple R-squared:  0.00641,    Adjusted R-squared:  -0.01043 
## F-statistic: 0.3806 on 4 and 236 DF,  p-value: 0.8224
# Test-set predictions of the linear model.
pred_lm <- predict(object = lm_model, newdata = testData)
# Additive model with a natural cubic spline basis (df = 4) for each predictor.
spline_model <- lm(
  formula = DO ~ ns(pH, df = 4) + ns(BOD, df = 4) + ns(TSS, df = 4) + ns(Suhu, df = 4),
  data = trainData
)
summary(spline_model)
## 
## Call:
## lm(formula = DO ~ ns(pH, df = 4) + ns(BOD, df = 4) + ns(TSS, 
##     df = 4) + ns(Suhu, df = 4), data = trainData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.93389 -0.52157  0.00543  0.64835  1.76386 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.80527    0.41605  13.953   <2e-16 ***
## ns(pH, df = 4)1    0.20708    0.26291   0.788    0.432    
## ns(pH, df = 4)2   -0.25754    0.26898  -0.957    0.339    
## ns(pH, df = 4)3    0.02911    0.51720   0.056    0.955    
## ns(pH, df = 4)4    0.12441    0.23774   0.523    0.601    
## ns(BOD, df = 4)1  -0.15565    0.27406  -0.568    0.571    
## ns(BOD, df = 4)2   0.26845    0.28264   0.950    0.343    
## ns(BOD, df = 4)3   0.59515    0.63136   0.943    0.347    
## ns(BOD, df = 4)4  -0.20123    0.26119  -0.770    0.442    
## ns(TSS, df = 4)1   0.19955    0.25308   0.788    0.431    
## ns(TSS, df = 4)2   0.26369    0.26635   0.990    0.323    
## ns(TSS, df = 4)3   0.30524    0.52585   0.580    0.562    
## ns(TSS, df = 4)4   0.01792    0.25897   0.069    0.945    
## ns(Suhu, df = 4)1 -0.36624    0.27698  -1.322    0.187    
## ns(Suhu, df = 4)2 -0.24612    0.25918  -0.950    0.343    
## ns(Suhu, df = 4)3 -0.22913    0.52089  -0.440    0.660    
## ns(Suhu, df = 4)4 -0.12122    0.24318  -0.498    0.619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9008 on 224 degrees of freedom
## Multiple R-squared:  0.03773,    Adjusted R-squared:  -0.031 
## F-statistic: 0.5489 on 16 and 224 DF,  p-value: 0.9182
# Test-set predictions of the spline model.
pred_spline <- predict(spline_model, newdata = testData)
#' Regression error metrics for a set of predictions
#'
#' @param actual Numeric vector of observed values.
#' @param pred Numeric vector of predicted values, same length as `actual`.
#' @return A list with elements `MSE`, `RMSE` and `R2`. `R2` is
#'   1 - SSE / SST; it is `NA` when `actual` has zero variance (SST = 0) or
#'   when the inputs contain `NA`.
calc_metrics <- function(actual, pred) {
  stopifnot(is.numeric(actual), is.numeric(pred))
  # Guard against silent recycling of a shorter vector.
  if (length(actual) != length(pred)) {
    stop("`actual` and `pred` must have the same length.", call. = FALSE)
  }
  squared_error <- (actual - pred)^2
  # Same definitions as Metrics::mse() / Metrics::rmse().
  mse_v <- mean(squared_error)
  rmse_v <- sqrt(mse_v)
  total_ss <- sum((actual - mean(actual))^2)
  # A constant `actual` would otherwise divide by zero.
  r2_v <- if (isTRUE(total_ss > 0)) 1 - sum(squared_error) / total_ss else NA_real_
  list(MSE = mse_v, RMSE = rmse_v, R2 = r2_v)
}

# Test-set MSE, RMSE and R2 for both DO models.
metrics_lm <- calc_metrics(actual = testData$DO, pred = pred_lm)
metrics_spline <- calc_metrics(actual = testData$DO, pred = pred_spline)
metrics_lm
metrics_spline
## $MSE
## [1] 0.6612566
## 
## $RMSE
## [1] 0.8131769
## 
## $R2
## [1] -0.03960543
## $MSE
## [1] 0.6729157
## 
## $RMSE
## [1] 0.8203144
## 
## $R2
## [1] -0.05793541
# Linear regression: actual vs predicted DO, with the y = x line in red.
plot(
  x = testData$DO, y = pred_lm,
  main = "Aktual vs Prediksi - Linear",
  xlab = "DO Aktual", ylab = "DO Prediksi"
)
abline(a = 0, b = 1, col = "red")

# Spline regression: actual vs predicted DO, with the y = x line in blue.
plot(
  x = testData$DO, y = pred_spline,
  main = "Aktual vs Prediksi - Spline",
  xlab = "DO Aktual", ylab = "DO Prediksi"
)
abline(a = 0, b = 1, col = "blue")

# Coefficient table of the linear model, sorted by decreasing |t value|.
coef_summary <- coef(summary(lm_model))
coef_summary[
  order(abs(coef_summary[, "t value"]), decreasing = TRUE),
]
##                 Estimate  Std. Error     t value     Pr(>|t|)
## (Intercept)  6.753856070 1.323395472  5.10342994 6.861307e-07
## Suhu        -0.036149886 0.030910665 -1.16949558 2.433838e-01
## TSS          0.002394093 0.006830935  0.35047810 7.262925e-01
## BOD          0.008738164 0.083077877  0.10518039 9.163220e-01
## pH           0.012334278 0.129587382  0.09518116 9.242517e-01
# Predictions reported in the final table.
# NOTE(review): on the test set the spline model has a slightly higher RMSE
# than the linear model, and the random forest a slightly lower accuracy than
# the decision tree — confirm these are the intended "final" models.
pred_final_do <- pred_spline
pred_final_status <- pred_rf

# Test rows with the predicted DO (3 decimals) and predicted status appended.
output <- testData %>%
  select(Lokasi, pH, DO, BOD, TSS, Suhu) %>%
  mutate(
    Prediksi_DO = round(pred_final_do, 3),
    Prediksi_Status = as.character(pred_final_status)
  )

# Take the row count from the data instead of hard-coding it, so the banner
# always matches the actual size of the test set.
print(paste0("=== Hasil Prediksi ", nrow(output), " Data Testing ==="))
print(head(output, 10))
## # A tibble: 10 × 8
##    Lokasi    pH    DO   BOD   TSS  Suhu Prediksi_DO Prediksi_Status
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl> <chr>          
##  1 S2      6.72  5.72  1.71  44.3  27.7        5.77 Tercemar ringan
##  2 S16     7.32  5.98  2.66  35.9  28.6        5.61 Tercemar ringan
##  3 S22     6.19  6.44  2.22  45.2  31.4        6.09 Baik           
##  4 S27     6.87  5.13  3.14  60.6  29.2        5.89 Tercemar ringan
##  5 S31     7.23  5.00  2.82  51.7  27.0        5.89 Tercemar ringan
##  6 S40     7.02  5.22  4.34  53.5  27.7        5.93 Tercemar ringan
##  7 S47     6.59  4.55  1.71  49.7  27.2        5.86 Tercemar ringan
##  8 S53     7.74  6.44  3.17  65.2  27.8        5.81 Tercemar ringan
##  9 S57     7.34  5.47  2.64  34.5  29.0        5.61 Tercemar ringan
## 10 S60     7.14  5.96  3.00  40.7  29.2        5.66 Tercemar ringan
# Report how many test rows received a prediction.
print(paste("Total baris prediksi:", nrow(output)))
## [1] "Total baris prediksi: 59"
library(knitr)
library(kableExtra)
# Render the full prediction table, centred, with hover and striped styling.
kable_styling(
  kable(output, caption = "Hasil Prediksi Data Testing (DO dan Status)", align = "c"),
  full_width = FALSE,
  bootstrap_options = c("hover", "striped")
)
Hasil Prediksi Data Testing (DO dan Status)
Lokasi pH DO BOD TSS Suhu Prediksi_DO Prediksi_Status
S2 6.717700 5.723600 1.709075 44.29630 27.72840 5.769 Tercemar ringan
S16 7.318000 5.976294 2.655000 35.88070 28.59680 5.611 Tercemar ringan
S22 6.186315 6.442000 2.224900 45.15410 31.40312 6.091 Baik
S27 6.871400 5.130700 3.143200 60.62980 29.24360 5.894 Tercemar ringan
S31 7.227700 5.001300 2.817300 51.66110 26.97700 5.889 Tercemar ringan
S40 7.018100 5.222600 4.337300 53.47180 27.72340 5.933 Tercemar ringan
S47 6.594300 4.546500 1.709075 49.69810 27.15930 5.864 Tercemar ringan
S53 7.739270 6.438400 3.173900 65.20408 27.78300 5.808 Tercemar ringan
S57 7.339600 5.470600 2.637700 34.54670 29.01480 5.613 Tercemar ringan
S60 7.142400 5.959500 3.000522 40.67000 29.19630 5.655 Tercemar ringan
S65 6.636400 7.441900 3.000522 33.58135 28.78970 5.576 Baik
S68 7.519300 7.429300 3.609000 57.89640 30.95900 6.081 Tercemar ringan
S71 6.478400 5.415000 1.709075 64.68350 27.31040 5.723 Tercemar ringan
S72 6.954900 6.321000 2.811300 41.23540 25.84600 6.095 Baik
S74 6.523200 5.721500 2.469300 53.37840 31.40312 6.216 Tercemar ringan
S75 6.728600 6.546100 2.744800 54.40820 27.34820 6.050 Baik
S78 7.231900 6.171000 1.709075 53.23310 29.84700 5.751 Baik
S92 6.761900 7.309100 2.464900 65.20408 29.51920 6.012 Baik
S96 6.569600 5.990900 2.017900 54.38700 27.43680 6.059 Baik
S114 6.747900 5.464400 3.161600 51.60210 29.40330 5.903 Tercemar ringan
S129 6.387600 6.136000 3.769100 45.83470 27.72500 5.975 Tercemar ringan
S138 7.600700 4.871000 3.149400 65.19420 26.58510 5.840 Tercemar ringan
S139 6.765100 5.912100 3.060900 45.67100 29.28830 5.812 Tercemar ringan
S145 6.793100 5.961800 2.173200 60.52220 31.08570 6.217 Tercemar ringan
S150 6.471800 7.494825 4.337300 33.58135 27.89880 5.592 Tercemar ringan
S158 6.599900 5.976294 3.088100 52.89640 29.07260 5.831 Tercemar ringan
S160 7.643800 5.725800 1.753700 41.82610 27.91300 5.713 Tercemar ringan
S161 6.912200 6.162400 3.173000 49.71250 31.40312 6.043 Tercemar ringan
S162 6.464100 5.935400 3.150100 49.69810 31.09610 5.911 Tercemar ringan
S170 7.447800 6.513800 2.444900 48.87410 27.59430 5.927 Baik
S172 7.418300 6.763100 2.510600 48.33140 24.99262 6.154 Baik
S176 6.924600 4.438700 3.755300 38.14450 26.13650 6.213 Tercemar ringan
S180 6.996800 6.750200 3.675800 48.64090 30.82970 6.193 Tercemar ringan
S186 6.331900 6.587700 3.000522 65.20408 30.37240 5.716 Baik
S193 7.126300 5.926600 3.992800 57.27430 28.76190 5.990 Tercemar ringan
S202 7.166900 7.494825 3.000522 43.89030 27.82830 5.672 Baik
S208 6.473000 4.333970 3.967300 38.73100 30.15970 5.912 Tercemar ringan
S216 7.018700 5.987200 4.337300 50.89350 29.66480 5.924 Tercemar ringan
S218 7.738400 6.598500 1.709075 34.59620 27.20270 5.757 Baik
S220 6.358200 5.751500 4.135300 58.59790 28.22930 5.900 Tercemar ringan
S226 6.913000 5.751500 3.426300 45.95360 26.48970 6.189 Tercemar ringan
S228 6.882800 5.976294 3.448100 57.19470 28.29450 6.071 Tercemar ringan
S230 7.625100 5.339500 1.709075 54.75960 26.15870 6.035 Tercemar ringan
S232 7.474000 4.377000 2.687000 39.81140 27.75920 5.705 Tercemar ringan
S234 6.766900 5.488400 2.498400 44.61690 24.99262 6.328 Tercemar ringan
S235 6.865300 5.976294 1.709075 49.69810 26.25140 6.069 Tercemar ringan
S239 7.122100 6.273400 3.732500 41.57140 29.81530 5.974 Tercemar ringan
S240 6.528800 7.130800 3.838800 43.30130 27.35040 6.011 Tercemar ringan
S247 7.508400 6.747500 2.210700 50.15560 27.41060 5.988 Baik
S251 6.451900 6.630800 4.337300 47.69430 26.31250 5.991 Tercemar ringan
S253 6.400800 6.452100 3.776600 53.84950 28.02210 6.035 Tercemar ringan
S254 7.095000 6.368000 3.301600 53.46720 30.42990 6.010 Tercemar ringan
S259 6.491200 5.976294 2.321700 59.44620 30.18230 6.128 Tercemar ringan
S261 7.436400 5.338000 2.761200 42.99330 31.24160 5.808 Tercemar ringan
S276 7.655000 6.844800 3.310800 58.77290 31.40312 6.074 Tercemar ringan
S279 7.690800 6.563800 2.136200 46.05130 29.61470 6.054 Baik
S281 7.412000 4.499800 3.231000 60.96240 28.97610 5.680 Tercemar ringan
S294 7.739270 4.488300 1.709075 53.69540 25.45220 6.183 Tercemar ringan
S298 7.431300 5.826100 2.421000 42.91330 27.75250 5.848 Tercemar ringan

Prediksi Variabel DO

Penjelasan Analisis

Tahap ini bertujuan untuk memprediksi nilai DO (Dissolved Oxygen) menggunakan variabel pH, BOD, TSS, dan Suhu sebagai prediktor.

  1. Regresi Linear

Model regresi linear membentuk hubungan linier antara DO dan variabel-variabel prediktor. Hasil summary(lm_model) menunjukkan bahwa:

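-Koefisien pH bernilai positif (0,012) tetapi tidak signifikan (p = 0,924).

-Koefisien BOD (0,009; p = 0,916) dan TSS (0,002; p = 0,726) juga bernilai positif dan tidak signifikan, sehingga data ini tidak mendukung dugaan bahwa BOD dan TSS yang lebih tinggi menurunkan oksigen terlarut.

-Koefisien Suhu bernilai negatif (−0,036), searah dengan teori bahwa air hangat memiliki kelarutan oksigen lebih rendah, tetapi juga tidak signifikan (p = 0,243). Secara keseluruhan model hampir tidak menjelaskan variasi DO (R² = 0,006; uji F p = 0,822).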

  2. Regresi Spline

Model regresi spline digunakan untuk menangkap hubungan non-linier antara variabel prediktor dan DO. Dengan basis fungsi natural spline (df=4), model ini dapat menyesuaikan pola data yang tidak sepenuhnya linier. Namun, evaluasi pada data testing menunjukkan model spline menghasilkan:

-RMSE sedikit lebih besar (0,820 dibanding 0,813 pada model linear)

-R² lebih rendah (−0,058 dibanding −0,040 pada model linear), menandakan bahwa Spline tidak lebih akurat daripada model linear dalam memprediksi DO.

  3. Evaluasi Model

Hasil evaluasi model berdasarkan tiga metrik utama:

-MSE (Mean Squared Error) menunjukkan tingkat kesalahan kuadrat.

-RMSE (Root Mean Squared Error) menunjukkan rata-rata deviasi prediksi.

-R² (Koefisien Determinasi) menunjukkan seberapa baik model menjelaskan variasi data.

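-Nilai RMSE untuk regresi spline (0,820) sedikit lebih besar dari regresi linear (0,813), sementara nilai R²-nya lebih kecil (−0,058 dibanding −0,040). R² yang negatif pada kedua model berarti prediksinya lebih buruk daripada sekadar menebak rata-rata DO pada data testing, sehingga tidak ada model yang baik dalam memprediksi nilai DO.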

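  4. Visualisasi Prediksi vs Aktual

Plot aktual vs prediksi membandingkan titik-titik data dengan garis diagonal (y = x); semakin dekat titik ke garis tersebut, semakin akurat prediksinya. Pada data testing, nilai prediksi spline hanya berkisar 5,58–6,33 sedangkan DO aktual berkisar 4,33–7,49, sehingga titik-titik cenderung membentuk pita mendatar di sekitar rata-rata dan tidak mengikuti garis diagonal. Hal ini sejalan dengan nilai R² yang negatif pada kedua model.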

  5. Variabel yang Paling Mempengaruhi DO

Dari hasil summary(lm_model), variabel dengan p-value paling kecil dan nilai |t| terbesar adalah Suhu (t = −1,169; p = 0,243), disusul TSS (t = 0,350), BOD (t = 0,105), dan pH (t = 0,095). Tidak ada satu pun prediktor yang signifikan pada taraf 5%, sehingga data ini tidak memberikan bukti bahwa salah satu variabel berpengaruh kuat terhadap kadar oksigen terlarut.

Kesimpulan Akhir

Secara keseluruhan, hasil analisis menunjukkan bahwa:

  1. Data Cleaning berhasil dilakukan dengan baik melalui imputasi mean, penanganan outlier (capping), dan standarisasi kategori.

  2. Model klasifikasi dengan akurasi tertinggi pada data testing adalah Decision Tree (93,22%), disusul Random Forest (91,53%) dan SVM (86,44%); tidak ada model yang mampu mengenali kelas “Tercemar berat”.

  3. Kedua model prediksi DO berkinerja buruk (R² negatif pada data testing); Regresi Linear sedikit lebih baik daripada Regresi Spline (RMSE 0,813 dibanding 0,820).

  4. Tidak ada variabel yang berpengaruh signifikan terhadap DO; Suhu memiliki |t| terbesar (p = 0,243) dengan arah hubungan negatif.

  5. Secara umum, metode pembelajaran mesin cukup efektif untuk mengklasifikasikan status kualitas air sungai, sedangkan prediksi DO dari pH, BOD, TSS, dan Suhu belum dapat diandalkan pada data ini.