library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(splines)
# Load the water-quality workbook into `data`.
# NOTE(review): the path is an absolute, user-specific Windows path, so this
# line only works on a machine where that exact file exists.
data <- read_excel("C:/Users/Ziglar/Downloads/kualitasair.xlsx")
# Print the column types and the first few values of every column.
str(data)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
## $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
## $ pH : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
## $ DO : num [1:300] NA 5.72 4.89 6.13 7.79 ...
## $ BOD : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
## $ TSS : num [1:300] 43.1 44.3 NA 41 48.1 ...
## $ Suhu : num [1:300] 26.8 27.7 26 29.7 26.4 ...
## $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
summary(data)
## Lokasi pH DO BOD
## Length:300 Min. :5.503 Min. :2.982 Min. :0.3026
## Class :character 1st Qu.:6.670 1st Qu.:5.375 1st Qu.:2.3573
## Mode :character Median :6.988 Median :5.991 Median :3.0661
## Mean :6.989 Mean :5.976 Mean :3.0005
## 3rd Qu.:7.318 3rd Qu.:6.688 3rd Qu.:3.5781
## Max. :8.351 Max. :9.229 Max. :5.7962
## NA's :23 NA's :22
## TSS Suhu Status
## Min. :24.65 Min. :22.77 Length:300
## 1st Qu.:43.73 1st Qu.:26.62 Class :character
## Median :49.52 Median :28.01 Mode :character
## Mean :49.70 Mean :28.31
## 3rd Qu.:56.44 3rd Qu.:29.46
## Max. :76.34 Max. :90.00
## NA's :24
# Packages used by the analysis; any that are missing are installed first.
required <- c(
  "readxl", "dplyr", "ggplot2", "caret", "e1071", "rpart", "randomForest",
  "splines", "Metrics", "knitr", "kableExtra"
)
# requireNamespace() tests each package directly instead of scanning the whole
# library with installed.packages(), which is slow and not meant for this check.
installed <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
if (any(!installed)) install.packages(required[!installed])
# library() returns the search path; invisible() keeps lapply() from printing
# one copy of it per package into the report.
invisible(lapply(required, library, character.only = TRUE))
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Read the workbook again from the same hard-coded path used earlier in this
# file, overwriting `data` with a fresh, uncleaned copy.
data <- readxl::read_excel("C:/Users/Ziglar/Downloads/kualitasair.xlsx")
# Print the column types and the first few values of every column.
str(data)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
## $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
## $ pH : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
## $ DO : num [1:300] NA 5.72 4.89 6.13 7.79 ...
## $ BOD : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
## $ TSS : num [1:300] 43.1 44.3 NA 41 48.1 ...
## $ Suhu : num [1:300] 26.8 27.7 26 29.7 26.4 ...
## $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
summary(data)
## Lokasi pH DO BOD
## Length:300 Min. :5.503 Min. :2.982 Min. :0.3026
## Class :character 1st Qu.:6.670 1st Qu.:5.375 1st Qu.:2.3573
## Mode :character Median :6.988 Median :5.991 Median :3.0661
## Mean :6.989 Mean :5.976 Mean :3.0005
## 3rd Qu.:7.318 3rd Qu.:6.688 3rd Qu.:3.5781
## Max. :8.351 Max. :9.229 Max. :5.7962
## NA's :23 NA's :22
## TSS Suhu Status
## Min. :24.65 Min. :22.77 Length:300
## 1st Qu.:43.73 1st Qu.:26.62 Class :character
## Median :49.52 Median :28.01 Mode :character
## Mean :49.70 Mean :28.31
## 3rd Qu.:56.44 3rd Qu.:29.46
## Max. :76.34 Max. :90.00
## NA's :24
head(data)
## # A tibble: 6 × 7
## Lokasi pH DO BOD TSS Suhu Status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar ringan
# Numeric water-quality columns handled by the cleaning steps.
num_vars <- c("pH", "DO", "BOD", "TSS", "Suhu")
# Mean imputation: in every numeric column each NA is replaced by the mean of
# that column's observed (non-missing) values.
data[num_vars] <- lapply(data[num_vars], function(col) {
  replace(col, is.na(col), mean(col, na.rm = TRUE))
})
# Remaining NA count per column after imputation.
colSums(is.na(data))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 0 0 0 0 0
#' Cap (winsorise) a numeric vector at two quantiles
#'
#' Values below the lower quantile are raised to it and values above the upper
#' quantile are lowered to it. Missing values are ignored when the quantiles
#' are computed and are returned unchanged.
#'
#' @param x A numeric vector.
#' @param probs Length-2 numeric vector with the lower and upper quantile
#'   probabilities. Defaults to the 5th and 95th percentiles.
#' @return A vector the same length as `x` with extreme values capped.
cap_outliers <- function(x, probs = c(0.05, 0.95)) {
  stopifnot(
    is.numeric(probs),
    length(probs) == 2L,
    !anyNA(probs),
    probs[1] <= probs[2]
  )
  # names = FALSE keeps the "5%"/"95%" labels out of the result.
  bounds <- quantile(x, probs = probs, na.rm = TRUE, names = FALSE)
  pmin(pmax(x, bounds[1]), bounds[2])
}
# Cap every numeric column with cap_outliers().
data <- mutate(data, across(all_of(num_vars), cap_outliers))

# Canonical label for every accepted raw spelling: numeric codes "1"-"3" and
# the lower-cased text forms.
status_labels <- c(
  "1" = "Baik", "2" = "Tercemar ringan", "3" = "Tercemar berat",
  "baik" = "Baik", "tercemar ringan" = "Tercemar ringan", "tercemar berat" = "Tercemar berat"
)
# Strip surrounding whitespace and lower-case before the lookup so spelling
# variants map to the same key.
status_key <- tolower(trimws(as.character(data$Status)))
# Keys without an entry in the lookup end up as NA in the factor.
data$Status <- factor(
  unname(status_labels[status_key]),
  levels = c("Baik", "Tercemar ringan", "Tercemar berat")
)
# Class counts after normalisation.
table(data$Status)
##
## Baik Tercemar ringan Tercemar berat
## 72 221 7
# Descriptive statistics of the cleaned numeric columns.
numeric_cols <- select(data, all_of(num_vars))
summary_stats <- summary(numeric_cols)
print(summary_stats)
## pH DO BOD TSS
## Min. :6.186 Min. :4.334 Min. :1.709 Min. :33.58
## 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.460 1st Qu.:44.28
## Median :6.988 Median :5.976 Median :3.001 Median :49.70
## Mean :6.988 Mean :5.975 Mean :3.004 Mean :49.67
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.532 3rd Qu.:55.62
## Max. :7.739 Max. :7.495 Max. :4.337 Max. :65.20
## Suhu
## Min. :24.99
## 1st Qu.:26.62
## Median :28.01
## Mean :28.11
## 3rd Qu.:29.46
## Max. :31.40
knitr::kable(summary(data), caption = "Ringkasan Data Setelah Pembersihan")
Lokasi | pH | DO | BOD | TSS | Suhu | Status | |
---|---|---|---|---|---|---|---|
Length:300 | Min. :6.186 | Min. :4.334 | Min. :1.709 | Min. :33.58 | Min. :24.99 | Baik : 72 | |
Class :character | 1st Qu.:6.670 | 1st Qu.:5.413 | 1st Qu.:2.460 | 1st Qu.:44.28 | 1st Qu.:26.62 | Tercemar ringan:221 | |
Mode :character | Median :6.988 | Median :5.976 | Median :3.001 | Median :49.70 | Median :28.01 | Tercemar berat : 7 | |
NA | Mean :6.988 | Mean :5.975 | Mean :3.004 | Mean :49.67 | Mean :28.11 | NA | |
NA | 3rd Qu.:7.318 | 3rd Qu.:6.611 | 3rd Qu.:3.532 | 3rd Qu.:55.62 | 3rd Qu.:29.46 | NA | |
NA | Max. :7.739 | Max. :7.495 | Max. :4.337 | Max. :65.20 | Max. :31.40 | NA |
Pada tahap ini dilakukan proses pembersihan data (data cleaning) untuk memastikan kualitas dataset sebelum digunakan dalam analisis lanjutan.
Langkah-langkah yang dilakukan meliputi:
Pemeriksaan nilai hilang dilakukan dengan fungsi colSums(is.na(data)). Ditemukan beberapa missing value pada variabel numerik seperti pH, DO, BOD, TSS, dan Suhu. Semua nilai hilang diganti menggunakan nilai rata-rata (mean) dari masing-masing variabel agar distribusi data tetap stabil.
Deteksi outlier dilakukan dengan analisis kuantil dan boxplot. Nilai ekstrem dikoreksi menggunakan metode capping pada kuantil ke-5 dan ke-95, sehingga tidak ada data yang dihapus, tetapi nilai ekstrem tetap dikendalikan agar tidak memengaruhi hasil model secara berlebihan.
Kolom Status awalnya memiliki variasi penulisan seperti angka “1”, “2”, “3” maupun huruf “baik”, “Baik”. Semua distandarisasi menjadi tiga kategori:
-Baik
-Tercemar ringan
-Tercemar berat
Setelah data bersih, dilakukan analisis deskriptif terhadap variabel numerik. Hasilnya menunjukkan:
-Nilai pH rata-rata sekitar 7 (rentang setelah capping 6,19–7,74), menunjukkan kondisi air relatif netral.
-Nilai DO (Dissolved Oxygen) berkisar 4,3–7,5 mg/L dengan rata-rata sekitar 6 mg/L, yang menunjukkan kualitas air cukup baik.
-Nilai BOD dan TSS bervariasi antar lokasi, menandakan adanya perbedaan tingkat pencemaran.
-Suhu air berada di rentang 25–31°C setelah capping, sesuai karakteristik perairan tropis.
# Fix the RNG seed so the train/test split is reproducible.
set.seed(12345)
# Draw the row indices of an 80% training partition based on the Status
# outcome; list = FALSE returns the indices as a matrix instead of a list.
train_idx <- caret::createDataPartition(data$Status, p = 0.8, list = FALSE)
trainData <- data[train_idx, ]
# All rows not drawn for training form the test set.
testData <- data[-train_idx, ]
# Report both partition sizes.
nrow(trainData); nrow(testData)
## [1] 241
## [1] 59
# Classification tree for Status using the five numeric measurements.
tree_model <- rpart::rpart(Status ~ pH + DO + BOD + TSS + Suhu, data = trainData, method = "class")
# Predicted class labels for the held-out test rows.
pred_tree <- predict(tree_model, testData, type = "class")
# Confusion matrix and per-class statistics against the true test labels.
cm_tree <- caret::confusionMatrix(pred_tree, testData$Status)
cm_tree
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 12 1 0
## Tercemar ringan 2 43 1
## Tercemar berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9322
## 95% CI : (0.8354, 0.9812)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.000233
##
## Kappa : 0.8149
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 0.8571 0.9773 0.00000
## Specificity 0.9778 0.8000 1.00000
## Pos Pred Value 0.9231 0.9348 NaN
## Neg Pred Value 0.9565 0.9231 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2034 0.7288 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.9175 0.8886 0.50000
# Reset the seed so the forest's random sampling is reproducible.
set.seed(12345)
# Random forest of 500 trees on the same predictors as the single tree.
rf_model <- randomForest::randomForest(Status ~ pH + DO + BOD + TSS + Suhu, data = trainData, ntree = 500)
# Predicted class labels for the held-out test rows.
pred_rf <- predict(rf_model, testData)
# Confusion matrix and per-class statistics against the true test labels.
cm_rf <- caret::confusionMatrix(pred_rf, testData$Status)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 12 2 0
## Tercemar ringan 2 42 1
## Tercemar berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9153
## 95% CI : (0.8132, 0.9719)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.0009347
##
## Kappa : 0.7739
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 0.8571 0.9545 0.00000
## Specificity 0.9556 0.8000 1.00000
## Pos Pred Value 0.8571 0.9333 NaN
## Neg Pred Value 0.9556 0.8571 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2034 0.7119 0.00000
## Detection Prevalence 0.2373 0.7627 0.00000
## Balanced Accuracy 0.9063 0.8773 0.50000
# Support vector machine on the same predictors. No kernel or tuning
# parameters are given, so e1071's defaults apply; probability estimates are
# switched off.
svm_model <- e1071::svm(Status ~ pH + DO + BOD + TSS + Suhu, data = trainData, probability = FALSE)
# Predicted class labels for the held-out test rows.
pred_svm <- predict(svm_model, testData)
# Confusion matrix and per-class statistics against the true test labels.
cm_svm <- caret::confusionMatrix(pred_svm, testData$Status)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 10 3 0
## Tercemar ringan 4 41 1
## Tercemar berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8644
## 95% CI : (0.7502, 0.9396)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.02096
##
## Kappa : 0.6298
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 0.7143 0.9318 0.00000
## Specificity 0.9333 0.6667 1.00000
## Pos Pred Value 0.7692 0.8913 NaN
## Neg Pred Value 0.9130 0.7692 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.1695 0.6949 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.8238 0.7992 0.50000
Bagian ini bertujuan untuk mengklasifikasikan status kualitas air berdasarkan parameter numerik (pH, DO, BOD, TSS, Suhu). Dataset dibagi menjadi dua bagian, yaitu data training (80%) dan data testing (20%) secara acak menggunakan fungsi createDataPartition(), yang melakukan pengambilan sampel terstratifikasi sehingga proporsi setiap kelas pada data training dan testing tetap serupa dengan data asli (bukan menyeimbangkan jumlah antar kelas).
Tiga metode klasifikasi diterapkan dan dibandingkan:
Model SVM dibangun menggunakan kernel radial untuk memisahkan kelas-kelas pada data yang tidak linier. Pada data testing model ini mencapai akurasi 86,44%, terendah di antara ketiga model. Model paling baik mengenali kategori “Tercemar ringan” (sensitivitas 93,2%), sedangkan sensitivitas untuk kategori “Baik” hanya 71,4%.
Model Decision Tree menyusun aturan keputusan berdasarkan nilai ambang pada setiap variabel prediktor. Contoh pola yang ditemukan:
Jika DO tinggi dan BOD rendah → kemungkinan “Baik” Jika TSS tinggi dan pH rendah → kemungkinan “Tercemar berat”
Model ini mudah diinterpretasi namun cenderung lebih sensitif terhadap variasi data.
Model Random Forest menggabungkan banyak pohon keputusan (ensemble learning) untuk memperoleh hasil yang lebih stabil. Pada data testing model ini mencapai akurasi 91,53%, sedikit di bawah Decision Tree (93,22%) dan di atas SVM (86,44%). Kelebihan Random Forest secara umum adalah kemampuannya mengurangi overfitting dan memberikan prediksi yang lebih konsisten.
Evaluasi Model
Evaluasi dilakukan menggunakan confusion matrix dan akurasi (Accuracy). Berdasarkan hasil evaluasi:
-SVM memiliki akurasi terendah (86,44%).
-Decision Tree memberikan akurasi tertinggi (93,22%) pada data testing.
-Random Forest berada di antara keduanya (91,53%).
Kesimpulan Klasifikasi
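Pada data testing ini Decision Tree memberikan akurasi tertinggi (93,22%), diikuti Random Forest (91,53%) dan SVM (86,44%). Selisih Decision Tree dan Random Forest hanya satu observasi (55 berbanding 54 prediksi benar dari 59), sehingga keduanya praktis setara. Selain itu, tidak satu pun model berhasil memprediksi kelas “Tercemar berat” (sensitivitas 0), yang hanya memiliki 7 observasi di seluruh data.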
# Ordinary least-squares regression of DO on the other four measurements,
# fitted on the training partition.
lm_model <- lm(DO ~ pH + BOD + TSS + Suhu, data = trainData)
# Coefficient table, residual summary and overall fit statistics.
summary(lm_model)
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.69919 -0.54988 -0.02979 0.66405 1.65449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.753856 1.323395 5.103 6.86e-07 ***
## pH 0.012334 0.129587 0.095 0.924
## BOD 0.008738 0.083078 0.105 0.916
## TSS 0.002394 0.006831 0.350 0.726
## Suhu -0.036150 0.030911 -1.169 0.243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8918 on 236 degrees of freedom
## Multiple R-squared: 0.00641, Adjusted R-squared: -0.01043
## F-statistic: 0.3806 on 4 and 236 DF, p-value: 0.8224
# Linear-model DO predictions for the held-out test rows.
pred_lm <- predict(lm_model, newdata = testData)
# Additive spline regression: each predictor enters through an ns() basis
# with 4 degrees of freedom, fitted by lm() on the training partition.
spline_model <- lm(DO ~ ns(pH, df=4) + ns(BOD, df=4) + ns(TSS, df=4) + ns(Suhu, df=4), data = trainData)
# Coefficient table, residual summary and overall fit statistics.
summary(spline_model)
##
## Call:
## lm(formula = DO ~ ns(pH, df = 4) + ns(BOD, df = 4) + ns(TSS,
## df = 4) + ns(Suhu, df = 4), data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.93389 -0.52157 0.00543 0.64835 1.76386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.80527 0.41605 13.953 <2e-16 ***
## ns(pH, df = 4)1 0.20708 0.26291 0.788 0.432
## ns(pH, df = 4)2 -0.25754 0.26898 -0.957 0.339
## ns(pH, df = 4)3 0.02911 0.51720 0.056 0.955
## ns(pH, df = 4)4 0.12441 0.23774 0.523 0.601
## ns(BOD, df = 4)1 -0.15565 0.27406 -0.568 0.571
## ns(BOD, df = 4)2 0.26845 0.28264 0.950 0.343
## ns(BOD, df = 4)3 0.59515 0.63136 0.943 0.347
## ns(BOD, df = 4)4 -0.20123 0.26119 -0.770 0.442
## ns(TSS, df = 4)1 0.19955 0.25308 0.788 0.431
## ns(TSS, df = 4)2 0.26369 0.26635 0.990 0.323
## ns(TSS, df = 4)3 0.30524 0.52585 0.580 0.562
## ns(TSS, df = 4)4 0.01792 0.25897 0.069 0.945
## ns(Suhu, df = 4)1 -0.36624 0.27698 -1.322 0.187
## ns(Suhu, df = 4)2 -0.24612 0.25918 -0.950 0.343
## ns(Suhu, df = 4)3 -0.22913 0.52089 -0.440 0.660
## ns(Suhu, df = 4)4 -0.12122 0.24318 -0.498 0.619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9008 on 224 degrees of freedom
## Multiple R-squared: 0.03773, Adjusted R-squared: -0.031
## F-statistic: 0.5489 on 16 and 224 DF, p-value: 0.9182
pred_spline <- predict(spline_model, newdata = testData)
#' Regression error metrics for a set of predictions
#'
#' @param actual Numeric vector of observed values.
#' @param pred Numeric vector of predicted values, same length as `actual`.
#' @return A list with elements `MSE` (mean squared error), `RMSE` (its square
#'   root) and `R2` (1 - SSE / SST, with SST taken around the mean of
#'   `actual`; it is negative when the predictions fit worse than that mean).
calc_metrics <- function(actual, pred) {
  stopifnot(is.numeric(actual), is.numeric(pred))
  # Guard against silent recycling when the two vectors differ in length.
  if (length(actual) != length(pred)) {
    stop("`actual` and `pred` must have the same length", call. = FALSE)
  }
  sq_err <- (actual - pred)^2
  mse_v <- mean(sq_err)
  list(
    MSE = mse_v,
    RMSE = sqrt(mse_v),
    R2 = 1 - sum(sq_err) / sum((actual - mean(actual))^2)
  )
}
# Test-set error metrics of the linear model.
metrics_lm <- calc_metrics(testData$DO, pred_lm)
# Test-set error metrics of the spline model.
metrics_spline <- calc_metrics(testData$DO, pred_spline)
# Print both results: linear first, spline second.
metrics_lm; metrics_spline
## $MSE
## [1] 0.6612566
##
## $RMSE
## [1] 0.8131769
##
## $R2
## [1] -0.03960543
## $MSE
## [1] 0.6729157
##
## $RMSE
## [1] 0.8203144
##
## $R2
## [1] -0.05793541
# Linear regression: actual vs predicted DO on the test set
plot(testData$DO, pred_lm,
main = "Aktual vs Prediksi - Linear",
xlab = "DO Aktual", ylab = "DO Prediksi")
# Reference line y = x (intercept 0, slope 1); points on it are exact predictions
abline(0, 1, col = "red")
# Spline regression: actual vs predicted DO on the test set
plot(testData$DO, pred_spline,
main = "Aktual vs Prediksi - Spline",
xlab = "DO Aktual", ylab = "DO Prediksi")
# Same y = x reference line
abline(0, 1, col = "blue")
# Coefficient table of the linear model, rows sorted by descending |t value|
coef_summary <- summary(lm_model)$coefficients
coef_summary[order(abs(coef_summary[, "t value"]), decreasing = TRUE), ]
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.753856070 1.323395472 5.10342994 6.861307e-07
## Suhu -0.036149886 0.030910665 -1.16949558 2.433838e-01
## TSS 0.002394093 0.006830935 0.35047810 7.262925e-01
## BOD 0.008738164 0.083077877 0.10518039 9.163220e-01
## pH 0.012334278 0.129587382 0.09518116 9.242517e-01
# Models used for the final prediction table.
# NOTE(review): on the test metrics printed earlier in this file the linear
# model has the lower RMSE (0.813 vs 0.820) and the decision tree the higher
# accuracy (0.932 vs 0.915). The choices are kept as they were so the table
# below stays reproducible; confirm that spline and random forest are intended.
pred_final_do <- pred_spline
pred_final_status <- pred_rf
# Test rows with the predicted DO (rounded to 3 decimals) and predicted status.
output <- testData %>%
  select(Lokasi, pH, DO, BOD, TSS, Suhu) %>%
  mutate(Prediksi_DO = round(pred_final_do, 3),
         Prediksi_Status = as.character(pred_final_status))
# The banner takes its count from the data; it used to hard-code 75 although
# the test set has 59 rows.
print(paste("=== Hasil Prediksi", nrow(output), "Data Testing ==="))
## [1] "=== Hasil Prediksi 59 Data Testing ==="
print(head(output, 10))
## # A tibble: 10 × 8
## Lokasi pH DO BOD TSS Suhu Prediksi_DO Prediksi_Status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S2 6.72 5.72 1.71 44.3 27.7 5.77 Tercemar ringan
## 2 S16 7.32 5.98 2.66 35.9 28.6 5.61 Tercemar ringan
## 3 S22 6.19 6.44 2.22 45.2 31.4 6.09 Baik
## 4 S27 6.87 5.13 3.14 60.6 29.2 5.89 Tercemar ringan
## 5 S31 7.23 5.00 2.82 51.7 27.0 5.89 Tercemar ringan
## 6 S40 7.02 5.22 4.34 53.5 27.7 5.93 Tercemar ringan
## 7 S47 6.59 4.55 1.71 49.7 27.2 5.86 Tercemar ringan
## 8 S53 7.74 6.44 3.17 65.2 27.8 5.81 Tercemar ringan
## 9 S57 7.34 5.47 2.64 34.5 29.0 5.61 Tercemar ringan
## 10 S60 7.14 5.96 3.00 40.7 29.2 5.66 Tercemar ringan
print(paste("Total baris prediksi:", nrow(output)))
## [1] "Total baris prediksi: 59"
library(knitr)
library(kableExtra)
kable(output, caption = "Hasil Prediksi Data Testing (DO dan Status)", align = "c") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("hover", "striped"))
Lokasi | pH | DO | BOD | TSS | Suhu | Prediksi_DO | Prediksi_Status |
---|---|---|---|---|---|---|---|
S2 | 6.717700 | 5.723600 | 1.709075 | 44.29630 | 27.72840 | 5.769 | Tercemar ringan |
S16 | 7.318000 | 5.976294 | 2.655000 | 35.88070 | 28.59680 | 5.611 | Tercemar ringan |
S22 | 6.186315 | 6.442000 | 2.224900 | 45.15410 | 31.40312 | 6.091 | Baik |
S27 | 6.871400 | 5.130700 | 3.143200 | 60.62980 | 29.24360 | 5.894 | Tercemar ringan |
S31 | 7.227700 | 5.001300 | 2.817300 | 51.66110 | 26.97700 | 5.889 | Tercemar ringan |
S40 | 7.018100 | 5.222600 | 4.337300 | 53.47180 | 27.72340 | 5.933 | Tercemar ringan |
S47 | 6.594300 | 4.546500 | 1.709075 | 49.69810 | 27.15930 | 5.864 | Tercemar ringan |
S53 | 7.739270 | 6.438400 | 3.173900 | 65.20408 | 27.78300 | 5.808 | Tercemar ringan |
S57 | 7.339600 | 5.470600 | 2.637700 | 34.54670 | 29.01480 | 5.613 | Tercemar ringan |
S60 | 7.142400 | 5.959500 | 3.000522 | 40.67000 | 29.19630 | 5.655 | Tercemar ringan |
S65 | 6.636400 | 7.441900 | 3.000522 | 33.58135 | 28.78970 | 5.576 | Baik |
S68 | 7.519300 | 7.429300 | 3.609000 | 57.89640 | 30.95900 | 6.081 | Tercemar ringan |
S71 | 6.478400 | 5.415000 | 1.709075 | 64.68350 | 27.31040 | 5.723 | Tercemar ringan |
S72 | 6.954900 | 6.321000 | 2.811300 | 41.23540 | 25.84600 | 6.095 | Baik |
S74 | 6.523200 | 5.721500 | 2.469300 | 53.37840 | 31.40312 | 6.216 | Tercemar ringan |
S75 | 6.728600 | 6.546100 | 2.744800 | 54.40820 | 27.34820 | 6.050 | Baik |
S78 | 7.231900 | 6.171000 | 1.709075 | 53.23310 | 29.84700 | 5.751 | Baik |
S92 | 6.761900 | 7.309100 | 2.464900 | 65.20408 | 29.51920 | 6.012 | Baik |
S96 | 6.569600 | 5.990900 | 2.017900 | 54.38700 | 27.43680 | 6.059 | Baik |
S114 | 6.747900 | 5.464400 | 3.161600 | 51.60210 | 29.40330 | 5.903 | Tercemar ringan |
S129 | 6.387600 | 6.136000 | 3.769100 | 45.83470 | 27.72500 | 5.975 | Tercemar ringan |
S138 | 7.600700 | 4.871000 | 3.149400 | 65.19420 | 26.58510 | 5.840 | Tercemar ringan |
S139 | 6.765100 | 5.912100 | 3.060900 | 45.67100 | 29.28830 | 5.812 | Tercemar ringan |
S145 | 6.793100 | 5.961800 | 2.173200 | 60.52220 | 31.08570 | 6.217 | Tercemar ringan |
S150 | 6.471800 | 7.494825 | 4.337300 | 33.58135 | 27.89880 | 5.592 | Tercemar ringan |
S158 | 6.599900 | 5.976294 | 3.088100 | 52.89640 | 29.07260 | 5.831 | Tercemar ringan |
S160 | 7.643800 | 5.725800 | 1.753700 | 41.82610 | 27.91300 | 5.713 | Tercemar ringan |
S161 | 6.912200 | 6.162400 | 3.173000 | 49.71250 | 31.40312 | 6.043 | Tercemar ringan |
S162 | 6.464100 | 5.935400 | 3.150100 | 49.69810 | 31.09610 | 5.911 | Tercemar ringan |
S170 | 7.447800 | 6.513800 | 2.444900 | 48.87410 | 27.59430 | 5.927 | Baik |
S172 | 7.418300 | 6.763100 | 2.510600 | 48.33140 | 24.99262 | 6.154 | Baik |
S176 | 6.924600 | 4.438700 | 3.755300 | 38.14450 | 26.13650 | 6.213 | Tercemar ringan |
S180 | 6.996800 | 6.750200 | 3.675800 | 48.64090 | 30.82970 | 6.193 | Tercemar ringan |
S186 | 6.331900 | 6.587700 | 3.000522 | 65.20408 | 30.37240 | 5.716 | Baik |
S193 | 7.126300 | 5.926600 | 3.992800 | 57.27430 | 28.76190 | 5.990 | Tercemar ringan |
S202 | 7.166900 | 7.494825 | 3.000522 | 43.89030 | 27.82830 | 5.672 | Baik |
S208 | 6.473000 | 4.333970 | 3.967300 | 38.73100 | 30.15970 | 5.912 | Tercemar ringan |
S216 | 7.018700 | 5.987200 | 4.337300 | 50.89350 | 29.66480 | 5.924 | Tercemar ringan |
S218 | 7.738400 | 6.598500 | 1.709075 | 34.59620 | 27.20270 | 5.757 | Baik |
S220 | 6.358200 | 5.751500 | 4.135300 | 58.59790 | 28.22930 | 5.900 | Tercemar ringan |
S226 | 6.913000 | 5.751500 | 3.426300 | 45.95360 | 26.48970 | 6.189 | Tercemar ringan |
S228 | 6.882800 | 5.976294 | 3.448100 | 57.19470 | 28.29450 | 6.071 | Tercemar ringan |
S230 | 7.625100 | 5.339500 | 1.709075 | 54.75960 | 26.15870 | 6.035 | Tercemar ringan |
S232 | 7.474000 | 4.377000 | 2.687000 | 39.81140 | 27.75920 | 5.705 | Tercemar ringan |
S234 | 6.766900 | 5.488400 | 2.498400 | 44.61690 | 24.99262 | 6.328 | Tercemar ringan |
S235 | 6.865300 | 5.976294 | 1.709075 | 49.69810 | 26.25140 | 6.069 | Tercemar ringan |
S239 | 7.122100 | 6.273400 | 3.732500 | 41.57140 | 29.81530 | 5.974 | Tercemar ringan |
S240 | 6.528800 | 7.130800 | 3.838800 | 43.30130 | 27.35040 | 6.011 | Tercemar ringan |
S247 | 7.508400 | 6.747500 | 2.210700 | 50.15560 | 27.41060 | 5.988 | Baik |
S251 | 6.451900 | 6.630800 | 4.337300 | 47.69430 | 26.31250 | 5.991 | Tercemar ringan |
S253 | 6.400800 | 6.452100 | 3.776600 | 53.84950 | 28.02210 | 6.035 | Tercemar ringan |
S254 | 7.095000 | 6.368000 | 3.301600 | 53.46720 | 30.42990 | 6.010 | Tercemar ringan |
S259 | 6.491200 | 5.976294 | 2.321700 | 59.44620 | 30.18230 | 6.128 | Tercemar ringan |
S261 | 7.436400 | 5.338000 | 2.761200 | 42.99330 | 31.24160 | 5.808 | Tercemar ringan |
S276 | 7.655000 | 6.844800 | 3.310800 | 58.77290 | 31.40312 | 6.074 | Tercemar ringan |
S279 | 7.690800 | 6.563800 | 2.136200 | 46.05130 | 29.61470 | 6.054 | Baik |
S281 | 7.412000 | 4.499800 | 3.231000 | 60.96240 | 28.97610 | 5.680 | Tercemar ringan |
S294 | 7.739270 | 4.488300 | 1.709075 | 53.69540 | 25.45220 | 6.183 | Tercemar ringan |
S298 | 7.431300 | 5.826100 | 2.421000 | 42.91330 | 27.75250 | 5.848 | Tercemar ringan |
Tahap ini bertujuan untuk memprediksi nilai DO (Dissolved Oxygen) menggunakan variabel pH, BOD, TSS, dan Suhu sebagai prediktor.
Model regresi linear membentuk hubungan linier antara DO dan variabel-variabel prediktor. Hasil summary(lm_model) menunjukkan bahwa:
-Koefisien pH positif (0,012) tetapi tidak signifikan (p = 0,924).
-Koefisien BOD (0,009) dan TSS (0,002) juga positif dan tidak signifikan (p = 0,916 dan p = 0,726), sehingga data ini tidak menunjukkan penurunan DO akibat kenaikan BOD atau TSS.
-Koefisien Suhu negatif (−0,036), searah dengan teori bahwa air hangat memiliki kelarutan oksigen lebih rendah, tetapi juga tidak signifikan (p = 0,243). Secara keseluruhan model tidak signifikan (R² = 0,006; uji F p = 0,822).
Model regresi spline digunakan untuk menangkap hubungan non-linier antara variabel prediktor dan DO. Dengan basis fungsi natural spline (df=4), model ini dapat menyesuaikan pola data yang tidak sepenuhnya linier. Evaluasi pada data testing menunjukkan model spline menghasilkan:
-RMSE sedikit lebih besar (0,820 berbanding 0,813)
-R² lebih rendah (−0,058 berbanding −0,040) dibanding model linear, menandakan bahwa Spline tidak lebih akurat dalam memprediksi DO.
Hasil evaluasi model berdasarkan tiga metrik utama:
-MSE (Mean Squared Error) menunjukkan tingkat kesalahan kuadrat.
-RMSE (Root Mean Squared Error) menunjukkan rata-rata deviasi prediksi.
-R² (Koefisien Determinasi) menunjukkan seberapa baik model menjelaskan variasi data.
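-Nilai RMSE untuk regresi spline (0,820) sedikit lebih besar dari regresi linear (0,813), dan nilai R²-nya lebih kecil (−0,058 berbanding −0,040). R² yang negatif pada kedua model berarti keduanya tidak lebih baik daripada sekadar memprediksi nilai rata-rata DO.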
Plot aktual vs prediksi membandingkan titik-titik data dengan garis diagonal (y = x). Karena R² kedua model negatif, titik-titik tidak mengikuti garis diagonal: nilai prediksi spline hanya berkisar sekitar 5,6–6,3, sementara DO aktual pada data testing berkisar 4,3–7,5, sehingga sebaran titik cenderung membentuk pita mendatar di sekitar nilai rata-rata.
Dari hasil summary(lm_model), variabel dengan nilai |t| terbesar dan p-value paling kecil adalah Suhu (t = −1,17; p = 0,243), diikuti TSS, BOD, dan pH. Namun tidak ada satu pun prediktor yang signifikan pada taraf 5%, sehingga data ini tidak memberikan bukti adanya pengaruh yang kuat dari variabel mana pun terhadap kadar oksigen terlarut.
Secara keseluruhan, hasil analisis menunjukkan bahwa:
Data Cleaning berhasil dilakukan dengan baik melalui imputasi mean, penanganan outlier (capping), dan standarisasi kategori.
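Model klasifikasi dengan akurasi tertinggi pada data testing adalah Decision Tree (93,22%), diikuti Random Forest (91,53%) dan SVM (86,44%); tidak ada model yang mampu mengenali kelas “Tercemar berat”.
Untuk prediksi DO, Regresi Linear sedikit lebih baik daripada Regresi Spline (RMSE 0,813 berbanding 0,820), tetapi keduanya memiliki R² negatif sehingga tidak lebih baik daripada prediksi nilai rata-rata.
Tidak ada variabel yang berpengaruh signifikan terhadap DO; Suhu memiliki |t| terbesar dengan koefisien negatif, tetapi tidak signifikan (p = 0,243).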
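Secara umum, metode pembelajaran mesin dapat mengklasifikasikan status kualitas air dengan akurasi yang baik untuk dua kelas utama, sedangkan model regresi dengan prediktor yang tersedia belum mampu memprediksi DO. Diperlukan lebih banyak data kelas “Tercemar berat” dan prediktor tambahan sebelum hasil ini digunakan untuk mendukung pengambilan keputusan lingkungan.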