library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rpart)
library(rpart.plot)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(splines)
library(readxl)
# Location of the Excel workbook that holds both sheets. It is defined once so
# the two reads below cannot drift apart when the file is moved.
data_path <- "C:/Users/ASUS/Downloads/kualitasair.xlsx"
# Read the "Training" and "Testing" sheets of the workbook
train <- read_excel(data_path, sheet = "Training")
test <- read_excel(data_path, sheet = "Testing")
# Number of rows in each sheet
nrow(train)
## [1] 300
nrow(test)
## [1] 75
# Number of missing values per column of the training sheet
colSums(is.na(train))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
# Replace the missing values of every numeric measurement with the median of
# the observed values in the same column
for (col_name in c("pH", "DO", "BOD", "TSS", "Suhu")) {
  missing_rows <- is.na(train[[col_name]])
  train[[col_name]][missing_rows] <- median(train[[col_name]], na.rm = TRUE)
}
# Re-check that no NA is left
colSums(is.na(train))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 0 0 0 0 0
### Deteksi Outlier
# Handle outliers with the 1.5 * IQR rule (optional step): values outside the
# fences are replaced by the nearest fence instead of being dropped.
for (col in c("pH", "DO", "BOD", "TSS", "Suhu")) {
  # First and third quartile in one call; names = FALSE keeps plain numbers
  quartiles <- quantile(train[[col]], probs = c(0.25, 0.75),
                        na.rm = TRUE, names = FALSE)
  # Deliberately not called `IQR`, so stats::IQR() is not shadowed by a
  # global variable of the same name
  iqr_width <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - 1.5 * iqr_width
  upper <- quartiles[2] + 1.5 * iqr_width
  # Replace values beyond the fences with the lower/upper fence
  train[[col]][train[[col]] < lower] <- lower
  train[[col]][train[[col]] > upper] <- upper
}
# Descriptive statistics of the five numeric columns after cleaning
# (median imputation followed by capping at the IQR fences)
summary(train[, c("pH", "DO", "BOD", "TSS", "Suhu")])
## pH DO BOD TSS
## Min. :5.697 Min. :3.615 Min. :0.8513 Min. :27.28
## 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.4599 1st Qu.:44.28
## Median :6.988 Median :5.991 Median :3.0661 Median :49.52
## Mean :6.990 Mean :5.977 Mean :3.0041 Mean :49.68
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.5323 3rd Qu.:55.62
## Max. :8.290 Max. :8.409 Max. :5.1409 Max. :72.62
## Suhu
## Min. :22.77
## 1st Qu.:26.62
## Median :28.01
## Mean :28.12
## 3rd Qu.:29.46
## Max. :33.73
# Check for remaining outliers after the capping step: boxplot.stats()$out
# returns the points lying beyond the boxplot whiskers, so numeric(0) means
# that none are left in that column
boxplot.stats(train$pH)$out
## numeric(0)
boxplot.stats(train$DO)$out
## numeric(0)
boxplot.stats(train$BOD)$out
## numeric(0)
boxplot.stats(train$TSS)$out
## numeric(0)
boxplot.stats(train$Suhu)$out
## numeric(0)
# Standardise the spelling of the Status categories: strip surrounding
# whitespace, convert to lower case, then store the result as a factor
train$Status <- factor(tolower(trimws(train$Status)))
# Re-check the distinct categories
unique(train$Status)
## [1] tercemar ringan baik tercemar berat
## Levels: baik tercemar berat tercemar ringan
# Frequency of each category
table(train$Status)
##
## baik tercemar berat tercemar ringan
## 72 7 221
Keterangan: Setelah dilakukan standarisasi, didapatkan 3 kategori pada variabel Status, yaitu: baik (72 data) → kualitas air sesuai baku mutu; tercemar ringan (221 data) → air sedikit melebihi ambang batas; tercemar berat (7 data) → air mengalami pencemaran signifikan.
# Summary statistics of every column of the cleaned training data
summary(train)
## Lokasi pH DO BOD
## Length:300 Min. :5.697 Min. :3.615 Min. :0.8513
## Class :character 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.4599
## Mode :character Median :6.988 Median :5.991 Median :3.0661
## Mean :6.990 Mean :5.977 Mean :3.0041
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.5323
## Max. :8.290 Max. :8.409 Max. :5.1409
## TSS Suhu Status
## Min. :27.28 Min. :22.77 baik : 72
## 1st Qu.:44.28 1st Qu.:26.62 tercemar berat : 7
## Median :49.52 Median :28.01 tercemar ringan:221
## Mean :49.68 Mean :28.12
## 3rd Qu.:55.62 3rd Qu.:29.46
## Max. :72.62 Max. :33.73
# Histogram of pH, with the bars filled by water-quality Status
ggplot(data = train, mapping = aes(x = pH, fill = Status)) +
  geom_histogram(bins = 20, alpha = 0.7) +
  labs(title = "Distribusi pH berdasarkan Status") +
  theme_minimal()
# Scatter-plot matrix of the numeric variables, to eyeball pairwise relations
pairs(
  train[, c("pH", "DO", "BOD", "TSS", "Suhu")],
  main = "Korelasi antar variabel numerik"
)
Keterangan:
a. Nilai pH berkisar antara 5,7 hingga 8,3 → masih dalam rentang wajar air alami.
b. Nilai DO (oksigen terlarut) rata-rata sekitar 6 mg/L → cukup baik untuk ekosistem sungai.
c. Nilai BOD dan TSS bervariasi antar lokasi, mencerminkan perbedaan tingkat pencemaran.
d. Suhu air relatif stabil (rata-rata sekitar 28°C).
e. Sebagian besar lokasi (221 dari 300) masuk kategori tercemar ringan, menandakan pencemaran ringan dominan di area studi.
# Seed the random number generator so the partition below is reproducible
set.seed(123)
# Normalise the Status labels again and store them as a factor
train$Status <- as.factor(tolower(trimws(train$Status)))
# Numeric predictor columns
features <- c("pH", "DO", "BOD", "TSS", "Suhu")
# Split the Training sheet into a training (80%) and a hold-out (20%) subset;
# createDataPartition() draws the sample within each Status class.
# NOTE(review): the `test` object read from the "Testing" sheet is not
# referenced anywhere else in the visible script; every model is evaluated on
# this hold-out subset of `train` instead. Confirm that this is intended.
index <- createDataPartition(y = train$Status, p = 0.8, list = FALSE)
data_train <- train[index, ]
data_test <- train[-index, ]
# Size of each subset
nrow(data_train)
## [1] 241
nrow(data_test)
## [1] 59
# Make sure the target is a factor in both subsets and that the hold-out
# subset uses exactly the level set of the training subset
data_train$Status <- as.factor(data_train$Status)
data_test$Status <- factor(data_test$Status, levels = levels(data_train$Status))
# Fill any NA that is still present in the predictors. Both subsets use the
# median of the training subset, so no hold-out information enters the fill
# value.
for (f in features) {
  train_median <- median(data_train[[f]], na.rm = TRUE)
  data_train[[f]][is.na(data_train[[f]])] <- train_median
  data_test[[f]][is.na(data_test[[f]])] <- train_median
}
SVM
# Standardise the predictors: centring and scaling parameters are estimated on
# the training subset and then applied to both subsets
preProc <- preProcess(data_train[, features], method = c("center", "scale"))
train_svm <- predict(preProc, newdata = data_train[, features])
test_svm <- predict(preProc, newdata = data_test[, features])
# Fit the SVM classifier with a radial kernel
model_svm <- svm(
  x = train_svm,
  y = data_train$Status,
  kernel = "radial"
)
# Predict the Status of the hold-out rows
pred_svm <- predict(model_svm, newdata = test_svm)
# Evaluate: confusion matrix plus overall and per-class statistics
conf_svm <- confusionMatrix(data = pred_svm, reference = data_test$Status)
conf_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction baik tercemar berat tercemar ringan
## baik 8 0 2
## tercemar berat 0 0 0
## tercemar ringan 6 1 42
##
## Overall Statistics
##
## Accuracy : 0.8475
## 95% CI : (0.7301, 0.9278)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.04475
##
## Kappa : 0.5519
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity 0.5714 0.00000 0.9545
## Specificity 0.9556 1.00000 0.5333
## Pos Pred Value 0.8000 NaN 0.8571
## Neg Pred Value 0.8776 0.98305 0.8000
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.1356 0.00000 0.7119
## Detection Prevalence 0.1695 0.00000 0.8305
## Balanced Accuracy 0.7635 0.50000 0.7439
Decision Tree
# Fit a classification tree for Status on the five numeric predictors
model_tree <- rpart(
  Status ~ pH + DO + BOD + TSS + Suhu,
  data = data_train,
  method = "class"
)
# Draw the fitted tree
rpart.plot(model_tree, main = "Decision Tree Kualitas Air")
# Predict the hold-out classes and evaluate them against the true Status
pred_tree <- predict(model_tree, newdata = data_test, type = "class")
conf_tree <- confusionMatrix(data = pred_tree, reference = data_test$Status)
print(conf_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction baik tercemar berat tercemar ringan
## baik 13 0 0
## tercemar berat 0 0 0
## tercemar ringan 1 1 44
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity 0.9286 0.00000 1.0000
## Specificity 1.0000 1.00000 0.8667
## Pos Pred Value 1.0000 NaN 0.9565
## Neg Pred Value 0.9783 0.98305 1.0000
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.2203 0.00000 0.7458
## Detection Prevalence 0.2203 0.00000 0.7797
## Balanced Accuracy 0.9643 0.50000 0.9333
Random Forest
# Fit a random forest of 200 trees for Status, keeping the variable-importance
# measures
model_rf <- randomForest(
  Status ~ pH + DO + BOD + TSS + Suhu,
  data = data_train,
  ntree = 200,
  importance = TRUE
)
# Predict the hold-out classes and evaluate them against the true Status
pred_rf <- predict(model_rf, newdata = data_test)
conf_rf <- confusionMatrix(data = pred_rf, reference = data_test$Status)
print(conf_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction baik tercemar berat tercemar ringan
## baik 13 0 1
## tercemar berat 0 0 0
## tercemar ringan 1 1 43
##
## Overall Statistics
##
## Accuracy : 0.9492
## 95% CI : (0.8585, 0.9894)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 4.59e-05
##
## Kappa : 0.8644
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity 0.9286 0.00000 0.9773
## Specificity 0.9778 1.00000 0.8667
## Pos Pred Value 0.9286 NaN 0.9556
## Neg Pred Value 0.9778 0.98305 0.9286
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.2203 0.00000 0.7288
## Detection Prevalence 0.2373 0.00000 0.7627
## Balanced Accuracy 0.9532 0.50000 0.9220
# Plain confusion tables for the three classifiers: rows are the predicted
# class (Prediksi), columns the actual class (Aktual) of the hold-out subset.
# The counts are the same as in the confusionMatrix() output of each model.

# SVM evaluation
cm_svm <- table(Prediksi = pred_svm, Aktual = data_test$Status)
cat("CONFUSION MATRIX SVM")
## CONFUSION MATRIX SVM
print(cm_svm)
## Aktual
## Prediksi baik tercemar berat tercemar ringan
## baik 8 0 2
## tercemar berat 0 0 0
## tercemar ringan 6 1 42
# Decision tree evaluation
cm_tree <- table(Prediksi = pred_tree, Aktual = data_test$Status)
cat("CONFUSION MATRIX DECISION TREE")
## CONFUSION MATRIX DECISION TREE
print(cm_tree)
## Aktual
## Prediksi baik tercemar berat tercemar ringan
## baik 13 0 0
## tercemar berat 0 0 0
## tercemar ringan 1 1 44
# Random forest evaluation
cm_rf <- table(Prediksi = pred_rf, Aktual = data_test$Status)
cat("CONFUSION MATRIX RANDOM FOREST")
## CONFUSION MATRIX RANDOM FOREST
print(cm_rf)
## Aktual
## Prediksi baik tercemar berat tercemar ringan
## baik 13 0 1
## tercemar berat 0 0 0
## tercemar ringan 1 1 43
# Overall accuracy of a square confusion table: correctly classified cases
# (the diagonal) divided by all cases
overall_accuracy <- function(cm) sum(diag(cm)) / sum(cm)
accuracy_svm <- overall_accuracy(cm_svm)
accuracy_tree <- overall_accuracy(cm_tree)
accuracy_rf <- overall_accuracy(cm_rf)
# One row per classifier
accuracy_results <- data.frame(
  Model = c("SVM", "Decision Tree", "Random Forest"),
  Accuracy = c(accuracy_svm, accuracy_tree, accuracy_rf)
)
cat("AKURASI SETIAP MODEL")
## AKURASI SETIAP MODEL
print(accuracy_results)
## Model Accuracy
## 1 SVM 0.8474576
## 2 Decision Tree 0.9661017
## 3 Random Forest 0.9491525
# Name of the classifier in the row with the highest accuracy
top_row <- which.max(accuracy_results$Accuracy)
best_model <- accuracy_results$Model[top_row]
cat("\nModel dengan akurasi tertinggi adalah:", best_model, "\n")
##
## Model dengan akurasi tertinggi adalah: Decision Tree
# Columns used by the regression models: four predictors plus the response DO
features <- c("pH", "BOD", "TSS", "Suhu", "DO")
# Median-impute any NA that is still present in those columns
for (f in features) {
  if (anyNA(train[[f]])) {
    train[[f]][is.na(train[[f]])] <- median(train[[f]], na.rm = TRUE)
  }
}
# Split the data into training (80%) and testing (20%) rows.
# seq_len() is used instead of 1:nrow(train) so that an empty table yields an
# empty index rather than c(1, 0).
# NOTE(review): no seed is set here; the draw depends on the RNG state left by
# the earlier set.seed(123) and every random call executed since then.
n_rows <- nrow(train)
index <- sample(seq_len(n_rows), 0.8 * n_rows)
data_train <- train[index, ]
data_test <- train[-index, ]
# Multiple linear regression of DO on pH, BOD, TSS and Suhu
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = data_train)
summary(model_lm)
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35543 -0.54022 -0.00989 0.63204 2.37154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.656990 1.306258 5.096 7.12e-07 ***
## pH -0.053099 0.121339 -0.438 0.6621
## BOD 0.130644 0.075076 1.740 0.0831 .
## TSS -0.003110 0.006539 -0.476 0.6348
## Suhu -0.018183 0.030199 -0.602 0.5477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9277 on 235 degrees of freedom
## Multiple R-squared: 0.01603, Adjusted R-squared: -0.0007215
## F-statistic: 0.9569 on 4 and 235 DF, p-value: 0.4319
# Predict DO for the testing rows
pred_lm <- predict(model_lm, newdata = data_test)
# Performance measures.
# NOTE(review): R2_lm is read from summary(model_lm), so it is the R-squared on
# data_train, whereas MSE_lm and RMSE_lm are computed on data_test. The three
# numbers therefore do not describe the same data.
R2_lm <- summary(model_lm)[["r.squared"]]
MSE_lm <- mse(actual = data_test$DO, predicted = pred_lm)
RMSE_lm <- rmse(actual = data_test$DO, predicted = pred_lm)
cat("Hasil REGRESI LINEAR")
## Hasil REGRESI LINEAR
cat("R² :", round(R2_lm, digits = 3), "\n")
## R² : 0.016
cat("MSE :", round(MSE_lm, digits = 3), "\n")
## MSE : 1.08
cat("RMSE :", round(RMSE_lm, digits = 3), "\n")
## RMSE : 1.039
# Spline regression of DO: each of the four predictors enters through a
# natural cubic spline basis with 3 degrees of freedom (splines::ns), and the
# model is fitted by ordinary least squares on the training rows.
# The formula is kept inline because summary() echoes the call as written.
model_spline <- lm(DO ~ ns(pH, df = 3) + ns(BOD, df = 3) + ns(TSS, df = 3) + ns(Suhu, df = 3),
data = data_train)
# Coefficient table and goodness-of-fit statistics of the spline model
summary(model_spline)
##
## Call:
## lm(formula = DO ~ ns(pH, df = 3) + ns(BOD, df = 3) + ns(TSS,
## df = 3) + ns(Suhu, df = 3), data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4106 -0.5099 0.0305 0.5783 2.4649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.44612 0.68016 8.007 6.07e-14 ***
## ns(pH, df = 3)1 -0.19281 0.27434 -0.703 0.4829
## ns(pH, df = 3)2 0.66236 0.85851 0.772 0.4412
## ns(pH, df = 3)3 0.20075 0.39668 0.506 0.6133
## ns(BOD, df = 3)1 -0.03182 0.25992 -0.122 0.9027
## ns(BOD, df = 3)2 1.49449 0.78375 1.907 0.0578 .
## ns(BOD, df = 3)3 0.97050 0.38571 2.516 0.0126 *
## ns(TSS, df = 3)1 -0.09290 0.26453 -0.351 0.7258
## ns(TSS, df = 3)2 0.41987 0.67953 0.618 0.5373
## ns(TSS, df = 3)3 -0.05698 0.33290 -0.171 0.8642
## ns(Suhu, df = 3)1 -0.33735 0.27332 -1.234 0.2184
## ns(Suhu, df = 3)2 -0.60811 0.79639 -0.764 0.4459
## ns(Suhu, df = 3)3 0.01191 0.42098 0.028 0.9775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9297 on 227 degrees of freedom
## Multiple R-squared: 0.04546, Adjusted R-squared: -0.004999
## F-statistic: 0.9009 on 12 and 227 DF, p-value: 0.5468
# Predict DO for the testing rows with the spline model
pred_spline <- predict(model_spline, newdata = data_test)
# Performance measures.
# NOTE(review): R2_spline is read from summary(model_spline), so it is the
# R-squared on data_train, whereas MSE_spline and RMSE_spline are computed on
# data_test. The three numbers therefore do not describe the same data.
R2_spline <- summary(model_spline)[["r.squared"]]
MSE_spline <- mse(actual = data_test$DO, predicted = pred_spline)
RMSE_spline <- rmse(actual = data_test$DO, predicted = pred_spline)
cat("HASIL REGRESI SPLINE")
## HASIL REGRESI SPLINE
cat("R² :", round(R2_spline, digits = 3), "\n")
## R² : 0.045
cat("MSE :", round(MSE_spline, digits = 3), "\n")
## MSE : 1.093
cat("RMSE :", round(RMSE_spline, digits = 3), "\n")
## RMSE : 1.045
# Side-by-side comparison of the two regression models, one row per model.
# NOTE(review): the R2 column holds the values taken from summary() of each
# fitted model (training data), while MSE and RMSE come from the testing rows.
model_labels <- c("Regresi Linear", "Regresi Spline")
eval_df <- data.frame(
  Model = model_labels,
  R2    = c(R2_lm, R2_spline),
  MSE   = c(MSE_lm, MSE_spline),
  RMSE  = c(RMSE_lm, RMSE_spline)
)
cat("PERBANDINGAN KINERJA MODEL")
## PERBANDINGAN KINERJA MODEL
print(eval_df)
## Model R2 MSE RMSE
## 1 Regresi Linear 0.01602696 1.080184 1.039319
## 2 Regresi Spline 0.04546124 1.092923 1.045430
# Actual DO of the testing rows next to the predictions of both models
plot_data <- data.frame(
  DO_Aktual = data_test$DO,
  Prediksi_Linear = pred_lm,
  Prediksi_Spline = pred_spline
)
# Layers shared by both charts: the dashed y = x reference line (a perfect
# prediction would fall on it) followed by the minimal theme
shared_layers <- list(
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed"),
  theme_minimal()
)
# Predicted vs actual DO, linear regression
ggplot(plot_data, aes(x = DO_Aktual, y = Prediksi_Linear)) +
  geom_point(color = "blue", alpha = 0.6, size = 3) +
  shared_layers +
  labs(
    title = "Prediksi vs Aktual (Regresi Linear)",
    x = "DO Aktual",
    y = "DO Prediksi"
  )
# Predicted vs actual DO, spline regression
ggplot(plot_data, aes(x = DO_Aktual, y = Prediksi_Spline)) +
  geom_point(color = "green", alpha = 0.6, size = 3) +
  shared_layers +
  labs(
    title = "Prediksi vs Aktual (Regresi Spline)",
    x = "DO Aktual",
    y = "DO Prediksi"
  )
# Show the coefficient table of the linear model again for the interpretation
summary(model_lm)
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35543 -0.54022 -0.00989 0.63204 2.37154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.656990 1.306258 5.096 7.12e-07 ***
## pH -0.053099 0.121339 -0.438 0.6621
## BOD 0.130644 0.075076 1.740 0.0831 .
## TSS -0.003110 0.006539 -0.476 0.6348
## Suhu -0.018183 0.030199 -0.602 0.5477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9277 on 235 degrees of freedom
## Multiple R-squared: 0.01603, Adjusted R-squared: -0.0007215
## F-statistic: 0.9569 on 4 and 235 DF, p-value: 0.4319
Keterangan: Berdasarkan hasil output regresi linear di atas, diketahui bahwa tidak ada variabel independen yang berpengaruh signifikan terhadap variabel DO pada taraf signifikansi 5%, karena semua nilai p-value (Pr(>|t|)) lebih besar dari 0,05. Namun, jika dilihat dari nilai t-value, variabel BOD memiliki nilai |t| paling besar (1,740; p-value = 0,0831) dibandingkan variabel lainnya, sehingga dapat dikatakan bahwa BOD adalah variabel yang paling berpengaruh terhadap DO, meskipun pengaruhnya belum signifikan secara statistik. Hal ini menunjukkan bahwa perubahan nilai BOD cenderung memiliki hubungan paling kuat terhadap perubahan kadar DO dibandingkan variabel lain seperti pH, TSS, maupun Suhu.