# --- Soal 1: Data Cleaning dan Eksplorasi ---
# 1. Panggil library yang diperlukan
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# 2. Baca data dari sheet "Training"
# Pastikan file kualitasair.xlsx ada di folder kerja
file_path <- "kualitasair.xlsx"
data_train <- read_excel(file_path, sheet = "Training")
# Lihat 6 baris pertama
head(data_train)
## # A tibble: 6 × 7
## Lokasi pH DO BOD TSS Suhu Status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar ringan
# 3. Cek struktur data
str(data_train)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
## $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
## $ pH : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
## $ DO : num [1:300] NA 5.72 4.89 6.13 7.79 ...
## $ BOD : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
## $ TSS : num [1:300] 43.1 44.3 NA 41 48.1 ...
## $ Suhu : num [1:300] 26.8 27.7 26 29.7 26.4 ...
## $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
# 4. Cek missing value
colSums(is.na(data_train))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
# 5. Tangani missing value (jika ada)
# Misal isi dengan rata-rata untuk kolom numerik
data_train <- data_train %>%
mutate(
pH = ifelse(is.na(pH), mean(pH, na.rm = TRUE), pH),
DO = ifelse(is.na(DO), mean(DO, na.rm = TRUE), DO),
BOD = ifelse(is.na(BOD), mean(BOD, na.rm = TRUE), BOD),
TSS = ifelse(is.na(TSS), mean(TSS, na.rm = TRUE), TSS),
Suhu = ifelse(is.na(Suhu), mean(Suhu, na.rm = TRUE), Suhu)
)
# 6. Standarisasi penulisan kategori "Status"
# ubah jadi huruf besar semua atau konsisten kapitalisasi
data_train$Status <- tolower(data_train$Status) # jadi huruf kecil semua
data_train$Status <- trimws(data_train$Status) # hapus spasi depan/belakang
# Lihat kategori unik
unique(data_train$Status)
## [1] "tercemar ringan" "baik" "tercemar berat"
# 7. Ganti nama kategori agar konsisten
data_train$Status <- recode(data_train$Status,
"baik" = "Baik",
"tercemar ringan" = "Tercemar Ringan",
"tercemar.berat" = "Tercemar Berat",
"tercemar berat" = "Tercemar Berat",
"tercemar.sedang" = "Tercemar Sedang",
"sedang" = "Tercemar Sedang")
# Cek hasilnya
unique(data_train$Status)
## [1] "Tercemar Ringan" "Baik" "Tercemar Berat"
# 8. Tangani outlier (menggunakan IQR method)
num_cols <- c("pH", "DO", "BOD", "TSS", "Suhu")
for (col in num_cols) {
Q1 <- quantile(data_train[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(data_train[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
# Ganti nilai outlier dengan batas bawah/atas
data_train[[col]] <- ifelse(data_train[[col]] < lower, lower,
ifelse(data_train[[col]] > upper, upper, data_train[[col]]))
}
# 9. Cek statistik deskriptif setelah pembersihan
summary(data_train)
## Lokasi pH DO BOD
## Length:300 Min. :5.697 Min. :3.615 Min. :0.8513
## Class :character 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.4599
## Mode :character Median :6.988 Median :5.976 Median :3.0005
## Mean :6.990 Mean :5.976 Mean :2.9993
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.5323
## Max. :8.290 Max. :8.409 Max. :5.1409
## TSS Suhu Status
## Min. :27.28 Min. :22.77 Length:300
## 1st Qu.:44.28 1st Qu.:26.62 Class :character
## Median :49.70 Median :28.01 Mode :character
## Mean :49.70 Mean :28.12
## 3rd Qu.:55.62 3rd Qu.:29.46
## Max. :72.62 Max. :33.73
# 10. Visualisasi sebaran tiap variabel numerik
num_data <- data_train %>% select(all_of(num_cols))
par(mfrow = c(2,3))
for (col in num_cols) {
hist(num_data[[col]], main = paste("Distribusi", col), xlab = col, col = "skyblue", border = "white")
}
# 11. Simpan hasil data bersih untuk digunakan di Soal 2
data_bersih <- data_train

# --- Soal 2: Klasifikasi Status Kualitas Air ---
# 1. Panggil library yang dibutuhkan
library(caret)
## Loading required package: lattice
library(e1071)
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Pastikan data dari Soal 1 sudah ada
# (data_bersih harus berisi kolom: pH, DO, BOD, TSS, Suhu, Status)
str(data_bersih)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
## $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
## $ pH : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
## $ DO : num [1:300] 5.98 5.72 4.89 6.13 7.79 ...
## $ BOD : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
## $ TSS : num [1:300] 43.1 44.3 49.7 41 48.1 ...
## $ Suhu : num [1:300] 26.8 27.7 26 29.7 26.4 ...
## $ Status: chr [1:300] "Tercemar Ringan" "Tercemar Ringan" "Tercemar Ringan" "Tercemar Ringan" ...
# 2. Ubah Status menjadi faktor
data_bersih$Status <- as.factor(data_bersih$Status)
# 3. Pilih hanya kolom yang digunakan
data_model <- data_bersih %>%
select(pH, DO, BOD, TSS, Suhu, Status)
# 4. Bagi data menjadi training (80%) dan testing (20%)
set.seed(123)
split <- createDataPartition(data_model$Status, p = 0.8, list = FALSE)
train_data <- data_model[split, ]
test_data <- data_model[-split, ]
# 5. Bangun model Decision Tree
model_tree <- rpart(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, method = "class")
pred_tree <- predict(model_tree, test_data, type = "class")
# 6. Bangun model Random Forest
model_rf <- randomForest(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, ntree = 200)
pred_rf <- predict(model_rf, test_data)
# 7. Bangun model SVM
model_svm <- svm(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, kernel = "radial")
pred_svm <- predict(model_svm, test_data)
# 8. Evaluasi hasil dengan confusion matrix
cat("=== Decision Tree ===\n")
## === Decision Tree ===
confusionMatrix(pred_tree, test_data$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar Berat Tercemar Ringan
## Baik 13 0 0
## Tercemar Berat 0 0 0
## Tercemar Ringan 1 1 44
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity 0.9286 0.00000 1.0000
## Specificity 1.0000 1.00000 0.8667
## Pos Pred Value 1.0000 NaN 0.9565
## Neg Pred Value 0.9783 0.98305 1.0000
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.2203 0.00000 0.7458
## Detection Prevalence 0.2203 0.00000 0.7797
## Balanced Accuracy 0.9643 0.50000 0.9333
cat("\n=== Random Forest ===\n")
##
## === Random Forest ===
confusionMatrix(pred_rf, test_data$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar Berat Tercemar Ringan
## Baik 13 0 1
## Tercemar Berat 0 0 0
## Tercemar Ringan 1 1 43
##
## Overall Statistics
##
## Accuracy : 0.9492
## 95% CI : (0.8585, 0.9894)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 4.59e-05
##
## Kappa : 0.8644
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity 0.9286 0.00000 0.9773
## Specificity 0.9778 1.00000 0.8667
## Pos Pred Value 0.9286 NaN 0.9556
## Neg Pred Value 0.9778 0.98305 0.9286
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.2203 0.00000 0.7288
## Detection Prevalence 0.2373 0.00000 0.7627
## Balanced Accuracy 0.9532 0.50000 0.9220
cat("\n=== SVM ===\n")
##
## === SVM ===
confusionMatrix(pred_svm, test_data$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar Berat Tercemar Ringan
## Baik 8 0 2
## Tercemar Berat 0 0 0
## Tercemar Ringan 6 1 42
##
## Overall Statistics
##
## Accuracy : 0.8475
## 95% CI : (0.7301, 0.9278)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.04475
##
## Kappa : 0.5519
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity 0.5714 0.00000 0.9545
## Specificity 0.9556 1.00000 0.5333
## Pos Pred Value 0.8000 NaN 0.8571
## Neg Pred Value 0.8776 0.98305 0.8000
## Prevalence 0.2373 0.01695 0.7458
## Detection Rate 0.1356 0.00000 0.7119
## Detection Prevalence 0.1695 0.00000 0.8305
## Balanced Accuracy 0.7635 0.50000 0.7439
# 9. Bandingkan akurasi ketiga model
acc_tree <- mean(pred_tree == test_data$Status)
acc_rf <- mean(pred_rf == test_data$Status)
acc_svm <- mean(pred_svm == test_data$Status)
cat("\nAkurasi Model:\n")
##
## Akurasi Model:
cat("Decision Tree :", round(acc_tree * 100, 2), "%\n")
## Decision Tree : 96.61 %
cat("Random Forest :", round(acc_rf * 100, 2), "%\n")
## Random Forest : 94.92 %
cat("SVM :", round(acc_svm * 100, 2), "%\n")
## SVM : 84.75 %
# 10. Simpan hasil model terbaik (misalnya Random Forest)
model_terbaik <- model_rf
# --- Soal 2B: Prediksi Status pada Data Testing ---
library(readxl)
library(dplyr)
library(openxlsx)
# 1. Baca data Testing (tanpa kolom Status)
file_path <- "kualitasair.xlsx"
data_test <- read_excel(file_path, sheet = "Testing")
# Pastikan kolomnya sama dengan data training
head(data_test)
## # A tibble: 6 × 6
## Lokasi pH DO BOD TSS Suhu
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 S301 7.00 5.08 3.18 50.9 25.6
## 2 S302 7.38 4.75 3.34 46.5 27.1
## 3 S303 7.02 6.59 3.00 NA 26.6
## 4 S304 7.37 NA 3.50 39.0 26.7
## 5 S305 6.93 6.24 3.34 47.2 23.4
## 6 S306 6.97 6.00 3.45 39.1 27.7
# 2. Gunakan model terbaik dari Soal 2 (misalnya Random Forest)
# Pastikan model_terbaik masih ada di environment
# model_terbaik <- model_rf # jika perlu di-assign ulang
# 3. Lakukan prediksi Status
prediksi_status <- predict(model_terbaik, data_test)
# 4. Tambahkan hasil prediksi ke data uji
hasil_testing <- data_test %>%
mutate(Prediksi_Status = prediksi_status)
# 5. Tampilkan hasil 75 baris pertama
head(hasil_testing, 75)
## # A tibble: 75 × 7
## Lokasi pH DO BOD TSS Suhu Prediksi_Status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 S301 7.00 5.08 3.18 50.9 25.6 Tercemar Ringan
## 2 S302 7.38 4.75 3.34 46.5 27.1 Tercemar Ringan
## 3 S303 7.02 6.59 3.00 NA 26.6 <NA>
## 4 S304 7.37 NA 3.50 39.0 26.7 <NA>
## 5 S305 6.93 6.24 3.34 47.2 23.4 Tercemar Ringan
## 6 S306 6.97 6.00 3.45 39.1 27.7 Tercemar Ringan
## 7 S307 7.24 4.67 3.40 45.9 29.9 Tercemar Ringan
## 8 S308 7.50 7.18 4.33 55.3 30.3 Tercemar Ringan
## 9 S309 6.38 5.41 2.16 52.4 32.1 Tercemar Ringan
## 10 S310 6.98 7.2 4.21 59.0 28.5 Tercemar Ringan
## # ℹ 65 more rows
# 6. Simpan hasil ke file baru (opsional)
write.xlsx(hasil_testing, "Hasil_Prediksi_Status.xlsx")
cat("\n✅ Hasil prediksi status kualitas air telah disimpan ke 'Hasil_Prediksi_Status.xlsx'\n")
##
## ✅ Hasil prediksi status kualitas air telah disimpan ke 'Hasil_Prediksi_Status.xlsx'
# 1. Panggil library
library(caret)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(dplyr)
# 2. Gunakan data bersih dari Soal 1
data_reg <- data_bersih %>%
select(pH, BOD, TSS, Suhu, DO)
# 3. Bagi data menjadi training (80%) dan testing (20%)
set.seed(123)
split <- createDataPartition(data_reg$DO, p = 0.8, list = FALSE)
train_reg <- data_reg[split, ]
test_reg <- data_reg[-split, ]
# 4. Model Regresi Linear
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_reg)
summary(model_lm)
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.45071 -0.51522 0.01395 0.63444 2.42459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.890e+00 1.313e+00 5.246 3.46e-07 ***
## pH -3.221e-02 1.258e-01 -0.256 0.7982
## BOD 1.280e-01 7.740e-02 1.654 0.0996 .
## TSS -9.605e-05 6.617e-03 -0.015 0.9884
## Suhu -3.801e-02 3.099e-02 -1.227 0.2211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9615 on 236 degrees of freedom
## Multiple R-squared: 0.01729, Adjusted R-squared: 0.0006292
## F-statistic: 1.038 on 4 and 236 DF, p-value: 0.3884
# Prediksi pada data uji
pred_lm <- predict(model_lm, newdata = test_reg)
# 5. Model Regresi Spline (GAM)
model_spline <- gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu), data = train_reg)
summary(model_spline)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.98149 0.06066 98.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(pH) 1.000 1.000 0.078 0.780
## s(BOD) 5.884 6.993 2.110 0.043 *
## s(TSS) 1.000 1.000 0.074 0.785
## s(Suhu) 1.000 1.000 2.566 0.111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0414 Deviance explained = 7.69%
## GCV = 0.92472 Scale est. = 0.88679 n = 241
# Prediksi pada data uji
pred_spline <- predict(model_spline, newdata = test_reg)
# 6. Evaluasi performa model
# Fungsi bantu evaluasi
eval_model <- function(actual, predicted) {
R2 <- cor(actual, predicted)^2
MSE <- mean((actual - predicted)^2)
RMSE <- sqrt(MSE)
return(data.frame(R2, MSE, RMSE))
}
eval_lm <- eval_model(test_reg$DO, pred_lm)
eval_spline <- eval_model(test_reg$DO, pred_spline)
cat("=== Evaluasi Model ===\n")
## === Evaluasi Model ===
print(data.frame(
Model = c("Regresi Linear", "Regresi Spline"),
R2 = c(eval_lm$R2, eval_spline$R2),
MSE = c(eval_lm$MSE, eval_spline$MSE),
RMSE = c(eval_lm$RMSE, eval_spline$RMSE)
))
## Model R2 MSE RMSE
## 1 Regresi Linear 0.04103781 0.8194383 0.9052283
## 2 Regresi Spline 0.01006464 0.7659931 0.8752103
# 7. Visualisasi hasil prediksi vs aktual
test_reg$Pred_LM <- pred_lm
test_reg$Pred_Spline <- pred_spline
# Plot Regresi Linear
ggplot(test_reg, aes(x = DO, y = Pred_LM)) +
geom_point(color = "skyblue") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Prediksi vs Aktual (Regresi Linear)", x = "Aktual DO", y = "Prediksi DO") +
theme_minimal()

# Plot Regresi Spline
ggplot(test_reg, aes(x = DO, y = Pred_Spline)) +
geom_point(color = "orange") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Prediksi vs Aktual (Regresi Spline)", x = "Aktual DO", y = "Prediksi DO") +
theme_minimal()

# 8. Interpretasi variabel berpengaruh (dari Regresi Linear)
cat("\n=== Koefisien Regresi Linear ===\n")
##
## === Koefisien Regresi Linear ===
print(summary(model_lm)$coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.889823e+00 1.313418800 5.24571701 3.461496e-07
## pH -3.220863e-02 0.125804229 -0.25602181 7.981570e-01
## BOD 1.279786e-01 0.077398172 1.65350891 9.955686e-02
## TSS -9.605302e-05 0.006616602 -0.01451697 9.884298e-01
## Suhu -3.801330e-02 0.030986889 -1.22675443 2.211374e-01
cat("\nInterpretasi:\n")
##
## Interpretasi:
cat("Nilai koefisien menunjukkan arah dan besar pengaruh variabel terhadap DO.\n")
## Nilai koefisien menunjukkan arah dan besar pengaruh variabel terhadap DO.
cat("- pH bertanda positif → semakin tinggi pH, DO cenderung meningkat.\n")
## - pH bertanda positif → semakin tinggi pH, DO cenderung meningkat.
cat("- BOD bertanda negatif → semakin tinggi BOD, DO menurun (karena oksigen dipakai mikroorganisme).\n")
## - BOD bertanda negatif → semakin tinggi BOD, DO menurun (karena oksigen dipakai mikroorganisme).
cat("- TSS bertanda negatif → padatan tersuspensi tinggi menurunkan DO.\n")
## - TSS bertanda negatif → padatan tersuspensi tinggi menurunkan DO.
cat("- Suhu bertanda negatif → suhu tinggi menurunkan kelarutan oksigen.\n")
## - Suhu bertanda negatif → suhu tinggi menurunkan kelarutan oksigen.