# Load library
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# 1. Read the data (first sheet of the workbook)
file_path <- "D:/UTS STATLING SMT 5/kualitasair.xlsx"
df <- read_excel(path = file_path, sheet = 1)
# 2. Standardise column names: lower-case, whitespace runs become "_"
names(df) <- str_replace_all(tolower(names(df)), "\\s+", "_")
# 3. Handle missing values (median imputation)
num_vars <- c("ph", "do", "bod", "tss", "suhu")
df[num_vars] <- lapply(df[num_vars], function(values) {
  # Force a numeric type first; anything that cannot be parsed becomes NA.
  values <- as.numeric(values)
  # Fill every gap with the median of the observed values of that column.
  values[is.na(values)] <- median(values, na.rm = TRUE)
  values
})
# 4. Standardise the Status categories
# Text labels are matched case-insensitively by keyword; the codes "1", "2"
# and "3" are mapped to the same three classes. Anything else becomes NA.
status_raw <- as.character(df$status)
status_lower <- tolower(status_raw)
status_clean <- case_when(
  str_detect(status_lower, "baik") ~ "Baik",
  str_detect(status_lower, "ringan") ~ "Tercemar_ringan",
  str_detect(status_lower, "berat") ~ "Tercemar_berat",
  status_raw == "1" ~ "Baik",
  status_raw == "2" ~ "Tercemar_ringan",
  status_raw == "3" ~ "Tercemar_berat",
  TRUE ~ NA_character_
)
# Fixed level order: Baik, then Tercemar_ringan, then Tercemar_berat.
df$status <- factor(
  status_clean,
  levels = c("Baik", "Tercemar_ringan", "Tercemar_berat")
)
# 5. Handle outliers (winsorize)
#' Winsorize a numeric vector at two quantile cut-offs
#'
#' Values below the lower quantile are raised to it and values above the
#' upper quantile are lowered to it. Missing values are ignored when the
#' quantiles are computed and are returned unchanged.
#'
#' @param x A numeric vector.
#' @param probs Length-2 numeric vector of lower and upper probabilities in
#'   [0, 1]. Defaults to the 1st and 99th percentiles.
#' @return A vector the same length as `x` with both tails clamped.
winsorize <- function(x, probs = c(0.01, 0.99)) {
  stopifnot(
    is.numeric(probs),
    length(probs) == 2L,
    !anyNA(probs),
    probs[1] <= probs[2]
  )
  # unname() drops the "1%"/"99%" labels so they cannot leak into the result.
  bounds <- unname(quantile(x, probs, na.rm = TRUE))
  # Clamp from below, then from above; NA entries stay NA.
  pmin(pmax(x, bounds[1]), bounds[2])
}
# Clamp the tails of every numeric variable with winsorize().
df[num_vars] <- lapply(df[num_vars], winsorize)
# 6. Descriptive statistics summary
desc <- psych::describe(df[num_vars])
print(desc)
## vars n mean sd median trimmed mad min max range skew kurtosis
## ph 1 300 6.99 0.49 6.99 6.99 0.49 5.78 8.11 2.33 -0.05 -0.26
## do 2 300 5.98 0.94 5.99 5.99 0.89 3.83 8.22 4.39 -0.11 -0.20
## bod 3 300 3.00 0.79 3.07 3.01 0.80 0.96 5.10 4.14 -0.04 0.10
## tss 4 300 49.68 9.11 49.52 49.74 8.24 27.71 72.23 44.52 -0.03 -0.06
## suhu 5 300 28.12 2.05 28.01 28.09 2.12 23.52 33.25 9.72 0.13 -0.27
## se
## ph 0.03
## do 0.05
## bod 0.05
## tss 0.53
## suhu 0.12
# Save the descriptive summary as CSV in the working directory
write.csv(
  x = desc,
  file = "ringkasan_deskriptif.csv",
  row.names = TRUE
)
Kesimpulan: Berdasarkan hasil analisis deskriptif setelah pembersihan data, imputasi nilai hilang, dan penanganan outlier, kualitas air secara umum berada pada kondisi baik hingga tercemar ringan. Nilai pH rata-rata sebesar 6,99 menunjukkan kondisi air yang netral, sedangkan kandungan oksigen terlarut (DO) rata-rata sebesar 5,98 mg/L mengindikasikan bahwa air masih mampu mendukung kehidupan organisme akuatik. Nilai BOD rata-rata sebesar 3,00 mg/L menunjukkan tingkat pencemaran organik yang masih dalam batas wajar. Total padatan tersuspensi (TSS) rata-rata sekitar 49,68 mg/L menunjukkan air agak keruh namun masih tergolong normal, dan suhu rata-rata 28,12°C sesuai dengan karakteristik perairan tropis. Secara keseluruhan, distribusi data yang relatif simetris (skewness mendekati nol pada semua variabel) dan tanpa pencilan ekstrem setelah winsorisasi menunjukkan hasil pengukuran yang stabil dan konsisten, sehingga kondisi kualitas air pada dataset ini dapat dikatakan relatif baik.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(readxl)
library(dplyr)
# Detect whether the workbook contains a "Testing" sheet
file_path <- "D:/UTS STATLING SMT 5/kualitasair.xlsx"
sheets <- excel_sheets(path = file_path)
message("Sheets found: ", paste(sheets, collapse = ", "))
## Sheets found: Training, Testing
# Read the test sheet as the hold-out data to be scored.
# `testset` starts as an empty data frame so that it always exists: when the
# workbook has no matching sheet, the later `nrow(testset) > 0` check takes its
# "no test data" branch instead of failing with "object 'testset' not found".
testset <- data.frame()
test_sheet_aliases <- c("testing", "test", "testset")
is_test_sheet <- tolower(sheets) %in% test_sheet_aliases
if (any(is_test_sheet)) {
  # Use the first matching sheet, compared case-insensitively.
  test_sheet <- sheets[is_test_sheet][1]
  testset <- read_excel(file_path, sheet = test_sheet)
  # Same column-name clean-up as the training data: lower-case, and
  # whitespace runs replaced by "_", so predictor names line up.
  names(testset) <- str_replace_all(tolower(names(testset)), "\\s+", "_")
  message("Ditemukan sheet testset: ", test_sheet)
}
## Ditemukan sheet testset: Testing
# Numeric predictors (same set as in the cleaning step)
num_vars <- c("ph", "do", "bod", "tss", "suhu")
# 1. Split into training and testing parts (80%:20%), partitioned on status
set.seed(123)
train_index <- createDataPartition(y = df$status, p = 0.8, list = FALSE)
# Rows not selected for training form the hold-out set.
test <- df[-train_index, ]
train <- df[train_index, ]
# --- MODEL 1: SVM (radial kernel) ---
svm_model <- svm(
  status ~ ph + do + bod + tss + suhu,
  data = train,
  kernel = "radial"
)
# Score the hold-out rows and compare with the observed status.
svm_pred <- predict(svm_model, newdata = test)
conf_svm <- confusionMatrix(data = svm_pred, reference = test$status)
conf_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 8 1 0
## Tercemar_ringan 6 43 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8644
## 95% CI : (0.7502, 0.9396)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.02096
##
## Kappa : 0.5913
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.5714 0.9773 0.00000
## Specificity 0.9778 0.5333 1.00000
## Pos Pred Value 0.8889 0.8600 NaN
## Neg Pred Value 0.8800 0.8889 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.1356 0.7288 0.00000
## Detection Prevalence 0.1525 0.8475 0.00000
## Balanced Accuracy 0.7746 0.7553 0.50000
# --- MODEL 2: Decision Tree (classification tree) ---
dt_model <- rpart(
  formula = status ~ ph + do + bod + tss + suhu,
  data = train,
  method = "class"
)
rpart.plot(dt_model)
# type = "class" returns predicted labels rather than class probabilities.
dt_pred <- predict(dt_model, newdata = test, type = "class")
conf_dt <- confusionMatrix(data = dt_pred, reference = test$status)
conf_dt
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 13 0 0
## Tercemar_ringan 1 44 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.9286 1.0000 0.00000
## Specificity 1.0000 0.8667 1.00000
## Pos Pred Value 1.0000 0.9565 NaN
## Neg Pred Value 0.9783 1.0000 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2203 0.7458 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.9643 0.9333 0.50000
# --- MODEL 3: Random Forest (300 trees) ---
# Seed is reset immediately before fitting so the forest is reproducible.
set.seed(123)
rf_model <- randomForest(
  status ~ ph + do + bod + tss + suhu,
  data = train,
  ntree = 300
)
rf_pred <- predict(rf_model, newdata = test)
conf_rf <- confusionMatrix(data = rf_pred, reference = test$status)
conf_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 13 0 0
## Tercemar_ringan 1 44 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.9286 1.0000 0.00000
## Specificity 1.0000 0.8667 1.00000
## Pos Pred Value 1.0000 0.9565 NaN
## Neg Pred Value 0.9783 1.0000 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2203 0.7458 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.9643 0.9333 0.50000
# Model evaluation: hold-out accuracy of the three classifiers side by side
conf_list <- list(conf_svm, conf_dt, conf_rf)
acc_tab <- data.frame(
  Model = c("SVM", "Decision Tree", "Random Forest"),
  # Pull the "Accuracy" entry out of each confusionMatrix result.
  Accuracy = vapply(
    conf_list,
    function(cm) cm$overall[["Accuracy"]],
    numeric(1)
  )
)
print(acc_tab)
## Model Accuracy
## 1 SVM 0.8644068
## 2 Decision Tree 0.9661017
## 3 Random Forest 0.9661017
# Best model (usually the random forest). The choice is hard-coded here; it is
# not derived from acc_tab.
best_model <- rf_model
# Predict the TESTSET rows and save the result to a CSV file.
# Guard against `testset` never having been created (no test sheet found) so
# the "no test data" branch is reachable instead of raising an error.
has_testset <- exists("testset") && is.data.frame(testset) && nrow(testset) > 0
if (has_testset) {
  # Fail early with a clear message if a predictor column is absent.
  missing_cols <- setdiff(num_vars, names(testset))
  if (length(missing_cols) > 0) {
    stop(
      "Kolom prediktor tidak ditemukan di testset: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  # Handle missing values in the test set
  for (col in num_vars) {
    # Same numeric coercion as the training data, so a text-typed column
    # does not break median() or predict().
    testset[[col]] <- as.numeric(testset[[col]])
    if (anyNA(testset[[col]])) {
      # Impute with the median of the observed test-set values of the column.
      median_val <- median(testset[[col]], na.rm = TRUE)
      testset[[col]][is.na(testset[[col]])] <- median_val
      message("Imputasi nilai NA pada kolom ", col, " dengan median: ", round(median_val, 3))
    }
  }
  message("Memprediksi ", nrow(testset), " baris data testset...")
  testset_pred <- predict(best_model, newdata = testset)
  hasil_prediksi <- testset %>% mutate(Status_Prediksi_RF = testset_pred)
  write.csv(hasil_prediksi, "Prediksi_Status_Testset.csv", row.names = FALSE)
  message("File 'Prediksi_Status_Testset.csv' berhasil disimpan di folder kerja: ", getwd())
  print(head(hasil_prediksi, 10))
} else {
  message("Tidak ada data testset yang bisa diprediksi.")
}
## Imputasi nilai NA pada kolom do dengan median: 5.644
## Imputasi nilai NA pada kolom bod dengan median: 3.062
## Imputasi nilai NA pada kolom tss dengan median: 49.569
## Memprediksi 75 baris data testset...
## File 'Prediksi_Status_Testset.csv' berhasil disimpan di folder kerja: C:/Users/ASUS/OneDrive/Documents/STATLING SMT 5
## # A tibble: 10 × 7
## lokasi ph do bod tss suhu Status_Prediksi_RF
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 S301 7.00 5.08 3.18 50.9 25.6 Tercemar_ringan
## 2 S302 7.38 4.75 3.34 46.5 27.1 Tercemar_ringan
## 3 S303 7.02 6.59 3.00 49.6 26.6 Baik
## 4 S304 7.37 5.64 3.50 39.0 26.7 Tercemar_ringan
## 5 S305 6.93 6.24 3.34 47.2 23.4 Tercemar_ringan
## 6 S306 6.97 6.00 3.45 39.1 27.7 Tercemar_ringan
## 7 S307 7.24 4.67 3.40 45.9 29.9 Tercemar_ringan
## 8 S308 7.50 7.18 4.33 55.3 30.3 Tercemar_ringan
## 9 S309 6.38 5.41 2.16 52.4 32.1 Tercemar_ringan
## 10 S310 6.98 7.2 4.21 59.0 28.5 Tercemar_ringan
# Show the confusion matrix of the best model (the random forest, conf_rf)
print(conf_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 13 0 0
## Tercemar_ringan 1 44 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.9286 1.0000 0.00000
## Specificity 1.0000 0.8667 1.00000
## Pos Pred Value 1.0000 0.9565 NaN
## Neg Pred Value 0.9783 1.0000 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2203 0.7458 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.9643 0.9333 0.50000
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(splines)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# 1. Prepare the regression data: four predictors plus the response `do`
reg_data <- df[, c("ph", "bod", "tss", "suhu", "do")]
# 80/20 split partitioned on the response, with a fixed seed.
set.seed(123)
idx <- createDataPartition(y = reg_data$do, p = 0.8, list = FALSE)
test_reg <- reg_data[-idx, ]
train_reg <- reg_data[idx, ]
# --- MODEL 1: Linear regression of DO on pH, BOD, TSS and temperature ---
lm_mod <- lm(
  formula = do ~ ph + bod + tss + suhu,
  data = train_reg
)
summary(lm_mod)
##
## Call:
## lm(formula = do ~ ph + bod + tss + suhu, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24584 -0.52396 0.01717 0.60099 2.23837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.634255 1.278076 5.191 4.51e-07 ***
## ph -0.041133 0.126065 -0.326 0.7445
## bod 0.148641 0.078857 1.885 0.0607 .
## tss 0.001173 0.006569 0.178 0.8585
## suhu -0.031401 0.029548 -1.063 0.2890
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9517 on 236 degrees of freedom
## Multiple R-squared: 0.01934, Adjusted R-squared: 0.002722
## F-statistic: 1.164 on 4 and 236 DF, p-value: 0.3275
# Hold-out predictions and error metrics for the linear model.
lm_pred <- predict(lm_mod, newdata = test_reg)
# NOTE(review): caret::R2 is called with its default `formula` argument;
# confirm in the caret docs which R-squared definition that is.
r2_lm <- caret::R2(pred = lm_pred, obs = test_reg$do)
mse_lm <- Metrics::mse(actual = test_reg$do, predicted = lm_pred)
rmse_lm <- Metrics::rmse(actual = test_reg$do, predicted = lm_pred)
# --- MODEL 2: Spline regression (natural cubic spline, 3 df per predictor) ---
spline_mod <- lm(
  formula = do ~ ns(ph, 3) + ns(bod, 3) + ns(tss, 3) + ns(suhu, 3),
  data = train_reg
)
summary(spline_mod)
##
## Call:
## lm(formula = do ~ ns(ph, 3) + ns(bod, 3) + ns(tss, 3) + ns(suhu,
## 3), data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.43100 -0.47558 0.00463 0.60943 2.48179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.41192 0.65383 8.277 1.06e-14 ***
## ns(ph, 3)1 -0.17334 0.26284 -0.659 0.51025
## ns(ph, 3)2 0.43443 0.74165 0.586 0.55862
## ns(ph, 3)3 0.12737 0.32764 0.389 0.69782
## ns(bod, 3)1 0.10585 0.27083 0.391 0.69627
## ns(bod, 3)2 2.12593 0.87039 2.442 0.01535 *
## ns(bod, 3)3 1.02690 0.38099 2.695 0.00755 **
## ns(tss, 3)1 0.21930 0.26701 0.821 0.41232
## ns(tss, 3)2 -0.03129 0.67385 -0.046 0.96301
## ns(tss, 3)3 -0.17376 0.34710 -0.501 0.61713
## ns(suhu, 3)1 -0.30075 0.26282 -1.144 0.25369
## ns(suhu, 3)2 -0.96415 0.68567 -1.406 0.16105
## ns(suhu, 3)3 -0.23214 0.33848 -0.686 0.49351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9533 on 228 degrees of freedom
## Multiple R-squared: 0.04933, Adjusted R-squared: -0.0007017
## F-statistic: 0.986 on 12 and 228 DF, p-value: 0.4631
# Hold-out predictions and error metrics for the spline model.
spline_pred <- predict(spline_mod, newdata = test_reg)
r2_spline <- caret::R2(pred = spline_pred, obs = test_reg$do)
mse_spline <- Metrics::mse(actual = test_reg$do, predicted = spline_pred)
rmse_spline <- Metrics::rmse(actual = test_reg$do, predicted = spline_pred)
# 2. Performance evaluation: one line per model, metrics rounded to 3 decimals
cat(
  "Linear Regression -> R²:", round(r2_lm, 3),
  "MSE:", round(mse_lm, 3),
  "RMSE:", round(rmse_lm, 3),
  "\n"
)
## Linear Regression -> R²: 0.053 MSE: 0.839 RMSE: 0.916
cat(
  "Spline Regression -> R²:", round(r2_spline, 3),
  "MSE:", round(mse_spline, 3),
  "RMSE:", round(rmse_spline, 3),
  "\n"
)
## Spline Regression -> R²: 0.003 MSE: 0.855 RMSE: 0.925
# 3. Visualise predicted vs actual DO on the hold-out set
# The dashed line is y = x; points on it would be perfect predictions.
lm_plot_data <- data.frame(Aktual = test_reg$do, Prediksi = lm_pred)
spline_plot_data <- data.frame(Aktual = test_reg$do, Prediksi = spline_pred)

p1 <- ggplot(lm_plot_data, aes(x = Aktual, y = Prediksi)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  ggtitle("Linear Regression: DO Prediksi vs Aktual")

p2 <- ggplot(spline_plot_data, aes(x = Aktual, y = Prediksi)) +
  geom_point(color = "red") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  ggtitle("Spline Regression: DO Prediksi vs Aktual")

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)
# 4. Save the DO predictions (actual value next to both model predictions)
prediksi_DO <- data.frame(
  Aktual = test_reg$do,
  Prediksi_Linear = lm_pred,
  Prediksi_Spline = spline_pred
)
write.csv(
  x = prediksi_DO,
  file = "prediksi_do_test.csv",
  row.names = FALSE
)
head(prediksi_DO)
## Aktual Prediksi_Linear Prediksi_Spline
## 1 5.9909 5.781970 5.661984
## 2 5.7236 5.753254 5.524390
## 3 4.9232 5.894591 5.957204
## 4 5.8043 6.114622 6.410521
## 5 6.4420 5.722684 5.899569
## 6 5.1307 5.971641 5.946668
Kesimpulan: Berdasarkan hasil evaluasi model regresi pada data uji, nilai R² untuk Linear Regression sebesar 0,053 dan untuk Spline Regression sebesar 0,003, yang menunjukkan bahwa kedua model hanya mampu menjelaskan sebagian sangat kecil dari variasi nilai DO. Nilai MSE (0,839 dan 0,855) serta RMSE (0,916 dan 0,925) juga tergolong tinggi: RMSE hampir sama besar dengan simpangan baku DO (0,94), sehingga selisih antara nilai prediksi dan nilai aktual masih besar dan prediksi model tidak jauh lebih baik daripada sekadar menggunakan nilai rata-rata. Dengan demikian, dapat disimpulkan bahwa baik model regresi linear maupun spline belum mampu memprediksi nilai DO (Dissolved Oxygen) dengan akurasi yang baik, sehingga diperlukan pemilihan variabel atau pendekatan model lain agar hasil prediksi menjadi lebih representatif.