# Load Library
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(broom)
library(openxlsx)
Soal 1: Data Cleaning dan Eksplorasi
# Baca Data Training & Testing
train_data <- read_excel("D:/File Tugas Kuliah/TUGAS MATKUL SEMESTER 5/STATISTIKA LINGKUNGAN/UTS/kualitasair.xlsx", sheet = "Training")
test_data <- read_excel("D:/File Tugas Kuliah/TUGAS MATKUL SEMESTER 5/STATISTIKA LINGKUNGAN/UTS/kualitasair.xlsx", sheet = "Testing")
colSums(is.na(train_data))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
colSums(is.na(test_data))
## Lokasi pH DO BOD TSS Suhu
## 0 0 8 9 7 0
# Pembersihan Data
clean_data <- function(df, is_training = TRUE) {
# Tangani Missing Value hanya kolom numerik
df <- df %>%
mutate(across(c("pH", "DO", "BOD", "TSS", "Suhu"), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Tangani Outlier untuk kolom numerik
numeric_cols <- c("pH", "DO", "BOD", "TSS", "Suhu")
for (col in numeric_cols) {
Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
batas_bawah <- Q1 - 1.5 * IQR
batas_atas <- Q3 + 1.5 * IQR
df[[col]] <- ifelse(df[[col]] < batas_bawah, batas_bawah,
ifelse(df[[col]] > batas_atas, batas_atas, df[[col]]))
}
# Standarisasi Status hanya jika data training
if(is_training & "Status" %in% names(df)) {
df <- df %>%
mutate(Status = str_to_lower(Status)) %>%
mutate(Status = case_when(
str_detect(Status, "baik") ~ "Baik",
str_detect(Status, "ringan") ~ "Tercemar ringan",
str_detect(Status, "berat") ~ "Tercemar berat",
TRUE ~ "Tidak diketahui"
)) %>%
mutate(Status = factor(Status, levels = c("Baik", "Tercemar ringan", "Tercemar berat")))
}
return(df)
}
train_data <- clean_data(train_data, is_training = TRUE)
test_data <- clean_data(test_data, is_training = FALSE)
colSums(is.na(train_data))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 0 0 0 0 0
colSums(is.na(test_data))
## Lokasi pH DO BOD TSS Suhu
## 0 0 0 0 0 0
# Statistik Deskriptif
cat("\n=== Statistik Deskriptif Data Training ===\n")
##
## === Statistik Deskriptif Data Training ===
print(summary(train_data))
## Lokasi pH DO BOD
## Length:300 Min. :5.697 Min. :3.615 Min. :0.8513
## Class :character 1st Qu.:6.670 1st Qu.:5.413 1st Qu.:2.4599
## Mode :character Median :6.988 Median :5.991 Median :3.0661
## Mean :6.990 Mean :5.977 Mean :3.0041
## 3rd Qu.:7.318 3rd Qu.:6.611 3rd Qu.:3.5323
## Max. :8.290 Max. :8.409 Max. :5.1409
## TSS Suhu Status
## Min. :27.28 Min. :22.77 Baik : 72
## 1st Qu.:44.28 1st Qu.:26.62 Tercemar ringan:221
## Median :49.52 Median :28.01 Tercemar berat : 7
## Mean :49.68 Mean :28.12
## 3rd Qu.:55.62 3rd Qu.:29.46
## Max. :72.62 Max. :33.73
cat("\n=== Statistik Deskriptif Data Testing ===\n")
##
## === Statistik Deskriptif Data Testing ===
print(summary(test_data))
## Lokasi pH DO BOD
## Length:75 Min. :5.514 Min. :4.005 Min. :1.368
## Class :character 1st Qu.:6.615 1st Qu.:5.354 1st Qu.:2.574
## Mode :character Median :6.965 Median :5.644 Median :3.062
## Mean :6.972 Mean :5.801 Mean :3.034
## 3rd Qu.:7.349 3rd Qu.:6.253 3rd Qu.:3.378
## Max. :8.449 Max. :7.602 Max. :4.584
## TSS Suhu
## Min. :24.95 Min. :23.39
## 1st Qu.:43.35 1st Qu.:26.79
## Median :49.57 Median :28.28
## Mean :49.16 Mean :28.37
## 3rd Qu.:55.62 3rd Qu.:29.88
## Max. :74.02 Max. :34.52
Soal 2 : Klasifikasi Status Kualitas Air
# ===========================
# 6️⃣ SOAL 3: Klasifikasi Status (Training)
# ===========================
predictors <- c("pH", "DO", "BOD", "TSS", "Suhu")
# SVM
svm_model <- svm(Status ~ ., data = train_data[, c(predictors, "Status")])
svm_pred <- predict(svm_model, newdata = train_data[, predictors])
# Decision Tree
tree_model <- rpart(Status ~ ., data = train_data[, c(predictors, "Status")], method = "class")
tree_pred <- predict(tree_model, newdata = train_data[, predictors], type = "class")
# Random Forest
rf_model <- randomForest(Status ~ ., data = train_data[, c(predictors, "Status")])
rf_pred <- predict(rf_model, newdata = train_data[, predictors])
# Confusion Matrix & Akurasi
cat("\n=== Confusion Matrix & Akurasi SVM ===\n")
##
## === Confusion Matrix & Akurasi SVM ===
print(confusionMatrix(svm_pred, train_data$Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 55 2 0
## Tercemar ringan 17 219 5
## Tercemar berat 0 0 2
##
## Overall Statistics
##
## Accuracy : 0.92
## 95% CI : (0.8833, 0.9481)
## No Information Rate : 0.7367
## P-Value [Acc > NIR] : 6.689e-16
##
## Kappa : 0.7793
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 0.7639 0.9910 0.285714
## Specificity 0.9912 0.7215 1.000000
## Pos Pred Value 0.9649 0.9087 1.000000
## Neg Pred Value 0.9300 0.9661 0.983221
## Prevalence 0.2400 0.7367 0.023333
## Detection Rate 0.1833 0.7300 0.006667
## Detection Prevalence 0.1900 0.8033 0.006667
## Balanced Accuracy 0.8776 0.8562 0.642857
cat("\n=== Confusion Matrix & Akurasi Decision Tree ===\n")
##
## === Confusion Matrix & Akurasi Decision Tree ===
print(confusionMatrix(tree_pred, train_data$Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 66 1 0
## Tercemar ringan 6 217 3
## Tercemar berat 0 3 4
##
## Overall Statistics
##
## Accuracy : 0.9567
## 95% CI : (0.927, 0.9767)
## No Information Rate : 0.7367
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8891
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 0.9167 0.9819 0.57143
## Specificity 0.9956 0.8861 0.98976
## Pos Pred Value 0.9851 0.9602 0.57143
## Neg Pred Value 0.9742 0.9459 0.98976
## Prevalence 0.2400 0.7367 0.02333
## Detection Rate 0.2200 0.7233 0.01333
## Detection Prevalence 0.2233 0.7533 0.02333
## Balanced Accuracy 0.9561 0.9340 0.78059
cat("\n=== Confusion Matrix & Akurasi Random Forest ===\n")
##
## === Confusion Matrix & Akurasi Random Forest ===
print(confusionMatrix(rf_pred, train_data$Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar ringan Tercemar berat
## Baik 72 0 0
## Tercemar ringan 0 221 0
## Tercemar berat 0 0 7
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9878, 1)
## No Information Rate : 0.7367
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity 1.00 1.0000 1.00000
## Specificity 1.00 1.0000 1.00000
## Pos Pred Value 1.00 1.0000 1.00000
## Neg Pred Value 1.00 1.0000 1.00000
## Prevalence 0.24 0.7367 0.02333
## Detection Rate 0.24 0.7367 0.02333
## Detection Prevalence 0.24 0.7367 0.02333
## Balanced Accuracy 1.00 1.0000 1.00000
# Interpretasi akurasi
cm_svm <- confusionMatrix(svm_pred, train_data$Status)
cm_tree <- confusionMatrix(tree_pred, train_data$Status)
cm_rf <- confusionMatrix(rf_pred, train_data$Status)
# Ringkasan Interpretasi hasil
interpret_akurasi_singkat <- function(cm, model_name){
acc <- cm$overall["Accuracy"]
cat("\nModel:", model_name, "\n")
cat(sprintf("Accuracy: %.2f%% - ", acc*100))
if(acc > 0.95){
cat("Sangat baik\n")
} else if(acc > 0.85){
cat("Baik\n")
} else if(acc > 0.7){
cat("Cukup\n")
} else {
cat("Kurang\n")
}
}
interpret_akurasi_singkat(cm_svm, "SVM")
##
## Model: SVM
## Accuracy: 92.00% - Baik
interpret_akurasi_singkat(cm_tree, "Decision Tree")
##
## Model: Decision Tree
## Accuracy: 95.67% - Sangat baik
interpret_akurasi_singkat(cm_rf, "Random Forest")
##
## Model: Random Forest
## Accuracy: 100.00% - Sangat baik
SOAL 3: Prediksi Variabel D
# Model Linear
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_data)
train_data$pred_linear <- predict(model_lm, newdata = train_data)
# Model Spline
model_spline <- lm(DO ~ bs(pH, df = 4) + bs(BOD, df = 4) + bs(TSS, df = 4) + bs(Suhu, df = 4),
data = train_data)
train_data$pred_spline <- predict(model_spline, newdata = train_data)
# Evaluasi performa
mse <- function(y, yhat) mean((y - yhat)^2)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
metrics <- tibble(
Model = c("Regresi Linear", "Regresi Spline"),
R2 = c(r2(train_data$DO, train_data$pred_linear),
r2(train_data$DO, train_data$pred_spline)),
MSE = c(mse(train_data$DO, train_data$pred_linear),
mse(train_data$DO, train_data$pred_spline)),
RMSE = c(rmse(train_data$DO, train_data$pred_linear),
rmse(train_data$DO, train_data$pred_spline))
)
print(metrics)
## # A tibble: 2 × 4
## Model R2 MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Regresi Linear 0.00600 0.886 0.941
## 2 Regresi Spline 0.0566 0.841 0.917
# Visualisasi
ggplot(train_data, aes(x = DO, y = pred_linear)) +
geom_point(color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Prediksi DO vs Aktual (Linear, Training)", x = "DO Aktual", y = "DO Prediksi") +
theme_minimal()

ggplot(train_data, aes(x = DO, y = pred_spline)) +
geom_point(color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Prediksi DO vs Aktual (Spline, Training)", x = "DO Aktual", y = "DO Prediksi") +
theme_minimal()

# Variabel paling berpengaruh
coef_df <- tidy(model_lm) %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
rename(Variabel = term, Koefisien = estimate)
print(coef_df)
## # A tibble: 4 × 5
## Variabel Koefisien std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 BOD 0.0794 0.0693 1.15 0.253
## 2 Suhu -0.0203 0.0267 -0.759 0.448
## 3 pH -0.0152 0.112 -0.136 0.892
## 4 TSS 0.000214 0.00601 0.0356 0.972
# Buat interpretasi naratif otomatis
interpretasi <- function(df){
cat("=== Interpretasi Pengaruh Variabel terhadap DO ===\n\n")
for(i in 1:nrow(df)){
var <- df$Variabel[i]
koef <- df$Koefisien[i]
pval <- df$p.value[i]
arah <- if(koef > 0) "meningkatkan" else "menurunkan"
signif <- if(pval < 0.05) "signifikan" else "tidak signifikan"
cat(sprintf("Variabel %s memiliki koefisien %.4f, artinya kenaikan %s cenderung %s DO, dan secara statistik %s.\n",
var, koef, var, arah, signif))
}
}
# Terapkan ke dataframe koefisien
interpretasi(coef_df)
## === Interpretasi Pengaruh Variabel terhadap DO ===
##
## Variabel BOD memiliki koefisien 0.0794, artinya kenaikan BOD cenderung meningkatkan DO, dan secara statistik tidak signifikan.
## Variabel Suhu memiliki koefisien -0.0203, artinya kenaikan Suhu cenderung menurunkan DO, dan secara statistik tidak signifikan.
## Variabel pH memiliki koefisien -0.0152, artinya kenaikan pH cenderung menurunkan DO, dan secara statistik tidak signifikan.
## Variabel TSS memiliki koefisien 0.0002, artinya kenaikan TSS cenderung meningkatkan DO, dan secara statistik tidak signifikan.
Prediksi Status di Data Testing
# Variabel prediktor
predictors <- c("pH", "DO", "BOD", "TSS", "Suhu")
# Gunakan model yang sudah dilatih di data training
test_data$Status_pred_svm <- predict(svm_model, newdata = test_data[, predictors])
test_data$Status_pred_tree <- predict(tree_model, newdata = test_data[, predictors], type = "class")
test_data$Status_pred_rf <- predict(rf_model, newdata = test_data[, predictors])
# Tampilkan Prediksi
head(test_data[, c(predictors, "Status_pred_svm", "Status_pred_tree", "Status_pred_rf")])
## # A tibble: 6 × 8
## pH DO BOD TSS Suhu Status_pred_svm Status_pred_tree Status_pred_rf
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 7.00 5.08 3.18 50.9 25.6 Tercemar ringan Tercemar ringan Tercemar ringan
## 2 7.38 4.75 3.34 46.5 27.1 Tercemar ringan Tercemar ringan Tercemar ringan
## 3 7.02 6.59 3.00 49.6 26.6 Baik Baik Baik
## 4 7.37 5.64 3.50 39.0 26.7 Tercemar ringan Tercemar ringan Tercemar ringan
## 5 6.93 6.24 3.34 47.2 23.4 Tercemar ringan Tercemar ringan Tercemar ringan
## 6 6.97 6.00 3.45 39.1 27.7 Tercemar ringan Tercemar ringan Tercemar ringan
# Simpan hasil prediksi ke Excel
write.xlsx(test_data, "C:/Users/Fahsa MPN/Downloads/hasil_prediksi_testing.xlsx")