# Load Library
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(splines)
library(e1071)        
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(rpart)        
library(randomForest) 
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)        
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggplot2)
library(broom)
library(openxlsx)

Soal 1: Data Cleaning dan Eksplorasi

# Baca Data Training & Testing
train_data <- read_excel("D:/File Tugas Kuliah/TUGAS MATKUL SEMESTER 5/STATISTIKA LINGKUNGAN/UTS/kualitasair.xlsx", sheet = "Training")
test_data  <- read_excel("D:/File Tugas Kuliah/TUGAS MATKUL SEMESTER 5/STATISTIKA LINGKUNGAN/UTS/kualitasair.xlsx", sheet = "Testing")
colSums(is.na(train_data))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0     23     22     24      0      0
colSums(is.na(test_data))
## Lokasi     pH     DO    BOD    TSS   Suhu 
##      0      0      8      9      7      0
# Pembersihan Data
clean_data <- function(df, is_training = TRUE) {
  # Tangani Missing Value hanya kolom numerik
  df <- df %>%
    mutate(across(c("pH", "DO", "BOD", "TSS", "Suhu"), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
  
  # Tangani Outlier untuk kolom numerik
  numeric_cols <- c("pH", "DO", "BOD", "TSS", "Suhu")
  for (col in numeric_cols) {
    Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
    Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
    IQR <- Q3 - Q1
    batas_bawah <- Q1 - 1.5 * IQR
    batas_atas <- Q3 + 1.5 * IQR
    df[[col]] <- ifelse(df[[col]] < batas_bawah, batas_bawah,
                        ifelse(df[[col]] > batas_atas, batas_atas, df[[col]]))
  }
  
  # Standarisasi Status hanya jika data training
  if(is_training & "Status" %in% names(df)) {
    df <- df %>%
      mutate(Status = str_to_lower(Status)) %>%
      mutate(Status = case_when(
        str_detect(Status, "baik") ~ "Baik",
        str_detect(Status, "ringan") ~ "Tercemar ringan",
        str_detect(Status, "berat") ~ "Tercemar berat",
        TRUE ~ "Tidak diketahui"
      )) %>%
      mutate(Status = factor(Status, levels = c("Baik", "Tercemar ringan", "Tercemar berat")))
  }
  
  return(df)
}
train_data <- clean_data(train_data, is_training = TRUE)
test_data  <- clean_data(test_data, is_training = FALSE)
colSums(is.na(train_data))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0      0      0      0      0      0
colSums(is.na(test_data))
## Lokasi     pH     DO    BOD    TSS   Suhu 
##      0      0      0      0      0      0
# Statistik Deskriptif
cat("\n=== Statistik Deskriptif Data Training ===\n")
## 
## === Statistik Deskriptif Data Training ===
print(summary(train_data))
##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.697   Min.   :3.615   Min.   :0.8513  
##  Class :character   1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.4599  
##  Mode  :character   Median :6.988   Median :5.991   Median :3.0661  
##                     Mean   :6.990   Mean   :5.977   Mean   :3.0041  
##                     3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.5323  
##                     Max.   :8.290   Max.   :8.409   Max.   :5.1409  
##       TSS             Suhu                   Status   
##  Min.   :27.28   Min.   :22.77   Baik           : 72  
##  1st Qu.:44.28   1st Qu.:26.62   Tercemar ringan:221  
##  Median :49.52   Median :28.01   Tercemar berat :  7  
##  Mean   :49.68   Mean   :28.12                        
##  3rd Qu.:55.62   3rd Qu.:29.46                        
##  Max.   :72.62   Max.   :33.73
cat("\n=== Statistik Deskriptif Data Testing ===\n")
## 
## === Statistik Deskriptif Data Testing ===
print(summary(test_data))
##     Lokasi                pH              DO             BOD       
##  Length:75          Min.   :5.514   Min.   :4.005   Min.   :1.368  
##  Class :character   1st Qu.:6.615   1st Qu.:5.354   1st Qu.:2.574  
##  Mode  :character   Median :6.965   Median :5.644   Median :3.062  
##                     Mean   :6.972   Mean   :5.801   Mean   :3.034  
##                     3rd Qu.:7.349   3rd Qu.:6.253   3rd Qu.:3.378  
##                     Max.   :8.449   Max.   :7.602   Max.   :4.584  
##       TSS             Suhu      
##  Min.   :24.95   Min.   :23.39  
##  1st Qu.:43.35   1st Qu.:26.79  
##  Median :49.57   Median :28.28  
##  Mean   :49.16   Mean   :28.37  
##  3rd Qu.:55.62   3rd Qu.:29.88  
##  Max.   :74.02   Max.   :34.52

Soal 2 : Klasifikasi Status Kualitas Air

# ===========================
# 6️⃣ SOAL 3: Klasifikasi Status (Training)
# ===========================
predictors <- c("pH", "DO", "BOD", "TSS", "Suhu")

# SVM
svm_model <- svm(Status ~ ., data = train_data[, c(predictors, "Status")])
svm_pred <- predict(svm_model, newdata = train_data[, predictors])

# Decision Tree
tree_model <- rpart(Status ~ ., data = train_data[, c(predictors, "Status")], method = "class")
tree_pred <- predict(tree_model, newdata = train_data[, predictors], type = "class")

# Random Forest
rf_model <- randomForest(Status ~ ., data = train_data[, c(predictors, "Status")])
rf_pred <- predict(rf_model, newdata = train_data[, predictors])

# Confusion Matrix & Akurasi
cat("\n=== Confusion Matrix & Akurasi SVM ===\n")
## 
## === Confusion Matrix & Akurasi SVM ===
print(confusionMatrix(svm_pred, train_data$Status))
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              55               2              0
##   Tercemar ringan   17             219              5
##   Tercemar berat     0               0              2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.92            
##                  95% CI : (0.8833, 0.9481)
##     No Information Rate : 0.7367          
##     P-Value [Acc > NIR] : 6.689e-16       
##                                           
##                   Kappa : 0.7793          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.7639                 0.9910              0.285714
## Specificity               0.9912                 0.7215              1.000000
## Pos Pred Value            0.9649                 0.9087              1.000000
## Neg Pred Value            0.9300                 0.9661              0.983221
## Prevalence                0.2400                 0.7367              0.023333
## Detection Rate            0.1833                 0.7300              0.006667
## Detection Prevalence      0.1900                 0.8033              0.006667
## Balanced Accuracy         0.8776                 0.8562              0.642857
cat("\n=== Confusion Matrix & Akurasi Decision Tree ===\n")
## 
## === Confusion Matrix & Akurasi Decision Tree ===
print(confusionMatrix(tree_pred, train_data$Status))
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              66               1              0
##   Tercemar ringan    6             217              3
##   Tercemar berat     0               3              4
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9567         
##                  95% CI : (0.927, 0.9767)
##     No Information Rate : 0.7367         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8891         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.9167                 0.9819               0.57143
## Specificity               0.9956                 0.8861               0.98976
## Pos Pred Value            0.9851                 0.9602               0.57143
## Neg Pred Value            0.9742                 0.9459               0.98976
## Prevalence                0.2400                 0.7367               0.02333
## Detection Rate            0.2200                 0.7233               0.01333
## Detection Prevalence      0.2233                 0.7533               0.02333
## Balanced Accuracy         0.9561                 0.9340               0.78059
cat("\n=== Confusion Matrix & Akurasi Random Forest ===\n")
## 
## === Confusion Matrix & Akurasi Random Forest ===
print(confusionMatrix(rf_pred, train_data$Status))
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              72               0              0
##   Tercemar ringan    0             221              0
##   Tercemar berat     0               0              7
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9878, 1)
##     No Information Rate : 0.7367     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity                 1.00                 1.0000               1.00000
## Specificity                 1.00                 1.0000               1.00000
## Pos Pred Value              1.00                 1.0000               1.00000
## Neg Pred Value              1.00                 1.0000               1.00000
## Prevalence                  0.24                 0.7367               0.02333
## Detection Rate              0.24                 0.7367               0.02333
## Detection Prevalence        0.24                 0.7367               0.02333
## Balanced Accuracy           1.00                 1.0000               1.00000
# Interpretasi akurasi
cm_svm   <- confusionMatrix(svm_pred, train_data$Status)
cm_tree  <- confusionMatrix(tree_pred, train_data$Status)
cm_rf    <- confusionMatrix(rf_pred, train_data$Status)

# Ringkasan Interpretasi hasil
interpret_akurasi_singkat <- function(cm, model_name){
  acc <- cm$overall["Accuracy"]
  
  cat("\nModel:", model_name, "\n")
  cat(sprintf("Accuracy: %.2f%% - ", acc*100))
  
  if(acc > 0.95){
    cat("Sangat baik\n")
  } else if(acc > 0.85){
    cat("Baik\n")
  } else if(acc > 0.7){
    cat("Cukup\n")
  } else {
    cat("Kurang\n")
  }
}
interpret_akurasi_singkat(cm_svm, "SVM")
## 
## Model: SVM 
## Accuracy: 92.00% - Baik
interpret_akurasi_singkat(cm_tree, "Decision Tree")
## 
## Model: Decision Tree 
## Accuracy: 95.67% - Sangat baik
interpret_akurasi_singkat(cm_rf, "Random Forest")
## 
## Model: Random Forest 
## Accuracy: 100.00% - Sangat baik

SOAL 3: Prediksi Variabel D

# Model Linear
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_data)
train_data$pred_linear <- predict(model_lm, newdata = train_data)

# Model Spline
model_spline <- lm(DO ~ bs(pH, df = 4) + bs(BOD, df = 4) + bs(TSS, df = 4) + bs(Suhu, df = 4),
                   data = train_data)
train_data$pred_spline <- predict(model_spline, newdata = train_data)

# Evaluasi performa
mse <- function(y, yhat) mean((y - yhat)^2)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

metrics <- tibble(
  Model = c("Regresi Linear", "Regresi Spline"),
  R2 = c(r2(train_data$DO, train_data$pred_linear),
         r2(train_data$DO, train_data$pred_spline)),
  MSE = c(mse(train_data$DO, train_data$pred_linear),
          mse(train_data$DO, train_data$pred_spline)),
  RMSE = c(rmse(train_data$DO, train_data$pred_linear),
           rmse(train_data$DO, train_data$pred_spline))
)
print(metrics)
## # A tibble: 2 × 4
##   Model               R2   MSE  RMSE
##   <chr>            <dbl> <dbl> <dbl>
## 1 Regresi Linear 0.00600 0.886 0.941
## 2 Regresi Spline 0.0566  0.841 0.917
# Visualisasi
ggplot(train_data, aes(x = DO, y = pred_linear)) +
  geom_point(color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Prediksi DO vs Aktual (Linear, Training)", x = "DO Aktual", y = "DO Prediksi") +
  theme_minimal()

ggplot(train_data, aes(x = DO, y = pred_spline)) +
  geom_point(color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Prediksi DO vs Aktual (Spline, Training)", x = "DO Aktual", y = "DO Prediksi") +
  theme_minimal()

# Variabel paling berpengaruh
coef_df <- tidy(model_lm) %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(estimate))) %>%
  rename(Variabel = term, Koefisien = estimate)
print(coef_df)
## # A tibble: 4 × 5
##   Variabel Koefisien std.error statistic p.value
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>
## 1 BOD       0.0794     0.0693     1.15     0.253
## 2 Suhu     -0.0203     0.0267    -0.759    0.448
## 3 pH       -0.0152     0.112     -0.136    0.892
## 4 TSS       0.000214   0.00601    0.0356   0.972
# Buat interpretasi naratif otomatis
interpretasi <- function(df){
  cat("=== Interpretasi Pengaruh Variabel terhadap DO ===\n\n")
  
  for(i in 1:nrow(df)){
    var <- df$Variabel[i]
    koef <- df$Koefisien[i]
    pval <- df$p.value[i]
    
    arah <- if(koef > 0) "meningkatkan" else "menurunkan"
    
    signif <- if(pval < 0.05) "signifikan" else "tidak signifikan"
    
    cat(sprintf("Variabel %s memiliki koefisien %.4f, artinya kenaikan %s cenderung %s DO, dan secara statistik %s.\n",
                var, koef, var, arah, signif))
  }
}

# Terapkan ke dataframe koefisien
interpretasi(coef_df)
## === Interpretasi Pengaruh Variabel terhadap DO ===
## 
## Variabel BOD memiliki koefisien 0.0794, artinya kenaikan BOD cenderung meningkatkan DO, dan secara statistik tidak signifikan.
## Variabel Suhu memiliki koefisien -0.0203, artinya kenaikan Suhu cenderung menurunkan DO, dan secara statistik tidak signifikan.
## Variabel pH memiliki koefisien -0.0152, artinya kenaikan pH cenderung menurunkan DO, dan secara statistik tidak signifikan.
## Variabel TSS memiliki koefisien 0.0002, artinya kenaikan TSS cenderung meningkatkan DO, dan secara statistik tidak signifikan.

Prediksi Status di Data Testing

# Variabel prediktor
predictors <- c("pH", "DO", "BOD", "TSS", "Suhu")

# Gunakan model yang sudah dilatih di data training
test_data$Status_pred_svm  <- predict(svm_model, newdata = test_data[, predictors])
test_data$Status_pred_tree <- predict(tree_model, newdata = test_data[, predictors], type = "class")
test_data$Status_pred_rf   <- predict(rf_model, newdata = test_data[, predictors])

# Tampilkan Prediksi
head(test_data[, c(predictors, "Status_pred_svm", "Status_pred_tree", "Status_pred_rf")])
## # A tibble: 6 × 8
##      pH    DO   BOD   TSS  Suhu Status_pred_svm Status_pred_tree Status_pred_rf 
##   <dbl> <dbl> <dbl> <dbl> <dbl> <fct>           <fct>            <fct>          
## 1  7.00  5.08  3.18  50.9  25.6 Tercemar ringan Tercemar ringan  Tercemar ringan
## 2  7.38  4.75  3.34  46.5  27.1 Tercemar ringan Tercemar ringan  Tercemar ringan
## 3  7.02  6.59  3.00  49.6  26.6 Baik            Baik             Baik           
## 4  7.37  5.64  3.50  39.0  26.7 Tercemar ringan Tercemar ringan  Tercemar ringan
## 5  6.93  6.24  3.34  47.2  23.4 Tercemar ringan Tercemar ringan  Tercemar ringan
## 6  6.97  6.00  3.45  39.1  27.7 Tercemar ringan Tercemar ringan  Tercemar ringan
# Simpan hasil prediksi ke Excel
write.xlsx(test_data, "C:/Users/Fahsa MPN/Downloads/hasil_prediksi_testing.xlsx")