R UTS Statistika Lingkungan

1. Library yang di perlukan

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:randomForest':
## 
##     outlier
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(GGally)
library(ggplot2)
set.seed(123)

2. Input & Cek Data

# 1. Baca data
path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Training.csv"
df <- read_csv(path, show_col_types = FALSE) %>% clean_names()  # bersihkan nama kolom
glimpse(df)
## Rows: 300
## Columns: 7
## $ lokasi <chr> "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S…
## $ p_h    <dbl> 7.6855, 6.7177, 7.1816, 7.3164, 7.2021, 6.9469, 7.7558, 6.9527,…
## $ do     <dbl> NA, 5.7236, 4.8906, 6.1339, 7.7853, 8.4222, 4.9232, 6.4859, 7.3…
## $ bod    <dbl> 1.7136, 1.4402, 2.7274, 3.1398, 1.1778, 3.2324, NA, 4.0358, 3.1…
## $ tss    <dbl> 43.1415, 44.2963, NA, 41.0104, 48.0967, 48.5610, 49.0343, 51.81…
## $ suhu   <dbl> 26.7972, 27.7284, 26.0255, 29.6639, 26.4099, 28.6809, 29.7409, …
## $ status <chr> "Tercemar ringan", "Tercemar ringan", "Tercemar ringan", "Terce…
cat("Jumlah baris total:", nrow(df), "\n")
## Jumlah baris total: 300
df
## # A tibble: 300 × 7
##    lokasi   p_h    do   bod   tss  suhu status         
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
##  1 S1      7.69 NA     1.71  43.1  26.8 Tercemar ringan
##  2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar ringan
##  3 S3      7.18  4.89  2.73  NA    26.0 Tercemar ringan
##  4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar ringan
##  5 S5      7.20  7.79  1.18  48.1  26.4 baik           
##  6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar ringan
##  7 S7      7.76  4.92 NA     49.0  29.7 Tercemar ringan
##  8 S8      6.95  6.49  4.04  51.8  25.6 Tercemar ringan
##  9 S9      8.01  7.39  3.13  66.0  30.0 Tercemar ringan
## 10 S10     6.97  5.80 NA     NA    23.8 BAIK           
## # ℹ 290 more rows

3. Cek kolom yang tersedia dan missing

print("Kolom:")
## [1] "Kolom:"
print(names(df))
## [1] "lokasi" "p_h"    "do"     "bod"    "tss"    "suhu"   "status"
print("Jumlah NA per kolom:")
## [1] "Jumlah NA per kolom:"
print(sapply(df, function(x) sum(is.na(x))))
## lokasi    p_h     do    bod    tss   suhu status 
##      0      0     23     22     24      0      0

4. Asumsi variabel numerik umum

# Jika nama kolom berbeda, sesuaikan vector num_vars di bawah:
num_vars <- intersect(c("ph","do","bod","tss","suhu","temperature"), names(df))
# Jika kolom suhu bernama "suhu" atau "temperature", pilih salah satunya
if("suhu" %in% names(df) & !("suhu" %in% num_vars)) num_vars <- c(num_vars, "suhu")
cat("Menggunakan numerik vars:", paste(num_vars, collapse = ", "), "\n")
## Menggunakan numerik vars: do, bod, tss, suhu

5. Cek Variabel Y

# Pastikan ada kolom status (case-insensitive)
status_col <- names(df)[tolower(names(df)) == "status"]
if(length(status_col) == 0){
  stop("Kolom 'Status' tidak ditemukan. Pastikan file memiliki kolom Status (case-insensitive).")
}
status_col <- status_col[1]
cat("Kolom status ditemukan:", status_col, "\n")
## Kolom status ditemukan: status

6. Cek Data Ulang

df
## # A tibble: 300 × 7
##    lokasi   p_h    do   bod   tss  suhu status         
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
##  1 S1      7.69 NA     1.71  43.1  26.8 Tercemar ringan
##  2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar ringan
##  3 S3      7.18  4.89  2.73  NA    26.0 Tercemar ringan
##  4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar ringan
##  5 S5      7.20  7.79  1.18  48.1  26.4 baik           
##  6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar ringan
##  7 S7      7.76  4.92 NA     49.0  29.7 Tercemar ringan
##  8 S8      6.95  6.49  4.04  51.8  25.6 Tercemar ringan
##  9 S9      8.01  7.39  3.13  66.0  30.0 Tercemar ringan
## 10 S10     6.97  5.80 NA     NA    23.8 BAIK           
## # ℹ 290 more rows

7. Standardisasi penulisan Status menjadi faktor: Baik / Tercemar_ringan / Tercemar_berat

df <- df %>%
  mutate(!!status_col := as.character(!!sym(status_col))) %>%
  mutate(status_clean = case_when(
    str_detect(!!sym(status_col), "1") ~ "Baik",
    str_detect(!!sym(status_col), "2") ~ "Tercemar_ringan",
    str_detect(!!sym(status_col), "3") ~ "Tercemar_berat",
    str_to_lower(!!sym(status_col)) %in% c("baik","baikk","baik ") ~ "Baik",
    str_to_lower(!!sym(status_col)) %in% c("tercemar ringan","tercemar_ringan") ~ "Tercemar_ringan",
    str_to_lower(!!sym(status_col)) %in% c("tercemar berat","tercemar_berat") ~ "Tercemar_berat",
    TRUE ~ NA_character_
  )) %>%
  mutate(status = factor(status_clean, levels = c("Baik","Tercemar_ringan","Tercemar_berat"))) %>%
  select(-status_clean)

Output setelah di Standarisasi Status

df
## # A tibble: 300 × 7
##    lokasi   p_h    do   bod   tss  suhu status         
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct>          
##  1 S1      7.69 NA     1.71  43.1  26.8 Tercemar_ringan
##  2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar_ringan
##  3 S3      7.18  4.89  2.73  NA    26.0 Tercemar_ringan
##  4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar_ringan
##  5 S5      7.20  7.79  1.18  48.1  26.4 Baik           
##  6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar_ringan
##  7 S7      7.76  4.92 NA     49.0  29.7 Tercemar_ringan
##  8 S8      6.95  6.49  4.04  51.8  25.6 Tercemar_ringan
##  9 S9      8.01  7.39  3.13  66.0  30.0 Tercemar_ringan
## 10 S10     6.97  5.80 NA     NA    23.8 Baik           
## # ℹ 290 more rows

8. Imputasi missing numerik (median)

for(v in num_vars){
  med <- median(df[[v]], na.rm = TRUE)
  df[[v]] <- ifelse(is.na(df[[v]]), med, df[[v]])
}

Cek lagi

df
## # A tibble: 300 × 7
##    lokasi   p_h    do   bod   tss  suhu status         
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct>          
##  1 S1      7.69  5.99  1.71  43.1  26.8 Tercemar_ringan
##  2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar_ringan
##  3 S3      7.18  4.89  2.73  49.5  26.0 Tercemar_ringan
##  4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar_ringan
##  5 S5      7.20  7.79  1.18  48.1  26.4 Baik           
##  6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar_ringan
##  7 S7      7.76  4.92  3.07  49.0  29.7 Tercemar_ringan
##  8 S8      6.95  6.49  4.04  51.8  25.6 Tercemar_ringan
##  9 S9      8.01  7.39  3.13  66.0  30.0 Tercemar_ringan
## 10 S10     6.97  5.80  3.07  49.5  23.8 Baik           
## # ℹ 290 more rows

Missing Value Setelah Imputasi

cat("Jumlah NA setelah imputasi:\n")
## Jumlah NA setelah imputasi:
print(sapply(df, function(x) sum(is.na(x))))
## lokasi    p_h     do    bod    tss   suhu status 
##      0      0      0      0      0      0      0

9. Plot outlier

# Boxplot tiap variabel numerik
for(v in num_vars){
  print(
    ggplot(df, aes(x = "", y = .data[[v]])) +
      geom_boxplot(fill = "skyblue", outlier.color = "red") +
      labs(title = paste("Boxplot Outlier:", v), y = v, x = "") +
      theme_minimal()
  )
}

# Histogram tiap variabel
for(v in num_vars){
  print(
    ggplot(df, aes(x = .data[[v]])) +
      geom_histogram(bins = 25, fill = "lightgreen", color = "black") +
      labs(title = paste("Distribusi:", v), x = v, y = "Frekuensi") +
      theme_minimal()
  )
}

## 10. Menangani outlier dengan winsorize (IQR 1.5)

winsorize <- function(x){
  Q1 <- quantile(x, .25, na.rm = TRUE)
  Q3 <- quantile(x, .75, na.rm = TRUE)
  IQRv <- Q3 - Q1
  lower <- Q1 - 1.5 * IQRv
  upper <- Q3 + 1.5 * IQRv
  pmin(pmax(x, lower), upper)
}
df_w <- df
for(v in num_vars){
  df_w[[v]] <- winsorize(df_w[[v]])
}

11. Ringkasan setleah preprosecing statistik deskriptif

print(summary(df_w[num_vars]))
##        do             bod              tss             suhu      
##  Min.   :3.615   Min.   :0.8513   Min.   :27.28   Min.   :22.77  
##  1st Qu.:5.413   1st Qu.:2.4599   1st Qu.:44.28   1st Qu.:26.62  
##  Median :5.991   Median :3.0661   Median :49.52   Median :28.01  
##  Mean   :5.977   Mean   :3.0041   Mean   :49.68   Mean   :28.12  
##  3rd Qu.:6.611   3rd Qu.:3.5323   3rd Qu.:55.62   3rd Qu.:29.46  
##  Max.   :8.409   Max.   :5.1409   Max.   :72.62   Max.   :33.73
print(psych::describe(df_w[num_vars])[, c("mean","sd","min","max","skew","kurtosis")])
##       mean   sd   min   max  skew kurtosis
## do    5.98 0.95  3.61  8.41 -0.11    -0.11
## bod   3.00 0.80  0.85  5.14 -0.05     0.16
## tss  49.68 9.13 27.28 72.62 -0.03    -0.03
## suhu 28.12 2.07 22.77 33.73  0.13    -0.15

12. Visualisasi hubungan antar variabel numerik

# Pair plot / korelasi
print(GGally::ggpairs(df_w[, num_vars], title = "Hubungan Antar Variabel Kualitas Air"))

13. Split: Training

train_df <- df_w %>% filter(!is.na(status))
test_df  <- df_w %>% filter(is.na(status))
cat("Baris train:", nrow(train_df), " | Baris test (Status NA):", nrow(test_df), "\n")
## Baris train: 300  | Baris test (Status NA): 0

14. Buat partition train -> train/validation (80/20)

trainIndex <- createDataPartition(train_df$status, p = 0.8, list = FALSE)
train_set <- train_df[trainIndex, ]
val_set   <- train_df[-trainIndex, ]

15. Preprocessing: centering & scaling untuk model SVM

# Pastikan semua kolom numerik
for(v in num_vars){
  train_set[[v]] <- as.numeric(train_set[[v]])
  val_set[[v]]   <- as.numeric(val_set[[v]])
  if(nrow(test_df) > 0) test_df[[v]] <- as.numeric(test_df[[v]])
}
# Buat objek preprocessing hanya dari training
preProc <- preProcess(train_set[, num_vars], method = c("center", "scale"))

# Terapkan pada data train dan validasi
train_scaled <- as.data.frame(predict(preProc, train_set[, num_vars]))
val_scaled   <- as.data.frame(predict(preProc, val_set[, num_vars]))
# Hanya buat test_scaled jika ada baris test
if (nrow(test_df) > 0) {
  test_scaled <- as.data.frame(predict(preProc, test_df[, num_vars]))
} else {
  test_scaled <- data.frame()  # buat dummy kosong biar tidak error
}

# Tambahkan kembali variabel target
train_scaled$status <- train_set$status
val_scaled$status   <- val_set$status

16. MODEL KLASIFIKASI

16.1 SVM (radial)

# Model 1: SVM (Radial Kernel)
set.seed(123)
svm_model <- svm(status ~ ., data = train_scaled, kernel = "radial", probability = TRUE)
svm_pred  <- predict(svm_model, newdata = val_scaled)
conf_svm  <- confusionMatrix(svm_pred, val_scaled$status)
cat("\n=== CONFUSION MATRIX: SVM ===\n")
## 
## === CONFUSION MATRIX: SVM ===
print(conf_svm)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar_ringan Tercemar_berat
##   Baik              11               1              0
##   Tercemar_ringan    3              43              1
##   Tercemar_berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9153          
##                  95% CI : (0.8132, 0.9719)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.0009347       
##                                           
##                   Kappa : 0.7631          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity               0.7857                 0.9773               0.00000
## Specificity               0.9778                 0.7333               1.00000
## Pos Pred Value            0.9167                 0.9149                   NaN
## Neg Pred Value            0.9362                 0.9167               0.98305
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.1864                 0.7288               0.00000
## Detection Prevalence      0.2034                 0.7966               0.00000
## Balanced Accuracy         0.8817                 0.8553               0.50000

16.2 Model Decision Tree

# Model 2: Decision Tree
set.seed(123)
tree_model <- rpart(status ~ ., data = train_set[, c(num_vars,"status")], method = "class")
tree_pred  <- predict(tree_model, newdata = val_set[, num_vars], type = "class")
conf_tree  <- confusionMatrix(tree_pred, val_set$status)
cat("\n=== CONFUSION MATRIX: Decision Tree ===\n")
## 
## === CONFUSION MATRIX: Decision Tree ===
print(conf_tree)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar_ringan Tercemar_berat
##   Baik              13               0              0
##   Tercemar_ringan    1              44              1
##   Tercemar_berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9661          
##                  95% CI : (0.8829, 0.9959)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 6.696e-06       
##                                           
##                   Kappa : 0.9075          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity               0.9286                 1.0000               0.00000
## Specificity               1.0000                 0.8667               1.00000
## Pos Pred Value            1.0000                 0.9565                   NaN
## Neg Pred Value            0.9783                 1.0000               0.98305
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.2203                 0.7458               0.00000
## Detection Prevalence      0.2203                 0.7797               0.00000
## Balanced Accuracy         0.9643                 0.9333               0.50000

16.3 Model Random Forest

# Model 3: Random Forest
set.seed(123)
rf_model <- randomForest(status ~ ., data = train_set[, c(num_vars,"status")], ntree = 500, importance = TRUE)
rf_pred  <- predict(rf_model, newdata = val_set[, num_vars])
conf_rf  <- confusionMatrix(rf_pred, val_set$status)
cat("\n=== CONFUSION MATRIX: Random Forest ===\n")
## 
## === CONFUSION MATRIX: Random Forest ===
print(conf_rf)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar_ringan Tercemar_berat
##   Baik              13               0              0
##   Tercemar_ringan    1              44              0
##   Tercemar_berat     0               0              1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9831          
##                  95% CI : (0.9091, 0.9996)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 6.427e-07       
##                                           
##                   Kappa : 0.9552          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity               0.9286                 1.0000               1.00000
## Specificity               1.0000                 0.9333               1.00000
## Pos Pred Value            1.0000                 0.9778               1.00000
## Neg Pred Value            0.9783                 1.0000               1.00000
## Prevalence                0.2373                 0.7458               0.01695
## Detection Rate            0.2203                 0.7458               0.01695
## Detection Prevalence      0.2203                 0.7627               0.01695
## Balanced Accuracy         0.9643                 0.9667               1.00000

17. Evaluasi & Perbandingan Akurasi

accs <- c(
  SVM  = conf_svm$overall["Accuracy"],
  Tree = conf_tree$overall["Accuracy"],
  RF   = conf_rf$overall["Accuracy"]
)
cat("\n=== PERBANDINGAN AKURASI ===\n")
## 
## === PERBANDINGAN AKURASI ===
print(accs)
##  SVM.Accuracy Tree.Accuracy   RF.Accuracy 
##     0.9152542     0.9661017     0.9830508
best_model_name <- names(accs)[which.max(accs)]
cat("\nModel terbaik berdasarkan akurasi validasi:", best_model_name, "\n")
## 
## Model terbaik berdasarkan akurasi validasi: RF.Accuracy

18. Interpretasi Akurasi

if(best_model_name == "SVM"){
  acc <- round(conf_svm$overall["Accuracy"],3)
} else if(best_model_name == "Tree"){
  acc <- round(conf_tree$overall["Accuracy"],3)
} else {
  acc <- round(conf_rf$overall["Accuracy"],3)
}

cat("\nInterpretasi Akurasi Model:\n")
## 
## Interpretasi Akurasi Model:
cat(paste0(
  "Model klasifikasi terbaik adalah ", best_model_name, 
  " dengan tingkat akurasi sebesar ", acc*100, "%.\n",
  "Artinya, model mampu mengklasifikasikan status kualitas air dengan benar pada sekitar ",
  round(acc*100,1), "% dari data validasi.\n",
  "Nilai akurasi di atas 80% sudah menunjukkan model yang cukup baik.\n"
))
## Model klasifikasi terbaik adalah RF.Accuracy dengan tingkat akurasi sebesar 98.3%.
## Artinya, model mampu mengklasifikasikan status kualitas air dengan benar pada sekitar 98.3% dari data validasi.
## Nilai akurasi di atas 80% sudah menunjukkan model yang cukup baik.

19. Testing Data

# Lokasi file test (pakai file yang kamu upload barusan)
test_path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Testing.csv"
if (file.exists(test_path)) {
  cat("\n📂 File data test ditemukan:", test_path, "\n")
  test_df <- read_csv(test_path, show_col_types = FALSE)
  cat("Jumlah baris data test:", nrow(test_df), "\n")
  
  # Bersihkan nama kolom agar konsisten
  test_df <- test_df %>% janitor::clean_names()
  
  # Samakan nama kolom pH
  if ("p_h" %in% names(test_df) && !("ph" %in% names(test_df))) {
    test_df <- test_df %>% mutate(ph = p_h)
  }
  
  # Tentukan variabel numerik yang digunakan
  num_vars <- intersect(c("p_h","do","bod","tss","suhu","temperature"), names(test_df))
  
  # Imputasi nilai hilang (median dari training)
  for(v in num_vars){
    med <- median(train_df[[v]], na.rm = TRUE)
    test_df[[v]] <- ifelse(is.na(test_df[[v]]), med, test_df[[v]])
  }
  
  # Terapkan preprocessing dari training
  test_scaled <- as.data.frame(predict(preProc, test_df[, num_vars]))
  
  # ===== Prediksi dengan masing-masing model =====
  
  # --- SVM ---
  svm_pred_test <- predict(svm_model, newdata = test_scaled)
  # --- Decision Tree ---
  tree_pred_test <- predict(tree_model, newdata = test_df[, num_vars], type = "class")
  # --- Random Forest ---
  rf_pred_test <- predict(rf_model, newdata = test_df[, num_vars])
  
  # Tambahkan hasil ke dataframe
  test_df <- test_df %>%
    mutate(
      Pred_SVM  = svm_pred_test,
      Pred_Tree = tree_pred_test,
      Pred_RF   = rf_pred_test
    )
  
  # Simpan ke CSV
  output_path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/2 test_predictions_all_models.csv"
  write_csv(test_df, output_path)
  
  cat("\n✅ Prediksi selesai untuk semua model.\n")
  cat("Hasil disimpan di:", output_path, "\n")
  cat("Kolom hasil prediksi: Pred_SVM, Pred_Tree, Pred_RF\n")
  
} else {
  cat("\n⚠️ File testing tidak ditemukan! Pastikan kualitasair-Testing.csv ada di /mnt/data/.\n")
}
## 
## 📂 File data test ditemukan: D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Testing.csv 
## Jumlah baris data test: 75 
## 
## ✅ Prediksi selesai untuk semua model.
## Hasil disimpan di: D:/Kampus/Semester 5/Stat Lingkungan/UTS/2 test_predictions_all_models.csv 
## Kolom hasil prediksi: Pred_SVM, Pred_Tree, Pred_RF

20. Prediksi DO - bangun model pada data train_set (yang memiliki DO)

# Pastikan nama kolom pH benar
if ("p_h" %in% names(train_df) && !("ph" %in% names(train_df))) {
  train_df <- train_df %>% mutate(ph = p_h)
  val_set  <- val_set %>% mutate(ph = p_h)
}

21. Gunakan data training (train_df) dan validasi (val_set)

lm_train <- train_df
lm_val   <- val_set

22. 1. Model Regresi Linear

lm_model <- lm(do ~ ph + bod + tss + suhu, data = lm_train)
summary(lm_model)
## 
## Call:
## lm(formula = do ~ ph + bod + tss + suhu, data = lm_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.41300 -0.54135  0.01817  0.64063  2.42522 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.4058198  1.1553742   5.544 6.55e-08 ***
## ph          -0.0154574  0.1114460  -0.139    0.890    
## bod          0.0794282  0.0693522   1.145    0.253    
## tss          0.0002145  0.0060147   0.036    0.972    
## suhu        -0.0202711  0.0267013  -0.759    0.448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9491 on 295 degrees of freedom
## Multiple R-squared:  0.006006,   Adjusted R-squared:  -0.007472 
## F-statistic: 0.4456 on 4 and 295 DF,  p-value: 0.7756
# Prediksi pada data validasi
lm_pred <- predict(lm_model, newdata = lm_val)
lm_mse  <- mean((lm_pred - lm_val$do)^2)
lm_rmse <- sqrt(lm_mse)
lm_r2   <- summary(lm_model)$r.squared

cat("\n--- Evaluasi Model Linear ---\n")
## 
## --- Evaluasi Model Linear ---
cat("R² =", round(lm_r2,4), " | MSE =", round(lm_mse,4), " | RMSE =", round(lm_rmse,4), "\n")
## R² = 0.006  | MSE = 0.8696  | RMSE = 0.9325

2. Model Regresi Non-Linear (GAM: Generalized Additive Model)

gam_model <- gam(do ~ s(ph) + s(bod) + s(tss) + s(suhu), data = lm_train, method = "REML")
summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## do ~ s(ph) + s(bod) + s(tss) + s(suhu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.9770     0.0535   111.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value  
## s(ph)   1.000  1.000 0.030  0.8632  
## s(bod)  5.348  6.489 2.379  0.0311 *
## s(tss)  1.000  1.000 0.008  0.9310  
## s(suhu) 1.343  1.617 1.114  0.4337  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0396   Deviance explained = 6.76%
## -REML = 416.09  Scale est. = 0.85871   n = 300
# Prediksi pada data validasi
gam_pred <- predict(gam_model, newdata = lm_val)
gam_mse  <- mean((gam_pred - lm_val$do)^2)
gam_rmse <- sqrt(gam_mse)
sst <- sum((lm_val$do - mean(lm_train$do))^2)
sse_gam <- sum((lm_val$do - gam_pred)^2)
gam_r2 <- 1 - (sse_gam / sst)

cat("\n--- Evaluasi Model GAM ---\n")
## 
## --- Evaluasi Model GAM ---
cat("R² =", round(gam_r2,4), " | MSE =", round(gam_mse,4), " | RMSE =", round(gam_rmse,4), "\n")
## R² = 0.0758  | MSE = 0.8248  | RMSE = 0.9082

23. Perbandingan Kinerj

eval_df <- data.frame(
  Model = c("Linear Regression", "GAM (Spline)"),
  R2 = c(lm_r2, gam_r2),
  MSE = c(lm_mse, gam_mse),
  RMSE = c(lm_rmse, gam_rmse)
)
print(eval_df)
##               Model          R2       MSE      RMSE
## 1 Linear Regression 0.006006105 0.8695753 0.9325102
## 2      GAM (Spline) 0.075829366 0.8247652 0.9081658

24. Visualisasi Prediksi vs Aktual

# Linear
p1 <- ggplot(lm_val, aes(x = do, y = lm_pred)) +
  geom_point(color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Model Linear: Prediksi vs Aktual DO",
       x = "DO Aktual", y = "DO Prediksi") +
  theme_minimal()

# GAM
p2 <- ggplot(lm_val, aes(x = do, y = gam_pred)) +
  geom_point(color = "darkgreen", size = 2) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Model GAM: Prediksi vs Aktual DO",
       x = "DO Aktual", y = "DO Prediksi") +
  theme_minimal()
print(p1)

print(p2)

## 25. Variabel Paling Berpengaruh

cat("\n--- VARIABEL PALING MEMPENGARUHI DO ---\n")
## 
## --- VARIABEL PALING MEMPENGARUHI DO ---
# Untuk model Linear: lihat koefisien standar
lm_std <- lm(scale(do) ~ scale(ph) + scale(bod) + scale(tss) + scale(suhu), data = lm_train)
coef_std <- summary(lm_std)$coefficients
print(coef_std)
##                  Estimate Std. Error       t value  Pr(>|t|)
## (Intercept)  3.403174e-16 0.05795032  5.872572e-15 1.0000000
## scale(ph)   -8.069219e-03 0.05817796 -1.386989e-01 0.8897827
## scale(bod)   6.686340e-02 0.05838132  1.145288e+00 0.2530183
## scale(tss)   2.071325e-03 0.05807525  3.566622e-02 0.9715726
## scale(suhu) -4.426913e-02 0.05831184 -7.591792e-01 0.4483516

26. Interpretasi Hasil Model

# Untuk model GAM: lihat signifikansi (p-value)
summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## do ~ s(ph) + s(bod) + s(tss) + s(suhu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.9770     0.0535   111.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value  
## s(ph)   1.000  1.000 0.030  0.8632  
## s(bod)  5.348  6.489 2.379  0.0311 *
## s(tss)  1.000  1.000 0.008  0.9310  
## s(suhu) 1.343  1.617 1.114  0.4337  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0396   Deviance explained = 6.76%
## -REML = 416.09  Scale est. = 0.85871   n = 300
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("- Pada model Linear, variabel dengan koefisien standar terbesar (positif atau negatif) paling memengaruhi DO.\n")
## - Pada model Linear, variabel dengan koefisien standar terbesar (positif atau negatif) paling memengaruhi DO.
cat("- Biasanya, BOD berpengaruh negatif terhadap DO (semakin tinggi BOD, semakin rendah oksigen terlarut).\n")
## - Biasanya, BOD berpengaruh negatif terhadap DO (semakin tinggi BOD, semakin rendah oksigen terlarut).
cat("- Pada model GAM, lihat p-value dan bentuk kurva smooth (fungsi s(...)) untuk melihat pengaruh non-linear.\n")
## - Pada model GAM, lihat p-value dan bentuk kurva smooth (fungsi s(...)) untuk melihat pengaruh non-linear.
cat("- Variabel dengan p < 0.05 dianggap berpengaruh signifikan terhadap DO.\n")
## - Variabel dengan p < 0.05 dianggap berpengaruh signifikan terhadap DO.

27. Ringkasan Keseluruhan Model

cat("\n--- RINGKASAN METRIK PENTING ---\n")
## 
## --- RINGKASAN METRIK PENTING ---
cat("SVM Accuracy (val):", round(conf_svm$overall["Accuracy"],4), "\n")
## SVM Accuracy (val): 0.9153
cat("Tree Accuracy (val):", round(conf_tree$overall["Accuracy"],4), "\n")
## Tree Accuracy (val): 0.9661
cat("RF Accuracy (val):", round(conf_rf$overall["Accuracy"],4), "\n")
## RF Accuracy (val): 0.9831
cat("LM R2 (train):", round(lm_r2,4), " LM RMSE (val):", round(lm_rmse,4), "\n")
## LM R2 (train): 0.006  LM RMSE (val): 0.9325
cat("GAM RMSE (val):", round(gam_r2,4), " GAM pseudo-R2 (val):", round(gam_r2,4), "\n")
## GAM RMSE (val): 0.0758  GAM pseudo-R2 (val): 0.0758
cat("------------------------------\n")
## ------------------------------