library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:randomForest':
##
## outlier
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
library(ggplot2)
set.seed(123)
# 1. Baca data
path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Training.csv"
df <- read_csv(path, show_col_types = FALSE) %>% clean_names() # bersihkan nama kolom
glimpse(df)
## Rows: 300
## Columns: 7
## $ lokasi <chr> "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S…
## $ p_h <dbl> 7.6855, 6.7177, 7.1816, 7.3164, 7.2021, 6.9469, 7.7558, 6.9527,…
## $ do <dbl> NA, 5.7236, 4.8906, 6.1339, 7.7853, 8.4222, 4.9232, 6.4859, 7.3…
## $ bod <dbl> 1.7136, 1.4402, 2.7274, 3.1398, 1.1778, 3.2324, NA, 4.0358, 3.1…
## $ tss <dbl> 43.1415, 44.2963, NA, 41.0104, 48.0967, 48.5610, 49.0343, 51.81…
## $ suhu <dbl> 26.7972, 27.7284, 26.0255, 29.6639, 26.4099, 28.6809, 29.7409, …
## $ status <chr> "Tercemar ringan", "Tercemar ringan", "Tercemar ringan", "Terce…
cat("Jumlah baris total:", nrow(df), "\n")
## Jumlah baris total: 300
df
## # A tibble: 300 × 7
## lokasi p_h do bod tss suhu status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar ringan
## 7 S7 7.76 4.92 NA 49.0 29.7 Tercemar ringan
## 8 S8 6.95 6.49 4.04 51.8 25.6 Tercemar ringan
## 9 S9 8.01 7.39 3.13 66.0 30.0 Tercemar ringan
## 10 S10 6.97 5.80 NA NA 23.8 BAIK
## # ℹ 290 more rows
print("Kolom:")
## [1] "Kolom:"
print(names(df))
## [1] "lokasi" "p_h" "do" "bod" "tss" "suhu" "status"
print("Jumlah NA per kolom:")
## [1] "Jumlah NA per kolom:"
print(sapply(df, function(x) sum(is.na(x))))
## lokasi p_h do bod tss suhu status
## 0 0 23 22 24 0 0
# Jika nama kolom berbeda, sesuaikan vector num_vars di bawah:
num_vars <- intersect(c("ph","do","bod","tss","suhu","temperature"), names(df))
# Jika kolom suhu bernama "suhu" atau "temperature", pilih salah satunya
if("suhu" %in% names(df) & !("suhu" %in% num_vars)) num_vars <- c(num_vars, "suhu")
cat("Menggunakan numerik vars:", paste(num_vars, collapse = ", "), "\n")
## Menggunakan numerik vars: do, bod, tss, suhu
# Pastikan ada kolom status (case-insensitive)
status_col <- names(df)[tolower(names(df)) == "status"]
if(length(status_col) == 0){
stop("Kolom 'Status' tidak ditemukan. Pastikan file memiliki kolom Status (case-insensitive).")
}
status_col <- status_col[1]
cat("Kolom status ditemukan:", status_col, "\n")
## Kolom status ditemukan: status
df
## # A tibble: 300 × 7
## lokasi p_h do bod tss suhu status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar ringan
## 7 S7 7.76 4.92 NA 49.0 29.7 Tercemar ringan
## 8 S8 6.95 6.49 4.04 51.8 25.6 Tercemar ringan
## 9 S9 8.01 7.39 3.13 66.0 30.0 Tercemar ringan
## 10 S10 6.97 5.80 NA NA 23.8 BAIK
## # ℹ 290 more rows
df <- df %>%
mutate(!!status_col := as.character(!!sym(status_col))) %>%
mutate(status_clean = case_when(
str_detect(!!sym(status_col), "1") ~ "Baik",
str_detect(!!sym(status_col), "2") ~ "Tercemar_ringan",
str_detect(!!sym(status_col), "3") ~ "Tercemar_berat",
str_to_lower(!!sym(status_col)) %in% c("baik","baikk","baik ") ~ "Baik",
str_to_lower(!!sym(status_col)) %in% c("tercemar ringan","tercemar_ringan") ~ "Tercemar_ringan",
str_to_lower(!!sym(status_col)) %in% c("tercemar berat","tercemar_berat") ~ "Tercemar_berat",
TRUE ~ NA_character_
)) %>%
mutate(status = factor(status_clean, levels = c("Baik","Tercemar_ringan","Tercemar_berat"))) %>%
select(-status_clean)
df
## # A tibble: 300 × 7
## lokasi p_h do bod tss suhu status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar_ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar_ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar_ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar_ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 Baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar_ringan
## 7 S7 7.76 4.92 NA 49.0 29.7 Tercemar_ringan
## 8 S8 6.95 6.49 4.04 51.8 25.6 Tercemar_ringan
## 9 S9 8.01 7.39 3.13 66.0 30.0 Tercemar_ringan
## 10 S10 6.97 5.80 NA NA 23.8 Baik
## # ℹ 290 more rows
for(v in num_vars){
med <- median(df[[v]], na.rm = TRUE)
df[[v]] <- ifelse(is.na(df[[v]]), med, df[[v]])
}
df
## # A tibble: 300 × 7
## lokasi p_h do bod tss suhu status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 S1 7.69 5.99 1.71 43.1 26.8 Tercemar_ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar_ringan
## 3 S3 7.18 4.89 2.73 49.5 26.0 Tercemar_ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar_ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 Baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar_ringan
## 7 S7 7.76 4.92 3.07 49.0 29.7 Tercemar_ringan
## 8 S8 6.95 6.49 4.04 51.8 25.6 Tercemar_ringan
## 9 S9 8.01 7.39 3.13 66.0 30.0 Tercemar_ringan
## 10 S10 6.97 5.80 3.07 49.5 23.8 Baik
## # ℹ 290 more rows
cat("Jumlah NA setelah imputasi:\n")
## Jumlah NA setelah imputasi:
print(sapply(df, function(x) sum(is.na(x))))
## lokasi p_h do bod tss suhu status
## 0 0 0 0 0 0 0
# Boxplot tiap variabel numerik
for(v in num_vars){
print(
ggplot(df, aes(x = "", y = .data[[v]])) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
labs(title = paste("Boxplot Outlier:", v), y = v, x = "") +
theme_minimal()
)
}
# Histogram tiap variabel
for(v in num_vars){
print(
ggplot(df, aes(x = .data[[v]])) +
geom_histogram(bins = 25, fill = "lightgreen", color = "black") +
labs(title = paste("Distribusi:", v), x = v, y = "Frekuensi") +
theme_minimal()
)
}
## 10. Menangani outlier dengan winsorize (IQR 1.5)
winsorize <- function(x){
Q1 <- quantile(x, .25, na.rm = TRUE)
Q3 <- quantile(x, .75, na.rm = TRUE)
IQRv <- Q3 - Q1
lower <- Q1 - 1.5 * IQRv
upper <- Q3 + 1.5 * IQRv
pmin(pmax(x, lower), upper)
}
df_w <- df
for(v in num_vars){
df_w[[v]] <- winsorize(df_w[[v]])
}
print(summary(df_w[num_vars]))
## do bod tss suhu
## Min. :3.615 Min. :0.8513 Min. :27.28 Min. :22.77
## 1st Qu.:5.413 1st Qu.:2.4599 1st Qu.:44.28 1st Qu.:26.62
## Median :5.991 Median :3.0661 Median :49.52 Median :28.01
## Mean :5.977 Mean :3.0041 Mean :49.68 Mean :28.12
## 3rd Qu.:6.611 3rd Qu.:3.5323 3rd Qu.:55.62 3rd Qu.:29.46
## Max. :8.409 Max. :5.1409 Max. :72.62 Max. :33.73
print(psych::describe(df_w[num_vars])[, c("mean","sd","min","max","skew","kurtosis")])
## mean sd min max skew kurtosis
## do 5.98 0.95 3.61 8.41 -0.11 -0.11
## bod 3.00 0.80 0.85 5.14 -0.05 0.16
## tss 49.68 9.13 27.28 72.62 -0.03 -0.03
## suhu 28.12 2.07 22.77 33.73 0.13 -0.15
# Pair plot / korelasi
print(GGally::ggpairs(df_w[, num_vars], title = "Hubungan Antar Variabel Kualitas Air"))
train_df <- df_w %>% filter(!is.na(status))
test_df <- df_w %>% filter(is.na(status))
cat("Baris train:", nrow(train_df), " | Baris test (Status NA):", nrow(test_df), "\n")
## Baris train: 300 | Baris test (Status NA): 0
trainIndex <- createDataPartition(train_df$status, p = 0.8, list = FALSE)
train_set <- train_df[trainIndex, ]
val_set <- train_df[-trainIndex, ]
# Pastikan semua kolom numerik
for(v in num_vars){
train_set[[v]] <- as.numeric(train_set[[v]])
val_set[[v]] <- as.numeric(val_set[[v]])
if(nrow(test_df) > 0) test_df[[v]] <- as.numeric(test_df[[v]])
}
# Buat objek preprocessing hanya dari training
preProc <- preProcess(train_set[, num_vars], method = c("center", "scale"))
# Terapkan pada data train dan validasi
train_scaled <- as.data.frame(predict(preProc, train_set[, num_vars]))
val_scaled <- as.data.frame(predict(preProc, val_set[, num_vars]))
# Hanya buat test_scaled jika ada baris test
if (nrow(test_df) > 0) {
test_scaled <- as.data.frame(predict(preProc, test_df[, num_vars]))
} else {
test_scaled <- data.frame() # buat dummy kosong biar tidak error
}
# Tambahkan kembali variabel target
train_scaled$status <- train_set$status
val_scaled$status <- val_set$status
# Model 1: SVM (Radial Kernel)
set.seed(123)
svm_model <- svm(status ~ ., data = train_scaled, kernel = "radial", probability = TRUE)
svm_pred <- predict(svm_model, newdata = val_scaled)
conf_svm <- confusionMatrix(svm_pred, val_scaled$status)
cat("\n=== CONFUSION MATRIX: SVM ===\n")
##
## === CONFUSION MATRIX: SVM ===
print(conf_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 11 1 0
## Tercemar_ringan 3 43 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9153
## 95% CI : (0.8132, 0.9719)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 0.0009347
##
## Kappa : 0.7631
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.7857 0.9773 0.00000
## Specificity 0.9778 0.7333 1.00000
## Pos Pred Value 0.9167 0.9149 NaN
## Neg Pred Value 0.9362 0.9167 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.1864 0.7288 0.00000
## Detection Prevalence 0.2034 0.7966 0.00000
## Balanced Accuracy 0.8817 0.8553 0.50000
# Model 2: Decision Tree
set.seed(123)
tree_model <- rpart(status ~ ., data = train_set[, c(num_vars,"status")], method = "class")
tree_pred <- predict(tree_model, newdata = val_set[, num_vars], type = "class")
conf_tree <- confusionMatrix(tree_pred, val_set$status)
cat("\n=== CONFUSION MATRIX: Decision Tree ===\n")
##
## === CONFUSION MATRIX: Decision Tree ===
print(conf_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 13 0 0
## Tercemar_ringan 1 44 1
## Tercemar_berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.8829, 0.9959)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.696e-06
##
## Kappa : 0.9075
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.9286 1.0000 0.00000
## Specificity 1.0000 0.8667 1.00000
## Pos Pred Value 1.0000 0.9565 NaN
## Neg Pred Value 0.9783 1.0000 0.98305
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2203 0.7458 0.00000
## Detection Prevalence 0.2203 0.7797 0.00000
## Balanced Accuracy 0.9643 0.9333 0.50000
# Model 3: Random Forest
set.seed(123)
rf_model <- randomForest(status ~ ., data = train_set[, c(num_vars,"status")], ntree = 500, importance = TRUE)
rf_pred <- predict(rf_model, newdata = val_set[, num_vars])
conf_rf <- confusionMatrix(rf_pred, val_set$status)
cat("\n=== CONFUSION MATRIX: Random Forest ===\n")
##
## === CONFUSION MATRIX: Random Forest ===
print(conf_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar_ringan Tercemar_berat
## Baik 13 0 0
## Tercemar_ringan 1 44 0
## Tercemar_berat 0 0 1
##
## Overall Statistics
##
## Accuracy : 0.9831
## 95% CI : (0.9091, 0.9996)
## No Information Rate : 0.7458
## P-Value [Acc > NIR] : 6.427e-07
##
## Kappa : 0.9552
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar_ringan Class: Tercemar_berat
## Sensitivity 0.9286 1.0000 1.00000
## Specificity 1.0000 0.9333 1.00000
## Pos Pred Value 1.0000 0.9778 1.00000
## Neg Pred Value 0.9783 1.0000 1.00000
## Prevalence 0.2373 0.7458 0.01695
## Detection Rate 0.2203 0.7458 0.01695
## Detection Prevalence 0.2203 0.7627 0.01695
## Balanced Accuracy 0.9643 0.9667 1.00000
accs <- c(
SVM = conf_svm$overall["Accuracy"],
Tree = conf_tree$overall["Accuracy"],
RF = conf_rf$overall["Accuracy"]
)
cat("\n=== PERBANDINGAN AKURASI ===\n")
##
## === PERBANDINGAN AKURASI ===
print(accs)
## SVM.Accuracy Tree.Accuracy RF.Accuracy
## 0.9152542 0.9661017 0.9830508
best_model_name <- names(accs)[which.max(accs)]
cat("\nModel terbaik berdasarkan akurasi validasi:", best_model_name, "\n")
##
## Model terbaik berdasarkan akurasi validasi: RF.Accuracy
if(best_model_name == "SVM"){
acc <- round(conf_svm$overall["Accuracy"],3)
} else if(best_model_name == "Tree"){
acc <- round(conf_tree$overall["Accuracy"],3)
} else {
acc <- round(conf_rf$overall["Accuracy"],3)
}
cat("\nInterpretasi Akurasi Model:\n")
##
## Interpretasi Akurasi Model:
cat(paste0(
"Model klasifikasi terbaik adalah ", best_model_name,
" dengan tingkat akurasi sebesar ", acc*100, "%.\n",
"Artinya, model mampu mengklasifikasikan status kualitas air dengan benar pada sekitar ",
round(acc*100,1), "% dari data validasi.\n",
"Nilai akurasi di atas 80% sudah menunjukkan model yang cukup baik.\n"
))
## Model klasifikasi terbaik adalah RF.Accuracy dengan tingkat akurasi sebesar 98.3%.
## Artinya, model mampu mengklasifikasikan status kualitas air dengan benar pada sekitar 98.3% dari data validasi.
## Nilai akurasi di atas 80% sudah menunjukkan model yang cukup baik.
# Lokasi file test (pakai file yang kamu upload barusan)
test_path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Testing.csv"
if (file.exists(test_path)) {
cat("\n📂 File data test ditemukan:", test_path, "\n")
test_df <- read_csv(test_path, show_col_types = FALSE)
cat("Jumlah baris data test:", nrow(test_df), "\n")
# Bersihkan nama kolom agar konsisten
test_df <- test_df %>% janitor::clean_names()
# Samakan nama kolom pH
if ("p_h" %in% names(test_df) && !("ph" %in% names(test_df))) {
test_df <- test_df %>% mutate(ph = p_h)
}
# Tentukan variabel numerik yang digunakan
num_vars <- intersect(c("p_h","do","bod","tss","suhu","temperature"), names(test_df))
# Imputasi nilai hilang (median dari training)
for(v in num_vars){
med <- median(train_df[[v]], na.rm = TRUE)
test_df[[v]] <- ifelse(is.na(test_df[[v]]), med, test_df[[v]])
}
# Terapkan preprocessing dari training
test_scaled <- as.data.frame(predict(preProc, test_df[, num_vars]))
# ===== Prediksi dengan masing-masing model =====
# --- SVM ---
svm_pred_test <- predict(svm_model, newdata = test_scaled)
# --- Decision Tree ---
tree_pred_test <- predict(tree_model, newdata = test_df[, num_vars], type = "class")
# --- Random Forest ---
rf_pred_test <- predict(rf_model, newdata = test_df[, num_vars])
# Tambahkan hasil ke dataframe
test_df <- test_df %>%
mutate(
Pred_SVM = svm_pred_test,
Pred_Tree = tree_pred_test,
Pred_RF = rf_pred_test
)
# Simpan ke CSV
output_path <- "D:/Kampus/Semester 5/Stat Lingkungan/UTS/2 test_predictions_all_models.csv"
write_csv(test_df, output_path)
cat("\n✅ Prediksi selesai untuk semua model.\n")
cat("Hasil disimpan di:", output_path, "\n")
cat("Kolom hasil prediksi: Pred_SVM, Pred_Tree, Pred_RF\n")
} else {
cat("\n⚠️ File testing tidak ditemukan! Pastikan kualitasair-Testing.csv ada di /mnt/data/.\n")
}
##
## 📂 File data test ditemukan: D:/Kampus/Semester 5/Stat Lingkungan/UTS/kualitasair-Testing.csv
## Jumlah baris data test: 75
##
## ✅ Prediksi selesai untuk semua model.
## Hasil disimpan di: D:/Kampus/Semester 5/Stat Lingkungan/UTS/2 test_predictions_all_models.csv
## Kolom hasil prediksi: Pred_SVM, Pred_Tree, Pred_RF
# Pastikan nama kolom pH benar
if ("p_h" %in% names(train_df) && !("ph" %in% names(train_df))) {
train_df <- train_df %>% mutate(ph = p_h)
val_set <- val_set %>% mutate(ph = p_h)
}
lm_train <- train_df
lm_val <- val_set
lm_model <- lm(do ~ ph + bod + tss + suhu, data = lm_train)
summary(lm_model)
##
## Call:
## lm(formula = do ~ ph + bod + tss + suhu, data = lm_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41300 -0.54135 0.01817 0.64063 2.42522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4058198 1.1553742 5.544 6.55e-08 ***
## ph -0.0154574 0.1114460 -0.139 0.890
## bod 0.0794282 0.0693522 1.145 0.253
## tss 0.0002145 0.0060147 0.036 0.972
## suhu -0.0202711 0.0267013 -0.759 0.448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9491 on 295 degrees of freedom
## Multiple R-squared: 0.006006, Adjusted R-squared: -0.007472
## F-statistic: 0.4456 on 4 and 295 DF, p-value: 0.7756
# Prediksi pada data validasi
lm_pred <- predict(lm_model, newdata = lm_val)
lm_mse <- mean((lm_pred - lm_val$do)^2)
lm_rmse <- sqrt(lm_mse)
lm_r2 <- summary(lm_model)$r.squared
cat("\n--- Evaluasi Model Linear ---\n")
##
## --- Evaluasi Model Linear ---
cat("R² =", round(lm_r2,4), " | MSE =", round(lm_mse,4), " | RMSE =", round(lm_rmse,4), "\n")
## R² = 0.006 | MSE = 0.8696 | RMSE = 0.9325
gam_model <- gam(do ~ s(ph) + s(bod) + s(tss) + s(suhu), data = lm_train, method = "REML")
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## do ~ s(ph) + s(bod) + s(tss) + s(suhu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9770 0.0535 111.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ph) 1.000 1.000 0.030 0.8632
## s(bod) 5.348 6.489 2.379 0.0311 *
## s(tss) 1.000 1.000 0.008 0.9310
## s(suhu) 1.343 1.617 1.114 0.4337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0396 Deviance explained = 6.76%
## -REML = 416.09 Scale est. = 0.85871 n = 300
# Prediksi pada data validasi
gam_pred <- predict(gam_model, newdata = lm_val)
gam_mse <- mean((gam_pred - lm_val$do)^2)
gam_rmse <- sqrt(gam_mse)
sst <- sum((lm_val$do - mean(lm_train$do))^2)
sse_gam <- sum((lm_val$do - gam_pred)^2)
gam_r2 <- 1 - (sse_gam / sst)
cat("\n--- Evaluasi Model GAM ---\n")
##
## --- Evaluasi Model GAM ---
cat("R² =", round(gam_r2,4), " | MSE =", round(gam_mse,4), " | RMSE =", round(gam_rmse,4), "\n")
## R² = 0.0758 | MSE = 0.8248 | RMSE = 0.9082
eval_df <- data.frame(
Model = c("Linear Regression", "GAM (Spline)"),
R2 = c(lm_r2, gam_r2),
MSE = c(lm_mse, gam_mse),
RMSE = c(lm_rmse, gam_rmse)
)
print(eval_df)
## Model R2 MSE RMSE
## 1 Linear Regression 0.006006105 0.8695753 0.9325102
## 2 GAM (Spline) 0.075829366 0.8247652 0.9081658
# Linear
p1 <- ggplot(lm_val, aes(x = do, y = lm_pred)) +
geom_point(color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Model Linear: Prediksi vs Aktual DO",
x = "DO Aktual", y = "DO Prediksi") +
theme_minimal()
# GAM
p2 <- ggplot(lm_val, aes(x = do, y = gam_pred)) +
geom_point(color = "darkgreen", size = 2) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Model GAM: Prediksi vs Aktual DO",
x = "DO Aktual", y = "DO Prediksi") +
theme_minimal()
print(p1)
print(p2)
## 25. Variabel Paling Berpengaruh
cat("\n--- VARIABEL PALING MEMPENGARUHI DO ---\n")
##
## --- VARIABEL PALING MEMPENGARUHI DO ---
# Untuk model Linear: lihat koefisien standar
lm_std <- lm(scale(do) ~ scale(ph) + scale(bod) + scale(tss) + scale(suhu), data = lm_train)
coef_std <- summary(lm_std)$coefficients
print(coef_std)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.403174e-16 0.05795032 5.872572e-15 1.0000000
## scale(ph) -8.069219e-03 0.05817796 -1.386989e-01 0.8897827
## scale(bod) 6.686340e-02 0.05838132 1.145288e+00 0.2530183
## scale(tss) 2.071325e-03 0.05807525 3.566622e-02 0.9715726
## scale(suhu) -4.426913e-02 0.05831184 -7.591792e-01 0.4483516
# Untuk model GAM: lihat signifikansi (p-value)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## do ~ s(ph) + s(bod) + s(tss) + s(suhu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9770 0.0535 111.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ph) 1.000 1.000 0.030 0.8632
## s(bod) 5.348 6.489 2.379 0.0311 *
## s(tss) 1.000 1.000 0.008 0.9310
## s(suhu) 1.343 1.617 1.114 0.4337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0396 Deviance explained = 6.76%
## -REML = 416.09 Scale est. = 0.85871 n = 300
cat("\nInterpretasi:\n")
##
## Interpretasi:
cat("- Pada model Linear, variabel dengan koefisien standar terbesar (positif atau negatif) paling memengaruhi DO.\n")
## - Pada model Linear, variabel dengan koefisien standar terbesar (positif atau negatif) paling memengaruhi DO.
cat("- Biasanya, BOD berpengaruh negatif terhadap DO (semakin tinggi BOD, semakin rendah oksigen terlarut).\n")
## - Biasanya, BOD berpengaruh negatif terhadap DO (semakin tinggi BOD, semakin rendah oksigen terlarut).
cat("- Pada model GAM, lihat p-value dan bentuk kurva smooth (fungsi s(...)) untuk melihat pengaruh non-linear.\n")
## - Pada model GAM, lihat p-value dan bentuk kurva smooth (fungsi s(...)) untuk melihat pengaruh non-linear.
cat("- Variabel dengan p < 0.05 dianggap berpengaruh signifikan terhadap DO.\n")
## - Variabel dengan p < 0.05 dianggap berpengaruh signifikan terhadap DO.
cat("\n--- RINGKASAN METRIK PENTING ---\n")
##
## --- RINGKASAN METRIK PENTING ---
cat("SVM Accuracy (val):", round(conf_svm$overall["Accuracy"],4), "\n")
## SVM Accuracy (val): 0.9153
cat("Tree Accuracy (val):", round(conf_tree$overall["Accuracy"],4), "\n")
## Tree Accuracy (val): 0.9661
cat("RF Accuracy (val):", round(conf_rf$overall["Accuracy"],4), "\n")
## RF Accuracy (val): 0.9831
cat("LM R2 (train):", round(lm_r2,4), " LM RMSE (val):", round(lm_rmse,4), "\n")
## LM R2 (train): 0.006 LM RMSE (val): 0.9325
cat("GAM RMSE (val):", round(gam_r2,4), " GAM pseudo-R2 (val):", round(gam_r2,4), "\n")
## GAM RMSE (val): 0.0758 GAM pseudo-R2 (val): 0.0758
cat("------------------------------\n")
## ------------------------------