req <- c("readxl","dplyr","tidyr","ggplot2","caret","e1071","rpart","rpart.plot",
         "randomForest","mgcv","gridExtra","scales")
inst <- req[!req %in% installed.packages()[,"Package"]]
if(length(inst)) install.packages(inst, dependencies=TRUE)
lapply(req, library, character.only=TRUE)
## Warning: package 'readxl' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
## Warning: package 'e1071' was built under R version 4.3.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
## Warning: package 'rpart' was built under R version 4.3.3
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
## Warning: package 'gridExtra' was built under R version 4.3.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## [[1]]
## [1] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "dplyr"     "readxl"    "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "tidyr"     "dplyr"     "readxl"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "ggplot2"   "tidyr"     "dplyr"     "readxl"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "caret"     "lattice"   "ggplot2"   "tidyr"     "dplyr"     "readxl"   
##  [7] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [13] "base"     
## 
## [[6]]
##  [1] "e1071"     "caret"     "lattice"   "ggplot2"   "tidyr"     "dplyr"    
##  [7] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [13] "methods"   "base"     
## 
## [[7]]
##  [1] "rpart"     "e1071"     "caret"     "lattice"   "ggplot2"   "tidyr"    
##  [7] "dplyr"     "readxl"    "stats"     "graphics"  "grDevices" "utils"    
## [13] "datasets"  "methods"   "base"     
## 
## [[8]]
##  [1] "rpart.plot" "rpart"      "e1071"      "caret"      "lattice"   
##  [6] "ggplot2"    "tidyr"      "dplyr"      "readxl"     "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[9]]
##  [1] "randomForest" "rpart.plot"   "rpart"        "e1071"        "caret"       
##  [6] "lattice"      "ggplot2"      "tidyr"        "dplyr"        "readxl"      
## [11] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [16] "methods"      "base"        
## 
## [[10]]
##  [1] "mgcv"         "nlme"         "randomForest" "rpart.plot"   "rpart"       
##  [6] "e1071"        "caret"        "lattice"      "ggplot2"      "tidyr"       
## [11] "dplyr"        "readxl"       "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "gridExtra"    "mgcv"         "nlme"         "randomForest" "rpart.plot"  
##  [6] "rpart"        "e1071"        "caret"        "lattice"      "ggplot2"     
## [11] "tidyr"        "dplyr"        "readxl"       "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[12]]
##  [1] "scales"       "gridExtra"    "mgcv"         "nlme"         "randomForest"
##  [6] "rpart.plot"   "rpart"        "e1071"        "caret"        "lattice"     
## [11] "ggplot2"      "tidyr"        "dplyr"        "readxl"       "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"
set.seed(123)
data_raw <- readxl::read_excel("kualitasair.xlsx") %>% as.data.frame()
cat("Ukuran data awal:", dim(data_raw), "\n")
## Ukuran data awal: 300 7
head(data_raw)
##   Lokasi     pH     DO    BOD     TSS    Suhu          Status
## 1     S1 7.6855     NA 1.7136 43.1415 26.7972 Tercemar ringan
## 2     S2 6.7177 5.7236 1.4402 44.2963 27.7284 Tercemar ringan
## 3     S3 7.1816 4.8906 2.7274      NA 26.0255 Tercemar ringan
## 4     S4 7.3164 6.1339 3.1398 41.0104 29.6639 Tercemar ringan
## 5     S5 7.2021 7.7853 1.1778 48.0967 26.4099            baik
## 6     S6 6.9469 8.4222 3.2324 48.5610 28.6809 Tercemar ringan

SOAL 1: DATA CLEANING & EKSPLORASI (30%)

# 1.1 Standardize column names (toleransi)
names(data_raw) <- make.names(names(data_raw))

# Ensure expected columns exist or stop with message
expected <- c("Lokasi","pH","DO","BOD","TSS","Suhu","Status")
missing_cols <- setdiff(expected, names(data_raw))
if(length(missing_cols)>0){
  stop("Data tidak memiliki kolom: ", paste(missing_cols, collapse=", "),
       )
}
# 1.2 Inspect missing value counts
cat("\n-- Missing values (pre-clean) --\n")
## 
## -- Missing values (pre-clean) --
print(sapply(data_raw[expected], function(x) sum(is.na(x))))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0     23     22     24      0      0
# 1.3 Standarisasi Status penulisan (kosong/angka/teks)
data <- data_raw
# Trim and lowercase for mapping
data$Status <- as.character(data$Status)
data$Status[trimws(data$Status)==""] <- NA
st <- tolower(trimws(data$Status))
st[st %in% c("1","baik","baik=1")] <- "Baik"
st[st %in% c("2","tercemar ringan","tercemar_ringan","tercemar ringan=2")] <- "Tercemar ringan"
st[st %in% c("3","tercemar berat","tercemar_berat","tercemar berat=3")] <- "Tercemar berat"
# if still NA and DO exists, optionally infer from DO (only if user wants auto-label)
# For now keep NA (we'll drop unlabeled for classification), but will create labels for regression if needed.
data$Status <- ifelse(is.na(st), NA, st)
data$Status <- factor(data$Status, levels = c("Baik","Tercemar ringan","Tercemar berat"))
# 1.4 Imputasi missing numeric: impute median (defensive)
num_vars <- c("pH","DO","BOD","TSS","Suhu")
for(v in num_vars){
  if(!is.numeric(data[[v]])) data[[v]] <- as.numeric(data[[v]])
  med <- median(data[[v]], na.rm=TRUE)
  data[[v]][is.na(data[[v]])] <- med
}
# 1.5 Outlier detection (IQR) and winsorize to 1%/99% quantiles
winsorize <- function(x, probs=c(0.01,0.99)){ q <- quantile(x, probs=probs, na.rm=TRUE); pmax(pmin(x, q[2]), q[1]) }
for(v in num_vars){
  # report count outlier
  x <- data_raw[[v]]
  Q1 <- quantile(x,0.25,na.rm=TRUE); Q3 <- quantile(x,0.75,na.rm=TRUE); IQRv <- Q3-Q1
  lower <- Q1-1.5*IQRv; upper <- Q3+1.5*IQRv
  out_idx <- which(x < lower | x > upper)
  cat(sprintf("Variabel %s: outlier (IQR) = %d\n", v, length(out_idx)))
  # apply winsorize on cleaned data
  data[[v]] <- winsorize(data[[v]])
}
## Variabel pH: outlier (IQR) = 4
## Variabel DO: outlier (IQR) = 2
## Variabel BOD: outlier (IQR) = 3
## Variabel TSS: outlier (IQR) = 3
## Variabel Suhu: outlier (IQR) = 2
# 1.6 Jika ada Status NA, hapus baris tsb untuk analisis klasifikasi. (still keep for regression if needed)
cat("\nJumlah Status NA sebelum filter:", sum(is.na(data$Status)), "\n")
## 
## Jumlah Status NA sebelum filter: 0
data_class <- data %>% filter(!is.na(Status))  # untuk klasifikasi
cat("Ukuran data untuk klasifikasi (setelah hapus NA Status):", nrow(data_class), "\n")
## Ukuran data untuk klasifikasi (setelah hapus NA Status): 300
# 1.7 Ringkasan statistik deskriptif setelah pembersihan
desc_stats <- data %>%
  summarise(across(all_of(num_vars),
                   list(mean = ~mean(.), sd = ~sd(.), median = ~median(.), min = ~min(.), max = ~max(.)))) 
cat("\n-- Ringkasan statistik deskriptif (setelah pembersihan) --\n")
## 
## -- Ringkasan statistik deskriptif (setelah pembersihan) --
print(desc_stats)
##    pH_mean     pH_sd pH_median  pH_min   pH_max  DO_mean     DO_sd DO_median
## 1 6.989464 0.4857012   6.98805 5.77872 8.106373 5.977196 0.9387891    5.9909
##     DO_min   DO_max BOD_mean    BOD_sd BOD_median  BOD_min BOD_max TSS_mean
## 1 3.829614 8.223684 3.004537 0.7925544     3.0661 0.956324 5.09963 49.68304
##    TSS_sd TSS_median  TSS_min  TSS_max Suhu_mean  Suhu_sd Suhu_median Suhu_min
## 1 9.11172   49.52205 27.70734 72.22947  28.12117 2.046895    28.01485 23.52489
##   Suhu_max
## 1   33.247
cat("\nDistribusi Status (setelah cleaning):\n")
## 
## Distribusi Status (setelah cleaning):
print(table(data_class$Status))
## 
##            Baik Tercemar ringan  Tercemar berat 
##              72             221               7
# Save simple descriptive table
write.csv(data.frame(stat = names(desc_stats), t(desc_stats)), "descriptive_stats_after_cleaning.csv", row.names=FALSE)

SOAL 2: KLASIFIKASI STATUS KUALITAS AIR (35%)

# 2.1 Pastikan cukup baris untuk split 75
if(nrow(data_class) < 80){
  stop("Setelah pembersihan jumlah data berlabel kurang dari 80; tidak cukup untuk split. Hentikan.")
}
# 2.2 Split fixed test = 75
set.seed(123)
test_idx <- sample(seq_len(nrow(data_class)), size = 75)
test <- data_class[test_idx, ]
train <- data_class[-test_idx, ]
cat("\nUkuran train:", nrow(train), "Ukuran test:", nrow(test), "\n")
## 
## Ukuran train: 225 Ukuran test: 75
# 2.3 Prepare formula dan trainControl
features <- c("pH","DO","BOD","TSS","Suhu")
formula_clf <- as.formula(paste("Status ~", paste(features, collapse = " + ")))
trctrl <- trainControl(method="cv", number=5, classProbs = TRUE)
# --- Fix: Pastikan level factor valid untuk caret ---
orig_levels <- levels(train$Status) # simpan level asli
orig_levels
## [1] "Baik"            "Tercemar ringan" "Tercemar berat"
safe_levels <- make.names(orig_levels) # bikin versi valid (space -> .)
safe_levels
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
# Terapkan ke train & test
levels(train$Status) <- safe_levels
levels(test$Status)  <- safe_levels

# Cek ulang
levels(train$Status)
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
levels(test$Status)
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
# 2.4 SVM (classification)
svm_mod <- train(formula_clf, data = train, method = "svmRadial",
                 preProcess = c("center","scale"), trControl = trctrl, tuneLength = 5)
pred_svm <- predict(svm_mod, newdata = test)
cm_svm <- confusionMatrix(pred_svm, test$Status)
cat("\n--- SVM (classification) ---\n")
## 
## --- SVM (classification) ---
print(cm_svm)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar.ringan Tercemar.berat
##   Baik              17               3              0
##   Tercemar.ringan    4              50              1
##   Tercemar.berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8006, 0.9528)
##     No Information Rate : 0.7067          
##     P-Value [Acc > NIR] : 1e-04           
##                                           
##                   Kappa : 0.738           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity               0.8095                 0.9434               0.00000
## Specificity               0.9444                 0.7727               1.00000
## Pos Pred Value            0.8500                 0.9091                   NaN
## Neg Pred Value            0.9273                 0.8500               0.98667
## Prevalence                0.2800                 0.7067               0.01333
## Detection Rate            0.2267                 0.6667               0.00000
## Detection Prevalence      0.2667                 0.7333               0.00000
## Balanced Accuracy         0.8770                 0.8581               0.50000
# 2.5 Decision Tree (rpart)
tree_mod <- rpart::rpart(formula_clf, data = train, method = "class", control = rpart.control(cp=0.01))
pred_tree <- predict(tree_mod, newdata = test, type="class")
cm_tree <- confusionMatrix(pred_tree, test$Status)
cat("\n--- Decision Tree ---\n")
## 
## --- Decision Tree ---
print(cm_tree)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar.ringan Tercemar.berat
##   Baik              20               2              0
##   Tercemar.ringan    1              51              1
##   Tercemar.berat     0               0              0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9467         
##                  95% CI : (0.869, 0.9853)
##     No Information Rate : 0.7067         
##     P-Value [Acc > NIR] : 2.034e-07      
##                                          
##                   Kappa : 0.8726         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity               0.9524                 0.9623               0.00000
## Specificity               0.9630                 0.9091               1.00000
## Pos Pred Value            0.9091                 0.9623                   NaN
## Neg Pred Value            0.9811                 0.9091               0.98667
## Prevalence                0.2800                 0.7067               0.01333
## Detection Rate            0.2667                 0.6800               0.00000
## Detection Prevalence      0.2933                 0.7067               0.00000
## Balanced Accuracy         0.9577                 0.9357               0.50000
# optional plot: rpart.plot(tree_mod)
# 2.6 Random Forest
rf_mod <- randomForest::randomForest(formula_clf, data = train, ntree = 500, importance = TRUE)
pred_rf <- predict(rf_mod, newdata = test)
cm_rf <- confusionMatrix(pred_rf, test$Status)
cat("\n--- Random Forest ---\n")
## 
## --- Random Forest ---
print(cm_rf)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar.ringan Tercemar.berat
##   Baik              20               1              0
##   Tercemar.ringan    1              52              1
##   Tercemar.berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.96            
##                  95% CI : (0.8875, 0.9917)
##     No Information Rate : 0.7067          
##     P-Value [Acc > NIR] : 2.622e-08       
##                                           
##                   Kappa : 0.9031          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity               0.9524                 0.9811               0.00000
## Specificity               0.9815                 0.9091               1.00000
## Pos Pred Value            0.9524                 0.9630                   NaN
## Neg Pred Value            0.9815                 0.9524               0.98667
## Prevalence                0.2800                 0.7067               0.01333
## Detection Rate            0.2667                 0.6933               0.00000
## Detection Prevalence      0.2800                 0.7200               0.00000
## Balanced Accuracy         0.9669                 0.9451               0.50000
# --- Fix: Pastikan level factor valid untuk caret ---
orig_levels <- levels(train$Status) # simpan level asli
orig_levels
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
safe_levels <- make.names(orig_levels) # bikin versi valid (space -> .)
safe_levels
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
# Terapkan ke train & test
levels(train$Status) <- safe_levels
levels(test$Status)  <- safe_levels

# Cek ulang
levels(train$Status)
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
levels(test$Status)
## [1] "Baik"            "Tercemar.ringan" "Tercemar.berat"
# 2.7 Summary accuracy table
acc_tbl <- data.frame(
  Model = c("SVM","Decision Tree","Random Forest"),
  Accuracy = c(as.numeric(cm_svm$overall["Accuracy"]),
               as.numeric(cm_tree$overall["Accuracy"]),
               as.numeric(cm_rf$overall["Accuracy"]))
)
cat("\nRingkasan akurasi:\n")
## 
## Ringkasan akurasi:
print(acc_tbl)
##           Model  Accuracy
## 1           SVM 0.8933333
## 2 Decision Tree 0.9466667
## 3 Random Forest 0.9600000
# Save confusion matrices to CSVs
write.csv(as.data.frame(cm_svm$table), "cm_svm.csv", row.names=FALSE)
write.csv(as.data.frame(cm_tree$table), "cm_tree.csv", row.names=FALSE)
write.csv(as.data.frame(cm_rf$table), "cm_rf.csv", row.names=FALSE)

# Simple interpretation strings (console)
cat("\nInterpretasi singkat:\n")
## 
## Interpretasi singkat:
cat(sprintf("SVM accuracy = %.2f%%\n", acc_tbl$Accuracy[1]*100))
## SVM accuracy = 89.33%
cat(sprintf("Decision Tree accuracy = %.2f%%\n", acc_tbl$Accuracy[2]*100))
## Decision Tree accuracy = 94.67%
cat(sprintf("Random Forest accuracy = %.2f%%\n", acc_tbl$Accuracy[3]*100))
## Random Forest accuracy = 96.00%
cat("Rekomendasi: gunakan Random Forest bila mengutamakan akurasi & stabilitas.\n")
## Rekomendasi: gunakan Random Forest bila mengutamakan akurasi & stabilitas.
hasil_prediksi <- data.frame(
  Lokasi = test$Lokasi,
  pH = test$pH,
  DO_Aktual = test$DO,
  BOD = test$BOD,
  TSS = test$TSS,
  Suhu = test$Suhu,
  Status_Aktual = test$Status,
  Prediksi_Status_RF = pred_rf,
  Prediksi_Status_SVM = pred_svm,
  Prediksi_Status_Tree = pred_tree
)
write.csv(hasil_prediksi, "cm_rf.csv", row.names = FALSE)
cat("File 'cm_rf.csv' berisi 75 baris dibuat di:", getwd(), "\n")
## File 'cm_rf.csv' berisi 75 baris dibuat di: C:/Users/alima/OneDrive/ドキュメント

SOAL 3: PREDIKSI VARIABEL DO (35%)

# For regression, we can use full cleaned data (including rows where Status may be NA)
# but we'll ensure no NA in predictors/DO
data_reg <- data %>% select(Lokasi, pH, DO, BOD, TSS, Suhu, Status)
data_reg <- data_reg[complete.cases(data_reg[, c("DO","pH","BOD","TSS","Suhu")]), ]
# Create train/test split for regression (use same test indexes as classification if desired)
# Here we reuse the same test indices for comparability: if test indices exist in data_reg, use them; else do 20% split
if(all(test$Lokasi %in% data_reg$Lokasi)){
  test_do <- data_reg[data_reg$Lokasi %in% test$Lokasi, ]
  train_do <- data_reg[!data_reg$Lokasi %in% test$Lokasi, ]
} else {
  set.seed(123)
  idx <- createDataPartition(data_reg$DO, p = 0.8, list = FALSE)
  train_do <- data_reg[idx, ]
  test_do <- data_reg[-idx, ]
}

cat("\nUkuran train_do:", nrow(train_do), "Ukuran test_do:", nrow(test_do), "\n")
## 
## Ukuran train_do: 225 Ukuran test_do: 75
# 3.1 Linear Regression
lm_mod <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_do)
summary(lm_mod)
## 
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = train_do)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17410 -0.50822  0.02886  0.64160  2.22459 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.009774   1.318828   4.557 8.61e-06 ***
## pH           0.053710   0.129267   0.415    0.678    
## BOD          0.116030   0.076947   1.508    0.133    
## TSS          0.007931   0.006708   1.182    0.238    
## Suhu        -0.042695   0.030960  -1.379    0.169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9352 on 220 degrees of freedom
## Multiple R-squared:  0.02277,    Adjusted R-squared:  0.005002 
## F-statistic: 1.282 on 4 and 220 DF,  p-value: 0.2782
# 3.2 Spline / GAM
gam_mod <- mgcv::gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu), data = train_do)
summary(gam_mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.92805    0.05961   99.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value   
## s(pH)   1.000  1.000 0.064 0.80085   
## s(BOD)  6.651  7.738 3.220 0.00213 **
## s(TSS)  1.000  1.000 0.870 0.35196   
## s(Suhu) 1.000  1.000 2.999 0.08478 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0906   Deviance explained =   13%
## GCV = 0.83914  Scale est. = 0.79942   n = 225
# 3.3 Predictions
pred_lm <- predict(lm_mod, newdata = test_do)
pred_gam <- predict(gam_mod, newdata = test_do)
# 3.4 Evaluation metrics
mse <- function(obs,pred) mean((obs - pred)^2, na.rm=TRUE)
rmse <- function(obs,pred) sqrt(mse(obs,pred))
r2 <- function(obs,pred){
  ss_res <- sum((obs - pred)^2, na.rm=TRUE)
  ss_tot <- sum((obs - mean(obs, na.rm=TRUE))^2, na.rm=TRUE)
  1 - ss_res/ss_tot
}

eval_reg <- data.frame(
  Model = c("Linear Regression", "Spline (GAM)"),
  R2 = c(r2(test_do$DO, pred_lm), r2(test_do$DO, pred_gam)),
  MSE = c(mse(test_do$DO, pred_lm), mse(test_do$DO, pred_gam)),
  RMSE = c(rmse(test_do$DO, pred_lm), rmse(test_do$DO, pred_gam))
)
cat("\nEvaluasi model prediksi DO:\n")
## 
## Evaluasi model prediksi DO:
print(eval_reg)
##               Model         R2       MSE      RMSE
## 1 Linear Regression -0.1323336 0.9727259 0.9862687
## 2      Spline (GAM) -0.1783631 1.0122673 1.0061150
write.csv(eval_reg, "eval_prediksi_DO.csv", row.names=FALSE)
# 3.5 Visualisasi Prediksi vs Aktual
p1 <- ggplot(data.frame(Aktual=test_do$DO, Prediksi=pred_lm), aes(x=Aktual, y=Prediksi)) +
  geom_point() + geom_abline(slope=1, intercept=0, linetype="dashed") +
  labs(title="Linear Regression: DO Aktual vs Prediksi", x="DO Aktual", y="DO Prediksi") +
  theme_minimal()

p2 <- ggplot(data.frame(Aktual=test_do$DO, Prediksi=pred_gam), aes(x=Aktual, y=Prediksi)) +
  geom_point() + geom_abline(slope=1, intercept=0, linetype="dashed") +
  labs(title="GAM (spline): DO Aktual vs Prediksi", x="DO Aktual", y="DO Prediksi") +
  theme_minimal()

# display plots side by side
gridExtra::grid.arrange(p1,p2, ncol=2)

# 3.6 Variable importance for DO (random forest regression)
rf_reg <- randomForest::randomForest(DO ~ pH + BOD + TSS + Suhu, data = train_do, ntree = 500, importance = TRUE)
cat("\nVariable importance (regression RF):\n")
## 
## Variable importance (regression RF):
print(importance(rf_reg))
##         %IncMSE IncNodePurity
## pH   -1.8782446      38.25933
## BOD   7.9625686      47.41260
## TSS   2.9462298      43.57158
## Suhu  0.2532323      41.57513
varImpPlot(rf_reg, main="Variable Importance - RF (DO)")

# Save DO predictions
hasil_do <- data.frame(Lokasi = test_do$Lokasi, DO_Aktual = test_do$DO, Pred_LM = pred_lm, Pred_GAM = pred_gam)
write.csv(hasil_do, "hasil_prediksi_DO_test.csv", row.names = FALSE)
cat("File 'hasil_prediksi_DO_test.csv' dibuat.\n")
## File 'hasil_prediksi_DO_test.csv' dibuat.

RINGKASAN SINGKAT - Soal1: Missing values ditangani (imputasi median), outlier dibatasi (winsorize), Status distandarkan. - Soal2: Klasifikasi (SVM, Decision Tree, Random Forest). Akurasi tercatat pada tabel ‘acc_tbl’. Random Forest direkomendasikan. - Soal3: Prediksi DO -> GAM (spline) menangkap komponen non-linear; cek ‘eval_prediksi_DO.csv’ untuk R2/MSE/RMSE. - Hasil 75 baris prediksi disimpan: ‘cm_rf.csv’ (status) dan ‘hasil_prediksi_DO_test.csv’ (DO predictions).