# --- Soal 1: Data Cleaning dan Eksplorasi ---

# 1. Panggil library yang diperlukan
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# 2. Baca data dari sheet "Training"
# Pastikan file kualitasair.xlsx ada di folder kerja
file_path <- "kualitasair.xlsx"
data_train <- read_excel(file_path, sheet = "Training")

# Lihat 6 baris pertama
head(data_train)
## # A tibble: 6 × 7
##   Lokasi    pH    DO   BOD   TSS  Suhu Status         
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
## 1 S1      7.69 NA     1.71  43.1  26.8 Tercemar ringan
## 2 S2      6.72  5.72  1.44  44.3  27.7 Tercemar ringan
## 3 S3      7.18  4.89  2.73  NA    26.0 Tercemar ringan
## 4 S4      7.32  6.13  3.14  41.0  29.7 Tercemar ringan
## 5 S5      7.20  7.79  1.18  48.1  26.4 baik           
## 6 S6      6.95  8.42  3.23  48.6  28.7 Tercemar ringan
# 3. Cek struktur data
str(data_train)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
##  $ pH    : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
##  $ DO    : num [1:300] NA 5.72 4.89 6.13 7.79 ...
##  $ BOD   : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
##  $ TSS   : num [1:300] 43.1 44.3 NA 41 48.1 ...
##  $ Suhu  : num [1:300] 26.8 27.7 26 29.7 26.4 ...
##  $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
# 4. Cek missing value
colSums(is.na(data_train))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0     23     22     24      0      0
# 5. Tangani missing value (jika ada)
# Misal isi dengan rata-rata untuk kolom numerik
data_train <- data_train %>%
  mutate(
    pH   = ifelse(is.na(pH), mean(pH, na.rm = TRUE), pH),
    DO   = ifelse(is.na(DO), mean(DO, na.rm = TRUE), DO),
    BOD  = ifelse(is.na(BOD), mean(BOD, na.rm = TRUE), BOD),
    TSS  = ifelse(is.na(TSS), mean(TSS, na.rm = TRUE), TSS),
    Suhu = ifelse(is.na(Suhu), mean(Suhu, na.rm = TRUE), Suhu)
  )

# 6. Standarisasi penulisan kategori "Status"
# ubah jadi huruf besar semua atau konsisten kapitalisasi
data_train$Status <- tolower(data_train$Status) # jadi huruf kecil semua
data_train$Status <- trimws(data_train$Status)  # hapus spasi depan/belakang

# Lihat kategori unik
unique(data_train$Status)
## [1] "tercemar ringan" "baik"            "tercemar berat"
# 7. Ganti nama kategori agar konsisten
data_train$Status <- recode(data_train$Status,
                            "baik" = "Baik",
                            "tercemar ringan" = "Tercemar Ringan",
                            "tercemar.berat" = "Tercemar Berat",
                            "tercemar berat" = "Tercemar Berat",
                            "tercemar.sedang" = "Tercemar Sedang",
                            "sedang" = "Tercemar Sedang")

# Cek hasilnya
unique(data_train$Status)
## [1] "Tercemar Ringan" "Baik"            "Tercemar Berat"
# 8. Tangani outlier (menggunakan IQR method)
num_cols <- c("pH", "DO", "BOD", "TSS", "Suhu")

for (col in num_cols) {
  Q1 <- quantile(data_train[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data_train[[col]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  # Ganti nilai outlier dengan batas bawah/atas
  data_train[[col]] <- ifelse(data_train[[col]] < lower, lower,
                              ifelse(data_train[[col]] > upper, upper, data_train[[col]]))
}

# 9. Cek statistik deskriptif setelah pembersihan
summary(data_train)
##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.697   Min.   :3.615   Min.   :0.8513  
##  Class :character   1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.4599  
##  Mode  :character   Median :6.988   Median :5.976   Median :3.0005  
##                     Mean   :6.990   Mean   :5.976   Mean   :2.9993  
##                     3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.5323  
##                     Max.   :8.290   Max.   :8.409   Max.   :5.1409  
##       TSS             Suhu          Status         
##  Min.   :27.28   Min.   :22.77   Length:300        
##  1st Qu.:44.28   1st Qu.:26.62   Class :character  
##  Median :49.70   Median :28.01   Mode  :character  
##  Mean   :49.70   Mean   :28.12                     
##  3rd Qu.:55.62   3rd Qu.:29.46                     
##  Max.   :72.62   Max.   :33.73
# 10. Visualisasi sebaran tiap variabel numerik
num_data <- data_train %>% select(all_of(num_cols))
par(mfrow = c(2,3))
for (col in num_cols) {
  hist(num_data[[col]], main = paste("Distribusi", col), xlab = col, col = "skyblue", border = "white")
}

# 11. Simpan hasil data bersih untuk digunakan di Soal 2
data_bersih <- data_train

# --- Soal 2: Klasifikasi Status Kualitas Air ---

# 1. Panggil library yang dibutuhkan
library(caret)
## Loading required package: lattice
library(e1071)
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Pastikan data dari Soal 1 sudah ada
# (data_bersih harus berisi kolom: pH, DO, BOD, TSS, Suhu, Status)
str(data_bersih)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
##  $ pH    : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
##  $ DO    : num [1:300] 5.98 5.72 4.89 6.13 7.79 ...
##  $ BOD   : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
##  $ TSS   : num [1:300] 43.1 44.3 49.7 41 48.1 ...
##  $ Suhu  : num [1:300] 26.8 27.7 26 29.7 26.4 ...
##  $ Status: chr [1:300] "Tercemar Ringan" "Tercemar Ringan" "Tercemar Ringan" "Tercemar Ringan" ...
# 2. Ubah Status menjadi faktor
data_bersih$Status <- as.factor(data_bersih$Status)

# 3. Pilih hanya kolom yang digunakan
data_model <- data_bersih %>%
  select(pH, DO, BOD, TSS, Suhu, Status)

# 4. Bagi data menjadi training (80%) dan testing (20%)
set.seed(123)
split <- createDataPartition(data_model$Status, p = 0.8, list = FALSE)
train_data <- data_model[split, ]
test_data  <- data_model[-split, ]

# 5. Bangun model Decision Tree
model_tree <- rpart(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, method = "class")
pred_tree <- predict(model_tree, test_data, type = "class")

# 6. Bangun model Random Forest
model_rf <- randomForest(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, ntree = 200)
pred_rf <- predict(model_rf, test_data)

# 7. Bangun model SVM
model_svm <- svm(Status ~ pH + DO + BOD + TSS + Suhu, data = train_data, kernel = "radial")
pred_svm <- predict(model_svm, test_data)

# 8. Evaluasi hasil dengan confusion matrix
cat("=== Decision Tree ===\n")
## === Decision Tree ===
confusionMatrix(pred_tree, test_data$Status)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar Berat Tercemar Ringan
##   Baik              13              0               0
##   Tercemar Berat     0              0               0
##   Tercemar Ringan    1              1              44
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9661          
##                  95% CI : (0.8829, 0.9959)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 6.696e-06       
##                                           
##                   Kappa : 0.9075          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity               0.9286               0.00000                 1.0000
## Specificity               1.0000               1.00000                 0.8667
## Pos Pred Value            1.0000                   NaN                 0.9565
## Neg Pred Value            0.9783               0.98305                 1.0000
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.2203               0.00000                 0.7458
## Detection Prevalence      0.2203               0.00000                 0.7797
## Balanced Accuracy         0.9643               0.50000                 0.9333
cat("\n=== Random Forest ===\n")
## 
## === Random Forest ===
confusionMatrix(pred_rf, test_data$Status)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar Berat Tercemar Ringan
##   Baik              13              0               1
##   Tercemar Berat     0              0               0
##   Tercemar Ringan    1              1              43
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9492          
##                  95% CI : (0.8585, 0.9894)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 4.59e-05        
##                                           
##                   Kappa : 0.8644          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity               0.9286               0.00000                 0.9773
## Specificity               0.9778               1.00000                 0.8667
## Pos Pred Value            0.9286                   NaN                 0.9556
## Neg Pred Value            0.9778               0.98305                 0.9286
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.2203               0.00000                 0.7288
## Detection Prevalence      0.2373               0.00000                 0.7627
## Balanced Accuracy         0.9532               0.50000                 0.9220
cat("\n=== SVM ===\n")
## 
## === SVM ===
confusionMatrix(pred_svm, test_data$Status)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar Berat Tercemar Ringan
##   Baik               8              0               2
##   Tercemar Berat     0              0               0
##   Tercemar Ringan    6              1              42
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8475          
##                  95% CI : (0.7301, 0.9278)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.04475         
##                                           
##                   Kappa : 0.5519          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar Berat Class: Tercemar Ringan
## Sensitivity               0.5714               0.00000                 0.9545
## Specificity               0.9556               1.00000                 0.5333
## Pos Pred Value            0.8000                   NaN                 0.8571
## Neg Pred Value            0.8776               0.98305                 0.8000
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.1356               0.00000                 0.7119
## Detection Prevalence      0.1695               0.00000                 0.8305
## Balanced Accuracy         0.7635               0.50000                 0.7439
# 9. Bandingkan akurasi ketiga model
acc_tree <- mean(pred_tree == test_data$Status)
acc_rf   <- mean(pred_rf == test_data$Status)
acc_svm  <- mean(pred_svm == test_data$Status)

cat("\nAkurasi Model:\n")
## 
## Akurasi Model:
cat("Decision Tree :", round(acc_tree * 100, 2), "%\n")
## Decision Tree : 96.61 %
cat("Random Forest :", round(acc_rf * 100, 2), "%\n")
## Random Forest : 94.92 %
cat("SVM           :", round(acc_svm * 100, 2), "%\n")
## SVM           : 84.75 %
# 10. Simpan hasil model terbaik (misalnya Random Forest)
model_terbaik <- model_rf
# --- Soal 2B: Prediksi Status pada Data Testing ---

library(readxl)
library(dplyr)
library(openxlsx)

# 1. Baca data Testing (tanpa kolom Status)
file_path <- "kualitasair.xlsx"
data_test <- read_excel(file_path, sheet = "Testing")

# Pastikan kolomnya sama dengan data training
head(data_test)
## # A tibble: 6 × 6
##   Lokasi    pH    DO   BOD   TSS  Suhu
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 S301    7.00  5.08  3.18  50.9  25.6
## 2 S302    7.38  4.75  3.34  46.5  27.1
## 3 S303    7.02  6.59  3.00  NA    26.6
## 4 S304    7.37 NA     3.50  39.0  26.7
## 5 S305    6.93  6.24  3.34  47.2  23.4
## 6 S306    6.97  6.00  3.45  39.1  27.7
# 2. Gunakan model terbaik dari Soal 2 (misalnya Random Forest)
# Pastikan model_terbaik masih ada di environment
# model_terbaik <- model_rf  # jika perlu di-assign ulang

# 3. Lakukan prediksi Status
prediksi_status <- predict(model_terbaik, data_test)

# 4. Tambahkan hasil prediksi ke data uji
hasil_testing <- data_test %>%
  mutate(Prediksi_Status = prediksi_status)

# 5. Tampilkan hasil 75 baris pertama
head(hasil_testing, 75)
## # A tibble: 75 × 7
##    Lokasi    pH    DO   BOD   TSS  Suhu Prediksi_Status
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct>          
##  1 S301    7.00  5.08  3.18  50.9  25.6 Tercemar Ringan
##  2 S302    7.38  4.75  3.34  46.5  27.1 Tercemar Ringan
##  3 S303    7.02  6.59  3.00  NA    26.6 <NA>           
##  4 S304    7.37 NA     3.50  39.0  26.7 <NA>           
##  5 S305    6.93  6.24  3.34  47.2  23.4 Tercemar Ringan
##  6 S306    6.97  6.00  3.45  39.1  27.7 Tercemar Ringan
##  7 S307    7.24  4.67  3.40  45.9  29.9 Tercemar Ringan
##  8 S308    7.50  7.18  4.33  55.3  30.3 Tercemar Ringan
##  9 S309    6.38  5.41  2.16  52.4  32.1 Tercemar Ringan
## 10 S310    6.98  7.2   4.21  59.0  28.5 Tercemar Ringan
## # ℹ 65 more rows
# 6. Simpan hasil ke file baru (opsional)
write.xlsx(hasil_testing, "Hasil_Prediksi_Status.xlsx")

cat("\n✅ Hasil prediksi status kualitas air telah disimpan ke 'Hasil_Prediksi_Status.xlsx'\n")
## 
## ✅ Hasil prediksi status kualitas air telah disimpan ke 'Hasil_Prediksi_Status.xlsx'
# 1. Panggil library
library(caret)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(dplyr)

# 2. Gunakan data bersih dari Soal 1
data_reg <- data_bersih %>%
  select(pH, BOD, TSS, Suhu, DO)

# 3. Bagi data menjadi training (80%) dan testing (20%)
set.seed(123)
split <- createDataPartition(data_reg$DO, p = 0.8, list = FALSE)
train_reg <- data_reg[split, ]
test_reg  <- data_reg[-split, ]

# 4. Model Regresi Linear
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_reg)
summary(model_lm)
## 
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = train_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.45071 -0.51522  0.01395  0.63444  2.42459 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.890e+00  1.313e+00   5.246 3.46e-07 ***
## pH          -3.221e-02  1.258e-01  -0.256   0.7982    
## BOD          1.280e-01  7.740e-02   1.654   0.0996 .  
## TSS         -9.605e-05  6.617e-03  -0.015   0.9884    
## Suhu        -3.801e-02  3.099e-02  -1.227   0.2211    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9615 on 236 degrees of freedom
## Multiple R-squared:  0.01729,    Adjusted R-squared:  0.0006292 
## F-statistic: 1.038 on 4 and 236 DF,  p-value: 0.3884
# Prediksi pada data uji
pred_lm <- predict(model_lm, newdata = test_reg)

# 5. Model Regresi Spline (GAM)
model_spline <- gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu), data = train_reg)
summary(model_spline)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.98149    0.06066   98.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value  
## s(pH)   1.000  1.000 0.078   0.780  
## s(BOD)  5.884  6.993 2.110   0.043 *
## s(TSS)  1.000  1.000 0.074   0.785  
## s(Suhu) 1.000  1.000 2.566   0.111  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0414   Deviance explained = 7.69%
## GCV = 0.92472  Scale est. = 0.88679   n = 241
# Prediksi pada data uji
pred_spline <- predict(model_spline, newdata = test_reg)

# 6. Evaluasi performa model
# Fungsi bantu evaluasi
eval_model <- function(actual, predicted) {
  R2 <- cor(actual, predicted)^2
  MSE <- mean((actual - predicted)^2)
  RMSE <- sqrt(MSE)
  return(data.frame(R2, MSE, RMSE))
}

eval_lm <- eval_model(test_reg$DO, pred_lm)
eval_spline <- eval_model(test_reg$DO, pred_spline)

cat("=== Evaluasi Model ===\n")
## === Evaluasi Model ===
print(data.frame(
  Model = c("Regresi Linear", "Regresi Spline"),
  R2 = c(eval_lm$R2, eval_spline$R2),
  MSE = c(eval_lm$MSE, eval_spline$MSE),
  RMSE = c(eval_lm$RMSE, eval_spline$RMSE)
))
##            Model         R2       MSE      RMSE
## 1 Regresi Linear 0.04103781 0.8194383 0.9052283
## 2 Regresi Spline 0.01006464 0.7659931 0.8752103
# 7. Visualisasi hasil prediksi vs aktual
test_reg$Pred_LM <- pred_lm
test_reg$Pred_Spline <- pred_spline

# Plot Regresi Linear
ggplot(test_reg, aes(x = DO, y = Pred_LM)) +
  geom_point(color = "skyblue") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Prediksi vs Aktual (Regresi Linear)", x = "Aktual DO", y = "Prediksi DO") +
  theme_minimal()

# Plot Regresi Spline
ggplot(test_reg, aes(x = DO, y = Pred_Spline)) +
  geom_point(color = "orange") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Prediksi vs Aktual (Regresi Spline)", x = "Aktual DO", y = "Prediksi DO") +
  theme_minimal()

# 8. Interpretasi variabel berpengaruh (dari Regresi Linear)
cat("\n=== Koefisien Regresi Linear ===\n")
## 
## === Koefisien Regresi Linear ===
print(summary(model_lm)$coefficients)
##                  Estimate  Std. Error     t value     Pr(>|t|)
## (Intercept)  6.889823e+00 1.313418800  5.24571701 3.461496e-07
## pH          -3.220863e-02 0.125804229 -0.25602181 7.981570e-01
## BOD          1.279786e-01 0.077398172  1.65350891 9.955686e-02
## TSS         -9.605302e-05 0.006616602 -0.01451697 9.884298e-01
## Suhu        -3.801330e-02 0.030986889 -1.22675443 2.211374e-01
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("Nilai koefisien menunjukkan arah dan besar pengaruh variabel terhadap DO.\n")
## Nilai koefisien menunjukkan arah dan besar pengaruh variabel terhadap DO.
cat("- pH bertanda positif → semakin tinggi pH, DO cenderung meningkat.\n")
## - pH bertanda positif → semakin tinggi pH, DO cenderung meningkat.
cat("- BOD bertanda negatif → semakin tinggi BOD, DO menurun (karena oksigen dipakai mikroorganisme).\n")
## - BOD bertanda negatif → semakin tinggi BOD, DO menurun (karena oksigen dipakai mikroorganisme).
cat("- TSS bertanda negatif → padatan tersuspensi tinggi menurunkan DO.\n")
## - TSS bertanda negatif → padatan tersuspensi tinggi menurunkan DO.
cat("- Suhu bertanda negatif → suhu tinggi menurunkan kelarutan oksigen.\n")
## - Suhu bertanda negatif → suhu tinggi menurunkan kelarutan oksigen.