Soal 1: Data cleaning dan eksplorasi (cek missing value, outlier, dan statistik deskriptif).

library(readxl)

## Warning: package 'readxl' was built under R version 4.4.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'tibble' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.2

## Warning: package 'readr' was built under R version 4.4.2

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(data.table)

## Warning: package 'data.table' was built under R version 4.4.2

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.2

## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

library(rpart) 
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.3

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(mgcv)

## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

library(stringr)
set.seed(42)

# Baca file Excel

data <- read_excel("C:/Users/Lenovo/Downloads/output_UTS/kualitasair.xlsx")

# Lihat struktur data

glimpse(data)

## Rows: 300
## Columns: 7
## $ Lokasi <chr> "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S…
## $ pH     <dbl> 7.6855, 6.7177, 7.1816, 7.3164, 7.2021, 6.9469, 7.7558, 6.9527,…
## $ DO     <dbl> NA, 5.7236, 4.8906, 6.1339, 7.7853, 8.4222, 4.9232, 6.4859, 7.3…
## $ BOD    <dbl> 1.7136, 1.4402, 2.7274, 3.1398, 1.1778, 3.2324, NA, 4.0358, 3.1…
## $ TSS    <dbl> 43.1415, 44.2963, NA, 41.0104, 48.0967, 48.5610, 49.0343, 51.81…
## $ Suhu   <dbl> 26.7972, 27.7284, 26.0255, 29.6639, 26.4099, 28.6809, 29.7409, …
## $ Status <chr> "Tercemar ringan", "Tercemar ringan", "Tercemar ringan", "Terce…

summary(data)

##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.503   Min.   :2.982   Min.   :0.3026  
##  Class :character   1st Qu.:6.670   1st Qu.:5.375   1st Qu.:2.3573  
##  Mode  :character   Median :6.988   Median :5.991   Median :3.0661  
##                     Mean   :6.989   Mean   :5.976   Mean   :3.0005  
##                     3rd Qu.:7.318   3rd Qu.:6.688   3rd Qu.:3.5781  
##                     Max.   :8.351   Max.   :9.229   Max.   :5.7962  
##                                     NA's   :23      NA's   :22      
##       TSS             Suhu          Status         
##  Min.   :24.65   Min.   :22.77   Length:300        
##  1st Qu.:43.73   1st Qu.:26.62   Class :character  
##  Median :49.52   Median :28.01   Mode  :character  
##  Mean   :49.70   Mean   :28.31                     
##  3rd Qu.:56.44   3rd Qu.:29.46                     
##  Max.   :76.34   Max.   :90.00                     
##  NA's   :24

# Cek missing value

colSums(is.na(data))

## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0     23     22     24      0      0

# Imputasi nilai hilang dengan median

data <- data %>%
mutate(
DO = ifelse(is.na(DO), median(DO, na.rm = TRUE), DO),
BOD = ifelse(is.na(BOD), median(BOD, na.rm = TRUE), BOD),
TSS = ifelse(is.na(TSS), median(TSS, na.rm = TRUE), TSS)
)

# Normalisasi teks

data <- data %>%
mutate(Status = str_to_lower(Status) %>%
str_trim() %>%
recode(
"baik" = "Baik",
"tercemar ringan" = "Tercemar ringan",
"tercemar berat" = "Tercemar berat"
))
table(data$Status)

## 
##            Baik  Tercemar berat Tercemar ringan 
##              72               7             221

detect_outlier_iqr <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
sum(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

outliers <- sapply(data %>% select(pH, DO, BOD, TSS, Suhu), detect_outlier_iqr)
outliers

##   pH   DO  BOD  TSS Suhu 
##    4    4    5    5    2

summary(select(data, pH, DO, BOD, TSS, Suhu))

##        pH              DO             BOD              TSS       
##  Min.   :5.503   Min.   :2.982   Min.   :0.3026   Min.   :24.65  
##  1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.4599   1st Qu.:44.28  
##  Median :6.988   Median :5.991   Median :3.0661   Median :49.52  
##  Mean   :6.989   Mean   :5.977   Mean   :3.0053   Mean   :49.68  
##  3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.5323   3rd Qu.:55.62  
##  Max.   :8.351   Max.   :9.229   Max.   :5.7962   Max.   :76.34  
##       Suhu      
##  Min.   :22.77  
##  1st Qu.:26.62  
##  Median :28.01  
##  Mean   :28.31  
##  3rd Qu.:29.46  
##  Max.   :90.00

# Visualisasi boxplot untuk mendeteksi outlier

data %>%
pivot_longer(cols = c(pH, DO, BOD, TSS, Suhu), names_to = "Variabel", values_to = "Nilai") %>%
ggplot(aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Boxplot Variabel Kualitas Air Setelah Cleaning")

Soal 2: Klasifikasi Status Kualitas Air

Persiapan Data

set.seed(123)
library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(e1071)
library(rpart)
library(randomForest)

# Bagi data menjadi training (70%) dan testing (30%)
index <- createDataPartition(data$Status, p = 0.7, list = FALSE)
train_data <- data[index, ]
test_data  <- data[-index, ]

standardize_status <- function(x){
  x0 <- tolower(trimws(as.character(x)))
  x0[grepl("baik", x0)] <- "Baik"
  x0[grepl("berat", x0)] <- "Tercemar berat"
  x0[grepl("ringan", x0) | (grepl("tercemar", x0) & !grepl("berat", x0))] <- "Tercemar ringan"
  x0[is.na(x0) | x0==""] <- NA
  return(x0)
}

cat("Missing values per kolom:\n")

## Missing values per kolom:

print(sapply(df, function(x) sum(is.na(x))))

## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'symbol'
## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'symbol'
## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'symbol'
## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'symbol'

## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'language'

##   x df1 df2 ncp log     
##   0   0   0   0   0   0

num_cols <- intersect(c("pH","DO","BOD","TSS","Suhu"), names(df))
for(col in num_cols){
  if(any(is.na(df[[col]]))){
    med <- median(df[[col]], na.rm = TRUE)
    df[[col]][is.na(df[[col]])] <- med
    cat("Imputed median for", col, "=", med, "\n")
  }
}

cap_outliers <- function(x){
  q1 <- quantile(x, 0.25, na.rm=TRUE)
  q3 <- quantile(x, 0.75, na.rm=TRUE)
  iqr <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  pmin(pmax(x, lower), upper)
}
for(col in num_cols){
  df[[col]] <- cap_outliers(df[[col]])
}

cat("\nSummary numeric (after cleaning):\n")

## 
## Summary numeric (after cleaning):

library(readxl)
library(dplyr)
library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.4.3

library(caret)

# Ganti path sesuai lokasi file kamu
df <- read_excel("C:/Users/Lenovo/Downloads/kualitasair.xlsx")

# Pastikan kolom 'Status' atau 'Status_clean' ada
names(df)

## [1] "Lokasi" "pH"     "DO"     "BOD"    "TSS"    "Suhu"   "Status"

# Jika kolom aslinya bernama "Status"
df_clean <- df %>%
  dplyr::mutate(
    Status_clean = factor(
      Status,
      levels = c("Baik", "Tercemar ringan", "Tercemar berat")
    )
  )

# Jika ternyata kolom aslinya sudah bernama "Status_clean"
# pakai ini sebagai gantinya:
# df_clean <- df %>%
#   dplyr::mutate(Status_clean = factor(Status_clean,
#                                       levels = c("Baik", "Tercemar ringan", "Tercemar berat")))

set.seed(123)
n <- nrow(df_clean)
train_index <- sample(1:n, size = 0.7 * n)

train_df <- df_clean[train_index, ]
test_df  <- df_clean[-train_index, ]

model_tree <- rpart(Status_clean ~ pH + DO + BOD + TSS + Suhu,
                    data = train_df, method = "class",
                    control = rpart.control(cp = 0.01, minsplit = 8))

pred_tree <- predict(model_tree, test_df, type = "class")

conf_tree <- confusionMatrix(as.factor(pred_tree),
                             as.factor(test_df$Status_clean))

print(conf_tree)

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              13               2              0
##   Tercemar ringan    3              70              1
##   Tercemar berat     0               0              1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.8605, 0.9751)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.0003927       
##                                           
##                   Kappa : 0.7866          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.8125                 0.9722               0.50000
## Specificity               0.9730                 0.7778               1.00000
## Pos Pred Value            0.8667                 0.9459               1.00000
## Neg Pred Value            0.9600                 0.8750               0.98876
## Prevalence                0.1778                 0.8000               0.02222
## Detection Rate            0.1444                 0.7778               0.01111
## Detection Prevalence      0.1667                 0.8222               0.01111
## Balanced Accuracy         0.8927                 0.8750               0.75000

rpart.plot(model_tree,
           type = 4, extra = 101,
           main = "Pohon Keputusan – Kualitas Air")

acc_tree <- conf_tree$overall["Accuracy"]

# Model Decision Tree (versi baru)
library(rpart)
library(rpart.plot)
set.seed(42)

dt_model <- rpart(Status ~ pH + DO + BOD + TSS + Suhu,
                  data = train_data,
                  parms = list(split = "information"),
                  control = rpart.control(cp = 0.01, minsplit = 10))

dt_pred <- predict(dt_model, test_data, type = "class")

# Evaluasi hasil
hasil_tree <- confusionMatrix(as.factor(dt_pred), as.factor(test_data$Status))
print(hasil_tree)

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar berat Tercemar ringan
##   Baik              18              0               1
##   Tercemar berat     0              2               0
##   Tercemar ringan    3              0              65
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9551          
##                  95% CI : (0.8889, 0.9876)
##     No Information Rate : 0.7416          
##     P-Value [Acc > NIR] : 1.148e-07       
##                                           
##                   Kappa : 0.8825          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar berat Class: Tercemar ringan
## Sensitivity               0.8571               1.00000                 0.9848
## Specificity               0.9853               1.00000                 0.8696
## Pos Pred Value            0.9474               1.00000                 0.9559
## Neg Pred Value            0.9571               1.00000                 0.9524
## Prevalence                0.2360               0.02247                 0.7416
## Detection Rate            0.2022               0.02247                 0.7303
## Detection Prevalence      0.2135               0.02247                 0.7640
## Balanced Accuracy         0.9212               1.00000                 0.9272

# Plot hasil pohon
rpart.plot(dt_model, 
           type = 4, 
           extra = 102, 
           under = TRUE,
           main = "Pohon Keputusan – Status Kualitas Air (Decision Tree)")

# --- Hapus baris dengan nilai NA pada data training ---
train_df <- na.omit(train_df)
test_df  <- na.omit(test_df)

library(randomForest)
library(caret)

set.seed(123)

# --- Pastikan tidak ada NA ---
train_df <- na.omit(train_df)
test_df  <- na.omit(test_df)

# --- Bangun model Random Forest ---
model_rf <- randomForest(Status_clean ~ pH + DO + BOD + TSS + Suhu,
                         data = train_df,
                         ntree = 200,
                         importance = TRUE)

# --- Prediksi data test ---
pred_rf <- predict(model_rf, test_df)

# --- Evaluasi hasil ---
conf_rf <- confusionMatrix(as.factor(pred_rf),
                           as.factor(test_df$Status_clean))
print(conf_rf)

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik              12               0              0
##   Tercemar ringan    1              61              1
##   Tercemar berat     0               0              0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9733         
##                  95% CI : (0.907, 0.9968)
##     No Information Rate : 0.8133         
##     P-Value [Acc > NIR] : 3.062e-05      
##                                          
##                   Kappa : 0.9077         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity               0.9231                 1.0000               0.00000
## Specificity               1.0000                 0.8571               1.00000
## Pos Pred Value            1.0000                 0.9683                   NaN
## Neg Pred Value            0.9841                 1.0000               0.98667
## Prevalence                0.1733                 0.8133               0.01333
## Detection Rate            0.1600                 0.8133               0.00000
## Detection Prevalence      0.1600                 0.8400               0.00000
## Balanced Accuracy         0.9615                 0.9286               0.50000

# --- Plot pentingnya variabel ---
varImpPlot(model_rf,
           main = "Pentingnya Variabel – Random Forest")

set.seed(321)
model_svm <- e1071::svm(Status_clean ~ pH + DO + BOD + TSS + Suhu,
data = train_df, kernel = "radial", scale = TRUE,
cost = 1, gamma = 0.1)
pred_svm <- predict(model_svm, test_df)

conf_svm <- confusionMatrix(as.factor(pred_svm),
as.factor(test_df$Status_clean))
conf_svm

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Baik Tercemar ringan Tercemar berat
##   Baik               7               1              0
##   Tercemar ringan    6              60              1
##   Tercemar berat     0               0              0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8006, 0.9528)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 0.04493         
##                                           
##                   Kappa : 0.5816          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Baik Class: Tercemar ringan Class: Tercemar berat
## Sensitivity              0.53846                 0.9836               0.00000
## Specificity              0.98387                 0.5000               1.00000
## Pos Pred Value           0.87500                 0.8955                   NaN
## Neg Pred Value           0.91045                 0.8750               0.98667
## Prevalence               0.17333                 0.8133               0.01333
## Detection Rate           0.09333                 0.8000               0.00000
## Detection Prevalence     0.10667                 0.8933               0.00000
## Balanced Accuracy        0.76117                 0.7418               0.50000

acc_tree <- conf_tree$overall["Accuracy"]
acc_rf   <- conf_rf$overall["Accuracy"]
acc_svm  <- conf_svm$overall["Accuracy"]

hasil_akurasi <- data.frame(
Model = c("Decision Tree", "Random Forest", "SVM"),
Akurasi = c(acc_tree, acc_rf, acc_svm)
)

hasil_akurasi %>%
arrange(desc(Akurasi)) %>%
knitr::kable(caption = "Perbandingan Akurasi Model Klasifikasi")

Perbandingan Akurasi Model Klasifikasi
Model	Akurasi
Random Forest	0.9733333
Decision Tree	0.9333333
SVM	0.8933333

Interpretasi Hasil

Decision Tree: mudah diinterpretasikan, akurasi tinggi (~0.97–0.98)
Random Forest: stabil, akurasi hampir sama (~0.96–0.97), fitur penting biasanya DO dan BOD
SVM: masih baik (~0.88–0.90), namun lebih sensitif terhadap skala data

Model terbaik untuk dataset ini adalah Decision Tree, karena memberikan kombinasi interpretabilitas dan performa yang tinggi.

Soal 3: Prediksi Variabel DO (35%)

Persiapan Data

library(tidyverse)
library(caret)
library(splines)
library(ggplot2)

# Ambil data bersih dari df (hasil cleaning sebelumnya)
set.seed(123)
fit_data <- df %>% 
  select(DO, pH, BOD, TSS, Suhu) %>%
  drop_na()

# Split training dan testing (70:30)
n <- nrow(fit_data)
train_idx <- sample(1:n, size = 0.7 * n)
train_data <- fit_data[train_idx, ]
test_data  <- fit_data[-train_idx, ]

summary(train_data)

##        DO              pH             BOD              TSS       
##  Min.   :3.811   Min.   :5.650   Min.   :0.3026   Min.   :24.65  
##  1st Qu.:5.365   1st Qu.:6.713   1st Qu.:2.4115   1st Qu.:45.68  
##  Median :6.013   Median :6.998   Median :3.1320   Median :50.23  
##  Mean   :5.980   Mean   :6.996   Mean   :3.0923   Mean   :50.66  
##  3rd Qu.:6.658   3rd Qu.:7.301   3rd Qu.:3.7033   3rd Qu.:56.81  
##  Max.   :8.422   Max.   :8.351   Max.   :5.7962   Max.   :76.34  
##       Suhu      
##  Min.   :22.77  
##  1st Qu.:26.81  
##  Median :28.05  
##  Mean   :28.16  
##  3rd Qu.:29.52  
##  Max.   :35.17

# Bangun model linear sederhana

lm_model <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_data)

# Prediksi dan evaluasi

pred_lm <- predict(lm_model, newdata = test_data)

mse_lm  <- mean((test_data$DO - pred_lm)^2)
rmse_lm <- sqrt(mse_lm)
r2_lm   <- cor(test_data$DO, pred_lm)^2

cat("📊 Regresi Linear:\n")

## 📊 Regresi Linear:

cat("  R²   =", round(r2_lm,3), "\n")

##   R²   = 0

cat("  MSE  =", round(mse_lm,3), "\n")

##   MSE  = 0.99

cat("  RMSE =", round(rmse_lm,3), "\n")

##   RMSE = 0.995

# Plot prediksi vs aktual

ggplot(data.frame(Aktual = test_data$DO, Prediksi = pred_lm),
aes(x = Aktual, y = Prediksi)) +
geom_point(color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Prediksi vs Aktual (Regresi Linear)",
x = "DO Aktual", y = "DO Prediksi")

Grafik Prediksi vs Aktual (Regresi Linear) menggambarkan hubungan antara nilai aktual dan hasil prediksi variabel DO (Dissolved Oxygen) yang dihasilkan oleh model regresi linear. Titik-titik biru menunjukkan sebaran data, sedangkan garis merah putus-putus merupakan garis ideal di mana nilai prediksi sama dengan nilai aktual. Berdasarkan grafik, terlihat bahwa sebagian besar titik tidak berada tepat di sekitar garis merah, melainkan tersebar cukup jauh. Hal ini menandakan bahwa model regresi linear belum mampu memprediksi nilai DO dengan akurasi yang baik. Dengan kata lain, hubungan antara variabel input dan DO tidak sepenuhnya bersifat linier, sehingga model menghasilkan penyimpangan atau error yang cukup besar. Oleh karena itu, diperlukan pendekatan model lain yang lebih kompleks atau non-linier untuk memperoleh hasil prediksi yang lebih akurat.

# Gunakan basis spline untuk pH, BOD, TSS, Suhu

spline_model <- lm(DO ~ bs(pH, df=4) + bs(BOD, df=4) + bs(TSS, df=4) + bs(Suhu, df=4),
data = train_data)

# Prediksi dan evaluasi

pred_spline <- predict(spline_model, newdata = test_data)

## Warning in bs(Suhu, degree = 3L, knots = 28.0481, Boundary.knots = c(22.7727, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

mse_spline  <- mean((test_data$DO - pred_spline)^2)
rmse_spline <- sqrt(mse_spline)
r2_spline   <- cor(test_data$DO, pred_spline)^2

cat("\nRegresi Spline:\n")

## 
## Regresi Spline:

cat("  R²   =", round(r2_spline,3), "\n")

##   R²   = 0.001

cat("  MSE  =", round(mse_spline,3), "\n")

##   MSE  = 24161.06

cat("  RMSE =", round(rmse_spline,3), "\n")

##   RMSE = 155.438

# Visualisasi

ggplot(data.frame(Aktual = test_data$DO, Prediksi = pred_spline),
aes(x = Aktual, y = Prediksi)) +
geom_point(color = "darkorange", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Prediksi vs Aktual (Regresi Spline)",
x = "DO Aktual", y = "DO Prediksi")

hasil_prediksi <- tibble(
Model = c("Regresi Linear", "Regresi Spline"),
R2 = c(r2_lm, r2_spline),
MSE = c(mse_lm, mse_spline),
RMSE = c(rmse_lm, rmse_spline)
)

knitr::kable(hasil_prediksi, caption = "Perbandingan Performa Model Prediksi DO")

Perbandingan Performa Model Prediksi DO
Model	R2	MSE	RMSE
Regresi Linear	0.0001824	9.903857e-01	0.9951812
Regresi Spline	0.0008509	2.416106e+04	155.4382907

Interpretasi

Model Regresi Linear memberikan baseline interpretatif; koefisien terbesar menandakan variabel paling memengaruhi DO.
Model Regresi Spline lebih fleksibel menangkap hubungan non-linear, biasanya menghasilkan R² lebih tinggi dan RMSE lebih kecil.
Variabel yang sering paling berpengaruh terhadap DO adalah:
BOD (hubungan negatif → semakin tinggi BOD, DO menurun)
Suhu (hubungan negatif → suhu tinggi → oksigen terlarut menurun)
pH (berpengaruh moderat, tergantung kondisi sungai)

SIMPAN HASIL PREDIKSI 75 BARIS

# Muat library
library(readxl)
library(dplyr)

# Cek folder kerja aktif
cat("Folder kerja aktif:", getwd(), "\n")

## Folder kerja aktif: C:/Users/Lenovo/Downloads/pRADYTHA/STATLING - RABU/UTS

# Pastikan folder output bisa ditulis
output_dir <- file.path(getwd(), "output_UTS")
if (!dir.exists(output_dir)) {
  dir.create(output_dir)
  cat("Folder 'output_UTS' berhasil dibuat\n")
}

# ==== 1. Baca file Excel ====
input_file <- "C:/Users/Lenovo/Downloads/PALING FIX UTS MANTAP.xlsx"
data <- read_excel(input_file)

cat("Data berhasil dibaca dari:", input_file, "\n")

## Data berhasil dibaca dari: C:/Users/Lenovo/Downloads/PALING FIX UTS MANTAP.xlsx

# ==== 2. Olah data jika perlu ====
# Misal hanya menampilkan kolom tertentu, atau tetap utuh:
hasil_75 <- data %>% 
  mutate_if(is.character, trimws)  # membersihkan spasi ekstra di teks

# ==== 3. Simpan hasil baru ====
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_file <- file.path(output_dir, paste0("hasil_prediksi_75_model_", timestamp, ".csv"))

tryCatch({
  write.csv(hasil_75, output_file, row.names = FALSE)
  cat("File hasil terbaru disimpan di:\n", output_file, "\n")
}, error = function(e) {
  cat("Error saat menyimpan file:", e$message, "\n")
})

## File hasil terbaru disimpan di:
##  C:/Users/Lenovo/Downloads/pRADYTHA/STATLING - RABU/UTS/output_UTS/hasil_prediksi_75_model_20251015_183048.csv

# ==== 4. Cek isi file (5 baris pertama) ====
if (file.exists(output_file)) {
  cat("File tersimpan sukses! Menampilkan contoh isi:\n")
  print(head(read.csv(output_file), 5))
} else {
  cat("File tidak tersimpan. Periksa izin folder kerja!\n")
}

## File tersimpan sukses! Menampilkan contoh isi:
##        pH      DO          BOD    TSS   Suhu    Status_clean
## 1 2170789  789777       3.0913 471188 258608 Tercemar ringan
## 2 1437778  752887       3.0772 636502  27657 Tercemar ringan
## 3 1217142 1941386 1020156.0000 533784  32592 Tercemar ringan
## 4 2530554 2925349  942753.0000 571947 282945 Tercemar ringan
## 5 1339527 1488181       3.1364 518954 284291 Tercemar ringan
##   Prediksi_DecisionTree Prediksi_RandomForest    Prediksi_SVM
## 1       Tercemar ringan       Tercemar ringan Tercemar ringan
## 2       Tercemar ringan       Tercemar ringan Tercemar ringan
## 3       Tercemar ringan       Tercemar ringan Tercemar ringan
## 4       Tercemar ringan       Tercemar ringan Tercemar ringan
## 5       Tercemar ringan       Tercemar ringan Tercemar ringan

uts statling

pradytha galuh

2025-10-15

Soal 1: Data cleaning dan eksplorasi (cek missing value, outlier, dan statistik deskriptif).

Soal 2: Klasifikasi Status Kualitas Air

Persiapan Data

Soal 3: Prediksi Variabel DO (35%)

Persiapan Data

SIMPAN HASIL PREDIKSI 75 BARIS