Instal & panggil library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(rpart)
library(rpart.plot)
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(splines)

Import Data

library(readxl)

# Both sheets come from the same workbook, so the path is stated once.
data_path <- "C:/Users/ASUS/Downloads/kualitasair.xlsx"

# "Training" and "Testing" are read into separate tibbles.
train <- read_excel(path = data_path, sheet = "Training")
test <- read_excel(path = data_path, sheet = "Testing")

Cek jumlah baris Data

nrow(train) 
## [1] 300
nrow(test) 
## [1] 75

Soal 1: Data Cleaning dan Eksplorasi

a.Identifikasi dan tangani missing value, outlier, dan inkonsistensi kategori

Cek Missing Value

colSums(is.na(train))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0     23     22     24      0      0

Imputasi missing value dengan median

# Median-impute every numeric measurement column in a single pass.
# Columns that contain no NA come back unchanged.
train[c("pH", "DO", "BOD", "TSS", "Suhu")] <- lapply(
  train[c("pH", "DO", "BOD", "TSS", "Suhu")],
  function(values) {
    replace(values, is.na(values), median(values, na.rm = TRUE))
  }
)

# Re-check that no NA remains in any column
colSums(is.na(train))
## Lokasi     pH     DO    BOD    TSS   Suhu Status 
##      0      0      0      0      0      0      0

Deteksi Outlier

### Deteksi Outlier

# Handle outliers with the 1.5 * IQR rule (optional step): values that fall
# outside the fences are pulled back to the nearest fence.
for (col in c("pH", "DO", "BOD", "TSS", "Suhu")) {
  quartiles <- quantile(
    train[[col]],
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  fence_width <- 1.5 * (quartiles[2] - quartiles[1])

  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width

  # Clamp to [lower, upper]; in-range values are left as they are
  train[[col]] <- pmin(pmax(train[[col]], lower), upper)
}

# Descriptive statistics after cleaning
summary(train[, c("pH", "DO", "BOD", "TSS", "Suhu")])
##        pH              DO             BOD              TSS       
##  Min.   :5.697   Min.   :3.615   Min.   :0.8513   Min.   :27.28  
##  1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.4599   1st Qu.:44.28  
##  Median :6.988   Median :5.991   Median :3.0661   Median :49.52  
##  Mean   :6.990   Mean   :5.977   Mean   :3.0041   Mean   :49.68  
##  3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.5323   3rd Qu.:55.62  
##  Max.   :8.290   Max.   :8.409   Max.   :5.1409   Max.   :72.62  
##       Suhu      
##  Min.   :22.77  
##  1st Qu.:26.62  
##  Median :28.01  
##  Mean   :28.12  
##  3rd Qu.:29.46  
##  Max.   :33.73
# Cek jumlah dan posisi outlier (setelah penanganan)
boxplot.stats(train$pH)$out
## numeric(0)
boxplot.stats(train$DO)$out
## numeric(0)
boxplot.stats(train$BOD)$out
## numeric(0)
boxplot.stats(train$TSS)$out
## numeric(0)
boxplot.stats(train$Suhu)$out
## numeric(0)

b.Lakukan standarisasi penulisan kategori Status.

Inkonsistensi Kategori

# Standardise the Status labels: strip surrounding whitespace, convert to
# lower case, then store the cleaned labels as a factor.
status_clean <- tolower(trimws(train$Status))
train$Status <- factor(status_clean)

# Inspect the distinct categories that remain
unique(train$Status)
## [1] tercemar ringan baik            tercemar berat 
## Levels: baik tercemar berat tercemar ringan
table(train$Status)
## 
##            baik  tercemar berat tercemar ringan 
##              72               7             221

Keterangan Setelah dilakukan standarisasi didapatkan 3 kategori pada variabel Status: Baik (72 data) → kualitas air sesuai baku mutu. Tercemar ringan (221 data) → air sedikit melebihi ambang batas. Tercemar berat (7 data) → air mengalami pencemaran signifikan.

c.Tampilkan ringkasan statistik deskriptif setelah pembersihan.

# Ringkasan statistik
summary(train)
##     Lokasi                pH              DO             BOD        
##  Length:300         Min.   :5.697   Min.   :3.615   Min.   :0.8513  
##  Class :character   1st Qu.:6.670   1st Qu.:5.413   1st Qu.:2.4599  
##  Mode  :character   Median :6.988   Median :5.991   Median :3.0661  
##                     Mean   :6.990   Mean   :5.977   Mean   :3.0041  
##                     3rd Qu.:7.318   3rd Qu.:6.611   3rd Qu.:3.5323  
##                     Max.   :8.290   Max.   :8.409   Max.   :5.1409  
##       TSS             Suhu                   Status   
##  Min.   :27.28   Min.   :22.77   baik           : 72  
##  1st Qu.:44.28   1st Qu.:26.62   tercemar berat :  7  
##  Median :49.52   Median :28.01   tercemar ringan:221  
##  Mean   :49.68   Mean   :28.12                        
##  3rd Qu.:55.62   3rd Qu.:29.46                        
##  Max.   :72.62   Max.   :33.73
# Distribution plot: histogram of pH with bars filled by Status
ggplot(data = train, mapping = aes(x = pH, fill = Status)) +
  geom_histogram(alpha = 0.7, bins = 20) +
  labs(title = "Distribusi pH berdasarkan Status") +
  theme_minimal()

# Scatterplot matrix of the numeric variables, used to eyeball their
# pairwise relationships
pairs(
  train[, c("pH", "DO", "BOD", "TSS", "Suhu")],
  main = "Korelasi antar variabel numerik"
)

Keterangan a. Nilai pH berkisar antara 5,7 hingga 8,3 → masih dalam rentang wajar air alami. b. Nilai DO (oksigen terlarut) rata-rata 6 mg/L → cukup baik untuk ekosistem sungai. c. Nilai BOD dan TSS bervariasi antar lokasi, mencerminkan perbedaan tingkat pencemaran. d. Suhu air relatif stabil (rata-rata 28°C). e. Sebagian besar lokasi (221 dari 300) masuk kategori tercemar ringan, menandakan pencemaran ringan dominan di area studi.

Soal 2: Klasifikasi Status Kualitas Air

a.Gunakan variabel numerik (pH, DO, BOD, TSS, Suhu) untuk mengklasifikasikan Status

# Fix the RNG seed so the random steps that follow are repeatable
set.seed(123)

# Re-apply the label clean-up and make sure Status is stored as a factor
train$Status <- as.factor(tolower(trimws(train$Status)))

# Numeric predictors used by the classifiers
features <- c("pH", "DO", "BOD", "TSS", "Suhu")

b.Bagi data menjadi training dan testing

# Split the cleaned data into training (80%) and hold-out (20%) rows.
# The partition is built from the Status column.
index <- createDataPartition(y = train$Status, p = 0.8, list = FALSE)
data_train <- train[index, ]
data_test <- train[-index, ]

nrow(data_train)
## [1] 241
nrow(data_test)
## [1] 59
# Make sure the target is a factor in both subsets; the hold-out factor is
# built with the training levels so both share one level set.
data_train$Status <- as.factor(data_train$Status)
data_test$Status  <- factor(data_test$Status, levels = levels(data_train$Status))

Tangani NA jika menggunakan imputasi median

# Impute any NA left in the predictors of either subset. Both subsets are
# filled with the median of the TRAINING rows, so no statistic computed from
# the hold-out rows is used.
for (f in features) {
  if (any(is.na(data_train[[f]]))) {
    data_train[[f]][is.na(data_train[[f]])] <- median(data_train[[f]], na.rm = TRUE)
  }
  if (any(is.na(data_test[[f]]))) {
    data_test[[f]][is.na(data_test[[f]])] <- median(data_train[[f]], na.rm = TRUE)
  }
}

# Rebuild the hold-out target with the training factor levels.
# (This line had been mangled into `data_test\(Status ... data_test\)Status`,
# which is not valid R; the `$` accessors are restored here.)
data_test$Status <- factor(data_test$Status, levels = levels(data_train$Status))

c. Bangun model klasifikasi dengan SVM, Decision Tree, dan Random Forest.

SVM

# Standardise the predictors: centering and scaling parameters are learned
# from the training rows only and then applied to both subsets.
x_train_raw <- data_train[, features]
x_test_raw <- data_test[, features]
preProc <- preProcess(x_train_raw, method = c("center", "scale"))
train_svm <- predict(preProc, newdata = x_train_raw)
test_svm <- predict(preProc, newdata = x_test_raw)

# Fit the SVM classifier with a radial kernel
model_svm <- svm(x = train_svm, y = data_train$Status, kernel = "radial")

# Predict the hold-out rows
pred_svm <- predict(model_svm, newdata = test_svm)

# Evaluate predictions against the true labels
conf_svm <- confusionMatrix(data = pred_svm, reference = data_test$Status)
print(conf_svm)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        baik tercemar berat tercemar ringan
##   baik               8              0               2
##   tercemar berat     0              0               0
##   tercemar ringan    6              1              42
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8475          
##                  95% CI : (0.7301, 0.9278)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 0.04475         
##                                           
##                   Kappa : 0.5519          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity               0.5714               0.00000                 0.9545
## Specificity               0.9556               1.00000                 0.5333
## Pos Pred Value            0.8000                   NaN                 0.8571
## Neg Pred Value            0.8776               0.98305                 0.8000
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.1356               0.00000                 0.7119
## Detection Prevalence      0.1695               0.00000                 0.8305
## Balanced Accuracy         0.7635               0.50000                 0.7439

Decision Tree

# Fit a classification tree (method = "class") for Status on the five
# numeric predictors, using the training rows only.
model_tree <- rpart(Status ~ pH + DO + BOD + TSS + Suhu,
                    data = data_train, method = "class")
# Draw the fitted tree
rpart.plot(model_tree, main = "Decision Tree Kualitas Air")

# Predict class labels for the hold-out rows and compare them with the
# true Status values
pred_tree <- predict(model_tree, data_test, type = "class")
conf_tree <- confusionMatrix(pred_tree, data_test$Status)
print(conf_tree)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        baik tercemar berat tercemar ringan
##   baik              13              0               0
##   tercemar berat     0              0               0
##   tercemar ringan    1              1              44
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9661          
##                  95% CI : (0.8829, 0.9959)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 6.696e-06       
##                                           
##                   Kappa : 0.9075          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity               0.9286               0.00000                 1.0000
## Specificity               1.0000               1.00000                 0.8667
## Pos Pred Value            1.0000                   NaN                 0.9565
## Neg Pred Value            0.9783               0.98305                 1.0000
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.2203               0.00000                 0.7458
## Detection Prevalence      0.2203               0.00000                 0.7797
## Balanced Accuracy         0.9643               0.50000                 0.9333

Random Forest

# Fit a random forest of 200 trees for Status on the five numeric
# predictors, using the training rows only. importance = TRUE is requested
# here, but the importance values are not displayed in this section.
model_rf <- randomForest(Status ~ pH + DO + BOD + TSS + Suhu,
                         data = data_train, ntree = 200, importance = TRUE)
# Predict the hold-out rows and compare them with the true Status values
pred_rf <- predict(model_rf, data_test)
conf_rf <- confusionMatrix(pred_rf, data_test$Status)
print(conf_rf)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        baik tercemar berat tercemar ringan
##   baik              13              0               1
##   tercemar berat     0              0               0
##   tercemar ringan    1              1              43
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9492          
##                  95% CI : (0.8585, 0.9894)
##     No Information Rate : 0.7458          
##     P-Value [Acc > NIR] : 4.59e-05        
##                                           
##                   Kappa : 0.8644          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: baik Class: tercemar berat Class: tercemar ringan
## Sensitivity               0.9286               0.00000                 0.9773
## Specificity               0.9778               1.00000                 0.8667
## Pos Pred Value            0.9286                   NaN                 0.9556
## Neg Pred Value            0.9778               0.98305                 0.9286
## Prevalence                0.2373               0.01695                 0.7458
## Detection Rate            0.2203               0.00000                 0.7288
## Detection Prevalence      0.2373               0.00000                 0.7627
## Balanced Accuracy         0.9532               0.50000                 0.9220

d. Evaluasi hasil dengan confusion matrix dan interpretasi akurasi.

# Evaluasi SVM
cm_svm <- table(Prediksi = pred_svm, Aktual = data_test$Status)
cat("CONFUSION MATRIX SVM")
## CONFUSION MATRIX SVM
print(cm_svm)
##                  Aktual
## Prediksi          baik tercemar berat tercemar ringan
##   baik               8              0               2
##   tercemar berat     0              0               0
##   tercemar ringan    6              1              42
# Evaluasi Decision Tree
cm_tree <- table(Prediksi = pred_tree, Aktual = data_test$Status)
cat("CONFUSION MATRIX DECISION TREE")
## CONFUSION MATRIX DECISION TREE
print(cm_tree)
##                  Aktual
## Prediksi          baik tercemar berat tercemar ringan
##   baik              13              0               0
##   tercemar berat     0              0               0
##   tercemar ringan    1              1              44
# Evaluasi Random Forest
cm_rf <- table(Prediksi = pred_rf, Aktual = data_test$Status)
cat("CONFUSION MATRIX RANDOM FOREST")
## CONFUSION MATRIX RANDOM FOREST
print(cm_rf)
##                  Aktual
## Prediksi          baik tercemar berat tercemar ringan
##   baik              13              0               1
##   tercemar berat     0              0               0
##   tercemar ringan    1              1              43
# Overall accuracy = correctly classified cases (the diagonal of the
# confusion table) divided by all cases.
overall_accuracy <- function(cm) {
  sum(diag(cm)) / sum(cm)
}

accuracy_svm <- overall_accuracy(cm_svm)
accuracy_tree <- overall_accuracy(cm_tree)
accuracy_rf <- overall_accuracy(cm_rf)

# One row per model, in the order the models were fitted
accuracy_results <- data.frame(
  Model = c("SVM", "Decision Tree", "Random Forest"),
  Accuracy = c(accuracy_svm, accuracy_tree, accuracy_rf)
)

cat("AKURASI SETIAP MODEL")
## AKURASI SETIAP MODEL
print(accuracy_results)
##           Model  Accuracy
## 1           SVM 0.8474576
## 2 Decision Tree 0.9661017
## 3 Random Forest 0.9491525
# Pick the model whose accuracy is highest (first one in case of a tie)
best_model <- with(accuracy_results, Model[which.max(Accuracy)])
cat("\nModel dengan akurasi tertinggi adalah:", best_model, "\n")
## 
## Model dengan akurasi tertinggi adalah: Decision Tree

Soal 3: Prediksi Variabel DO

a.Gunakan Regresi Linear dan Regresi Spline untuk memprediksi nilai DO berdasarkan pH, BOD, TSS, dan Suhu.

# Columns needed for the regression task: four predictors plus the DO target
features <- c("pH", "BOD", "TSS", "Suhu", "DO")

# Median-impute any NA still present in those columns
for (f in features) {
  missing_rows <- is.na(train[[f]])
  if (any(missing_rows)) {
    train[[f]][missing_rows] <- median(train[[f]], na.rm = TRUE)
  }
}

# Random split into training (80%) and hold-out (20%) rows
index <- sample(seq_len(nrow(train)), 0.8 * nrow(train))
data_train <- train[index, ]
data_test <- train[-index, ]

Model Regresi Linear

# Ordinary least-squares fit of DO on pH, BOD, TSS and Suhu, using the
# training rows; summary() prints the coefficient table and fit statistics.
model_lm <- lm(DO ~ pH + BOD + TSS + Suhu, data = data_train)
summary(model_lm)
## 
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35543 -0.54022 -0.00989  0.63204  2.37154 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.656990   1.306258   5.096 7.12e-07 ***
## pH          -0.053099   0.121339  -0.438   0.6621    
## BOD          0.130644   0.075076   1.740   0.0831 .  
## TSS         -0.003110   0.006539  -0.476   0.6348    
## Suhu        -0.018183   0.030199  -0.602   0.5477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9277 on 235 degrees of freedom
## Multiple R-squared:  0.01603,    Adjusted R-squared:  -0.0007215 
## F-statistic: 0.9569 on 4 and 235 DF,  p-value: 0.4319
# Predict DO for the hold-out rows
pred_lm <- predict(model_lm, newdata = data_test)

# Performance metrics.
# NOTE(review): R2_lm is the R-squared reported by summary(model_lm), i.e. it
# is computed on data_train, whereas MSE and RMSE are computed on data_test.
R2_lm <- summary(model_lm)[["r.squared"]]
MSE_lm <- mse(actual = data_test$DO, predicted = pred_lm)
RMSE_lm <- rmse(actual = data_test$DO, predicted = pred_lm)

cat("Hasil REGRESI LINEAR")
## Hasil REGRESI LINEAR
cat("R²   :", round(R2_lm, 3), "\n")
## R²   : 0.016
cat("MSE  :", round(MSE_lm, 3), "\n")
## MSE  : 1.08
cat("RMSE :", round(RMSE_lm, 3), "\n")
## RMSE : 1.039

Model Regresi Spline

# Additive spline regression: each predictor enters through a natural cubic
# spline basis (ns) with df = 3, fitted by lm() on the training rows.
model_spline <- lm(DO ~ ns(pH, df = 3) + ns(BOD, df = 3) + ns(TSS, df = 3) + ns(Suhu, df = 3),
                   data = data_train)
summary(model_spline)
## 
## Call:
## lm(formula = DO ~ ns(pH, df = 3) + ns(BOD, df = 3) + ns(TSS, 
##     df = 3) + ns(Suhu, df = 3), data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4106 -0.5099  0.0305  0.5783  2.4649 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.44612    0.68016   8.007 6.07e-14 ***
## ns(pH, df = 3)1   -0.19281    0.27434  -0.703   0.4829    
## ns(pH, df = 3)2    0.66236    0.85851   0.772   0.4412    
## ns(pH, df = 3)3    0.20075    0.39668   0.506   0.6133    
## ns(BOD, df = 3)1  -0.03182    0.25992  -0.122   0.9027    
## ns(BOD, df = 3)2   1.49449    0.78375   1.907   0.0578 .  
## ns(BOD, df = 3)3   0.97050    0.38571   2.516   0.0126 *  
## ns(TSS, df = 3)1  -0.09290    0.26453  -0.351   0.7258    
## ns(TSS, df = 3)2   0.41987    0.67953   0.618   0.5373    
## ns(TSS, df = 3)3  -0.05698    0.33290  -0.171   0.8642    
## ns(Suhu, df = 3)1 -0.33735    0.27332  -1.234   0.2184    
## ns(Suhu, df = 3)2 -0.60811    0.79639  -0.764   0.4459    
## ns(Suhu, df = 3)3  0.01191    0.42098   0.028   0.9775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9297 on 227 degrees of freedom
## Multiple R-squared:  0.04546,    Adjusted R-squared:  -0.004999 
## F-statistic: 0.9009 on 12 and 227 DF,  p-value: 0.5468
# Predict DO for the hold-out rows
pred_spline <- predict(model_spline, newdata = data_test)

# Performance metrics.
# NOTE(review): R2_spline is the R-squared reported by summary(model_spline),
# i.e. it is computed on data_train, whereas MSE and RMSE are computed on
# data_test.
R2_spline <- summary(model_spline)[["r.squared"]]
MSE_spline <- mse(actual = data_test$DO, predicted = pred_spline)
RMSE_spline <- rmse(actual = data_test$DO, predicted = pred_spline)

cat("HASIL REGRESI SPLINE")
## HASIL REGRESI SPLINE
cat("R²   :", round(R2_spline, 3), "\n")
## R²   : 0.045
cat("MSE  :", round(MSE_spline, 3), "\n")
## MSE  : 1.093
cat("RMSE :", round(RMSE_spline, 3), "\n")
## RMSE : 1.045

b.Evaluasi performa model (R²/MSE/RMSE)

# Side-by-side comparison of the two regression models, one row per model
eval_df <- rbind(
  data.frame(
    Model = "Regresi Linear",
    R2 = R2_lm,
    MSE = MSE_lm,
    RMSE = RMSE_lm
  ),
  data.frame(
    Model = "Regresi Spline",
    R2 = R2_spline,
    MSE = MSE_spline,
    RMSE = RMSE_spline
  )
)

cat("PERBANDINGAN KINERJA MODEL")
## PERBANDINGAN KINERJA MODEL
print(eval_df)
##            Model         R2      MSE     RMSE
## 1 Regresi Linear 0.01602696 1.080184 1.039319
## 2 Regresi Spline 0.04546124 1.092923 1.045430

c.Visualisasikan hasil prediksi vs aktual.

Visualisasi Prediksi VS Aktual

# Observed DO of the hold-out rows next to both models' predictions
plot_data <- data.frame(
  DO_Aktual = data_test$DO,
  Prediksi_Linear = pred_lm,
  Prediksi_Spline = pred_spline
)

# Layers shared by both plots: the dashed y = x reference line (points on it
# are predicted exactly), the theme, and the axis labels.
reference_layers <- list(
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed"),
  theme_minimal(),
  xlab("DO Aktual"),
  ylab("DO Prediksi")
)

# Linear regression: predicted vs actual
ggplot(plot_data, aes(x = DO_Aktual, y = Prediksi_Linear)) +
  geom_point(color = "blue", alpha = 0.6, size = 3) +
  reference_layers +
  ggtitle("Prediksi vs Aktual (Regresi Linear)")

# Spline regression: predicted vs actual
ggplot(plot_data, aes(x = DO_Aktual, y = Prediksi_Spline)) +
  geom_point(color = "green", alpha = 0.6, size = 3) +
  reference_layers +
  ggtitle("Prediksi vs Aktual (Regresi Spline)")

d.Jelaskan variabel yang paling memengaruhi DO

summary(model_lm)
## 
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35543 -0.54022 -0.00989  0.63204  2.37154 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.656990   1.306258   5.096 7.12e-07 ***
## pH          -0.053099   0.121339  -0.438   0.6621    
## BOD          0.130644   0.075076   1.740   0.0831 .  
## TSS         -0.003110   0.006539  -0.476   0.6348    
## Suhu        -0.018183   0.030199  -0.602   0.5477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9277 on 235 degrees of freedom
## Multiple R-squared:  0.01603,    Adjusted R-squared:  -0.0007215 
## F-statistic: 0.9569 on 4 and 235 DF,  p-value: 0.4319

Keterangan Berdasarkan hasil output regresi linear pada gambar, diketahui bahwa tidak ada variabel independen yang berpengaruh signifikan terhadap variabel DO, karena semua nilai p-value (Pr(>|t|)) lebih besar dari 0,05. Namun, jika dilihat dari nilai t-value, variabel BOD memiliki nilai t paling besar (1.740) dibandingkan variabel lainnya, sehingga dapat dikatakan bahwa BOD adalah variabel yang paling berpengaruh terhadap DO, meskipun pengaruhnya belum signifikan secara statistik. Hal ini menunjukkan bahwa perubahan nilai BOD cenderung memiliki hubungan paling kuat terhadap perubahan kadar DO dibandingkan variabel lain seperti pH, TSS, maupun Suhu.