library(readxl) # untuk membaca file Excel
library(dplyr) # untuk manipulasi data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # untuk visualisasi outlier
library(tidyr) # untuk cleaning data
library(stringr) # untuk perbaikan teks
# Read the water-quality workbook from its fixed local path.
data <- readxl::read_excel(
  path = "D:/SEMESTER 5/Statistika Lingkungan/kualitasair.xlsx"
)
# Show the column types together with a preview of the values.
str(object = data)
## tibble [300 × 7] (S3: tbl_df/tbl/data.frame)
## $ Lokasi: chr [1:300] "S1" "S2" "S3" "S4" ...
## $ pH : num [1:300] 7.69 6.72 7.18 7.32 7.2 ...
## $ DO : num [1:300] NA 5.72 4.89 6.13 7.79 ...
## $ BOD : num [1:300] 1.71 1.44 2.73 3.14 1.18 ...
## $ TSS : num [1:300] 43.1 44.3 NA 41 48.1 ...
## $ Suhu : num [1:300] 26.8 27.7 26 29.7 26.4 ...
## $ Status: chr [1:300] "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" "Tercemar ringan" ...
# Preview the first six rows.
head(data, n = 6L)
## # A tibble: 6 × 7
## Lokasi pH DO BOD TSS Suhu Status
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 S1 7.69 NA 1.71 43.1 26.8 Tercemar ringan
## 2 S2 6.72 5.72 1.44 44.3 27.7 Tercemar ringan
## 3 S3 7.18 4.89 2.73 NA 26.0 Tercemar ringan
## 4 S4 7.32 6.13 3.14 41.0 29.7 Tercemar ringan
## 5 S5 7.20 7.79 1.18 48.1 26.4 baik
## 6 S6 6.95 8.42 3.23 48.6 28.7 Tercemar ringan
# Count the missing values in every column.
data %>%
  is.na() %>%
  colSums()
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
# Encode the target as a factor; it is the outcome handed to
# caret::createDataPartition() below.
data_clean$Status <- as.factor(data_clean$Status)
# Keep the numeric predictors together with the target.
data_model <- data_clean %>%
  select(pH, DO, BOD, TSS, Suhu, Status)
# Split the data into training (74%) and testing (26%).
# The proportion is named once so that the comment, the call and the reported
# counts (224 / 76 of 300 rows) cannot drift apart; the previous comment
# announced an 80/20 split while the call used p = 0.74.
prop_train <- 0.74
set.seed(123) # fixed seed so the split is reproducible
train_index <- caret::createDataPartition(
  data_model$Status,
  p = prop_train,
  list = FALSE
)
data_train <- data_model[train_index, ]
data_test <- data_model[-train_index, ]
# Report the size of both partitions.
cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 224
cat("Jumlah data testing :", nrow(data_test), "\n")
## Jumlah data testing : 76
# Class distribution of Status within the training partition.
cat("\nDistribusi kategori Status di data training:\n")
##
## Distribusi kategori Status di data training:
print(table(data_train$Status))
##
## Baik Tercemar berat Tercemar ringan
## 54 6 164
library(caret)
library(splines)
# Fix the RNG seed so the regression split below is reproducible.
set.seed(123)
# Regression data set: DO is the response, the remaining four columns are the
# predictors.
data_regresi <- dplyr::select(data_clean, DO, pH, BOD, TSS, Suhu)
# Draw 70% of the rows for training; the remaining 30% are held out for
# testing.
train_index_reg <- createDataPartition(
  y = data_regresi$DO,
  p = 0.7,
  list = FALSE
)
train_reg <- data_regresi[train_index_reg, ]
test_reg <- data_regresi[-train_index_reg, ]
# Linear regression of DO on all four predictors, fitted on the training rows.
model_lin <- lm(
  formula = DO ~ pH + BOD + TSS + Suhu,
  data = train_reg
)
cat("\n=== Ringkasan Model Regresi Linear ===\n")
##
## === Ringkasan Model Regresi Linear ===
# Print the coefficient table and the fit statistics.
print(summary(model_lin))
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0704 -0.5149 0.0247 0.6615 2.9330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.539229 1.397901 4.678 5.23e-06 ***
## pH -0.002965 0.132247 -0.022 0.9821
## BOD 0.133174 0.080581 1.653 0.0999 .
## TSS -0.001816 0.006967 -0.261 0.7946
## Suhu -0.030783 0.031267 -0.985 0.3260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9638 on 207 degrees of freedom
## Multiple R-squared: 0.01696, Adjusted R-squared: -0.002032
## F-statistic: 0.893 on 4 and 207 DF, p-value: 0.469
# Predict DO for the held-out rows.
pred_lin <- predict(model_lin, newdata = test_reg)
# Spline counterpart of the linear model: every predictor enters through a
# bs() basis with df = 4, fitted on the same training rows.
model_spline <- lm(
  formula = DO ~ bs(pH, df = 4) + bs(BOD, df = 4) + bs(TSS, df = 4) +
    bs(Suhu, df = 4),
  data = train_reg
)
cat("\n=== Ringkasan Model Regresi Spline ===\n")
##
## === Ringkasan Model Regresi Spline ===
# Print the coefficient table and the fit statistics.
print(summary(model_spline))
##
## Call:
## lm(formula = DO ~ bs(pH, df = 4) + bs(BOD, df = 4) + bs(TSS,
## df = 4) + bs(Suhu, df = 4), data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.96156 -0.52759 0.04632 0.57958 2.51650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.18063 1.20073 5.147 6.42e-07 ***
## bs(pH, df = 4)1 0.17549 1.04351 0.168 0.866622
## bs(pH, df = 4)2 -0.06395 0.71760 -0.089 0.929078
## bs(pH, df = 4)3 -0.01388 0.93447 -0.015 0.988165
## bs(pH, df = 4)4 0.54859 0.88349 0.621 0.535370
## bs(BOD, df = 4)1 0.82266 1.00445 0.819 0.413779
## bs(BOD, df = 4)2 0.96688 0.72401 1.335 0.183290
## bs(BOD, df = 4)3 0.25895 0.98684 0.262 0.793284
## bs(BOD, df = 4)4 3.26670 0.94868 3.443 0.000703 ***
## bs(TSS, df = 4)1 -0.93623 0.86933 -1.077 0.282829
## bs(TSS, df = 4)2 -0.32103 0.60397 -0.532 0.595656
## bs(TSS, df = 4)3 -0.60276 0.81252 -0.742 0.459079
## bs(TSS, df = 4)4 -1.27845 0.79373 -1.611 0.108864
## bs(Suhu, df = 4)1 -0.35823 0.73107 -0.490 0.624685
## bs(Suhu, df = 4)2 -0.52369 0.58688 -0.892 0.373322
## bs(Suhu, df = 4)3 -0.58520 0.84596 -0.692 0.489908
## bs(Suhu, df = 4)4 -0.22732 0.92977 -0.244 0.807104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9562 on 195 degrees of freedom
## Multiple R-squared: 0.08865, Adjusted R-squared: 0.01388
## F-statistic: 1.186 on 16 and 195 DF, p-value: 0.2823
# Predict DO for the held-out rows with the spline model.
#
# The warnings recorded below show that some BOD and Suhu values in test_reg
# lie beyond the boundary knots of the fitted bs() bases, so predict() had to
# extrapolate the cubic basis for those rows; the evaluation recorded further
# down reports a test MSE of about 688 for this model against about 0.98 for
# the linear one. Each predictor is therefore clamped to the range observed in
# train_reg before predicting, which keeps every row inside the fitted basis.
# The fitted model itself is left untouched.
spline_predictors <- c("pH", "BOD", "TSS", "Suhu")
test_reg_clamped <- test_reg
for (predictor in spline_predictors) {
  train_range <- range(train_reg[[predictor]], na.rm = TRUE)
  test_reg_clamped[[predictor]] <- pmin(
    pmax(test_reg[[predictor]], train_range[1]),
    train_range[2]
  )
}
pred_spline <- predict(model_spline, newdata = test_reg_clamped)
# NOTE: the warnings below were emitted by the earlier, unclamped
# predict(model_spline, newdata = test_reg) call; they are not expected once
# the predictors are clamped.
## Warning in bs(BOD, degree = 3L, knots = 3.0394, Boundary.knots = c(0.6572, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(Suhu, degree = 3L, knots = 28.0965, Boundary.knots = c(23.2896, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
# Test-set evaluation of both models.
# mean_sq_error() is the mean squared difference between observed and
# predicted values; sq_correlation() is the squared correlation between them
# (reported as R_Squared below; it is not computed as 1 - SSE/SST).
mean_sq_error <- function(observed, predicted) {
  mean((observed - predicted)^2)
}
sq_correlation <- function(observed, predicted) {
  cor(observed, predicted)^2
}
do_observed <- test_reg$DO
mse_lin <- mean_sq_error(do_observed, pred_lin)
rsq_lin <- sq_correlation(do_observed, pred_lin)
mse_spline <- mean_sq_error(do_observed, pred_spline)
rsq_spline <- sq_correlation(do_observed, pred_spline)
# One row per model.
hasil_eval <- data.frame(
  Model = c("Regresi Linear", "Regresi Spline"),
  MSE = c(mse_lin, mse_spline),
  R_Squared = c(rsq_lin, rsq_spline)
)
cat("\n=== Evaluasi Model ===\n")
##
## === Evaluasi Model ===
print(hasil_eval)
## Model MSE R_Squared
## 1 Regresi Linear 0.9809779 0.0011284702
## 2 Regresi Spline 688.1798973 0.0009847129
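## Hasil evaluasi pada data testing menunjukkan bahwa model Regresi Spline TIDAK memiliki performa lebih baik dibanding Regresi Linear: MSE Spline (688,18) jauh lebih besar daripada MSE Linear (0,98), dan nilai R² Spline (0,00098) sedikit lebih rendah daripada R² Linear (0,00113). Kedua model hampir tidak menjelaskan variasi DO (R² mendekati nol), sehingga hasil ini tidak memberikan bukti bahwa hubungan antara DO dan variabel pH, BOD, TSS, serta Suhu bersifat non-linear. Error Spline yang sangat besar kemungkinan besar disebabkan oleh ekstrapolasi: sesuai peringatan bs(), sebagian nilai BOD dan Suhu pada data testing berada di luar boundary knots data training, sehingga prediksi pada titik-titik tersebut tidak stabil. Berdasarkan hasil ini, Regresi Linear lebih layak dipilih untuk memodelkan data kualitas air ini.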
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
# Test-set predictions for DO: reuse the ones made earlier in the script and
# only compute them here when they are not available yet. The unconditional
# re-prediction that used to sit here merely repeated the boundary-knot
# warnings recorded below.
if (!exists("pred_lin")) {
  pred_lin <- predict(model_lin, newdata = test_reg)
}
if (!exists("pred_spline")) {
  pred_spline <- predict(model_spline, newdata = test_reg)
}
## Warning in bs(BOD, degree = 3L, knots = 3.0394, Boundary.knots = c(0.6572, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(Suhu, degree = 3L, knots = 28.0965, Boundary.knots = c(23.2896, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
# Performance of the linear regression on the test rows. MSE and RMSE come
# from the Metrics package attached above instead of being hand-rolled again;
# R_Squared is the squared correlation between observed and predicted DO.
mse_lin <- Metrics::mse(actual = test_reg$DO, predicted = pred_lin)
rmse_lin <- Metrics::rmse(actual = test_reg$DO, predicted = pred_lin)
r2_lin <- cor(test_reg$DO, pred_lin)^2
# Performance of the spline regression on the test rows.
mse_spline <- Metrics::mse(actual = test_reg$DO, predicted = pred_spline)
rmse_spline <- Metrics::rmse(actual = test_reg$DO, predicted = pred_spline)
r2_spline <- cor(test_reg$DO, pred_spline)^2
# Comparison table, one row per model.
evaluasi <- data.frame(
  Model = c("Regresi Linear", "Regresi Spline"),
  R_Squared = c(r2_lin, r2_spline),
  MSE = c(mse_lin, mse_spline),
  RMSE = c(rmse_lin, rmse_spline)
)
cat("\n=== Evaluasi Performa Model ===\n")
##
## === Evaluasi Performa Model ===
print(evaluasi)
## Model R_Squared MSE RMSE
## 1 Regresi Linear 0.0011284702 0.9809779 0.9904433
## 2 Regresi Spline 0.0009847129 688.1798973 26.2331831