req <- c("readxl","dplyr","tidyr","ggplot2","caret","e1071","rpart","rpart.plot",
"randomForest","mgcv","gridExtra","scales")
inst <- req[!req %in% installed.packages()[,"Package"]]
if(length(inst)) install.packages(inst, dependencies=TRUE)
lapply(req, library, character.only=TRUE)
set.seed(123)
data_raw <- readxl::read_excel("kualitasair.xlsx") %>% as.data.frame()
cat("Ukuran data awal:", dim(data_raw), "\n")
## Ukuran data awal: 300 7
head(data_raw)
## Lokasi pH DO BOD TSS Suhu Status
## 1 S1 7.6855 NA 1.7136 43.1415 26.7972 Tercemar ringan
## 2 S2 6.7177 5.7236 1.4402 44.2963 27.7284 Tercemar ringan
## 3 S3 7.1816 4.8906 2.7274 NA 26.0255 Tercemar ringan
## 4 S4 7.3164 6.1339 3.1398 41.0104 29.6639 Tercemar ringan
## 5 S5 7.2021 7.7853 1.1778 48.0967 26.4099 baik
## 6 S6 6.9469 8.4222 3.2324 48.5610 28.6809 Tercemar ringan
# 1.1 Standardize column names (tolerant to naming variations)
names(data_raw) <- make.names(names(data_raw))
# Ensure expected columns exist or stop with an informative message
expected <- c("Lokasi","pH","DO","BOD","TSS","Suhu","Status")
missing_cols <- setdiff(expected, names(data_raw))
if(length(missing_cols) > 0){
  stop("Data is missing columns: ", paste(missing_cols, collapse=", "))
}
# 1.2 Inspect missing value counts
cat("\n-- Missing values (pre-clean) --\n")
##
## -- Missing values (pre-clean) --
print(sapply(data_raw[expected], function(x) sum(is.na(x))))
## Lokasi pH DO BOD TSS Suhu Status
## 0 0 23 22 24 0 0
# 1.3 Standardize Status spelling (blank/numeric/text variants)
data <- data_raw
# Trim and lowercase for mapping
data$Status <- as.character(data$Status)
data$Status[trimws(data$Status)==""] <- NA
st <- tolower(trimws(data$Status))
st[st %in% c("1","baik","baik=1")] <- "Baik"
st[st %in% c("2","tercemar ringan","tercemar_ringan","tercemar ringan=2")] <- "Tercemar ringan"
st[st %in% c("3","tercemar berat","tercemar_berat","tercemar berat=3")] <- "Tercemar berat"
# If Status is still NA and DO exists, it could be inferred from DO (see the optional sketch below).
# For now keep NA (unlabeled rows are dropped for classification; labels can be created for regression if needed).
data$Status <- st   # unmatched/blank entries remain NA
data$Status <- factor(data$Status, levels = c("Baik","Tercemar ringan","Tercemar berat"))
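# Optional sketch (not run): auto-label Status from DO for unlabeled rows.
# The DO cutoffs below are illustrative assumptions, not taken from the data:
# auto_lab <- cut(data$DO, breaks = c(-Inf, 4, 6, Inf),
#                 labels = c("Tercemar berat", "Tercemar ringan", "Baik"))
# data$Status[is.na(data$Status)] <- auto_lab[is.na(data$Status)]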
# 1.4 Impute missing numeric values with the median (defensive)
num_vars <- c("pH","DO","BOD","TSS","Suhu")
for(v in num_vars){
  if(!is.numeric(data[[v]])) data[[v]] <- as.numeric(data[[v]])
  med <- median(data[[v]], na.rm=TRUE)
  data[[v]][is.na(data[[v]])] <- med
}
# 1.5 Outlier detection (IQR) and winsorize to 1%/99% quantiles
winsorize <- function(x, probs=c(0.01,0.99)){
  q <- quantile(x, probs=probs, na.rm=TRUE)
  pmax(pmin(x, q[2]), q[1])
}
for(v in num_vars){
  # report the outlier count on the raw data
  x <- data_raw[[v]]
  Q1 <- quantile(x, 0.25, na.rm=TRUE); Q3 <- quantile(x, 0.75, na.rm=TRUE); IQRv <- Q3 - Q1
  lower <- Q1 - 1.5*IQRv; upper <- Q3 + 1.5*IQRv
  out_idx <- which(x < lower | x > upper)
  cat(sprintf("Variable %s: outliers (IQR) = %d\n", v, length(out_idx)))
  # apply winsorizing to the cleaned data
  data[[v]] <- winsorize(data[[v]])
}
## Variable pH: outliers (IQR) = 4
## Variable DO: outliers (IQR) = 2
## Variable BOD: outliers (IQR) = 3
## Variable TSS: outliers (IQR) = 3
## Variable Suhu: outliers (IQR) = 2
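# Sanity check (sketch, not run): winsorizing should pull the cleaned maxima
# in to the 99% quantile of each variable.
# rbind(raw_max   = sapply(data_raw[num_vars], max, na.rm = TRUE),
#       clean_max = sapply(data[num_vars], max))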
# 1.6 Drop rows with NA Status for classification (they can still be kept for regression if needed)
cat("\nNA Status count before filtering:", sum(is.na(data$Status)), "\n")
##
## NA Status count before filtering: 0
data_class <- data %>% filter(!is.na(Status)) # for classification
cat("Classification data size (after dropping NA Status):", nrow(data_class), "\n")
## Classification data size (after dropping NA Status): 300
# 1.7 Descriptive statistics after cleaning
desc_stats <- data %>%
  summarise(across(all_of(num_vars),
                   list(mean = ~mean(.), sd = ~sd(.), median = ~median(.),
                        min = ~min(.), max = ~max(.))))
cat("\n-- Ringkasan statistik deskriptif (setelah pembersihan) --\n")
##
## -- Ringkasan statistik deskriptif (setelah pembersihan) --
print(desc_stats)
## pH_mean pH_sd pH_median pH_min pH_max DO_mean DO_sd DO_median
## 1 6.989464 0.4857012 6.98805 5.77872 8.106373 5.977196 0.9387891 5.9909
## DO_min DO_max BOD_mean BOD_sd BOD_median BOD_min BOD_max TSS_mean
## 1 3.829614 8.223684 3.004537 0.7925544 3.0661 0.956324 5.09963 49.68304
## TSS_sd TSS_median TSS_min TSS_max Suhu_mean Suhu_sd Suhu_median Suhu_min
## 1 9.11172 49.52205 27.70734 72.22947 28.12117 2.046895 28.01485 23.52489
## Suhu_max
## 1 33.247
cat("\nDistribusi Status (setelah cleaning):\n")
##
## Distribusi Status (setelah cleaning):
print(table(data_class$Status))
##
## Baik Tercemar ringan Tercemar berat
## 72 221 7
# Save simple descriptive table
write.csv(data.frame(stat = names(desc_stats), t(desc_stats)), "descriptive_stats_after_cleaning.csv", row.names=FALSE)
# 2.1 Ensure there are enough rows for a fixed test set of 75
if(nrow(data_class) < 80){
  stop("Fewer than 80 labeled rows remain after cleaning; not enough for the split. Stopping.")
}
# 2.2 Fixed split: test set = 75 rows
set.seed(123)
test_idx <- sample(seq_len(nrow(data_class)), size = 75)
test <- data_class[test_idx, ]
train <- data_class[-test_idx, ]
cat("\nUkuran train:", nrow(train), "Ukuran test:", nrow(test), "\n")
##
## Ukuran train: 225 Ukuran test: 75
# 2.3 Prepare formula and trainControl
features <- c("pH","DO","BOD","TSS","Suhu")
formula_clf <- as.formula(paste("Status ~", paste(features, collapse = " + ")))
trctrl <- trainControl(method="cv", number=5, classProbs = TRUE)
# --- Fix: make factor levels syntactically valid for caret ---
# classProbs = TRUE requires class levels that are valid R names (no spaces)
orig_levels <- levels(train$Status) # keep the original levels
orig_levels
## [1] "Baik" "Tercemar ringan" "Tercemar berat"
safe_levels <- make.names(orig_levels) # valid versions (space -> .)
safe_levels
## [1] "Baik" "Tercemar.ringan" "Tercemar.berat"
# Apply to train & test
levels(train$Status) <- safe_levels
levels(test$Status) <- safe_levels
# Re-check
levels(train$Status)
## [1] "Baik" "Tercemar.ringan" "Tercemar.berat"
levels(test$Status)
## [1] "Baik" "Tercemar.ringan" "Tercemar.berat"
# 2.4 SVM (classification)
svm_mod <- train(formula_clf, data = train, method = "svmRadial",
preProcess = c("center","scale"), trControl = trctrl, tuneLength = 5)
pred_svm <- predict(svm_mod, newdata = test)
cm_svm <- confusionMatrix(pred_svm, test$Status)
cat("\n--- SVM (classification) ---\n")
##
## --- SVM (classification) ---
print(cm_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar.ringan Tercemar.berat
## Baik 17 3 0
## Tercemar.ringan 4 50 1
## Tercemar.berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8933
## 95% CI : (0.8006, 0.9528)
## No Information Rate : 0.7067
## P-Value [Acc > NIR] : 1e-04
##
## Kappa : 0.738
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity 0.8095 0.9434 0.00000
## Specificity 0.9444 0.7727 1.00000
## Pos Pred Value 0.8500 0.9091 NaN
## Neg Pred Value 0.9273 0.8500 0.98667
## Prevalence 0.2800 0.7067 0.01333
## Detection Rate 0.2267 0.6667 0.00000
## Detection Prevalence 0.2667 0.7333 0.00000
## Balanced Accuracy 0.8770 0.8581 0.50000
# 2.5 Decision Tree (rpart)
tree_mod <- rpart::rpart(formula_clf, data = train, method = "class", control = rpart.control(cp=0.01))
pred_tree <- predict(tree_mod, newdata = test, type="class")
cm_tree <- confusionMatrix(pred_tree, test$Status)
cat("\n--- Decision Tree ---\n")
##
## --- Decision Tree ---
print(cm_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar.ringan Tercemar.berat
## Baik 20 2 0
## Tercemar.ringan 1 51 1
## Tercemar.berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9467
## 95% CI : (0.869, 0.9853)
## No Information Rate : 0.7067
## P-Value [Acc > NIR] : 2.034e-07
##
## Kappa : 0.8726
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity 0.9524 0.9623 0.00000
## Specificity 0.9630 0.9091 1.00000
## Pos Pred Value 0.9091 0.9623 NaN
## Neg Pred Value 0.9811 0.9091 0.98667
## Prevalence 0.2800 0.7067 0.01333
## Detection Rate 0.2667 0.6800 0.00000
## Detection Prevalence 0.2933 0.7067 0.00000
## Balanced Accuracy 0.9577 0.9357 0.50000
# optional plot: rpart.plot(tree_mod)
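# Expanded version of the optional plot above (sketch, not run): type and
# extra are standard rpart.plot arguments for class trees.
# rpart.plot::rpart.plot(tree_mod, type = 4, extra = 104,
#                        main = "Decision Tree - Status")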
# 2.6 Random Forest
rf_mod <- randomForest::randomForest(formula_clf, data = train, ntree = 500, importance = TRUE)
pred_rf <- predict(rf_mod, newdata = test)
cm_rf <- confusionMatrix(pred_rf, test$Status)
cat("\n--- Random Forest ---\n")
##
## --- Random Forest ---
print(cm_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Baik Tercemar.ringan Tercemar.berat
## Baik 20 1 0
## Tercemar.ringan 1 52 1
## Tercemar.berat 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.8875, 0.9917)
## No Information Rate : 0.7067
## P-Value [Acc > NIR] : 2.622e-08
##
## Kappa : 0.9031
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Baik Class: Tercemar.ringan Class: Tercemar.berat
## Sensitivity 0.9524 0.9811 0.00000
## Specificity 0.9815 0.9091 1.00000
## Pos Pred Value 0.9524 0.9630 NaN
## Neg Pred Value 0.9815 0.9524 0.98667
## Prevalence 0.2800 0.7067 0.01333
## Detection Rate 0.2667 0.6933 0.00000
## Detection Prevalence 0.2800 0.7200 0.00000
## Balanced Accuracy 0.9669 0.9451 0.50000
# 2.7 Summary accuracy table
acc_tbl <- data.frame(
  Model = c("SVM","Decision Tree","Random Forest"),
  Accuracy = c(as.numeric(cm_svm$overall["Accuracy"]),
               as.numeric(cm_tree$overall["Accuracy"]),
               as.numeric(cm_rf$overall["Accuracy"]))
)
cat("\nRingkasan akurasi:\n")
##
## Ringkasan akurasi:
print(acc_tbl)
## Model Accuracy
## 1 SVM 0.8933333
## 2 Decision Tree 0.9466667
## 3 Random Forest 0.9600000
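# Note: overall accuracy hides the minority class. With only a handful of
# "Tercemar.berat" rows, all three models miss that class entirely
# (sensitivity 0). Per-class metrics are available from caret's
# confusionMatrix object, e.g. (sketch, not run):
# round(cm_rf$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 3)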
# Save confusion matrices to CSVs
write.csv(as.data.frame(cm_svm$table), "cm_svm.csv", row.names=FALSE)
write.csv(as.data.frame(cm_tree$table), "cm_tree.csv", row.names=FALSE)
write.csv(as.data.frame(cm_rf$table), "cm_rf.csv", row.names=FALSE)
# Simple interpretation strings (console)
cat("\nInterpretasi singkat:\n")
##
## Interpretasi singkat:
cat(sprintf("SVM accuracy = %.2f%%\n", acc_tbl$Accuracy[1]*100))
## SVM accuracy = 89.33%
cat(sprintf("Decision Tree accuracy = %.2f%%\n", acc_tbl$Accuracy[2]*100))
## Decision Tree accuracy = 94.67%
cat(sprintf("Random Forest accuracy = %.2f%%\n", acc_tbl$Accuracy[3]*100))
## Random Forest accuracy = 96.00%
cat("Rekomendasi: gunakan Random Forest bila mengutamakan akurasi & stabilitas.\n")
## Rekomendasi: gunakan Random Forest bila mengutamakan akurasi & stabilitas.
hasil_prediksi <- data.frame(
  Lokasi = test$Lokasi,
  pH = test$pH,
  DO_Aktual = test$DO,
  BOD = test$BOD,
  TSS = test$TSS,
  Suhu = test$Suhu,
  Status_Aktual = test$Status,
  Prediksi_Status_RF = pred_rf,
  Prediksi_Status_SVM = pred_svm,
  Prediksi_Status_Tree = pred_tree
)
write.csv(hasil_prediksi, "cm_rf.csv", row.names = FALSE)
cat("File 'cm_rf.csv' berisi 75 baris dibuat di:", getwd(), "\n")
## File 'cm_rf.csv' berisi 75 baris dibuat di: C:/Users/alima/OneDrive/ドキュメント
# For regression, we can use full cleaned data (including rows where Status may be NA)
# but we'll ensure no NA in predictors/DO
data_reg <- data %>% select(Lokasi, pH, DO, BOD, TSS, Suhu, Status)
data_reg <- data_reg[complete.cases(data_reg[, c("DO","pH","BOD","TSS","Suhu")]), ]
# Create train/test split for regression (use same test indexes as classification if desired)
# Here we reuse the same test indices for comparability: if test indices exist in data_reg, use them; else do 20% split
if(all(test$Lokasi %in% data_reg$Lokasi)){
  test_do <- data_reg[data_reg$Lokasi %in% test$Lokasi, ]
  train_do <- data_reg[!data_reg$Lokasi %in% test$Lokasi, ]
} else {
  set.seed(123)
  idx <- createDataPartition(data_reg$DO, p = 0.8, list = FALSE)
  train_do <- data_reg[idx, ]
  test_do <- data_reg[-idx, ]
}
cat("\nUkuran train_do:", nrow(train_do), "Ukuran test_do:", nrow(test_do), "\n")
##
## Ukuran train_do: 225 Ukuran test_do: 75
# 3.1 Linear Regression
lm_mod <- lm(DO ~ pH + BOD + TSS + Suhu, data = train_do)
summary(lm_mod)
##
## Call:
## lm(formula = DO ~ pH + BOD + TSS + Suhu, data = train_do)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17410 -0.50822 0.02886 0.64160 2.22459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.009774 1.318828 4.557 8.61e-06 ***
## pH 0.053710 0.129267 0.415 0.678
## BOD 0.116030 0.076947 1.508 0.133
## TSS 0.007931 0.006708 1.182 0.238
## Suhu -0.042695 0.030960 -1.379 0.169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9352 on 220 degrees of freedom
## Multiple R-squared: 0.02277, Adjusted R-squared: 0.005002
## F-statistic: 1.282 on 4 and 220 DF, p-value: 0.2782
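# Residual diagnostics (sketch, not run): with R-squared near zero, inspect
# the standard base-R diagnostic plots before trusting the linear fit.
# op <- par(mfrow = c(2, 2)); plot(lm_mod); par(op)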
# 3.2 Spline / GAM
gam_mod <- mgcv::gam(DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu), data = train_do)
summary(gam_mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DO ~ s(pH) + s(BOD) + s(TSS) + s(Suhu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.92805 0.05961 99.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(pH) 1.000 1.000 0.064 0.80085
## s(BOD) 6.651 7.738 3.220 0.00213 **
## s(TSS) 1.000 1.000 0.870 0.35196
## s(Suhu) 1.000 1.000 2.999 0.08478 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0906 Deviance explained = 13%
## GCV = 0.83914 Scale est. = 0.79942 n = 225
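# Visualize the fitted smooths (sketch, not run): s(BOD) is the only clearly
# non-linear, significant term in the summary above.
# plot(gam_mod, pages = 1, shade = TRUE)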
# 3.3 Predictions
pred_lm <- predict(lm_mod, newdata = test_do)
pred_gam <- predict(gam_mod, newdata = test_do)
# 3.4 Evaluation metrics
mse <- function(obs,pred) mean((obs - pred)^2, na.rm=TRUE)
rmse <- function(obs,pred) sqrt(mse(obs,pred))
r2 <- function(obs,pred){
  ss_res <- sum((obs - pred)^2, na.rm=TRUE)
  ss_tot <- sum((obs - mean(obs, na.rm=TRUE))^2, na.rm=TRUE)
  1 - ss_res/ss_tot
}
eval_reg <- data.frame(
  Model = c("Linear Regression", "Spline (GAM)"),
  R2 = c(r2(test_do$DO, pred_lm), r2(test_do$DO, pred_gam)),
  MSE = c(mse(test_do$DO, pred_lm), mse(test_do$DO, pred_gam)),
  RMSE = c(rmse(test_do$DO, pred_lm), rmse(test_do$DO, pred_gam))
)
cat("\nEvaluasi model prediksi DO:\n")
##
## Evaluasi model prediksi DO:
print(eval_reg)
## Model R2 MSE RMSE
## 1 Linear Regression -0.1323336 0.9727259 0.9862687
## 2 Spline (GAM) -0.1783631 1.0122673 1.0061150
write.csv(eval_reg, "eval_prediksi_DO.csv", row.names=FALSE)
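# Context for the negative R2 values (sketch, not run): R2 < 0 means the model
# predicts worse on the test set than a constant mean-DO baseline would.
# rmse(test_do$DO, rep(mean(train_do$DO), nrow(test_do)))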
# 3.5 Visualization: predicted vs actual
p1 <- ggplot(data.frame(Aktual=test_do$DO, Prediksi=pred_lm), aes(x=Aktual, y=Prediksi)) +
  geom_point() + geom_abline(slope=1, intercept=0, linetype="dashed") +
  labs(title="Linear Regression: Actual vs Predicted DO", x="Actual DO", y="Predicted DO") +
  theme_minimal()
p2 <- ggplot(data.frame(Aktual=test_do$DO, Prediksi=pred_gam), aes(x=Aktual, y=Prediksi)) +
  geom_point() + geom_abline(slope=1, intercept=0, linetype="dashed") +
  labs(title="GAM (spline): Actual vs Predicted DO", x="Actual DO", y="Predicted DO") +
  theme_minimal()
# display plots side by side
gridExtra::grid.arrange(p1,p2, ncol=2)
# 3.6 Variable importance for DO (random forest regression)
rf_reg <- randomForest::randomForest(DO ~ pH + BOD + TSS + Suhu, data = train_do, ntree = 500, importance = TRUE)
cat("\nVariable importance (regression RF):\n")
##
## Variable importance (regression RF):
print(importance(rf_reg))
## %IncMSE IncNodePurity
## pH -1.8782446 38.25933
## BOD 7.9625686 47.41260
## TSS 2.9462298 43.57158
## Suhu 0.2532323 41.57513
varImpPlot(rf_reg, main="Variable Importance - RF (DO)")
# Save DO predictions
hasil_do <- data.frame(Lokasi = test_do$Lokasi, DO_Aktual = test_do$DO, Pred_LM = pred_lm, Pred_GAM = pred_gam)
write.csv(hasil_do, "hasil_prediksi_DO_test.csv", row.names = FALSE)
cat("File 'hasil_prediksi_DO_test.csv' dibuat.\n")
## File 'hasil_prediksi_DO_test.csv' dibuat.
BRIEF SUMMARY
- Problem 1: Missing values handled (median imputation), outliers capped (winsorizing), Status labels standardized.
- Problem 2: Classification (SVM, Decision Tree, Random Forest). Accuracies are recorded in the 'acc_tbl' table. Random Forest is recommended.
- Problem 3: DO prediction -> GAM (spline) captures the non-linear component; see 'eval_prediksi_DO.csv' for R2/MSE/RMSE.
- The 75 rows of test-set predictions are saved in 'hasil_prediksi_status_test.csv' (status) and 'hasil_prediksi_DO_test.csv' (DO predictions).