# --- 1. Library
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
set.seed(42)

# --- 2. Build the simulated data set
n <- 100
data <- local({
  # Uniform draw between lo and hi, rounded to two decimals.
  unif_2dp <- function(lo, hi) round(runif(n, min = lo, max = hi), digits = 2)
  # Whole-number draw from lo..hi, sampled with replacement.
  int_draw <- function(lo, hi) sample(lo:hi, size = n, replace = TRUE)

  # Column order is kept as-is: every draw advances the RNG stream, so
  # reordering the columns would change the generated values.
  data.frame(
    pH = unif_2dp(5.5, 8.5),
    TDS = int_draw(50, 500),
    DO = unif_2dp(2, 10),
    BOD = unif_2dp(1, 10),
    COD = int_draw(10, 100),
    ammonia = unif_2dp(0.1, 3.0),
    nitrate = unif_2dp(0.1, 10.0)
  )
})

# --- 3. Derive the kualitas_air label
# "Baik" requires every limit below to hold at once; all remaining rows are
# split on DO alone ("Sedang" when DO >= 3, otherwise "Buruk").
data$kualitas_air <- local({
  meets_all_limits <- with(
    data,
    pH >= 6.5 & pH <= 8.5 &
      TDS < 300 &
      DO >= 5 &
      BOD < 3 &
      COD < 50 &
      ammonia < 1 &
      nitrate < 5
  )
  do_based_label <- ifelse(data$DO >= 3, "Sedang", "Buruk")

  # as.factor() keeps only the classes that actually occur in the sample.
  as.factor(ifelse(meets_all_limits, "Baik", do_based_label))
})

# --- 4. Persist the labelled data set as CSV (row names are not written)
write.csv(x = data, file = "data_kualitas_air_dki.csv", row.names = FALSE)

# --- 5. Train/test split
# Partition on the label column with p = 0.8; the rows selected by `index`
# become the training set and every other row goes to the test set.
index <- createDataPartition(y = data$kualitas_air, p = 0.8, list = FALSE)
test_data <- data[-index, ]
train_data <- data[index, ]

# --- 6. Fit the Random Forest classifier
# kualitas_air is the response; every other column of train_data is a predictor.
model_rf <- randomForest(
  formula = kualitas_air ~ .,
  data = train_data,
  ntree = 100,
  mtry = 3,
  importance = TRUE
)
print(model_rf)
## 
## Call:
##  randomForest(formula = kualitas_air ~ ., data = train_data, ntree = 100,      mtry = 3, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##        Buruk Sedang class.error
## Buruk     12      0           0
## Sedang     0     68           0
# --- 7. Predict on the held-out rows and compare against the true labels
prediksi <- predict(object = model_rf, newdata = test_data)
confusionMatrix(data = prediksi, reference = test_data$kualitas_air)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Buruk Sedang
##     Buruk      2      0
##     Sedang     1     17
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.7513, 0.9987)
##     No Information Rate : 0.85            
##     P-Value [Acc > NIR] : 0.1756          
##                                           
##                   Kappa : 0.7727          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9444          
##              Prevalence : 0.1500          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.1000          
##       Balanced Accuracy : 0.8333          
##                                           
##        'Positive' Class : Buruk           
## 
# --- 8. Plot variable importance for the fitted forest
varImpPlot(x = model_rf)

# Load library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()  masks randomForest::combine()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ purrr::lift()     masks caret::lift()
## ✖ ggplot2::margin() masks randomForest::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
library(caret)

# Simulated water-quality data (300 rows, unrounded uniform draws)
set.seed(123)
n <- 300
air_data <- local({
  # Uniform draw of n values between lo and hi.
  unif_col <- function(lo, hi) runif(n, min = lo, max = hi)

  # Columns are generated in this fixed order so the RNG stream, and hence
  # the simulated values, stay the same.
  data.frame(
    pH = unif_col(5.5, 8.5),
    TDS = unif_col(100, 1000),
    DO = unif_col(2, 10),
    BOD = unif_col(1, 10),
    COD = unif_col(10, 100),
    Amonia = unif_col(0.1, 5),
    Nitrat = unif_col(0.1, 10)
  )
})

# Assign the quality class from simple threshold rules (simulation only):
# "Baik" needs all five limits, "Sedang" needs the three looser ones,
# everything else is "Buruk".
air_data$Kualitas <- local({
  is_good <- with(
    air_data,
    DO > 6 & BOD < 3 & COD < 40 & Amonia < 1 & Nitrat < 5
  )
  is_moderate <- with(air_data, DO > 4 & BOD < 6 & COD < 60)
  fallback_label <- ifelse(is_moderate, "Sedang", "Buruk")

  # Explicit levels fix the class order and keep a class listed even when
  # no row falls into it.
  factor(
    ifelse(is_good, "Baik", fallback_label),
    levels = c("Baik", "Sedang", "Buruk")
  )
})

# Per-column summary of the data set
summary(air_data)
##        pH             TDS              DO             BOD       
##  Min.   :5.502   Min.   :100.4   Min.   :2.010   Min.   :1.022  
##  1st Qu.:6.280   1st Qu.:339.6   1st Qu.:3.896   1st Qu.:3.423  
##  Median :6.939   Median :538.0   Median :6.029   Median :5.487  
##  Mean   :6.999   Mean   :549.0   Mean   :5.927   Mean   :5.434  
##  3rd Qu.:7.699   3rd Qu.:784.5   3rd Qu.:7.950   3rd Qu.:7.506  
##  Max.   :8.483   Max.   :999.5   Max.   :9.959   Max.   :9.934  
##       COD            Amonia           Nitrat        Kualitas  
##  Min.   :10.10   Min.   :0.1263   Min.   :0.135   Baik  :  0  
##  1st Qu.:31.81   1st Qu.:1.4357   1st Qu.:2.430   Sedang: 74  
##  Median :55.38   Median :2.5458   Median :5.024   Buruk :226  
##  Mean   :54.88   Mean   :2.5952   Mean   :4.974               
##  3rd Qu.:77.30   3rd Qu.:3.9244   3rd Qu.:7.406               
##  Max.   :99.96   Max.   :4.9822   Max.   :9.977
# Bar chart of the number of samples in each quality class
ggplot(data = air_data, mapping = aes(x = Kualitas, fill = Kualitas)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Distribusi Kualitas Air",
    x = "Kelas",
    y = "Jumlah Sampel"
  )

# Hand-entered sample of ten water-quality records (replaces any earlier
# `air_data` object).
# NOTE: nothing in this block draws random numbers, so the seed does not
# influence the table below; it only resets the RNG state for code that
# runs afterwards.
set.seed(123)
air_data <- data.frame(
  pH = c(6.8, 7.2, 6.5, 7.8, 8.0, 6.0, 5.8, 7.0, 6.3, 7.5),
  TDS = c(400, 550, 300, 800, 700, 950, 1000, 600, 850, 500),
  DO = c(7.2, 6.5, 5.8, 4.2, 3.5, 6.8, 2.5, 4.8, 5.0, 6.0),
  BOD = c(2.0, 3.5, 4.2, 6.0, 7.0, 2.5, 8.5, 5.0, 4.5, 3.0),
  COD = c(35, 45, 55, 70, 80, 30, 90, 60, 50, 40),
  Amonia = c(0.5, 0.8, 1.2, 2.5, 3.0, 0.3, 4.0, 2.0, 1.5, 0.7),
  Nitrat = c(2.0, 3.5, 6.0, 7.5, 8.0, 1.5, 9.5, 6.5, 5.5, 3.0)
)

# Assign the quality class from simple threshold rules: "Baik" needs all
# five limits, "Sedang" needs the three looser ones, the rest is "Buruk".
# All comparisons are strict, so a value exactly on a threshold fails it.
air_data$Kualitas <- with(air_data, ifelse(
  DO > 6 & BOD < 3 & COD < 40 & Amonia < 1 & Nitrat < 5, "Baik",
  ifelse(DO > 4 & BOD < 6 & COD < 60, "Sedang", "Buruk")
))

# Convert to a factor with explicit levels in the order Baik, Sedang, Buruk
# instead of the alphabetical order as.factor() would produce (Baik, Buruk,
# Sedang). Tables and plots then list the classes in that order.
# All three classes occur in these ten rows, so no level is left empty.
air_data$Kualitas <- factor(
  air_data$Kualitas,
  levels = c("Baik", "Sedang", "Buruk")
)

# Show the labelled table
print(air_data)
##     pH  TDS  DO BOD COD Amonia Nitrat Kualitas
## 1  6.8  400 7.2 2.0  35    0.5    2.0     Baik
## 2  7.2  550 6.5 3.5  45    0.8    3.5   Sedang
## 3  6.5  300 5.8 4.2  55    1.2    6.0   Sedang
## 4  7.8  800 4.2 6.0  70    2.5    7.5    Buruk
## 5  8.0  700 3.5 7.0  80    3.0    8.0    Buruk
## 6  6.0  950 6.8 2.5  30    0.3    1.5     Baik
## 7  5.8 1000 2.5 8.5  90    4.0    9.5    Buruk
## 8  7.0  600 4.8 5.0  60    2.0    6.5    Buruk
## 9  6.3  850 5.0 4.5  50    1.5    5.5   Sedang
## 10 7.5  500 6.0 3.0  40    0.7    3.0   Sedang