library(class) # untuk knn
library(caret) # untuk evaluasi
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(dplyr)     # untuk manipulasi data
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
# Load the raw water-quality workbook into a tibble.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine that runs this script.
waterquality <- read_excel("D:/KULIAH NABILA/SMT 6/DATA MINING/Data water quality.xlsx")
# View() opens a data viewer, so only call it in an interactive session;
# batch runs (Rscript, CI) skip it.
if (interactive()) {
  View(waterquality)
}
# Print the first rows as a quick sanity check of the import.
head(waterquality)
## # A tibble: 6 × 10
##   ph             Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
##   <chr>          <chr>    <chr>  <chr>       <chr>   <chr>        <chr>         
## 1 <NA>           204.890… 20791… 7.30021187… 368.51… 564.3086541… 10.3797830780…
## 2 3.71608007538… 129.422… 18630… 6.63524588… <NA>    592.8853591… 15.1800131163…
## 3 8.09912418929… 224.236… 19909… 9.27588360… <NA>    418.6062130… 16.8686369295…
## 4 8.31676588421… 214.373… 22018… 8.05933237… 356.88… 363.2665161… 18.4365244954…
## 5 9.09222345629… 181.101… 17978… 6.54659997… 310.13… 398.4108133… 11.5582794434…
## 6 5.58408663845… 188.313… 28748… 7.54486878… 326.67… 280.4679159… 8.39973464015…
## # ℹ 3 more variables: Trihalomethanes <chr>, Turbidity <chr>, Potability <dbl>
# Summarise every column right after import. The printed result below shows
# the nine measurement columns still stored as character at this point.
summary(waterquality)
##       ph              Hardness            Solids          Chloramines       
##  Length:3276        Length:3276        Length:3276        Length:3276       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Sulfate          Conductivity       Organic_carbon     Trihalomethanes   
##  Length:3276        Length:3276        Length:3276        Length:3276       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   Turbidity           Potability    
##  Length:3276        Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.3901  
##                     3rd Qu.:1.0000  
##                     Max.   :1.0000
# Coerce each measurement column to numeric (they were imported as text).
kolom_ukur <- c(
  "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
  "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity"
)
for (kolom in kolom_ukur) {
  waterquality[[kolom]] <- as.numeric(waterquality[[kolom]])
}
# Count the missing (NA) values in every column.
colSums(is.na(waterquality))
##              ph        Hardness          Solids     Chloramines         Sulfate 
##             491               0               0               0             781 
##    Conductivity  Organic_carbon Trihalomethanes       Turbidity      Potability 
##               0               0             162               0               0
# Visualisasi missing values
library(VIM)  # Library untuk visualisasi missing values
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.4.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Fill the missing values of each affected column with that column's median.
for (kolom in c("ph", "Sulfate", "Trihalomethanes")) {
  nilai <- waterquality[[kolom]]
  nilai[is.na(nilai)] <- median(nilai, na.rm = TRUE)
  waterquality[[kolom]] <- nilai
}
# Re-check the data after preprocessing: summary statistics per column.
summary(waterquality)
##        ph            Hardness          Solids         Chloramines    
##  Min.   : 0.000   Min.   : 47.43   Min.   :  320.9   Min.   : 0.352  
##  1st Qu.: 6.278   1st Qu.:176.85   1st Qu.:15666.7   1st Qu.: 6.127  
##  Median : 7.037   Median :196.97   Median :20927.8   Median : 7.130  
##  Mean   : 7.074   Mean   :196.37   Mean   :22014.1   Mean   : 7.122  
##  3rd Qu.: 7.870   3rd Qu.:216.67   3rd Qu.:27332.8   3rd Qu.: 8.115  
##  Max.   :14.000   Max.   :323.12   Max.   :61227.2   Max.   :13.127  
##     Sulfate       Conductivity   Organic_carbon  Trihalomethanes  
##  Min.   :129.0   Min.   :181.5   Min.   : 2.20   Min.   :  0.738  
##  1st Qu.:317.1   1st Qu.:365.7   1st Qu.:12.07   1st Qu.: 56.648  
##  Median :333.1   Median :421.9   Median :14.22   Median : 66.622  
##  Mean   :333.6   Mean   :426.2   Mean   :14.28   Mean   : 66.407  
##  3rd Qu.:350.4   3rd Qu.:481.8   3rd Qu.:16.56   3rd Qu.: 76.667  
##  Max.   :481.0   Max.   :753.3   Max.   :28.30   Max.   :124.000  
##    Turbidity       Potability    
##  Min.   :1.450   Min.   :0.0000  
##  1st Qu.:3.440   1st Qu.:0.0000  
##  Median :3.955   Median :0.0000  
##  Mean   :3.967   Mean   :0.3901  
##  3rd Qu.:4.500   3rd Qu.:1.0000  
##  Max.   :6.739   Max.   :1.0000
# Turn the numeric 0/1 target into a factor with readable labels
# (0 -> "Tidak Layak", 1 -> "Layak").
label_kelas <- c("Tidak Layak", "Layak")
waterquality$Potability <- factor(
  waterquality[["Potability"]],
  levels = c(0, 1),
  labels = label_kelas
)
# Class counts after relabelling.
table(waterquality$Potability)
## 
## Tidak Layak       Layak 
##        1998        1278
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
# Partition the data into a training set and a test set.
set.seed(150)  # fixed seed so the partition is reproducible
# Logical mask from sample.split(): TRUE marks the 70% training rows.
split <- sample.split(waterquality$Potability, SplitRatio = 0.7)
data_latih <- waterquality[split, ]   # training rows
data_uji <- waterquality[!split, ]    # test rows
# Min-max normalisation: rescale a numeric vector to the [0, 1] interval.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x. A vector whose values are
# all equal has zero range; it is mapped to all zeros instead of NaN (0 / 0),
# because NaN features cannot be passed on to knn(). If x contains NA the
# result is all NA, since min()/max() are computed without na.rm.
normalize <- function(x) {
  batas_bawah <- min(x)
  rentang <- max(x) - batas_bawah
  if (!is.na(rentang) && rentang == 0) {
    return(rep(0, length(x)))
  }
  (x - batas_bawah) / rentang
}
# Normalise every feature column (the label column is excluded).
# Both sets are scaled with the TRAINING min/max: scaling the test set with
# its own min/max would put train and test on different scales and make the
# KNN distances meaningless. Test values outside the training range may
# therefore fall slightly below 0 or above 1.
fitur <- setdiff(names(data_latih), "Potability")
min_latih <- vapply(data_latih[fitur], min, numeric(1))
max_latih <- vapply(data_latih[fitur], max, numeric(1))
data_latih_norm <- as.data.frame(lapply(data_latih[fitur], normalize))
data_uji_norm <- as.data.frame(Map(
  function(x, lo, hi) (x - lo) / (hi - lo),
  data_uji[fitur], min_latih, max_latih
))
# Re-attach the target column (Potability).
data_latih_norm$Potability <- data_latih$Potability
data_uji_norm$Potability <- data_uji$Potability
# Memuat paket yang diperlukan
library(caTools)

# The train/test split is deliberately NOT repeated here. Re-splitting with a
# different seed would overwrite data_latih / data_uji after data_latih_norm
# and data_uji_norm were derived from them, so the raw and the normalised
# sets would describe different rows (and min/max taken from data_latih
# later on would no longer match the model's training data).
# Instead, verify that the raw and normalised partitions still line up.
stopifnot(
  nrow(data_latih_norm) == nrow(data_latih),
  nrow(data_uji_norm) == nrow(data_uji),
  identical(data_latih_norm$Potability, data_latih$Potability),
  identical(data_uji_norm$Potability, data_uji$Potability)
)

# Inspect the structure (dimensions, column names and types) of the
# training set.
str(data_latih)
## tibble [2,294 × 10] (S3: tbl_df/tbl/data.frame)
##  $ ph             : num [1:2294] 7.04 8.1 5.58 10.22 7.04 ...
##  $ Hardness       : num [1:2294] 205 224 188 248 119 ...
##  $ Solids         : num [1:2294] 20791 19910 28749 28750 14286 ...
##  $ Chloramines    : num [1:2294] 7.3 9.28 7.54 7.51 7.8 ...
##  $ Sulfate        : num [1:2294] 369 333 327 394 269 ...
##  $ Conductivity   : num [1:2294] 564 419 280 284 389 ...
##  $ Organic_carbon : num [1:2294] 10.4 16.9 8.4 13.8 12.7 ...
##  $ Trihalomethanes: num [1:2294] 87 66.4 54.9 84.6 53.9 ...
##  $ Turbidity      : num [1:2294] 2.96 3.06 2.56 2.67 3.6 ...
##  $ Potability     : Factor w/ 2 levels "Tidak Layak",..: 1 1 1 1 1 1 1 1 1 1 ...
# Inspect the structure (dimensions, column names and types) of the test set.
str(data_uji)
## tibble [982 × 10] (S3: tbl_df/tbl/data.frame)
##  $ ph             : num [1:982] 3.72 8.32 9.09 8.64 7.36 ...
##  $ Hardness       : num [1:982] 129 214 181 203 166 ...
##  $ Solids         : num [1:982] 18630 22018 17979 13672 32453 ...
##  $ Chloramines    : num [1:982] 6.64 8.06 6.55 4.56 7.55 ...
##  $ Sulfate        : num [1:982] 333 357 310 303 327 ...
##  $ Conductivity   : num [1:982] 593 363 398 475 425 ...
##  $ Organic_carbon : num [1:982] 15.2 18.4 11.6 12.4 15.6 ...
##  $ Trihalomethanes: num [1:982] 56.3 100.3 32 62.8 78.7 ...
##  $ Turbidity      : num [1:982] 4.5 4.63 4.08 4.4 3.66 ...
##  $ Potability     : Factor w/ 2 levels "Tidak Layak",..: 1 1 1 1 1 1 1 1 1 1 ...
library(class)

# Run KNN with k = 15 on the normalised data.
# Feature columns are selected by name from one shared vector, so train and
# test always use the same columns in the same order. (Negative which()
# indexing would select zero columns if the label column were ever absent.)
fitur_knn <- setdiff(names(data_latih_norm), "Potability")
prediksi_knn <- knn(
  train = data_latih_norm[, fitur_knn, drop = FALSE],
  test = data_uji_norm[, fitur_knn, drop = FALSE],
  cl = data_latih_norm$Potability,
  k = 15
)
library(caret)

# Confusion matrix and accuracy statistics: KNN predictions (first argument)
# against the true test labels (second argument). No `positive` class is
# passed; the printed result below reports "Tidak Layak" as the positive class.
confusionMatrix(prediksi_knn, data_uji_norm$Potability)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Tidak Layak Layak
##   Tidak Layak         522   299
##   Layak                77    84
##                                           
##                Accuracy : 0.6171          
##                  95% CI : (0.5859, 0.6476)
##     No Information Rate : 0.61            
##     P-Value [Acc > NIR] : 0.3361          
##                                           
##                   Kappa : 0.1014          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8715          
##             Specificity : 0.2193          
##          Pos Pred Value : 0.6358          
##          Neg Pred Value : 0.5217          
##              Prevalence : 0.6100          
##          Detection Rate : 0.5316          
##    Detection Prevalence : 0.8360          
##       Balanced Accuracy : 0.5454          
##                                           
##        'Positive' Class : Tidak Layak     
## 

# Kasus: Sebuah lembaga pengujian air tengah mengevaluasi kualitas air dari
# salah satu wilayah industri dengan aktivitas kimia dan limbah yang tinggi.
# Air dari wilayah tersebut telah diuji di laboratorium dan menghasilkan
# sejumlah parameter fisik dan kimia: ph = 7.2, Hardness = 180,
# Solids = 15000, Chloramines = 8.5, Sulfate = 330, Conductivity = 420,
# Organic_carbon = 15, Trihalomethanes = 80, Turbidity = 3.8.
#
# Namun, lembaga tersebut tidak memiliki acuan klasifikasi eksplisit untuk
# menentukan apakah air dengan kombinasi parameter tersebut dapat langsung
# dianggap layak konsumsi (potable) atau tidak layak.

# Build the new sample to classify: one row, one column per feature, in the
# same order as the feature columns of the data set.
nilai_sampel <- c(
  ph = 7.2, Hardness = 180, Solids = 15000,
  Chloramines = 8.5, Sulfate = 330, Conductivity = 420,
  Organic_carbon = 15, Trihalomethanes = 80, Turbidity = 3.8
)
sampel_baru <- as.data.frame(as.list(nilai_sampel))
# Feature columns, selected by name so that the training data, the min/max
# vectors and the new sample all use the same columns in the same order.
fitur <- setdiff(names(data_latih_norm), "Potability")

# The min/max below are only valid if data_latih is the same partition that
# data_latih_norm was built from; warn when the two no longer line up
# (e.g. because data_latih was re-split after normalisation).
if (!identical(data_latih$Potability, data_latih_norm$Potability)) {
  warning(
    "data_latih tidak cocok dengan data_latih_norm; min/max bisa keliru",
    call. = FALSE
  )
}

# Per-feature min and max of the raw training data.
min_vals <- vapply(data_latih[fitur], min, numeric(1))
max_vals <- vapply(data_latih[fitur], max, numeric(1))

# Normalise the new sample with the training min/max. Map() returns a named
# list, so as.data.frame() gives one row with one column per feature
# (mapply() would simplify to a vector and produce a 9 x 1 data frame).
sampel_baru_norm <- as.data.frame(Map(
  function(x, minv, maxv) (x - minv) / (maxv - minv),
  sampel_baru[fitur], min_vals, max_vals
))

# Classify the NORMALISED sample; passing the raw values would compare
# unscaled measurements against [0, 1]-scaled training data.
prediksi_sampel <- knn(
  train = data_latih_norm[, fitur, drop = FALSE],
  test = sampel_baru_norm[, fitur, drop = FALSE],
  cl = data_latih_norm$Potability,
  k = 15
)

cat("Hasil prediksi kelayakan air untuk data sampel baru adalah:", as.character(prediksi_sampel), "\n")
## Hasil prediksi kelayakan air untuk data sampel baru adalah: Tidak Layak