library(class) # untuk knn
library(caret) # untuk evaluasi
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(dplyr) # untuk manipulasi data
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
waterquality <- read_excel("D:/KULIAH NABILA/SMT 6/DATA MINING/Data water quality.xlsx")
View(waterquality)
head(waterquality)
## # A tibble: 6 × 10
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> 204.890… 20791… 7.30021187… 368.51… 564.3086541… 10.3797830780…
## 2 3.71608007538… 129.422… 18630… 6.63524588… <NA> 592.8853591… 15.1800131163…
## 3 8.09912418929… 224.236… 19909… 9.27588360… <NA> 418.6062130… 16.8686369295…
## 4 8.31676588421… 214.373… 22018… 8.05933237… 356.88… 363.2665161… 18.4365244954…
## 5 9.09222345629… 181.101… 17978… 6.54659997… 310.13… 398.4108133… 11.5582794434…
## 6 5.58408663845… 188.313… 28748… 7.54486878… 326.67… 280.4679159… 8.39973464015…
## # ℹ 3 more variables: Trihalomethanes <chr>, Turbidity <chr>, Potability <dbl>
summary(waterquality)
## ph Hardness Solids Chloramines
## Length:3276 Length:3276 Length:3276 Length:3276
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Sulfate Conductivity Organic_carbon Trihalomethanes
## Length:3276 Length:3276 Length:3276 Length:3276
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Turbidity Potability
## Length:3276 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.3901
## 3rd Qu.:1.0000
## Max. :1.0000
# Ubah kolom menjadi numeric jika belum
waterquality$ph <- as.numeric(waterquality$ph)
waterquality$Hardness <- as.numeric(waterquality$Hardness)
waterquality$Solids <- as.numeric(waterquality$Solids)
waterquality$Chloramines <- as.numeric(waterquality$Chloramines)
waterquality$Sulfate <- as.numeric(waterquality$Sulfate)
waterquality$Conductivity <- as.numeric(waterquality$Conductivity)
waterquality$Organic_carbon <- as.numeric(waterquality$Organic_carbon)
waterquality$Trihalomethanes <- as.numeric(waterquality$Trihalomethanes)
waterquality$Turbidity <- as.numeric(waterquality$Turbidity)
# Cek jumlah missing values per kolom
colSums(is.na(waterquality)) # Menghitung jumlah nilai NA (kosong) di setiap kolom
## ph Hardness Solids Chloramines Sulfate
## 491 0 0 0 781
## Conductivity Organic_carbon Trihalomethanes Turbidity Potability
## 0 0 162 0 0
# Visualisasi missing values
library(VIM) # Library untuk visualisasi missing values
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.4.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Mengganti missing values dengan median di setiap kolom
waterquality$ph[is.na(waterquality$ph)] <- median(waterquality$ph, na.rm = TRUE)
waterquality$Sulfate[is.na(waterquality$Sulfate)] <- median(waterquality$Sulfate, na.rm = TRUE)
waterquality$Trihalomethanes[is.na(waterquality$Trihalomethanes)] <- median(waterquality$Trihalomethanes, na.rm = TRUE)
# Cek ulang data setelah preprocessing
summary(waterquality) # Menampilkan ringkasan statistik setelah preprocessing
## ph Hardness Solids Chloramines
## Min. : 0.000 Min. : 47.43 Min. : 320.9 Min. : 0.352
## 1st Qu.: 6.278 1st Qu.:176.85 1st Qu.:15666.7 1st Qu.: 6.127
## Median : 7.037 Median :196.97 Median :20927.8 Median : 7.130
## Mean : 7.074 Mean :196.37 Mean :22014.1 Mean : 7.122
## 3rd Qu.: 7.870 3rd Qu.:216.67 3rd Qu.:27332.8 3rd Qu.: 8.115
## Max. :14.000 Max. :323.12 Max. :61227.2 Max. :13.127
## Sulfate Conductivity Organic_carbon Trihalomethanes
## Min. :129.0 Min. :181.5 Min. : 2.20 Min. : 0.738
## 1st Qu.:317.1 1st Qu.:365.7 1st Qu.:12.07 1st Qu.: 56.648
## Median :333.1 Median :421.9 Median :14.22 Median : 66.622
## Mean :333.6 Mean :426.2 Mean :14.28 Mean : 66.407
## 3rd Qu.:350.4 3rd Qu.:481.8 3rd Qu.:16.56 3rd Qu.: 76.667
## Max. :481.0 Max. :753.3 Max. :28.30 Max. :124.000
## Turbidity Potability
## Min. :1.450 Min. :0.0000
## 1st Qu.:3.440 1st Qu.:0.0000
## Median :3.955 Median :0.0000
## Mean :3.967 Mean :0.3901
## 3rd Qu.:4.500 3rd Qu.:1.0000
## Max. :6.739 Max. :1.0000
# Ganti label numerik 0/1 menjadi label karakter
waterquality$Potability <- factor(waterquality$Potability,
levels = c(0,1),
labels = c("Tidak Layak", "Layak"))
table(waterquality$Potability)
##
## Tidak Layak Layak
## 1998 1278
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
# Membagi dataset menjadi data latih dan data uji
set.seed(150) # Set seed untuk reprodusibilitas
split <- sample.split(waterquality$Potability, SplitRatio = 0.7) # 70% untuk latih, 30% untuk uji
data_latih <- subset(waterquality, split == TRUE) # Data latih
data_uji <- subset(waterquality, split == FALSE) # Data uji
# Buat fungsi normalisasi (min-max)
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
# Normalisasi semua kolom fitur, tanpa kolom label
data_latih_norm <- as.data.frame(lapply(data_latih[, -which(names(data_latih) == "Potability")], normalize))
data_uji_norm <- as.data.frame(lapply(data_uji[, -which(names(data_uji) == "Potability")], normalize))
# Tambahkan kembali kolom target (Potability)
data_latih_norm$Potability <- data_latih$Potability
data_uji_norm$Potability <- data_uji$Potability
# Memuat paket yang diperlukan
library(caTools)
# Membagi dataset menjadi data latih dan data uji
set.seed(123) # Set seed untuk reprodusibilitas
split <- sample.split(waterquality$Potability, SplitRatio = 0.7) # 70% untuk latih, 30% untuk uji
data_latih <- subset(waterquality, split == TRUE) # Data latih
data_uji <- subset(waterquality, split == FALSE) # Data uji
# Cek struktur data latih dan data uji
str(data_latih)
## tibble [2,294 × 10] (S3: tbl_df/tbl/data.frame)
## $ ph : num [1:2294] 7.04 8.1 5.58 10.22 7.04 ...
## $ Hardness : num [1:2294] 205 224 188 248 119 ...
## $ Solids : num [1:2294] 20791 19910 28749 28750 14286 ...
## $ Chloramines : num [1:2294] 7.3 9.28 7.54 7.51 7.8 ...
## $ Sulfate : num [1:2294] 369 333 327 394 269 ...
## $ Conductivity : num [1:2294] 564 419 280 284 389 ...
## $ Organic_carbon : num [1:2294] 10.4 16.9 8.4 13.8 12.7 ...
## $ Trihalomethanes: num [1:2294] 87 66.4 54.9 84.6 53.9 ...
## $ Turbidity : num [1:2294] 2.96 3.06 2.56 2.67 3.6 ...
## $ Potability : Factor w/ 2 levels "Tidak Layak",..: 1 1 1 1 1 1 1 1 1 1 ...
str(data_uji)
## tibble [982 × 10] (S3: tbl_df/tbl/data.frame)
## $ ph : num [1:982] 3.72 8.32 9.09 8.64 7.36 ...
## $ Hardness : num [1:982] 129 214 181 203 166 ...
## $ Solids : num [1:982] 18630 22018 17979 13672 32453 ...
## $ Chloramines : num [1:982] 6.64 8.06 6.55 4.56 7.55 ...
## $ Sulfate : num [1:982] 333 357 310 303 327 ...
## $ Conductivity : num [1:982] 593 363 398 475 425 ...
## $ Organic_carbon : num [1:982] 15.2 18.4 11.6 12.4 15.6 ...
## $ Trihalomethanes: num [1:982] 56.3 100.3 32 62.8 78.7 ...
## $ Turbidity : num [1:982] 4.5 4.63 4.08 4.4 3.66 ...
## $ Potability : Factor w/ 2 levels "Tidak Layak",..: 1 1 1 1 1 1 1 1 1 1 ...
library(class)
# Jalankan KNN dengan k = 15
prediksi_knn <- knn(
train = data_latih_norm[, -which(names(data_latih_norm) == "Potability")],
test = data_uji_norm[, -which(names(data_uji_norm) == "Potability")],
cl = data_latih_norm$Potability,
k = 15
)
library(caret)
# Confusion Matrix
confusionMatrix(prediksi_knn, data_uji_norm$Potability)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak Layak Layak
## Tidak Layak 522 299
## Layak 77 84
##
## Accuracy : 0.6171
## 95% CI : (0.5859, 0.6476)
## No Information Rate : 0.61
## P-Value [Acc > NIR] : 0.3361
##
## Kappa : 0.1014
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8715
## Specificity : 0.2193
## Pos Pred Value : 0.6358
## Neg Pred Value : 0.5217
## Prevalence : 0.6100
## Detection Rate : 0.5316
## Detection Prevalence : 0.8360
## Balanced Accuracy : 0.5454
##
## 'Positive' Class : Tidak Layak
##
#Kasus# Sebuah lembaga pengujian air tengah melakukan evaluasi terhadap kualitas air dari salah satu wilayah industri yang menunjukkan aktivitas kimia dan limbah tinggi. Air dari wilayah tersebut telah diuji di laboratorium dan menghasilkan sejumlah parameter fisik dan kimia, seperti: ph = 7.2, Hardness = 180, Solids = 15000, Chloramines = 8.5, Sulfate = 330, Conductivity = 420, Organic_carbon = 15, Trihalomethanes = 80, Turbidity = 3.8
Namun, lembaga tersebut tidak memiliki acuan klasifikasi eksplisit apakah air dengan kombinasi parameter tertentu bisa langsung dianggap layak konsumsi (potable) atau tidak layak.
# Buat data sampel baru
sampel_baru <- data.frame(
ph = 7.2,
Hardness = 180,
Solids = 15000,
Chloramines = 8.5,
Sulfate = 330,
Conductivity = 420,
Organic_carbon = 15,
Trihalomethanes = 80,
Turbidity = 3.8
)
# Simpan nilai min dan max dari data latih
min_vals <- sapply(data_latih[, -which(names(data_latih) == "Potability")], min)
max_vals <- sapply(data_latih[, -which(names(data_latih) == "Potability")], max)
# Normalisasi sampel baru dengan min-max dari data latih
sampel_baru_norm <- as.data.frame(mapply(function(x, minv, maxv) {
(x - minv) / (maxv - minv)
}, sampel_baru, min_vals, max_vals))
# Pastikan nama kolom sama
colnames(sampel_baru) <- names(min_vals)
prediksi_sampel <- knn(
train = data_latih_norm[, -ncol(data_latih_norm)],
test = sampel_baru,
cl = data_latih_norm$Potability,
k = 15
)
cat("Hasil prediksi kelayakan air untuk data sampel baru adalah:", as.character(prediksi_sampel), "\n")
## Hasil prediksi kelayakan air untuk data sampel baru adalah: Tidak Layak