library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(class)
library(readr)
# Data preparation
data("Default")
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
set.seed(123)
# Recode the two-level student factor ("No"/"Yes") as numeric 0/1 so that every
# predictor column is numeric.
Default[["student"]] <- as.numeric(Default[["student"]]) - 1
# Draw 5000 row indices for training; all remaining rows form the test set.
default_idx <- sample(x = nrow(Default), size = 5000)
default_trn <- Default[default_idx, ]
default_tst <- Default[-default_idx, ]
# Training data: response first, then the predictors (everything except the
# first column, which is the response `default`).
y_default_trn <- default_trn[["default"]]
X_default_trn <- default_trn[, -1]
# Testing data, split the same way.
y_default_tst <- default_tst[["default"]]
X_default_tst <- default_tst[, -1]
# Prediction: show the first six test-set labels predicted by 3-nearest
# neighbours on the raw (unscaled) predictors.
head(
  knn(X_default_trn, X_default_tst, cl = y_default_trn, k = 3),
  n = 6L
)
## [1] No No No No No No
## Levels: No Yes
# Compute the classification error rate: the proportion of positions at which
# the predicted label differs from the actual label.
#
# actual:    vector or factor of true class labels.
# predicted: vector or factor of predicted labels; must have the same length
#            as `actual`.
# Returns a single numeric value (NaN when both inputs have length zero).
# Stops with an error when the two inputs differ in length.
calc_class_err <- function(actual, predicted) {
  # Without this guard `!=` would silently recycle the shorter vector and
  # return a meaningless error rate.
  if (length(actual) != length(predicted)) {
    stop("`actual` and `predicted` must have the same length.", call. = FALSE)
  }
  mean(actual != predicted)
}
# Test-set error rate of 5-nearest neighbours on the raw (unscaled) predictors.
calc_class_err(
  actual = y_default_tst,
  predicted = knn(X_default_trn, X_default_tst, cl = y_default_trn, k = 5)
)
## [1] 0.033
# Test-set error rate of 5-nearest neighbours on standardized predictors.
# The centering and scaling constants are estimated on the training set only
# and then reused for the test set, so both sets are expressed on the same
# scale and no test-set statistics enter the preprocessing.
X_default_trn_scaled <- scale(X_default_trn)
X_default_tst_scaled <- scale(
  X_default_tst,
  center = attr(X_default_trn_scaled, "scaled:center"),
  scale = attr(X_default_trn_scaled, "scaled:scale")
)
calc_class_err(
  actual = y_default_tst,
  predicted = knn(
    train = X_default_trn_scaled,
    test = X_default_tst_scaled,
    cl = y_default_trn,
    k = 5
  )
)
# NOTE(review): the value recorded below comes from the earlier run in which
# the test set was standardized with its own mean/sd; re-run to refresh it.
## [1] 0.032
# Choosing the best k: compute the test-set error rate for each candidate k.
set.seed(123)
k_to_try <- 1:100
err_k <- numeric(length(k_to_try))
# Standardize once, outside the loop (the result does not depend on k).
# The test set is transformed with the training set's center and scale so that
# both sets share one scale and no test-set statistics enter the preprocessing.
X_default_trn_scaled <- scale(X_default_trn)
X_default_tst_scaled <- scale(
  X_default_tst,
  center = attr(X_default_trn_scaled, "scaled:center"),
  scale = attr(X_default_trn_scaled, "scaled:scale")
)
for (i in seq_along(k_to_try)) {
  pred <- knn(
    train = X_default_trn_scaled,
    test = X_default_tst_scaled,
    cl = y_default_trn,
    k = k_to_try[i]
  )
  err_k[i] <- calc_class_err(y_default_tst, pred)
}
# NOTE(review): outputs recorded later in this script (minimum error and the
# best k) were produced when the test set was standardized with its own
# mean/sd; re-run to refresh them.
# Plot the test error against the number of neighbors.
# The x-values come from k_to_try rather than the implicit vector index, so the
# axis stays correct even if the candidate grid is not 1, 2, ..., n.
plot(k_to_try, err_k,
     type = "b", col = "dodgerblue", cex = 1, pch = 20,
     xlab = "k, number of neighbors", ylab = "classification error",
     main = "(Test) Error Rate vs Neighbors")
# Horizontal reference line at the lowest error observed.
abline(h = min(err_k), col = "darkorange", lty = 3)
# Horizontal reference line at the share of "Yes" cases in the test set, i.e.
# the error rate obtained by predicting "No" for every observation.
abline(h = mean(y_default_tst == "Yes"), col = "grey", lty = 2)

# Lowest test error observed across the candidate values of k.
min(err_k)
## [1] 0.0282
# Value(s) of k achieving that error. The position in err_k is mapped back
# through k_to_try so the result is a k value, not merely a vector index.
k_to_try[which(err_k == min(err_k))]
## [1] 17