library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(class)
library(readr)
# Data preparation
data("Default")
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
set.seed(123)
# Recode the two-level student factor as a numeric column (integer codes
# 1/2 shifted down to 0/1) so that all predictors are numeric.
Default$student <- as.numeric(Default$student) - 1
# Randomly pick 5000 row indices for training; the rows that were not
# picked make up the test set.
default_idx <- sample(x = nrow(Default), size = 5000)
default_trn <- Default[default_idx, ]
default_tst <- Default[-default_idx, ]
# Training data: response (`default`) and predictors (every column except
# the first, which is the response)
y_default_trn <- default_trn[["default"]]
X_default_trn <- default_trn[, -1, drop = FALSE]
# Testing data, split the same way
y_default_tst <- default_tst[["default"]]
X_default_tst <- default_tst[, -1, drop = FALSE]
# Prediction: classify the test observations with k = 3 neighbours and
# show the first few predicted labels
pred_k3 <- knn(
  train = X_default_trn,
  test = X_default_tst,
  cl = y_default_trn,
  k = 3
)
head(pred_k3)
## [1] No No No No No No
## Levels: No Yes
# Function to compute the classification error rate.
#
# Arguments:
#   actual    - vector or factor of true class labels.
#   predicted - vector or factor of predicted labels; must have the same
#               length as `actual`.
# Returns:
#   The proportion of positions at which `actual` and `predicted` differ.
# Errors:
#   Stops when the two lengths differ. Without this guard `!=` recycles the
#   shorter vector and the function returns a misleading rate.
calc_class_err <- function(actual, predicted) {
  stopifnot(length(actual) == length(predicted))
  mean(actual != predicted)
}
# Test error rate of the k = 5 classifier on the unscaled predictors
pred_k5_raw <- knn(
  train = X_default_trn,
  test = X_default_tst,
  cl = y_default_trn,
  k = 5
)
calc_class_err(actual = y_default_tst, predicted = pred_k5_raw)
## [1] 0.033
# For predictors that have been standardised.
# The test set is standardised with the TRAINING means and standard
# deviations (stored by scale() as attributes of its result). Scaling the
# test set with its own statistics would put the two sets on different
# scales and make each test prediction depend on the other test rows.
X_default_trn_scaled <- scale(X_default_trn)
X_default_tst_scaled <- scale(
  X_default_tst,
  center = attr(X_default_trn_scaled, "scaled:center"),
  scale = attr(X_default_trn_scaled, "scaled:scale")
)
calc_class_err(actual = y_default_tst,
               predicted = knn(train = X_default_trn_scaled,
                               test = X_default_tst_scaled,
                               cl = y_default_trn,
                               k = 5))
## [1] 0.032
# NOTE(review): the value above was recorded when the test set was scaled
# with its own statistics; re-run to refresh it.

# Choosing the best k
set.seed(123)
k_to_try <- 1:100
# One error rate per candidate k, preallocated
err_k <- numeric(length(k_to_try))
# The scaled matrices do not depend on k, so they are computed once above
# rather than inside the loop.
for (i in seq_along(k_to_try)) {
  pred <- knn(train = X_default_trn_scaled,
              test = X_default_tst_scaled,
              cl = y_default_trn,
              k = k_to_try[i])
  err_k[i] <- calc_class_err(y_default_tst, pred)
}
# Plot test error against the candidate values of k.
# k_to_try is used for the x axis, and below to report the best k, instead
# of the position in err_k. The two coincide only while the grid is exactly
# 1:100; using k_to_try keeps the plot and the reported k correct if the
# grid changes.
plot(k_to_try, err_k, type = "b", col = "dodgerblue", cex = 1, pch = 20,
     xlab = "k, number of neighbors", ylab = "classification error",
     main = "(Test) Error Rate vs Neighbors")

# Add a line for the minimum error seen
abline(h = min(err_k), col = "darkorange", lty = 3)

# Add a line for the minority prevalence in the test set
abline(h = mean(y_default_tst == "Yes"), col = "grey", lty = 2)

# Smallest test error over the grid (recorded output from a previous run)
min(err_k)
## [1] 0.0282
# Value(s) of k that attain the smallest error; all ties are reported
k_to_try[which(err_k == min(err_k))]
## [1] 17