What KNN is about
Non-parametric method used for both classification and regression.
Most of the time it is used for classification problems, where the response variable is categorical.
Data Preparation
# Load the raw abalone data; empty strings and "NA" are both read as missing.
abalone <- read.csv("abalone.csv", header = TRUE, na.strings = c("", "NA"))
suppressWarnings(suppressMessages(library(dplyr)))
# Create a numeric encoding of sex (M = 0, F = 1, I = 2). Any other value
# becomes NA because no case_when() branch matches it.
abalone <- abalone %>%
  mutate(sex_num = case_when(
    sex %in% "M" ~ 0,
    sex %in% "F" ~ 1,
    sex %in% "I" ~ 2
  ))
# Create an age class from the ring count: 1-5 young, 6-13 adult, 14-30 old.
# Ring counts outside 1-30 fall through every branch and become NA.
abalone <- abalone %>%
  mutate(age = case_when(
    rings %in% 1:5 ~ "young",
    rings %in% 6:13 ~ "adult",
    rings %in% 14:30 ~ "old"
  ))
# Remove the original sex and rings columns by name rather than by position,
# so the intended columns are dropped even if the CSV column order changes.
abalone <- abalone %>%
  select(-sex, -rings)
str(abalone)
'data.frame': 4177 obs. of 9 variables:
$ length : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
$ diameter: num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
$ height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
$ weight : num 0.514 0.226 0.677 0.516 0.205 ...
$ shucked : num 0.2245 0.0995 0.2565 0.2155 0.0895 ...
$ viscera : num 0.101 0.0485 0.1415 0.114 0.0395 ...
$ shell : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
$ sex_num : num 0 0 1 0 2 2 1 1 0 1 ...
$ age : chr "old" "adult" "adult" "adult" ...
Scale and prepare training and test set
### The dependent variable is age, with the classes young, adult and old.
### Standardize the predictors (every column except age) with scale().
### NOTE(review): the scaling is computed on all rows, before the train/test
### split below, so test-set rows influence the centering and scaling.
set.seed(100)
predictors <- abalone[names(abalone) != "age"]
abalone_scale <- data.frame(scale(predictors))
### Add the target variable, as a factor, to the data set abalone_scale.
abalone$age <- as.factor(abalone$age)
abalone_scale <- cbind(abalone_scale, age = abalone$age)
### Draw half of the rows (rounded down) at random for training; the
### remaining rows form the test set. The sizes are derived from the data
### instead of being hard-coded row counts.
n_obs <- nrow(abalone_scale)
i <- sample(n_obs, n_obs %/% 2)
abalone_train <- abalone_scale[i, ]
abalone_test <- abalone_scale[-i, ]
The value of K is important in the KNN algorithm, because the prediction accuracy in the test set depends on it. The optimal value of K is the value that leads to the highest prediction accuracy.
### Pick a good K with tune.knn() from the e1071 package, which performs a
### 10-fold cross-validation over the candidate values K = 1, ..., 100.
### Column 9 of abalone_train is the age class; the other columns are the
### predictors.
suppressWarnings(suppressMessages(library(e1071)))
t_knn <- tune.knn(
  x = abalone_train[, -9],
  y = factor(abalone_train[, 9]),
  k = 1:100
)
# Print the tuning summary; names(t_knn) lists the available components.
t_knn
Parameter tuning of ‘knn.wrapper’:
- sampling method: 10-fold cross validation
- best parameters:
k
17
- best performance: 0.1379325
# Plot the tuning result (cross-validation performance over the tried K).
plot(t_knn)

Run the prediction
library(class)
# Class labels of the training rows.
age <- abalone_train$age
# tune.knn() stores the winning parameters as a one-row data frame, so
# extract the k column to hand knn() a plain number.
best_k <- t_knn$best.parameters$k
# Classify every test row from its best_k nearest training rows; column 9
# (age) is excluded from the predictors.
pred <- knn(
  train = abalone_train[, -9],
  test = abalone_test[, -9],
  cl = age,
  k = best_k
)
Check the result
### Prediction accuracy in the test set: the share of test rows whose
### predicted class equals the actual age class.
mean(pred == abalone_test$age)
[1] 0.8721876
### Confusion table: rows are the predicted classes, columns the actual ones.
table(pred,abalone_test$age)
pred adult old young
adult 1726 183 55
old 18 41 0
young 11 0 55
The Metrics library gives the same answers as above. If you want the MSE, first convert the age factors above to numeric values.
suppressWarnings(suppressMessages(library(Metrics)))
# accuracy = share of observations whose predicted class equals the actual
# class, i.e. correct predictions / all predictions (not tp / (tp + tn))
accuracy(abalone_test$age, pred)
[1] 0.8721876
# classification error = 1 - the accuracy computed above
ce(abalone_test$age, pred)
[1] 0.1278124
LS0tDQp0aXRsZTogIk5vdGVib29rIEFiYWxvbmUgLSBLLW5lYXJlc3QgbmVpZ2hib3IgKEtOTikgIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyBXaGF0IEtOTiBpcyBhYm91dA0KDQpOb24tcGFyYW1ldHJpYyBtZXRob2QgdXNlZCBmb3IgYm90aCBjbGFzc2lmaWNhdGlvbiBhbmQgcmVncmVzc2lvbi4NCg0KTW9zdCBvZiB0aGUgdGltZXMgaXQgaXMgdXNlZCBmb3IgY2xhc3NpZmljYXRpb24gcHJvYmxlbXMsIHdoZXJlIHRoZSByZXNwb25zZSB2YXJpYWJsZSBpcyBjYXRlZ29yaWNhbA0KDQoNCiMgRGF0YSBQcmVwYXJhdGlvbg0KYGBge3J9DQphYmFsb25lIDwtIHJlYWQuY3N2KCJhYmFsb25lLmNzdiIsIGhlYWRlciA9VCwgbmEuc3RyaW5ncz1jKCIiLCJOQSIpKQ0KDQoNCnN1cHByZXNzV2FybmluZ3Moc3VwcHJlc3NNZXNzYWdlcyhsaWJyYXJ5KGRwbHlyKSkpDQoNCg0KI2NyZWF0ZSBhbiBzZXggY29sdW1uIHRoYXQgaXMgbnVtZXJpYw0KYWJhbG9uZSA8LSBhYmFsb25lICU+JQ0KICBtdXRhdGUoc2V4X251bSA9Y2FzZV93aGVuKA0KICAgIHNleCAlaW4lICdNJyB+IDAsDQogICAgc2V4ICVpbiUgIkYiICB+IDEsDQogICAgc2V4ICVpbiUgIkkiIH4gMg0KICApKQ0KDQojY3JlYXRlIGFuIGFnZSBjb2x1bW4NCmFiYWxvbmUgPC0gYWJhbG9uZSAlPiUNCiAgbXV0YXRlKGFnZT1jYXNlX3doZW4oDQogICAgcmluZ3MgJWluJSAxOjUgfiAieW91bmciLA0KICAgIHJpbmdzICVpbiUgNjoxMyB+ICJhZHVsdCIsDQogICAgcmluZ3MgJWluJSAxNDozMCB+ICJvbGQiDQogICkpDQoNCg0KIyByZW1vdmUgcmluZ3MsIHNleA0KYWJhbG9uZSA8LSBhYmFsb25lW2MoLTEsIC05KV0NCnN0cihhYmFsb25lKQ0KYGBgDQoNCiMgU2NhbGUgYW5kIHByZXBhcmUgdHJhaW5pbmcgYW5kIHRlc3Qgc2V0DQpgYGB7cn0NCiMjIyB0aGUgZGVwZW5kZW50IHZhcmlhYmxlIGlzIGFnZSAsIHdpdGggdGhlIGRpZmZlcmVudCB2YWx1ZXMgeW91bmcgYWR1bHQgb2xkDQojIyMgc3RhbmRhcmRpemUgdGhlIHByZWRpY3RvcnMNCg0Kc2V0LnNlZWQoMTAwKQ0KYWJhbG9uZV9zY2FsZSA8LSBkYXRhLmZyYW1lKHNjYWxlKGFiYWxvbmVbMTo4XSkpDQoNCiMjIyBhZGQgdGhlIHRhcmdldCB2YXJpYWJsZSB0byB0aGUgZGF0YSBzZXQgYWJhbG9uZV9zY2FsZSANCg0KYWJhbG9uZSRhZ2UgPC0gYXMuZmFjdG9yKGFiYWxvbmUkYWdlKQ0KYWJhbG9uZV9zY2FsZSAgPC0gY2JpbmQoYWJhbG9uZV9zY2FsZSwgYWdlID0gYWJhbG9uZSRhZ2UpDQoNCmkgPC0gc2FtcGxlKDQxNzcsIDIwODgpDQphYmFsb25lX3RyYWluIDwtIGFiYWxvbmVfc2NhbGVbaSxdDQphYmFsb25lX3Rlc3QgPC0gYWJhbG9uZV9zY2FsZVstaSxdDQpgYGANCg0KIyBUaGUgdmFsdWUgb2YgSyBpcyBpbXBvcnRhbnQgaW4gdGhlIEtOTiBhbGdvcml0aG0sIGJlY2F1c2UgdGhlIHByZWRpY3Rpb24gYWNjdXJhY3kgaW4gdGhlIHRlc3Qgc2V0IGRlcGVuZHMgb24gaXQuIFRoZSBvcHRpbWFsIHZhbHVlIG9mIEsg
aXMgdGhlIHZhbHVlIHRoYXQgbGVhZHMgdG8gdGhlIGhpZ2hlc3QgcHJlZGljdGlvbiBhY2N1cmFjeS4NCg0KYGBge3J9DQojIyMgd2UgdXNlIHRoZSB0dW5lLmtubiBmdW5jdGlvbiBpbiB0aGUgZTEwNzEgcGFja2FnZSB0byBkZXRlcm1pbmUgYSBnb29kIEsgbnVtYmVyDQojIyMgdGhpcyBmdW5jdGlvbiBwZXJmb3JtcyBhIDEwLWZvbGQgY3Jvc3MtdmFsaWRhdGlvbg0KDQoNCnN1cHByZXNzV2FybmluZ3Moc3VwcHJlc3NNZXNzYWdlcyhsaWJyYXJ5KGUxMDcxKSkpDQoNCnRfa25uIDwtIHR1bmUua25uKGFiYWxvbmVfdHJhaW5bLC05XSwgZmFjdG9yKGFiYWxvbmVfdHJhaW5bLDldKSwgayA9IDE6MTAwKQ0KDQp0X2tubiAjIG5hbWVzKHRfa25uKSB0byBzZWUgdGhlIGxpc3Qgb2YgdmFyaWFibGVzDQoNCnBsb3QodF9rbm4pDQpgYGANCg0KIyBSdW4gdGhlIHByZWRpY3Rpb24NCmBgYHtyfQ0KbGlicmFyeShjbGFzcykNCmFnZSA8LSBhYmFsb25lX3RyYWluJGFnZQ0KcHJlZCA8LSBrbm4odHJhaW4gPSBhYmFsb25lX3RyYWluWywtOV0sIHRlc3QgPSBhYmFsb25lX3Rlc3RbLC05XSwgY2wgPSBhZ2UsIGsgPSB0X2tubiRiZXN0LnBhcmFtZXRlcnMpDQpgYGANCg0KIyBjaGVjayB0aGUgcmVzdWx0DQpgYGB7cn0NCiMjIyBnZXQgdGhlIHByZWRpY3Rpb24gYWNjdXJhY3kgaW4gdGhlIHRlc3Qgc2V0DQptZWFuKHByZWQgPT0gYWJhbG9uZV90ZXN0JGFnZSkNCnRhYmxlKHByZWQsYWJhbG9uZV90ZXN0JGFnZSkNCmBgYA0KDQojIFRoaXMgbGlicmFyeSAgYWxzbyBzaG93cyBzYW1lIGFuc3dlcnMgYWJvdmUuIElmIHlvdSB3YW50IG1zZSwgY29udmVydCB0aGUgYWJvdmUgYWdlIGZhY3RvcnMgaW50byBudW1lcmljLg0KDQpgYGB7cn0NCnN1cHByZXNzV2FybmluZ3Moc3VwcHJlc3NNZXNzYWdlcyhsaWJyYXJ5KE1ldHJpY3MpKSkNCiMgdHAgLyB0cCt0bg0KYWNjdXJhY3koYWJhbG9uZV90ZXN0JGFnZSwgcHJlZCkNCg0KIyBjbGFzc2lmaWNhdGlvbiBlcnJvciA9IDEtYWNjdXJhY3kgYWJvdmUNCmNlKGFiYWxvbmVfdGVzdCRhZ2UsIHByZWQpDQoNCmBgYA0KDQo=