library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the riding-mower households data set from the DMBA data directory.
mower.df <- read.csv(file = "DMBA-R-datasets/RidingMowers.csv")

Example: Riding Mowers

# Reproducible split: 60% of the rows (chosen by row name) go to training,
# every remaining row goes to validation.
set.seed(111)
train.index <- sample(rownames(mower.df), 0.6 * nrow(mower.df))
valid.index <- rownames(mower.df)[!(rownames(mower.df) %in% train.index)]
train.df <- mower.df[train.index, ]
valid.df <- mower.df[valid.index, ]

# The new household that is to be classified.
new.df <- data.frame(Income = 60, Lot_Size = 20)

# Scatter plot of the training households: symbol 1 marks owners, symbol 3
# marks everyone else; each point is labelled with its row name and the new
# household is drawn as an "X".
plot(
  Lot_Size ~ Income,
  data = train.df,
  pch = c(3, 1)[(train.df$Ownership == "Owner") + 1]
)
with(train.df, text(Income, Lot_Size, labels = rownames(train.df), pos = 4))
text(new.df$Income, new.df$Lot_Size, "X")
legend(
  "topright",
  legend = c("owner", "non-owner", "newhousehold"),
  pch = c(1, 3, 4)
)

# Choosing k

# Start the normalized versions of the complete, training and validation data
# frames as plain copies of the originals; only their first two columns are
# replaced with normalized values afterwards.
mower.norm.df <- mower.df
train.norm.df <- train.df
valid.norm.df <- valid.df
# caret supplies preProcess(), which is used to normalize Income and Lot_Size.
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Estimate the centering/scaling parameters for the two predictor columns from
# the training rows only, then apply that same transformation to the training,
# validation and complete data as well as to the new household.
norm.values <- preProcess(train.df[, 1:2], method = c("center", "scale"))
train.norm.df[, 1:2] <- predict(norm.values, newdata = train.df[, 1:2])
valid.norm.df[, 1:2] <- predict(norm.values, newdata = valid.df[, 1:2])
mower.norm.df[, 1:2] <- predict(norm.values, newdata = mower.df[, 1:2])
new.norm.df <- predict(norm.values, newdata = new.df)

# Classify the new household from its 3 nearest training neighbours.
# FNN is attached right before the call; the object returned by knn() carries
# the neighbours in its "nn.index" and "nn.dist" attributes (see printout).
library(FNN)
nn <- knn(
  train = train.norm.df[, 1:2],
  test = new.norm.df,
  cl = train.norm.df[[3]],
  k = 3
)
# Translate the neighbour positions into the row names of the training data.
rownames(train.df)[attr(nn, "nn.index")]
## [1] "4"  "14" "3"
nn
## [1] Owner
## attr(,"nn.index")
##      [,1] [,2] [,3]
## [1,]   12    1    6
## attr(,"nn.dist")
##           [,1]      [,2]      [,3]
## [1,] 0.3315012 0.5781654 0.7092952
## Levels: Owner

ACCURACY (OR CORRECT RATE) OF k-NN PREDICTIONS IN VALIDATION SET FOR VARIOUS CHOICES OF k

# Validation accuracy of k-NN for every candidate k.
# The candidate range is derived from the number of training rows instead of
# repeating a hard-coded 14 in three places, so it stays valid if the split
# changes.
# NOTE(review): assumes the former literal 14 was the training-set size - confirm.
max.k <- nrow(train.norm.df)
accuracy.df <- data.frame(k = seq_len(max.k), accuracy = numeric(max.k))

# Both factors share the same two levels so that confusionMatrix() still gets
# matching levels when a prediction vector contains only one class.
class.levels <- c("Nonowner", "Owner")
# The validation labels do not depend on k, so build this factor once.
valid.actual <- factor(valid.norm.df[, 3], levels = class.levels)

# Fit on the normalized training data and score on the validation data.
for (i in seq_len(max.k)) {
  knn.pred <- knn(train.norm.df[, 1:2], valid.norm.df[, 1:2],
                  cl = train.norm.df[, 3], k = i)
  cm <- confusionMatrix(factor(knn.pred, levels = class.levels), valid.actual)
  # Look the statistic up by name rather than by its position in $overall.
  accuracy.df[i, "accuracy"] <- cm$overall[["Accuracy"]]
}
accuracy.df
##     k accuracy
## 1   1      0.4
## 2   2      0.3
## 3   3      0.5
## 4   4      0.4
## 5   5      0.5
## 6   6      0.3
## 7   7      0.4
## 8   8      0.4
## 9   9      0.5
## 10 10      0.3
## 11 11      0.3
## 12 12      0.3
## 13 13      0.3
## 14 14      0.3

Best k = 3, with a validation accuracy of 0.5 (k = 5 and k = 9 tie at the same accuracy). This differs from the textbook's result.

CLASSIFYING A NEW HOUSEHOLD USING THE “BEST K” = 3

# Classify the new household with k = 3, using all households (the complete
# normalized data set) as the reference set.
knn.pred.new <- knn(mower.norm.df[, 1:2], new.norm.df,
                    cl = mower.norm.df[, 3], k = 3)
# Inspect this fit rather than the earlier training-only result `nn`.
# The neighbour indices refer to rows of mower.norm.df, which shares its row
# names with mower.df, so they are mapped through mower.df (not train.df).
row.names(mower.df)[attr(knn.pred.new, "nn.index")]
knn.pred.new
## NOTE: the output previously shown here was that of `nn` (the training-only
## fit printed earlier in this document), not of knn.pred.new. Re-run this
## chunk to obtain the output for knn.pred.new.