Example: Riding Mowers
library(ggrepel)
# Load the riding-mower data and draw a random ~60% of its rows as the
# training partition; every row not drawn becomes the holdout partition.
mowers.df <- mlba::RidingMowers
set.seed(35)  # fixed seed so the partition is reproducible
idx <- sample(nrow(mowers.df), 0.6 * nrow(mowers.df))
train.df <- mowers.df[idx, ]
holdout.df <- mowers.df[-idx, ]

# The new household that is to be classified.
new.df <- data.frame(Income = 60, Lot_Size = 20)

# Scatter plot of the training rows (labelled with their row names) with the
# new household overlaid as a separate 'New' group.
ggplot(
  mapping = aes(x = Income, y = Lot_Size, shape = Ownership, color = Ownership)
) +
  geom_point(data = train.df) +
  geom_text_repel(
    mapping = aes(label = rownames(train.df)),
    data = train.df,
    show.legend = FALSE
  ) +
  geom_point(data = cbind(new.df, Ownership = 'New'))

# Publication version of the training scatter plot: larger markers plus
# manually chosen shapes, outline colours and fill colours for the three
# Ownership groups (the two training classes and the 'New' household).
# The manual scale values are unnamed, so they are assigned to the groups in
# the order in which the scale lists them.
g <- ggplot(
  mapping = aes(x = Income, y = Lot_Size, shape = Ownership,
                color = Ownership, fill = Ownership)
) +
  geom_point(data = train.df, size = 4) +
  geom_text_repel(
    mapping = aes(label = rownames(train.df)),
    data = train.df,
    show.legend = FALSE
  ) +
  geom_point(data = cbind(new.df, Ownership = 'New'), size = 5) +
  scale_shape_manual(values = c(18, 15, 21)) +
  scale_color_manual(values = c('black', 'darkorange', 'steelblue')) +
  scale_fill_manual(values = c('black', 'darkorange', 'lightblue'))
g

# Save the figure as a 6in x 4in PDF with the black-and-white theme applied.
# The output directory is created first (no-op if it already exists) so the
# save does not fail on a fresh checkout, and `filename`/`plot` are spelled
# out instead of relying on partial argument matching (`file=`) and position.
figure.dir <- file.path(getwd(), "figures", "chapter_07")
dir.create(figure.dir, recursive = TRUE, showWarnings = FALSE)
ggsave(
  filename = file.path(figure.dir, "knn-riding-mower.pdf"),
  plot = g + theme_bw(),
  width = 6, height = 4, units = "in"
)
library(caret)
# Fit a k-nearest-neighbours classifier with k fixed at 3 on the training
# partition. Predictors are centered and scaled as part of the model, and
# resampling is switched off so exactly this one model is fitted.
model <- train(
  Ownership ~ .,
  data = train.df,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(k = 3),
  trControl = trainControl(method = "none")
)
model
#> k-Nearest Neighbors
#>
#> 14 samples
#> 2 predictor
#> 2 classes: 'Nonowner', 'Owner'
#>
#> Pre-processing: centered (2), scaled (2)
#> Resampling: None

# Classify the new household with the fitted model.
predict(model, newdata = new.df)
#> [1] Owner
#> Levels: Nonowner Owner

# Find the training rows closest to the new household. Both the training
# rows and the new point are passed through the model's own preprocessing
# step, so distances are measured on the same scale the classifier uses.
train.norm.df <- predict(model$preProcess, newdata = train.df)
new.norm.df <- predict(model$preProcess, newdata = new.df)
# Euclidean distance from each training row (first two columns, i.e. the
# predictors) to the normalized new point.
distances <- apply(
  X = train.norm.df[, 1:2],
  MARGIN = 1,
  FUN = function(point) {
    sqrt(sum((point - new.norm.df)^2))
  }
)
# Row names of the three closest training rows, nearest first.
rownames(train.df)[order(distances)[1:3]]
#> [1] "9" "14" "1"
Choosing \(k\)
# Choose k by leave-one-out cross-validation, which suits a dataset this
# small: candidate values are the odd numbers 1, 3, ..., 13, and predictors
# are centered and scaled as part of each fit.
# `number` is intentionally not passed: leave-one-out holds out a single row
# per resample (see the sample sizes of 13 below), so a fold count has no
# effect here and only misleads the reader.
# NOTE(review): caret documents this method as "LOOCV"; the lowercase
# spelling is kept so the recorded output below stays valid. The all-zero
# Kappa column presumably comes from scoring each one-row holdout
# separately -- confirm before relying on Kappa.
trControl <- trainControl(method = "loocv", allowParallel = TRUE)
model <- train(
  Ownership ~ .,
  data = train.df,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(k = seq(1, 13, 2)),
  trControl = trControl
)
model
#> k-Nearest Neighbors
#>
#> 14 samples
#> 2 predictor
#> 2 classes: 'Nonowner', 'Owner'
#>
#> Pre-processing: centered (2), scaled (2)
#> Resampling: Leave-One-Out Cross-Validation
#> Summary of sample sizes: 13, 13, 13, 13, 13, 13, ...
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 1 0.4285714 0
#> 3 0.6428571 0
#> 5 0.6428571 0
#> 7 0.7142857 0
#> 9 0.5714286 0
#> 11 0.6428571 0
#> 13 0.6428571 0
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final value used for the model was k = 7.
# Refit the classifier on the complete data set (training and holdout rows
# together) with k fixed at 7 and no resampling, then classify the new
# household with this final model.
model <- train(
  Ownership ~ .,
  data = mowers.df,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(k = 7),
  trControl = trainControl(method = "none")
)
predict(model, newdata = new.df)
#> [1] Owner
#> Levels: Nonowner Owner
Setting the Cutoff Value
# List the eight training rows closest to the new household, nearest first,
# together with their Ownership labels.
# Normalization uses the preprocessing stored in the current `model`, which
# at this point is the one fitted on all of mowers.df, while the neighbours
# are looked up among the rows of train.df only.
# NOTE(review): the neighbour pool (train.df) differs from the data the
# current model was fitted on (mowers.df) -- confirm this is intended.
train.norm.df <- predict(model$preProcess, newdata = train.df)
new.norm.df <- predict(model$preProcess, newdata = new.df)
# Euclidean distance from each training row (first two columns, i.e. the
# predictors) to the normalized new point.
distances <- apply(
  X = train.norm.df[, 1:2],
  MARGIN = 1,
  FUN = function(point) {
    sqrt(sum((point - new.norm.df)^2))
  }
)
train.df[order(distances)[1:8], ]
#> Income Lot_Size Ownership
#> 9 69.0 20.0 Owner
#> 14 52.8 20.8 Nonowner
#> 1 60.0 18.4 Owner
#> 13 75.0 19.6 Nonowner
#> 11 51.0 22.0 Owner
#> 18 49.2 17.6 Nonowner
#> 15 64.8 17.2 Nonowner
#> 8 82.8 22.4 Owner