# Install the mlba companion package from GitHub if it is not available yet.
# requireNamespace() only checks for the package (no attach, no warning), and
# devtools is called through its namespace so it does not need to be attached.
if (!requireNamespace("mlba", quietly = TRUE)) {
  devtools::install_github("gedeck/mlba/mlba", force = TRUE)
}
# Attach mlba in every case so the session state does not depend on whether
# the package had to be installed first.
library(mlba)

The \(k\)-NN Classifier (Categorical Outcome)

Example: Riding Mowers

library(ggplot2)  # ggplot(), aes(), geom_point(), ggsave() are used below
library(ggrepel)  # geom_text_repel() for the point labels

# Riding-mower example data; the code below uses the columns Income, Lot_Size
# and Ownership.
mowers.df <- mlba::RidingMowers
set.seed(35)  # fixed seed so the random partition is reproducible

# Randomly assign 60% of the rows to training and keep the rest as holdout.
# floor() hands sample() a whole-number size instead of a fractional one.
idx <- sample(nrow(mowers.df), floor(0.6 * nrow(mowers.df)))
train.df <- mowers.df[idx, ]
holdout.df <- mowers.df[-idx, ]
## new household
new.df <- data.frame(Income = 60, Lot_Size = 20)

# Quick look: training records labelled with their row names, plus the new
# household drawn as an additional "New" category.
ggplot(mapping = aes(x = Income, y = Lot_Size,
                     shape = Ownership, color = Ownership)) +
  geom_point(data = train.df) +
  geom_text_repel(aes(label = rownames(train.df)), data = train.df,
                  show.legend = FALSE) +
  geom_point(data = cbind(new.df, Ownership = "New"))

# Same plot with larger markers and manually chosen shapes/colours; kept in
# `g` so it can be both displayed and written to disk.
g <- ggplot(mapping = aes(x = Income, y = Lot_Size, shape = Ownership,
                          color = Ownership, fill = Ownership)) +
  geom_point(data = train.df, size = 4) +
  geom_text_repel(aes(label = rownames(train.df)), data = train.df,
                  show.legend = FALSE) +
  geom_point(data = cbind(new.df, Ownership = "New"), size = 5) +
  scale_shape_manual(values = c(18, 15, 21)) +
  scale_color_manual(values = c("black", "darkorange", "steelblue")) +
  scale_fill_manual(values = c("black", "darkorange", "lightblue"))

g

# Write the figure below the current working directory. The target folder is
# created first so ggsave() does not fail on a fresh checkout, and `filename`
# / `plot` are spelled out instead of relying on partial argument matching.
fig.dir <- file.path(getwd(), "figures", "chapter_07")
dir.create(fig.dir, recursive = TRUE, showWarnings = FALSE)
ggsave(filename = file.path(fig.dir, "knn-riding-mower.pdf"),
       plot = g + theme_bw(), width = 6, height = 4, units = "in")
library(caret)
# train k-NN model with k=3
# trainControl(method = "none") fits this single k without any resampling;
# centering and scaling are stored in the model and re-applied by predict().
model <- train(Ownership ~ ., data = train.df,
               method = "knn",  # specify the model
               preProcess = c("center", "scale"),  # normalize data
               tuneGrid = expand.grid(k = 3),
               trControl = trainControl(method = "none"))
model
#> k-Nearest Neighbors 
#> 
#> 14 samples
#>  2 predictor
#>  2 classes: 'Nonowner', 'Owner' 
#> 
#> Pre-processing: centered (2), scaled (2) 
#> Resampling: None
# predict new data point
predict(model, new.df)
#> [1] Owner
#> Levels: Nonowner Owner
# determine nearest neighbors to new data point
# Apply the model's stored centering/scaling to both data sets so distances
# are measured in the same normalized space.
train.norm.df <- predict(model$preProcess, train.df)
new.norm.df <- predict(model$preProcess, new.df)
# Select the predictors by name and work on a numeric matrix / numeric vector
# rather than subtracting a one-row data frame from each row.
predictors <- c("Income", "Lot_Size")
new.point <- unlist(new.norm.df[1, predictors])
# Euclidean distance of every training row to the new point; the result keeps
# the row names of train.df as vector names.
distances <- sqrt(rowSums(
  sweep(as.matrix(train.norm.df[, predictors]), 2, new.point)^2
))
rownames(train.df)[head(order(distances), 3)]
#> [1] "9"  "14" "1"

Choosing \(k\)

# use leave-one-out cross-validation for small dataset
# No `number` argument is passed: with leave-one-out the number of folds is
# determined by the number of training rows, so a fold count would only be
# misleading.
# NOTE(review): caret documents this method as "LOOCV"; the lower-case
# spelling runs, but the all-zero Kappa column below looks like metrics being
# computed on single-row folds. Confirm which spelling is intended before
# relying on Kappa.
trControl <- trainControl(method="loocv", allowParallel=TRUE)
# Evaluate the odd values k = 1, 3, ..., 13 and let train() pick the k with
# the highest accuracy.
model <- train(Ownership ~ ., data=train.df,
               method="knn",
               preProcess=c("center", "scale"),
               tuneGrid=expand.grid(k=seq(1, 13, 2)),
               trControl=trControl)
model
#> k-Nearest Neighbors 
#> 
#> 14 samples
#>  2 predictor
#>  2 classes: 'Nonowner', 'Owner' 
#> 
#> Pre-processing: centered (2), scaled (2) 
#> Resampling: Leave-One-Out Cross-Validation 
#> Summary of sample sizes: 13, 13, 13, 13, 13, 13, ... 
#> Resampling results across tuning parameters:
#> 
#>   k   Accuracy   Kappa
#>    1  0.4285714  0    
#>    3  0.6428571  0    
#>    5  0.6428571  0    
#>    7  0.7142857  0    
#>    9  0.5714286  0    
#>   11  0.6428571  0    
#>   13  0.6428571  0    
#> 
#> Accuracy was used to select the optimal model using the largest value.
#> The final value used for the model was k = 7.
# Refit on the complete data set with the selected k. The value is read from
# the tuning result (k = 7 in the run recorded above) instead of being
# hard-coded, so it stays in sync if the partition or the grid changes.
model <- train(Ownership ~ ., data=mowers.df,
               method="knn",
               preProcess=c("center", "scale"),
               tuneGrid=expand.grid(k=model$bestTune$k),
               trControl=trainControl(method="none"))
predict(model, new.df)
#> [1] Owner
#> Levels: Nonowner Owner

Setting the Cutoff Value

# List the eight training records closest to the new household, nearest
# first, to show how the class mix among the neighbours changes with k.
# NOTE(review): `model` was last fit on mowers.df, so the centering/scaling
# applied here comes from the full data while the candidate neighbours are
# the training rows only - confirm this is intended.
train.norm.df <- predict(model$preProcess, train.df)
new.norm.df <- predict(model$preProcess, new.df)
# Select the predictors by name and work on a numeric matrix / numeric vector
# rather than subtracting a one-row data frame from each row.
predictors <- c("Income", "Lot_Size")
new.point <- unlist(new.norm.df[1, predictors])
# Euclidean distance of every training row to the new point in normalized
# units.
distances <- sqrt(rowSums(
  sweep(as.matrix(train.norm.df[, predictors]), 2, new.point)^2
))
# head() returns at most eight indices, so no NA rows appear if the training
# set is ever smaller than eight rows.
train.df[head(order(distances), 8), ]
#>    Income Lot_Size Ownership
#> 9    69.0     20.0     Owner
#> 14   52.8     20.8  Nonowner
#> 1    60.0     18.4     Owner
#> 13   75.0     19.6  Nonowner
#> 11   51.0     22.0     Owner
#> 18   49.2     17.6  Nonowner
#> 15   64.8     17.2  Nonowner
#> 8    82.8     22.4     Owner