library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the riding-mower data and split it 60/40 into training and validation
# sets. Rows are sampled by row name, with a fixed seed for reproducibility.
mower.df <- read.csv("DMBA-R-datasets/RidingMowers.csv")
set.seed(111)
train.index <- sample(row.names(mower.df), 0.6 * nrow(mower.df))
# Every row name that was not drawn for training goes to validation.
valid.index <- setdiff(row.names(mower.df), train.index)
train.df <- mower.df[train.index, ]
valid.df <- mower.df[valid.index, ]

# The new household that will be classified.
new.df <- data.frame(Income = 60, Lot_Size = 20)

# Scatter plot of the training records: owners use plotting symbol 1, all
# other records symbol 3; each point is labelled with its row name and the
# new household is marked with an "X" at (60, 20).
plot(
  Lot_Size ~ Income,
  data = train.df,
  pch = with(train.df, ifelse(Ownership == "Owner", 1, 3))
)
with(train.df, text(Income, Lot_Size, labels = rownames(train.df), pos = 4))
text(x = 60, y = 20, labels = "X")
legend(
  "topright",
  legend = c("owner", "non-owner", "newhousehold"),
  pch = c(1, 3, 4)
)
# Choosing k
# Start each normalized data frame (complete, validation, training) as a plain
# copy of its original; only the two predictor columns are overwritten below.
mower.norm.df <- mower.df
valid.norm.df <- valid.df
train.norm.df <- train.df
# caret's preProcess() learns the centering/scaling parameters for columns 1:2
# (Income and Lot_Size) from the training rows only.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
norm.values <- preProcess(train.df[, 1:2], method = c("center", "scale"))
# predict() applies those training-derived parameters to every data set, so
# the validation rows, the complete data and the new household all end up on
# the training scale.
new.norm.df <- predict(norm.values, newdata = new.df)
mower.norm.df[, 1:2] <- predict(norm.values, newdata = mower.df[, 1:2])
valid.norm.df[, 1:2] <- predict(norm.values, newdata = valid.df[, 1:2])
train.norm.df[, 1:2] <- predict(norm.values, newdata = train.df[, 1:2])
# Classify the new household from its 3 nearest neighbours in the normalized
# training set. knn() comes from the FNN package loaded here; its result
# carries the neighbour positions and distances as the "nn.index" and
# "nn.dist" attributes (see the printed output below).
library(FNN)
nn <- knn(
  train = train.norm.df[, 1:2],
  test = new.norm.df,
  cl = train.norm.df[[3]],
  k = 3
)
# nn.index holds row positions within the training set; translate them back
# to the row names of the original data.
rownames(train.df)[attr(nn, "nn.index")]
## [1] "4" "14" "3"
nn
## [1] Owner
## attr(,"nn.index")
## [,1] [,2] [,3]
## [1,] 12 1 6
## attr(,"nn.dist")
## [,1] [,2] [,3]
## [1,] 0.3315012 0.5781654 0.7092952
## Levels: Owner
# Results table: one row per candidate k (1 to 14), accuracy starting at 0.
accuracy.df <- data.frame(
  k = seq(from = 1, to = 14, by = 1),
  accuracy = numeric(14)
)
# Both class labels, in the order used for the confusion matrices below.
class.levels <- c("Nonowner", "Owner")
# For each candidate k, classify the validation rows from the training rows
# and store the resulting accuracy in the matching row of accuracy.df.
for (i in seq_len(nrow(accuracy.df))) {
  knn.pred <- knn(
    train.norm.df[, 1:2],
    valid.norm.df[, 1:2],
    cl = train.norm.df[[3]],
    k = i
  )
  # Predicted and actual labels are rebuilt with the same two levels so that
  # confusionMatrix() compares like with like even if a factor carries only
  # one of the classes.
  accuracy.df[i, "accuracy"] <- confusionMatrix(
    factor(knn.pred, levels = class.levels),
    factor(valid.norm.df[[3]], levels = class.levels)
  )$overall["Accuracy"]
}
accuracy.df
## k accuracy
## 1 1 0.4
## 2 2 0.3
## 3 3 0.5
## 4 4 0.4
## 5 5 0.5
## 6 6 0.3
## 7 7 0.4
## 8 8 0.4
## 9 9 0.5
## 10 10 0.3
## 11 11 0.3
## 12 12 0.3
## 13 13 0.3
## 14 14 0.3
# Best k = 3 (validation accuracy 0.5, tied with k = 5 and k = 9); this differs from the textbook.
# Final classification of the new household with the chosen k = 3, this time
# using every record (training + validation) of the normalized complete data
# as the reference set.
knn.pred.new <- knn(mower.norm.df[, 1:2], new.norm.df,
                    cl = mower.norm.df[, 3], k = 3)
# Inspect the neighbours of THIS model. The "nn.index" attribute holds row
# positions within the data passed as the training set, i.e. mower.norm.df,
# so the positions are mapped through the row names of mower.df. Earlier this
# step read the attribute from `nn` (the training-only fit) and indexed
# train.df, which merely repeated the previous result instead of reporting
# the final model.
row.names(mower.df)[attr(knn.pred.new, "nn.index")]
knn.pred.new
# NOTE(review): the console output previously recorded here was a verbatim
# copy of the training-only `nn` result and has been removed; re-run the
# script to capture the output of knn.pred.new.