require(devtools)
## Loading required package: devtools
## Loading required package: usethis
install_version("ElemStatLearn", version = "2015.6.26.2", repos = "http://cran.us.r-project.org")

KNN Classification (Diabetes)

#install.packages("mlbench") # run this line if you don't have the package
 library(mlbench)
 library(ElemStatLearn)
  data(PimaIndiansDiabetes)

  set.seed(2)
  trainid = sample(1:nrow(PimaIndiansDiabetes), nrow(PimaIndiansDiabetes)/2)
  Diab.train = PimaIndiansDiabetes[trainid, ]
  Diab.test = PimaIndiansDiabetes[-trainid, ]
 
  #install.packages("FNN") 
  library(FNN)
  
  kgrid = c(1:20)
  TrainingError = rep(NA, 20)
  TestError = rep(NA, 20)
for (i in 1:20) {
  # training error: classify the training set using itself as the reference
  TrainingError[i] = mean(FNN::knn(train = Diab.train[, -9], test = Diab.train[, -9],
                                   cl = as.factor(Diab.train[, 9]), k = kgrid[i]) != Diab.train[, 9])
  # test error: classify the held-out half
  TestError[i] = mean(FNN::knn(train = Diab.train[, -9], test = Diab.test[, -9],
                               cl = as.factor(Diab.train[, 9]), k = kgrid[i]) != Diab.test[, 9])
}
plot(kgrid, TestError, ylim = c(0, max(TestError)),ylab = "Test error", 
     xlab = "K value", pch = 16, col = "6")
points(kgrid, TrainingError, pch = 16, col = "4")
legend("bottom", c("Training error", "Test error"), col = c("4", "6"), 
       pch = 16, cex = 1.5, bg='lightblue')

which.min(TestError)
## [1] 9

The test error curve shows the expected U shape as \(k\) increases, and the optimal \(k\) value is 9.
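
To inspect the classifier at the selected \(k = 9\), the test-set predictions can be tabulated against the observed outcomes. This is a small illustrative sketch, not part of the original output.

# Sketch: confusion table on the test set at the selected k = 9
pred9 = FNN::knn(train = Diab.train[, -9], test = Diab.test[, -9],
                 cl = as.factor(Diab.train[, 9]), k = 9)
table(predicted = pred9, observed = Diab.test[, 9])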

KNN Classification (Handwritten Digit)

library(ElemStatLearn)
# column 1 of zip.train / zip.test is the digit label; the remaining 256 columns are pixel values
mydata = zip.train[sample(1:nrow(zip.train), size = 500), ]
kgrid = 1:20
TrainingError = rep(NA, 20)
TestError = rep(NA, 20)
for (i in 1:20) {
  TrainingError[i] = mean(FNN::knn(train = mydata[, -1], test = mydata[, -1],
                                   cl = as.factor(mydata[, 1]), k = kgrid[i]) != mydata[, 1])
  TestError[i] = mean(FNN::knn(train = mydata[, -1], test = zip.test[, -1],
                               cl = as.factor(mydata[, 1]), k = kgrid[i]) != zip.test[, 1])
}
plot(kgrid, TestError, ylim = c(0, max(TestError)), ylab = "Test error", 
     xlab = "K value", pch = 16, col = "6")
points(kgrid, TrainingError, pch = 16, col = "4")
legend("bottom", c("Training error", "Test error"), col = c("4", "6"), 
       pch = 16, cex = 1.5, bg = 'lightblue')

which.min(TestError)
## [1] 1

The optimal \(k\) value is 1, and the test error curve does not show the usual U shape. The number of covariates (\(p = 256\) pixel values) is large relative to the 500 sampled training images, so the nearest-neighbor distances carry less information; working with a smaller number of covariates (a lower-dimensional subspace) may improve the results.
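
As a minimal sketch of this idea (not part of the original analysis), the digits could be projected onto their leading principal components before running KNN. The choices of 30 components and \(k = 5\) below are arbitrary and only for illustration.

# Sketch: reduce p with PCA, then run KNN in the lower-dimensional space
pca = prcomp(mydata[, -1])
ncomp = 30
train_pc = pca$x[, 1:ncomp]
test_pc = scale(zip.test[, -1], center = pca$center, scale = FALSE) %*% pca$rotation[, 1:ncomp]
mean(FNN::knn(train = train_pc, test = test_pc,
              cl = as.factor(mydata[, 1]), k = 5) != zip.test[, 1])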

Writing my own KNN algorithm

a. Generate data with \(n=1000\) and \(p=5\), and write a KNN function

myknn <- function(xtest, xtrain, ytrain, k){
  # Case 1: a single test point supplied as a vector
  if (is.vector(xtest)){
    if (length(xtest) != ncol(xtrain)) stop("xtest and xtrain dimensions do not match")
    # Manhattan (L1) distance from the test point to every training point
    distance = rowSums(abs(sweep(xtrain, MARGIN = 2, xtest, FUN = "-")))
    # average the outcomes of the k nearest neighbors (ties broken at random)
    return( mean(ytrain[rank(distance, ties.method = "random") <= k]) )
  }
  # Case 2: several test points supplied as a matrix; predict row by row
  if (is.matrix(xtest)){
    if (ncol(xtest) != ncol(xtrain)) stop("xtest and xtrain dimensions do not match")
    ytest = rep(NA, nrow(xtest))
    for (i in 1:nrow(xtest)){
      distance = rowSums(abs(sweep(xtrain, MARGIN = 2, xtest[i, ], FUN = "-")))
      ytest[i] = mean(ytrain[rank(distance, ties.method = "random") <= k])
    }
    return(ytest)
  }
  stop("xtest is not a vector or a matrix")
}

b. Model Evaluation

set.seed(123)
N = 1000; P = 5
X = matrix(rnorm(N*P), N, P)
Y = X[,1] + 0.5*X[,2] - X[,3] + rnorm(N)
ntrain = 500
xtrain = X[1:ntrain, ]
xtest = X[(ntrain+1):N, ]
ytrain = Y[1:ntrain]
ytest = Y[(ntrain+1):N]
# Mean squared error
mean((myknn(xtest, xtrain, ytrain, k = 5) - ytest)^2)
## [1] 1.563658
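
As a quick sanity check (not part of the original write-up), the same evaluation can be run with FNN::knn.reg. Note that myknn uses the Manhattan distance while FNN::knn.reg uses the Euclidean distance, so the two mean squared errors should be close but not identical.

# Sketch: MSE from FNN::knn.reg at the same k, for comparison with myknn
mean((FNN::knn.reg(train = xtrain, test = xtest, y = ytrain, k = 5)$pred - ytest)^2)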

c. Optimal tuning parameters

kgrid = 1:10
SquaredError = rep(NA, 10)
Dff = rep(NA, 10)
for (i in 1:10) {
  SquaredError[i] = mean((myknn(xtest, xtrain, ytrain, k = kgrid[i]) - ytest)^2)
  Dff[i] = N/kgrid[i]   # degrees of freedom, computed here as N/k
}
plot(Dff, SquaredError, ylim = c(0, max(SquaredError)), xlim = c(0, max(Dff)), 
     ylab = "Mean squared error", 
     xlab = "Degrees of freedom", pch = 16, col = "6")
legend("bottomright", c("Mean squared error"), col = c("6"), 
       pch = 16, cex = 1.5, bg = 'lightblue')

which.min(SquaredError)
## [1] 7

The optimal \(k\) is 7; computing the degrees of freedom as \(N/k = 1000/7\), this corresponds to approximately 143.
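
The reported value can also be read off from the quantities computed above (a small check, not in the original output).

# degrees of freedom at the selected k = 7, i.e. 1000/7
Dff[which.min(SquaredError)]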

Curse of Dimensionality

library(FNN)
P = 100
set.seed(123)
kvalue <- c(1:60)
Errors <- rep(NA, length(kvalue))
X1 = cbind(X, matrix(rnorm(N*(P-5)), N, (P-5)))     # setting 1: the 95 added covariates are pure noise
X2 = cbind(X, X %*% matrix(runif(5*(P-5)), 5, P-5)) # setting 2: the 95 added covariates are linear combinations of the original five

# First case
xtrain = X1[1:ntrain, ]
xtest = X1[(ntrain+1):N, ]
for (i in 1:length(Errors))
  Errors[i] = mean((FNN::knn.reg(xtrain, xtest, y = ytrain, k = kvalue[i])$pred - ytest)^2)
MSE1 <- min(Errors)
Optimal_k1 <- kvalue[which.min(Errors)]
print(Optimal_k1)
## [1] 8
print(MSE1)
## [1] 1.925884
# Second case
xtrain = X2[1:ntrain, ]
xtest = X2[(ntrain+1):N, ]
for (i in 1:length(Errors))
  Errors[i] = mean((FNN::knn.reg(xtrain, xtest, y = ytrain, k = kvalue[i])$pred - ytest)^2)
MSE2 <- min(Errors)
Optimal_k2 <- kvalue[which.min(Errors)]
print(Optimal_k2)
## [1] 4
print(MSE2)
## [1] 1.784692

The optimal \(k\) for the first setting is 8 with an MSE of 1.925884; the optimal \(k\) for the second setting is 4 with an MSE of 1.784692. Based on the MSE, kNN performs better in the second setting. This should be expected: in the second setting the added covariates are linear combinations of the original five, so the high-dimensional data have a low-dimensional structure and the curse of dimensionality is much milder, whereas the first setting adds 95 pure-noise dimensions that degrade the nearest-neighbor distances.
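
As a small check of this low-dimensional structure (not part of the original analysis), the numerical ranks of the two design matrices can be compared: every added column of X2 is an exact linear combination of the first five columns, so its rank should be 5, while X1 should have full column rank.

# Sketch: numerical rank of the two design matrices
qr(X1)$rank   # expected: 100
qr(X2)$rank   # expected: 5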