require(devtools)
## Loading required package: devtools
## Loading required package: usethis
install_version("ElemStatLearn", version = "2015.6.26.2", repos = "http://cran.us.r-project.org")
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## Downloading package from url: http://cran.us.r-project.org/src/contrib/Archive/ElemStatLearn/ElemStatLearn_2015.6.26.2.tar.gz
## Installing package into 'C:/Users/hounnou2/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
#install.packages("mlbench") # run this line if you don't have the package
library(mlbench)
library(ElemStatLearn)
data(PimaIndiansDiabetes)
set.seed(2)
trainid = sample(1:nrow(PimaIndiansDiabetes), nrow(PimaIndiansDiabetes)/2)
Diab.train = PimaIndiansDiabetes[trainid, ]
Diab.test = PimaIndiansDiabetes[-trainid, ]
#install.packages("FNN")
library(FNN)
kgrid = c(1:20)
TrainingError = rep(NA, 20)
TestError = rep(NA, 20)
for (i in 1:20)
{
TrainingError[i] = mean(FNN::knn(train = Diab.train[, -9], test = Diab.train[, -9],
                                 cl = as.factor(Diab.train[, 9]), k = kgrid[i]) != Diab.train[, 9])
TestError[i] = mean(FNN::knn(train = Diab.train[, -9], test = Diab.test[, -9],
                             cl = as.factor(Diab.train[, 9]), k = kgrid[i]) != Diab.test[, 9])
}
plot(kgrid, TestError, ylim = c(0, max(TestError)),ylab = "Test error",
xlab = "K value", pch = 16, col = "6")
points(kgrid, TrainingError, pch = 16, col = "4")
legend("bottom", c("Training error", "Test error"), col = c("4", "6"),
pch = 16, cex = 1.5, bg='lightblue')
which.min(TestError)
## [1] 9
The test error curve shows the expected U shape, and the optimal k value is 9.
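As an additional sanity check, one could refit kNN at the selected k = 9 and tabulate its test-set predictions against the truth:
pred9 = FNN::knn(train = Diab.train[, -9], test = Diab.test[, -9],
                 cl = as.factor(Diab.train[, 9]), k = 9)
# confusion matrix of predicted vs. actual diabetes status on the test set
table(predicted = pred9, actual = Diab.test[, 9])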
library(ElemStatLearn)
mydata = zip.train[sample(1:nrow(zip.train), size = 500), ]
kgrid = c(1:20)
TrainingError = rep(NA, 20)
TestError = rep(NA, 20)
for (i in 1:20)
{
TrainingError[i] = mean(FNN::knn(train = mydata[, -1], test = mydata[, -1],
cl = as.factor(mydata[, 1]), k = kgrid[i]) != mydata[, 1])
TestError[i] = mean(FNN::knn(train = mydata[, -1], test = zip.test[, -1],
cl = as.factor(mydata[, 1]), k = kgrid[i]) != zip.test[, 1])
}
plot(kgrid, TestError, ylim = c(0, max(TestError)), ylab = "Test error",
xlab = "K value", pch = 16, col = "6")
points(kgrid, TrainingError, pch = 19, col = "4")
legend("bottom", c("Training error", "Test error"), col = c("4", "6"),
pch = 19, cex = 1.5, bg='lightblue')
which.min(TestError)
## [1] 1
The optimal k value is 1, and the error curve does not show a U shape. The number of covariates \(p\) (256 pixel features) is large relative to the 500 training images, so neighbor distances become less informative; reducing \(p\) by working in a lower-dimensional subspace may improve the results.
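One way to act on this suggestion is to project the images onto their leading principal components before running kNN. The sketch below is purely illustrative, and keeping 40 components is an arbitrary, untuned choice:
# Illustrative sketch: reduce p with PCA before running kNN
pca = prcomp(mydata[, -1])
train.pc = pca$x[, 1:40]                          # training images in PC space
test.pc = predict(pca, zip.test[, -1])[, 1:40]    # project test images onto the same PCs
mean(FNN::knn(train = train.pc, test = test.pc,
              cl = as.factor(mydata[, 1]), k = 1) != zip.test[, 1])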
# kNN regression using the L1 (Manhattan) distance; ties are broken at random
myknn <- function(xtest, xtrain, ytrain, k){
  if (is.vector(xtest)){
    if (length(xtest) != ncol(xtrain)) stop("xtest and xtrain dim not the same")
    # distances from the single test point to every training point
    distance = rowSums(abs(sweep(xtrain, MARGIN = 2, xtest, FUN = "-")))
    # average the outcomes of the k nearest neighbors
    return( mean(ytrain[rank(distance, ties.method = "random") <= k]) )
  }
  if (is.matrix(xtest)){
    if (ncol(xtest) != ncol(xtrain)) stop("xtest and xtrain dim not the same")
    ytest = rep(NA, nrow(xtest))
    for (i in 1:nrow(xtest)){
      distance = rowSums(abs(sweep(xtrain, MARGIN = 2, xtest[i, ], FUN = "-")))
      ytest[i] = mean(ytrain[rank(distance, ties.method = "random") <= k])
    }
    return(ytest)
  }
  stop("xtest is not a vector or a matrix")
}
set.seed(123)
N = 1000; P = 5
X = matrix(rnorm(N*P), N, P)
Y = X[,1] + 0.5*X[,2] - X[,3] + rnorm(N)
ntrain = 500
xtrain = X[1:ntrain, ]
xtest = X[(ntrain+1):N, ]
ytrain = Y[1:ntrain]
ytest = Y[(ntrain+1):N]
# Mean squared error
mean((myknn(xtest, xtrain, ytrain, k = 5) - ytest)^2)
## [1] 1.563658
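As a rough cross-check, myknn can be compared with FNN::knn.reg at the same k; since myknn uses the L1 distance while knn.reg uses the Euclidean distance, the two test MSEs should be close but not identical.
# package implementation with Euclidean distance, for comparison with myknn above
mean((FNN::knn.reg(train = xtrain, test = xtest, y = ytrain, k = 5)$pred - ytest)^2)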
kgrid = c(1:10)
SquaredError = rep(NA, 10)
Dff = rep(NA, 10)
for (i in 1:10)
{
SquaredError[i] = mean((myknn(xtest, xtrain, ytrain, k = kgrid[i]) - ytest)^2)
Dff[i] = ntrain/kgrid[i] # effective degrees of freedom n/k, using the training sample size
}
plot(Dff, SquaredError, ylim = c(0, max(SquaredError)), xlim = c(0, max(Dff)),
ylab = "Mean squared error",
xlab = "Degrees of freedom", pch = 16, col = "6")
legend("bottomright", c("Mean squared error"), col = c("6"),
pch = 19, cex = 1.5, bg='lightblue')
which.min(SquaredError)
## [1] 7
The optimal k is 7; since the effective degrees of freedom of a kNN fit are \(n/k\) with \(n = 500\) training observations, the corresponding degrees of freedom are \(500/7 \approx 71\).
library(FNN)
P = 100
set.seed(123)
kvalue <- c(1:60)
Errors <- rep(NA, length(kvalue))
X1 = cbind(X, matrix(rnorm(N*(P-5)), N, (P-5)))
X2 = cbind(X, X %*% matrix(runif(5*(P-5)), 5, P-5))
# First case
xtrain = X1[1:ntrain, ]
xtest = X1[(ntrain+1):N, ]
for (i in 1:length(Errors))
Errors[i] = mean((FNN::knn.reg(xtrain, xtest, y = ytrain, k = kvalue[i])$pred - ytest)^2)
MSE1 <- min(Errors)
Optimal_k1 <- kvalue[which.min(Errors)]
print(Optimal_k1)
## [1] 8
print(MSE1)
## [1] 1.925884
# second case
xtrain = X2[1:ntrain, ]
xtest = X2[(ntrain+1):N, ]
for (i in 1:length(Errors))
Errors[i] = mean((FNN::knn.reg(xtrain, xtest, y = ytrain, k = kvalue[i])$pred - ytest)^2)
MSE2 <- min(Errors)
Optimal_k2 <- kvalue[which.min(Errors)]
print(Optimal_k2)
## [1] 4
print(MSE2)
## [1] 1.784692
The optimal k for the first setting is 8, with an MSE of 1.925884; for the second setting the optimal k is 4, with an MSE of 1.784692. Based on the MSE, kNN performs better in the second setting. This should be expected: in the second setting the extra covariates are linear combinations of the original five, so the data retain a low-dimensional structure, whereas the first setting adds 95 pure-noise dimensions and suffers more from the curse of dimensionality.
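A quick way to verify the low-dimensional structure claimed above is to compare the numerical ranks of the two design matrices:
# X1 adds 95 independent noise columns, so it is full rank (100);
# X2's extra columns are linear combinations of the first five, so its rank stays at 5
qr(X1)$rank
qr(X2)$rank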