KNN Classifier
# Loading packages
library(caTools)
library(class)
Splitting data
# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data(PimaIndiansDiabetes)
# rename the dataset to something shorter
diabetes <- PimaIndiansDiabetes
# Splitting data into train and test data
set.seed(100)
# sample.split expects the label vector, not the whole data frame
split <- sample.split(diabetes$diabetes, SplitRatio = 0.8)
train_cl <- subset(diabetes, split == TRUE)
test_cl <- subset(diabetes, split == FALSE)
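A quick sanity check that the split preserved the class balance (optional):
table(train_cl$diabetes)
table(test_cl$diabetes)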
# Feature Scaling
train_scale <- scale(train_cl[, 1:8])
test_scale <- scale(test_cl[, 1:8])
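Note that scale() above standardises the test set with its own means and standard deviations. A common alternative is to reuse the training set's parameters so both sets are transformed identically; a minimal sketch (the _v2 names are just illustrative):
ctr <- colMeans(train_cl[, 1:8])
sds <- apply(train_cl[, 1:8], 2, sd)
train_scale_v2 <- scale(train_cl[, 1:8], center = ctr, scale = sds)
test_scale_v2 <- scale(test_cl[, 1:8], center = ctr, scale = sds)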
Creating KNN model
# Fitting the KNN model to the training dataset
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$diabetes,
                      k = 1)
classifier_knn
## [1] pos neg neg neg pos neg neg neg neg neg neg neg pos neg neg pos neg pos
## [19] neg neg neg neg pos neg neg pos neg pos pos neg neg neg neg pos neg pos
## [37] neg neg neg pos neg pos pos neg neg neg pos pos neg pos neg neg neg neg
## [55] pos neg pos neg pos neg neg neg pos pos pos pos neg pos neg pos pos neg
## [73] pos neg neg pos neg neg neg pos pos neg neg pos neg pos pos neg neg neg
## [91] neg neg neg pos pos neg neg neg pos neg neg pos neg neg pos neg pos neg
## [109] neg neg neg neg pos pos pos pos pos pos neg pos pos pos neg neg neg neg
## [127] neg neg neg neg neg neg pos neg pos neg pos pos neg pos neg pos neg pos
## [145] neg neg neg pos neg neg neg pos pos pos neg pos neg pos neg neg neg neg
## [163] neg neg pos pos neg pos neg neg neg
## Levels: neg pos
Model Evaluation
# Confusion Matrix (rows = actual, columns = predicted)
cm <- table(test_cl$diabetes, classifier_knn)
cm
## classifier_knn
## neg pos
## neg 79 32
## pos 27 33
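The table also gives class-specific rates beyond overall accuracy; a small sketch reading them off cm as built above:
TP <- cm["pos", "pos"]; FN <- cm["pos", "neg"]
TN <- cm["neg", "neg"]; FP <- cm["neg", "pos"]
TP / (TP + FN) # sensitivity: recall of the "pos" class
TN / (TN + FP) # specificity: recall of the "neg" class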
Calculating accuracy with different values of K
# Model Evaluation - Choosing K = 1
# Calculate the out-of-sample error
misClassError <- mean(classifier_knn != test_cl$diabetes)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.654970760233918"
# K = 23
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$diabetes,
                      k = 23)
misClassError <- mean(classifier_knn != test_cl$diabetes)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.795321637426901"
Optimization
- search for a better value of the k parameter
k.optm <- numeric(39)
for (i in 1:39) {
  y_pred <- knn(train = train_scale,
                test = test_scale,
                cl = train_cl$diabetes,
                k = i)
  k.optm[i] <- 1 - mean(y_pred != test_cl$diabetes) # accuracy at this k
  cat(i, '=', k.optm[i], '')
}
## 1 = 0.6549708 2 = 0.6666667 3 = 0.7426901 4 = 0.6900585 5 = 0.7309942 6 = 0.748538 7 = 0.7368421 8 = 0.7309942 9 = 0.7368421 10 = 0.7251462 11 = 0.7602339 12 = 0.748538 13 = 0.7719298 14 = 0.748538 15 = 0.754386 16 = 0.754386 17 = 0.7602339 18 = 0.7192982 19 = 0.7719298 20 = 0.754386 21 = 0.7836257 22 = 0.7777778 23 = 0.7953216 24 = 0.7719298 25 = 0.7777778 26 = 0.7836257 27 = 0.7719298 28 = 0.7660819 29 = 0.7777778 30 = 0.7660819 31 = 0.7660819 32 = 0.7602339 33 = 0.7719298 34 = 0.754386 35 = 0.7602339 36 = 0.7719298 37 = 0.7777778 38 = 0.7836257 39 = 0.7836257
plot(k.optm, type = "b", xlab = "K value", ylab = "Accuracy")

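To pull the best-performing k out of the search programmatically (using k.optm from the loop above):
best_k <- which.max(k.optm) # the index equals k, since k.optm[i] holds accuracy at k = i
cat('best k =', best_k, 'with accuracy', k.optm[best_k], '\n')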
Visualization
# Visualising the training set results over two features
# (some tutorials use the ElemStatLearn package here, but it has been archived on CRAN)
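As one possibility, a base-R sketch of the decision regions over two illustrative features (glucose and mass, both already scaled); the feature pair, k = 23, and the grid resolution are all assumptions:
feats <- c("glucose", "mass")
grid <- expand.grid(
  glucose = seq(min(train_scale[, "glucose"]), max(train_scale[, "glucose"]), length.out = 100),
  mass = seq(min(train_scale[, "mass"]), max(train_scale[, "mass"]), length.out = 100))
grid_pred <- knn(train = train_scale[, feats], test = grid,
                 cl = train_cl$diabetes, k = 23)
plot(grid, col = ifelse(grid_pred == "pos", "lightcoral", "lightblue"),
     pch = 15, cex = 0.4, xlab = "glucose (scaled)", ylab = "mass (scaled)")
points(train_scale[, feats], col = ifelse(train_cl$diabetes == "pos", "red", "blue"))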
KNN Regression
Exploring the data
library("Amelia")
data("Boston", package = "MASS")
missmap(Boston, col = c('yellow', 'black'), y.at = 1, y.labels = '', legend = TRUE)

library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(Boston)) # correlation matrix of all 14 variables

library(Hmisc)
describe(Boston) # summary statistics for every variable
## Boston
##
## 14 Variables 506 Observations
## --------------------------------------------------------------------------------
## crim
## n missing distinct Info Mean Gmd .05 .10
## 506 0 504 1 3.614 5.794 0.02791 0.03819
## .25 .50 .75 .90 .95
## 0.08204 0.25651 3.67708 10.75300 15.78915
##
## lowest : 0.00632 0.00906 0.01096 0.01301 0.01311
## highest: 45.74610 51.13580 67.92080 73.53410 88.97620
## --------------------------------------------------------------------------------
## zn
## n missing distinct Info Mean Gmd .05 .10
## 506 0 26 0.603 11.36 18.77 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.0 12.5 42.5 80.0
##
## lowest : 0.0 12.5 17.5 18.0 20.0, highest: 82.5 85.0 90.0 95.0 100.0
## --------------------------------------------------------------------------------
## indus
## n missing distinct Info Mean Gmd .05 .10
## 506 0 76 0.982 11.14 7.705 2.18 2.91
## .25 .50 .75 .90 .95
## 5.19 9.69 18.10 19.58 21.89
##
## lowest : 0.46 0.74 1.21 1.22 1.25, highest: 18.10 19.58 21.89 25.65 27.74
## --------------------------------------------------------------------------------
## chas
## n missing distinct Info Sum Mean Gmd
## 506 0 2 0.193 35 0.06917 0.129
##
## --------------------------------------------------------------------------------
## nox
## n missing distinct Info Mean Gmd .05 .10
## 506 0 81 1 0.5547 0.1295 0.4092 0.4270
## .25 .50 .75 .90 .95
## 0.4490 0.5380 0.6240 0.7130 0.7400
##
## lowest : 0.385 0.389 0.392 0.394 0.398, highest: 0.713 0.718 0.740 0.770 0.871
## --------------------------------------------------------------------------------
## rm
## n missing distinct Info Mean Gmd .05 .10
## 506 0 446 1 6.285 0.7515 5.314 5.594
## .25 .50 .75 .90 .95
## 5.886 6.208 6.623 7.152 7.588
##
## lowest : 3.561 3.863 4.138 4.368 4.519, highest: 8.375 8.398 8.704 8.725 8.780
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 506 0 356 0.999 68.57 31.52 17.72 26.95
## .25 .50 .75 .90 .95
## 45.02 77.50 94.07 98.80 100.00
##
## lowest : 2.9 6.0 6.2 6.5 6.6, highest: 98.8 98.9 99.1 99.3 100.0
## --------------------------------------------------------------------------------
## dis
## n missing distinct Info Mean Gmd .05 .10
## 506 0 412 1 3.795 2.298 1.462 1.628
## .25 .50 .75 .90 .95
## 2.100 3.207 5.188 6.817 7.828
##
## lowest : 1.1296 1.1370 1.1691 1.1742 1.1781
## highest: 9.2203 9.2229 10.5857 10.7103 12.1265
## --------------------------------------------------------------------------------
## rad
## n missing distinct Info Mean Gmd
## 506 0 9 0.959 9.549 8.518
##
## lowest : 1 2 3 4 5, highest: 5 6 7 8 24
##
## Value 1 2 3 4 5 6 7 8 24
## Frequency 20 24 38 110 115 26 17 24 132
## Proportion 0.040 0.047 0.075 0.217 0.227 0.051 0.034 0.047 0.261
## --------------------------------------------------------------------------------
## tax
## n missing distinct Info Mean Gmd .05 .10
## 506 0 66 0.981 408.2 181.7 222 233
## .25 .50 .75 .90 .95
## 279 330 666 666 666
##
## lowest : 187 188 193 198 216, highest: 432 437 469 666 711
## --------------------------------------------------------------------------------
## ptratio
## n missing distinct Info Mean Gmd .05 .10
## 506 0 46 0.978 18.46 2.383 14.70 14.75
## .25 .50 .75 .90 .95
## 17.40 19.05 20.20 20.90 21.00
##
## lowest : 12.6 13.0 13.6 14.4 14.7, highest: 20.9 21.0 21.1 21.2 22.0
## --------------------------------------------------------------------------------
## black
## n missing distinct Info Mean Gmd .05 .10
## 506 0 357 0.986 356.7 65.5 84.59 290.27
## .25 .50 .75 .90 .95
## 375.38 391.44 396.23 396.90 396.90
##
## lowest : 0.32 2.52 2.60 3.50 3.65, highest: 396.28 396.30 396.33 396.42 396.90
## --------------------------------------------------------------------------------
## lstat
## n missing distinct Info Mean Gmd .05 .10
## 506 0 455 1 12.65 7.881 3.708 4.680
## .25 .50 .75 .90 .95
## 6.950 11.360 16.955 23.035 26.808
##
## lowest : 1.73 1.92 1.98 2.47 2.87, highest: 34.37 34.41 34.77 36.98 37.97
## --------------------------------------------------------------------------------
## medv
## n missing distinct Info Mean Gmd .05 .10
## 506 0 229 1 22.53 9.778 10.20 12.75
## .25 .50 .75 .90 .95
## 17.02 21.20 25.00 34.80 43.40
##
## lowest : 5.0 5.6 6.3 7.0 7.2, highest: 46.7 48.3 48.5 48.8 50.0
## --------------------------------------------------------------------------------
Preparing data
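One way to sanity-check the subset chosen below is to rank the predictors by correlation with the response (run while Boston still has all 14 columns):
# rank variables by absolute correlation with medv (medv itself comes first)
sort(abs(cor(Boston)[, "medv"]), decreasing = TRUE)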
# keep the response (medv) and four predictors
Boston <- dplyr::select(Boston, medv, crim, rm, tax, lstat)
# Splitting the dataset into
# the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(Boston$medv, SplitRatio = 0.75)
training_set_origi = subset(Boston, split == TRUE)
test_set_origi = subset(Boston, split == FALSE)
# Feature Scaling (each set standardised separately; see the note in the classification section)
training_set = scale(training_set_origi[, -1])
test_set = scale(test_set_origi[, -1])
Creating model
# Fitting K-NN to the Training set
# and Predicting the Test set results
# library(class)
# training_set/test_set already exclude medv, so use all of their columns
y_pred = knn(train = training_set,
             test = test_set,
             cl = training_set_origi[, 1], # medv used as the "class" labels
             k = 15)
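Strictly speaking, class::knn is a classifier: it coerces medv to a factor, and each prediction is one of the observed training values rather than an average of the neighbours. A purpose-built KNN regression averages instead; a minimal sketch using FNN::knn.reg (assuming the FNN package is installed):
# install.packages('FNN') # if needed
reg_pred <- FNN::knn.reg(train = training_set,
                         test = test_set,
                         y = training_set_origi[, 1], # medv
                         k = 15)
head(reg_pred$pred) # numeric predictions: mean medv of the 15 nearest neighbours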
Evaluation
# knn returns a factor, so convert to character and then to numeric
error <- test_set_origi[, 1] - as.numeric(as.character(y_pred))
head(error)
## [1] -3.5 -0.5 -1.4 -2.6 1.0 -8.8
rmse <- sqrt(mean(error^2)) # square the errors before averaging, then take the root
rmse
plot(error)

# actual medv (left) vs predicted medv (right)
head(cbind(test_set_origi[, 1], as.numeric(as.character(y_pred))))
## [,1] [,2]
## [1,] 18.2 21.7
## [2,] 19.9 20.4
## [3,] 17.5 18.9
## [4,] 15.2 17.8
## [5,] 14.5 13.5
## [6,] 15.6 24.4
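A quick visual check of fit quality (points near the dashed identity line are well predicted):
plot(test_set_origi[, 1], as.numeric(as.character(y_pred)),
     xlab = 'actual medv', ylab = 'predicted medv')
abline(0, 1, lty = 2) # identity line: perfect predictions fall on it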
Optimization
- search for a better value of the k parameter
k.optm <- numeric(29)
for (i in 1:29) {
  y_pred <- knn(train = training_set,
                test = test_set,
                cl = training_set_origi[, 1],
                k = i)
  k.optm[i] <- sqrt(mean((test_set_origi[, 1] - as.numeric(as.character(y_pred)))^2)) # RMSE at this k
  cat(i, '=', k.optm[i], '')
}
plot(k.optm, type = "b", xlab = "K value", ylab = "RMSE")

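As before, the best k can be read off programmatically (here with which.min, since lower RMSE is better):
best_k <- which.min(k.optm)
cat('best k =', best_k, 'with RMSE', k.optm[best_k], '\n')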