KNN Classifier

# Loading package
# library(e1071)
library(caTools)
library(class)

Splitting data

# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data(PimaIndiansDiabetes) 
# rename dataset to have shorter name because lazy
diabetes <- PimaIndiansDiabetes
# Splitting data into train and test data
set.seed(100)

split <- sample.split(diabetes, SplitRatio = 0.8)
train_cl <- subset(diabetes, split == "TRUE")
test_cl <- subset(diabetes, split == "FALSE")
# Feature Scaling
train_scale <- scale(train_cl[, 1:8])
test_scale <- scale(test_cl[, 1:8])

# train_y <- scale(train_cl[, 5])
# test_y <- scale(test_cl[, 5])

Creating KNN model

# Fitting KNN Model to training dataset
classifier_knn <- knn(train = train_scale,
                      cl = train_cl$diabetes,
                      
                      test = test_scale,
                      k = 1)
classifier_knn
##   [1] pos neg neg neg pos neg neg neg neg neg neg neg pos neg neg pos neg pos
##  [19] neg neg neg neg pos neg neg pos neg pos pos neg neg neg neg pos neg pos
##  [37] neg neg neg pos neg pos pos neg neg neg pos pos neg pos neg neg neg neg
##  [55] pos neg pos neg pos neg neg neg pos pos pos pos neg pos neg pos pos neg
##  [73] pos neg neg pos neg neg neg pos pos neg neg pos neg pos pos neg neg neg
##  [91] neg neg neg pos pos neg neg neg pos neg neg pos neg neg pos neg pos neg
## [109] neg neg neg neg pos pos pos pos pos pos neg pos pos pos neg neg neg neg
## [127] neg neg neg neg neg neg pos neg pos neg pos pos neg pos neg pos neg pos
## [145] neg neg neg pos neg neg neg pos pos pos neg pos neg pos neg neg neg neg
## [163] neg neg pos pos neg pos neg neg neg
## Levels: neg pos

Model Evaluation

  • Creat confusion matrix
# Confusion Matrix
cm <- table(test_cl$diabetes, classifier_knn)
cm
##      classifier_knn
##       neg pos
##   neg  79  32
##   pos  27  33

Calculate accuracy with different K

# Model Evaluation - Choosing K =1
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test_cl$diabetes)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.654970760233918"
# K = 7
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$diabetes,
                      k = 23)

misClassError <- mean(classifier_knn != test_cl$diabetes)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.795321637426901"

Optimization

  • search better k parameter
i=1
k.optm=1

for (i in 1:39){
 y_pred = knn(train = train_scale,
             test = test_scale,
             
             cl = train_cl$diabetes,
             k = i )
 
 k.optm[i] <-   1- mean(y_pred != test_cl$diabetes)
 
 k=i
 cat(k,'=',k.optm[i],'')
 }
## 1 = 0.6549708 2 = 0.6666667 3 = 0.7426901 4 = 0.6900585 5 = 0.7309942 6 = 0.748538 7 = 0.7368421 8 = 0.7309942 9 = 0.7368421 10 = 0.7251462 11 = 0.7602339 12 = 0.748538 13 = 0.7719298 14 = 0.748538 15 = 0.754386 16 = 0.754386 17 = 0.7602339 18 = 0.7192982 19 = 0.7719298 20 = 0.754386 21 = 0.7836257 22 = 0.7777778 23 = 0.7953216 24 = 0.7719298 25 = 0.7777778 26 = 0.7836257 27 = 0.7719298 28 = 0.7660819 29 = 0.7777778 30 = 0.7660819 31 = 0.7660819 32 = 0.7602339 33 = 0.7719298 34 = 0.754386 35 = 0.7602339 36 = 0.7719298 37 = 0.7777778 38 = 0.7836257 39 = 0.7836257
  • Accuracy plot k=15
plot(k.optm, type="b", xlab="K- Value",ylab="RMSE level")

Visualization

# Visualising the Training set results
# Install ElemStatLearn if not present 

KNN regression

Data exploring

library("Amelia")
data("Boston", package = "MASS")
missmap(Boston,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)

library(corrplot)
## corrplot 0.92 loaded
corrplot(cor((Boston)))

library(Hmisc)
describe(Boston)
## Boston 
## 
##  14  Variables      506  Observations
## --------------------------------------------------------------------------------
## crim 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      504        1    3.614    5.794  0.02791  0.03819 
##      .25      .50      .75      .90      .95 
##  0.08204  0.25651  3.67708 10.75300 15.78915 
## 
## lowest :  0.00632  0.00906  0.01096  0.01301  0.01311
## highest: 45.74610 51.13580 67.92080 73.53410 88.97620
## --------------------------------------------------------------------------------
## zn 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0       26    0.603    11.36    18.77      0.0      0.0 
##      .25      .50      .75      .90      .95 
##      0.0      0.0     12.5     42.5     80.0 
## 
## lowest :   0.0  12.5  17.5  18.0  20.0, highest:  82.5  85.0  90.0  95.0 100.0
## --------------------------------------------------------------------------------
## indus 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0       76    0.982    11.14    7.705     2.18     2.91 
##      .25      .50      .75      .90      .95 
##     5.19     9.69    18.10    19.58    21.89 
## 
## lowest :  0.46  0.74  1.21  1.22  1.25, highest: 18.10 19.58 21.89 25.65 27.74
## --------------------------------------------------------------------------------
## chas 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      506        0        2    0.193       35  0.06917    0.129 
## 
## --------------------------------------------------------------------------------
## nox 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0       81        1   0.5547   0.1295   0.4092   0.4270 
##      .25      .50      .75      .90      .95 
##   0.4490   0.5380   0.6240   0.7130   0.7400 
## 
## lowest : 0.385 0.389 0.392 0.394 0.398, highest: 0.713 0.718 0.740 0.770 0.871
## --------------------------------------------------------------------------------
## rm 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      446        1    6.285   0.7515    5.314    5.594 
##      .25      .50      .75      .90      .95 
##    5.886    6.208    6.623    7.152    7.588 
## 
## lowest : 3.561 3.863 4.138 4.368 4.519, highest: 8.375 8.398 8.704 8.725 8.780
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      356    0.999    68.57    31.52    17.72    26.95 
##      .25      .50      .75      .90      .95 
##    45.02    77.50    94.07    98.80   100.00 
## 
## lowest :   2.9   6.0   6.2   6.5   6.6, highest:  98.8  98.9  99.1  99.3 100.0
## --------------------------------------------------------------------------------
## dis 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      412        1    3.795    2.298    1.462    1.628 
##      .25      .50      .75      .90      .95 
##    2.100    3.207    5.188    6.817    7.828 
## 
## lowest :  1.1296  1.1370  1.1691  1.1742  1.1781
## highest:  9.2203  9.2229 10.5857 10.7103 12.1265
## --------------------------------------------------------------------------------
## rad 
##        n  missing distinct     Info     Mean      Gmd 
##      506        0        9    0.959    9.549    8.518 
## 
## lowest :  1  2  3  4  5, highest:  5  6  7  8 24
##                                                                 
## Value          1     2     3     4     5     6     7     8    24
## Frequency     20    24    38   110   115    26    17    24   132
## Proportion 0.040 0.047 0.075 0.217 0.227 0.051 0.034 0.047 0.261
## --------------------------------------------------------------------------------
## tax 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0       66    0.981    408.2    181.7      222      233 
##      .25      .50      .75      .90      .95 
##      279      330      666      666      666 
## 
## lowest : 187 188 193 198 216, highest: 432 437 469 666 711
## --------------------------------------------------------------------------------
## ptratio 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0       46    0.978    18.46    2.383    14.70    14.75 
##      .25      .50      .75      .90      .95 
##    17.40    19.05    20.20    20.90    21.00 
## 
## lowest : 12.6 13.0 13.6 14.4 14.7, highest: 20.9 21.0 21.1 21.2 22.0
## --------------------------------------------------------------------------------
## black 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      357    0.986    356.7     65.5    84.59   290.27 
##      .25      .50      .75      .90      .95 
##   375.38   391.44   396.23   396.90   396.90 
## 
## lowest :   0.32   2.52   2.60   3.50   3.65, highest: 396.28 396.30 396.33 396.42 396.90
## --------------------------------------------------------------------------------
## lstat 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      455        1    12.65    7.881    3.708    4.680 
##      .25      .50      .75      .90      .95 
##    6.950   11.360   16.955   23.035   26.808 
## 
## lowest :  1.73  1.92  1.98  2.47  2.87, highest: 34.37 34.41 34.77 36.98 37.97
## --------------------------------------------------------------------------------
## medv 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      506        0      229        1    22.53    9.778    10.20    12.75 
##      .25      .50      .75      .90      .95 
##    17.02    21.20    25.00    34.80    43.40 
## 
## lowest :  5.0  5.6  6.3  7.0  7.2, highest: 46.7 48.3 48.5 48.8 50.0
## --------------------------------------------------------------------------------

Prepareing data

Boston <-    dplyr::select (Boston ,medv , crim , rm , tax , lstat)
# Splitting the dataset into 
# the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(Boston$medv, 
                     SplitRatio = 0.75)
training_set_origi = subset(Boston, 
                      split == TRUE)
test_set_origi = subset(Boston, 
                  split == FALSE)
# Feature Scaling

training_set = scale(training_set_origi[,-1] )
test_set = scale(test_set_origi [,-1])

Creating model

# Fitting K-NN to the Training set 
# and Predicting the Test set results
# library(class)
y_pred = knn(train = training_set[, -1],
             test = test_set[, -1],
             
             cl = training_set_origi[, 1],
             k = 15 )

# 

Evaluation

# converting factor into character then into numeric
error <- test_set_origi[,1]-as.numeric (as.character(y_pred))
head(error)
## [1] -3.5 -0.5 -1.4 -2.6  1.0 -8.8
rmse <- sqrt(mean(error)^2)
rmse
## [1] 0.8487179
plot(error)

head(cbind(test_set_origi[,1], as.numeric (as.character(y_pred))))
##      [,1] [,2]
## [1,] 18.2 21.7
## [2,] 19.9 20.4
## [3,] 17.5 18.9
## [4,] 15.2 17.8
## [5,] 14.5 13.5
## [6,] 15.6 24.4

Optimization

  • search better k parameter
i=1
k.optm=1

for (i in 1:29){
 y_pred = knn(train = training_set[, -1],
             test = test_set[, -1],
             
             cl = training_set_origi[, 1],
             k = i )
 
 k.optm[i] <-  sqrt(mean(   test_set_origi[,1]-as.numeric (as.character(y_pred))   )^2)
 
 k=i
 cat(k,'=',k.optm[i],'')
 }
## 1 = 0.35 2 = 0.5371795 3 = 0.9705128 4 = 1.105128 5 = 1.373077 6 = 0.4512821 7 = 0.6230769 8 = 0.575641 9 = 1.325641 10 = 1.176923 11 = 0.6628205 12 = 0.15 13 = 0.04358974 14 = 0.724359 15 = 0.3551282 16 = 0.07820513 17 = 0.07820513 18 = 0.6346154 19 = 0.2628205 20 = 0.4769231 21 = 0.9294872 22 = 0.6423077 23 = 0.4333333 24 = 0.4320513 25 = 0.3807692 26 = 1.061538 27 = 0.924359 28 = 0.7230769 29 = 0.03461538
  • Accuracy plot k=15
plot(k.optm, type="b", xlab="K- Value",ylab="RMSE level")