K-nearest neighbors for classification

library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.3
# Read the admissions data from disk into `data`.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
data <- read.csv(
  file = "F:/Machine Learning/Data Science/Machine Learning/KNN/binary.csv"
)

# Show the column types of the freshly loaded data frame.
str(object = data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...

KNN for Classification

# Recode the outcome: 0 becomes "No" and 1 becomes "Yes".
# The first replacement turns the integer column into character, so the
# second comparison is made against the string form of 1.

data$admit <- replace(data$admit, data$admit == 0, "No")
data$admit <- replace(data$admit, data$admit == 1, "Yes")

# Inspect the column types after recoding.
str(object = data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: chr  "No" "Yes" "Yes" "Yes" ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
# Turn the character labels into a factor; the later train() calls model
# `admit` as the outcome.
data$admit <- factor(data$admit)
str(object = data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 1 2 1 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
# Data partition
# Each row is randomly assigned to group 1 (training) or group 2 (test) with
# probabilities 0.7 / 0.3; the seed makes the assignment reproducible.
set.seed(1234)

# `TRUE` is written out: `T` is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))

train <- data[ind == 1, ]
test <- data[ind == 2, ]

# Number of rows and columns in the training set.
dim(train)
## [1] 284   4
# Number of rows and columns in the test set.
dim(test)
## [1] 116   4
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
trcontrol <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# KNN for classification.
# No `metric` is given, so caret falls back to its default; the printed
# summary reports that Accuracy was used to pick the optimal k.
# `preProcess` centers and scales the predictors. The argument name is
# written in full instead of relying on partial matching of `preProc`.
# `tuneLength = 20` asks caret to evaluate 20 candidate values of k.

set.seed(222)
fit <- train(admit ~ .,
             data = train,
             method = "knn",
             tuneLength = 20,
             trControl = trcontrol,
             preProcess = c("center", "scale"))

fit
## k-Nearest Neighbors 
## 
## 284 samples
##   3 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 256, 256, 256, 256, 256, 255, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.6358785  0.09112381
##    7  0.6676108  0.14676016
##    9  0.6795156  0.16944350
##   11  0.6792282  0.16232672
##   13  0.6651067  0.12648903
##   15  0.6768883  0.14837562
##   17  0.6826765  0.16001288
##   19  0.6816502  0.14927686
##   21  0.6794745  0.13529673
##   23  0.6830049  0.13663649
##   25  0.6899425  0.15415306
##   27  0.6875616  0.14970069
##   29  0.7027915  0.18620447
##   31  0.6981117  0.16567350
##   33  0.7028325  0.17997849
##   35  0.6958128  0.15765299
##   37  0.6946223  0.15615233
##   39  0.6958949  0.15870467
##   41  0.6912151  0.14179428
##   43  0.6923235  0.14317763
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 33.

So the model achieves its highest cross-validated accuracy (0.7028) at k = 33.

# Model performance: plot the tuning results stored in `fit`.
plot(x = fit)

# Feature importance
varImp(object = fit)
## ROC curve variable importance
## 
##      Importance
## gpa      100.00
## rank      25.18
## gre        0.00

Of the 3 features in the model, gpa contributes the most to predicting the outcome variable.

# Predict class labels for the training rows.

p1 <- predict(fit, newdata = train)

# Peek at the first few predictions.
head(x = p1)
## [1] No  No  Yes No  No  No 
## Levels: No Yes
# Confusion matrix: predicted vs. observed `admit` on the training data.
confusionMatrix(data = p1, reference = train$admit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  180  72
##        Yes  11  21
##                                         
##                Accuracy : 0.7077        
##                  95% CI : (0.6511, 0.76)
##     No Information Rate : 0.6725        
##     P-Value [Acc > NIR] : 0.1141        
##                                         
##                   Kappa : 0.2022        
##                                         
##  Mcnemar's Test P-Value : 4.523e-11     
##                                         
##             Sensitivity : 0.9424        
##             Specificity : 0.2258        
##          Pos Pred Value : 0.7143        
##          Neg Pred Value : 0.6562        
##              Prevalence : 0.6725        
##          Detection Rate : 0.6338        
##    Detection Prevalence : 0.8873        
##       Balanced Accuracy : 0.5841        
##                                         
##        'Positive' Class : No            
## 
# Predict class labels for the held-out test rows.

p <- predict(fit, newdata = test)

# Peek at the first few predictions.
head(x = p)
## [1] No  No  No  Yes No  No 
## Levels: No Yes
# Confusion matrix: predicted vs. observed `admit` on the test data.
confusionMatrix(data = p, reference = test$admit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  79  30
##        Yes  3   4
##                                           
##                Accuracy : 0.7155          
##                  95% CI : (0.6243, 0.7954)
##     No Information Rate : 0.7069          
##     P-Value [Acc > NIR] : 0.465           
##                                           
##                   Kappa : 0.1056          
##                                           
##  Mcnemar's Test P-Value : 6.011e-06       
##                                           
##             Sensitivity : 0.9634          
##             Specificity : 0.1176          
##          Pos Pred Value : 0.7248          
##          Neg Pred Value : 0.5714          
##              Prevalence : 0.7069          
##          Detection Rate : 0.6810          
##    Detection Prevalence : 0.9397          
##       Balanced Accuracy : 0.5405          
##                                           
##        'Positive' Class : No              
## 

So the model achieved 71.55% accuracy on the test (unseen) data, which is only marginally above the no-information rate of 70.69% (p = 0.465).

# Repeated 10-fold cross-validation (3 repeats), this time requesting class
# probabilities and summarising each resample with twoClassSummary.
trcontrol <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Use the "ROC" metric to select the optimal k.
# An explicit grid k = 1..60 is supplied through `tuneGrid`, so the former
# `tuneLength = 20` argument (only consulted when no grid is given) had no
# effect and has been removed.
# `preProcess` is spelled out in full rather than partially matched.
fit <- train(admit ~ .,
             data = train,
             method = "knn",
             trControl = trcontrol,
             preProcess = c("center", "scale"),
             metric = "ROC",
             tuneGrid = expand.grid(k = 1:60))

fit
## k-Nearest Neighbors 
## 
## 284 samples
##   3 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 255, 255, 255, 256, 256, 255, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec     
##    1  0.5378363  0.7071053  0.3707407
##    2  0.5731954  0.7187719  0.3825926
##    3  0.5829990  0.8041228  0.3481481
##    4  0.5710941  0.7674561  0.3262963
##    5  0.5901174  0.8356140  0.2677778
##    6  0.6044347  0.8339474  0.2900000
##    7  0.6095132  0.8689474  0.2829630
##    8  0.6115249  0.8638596  0.2659259
##    9  0.6169859  0.8845614  0.2722222
##   10  0.6116857  0.8724561  0.2677778
##   11  0.6179893  0.8778070  0.2696296
##   12  0.6134532  0.8776316  0.2766667
##   13  0.6206277  0.8849123  0.2559259
##   14  0.6174615  0.8864912  0.2696296
##   15  0.6232125  0.8935965  0.2444444
##   16  0.6254035  0.8847368  0.2444444
##   17  0.6356871  0.8881579  0.2233333
##   18  0.6388704  0.9003509  0.2307407
##   19  0.6413163  0.9019298  0.2196296
##   20  0.6460658  0.9022807  0.2018519
##   21  0.6494795  0.9072807  0.2122222
##   22  0.6446433  0.9107895  0.1911111
##   23  0.6518255  0.9264912  0.1848148
##   24  0.6533587  0.9211404  0.2029630
##   25  0.6555731  0.9316667  0.1844444
##   26  0.6564059  0.9264035  0.1951852
##   27  0.6624483  0.9316667  0.1914815
##   28  0.6706979  0.9386842  0.1911111
##   29  0.6727271  0.9421930  0.1874074
##   30  0.6732778  0.9475439  0.1877778
##   31  0.6717943  0.9386842  0.1833333
##   32  0.6700127  0.9369298  0.1762963
##   33  0.6700292  0.9421930  0.1914815
##   34  0.6707407  0.9439474  0.1803704
##   35  0.6714201  0.9439474  0.1840741
##   36  0.6743977  0.9421930  0.1692593
##   37  0.6718114  0.9439474  0.1881481
##   38  0.6700132  0.9386842  0.1811111
##   39  0.6689196  0.9351754  0.1840741
##   40  0.6676696  0.9386842  0.1844444
##   41  0.6688177  0.9369298  0.1814815
##   42  0.6650663  0.9404386  0.1637037
##   43  0.6641969  0.9439474  0.1711111
##   44  0.6670361  0.9457018  0.1525926
##   45  0.6698216  0.9492105  0.1637037
##   46  0.6705361  0.9527193  0.1674074
##   47  0.6728845  0.9492105  0.1488889
##   48  0.6719610  0.9492105  0.1418519
##   49  0.6711949  0.9509649  0.1451852
##   50  0.6735648  0.9527193  0.1414815
##   51  0.6740136  0.9527193  0.1414815
##   52  0.6740721  0.9492105  0.1377778
##   53  0.6777057  0.9509649  0.1414815
##   54  0.6776910  0.9544737  0.1414815
##   55  0.6742919  0.9579825  0.1303704
##   56  0.6719479  0.9579825  0.1344444
##   57  0.6724103  0.9614912  0.1381481
##   58  0.6693163  0.9579825  0.1307407
##   59  0.6695161  0.9597368  0.1344444
##   60  0.6725073  0.9649123  0.1125926
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 53.

Now we can see that the above values are based on the ROC metric.

# Plot the tuning results of the ROC-selected model.
plot(x = fit)

# Feature importance for the ROC-selected model.
varImp(object = fit)
## ROC curve variable importance
## 
##      Importance
## gpa      100.00
## rank      25.18
## gre        0.00
# Predict on the training rows and tabulate against the observed classes.

p2 <- predict(fit, newdata = train)

confusionMatrix(data = p2, reference = train$admit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  182  79
##        Yes   9  14
##                                           
##                Accuracy : 0.6901          
##                  95% CI : (0.6328, 0.7434)
##     No Information Rate : 0.6725          
##     P-Value [Acc > NIR] : 0.2864          
##                                           
##                   Kappa : 0.1282          
##                                           
##  Mcnemar's Test P-Value : 1.903e-13       
##                                           
##             Sensitivity : 0.9529          
##             Specificity : 0.1505          
##          Pos Pred Value : 0.6973          
##          Neg Pred Value : 0.6087          
##              Prevalence : 0.6725          
##          Detection Rate : 0.6408          
##    Detection Prevalence : 0.9190          
##       Balanced Accuracy : 0.5517          
##                                           
##        'Positive' Class : No              
## 
# Predict on the held-out test rows and tabulate against the observed classes.
p4 <- predict(fit, newdata = test)

confusionMatrix(data = p4, reference = test$admit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  81  32
##        Yes  1   2
##                                           
##                Accuracy : 0.7155          
##                  95% CI : (0.6243, 0.7954)
##     No Information Rate : 0.7069          
##     P-Value [Acc > NIR] : 0.465           
##                                           
##                   Kappa : 0.0636          
##                                           
##  Mcnemar's Test P-Value : 1.767e-07       
##                                           
##             Sensitivity : 0.98780         
##             Specificity : 0.05882         
##          Pos Pred Value : 0.71681         
##          Neg Pred Value : 0.66667         
##              Prevalence : 0.70690         
##          Detection Rate : 0.69828         
##    Detection Prevalence : 0.97414         
##       Balanced Accuracy : 0.52331         
##                                           
##        'Positive' Class : No              
## 

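We can see that the model tuned with the “ROC” metric achieved 71.55% accuracy on unseen data, which is the same accuracy as the model tuned with the “Accuracy” metric; its Kappa (0.0636 vs. 0.1056) and specificity (0.0588 vs. 0.1176) are actually lower, so switching the tuning metric did not improve test performance.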

K-nearest neighbors for Regression

# Load the BostonHousing data set and keep working on it under the name
# `data`.
data(list = "BostonHousing")
data <- BostonHousing

# Inspect the column types; `medv` is the numeric column modelled below.
str(object = data)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Reproducible random split: each row goes to group 1 (training) or group 2
# (test) with probabilities 0.7 / 0.3.
set.seed(1234)

# `TRUE` is written out: `T` is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))

train <- data[ind == 1, ]
test <- data[ind == 2, ]
# Resampling scheme for the regression models: 10-fold cross-validation,
# repeated 3 times.

trcontrol <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# KNN for regression. No `metric` is given; the printed summary reports that
# RMSE was used to select k.
# Candidate values k = 1..70 are supplied explicitly through `tuneGrid`.
# `preProcess` (centering and scaling) is spelled out in full instead of
# relying on partial matching of `preProc`.
set.seed(333)

fit <- train(medv ~ .,
             data = train,
             tuneGrid = expand.grid(k = 1:70),
             method = "knn",
             trControl = trcontrol,
             preProcess = c("center", "scale"))

# Model performance
fit
## k-Nearest Neighbors 
## 
## 355 samples
##  13 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 319, 320, 319, 320, 321, 319, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  4.220233  0.7893295  2.772957
##    2  3.865542  0.8121023  2.645454
##    3  3.934359  0.8120572  2.622213
##    4  4.056351  0.8099926  2.685353
##    5  4.114220  0.8104629  2.740373
##    6  4.238611  0.7955515  2.846685
##    7  4.251670  0.7944795  2.860067
##    8  4.256004  0.7954250  2.887005
##    9  4.251901  0.7993392  2.892098
##   10  4.231215  0.8026196  2.888946
##   11  4.271776  0.7995826  2.925118
##   12  4.320730  0.7944261  2.958073
##   13  4.347364  0.7929000  2.974351
##   14  4.383062  0.7918880  3.019517
##   15  4.419955  0.7909817  3.060170
##   16  4.454649  0.7889415  3.080520
##   17  4.510544  0.7849833  3.125955
##   18  4.559126  0.7833061  3.162832
##   19  4.590455  0.7818717  3.190647
##   20  4.618037  0.7815097  3.218698
##   21  4.657021  0.7778881  3.243445
##   22  4.691509  0.7777435  3.278319
##   23  4.735923  0.7756833  3.312395
##   24  4.781045  0.7728091  3.347412
##   25  4.811122  0.7711752  3.378069
##   26  4.849810  0.7697044  3.401861
##   27  4.881436  0.7684275  3.430206
##   28  4.906460  0.7692417  3.445831
##   29  4.937899  0.7698780  3.468374
##   30  4.975059  0.7673481  3.494391
##   31  5.009274  0.7663530  3.515671
##   32  5.037252  0.7663301  3.536316
##   33  5.064711  0.7659840  3.547531
##   34  5.097279  0.7652401  3.568150
##   35  5.129075  0.7637818  3.589298
##   36  5.163558  0.7616490  3.613341
##   37  5.199721  0.7600383  3.641035
##   38  5.236509  0.7577097  3.664487
##   39  5.275571  0.7551501  3.689218
##   40  5.299488  0.7533839  3.704732
##   41  5.333833  0.7506246  3.720636
##   42  5.361248  0.7490764  3.737189
##   43  5.392890  0.7467751  3.758837
##   44  5.421543  0.7443202  3.782809
##   45  5.453338  0.7418674  3.801114
##   46  5.470509  0.7407170  3.808809
##   47  5.498008  0.7382269  3.822770
##   48  5.525593  0.7357696  3.844304
##   49  5.545909  0.7351324  3.857904
##   50  5.575568  0.7325247  3.879289
##   51  5.598911  0.7311560  3.894973
##   52  5.617319  0.7288680  3.905818
##   53  5.636814  0.7279102  3.924568
##   54  5.655302  0.7263812  3.937522
##   55  5.669892  0.7258178  3.946392
##   56  5.681869  0.7263559  3.955676
##   57  5.693715  0.7256033  3.962371
##   58  5.715992  0.7236138  3.978380
##   59  5.735040  0.7225544  3.991542
##   60  5.754101  0.7220259  4.003576
##   61  5.774510  0.7205421  4.015272
##   62  5.795145  0.7187276  4.029275
##   63  5.804344  0.7186301  4.035603
##   64  5.815839  0.7181030  4.042268
##   65  5.830220  0.7174876  4.049738
##   66  5.843720  0.7169533  4.062396
##   67  5.853462  0.7164493  4.068685
##   68  5.866166  0.7159775  4.075606
##   69  5.878718  0.7160276  4.085416
##   70  5.887376  0.7152239  4.093900
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Plot the tuning results of the regression model.
plot(x = fit)

We can see that the Root Mean Square Error (lower is better) is smallest at low values of ‘k’, with its minimum at k = 2, and then increases steadily as ‘k’ grows.

# Variable importance for the regression model.
varImp(object = fit)
## loess r-squared variable importance
## 
##         Overall
## lstat    100.00
## rm        94.28
## indus     88.94
## tax       69.84
## ptratio   69.20
## rad       42.23
## zn        38.95
## crim      35.25
## nox       31.91
## b         24.68
## age       22.83
## dis       19.78
## chas       0.00

lstat is the most important variable, whereas chas appears to be the least important variable in this dataset.

# Predict on the training rows and compute the training RMSE.
p1 <- predict(fit, newdata = train)

RMSE(pred = p1, obs = train$medv)
## [1] 2.111708
# Predict on the held-out test rows and compute the test RMSE.

p <- predict(fit, newdata = test)

RMSE(pred = p, obs = test$medv)
## [1] 6.151268
# Scatter plot of the test-set predictions `p` (y-axis) against the
# observed `medv` values (x-axis).

plot(p ~ test$medv)

We can see how the model performs. Obviously this is not a perfect prediction: the test RMSE (6.15) is much higher than the training RMSE (2.11), so there is scope for improvement.

# R-squared metric
# Same model and grid as the RMSE fit, but k is selected by the largest
# cross-validated Rsquared.
# `preProcess` is spelled out in full instead of relying on partial matching
# of `preProc`.

fit <- train(medv ~ .,
             data = train,
             tuneGrid = expand.grid(k = 1:70),
             method = "knn",
             metric = "Rsquared",
             trControl = trcontrol,
             preProcess = c("center", "scale"))

# Model performance
fit
## k-Nearest Neighbors 
## 
## 355 samples
##  13 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 319, 320, 319, 320, 319, 319, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  3.994455  0.7951351  2.637610
##    2  3.754720  0.8205465  2.527137
##    3  3.898348  0.8172672  2.579194
##    4  4.091573  0.8021918  2.707841
##    5  4.148494  0.8009280  2.755362
##    6  4.181742  0.7970423  2.797126
##    7  4.228585  0.7915498  2.827837
##    8  4.248178  0.7900817  2.859755
##    9  4.260175  0.7907409  2.868872
##   10  4.259691  0.7932396  2.873678
##   11  4.285392  0.7921388  2.891003
##   12  4.322493  0.7897200  2.928212
##   13  4.373486  0.7855017  2.979284
##   14  4.396708  0.7842051  3.013882
##   15  4.443218  0.7808016  3.058756
##   16  4.465601  0.7810004  3.080610
##   17  4.494113  0.7808069  3.101784
##   18  4.559028  0.7760992  3.155200
##   19  4.588414  0.7746603  3.181453
##   20  4.615645  0.7731785  3.207387
##   21  4.652467  0.7716002  3.233556
##   22  4.689769  0.7706785  3.268834
##   23  4.739586  0.7689167  3.307920
##   24  4.769788  0.7679972  3.335529
##   25  4.805783  0.7668204  3.358423
##   26  4.836154  0.7666792  3.381292
##   27  4.877529  0.7644500  3.418885
##   28  4.898705  0.7662903  3.443252
##   29  4.933702  0.7646452  3.469975
##   30  4.957030  0.7634719  3.483001
##   31  4.997993  0.7616180  3.505370
##   32  5.022626  0.7624679  3.519330
##   33  5.053992  0.7610815  3.539655
##   34  5.089068  0.7598263  3.559844
##   35  5.119412  0.7591416  3.585384
##   36  5.149067  0.7592888  3.605492
##   37  5.177847  0.7585224  3.625891
##   38  5.216942  0.7563849  3.647255
##   39  5.254944  0.7537319  3.670383
##   40  5.299148  0.7500426  3.699656
##   41  5.329022  0.7477060  3.718442
##   42  5.357098  0.7458972  3.740345
##   43  5.387161  0.7433551  3.757284
##   44  5.417467  0.7402409  3.779443
##   45  5.446810  0.7382343  3.798189
##   46  5.473329  0.7362620  3.816182
##   47  5.496075  0.7349448  3.831527
##   48  5.516363  0.7337574  3.839882
##   49  5.538929  0.7323922  3.857105
##   50  5.560418  0.7312524  3.876276
##   51  5.584433  0.7287451  3.889934
##   52  5.602280  0.7271014  3.902295
##   53  5.625391  0.7248296  3.920985
##   54  5.645916  0.7229610  3.934932
##   55  5.659777  0.7226568  3.943793
##   56  5.673651  0.7220088  3.952638
##   57  5.687988  0.7215625  3.959990
##   58  5.708186  0.7197333  3.973820
##   59  5.727482  0.7187173  3.982935
##   60  5.739980  0.7184282  3.995282
##   61  5.755116  0.7178123  4.004297
##   62  5.776096  0.7155370  4.017374
##   63  5.791640  0.7144609  4.028265
##   64  5.803123  0.7140652  4.036646
##   65  5.816088  0.7136431  4.045364
##   66  5.832064  0.7127467  4.055126
##   67  5.841452  0.7117901  4.059296
##   68  5.854516  0.7116302  4.071632
##   69  5.862202  0.7122801  4.079466
##   70  5.872551  0.7118770  4.087277
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 2.
# Plot the tuning results of the Rsquared-selected model.
plot(x = fit)

# Predict on the held-out test rows and compute the test RMSE.
p <- predict(fit, newdata = test)

RMSE(pred = p, obs = test$medv)
## [1] 6.151268
# Scatter plot of the test-set predictions `p` (y-axis) against the
# observed `medv` values (x-axis).
plot(p ~ test$medv)