library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.3
data <- read.csv('F:/Machine Learning/Data Science/Machine Learning/KNN/binary.csv')
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
#replace admit '0' & '1' with 'No' & 'Yes' respectively
data$admit[data$admit==0] <- 'No'
data$admit[data$admit==1] <- 'Yes'
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: chr "No" "Yes" "Yes" "Yes" ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
#convert chr to factor
data$admit <- as.factor(data$admit)
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 1 2 1 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
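As an aside, the two recoding steps above can be collapsed into a single factor() call that maps the 0/1 codes straight to labelled levels. A minimal sketch with the same end result, assuming admit still holds the raw integer codes:
#one-step alternative: recode 0/1 directly to a labelled factor
data$admit <- factor(data$admit, levels = c(0, 1), labels = c('No', 'Yes'))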
#Data Partition
set.seed(1234)
ind <- sample(2, nrow(data), replace = T, prob=c(0.7,0.3))
train <- data[ind==1,]
test <- data[ind==2,]
dim(train)
## [1] 284 4
dim(test)
## [1] 116 4
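Note that sample() splits the rows without regard to class balance. caret also provides createDataPartition(), which does a stratified split that preserves the No/Yes proportions in both partitions. A short sketch, shown only as an alternative (the name 'idx' is illustrative; the rest of this post keeps the sample()-based split above):
#stratified alternative: keeps the No/Yes ratio similar in train and test
set.seed(1234)
idx <- createDataPartition(data$admit, p = 0.7, list = FALSE)
train <- data[idx, ]
test <- data[-idx, ]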
#repeated cross-validation method
trcontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 3)
#KNN for classification
#by default caret uses the 'Accuracy' metric to select the optimal 'k'
#preProc centers and scales (standardizes) the predictors
set.seed(222)
fit <- train(admit ~ .,
             data = train,
             method = 'knn',
             tuneLength = 20,
             trControl = trcontrol,
             preProc = c("center", "scale"))
fit
## k-Nearest Neighbors
##
## 284 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 256, 256, 256, 256, 256, 255, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6358785 0.09112381
## 7 0.6676108 0.14676016
## 9 0.6795156 0.16944350
## 11 0.6792282 0.16232672
## 13 0.6651067 0.12648903
## 15 0.6768883 0.14837562
## 17 0.6826765 0.16001288
## 19 0.6816502 0.14927686
## 21 0.6794745 0.13529673
## 23 0.6830049 0.13663649
## 25 0.6899425 0.15415306
## 27 0.6875616 0.14970069
## 29 0.7027915 0.18620447
## 31 0.6981117 0.16567350
## 33 0.7028325 0.17997849
## 35 0.6958128 0.15765299
## 37 0.6946223 0.15615233
## 39 0.6958949 0.15870467
## 41 0.6912151 0.14179428
## 43 0.6923235 0.14317763
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 33.
So the model achieves its highest cross-validated accuracy at k = 33.
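The selected k and its resampled accuracy can also be read off the fit object directly instead of scanning the table; for example:
#programmatic access to the tuning results
fit$bestTune #the selected value of k
max(fit$results$Accuracy) #best cross-validated accuracy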
#Model Performance
plot(fit)
#feature importance
varImp(fit)
## ROC curve variable importance
##
## Importance
## gpa 100.00
## rank 25.18
## gre 0.00
Of the three features in the model, gpa contributes the most to predicting the outcome variable, while gre contributes the least.
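The same importance scores can be plotted, which scales better when there are many predictors; for example:
#bar chart of the ROC-based variable importance
plot(varImp(fit))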
#prediction on train data
p1 <- predict(fit , train)
head(p1)
## [1] No No Yes No No No
## Levels: No Yes
#confusion matrix on train data
confusionMatrix(p1, train$admit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 180 72
## Yes 11 21
##
## Accuracy : 0.7077
## 95% CI : (0.6511, 0.76)
## No Information Rate : 0.6725
## P-Value [Acc > NIR] : 0.1141
##
## Kappa : 0.2022
##
## Mcnemar's Test P-Value : 4.523e-11
##
## Sensitivity : 0.9424
## Specificity : 0.2258
## Pos Pred Value : 0.7143
## Neg Pred Value : 0.6562
## Prevalence : 0.6725
## Detection Rate : 0.6338
## Detection Prevalence : 0.8873
## Balanced Accuracy : 0.5841
##
## 'Positive' Class : No
##
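Since the 'Positive' class here is No, the sensitivity and specificity above come straight from the columns of the table; as a quick check:
#sensitivity: correctly predicted 'No' out of all actual 'No'
180 / (180 + 11) #0.9424
#specificity: correctly predicted 'Yes' out of all actual 'Yes'
21 / (72 + 21) #0.2258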
#Prediction on test data
p <- predict(fit, test)
head(p)
## [1] No No No Yes No No
## Levels: No Yes
#confusion matrix
confusionMatrix(p, test$admit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 79 30
## Yes 3 4
##
## Accuracy : 0.7155
## 95% CI : (0.6243, 0.7954)
## No Information Rate : 0.7069
## P-Value [Acc > NIR] : 0.465
##
## Kappa : 0.1056
##
## Mcnemar's Test P-Value : 6.011e-06
##
## Sensitivity : 0.9634
## Specificity : 0.1176
## Pos Pred Value : 0.7248
## Neg Pred Value : 0.5714
## Prevalence : 0.7069
## Detection Rate : 0.6810
## Detection Prevalence : 0.9397
## Balanced Accuracy : 0.5405
##
## 'Positive' Class : No
##
So the model achieved 71.55% accuracy on the test (unseen) data.
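The pROC package loaded at the top has not been used yet; one natural use is to score the test set with class probabilities and draw the ROC curve. A minimal sketch (the names 'prob' and 'r' are illustrative):
#ROC curve and AUC on the test data
prob <- predict(fit, test, type = 'prob')
r <- roc(test$admit, prob$Yes)
plot(r)
auc(r)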
#repeated cross-validation method
trcontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 3,
                          classProbs = TRUE,
                          summaryFunction = twoClassSummary)
#use the 'ROC' metric for selecting the optimal k
#note: the explicit tuneGrid below overrides tuneLength
fit <- train(admit ~ .,
             data = train,
             method = 'knn',
             tuneLength = 20,
             trControl = trcontrol,
             preProc = c("center", "scale"),
             metric = 'ROC',
             tuneGrid = expand.grid(k = 1:60))
fit
## k-Nearest Neighbors
##
## 284 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 255, 255, 255, 256, 256, 255, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.5378363 0.7071053 0.3707407
## 2 0.5731954 0.7187719 0.3825926
## 3 0.5829990 0.8041228 0.3481481
## 4 0.5710941 0.7674561 0.3262963
## 5 0.5901174 0.8356140 0.2677778
## 6 0.6044347 0.8339474 0.2900000
## 7 0.6095132 0.8689474 0.2829630
## 8 0.6115249 0.8638596 0.2659259
## 9 0.6169859 0.8845614 0.2722222
## 10 0.6116857 0.8724561 0.2677778
## 11 0.6179893 0.8778070 0.2696296
## 12 0.6134532 0.8776316 0.2766667
## 13 0.6206277 0.8849123 0.2559259
## 14 0.6174615 0.8864912 0.2696296
## 15 0.6232125 0.8935965 0.2444444
## 16 0.6254035 0.8847368 0.2444444
## 17 0.6356871 0.8881579 0.2233333
## 18 0.6388704 0.9003509 0.2307407
## 19 0.6413163 0.9019298 0.2196296
## 20 0.6460658 0.9022807 0.2018519
## 21 0.6494795 0.9072807 0.2122222
## 22 0.6446433 0.9107895 0.1911111
## 23 0.6518255 0.9264912 0.1848148
## 24 0.6533587 0.9211404 0.2029630
## 25 0.6555731 0.9316667 0.1844444
## 26 0.6564059 0.9264035 0.1951852
## 27 0.6624483 0.9316667 0.1914815
## 28 0.6706979 0.9386842 0.1911111
## 29 0.6727271 0.9421930 0.1874074
## 30 0.6732778 0.9475439 0.1877778
## 31 0.6717943 0.9386842 0.1833333
## 32 0.6700127 0.9369298 0.1762963
## 33 0.6700292 0.9421930 0.1914815
## 34 0.6707407 0.9439474 0.1803704
## 35 0.6714201 0.9439474 0.1840741
## 36 0.6743977 0.9421930 0.1692593
## 37 0.6718114 0.9439474 0.1881481
## 38 0.6700132 0.9386842 0.1811111
## 39 0.6689196 0.9351754 0.1840741
## 40 0.6676696 0.9386842 0.1844444
## 41 0.6688177 0.9369298 0.1814815
## 42 0.6650663 0.9404386 0.1637037
## 43 0.6641969 0.9439474 0.1711111
## 44 0.6670361 0.9457018 0.1525926
## 45 0.6698216 0.9492105 0.1637037
## 46 0.6705361 0.9527193 0.1674074
## 47 0.6728845 0.9492105 0.1488889
## 48 0.6719610 0.9492105 0.1418519
## 49 0.6711949 0.9509649 0.1451852
## 50 0.6735648 0.9527193 0.1414815
## 51 0.6740136 0.9527193 0.1414815
## 52 0.6740721 0.9492105 0.1377778
## 53 0.6777057 0.9509649 0.1414815
## 54 0.6776910 0.9544737 0.1414815
## 55 0.6742919 0.9579825 0.1303704
## 56 0.6719479 0.9579825 0.1344444
## 57 0.6724103 0.9614912 0.1381481
## 58 0.6693163 0.9579825 0.1307407
## 59 0.6695161 0.9597368 0.1344444
## 60 0.6725073 0.9649123 0.1125926
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 53.
Now the tuning results above are based on the ROC metric (area under the ROC curve), reported together with sensitivity and specificity for each k.
#plot
plot(fit)
#feature importance based on 'ROC' metric
varImp(fit)
## ROC curve variable importance
##
## Importance
## gpa 100.00
## rank 25.18
## gre 0.00
#prediction on train data
p2 <- predict(fit, train)
confusionMatrix(p2, train$admit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 182 79
## Yes 9 14
##
## Accuracy : 0.6901
## 95% CI : (0.6328, 0.7434)
## No Information Rate : 0.6725
## P-Value [Acc > NIR] : 0.2864
##
## Kappa : 0.1282
##
## Mcnemar's Test P-Value : 1.903e-13
##
## Sensitivity : 0.9529
## Specificity : 0.1505
## Pos Pred Value : 0.6973
## Neg Pred Value : 0.6087
## Prevalence : 0.6725
## Detection Rate : 0.6408
## Detection Prevalence : 0.9190
## Balanced Accuracy : 0.5517
##
## 'Positive' Class : No
##
#Prediction on test data
p4 <- predict(fit, test)
confusionMatrix(p4, test$admit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 81 32
## Yes 1 2
##
## Accuracy : 0.7155
## 95% CI : (0.6243, 0.7954)
## No Information Rate : 0.7069
## P-Value [Acc > NIR] : 0.465
##
## Kappa : 0.0636
##
## Mcnemar's Test P-Value : 1.767e-07
##
## Sensitivity : 0.98780
## Specificity : 0.05882
## Pos Pred Value : 0.71681
## Neg Pred Value : 0.66667
## Prevalence : 0.70690
## Detection Rate : 0.69828
## Detection Prevalence : 0.97414
## Balanced Accuracy : 0.52331
##
## 'Positive' Class : No
##
The model again achieved 71.55% accuracy on unseen data with the “ROC” metric, the same overall accuracy as with the “accuracy” metric, but with higher sensitivity (0.9878 vs 0.9634) and lower specificity (0.0588 vs 0.1176).
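The very low specificity reflects the class imbalance: with roughly 70% No cases, the default 0.5 cutoff almost always predicts No. One way to trade some sensitivity for specificity is to classify from the predicted probabilities with a lower cutoff. A sketch, where 0.35 is an arbitrary illustrative threshold that should really be tuned on validation data rather than the test set:
#threshold tuning sketch: lower the cutoff for predicting 'Yes'
prob <- predict(fit, test, type = 'prob')
cutoff <- 0.35 #illustrative value only
pred <- factor(ifelse(prob$Yes >= cutoff, 'Yes', 'No'), levels = c('No', 'Yes'))
confusionMatrix(pred, test$admit)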
data("BostonHousing")
data <- BostonHousing
str(data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(1234)
ind <- sample(2, nrow(data), replace = T, prob=c(0.7,0.3))
train <- data[ind==1,]
test <- data[ind==2,]
#repeated cross-validation method
trcontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 3)
#KNN for regression; by default caret uses the RMSE metric to select 'k'
set.seed(333)
fit <- train(medv ~ .,
             data = train,
             tuneGrid = expand.grid(k = 1:70),
             method = 'knn',
             trControl = trcontrol,
             preProc = c('center', 'scale'))
#Model performance
fit
## k-Nearest Neighbors
##
## 355 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 319, 320, 319, 320, 321, 319, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 4.220233 0.7893295 2.772957
## 2 3.865542 0.8121023 2.645454
## 3 3.934359 0.8120572 2.622213
## 4 4.056351 0.8099926 2.685353
## 5 4.114220 0.8104629 2.740373
## 6 4.238611 0.7955515 2.846685
## 7 4.251670 0.7944795 2.860067
## 8 4.256004 0.7954250 2.887005
## 9 4.251901 0.7993392 2.892098
## 10 4.231215 0.8026196 2.888946
## 11 4.271776 0.7995826 2.925118
## 12 4.320730 0.7944261 2.958073
## 13 4.347364 0.7929000 2.974351
## 14 4.383062 0.7918880 3.019517
## 15 4.419955 0.7909817 3.060170
## 16 4.454649 0.7889415 3.080520
## 17 4.510544 0.7849833 3.125955
## 18 4.559126 0.7833061 3.162832
## 19 4.590455 0.7818717 3.190647
## 20 4.618037 0.7815097 3.218698
## 21 4.657021 0.7778881 3.243445
## 22 4.691509 0.7777435 3.278319
## 23 4.735923 0.7756833 3.312395
## 24 4.781045 0.7728091 3.347412
## 25 4.811122 0.7711752 3.378069
## 26 4.849810 0.7697044 3.401861
## 27 4.881436 0.7684275 3.430206
## 28 4.906460 0.7692417 3.445831
## 29 4.937899 0.7698780 3.468374
## 30 4.975059 0.7673481 3.494391
## 31 5.009274 0.7663530 3.515671
## 32 5.037252 0.7663301 3.536316
## 33 5.064711 0.7659840 3.547531
## 34 5.097279 0.7652401 3.568150
## 35 5.129075 0.7637818 3.589298
## 36 5.163558 0.7616490 3.613341
## 37 5.199721 0.7600383 3.641035
## 38 5.236509 0.7577097 3.664487
## 39 5.275571 0.7551501 3.689218
## 40 5.299488 0.7533839 3.704732
## 41 5.333833 0.7506246 3.720636
## 42 5.361248 0.7490764 3.737189
## 43 5.392890 0.7467751 3.758837
## 44 5.421543 0.7443202 3.782809
## 45 5.453338 0.7418674 3.801114
## 46 5.470509 0.7407170 3.808809
## 47 5.498008 0.7382269 3.822770
## 48 5.525593 0.7357696 3.844304
## 49 5.545909 0.7351324 3.857904
## 50 5.575568 0.7325247 3.879289
## 51 5.598911 0.7311560 3.894973
## 52 5.617319 0.7288680 3.905818
## 53 5.636814 0.7279102 3.924568
## 54 5.655302 0.7263812 3.937522
## 55 5.669892 0.7258178 3.946392
## 56 5.681869 0.7263559 3.955676
## 57 5.693715 0.7256033 3.962371
## 58 5.715992 0.7236138 3.978380
## 59 5.735040 0.7225544 3.991542
## 60 5.754101 0.7220259 4.003576
## 61 5.774510 0.7205421 4.015272
## 62 5.795145 0.7187276 4.029275
## 63 5.804344 0.7186301 4.035603
## 64 5.815839 0.7181030 4.042268
## 65 5.830220 0.7174876 4.049738
## 66 5.843720 0.7169533 4.062396
## 67 5.853462 0.7164493 4.068685
## 68 5.866166 0.7159775 4.075606
## 69 5.878718 0.7160276 4.085416
## 70 5.887376 0.7152239 4.093900
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
#plot
plot(fit)
We can see that RMSE, where lower is better, is at its lowest for small values of ‘k’ and then increases steadily as ‘k’ grows.
#important features
varImp(fit)
## loess r-squared variable importance
##
## Overall
## lstat 100.00
## rm 94.28
## indus 88.94
## tax 69.84
## ptratio 69.20
## rad 42.23
## zn 38.95
## crim 35.25
## nox 31.91
## b 24.68
## age 22.83
## dis 19.78
## chas 0.00
lstat is the most important variable, whereas chas appears to be the least important variable in this dataset.
#prediction on train data
p1 <- predict(fit, train)
RMSE(p1, train$medv)
## [1] 2.111708
#prediction
p <- predict(fit, test)
RMSE(p, test$medv)
## [1] 6.151268
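The test RMSE (6.15) is nearly three times the train RMSE (2.11), a sign that k = 2 is flexible enough to fit noise in the training data. Note also that RMSE is simply the square root of the mean squared error, so the caret helper can be verified by hand:
#RMSE computed manually, equivalent to caret::RMSE(p, test$medv)
sqrt(mean((p - test$medv)^2))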
#plot
plot(p ~ test$medv)
The plot shows how the model performs: the predictions are clearly not perfect, so there is scope for improvement.
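A 45-degree reference line makes over- and under-prediction easier to see; adding it after the plot call above is a one-liner:
#points on this line are perfect predictions (predicted = actual)
abline(0, 1, col = 'red')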
#R-squared metric
fit <- train(medv ~ .,
             data = train,
             tuneGrid = expand.grid(k = 1:70),
             method = 'knn',
             metric = "Rsquared",
             trControl = trcontrol,
             preProc = c('center', 'scale'))
#model performance
fit
## k-Nearest Neighbors
##
## 355 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 319, 320, 319, 320, 319, 319, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 3.994455 0.7951351 2.637610
## 2 3.754720 0.8205465 2.527137
## 3 3.898348 0.8172672 2.579194
## 4 4.091573 0.8021918 2.707841
## 5 4.148494 0.8009280 2.755362
## 6 4.181742 0.7970423 2.797126
## 7 4.228585 0.7915498 2.827837
## 8 4.248178 0.7900817 2.859755
## 9 4.260175 0.7907409 2.868872
## 10 4.259691 0.7932396 2.873678
## 11 4.285392 0.7921388 2.891003
## 12 4.322493 0.7897200 2.928212
## 13 4.373486 0.7855017 2.979284
## 14 4.396708 0.7842051 3.013882
## 15 4.443218 0.7808016 3.058756
## 16 4.465601 0.7810004 3.080610
## 17 4.494113 0.7808069 3.101784
## 18 4.559028 0.7760992 3.155200
## 19 4.588414 0.7746603 3.181453
## 20 4.615645 0.7731785 3.207387
## 21 4.652467 0.7716002 3.233556
## 22 4.689769 0.7706785 3.268834
## 23 4.739586 0.7689167 3.307920
## 24 4.769788 0.7679972 3.335529
## 25 4.805783 0.7668204 3.358423
## 26 4.836154 0.7666792 3.381292
## 27 4.877529 0.7644500 3.418885
## 28 4.898705 0.7662903 3.443252
## 29 4.933702 0.7646452 3.469975
## 30 4.957030 0.7634719 3.483001
## 31 4.997993 0.7616180 3.505370
## 32 5.022626 0.7624679 3.519330
## 33 5.053992 0.7610815 3.539655
## 34 5.089068 0.7598263 3.559844
## 35 5.119412 0.7591416 3.585384
## 36 5.149067 0.7592888 3.605492
## 37 5.177847 0.7585224 3.625891
## 38 5.216942 0.7563849 3.647255
## 39 5.254944 0.7537319 3.670383
## 40 5.299148 0.7500426 3.699656
## 41 5.329022 0.7477060 3.718442
## 42 5.357098 0.7458972 3.740345
## 43 5.387161 0.7433551 3.757284
## 44 5.417467 0.7402409 3.779443
## 45 5.446810 0.7382343 3.798189
## 46 5.473329 0.7362620 3.816182
## 47 5.496075 0.7349448 3.831527
## 48 5.516363 0.7337574 3.839882
## 49 5.538929 0.7323922 3.857105
## 50 5.560418 0.7312524 3.876276
## 51 5.584433 0.7287451 3.889934
## 52 5.602280 0.7271014 3.902295
## 53 5.625391 0.7248296 3.920985
## 54 5.645916 0.7229610 3.934932
## 55 5.659777 0.7226568 3.943793
## 56 5.673651 0.7220088 3.952638
## 57 5.687988 0.7215625 3.959990
## 58 5.708186 0.7197333 3.973820
## 59 5.727482 0.7187173 3.982935
## 60 5.739980 0.7184282 3.995282
## 61 5.755116 0.7178123 4.004297
## 62 5.776096 0.7155370 4.017374
## 63 5.791640 0.7144609 4.028265
## 64 5.803123 0.7140652 4.036646
## 65 5.816088 0.7136431 4.045364
## 66 5.832064 0.7127467 4.055126
## 67 5.841452 0.7117901 4.059296
## 68 5.854516 0.7116302 4.071632
## 69 5.862202 0.7122801 4.079466
## 70 5.872551 0.7118770 4.087277
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 2.
#plot
plot(fit)
#prediction
p <- predict(fit, newdata = test)
RMSE(p, test$medv)
## [1] 6.151268
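The test RMSE is identical to the earlier run because the Rsquared metric also selected k = 2, so the final model, and hence its predictions, is the same.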
#plot
plot(p ~ test$medv)