Machine learning: KNN with Pima Indians Diabetes Data

library(mlbench)
# Load the Pima Indians Diabetes data set into the workspace.
data("PimaIndiansDiabetes")
# Number of rows and columns.
dim(PimaIndiansDiabetes)

## [1] 768   9

# Class labels of the outcome factor.
levels(PimaIndiansDiabetes[["diabetes"]])

## [1] "neg" "pos"

# Preview the first six observations.
head(PimaIndiansDiabetes, n = 6L)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

library(class) #k-nearest neighbors
library(kknn) #weighted k-nearest neighbors
library(e1071) #SVM
library(caret) #select tuning parameters

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'caret'

## The following object is masked from 'package:kknn':
## 
##     contr.dummy

library(reshape2) #assist in creating boxplots
library(ggplot2) #create boxplots
library(kernlab) #assist with SVM feature selection

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Reshape to long format (one row per observation/feature pair) so that each
# feature gets its own facet, then compare its distribution across outcomes.
pima.melt <- melt(PimaIndiansDiabetes, id.vars = "diabetes")
ggplot(pima.melt, aes(x = diabetes, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 2)

# Standardise the eight predictors (column 9, the outcome, is excluded) with
# scale()'s default centring and scaling, then inspect the result.
pima.scale <- as.data.frame(scale(PimaIndiansDiabetes[, -9]))
str(pima.scale)

## 'data.frame':    768 obs. of  8 variables:
##  $ pregnant: num  0.64 -0.844 1.233 -0.844 -1.141 ...
##  $ glucose : num  0.848 -1.123 1.942 -0.998 0.504 ...
##  $ pressure: num  0.15 -0.16 -0.264 -0.16 -1.504 ...
##  $ triceps : num  0.907 0.531 -1.287 0.154 0.907 ...
##  $ insulin : num  -0.692 -0.692 -0.692 0.123 0.765 ...
##  $ mass    : num  0.204 -0.684 -1.103 -0.494 1.409 ...
##  $ pedigree: num  0.468 -0.365 0.604 -0.92 5.481 ...
##  $ age     : num  1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...

# Re-attach the outcome factor, which was left out of the scaling step.
pima.scale[["diabetes"]] <- PimaIndiansDiabetes[["diabetes"]]

# Faceted boxplots of the standardised features, split by outcome.
pima.scale.melt <- melt(pima.scale, id.vars = "diabetes")
ggplot(pima.scale.melt, aes(x = diabetes, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 2)

# Pairwise correlations among the predictors (outcome column 9 excluded).
cor(pima.scale[-9])

##             pregnant    glucose   pressure     triceps     insulin
## pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461
## glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711
## pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338
## triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257
## insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000
## mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906
## pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093
## age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295
##                mass    pedigree         age
## pregnant 0.01768309 -0.03352267  0.54434123
## glucose  0.22107107  0.13733730  0.26351432
## pressure 0.28180529  0.04126495  0.23952795
## triceps  0.39257320  0.18392757 -0.11397026
## insulin  0.19785906  0.18507093 -0.04216295
## mass     1.00000000  0.14064695  0.03624187
## pedigree 0.14064695  1.00000000  0.03356131
## age      0.03624187  0.03356131  1.00000000

# Class balance of the outcome.
table(pima.scale[["diabetes"]])

## 
## neg pos 
## 500 268

# Randomly assign each row to group 1 (train, probability 0.7) or group 2
# (test, probability 0.3); the seed makes the assignment reproducible.
set.seed(123)
ind <- sample(
  x = 2,
  size = nrow(pima.scale),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
train <- pima.scale[ind == 1, ]
test <- pima.scale[ind == 2, ]
str(train)

## 'data.frame':    539 obs. of  9 variables:
##  $ pregnant: num  0.64 1.233 0.343 -0.251 -0.548 ...
##  $ glucose : num  0.848 1.942 -0.153 -1.342 2.38 ...
##  $ pressure: num  0.1495 -0.2638 0.2529 -0.9871 0.0462 ...
##  $ triceps : num  0.907 -1.287 -1.287 0.719 1.534 ...
##  $ insulin : num  -0.6924 -0.6924 -0.6924 0.0712 4.0193 ...
##  $ mass    : num  0.204 -1.103 -0.811 -0.126 -0.189 ...
##  $ pedigree: num  0.468 0.604 -0.818 -0.676 -0.947 ...
##  $ age     : num  1.425 -0.106 -0.276 -0.616 1.68 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 1 2 2 ...

str(test)

## 'data.frame':    229 obs. of  9 variables:
##  $ pregnant: num  -0.844 -0.844 -1.141 1.827 0.046 ...
##  $ glucose : num  -1.123 -0.998 0.504 -0.184 -0.341 ...
##  $ pressure: num  -0.16 -0.16 -1.5 -3.57 1.18 ...
##  $ triceps : num  0.531 0.154 0.907 -1.287 -1.287 ...
##  $ insulin : num  -0.692 0.123 0.765 -0.692 -0.692 ...
##  $ mass    : num  -0.684 -0.494 1.409 0.42 0.711 ...
##  $ pedigree: num  -0.365 -0.92 5.481 -1.02 -0.848 ...
##  $ age     : num  -0.1905 -1.0409 -0.0205 -0.3606 -0.2756 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 2 1 2 2 ...

# KNN modelling: let caret pick k from 2..20 by cross-validated accuracy.
grid1 <- expand.grid(.k = seq(from = 2, to = 20, by = 1))
control <- trainControl(method = "cv")
# Fix the seed so the cross-validation folds are reproducible.
set.seed(123)
knn.train <- train(
  diabetes ~ .,
  data = train,
  method = "knn",
  trControl = control,
  tuneGrid = grid1
)
knn.train

## k-Nearest Neighbors 
## 
## 539 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 486, 485, 484, 485, 486, 485, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    2  0.7179614  0.3639456
##    3  0.7197071  0.3477321
##    4  0.7163439  0.3483783
##    5  0.7405965  0.3897759
##    6  0.7440595  0.4011237
##    7  0.7441630  0.3923259
##    8  0.7384328  0.3738235
##    9  0.7553427  0.4202850
##   10  0.7535595  0.4090959
##   11  0.7572283  0.4150189
##   12  0.7514669  0.4039781
##   13  0.7588082  0.4158824
##   14  0.7477619  0.3826554
##   15  0.7570237  0.4113276
##   16  0.7533549  0.4030145
##   17  0.7439896  0.3799075
##   18  0.7494765  0.3889454
##   19  0.7477282  0.3829248
##   20  0.7477968  0.3856481
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was k = 13.

# Classify the hold-out rows using the k chosen during tuning (k = 13).
# Column 9 is the outcome: it is dropped from the features and used as labels.
knn.test <- knn(
  train = train[, -9],
  test = test[, -9],
  cl = train[, 9],
  k = 13
)
# Confusion matrix: rows are predictions, columns are the true labels.
table(knn.test, test$diabetes)

##         
## knn.test neg pos
##      neg 124  44
##      pos  21  40

# Hold-out accuracy: diagonal of the confusion matrix over all 229 test rows.
(124 + 40) / 229

## [1] 0.7161572

# Calculate Cohen's kappa from the hold-out confusion matrix
# (rows = predicted, columns = actual):
#          neg pos
#      neg 124  44
#      pos  21  40
prob.agree <- (124 + 40) / 229 # observed agreement (accuracy)
# Expected chance agreement sums over BOTH classes:
#   P(pred neg) * P(actual neg) + P(pred pos) * P(actual pos).
# Keeping only the "neg" term understates chance agreement and inflates kappa.
prob.chance <- ((124 + 44) / 229) * ((124 + 21) / 229) +
  ((21 + 40) / 229) * ((44 + 40) / 229)
prob.chance

## [1] 0.5622318

# kappa = (observed - expected) / (1 - expected).
# Note: this variable shadows base::kappa() for the rest of the session.
kappa <- (prob.agree - prob.chance) / (1 - prob.chance)
kappa

## [1] 0.3516139

# Weighted KNN: search k up to kmax = 25 across three kernels with Minkowski
# distance parameter 2, and report the best kernel/k combination.
set.seed(123)
kknn.train <- train.kknn(
  formula = diabetes ~ .,
  data = train,
  kmax = 25,
  distance = 2,
  kernel = c("rectangular", "triangular", "epanechnikov")
)
# Misclassification rate versus k for each kernel.
plot(kknn.train)

kknn.train

## 
## Call:
## train.kknn(formula = diabetes ~ ., data = train, kmax = 25, distance = 2,     kernel = c("rectangular", "triangular", "epanechnikov"))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.2393321
## Best kernel: rectangular
## Best k: 11

Machine learning: KNN with Pima Indians Diabetes Data

Kushan De Silva

August 4, 2017