library(mlbench)
# Load the Pima Indians diabetes data set that ships with mlbench.
data("PimaIndiansDiabetes", package = "mlbench")
# Number of rows and columns.
dim(PimaIndiansDiabetes)
## [1] 768 9
# Class labels of the outcome factor.
levels(PimaIndiansDiabetes[["diabetes"]])
## [1] "neg" "pos"
# Preview the first six rows.
head(PimaIndiansDiabetes, n = 6L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
library(class) #k-nearest neighbors
library(kknn) #weighted k-nearest neighbors
library(e1071) #SVM
library(caret) #select tuning parameters
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following object is masked from 'package:kknn':
##
## contr.dummy
library(reshape2) #assist in creating boxplots
library(ggplot2) #create boxplots
library(kernlab) #assist with SVM feature selection
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Reshape to long format (one row per observation/feature pair) so that each
# feature gets its own boxplot facet, split by diabetes status.
pima.melt <- melt(PimaIndiansDiabetes, id.vars = "diabetes")
ggplot(pima.melt, aes(x = diabetes, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 2)

# Centre and scale every predictor; the outcome column is left out by name.
pima.scale <- as.data.frame(
  scale(PimaIndiansDiabetes[, setdiff(names(PimaIndiansDiabetes), "diabetes")])
)
str(pima.scale)
## 'data.frame': 768 obs. of 8 variables:
## $ pregnant: num 0.64 -0.844 1.233 -0.844 -1.141 ...
## $ glucose : num 0.848 -1.123 1.942 -0.998 0.504 ...
## $ pressure: num 0.15 -0.16 -0.264 -0.16 -1.504 ...
## $ triceps : num 0.907 0.531 -1.287 0.154 0.907 ...
## $ insulin : num -0.692 -0.692 -0.692 0.123 0.765 ...
## $ mass : num 0.204 -0.684 -1.103 -0.494 1.409 ...
## $ pedigree: num 0.468 -0.365 0.604 -0.92 5.481 ...
## $ age : num 1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
# Re-attach the outcome, then redraw the per-feature boxplots on scaled data.
pima.scale[["diabetes"]] <- PimaIndiansDiabetes[["diabetes"]]
pima.scale.melt <- melt(pima.scale, id.vars = "diabetes")
ggplot(pima.scale.melt, aes(x = diabetes, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 2)

# Pairwise correlations between the predictors (outcome column dropped).
cor(pima.scale[, -9])
## pregnant glucose pressure triceps insulin
## pregnant 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461
## glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711
## pressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338
## triceps -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257
## insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000
## mass 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906
## pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093
## age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295
## mass pedigree age
## pregnant 0.01768309 -0.03352267 0.54434123
## glucose 0.22107107 0.13733730 0.26351432
## pressure 0.28180529 0.04126495 0.23952795
## triceps 0.39257320 0.18392757 -0.11397026
## insulin 0.19785906 0.18507093 -0.04216295
## mass 1.00000000 0.14064695 0.03624187
## pedigree 0.14064695 1.00000000 0.03356131
## age 0.03624187 0.03356131 1.00000000
# Class balance of the outcome.
table(pima.scale[["diabetes"]])
##
## neg pos
## 500 268
# Randomly label each row 1 (probability 0.7, training) or 2 (probability 0.3,
# test), then split the scaled data on that label.
set.seed(123)
ind <- sample(2, size = nrow(pima.scale), replace = TRUE, prob = c(0.7, 0.3))
train <- pima.scale[which(ind == 1), ]
test <- pima.scale[which(ind == 2), ]
# Structure of the training partition.
utils::str(train)
## 'data.frame': 539 obs. of 9 variables:
## $ pregnant: num 0.64 1.233 0.343 -0.251 -0.548 ...
## $ glucose : num 0.848 1.942 -0.153 -1.342 2.38 ...
## $ pressure: num 0.1495 -0.2638 0.2529 -0.9871 0.0462 ...
## $ triceps : num 0.907 -1.287 -1.287 0.719 1.534 ...
## $ insulin : num -0.6924 -0.6924 -0.6924 0.0712 4.0193 ...
## $ mass : num 0.204 -1.103 -0.811 -0.126 -0.189 ...
## $ pedigree: num 0.468 0.604 -0.818 -0.676 -0.947 ...
## $ age : num 1.425 -0.106 -0.276 -0.616 1.68 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 1 2 2 ...
# Structure of the held-out test partition.
utils::str(test)
## 'data.frame': 229 obs. of 9 variables:
## $ pregnant: num -0.844 -0.844 -1.141 1.827 0.046 ...
## $ glucose : num -1.123 -0.998 0.504 -0.184 -0.341 ...
## $ pressure: num -0.16 -0.16 -1.5 -3.57 1.18 ...
## $ triceps : num 0.531 0.154 0.907 -1.287 -1.287 ...
## $ insulin : num -0.692 0.123 0.765 -0.692 -0.692 ...
## $ mass : num -0.684 -0.494 1.409 0.42 0.711 ...
## $ pedigree: num -0.365 -0.92 5.481 -1.02 -0.848 ...
## $ age : num -0.1905 -1.0409 -0.0205 -0.3606 -0.2756 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 2 1 2 2 ...
# KNN modelling: tune k over 2..20 with caret, resampling by cross-validation.
grid1 <- expand.grid(.k = seq(from = 2, to = 20, by = 1))
control <- trainControl(method = "cv")
set.seed(123)
knn.train <- train(
  diabetes ~ .,
  data = train,
  method = "knn",
  trControl = control,
  tuneGrid = grid1
)
knn.train
## k-Nearest Neighbors
##
## 539 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 486, 485, 484, 485, 486, 485, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 2 0.7179614 0.3639456
## 3 0.7197071 0.3477321
## 4 0.7163439 0.3483783
## 5 0.7405965 0.3897759
## 6 0.7440595 0.4011237
## 7 0.7441630 0.3923259
## 8 0.7384328 0.3738235
## 9 0.7553427 0.4202850
## 10 0.7535595 0.4090959
## 11 0.7572283 0.4150189
## 12 0.7514669 0.4039781
## 13 0.7588082 0.4158824
## 14 0.7477619 0.3826554
## 15 0.7570237 0.4113276
## 16 0.7533549 0.4030145
## 17 0.7439896 0.3799075
## 18 0.7494765 0.3889454
## 19 0.7477282 0.3829248
## 20 0.7477968 0.3856481
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
# Classify the test rows with k = 13 (the value caret selected above), using
# the training predictors and the training outcome as class labels.
knn.test <- knn(train = train[, -9], test = test[, -9],
                cl = train[["diabetes"]], k = 13)
# Confusion matrix: rows are predictions, columns are the true labels.
table(knn.test, test[["diabetes"]])
##
## knn.test neg pos
## neg 124 44
## pos 21 40
# Test-set accuracy: correctly classified cases over all 229 test cases.
(124 + 40) / 229
## [1] 0.7161572
# Calculate Cohen's Kappa from the k = 13 confusion matrix printed above
# (rows = predicted class, columns = actual class).
conf.mat <- matrix(c(124, 21, 44, 40), nrow = 2,
                   dimnames = list(predicted = c("neg", "pos"),
                                   actual = c("neg", "pos")))
n.test <- sum(conf.mat)
prob.agree <- sum(diag(conf.mat)) / n.test # observed agreement (accuracy)
# Chance agreement has to be summed over BOTH classes:
#   P(pred neg) * P(actual neg) + P(pred pos) * P(actual pos).
# Using only the "neg" term understates chance agreement and inflates Kappa.
prob.chance <- sum(rowSums(conf.mat) * colSums(conf.mat)) / n.test^2
prob.chance
## [1] 0.5622318
kappa <- (prob.agree - prob.chance) / (1 - prob.chance)
kappa
## [1] 0.3516139
# Weighted KNN: let train.kknn search k up to 25 across three kernels
# (distance = 2 is the Minkowski distance parameter).
set.seed(123)
kknn.train <- train.kknn(
  formula = diabetes ~ .,
  data = train,
  kmax = 25,
  distance = 2,
  kernel = c("rectangular", "triangular", "epanechnikov")
)
plot(kknn.train)

kknn.train
##
## Call:
## train.kknn(formula = diabetes ~ ., data = train, kmax = 25, distance = 2, kernel = c("rectangular", "triangular", "epanechnikov"))
##
## Type of response variable: nominal
## Minimal misclassification: 0.2393321
## Best kernel: rectangular
## Best k: 11