library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768 9
levels(PimaIndiansDiabetes$diabetes)
## [1] "neg" "pos"
head(PimaIndiansDiabetes)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
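#note: zeros in glucose, pressure, triceps, insulin, and mass are
#physiologically implausible and are commonly treated as missing values in
#this dataset; a quick count of zero entries per predictor column:
colSums(PimaIndiansDiabetes[, 2:6] == 0)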
library(e1071) #SVM
library(caret) #select tuning parameters
## Loading required package: lattice
## Loading required package: ggplot2
library(reshape2) #assist in creating boxplots
library(ggplot2) #create boxplots
library(kernlab) #assist with SVM feature selection
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#center and scale the eight numeric predictors (column 9 is the response)
pima.scale = as.data.frame(scale(PimaIndiansDiabetes[,-9]))
str(pima.scale)
## 'data.frame': 768 obs. of 8 variables:
## $ pregnant: num 0.64 -0.844 1.233 -0.844 -1.141 ...
## $ glucose : num 0.848 -1.123 1.942 -0.998 0.504 ...
## $ pressure: num 0.15 -0.16 -0.264 -0.16 -1.504 ...
## $ triceps : num 0.907 0.531 -1.287 0.154 0.907 ...
## $ insulin : num -0.692 -0.692 -0.692 0.123 0.765 ...
## $ mass : num 0.204 -0.684 -1.103 -0.494 1.409 ...
## $ pedigree: num 0.468 -0.365 0.604 -0.92 5.481 ...
## $ age : num 1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
pima.scale$diabetes = PimaIndiansDiabetes$diabetes
pima.scale.melt = melt(pima.scale, id.vars="diabetes")
ggplot(data=pima.scale.melt, aes(x=diabetes, y=value)) +
  geom_boxplot() + facet_wrap(~variable, ncol=2)

cor(pima.scale[-9])
## pregnant glucose pressure triceps insulin
## pregnant 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461
## glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711
## pressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338
## triceps -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257
## insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000
## mass 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906
## pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093
## age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295
## mass pedigree age
## pregnant 0.01768309 -0.03352267 0.54434123
## glucose 0.22107107 0.13733730 0.26351432
## pressure 0.28180529 0.04126495 0.23952795
## triceps 0.39257320 0.18392757 -0.11397026
## insulin 0.19785906 0.18507093 -0.04216295
## mass 1.00000000 0.14064695 0.03624187
## pedigree 0.14064695 1.00000000 0.03356131
## age 0.03624187 0.03356131 1.00000000
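#the strongest pairwise correlation is pregnant/age (~0.54), so collinearity
#is mild; caret's findCorrelation flags predictors above a cutoff (none are
#expected here at the common 0.75 default)
findCorrelation(cor(pima.scale[-9]), cutoff=0.75)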
table(pima.scale$diabetes)
##
## neg pos
## 500 268
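#class proportions: 'neg' is the majority class at roughly 65%, which is
#the baseline accuracy any classifier must beat
prop.table(table(pima.scale$diabetes))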
set.seed(123)
ind = sample(2, nrow(pima.scale), replace=TRUE, prob=c(0.6,0.4))
train = pima.scale[ind==1,]
test = pima.scale[ind==2,]
str(train)
## 'data.frame': 468 obs. of 9 variables:
## $ pregnant: num 0.64 1.233 0.343 -0.251 -0.548 ...
## $ glucose : num 0.848 1.942 -0.153 -1.342 2.38 ...
## $ pressure: num 0.1495 -0.2638 0.2529 -0.9871 0.0462 ...
## $ triceps : num 0.907 -1.287 -1.287 0.719 1.534 ...
## $ insulin : num -0.6924 -0.6924 -0.6924 0.0712 4.0193 ...
## $ mass : num 0.204 -1.103 -0.811 -0.126 -0.189 ...
## $ pedigree: num 0.468 0.604 -0.818 -0.676 -0.947 ...
## $ age : num 1.425 -0.106 -0.276 -0.616 1.68 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 2 2 2 ...
str(test)
## 'data.frame': 300 obs. of 9 variables:
## $ pregnant: num -0.844 -0.844 -1.141 1.827 0.046 ...
## $ glucose : num -1.123 -0.998 0.504 -0.184 -0.341 ...
## $ pressure: num -0.16 -0.16 -1.5 -3.57 1.18 ...
## $ triceps : num 0.531 0.154 0.907 -1.287 -1.287 ...
## $ insulin : num -0.692 0.123 0.765 -0.692 -0.692 ...
## $ mass : num -0.684 -0.494 1.409 0.42 0.711 ...
## $ pedigree: num -0.365 -0.92 5.481 -1.02 -0.848 ...
## $ age : num -0.1905 -1.0409 -0.0205 -0.3606 -0.2756 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 1 2 2 1 1 ...
#SVM modelling
#tune a linear-kernel SVM over cost with 10-fold cross-validation
#(note: unlike the later kernels, no seed is set before this call, so
#these folds are not reproducible)
linear.tune = tune.svm(diabetes~., data=train, kernel="linear",
                       cost=c(0.001, 0.01, 0.1, 1, 5, 10))
summary(linear.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.2177151
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.3354764 0.04821124
## 2 1e-02 0.2177151 0.04908685
## 3 1e-01 0.2198890 0.04521036
## 4 1e+00 0.2241443 0.04535524
## 5 5e+00 0.2219704 0.04951808
## 6 1e+01 0.2219704 0.04951808
best.linear = linear.tune$best.model
tune.test = predict(best.linear, newdata=test)
table(tune.test, test$diabetes)
##
## tune.test neg pos
## neg 172 56
## pos 17 55
(172+55)/300
## [1] 0.7566667
set.seed(123)
#polynomial kernel: tune degree and coef0
poly.tune = tune.svm(diabetes~., data=train, kernel="polynomial",
                     degree=c(3,4,5), coef0=c(0.1,0.5,1,2,3,4))
summary(poly.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## degree coef0
## 4 0.5
##
## - best performance: 0.230666
##
## - Detailed performance results:
## degree coef0 error dispersion
## 1 3 0.1 0.2437095 0.03634583
## 2 4 0.1 0.2542091 0.05801823
## 3 5 0.1 0.2520814 0.05198025
## 4 3 0.5 0.2329787 0.04474148
## 5 4 0.5 0.2306660 0.04183322
## 6 5 0.5 0.2435708 0.05624789
## 7 3 1.0 0.2414894 0.04736281
## 8 4 1.0 0.2457909 0.04783015
## 9 5 1.0 0.2712303 0.06397011
## 10 3 2.0 0.2393154 0.04573676
## 11 4 2.0 0.2713691 0.05518721
## 12 5 2.0 0.3076318 0.05585784
## 13 3 3.0 0.2435245 0.05007886
## 14 4 3.0 0.2777983 0.05159247
## 15 5 3.0 0.3013414 0.04816053
## 16 3 4.0 0.2478261 0.04485808
## 17 4 4.0 0.3055967 0.05946087
## 18 5 4.0 0.3142923 0.04866626
best.poly = poly.tune$best.model
poly.test = predict(best.poly, newdata=test)
table(poly.test, test$diabetes)
##
## poly.test neg pos
## neg 163 60
## pos 26 51
(163+51)/300
## [1] 0.7133333
set.seed(123)
#radial basis function kernel: tune gamma
rbf.tune = tune.svm(diabetes~., data=train, kernel="radial",
                    gamma=c(0.1,0.5,1,2,3,4))
summary(rbf.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma
## 0.1
##
## - best performance: 0.2222479
##
## - Detailed performance results:
## gamma error dispersion
## 1 0.1 0.2222479 0.04841258
## 2 0.5 0.2628122 0.04365929
## 3 1.0 0.3183164 0.06578581
## 4 2.0 0.3289547 0.05973120
## 5 3.0 0.3353377 0.05945045
## 6 4.0 0.3375116 0.05121227
best.rbf = rbf.tune$best.model
rbf.test = predict(best.rbf, newdata=test)
table(rbf.test, test$diabetes)
##
## rbf.test neg pos
## neg 164 47
## pos 25 64
(164+64)/300
## [1] 0.76
set.seed(123)
#sigmoid kernel: tune gamma and coef0
sigmoid.tune = tune.svm(diabetes~., data=train, kernel="sigmoid",
                        gamma=c(0.1,0.5,1,2,3,4), coef0=c(0.1,0.5,1,2,3,4))
summary(sigmoid.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma coef0
## 0.1 2
##
## - best performance: 0.2820537
##
## - Detailed performance results:
## gamma coef0 error dispersion
## 1 0.1 0.1 0.2843663 0.06979211
## 2 0.5 0.1 0.3353377 0.06958547
## 3 1.0 0.1 0.3309436 0.07070152
## 4 2.0 0.1 0.3481961 0.07264783
## 5 3.0 0.1 0.3460685 0.05427109
## 6 4.0 0.1 0.3501850 0.09174225
## 7 0.1 0.5 0.3246994 0.05506732
## 8 0.5 0.5 0.3482886 0.06027442
## 9 1.0 0.5 0.3482424 0.04240683
## 10 2.0 0.5 0.3396855 0.04005442
## 11 3.0 0.5 0.3481036 0.04991623
## 12 4.0 0.5 0.3545791 0.06896997
## 13 0.1 1.0 0.3311286 0.06631114
## 14 0.5 1.0 0.3631822 0.04765719
## 15 1.0 1.0 0.3482886 0.05582785
## 16 2.0 1.0 0.3440333 0.05257536
## 17 3.0 1.0 0.3673913 0.06558360
## 18 4.0 1.0 0.3588344 0.06077079
## 19 0.1 2.0 0.2820537 0.03986144
## 20 0.5 2.0 0.3635060 0.05632654
## 21 1.0 2.0 0.3435245 0.07228906
## 22 2.0 2.0 0.3353839 0.04425879
## 23 3.0 2.0 0.3716004 0.08374793
## 24 4.0 2.0 0.3290934 0.06687542
## 25 0.1 3.0 0.3353839 0.05436896
## 26 0.5 3.0 0.3590657 0.02755441
## 27 1.0 3.0 0.3481961 0.07392996
## 28 2.0 3.0 0.3826549 0.04728425
## 29 3.0 3.0 0.3527752 0.04713723
## 30 4.0 3.0 0.3673913 0.07081131
## 31 0.1 4.0 0.3353839 0.05436896
## 32 0.5 4.0 0.3333025 0.03484510
## 33 1.0 4.0 0.3735893 0.07522130
## 34 2.0 4.0 0.3825624 0.08169074
## 35 3.0 4.0 0.3054117 0.07465731
## 36 4.0 4.0 0.3502775 0.07330135
best.sigmoid = sigmoid.tune$best.model
sigmoid.test = predict(best.sigmoid, newdata=test)
table(sigmoid.test, test$diabetes)
##
## sigmoid.test neg pos
## neg 163 57
## pos 26 54
(163+54)/300
## [1] 0.7233333
confusionMatrix(sigmoid.test, test$diabetes, positive="pos")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 163 57
## pos 26 54
##
## Accuracy : 0.7233
## 95% CI : (0.669, 0.7732)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 0.0004011
##
## Kappa : 0.3703
## Mcnemar's Test P-Value : 0.0009915
##
## Sensitivity : 0.4865
## Specificity : 0.8624
## Pos Pred Value : 0.6750
## Neg Pred Value : 0.7409
## Prevalence : 0.3700
## Detection Rate : 0.1800
## Detection Prevalence : 0.2667
## Balanced Accuracy : 0.6745
##
## 'Positive' Class : pos
##
confusionMatrix(rbf.test, test$diabetes, positive="pos")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 164 47
## pos 25 64
##
## Accuracy : 0.76
## 95% CI : (0.7076, 0.8072)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 1.033e-06
##
## Kappa : 0.4632
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 0.5766
## Specificity : 0.8677
## Pos Pred Value : 0.7191
## Neg Pred Value : 0.7773
## Prevalence : 0.3700
## Detection Rate : 0.2133
## Detection Prevalence : 0.2967
## Balanced Accuracy : 0.7222
##
## 'Positive' Class : pos
##
confusionMatrix(tune.test, test$diabetes, positive="pos")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 172 56
## pos 17 55
##
## Accuracy : 0.7567
## 95% CI : (0.704, 0.8041)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 1.933e-06
##
## Kappa : 0.4372
## Mcnemar's Test P-Value : 8.685e-06
##
## Sensitivity : 0.4955
## Specificity : 0.9101
## Pos Pred Value : 0.7639
## Neg Pred Value : 0.7544
## Prevalence : 0.3700
## Detection Rate : 0.1833
## Detection Prevalence : 0.2400
## Balanced Accuracy : 0.7028
##
## 'Positive' Class : pos
##
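#pROC was loaded above but never used; as a sketch, the models can also be
#compared on AUC, using the SVM decision values as scores (decision.values
#is part of e1071's predict.svm interface; pROC picks the direction)
rbf.scores = attr(predict(best.rbf, newdata=test, decision.values=TRUE),
                  "decision.values")[,1]
auc(roc(test$diabetes, rbf.scores, levels=c("neg","pos")))
linear.scores = attr(predict(best.linear, newdata=test, decision.values=TRUE),
                     "decision.values")[,1]
auc(roc(test$diabetes, linear.scores, levels=c("neg","pos")))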
#feature selection via recursive feature elimination (RFE) with caret
#(note: lrFuncs ranks features with logistic regression; the
#method="svmLinear" argument is absorbed by rfe's ... and not used by
#lrFuncs, so the wrapper model here is not itself an SVM)
set.seed(123)
rfeCNTL = rfeControl(functions=lrFuncs, method="cv", number=10)
svm.features = rfe(train[,1:8], train[,9], sizes=c(8, 7, 6, 5, 4),
                   rfeControl=rfeCNTL, method="svmLinear")
svm.features
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 4 0.7673 0.4451 0.04730 0.1097
## 5 0.7695 0.4605 0.06477 0.1465
## 6 0.7652 0.4455 0.05680 0.1366
## 7 0.7758 0.4686 0.05393 0.1269 *
## 8 0.7737 0.4646 0.05502 0.1283
##
## The top 5 variables (out of 7):
## glucose, mass, pedigree, pressure, age
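#optional sketch: visualize the cross-validated accuracy profile across
#subset sizes with caret's plot method for rfe objects
plot(svm.features, type=c("g", "o"))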
svm.5 = svm(diabetes~glucose+mass+pedigree+pressure+age, data=train, kernel="linear")
svm.5.predict = predict(svm.5, newdata=test[, c("glucose","mass","pedigree","pressure","age")])
table(svm.5.predict, test$diabetes)
##
## svm.5.predict neg pos
## neg 166 47
## pos 23 64
(166+64)/300
## [1] 0.7666667
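#for comparability with the earlier kernels, the same caret summary can be
#applied to the reduced five-feature model
confusionMatrix(svm.5.predict, test$diabetes, positive="pos")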