Machine learning SVM modelling with Pima Indians Diabetes Data

library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)

## [1] 768   9

levels(PimaIndiansDiabetes$diabetes)

## [1] "neg" "pos"

head(PimaIndiansDiabetes)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

library(e1071) #SVM
library(caret) #select tuning parameters

## Loading required package: lattice

## Loading required package: ggplot2

library(reshape2) #assist in creating boxplots
library(ggplot2) #create boxplots
library(kernlab) #assist with SVM feature selection

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

pima.scale = as.data.frame(scale(PimaIndiansDiabetes[,-9]))
str(pima.scale)

## 'data.frame':    768 obs. of  8 variables:
##  $ pregnant: num  0.64 -0.844 1.233 -0.844 -1.141 ...
##  $ glucose : num  0.848 -1.123 1.942 -0.998 0.504 ...
##  $ pressure: num  0.15 -0.16 -0.264 -0.16 -1.504 ...
##  $ triceps : num  0.907 0.531 -1.287 0.154 0.907 ...
##  $ insulin : num  -0.692 -0.692 -0.692 0.123 0.765 ...
##  $ mass    : num  0.204 -0.684 -1.103 -0.494 1.409 ...
##  $ pedigree: num  0.468 -0.365 0.604 -0.92 5.481 ...
##  $ age     : num  1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...

pima.scale$diabetes = PimaIndiansDiabetes$diabetes

pima.scale.melt = melt(pima.scale, id.var="diabetes")
ggplot(data=pima.scale.melt, aes(x=diabetes, y=value)) +geom_boxplot()+facet_wrap(~variable, ncol=2)

cor(pima.scale[-9])

##             pregnant    glucose   pressure     triceps     insulin
## pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461
## glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711
## pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338
## triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257
## insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000
## mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906
## pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093
## age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295
##                mass    pedigree         age
## pregnant 0.01768309 -0.03352267  0.54434123
## glucose  0.22107107  0.13733730  0.26351432
## pressure 0.28180529  0.04126495  0.23952795
## triceps  0.39257320  0.18392757 -0.11397026
## insulin  0.19785906  0.18507093 -0.04216295
## mass     1.00000000  0.14064695  0.03624187
## pedigree 0.14064695  1.00000000  0.03356131
## age      0.03624187  0.03356131  1.00000000

table(pima.scale$diabetes)

## 
## neg pos 
## 500 268

set.seed(123)
ind = sample(2, nrow(pima.scale), replace=TRUE, prob=c(0.6,0.4))
train = pima.scale[ind==1,]
test = pima.scale[ind==2,]
str(train)

## 'data.frame':    468 obs. of  9 variables:
##  $ pregnant: num  0.64 1.233 0.343 -0.251 -0.548 ...
##  $ glucose : num  0.848 1.942 -0.153 -1.342 2.38 ...
##  $ pressure: num  0.1495 -0.2638 0.2529 -0.9871 0.0462 ...
##  $ triceps : num  0.907 -1.287 -1.287 0.719 1.534 ...
##  $ insulin : num  -0.6924 -0.6924 -0.6924 0.0712 4.0193 ...
##  $ mass    : num  0.204 -1.103 -0.811 -0.126 -0.189 ...
##  $ pedigree: num  0.468 0.604 -0.818 -0.676 -0.947 ...
##  $ age     : num  1.425 -0.106 -0.276 -0.616 1.68 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 2 2 2 ...

str(test)

## 'data.frame':    300 obs. of  9 variables:
##  $ pregnant: num  -0.844 -0.844 -1.141 1.827 0.046 ...
##  $ glucose : num  -1.123 -0.998 0.504 -0.184 -0.341 ...
##  $ pressure: num  -0.16 -0.16 -1.5 -3.57 1.18 ...
##  $ triceps : num  0.531 0.154 0.907 -1.287 -1.287 ...
##  $ insulin : num  -0.692 0.123 0.765 -0.692 -0.692 ...
##  $ mass    : num  -0.684 -0.494 1.409 0.42 0.711 ...
##  $ pedigree: num  -0.365 -0.92 5.481 -1.02 -0.848 ...
##  $ age     : num  -0.1905 -1.0409 -0.0205 -0.3606 -0.2756 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 1 2 2 1 1 ...

#SVM modelling
linear.tune = tune.svm(diabetes~., data=train, kernel="linear", cost=c(0.001, 0.01, 0.1, 1,5,10))
summary(linear.tune)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.2177151 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.3354764 0.04821124
## 2 1e-02 0.2177151 0.04908685
## 3 1e-01 0.2198890 0.04521036
## 4 1e+00 0.2241443 0.04535524
## 5 5e+00 0.2219704 0.04951808
## 6 1e+01 0.2219704 0.04951808

best.linear = linear.tune$best.model
tune.test = predict(best.linear, newdata=test)
table(tune.test, test$diabetes)

##          
## tune.test neg pos
##       neg 172  56
##       pos  17  55

(172+55)/300

## [1] 0.7566667

set.seed(123)
poly.tune = tune.svm(diabetes~., data=train, kernel="polynomial",
degree=c(3,4,5), coef0=c(0.1,0.5,1,2,3,4))
summary(poly.tune)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  degree coef0
##       4   0.5
## 
## - best performance: 0.230666 
## 
## - Detailed performance results:
##    degree coef0     error dispersion
## 1       3   0.1 0.2437095 0.03634583
## 2       4   0.1 0.2542091 0.05801823
## 3       5   0.1 0.2520814 0.05198025
## 4       3   0.5 0.2329787 0.04474148
## 5       4   0.5 0.2306660 0.04183322
## 6       5   0.5 0.2435708 0.05624789
## 7       3   1.0 0.2414894 0.04736281
## 8       4   1.0 0.2457909 0.04783015
## 9       5   1.0 0.2712303 0.06397011
## 10      3   2.0 0.2393154 0.04573676
## 11      4   2.0 0.2713691 0.05518721
## 12      5   2.0 0.3076318 0.05585784
## 13      3   3.0 0.2435245 0.05007886
## 14      4   3.0 0.2777983 0.05159247
## 15      5   3.0 0.3013414 0.04816053
## 16      3   4.0 0.2478261 0.04485808
## 17      4   4.0 0.3055967 0.05946087
## 18      5   4.0 0.3142923 0.04866626

best.poly = poly.tune$best.model
poly.test = predict(best.poly, newdata=test)
table(poly.test, test$diabetes)

##          
## poly.test neg pos
##       neg 163  60
##       pos  26  51

(163+51)/300

## [1] 0.7133333

set.seed(123)
rbf.tune = tune.svm(diabetes~., data=train, kernel="radial",
gamma=c(0.1,0.5,1,2,3,4))
summary(rbf.tune)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma
##    0.1
## 
## - best performance: 0.2222479 
## 
## - Detailed performance results:
##   gamma     error dispersion
## 1   0.1 0.2222479 0.04841258
## 2   0.5 0.2628122 0.04365929
## 3   1.0 0.3183164 0.06578581
## 4   2.0 0.3289547 0.05973120
## 5   3.0 0.3353377 0.05945045
## 6   4.0 0.3375116 0.05121227

best.rbf = rbf.tune$best.model
rbf.test = predict(best.rbf, newdata=test)
table(rbf.test, test$diabetes)

##         
## rbf.test neg pos
##      neg 164  47
##      pos  25  64

(164+64)/300

## [1] 0.76

set.seed(123)
sigmoid.tune = tune.svm(diabetes~., data=train, kernel="sigmoid",
gamma=c(0.1,0.5,1,2,3,4), coef0=c(0.1,0.5,1,2,3,4))
summary(sigmoid.tune)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma coef0
##    0.1     2
## 
## - best performance: 0.2820537 
## 
## - Detailed performance results:
##    gamma coef0     error dispersion
## 1    0.1   0.1 0.2843663 0.06979211
## 2    0.5   0.1 0.3353377 0.06958547
## 3    1.0   0.1 0.3309436 0.07070152
## 4    2.0   0.1 0.3481961 0.07264783
## 5    3.0   0.1 0.3460685 0.05427109
## 6    4.0   0.1 0.3501850 0.09174225
## 7    0.1   0.5 0.3246994 0.05506732
## 8    0.5   0.5 0.3482886 0.06027442
## 9    1.0   0.5 0.3482424 0.04240683
## 10   2.0   0.5 0.3396855 0.04005442
## 11   3.0   0.5 0.3481036 0.04991623
## 12   4.0   0.5 0.3545791 0.06896997
## 13   0.1   1.0 0.3311286 0.06631114
## 14   0.5   1.0 0.3631822 0.04765719
## 15   1.0   1.0 0.3482886 0.05582785
## 16   2.0   1.0 0.3440333 0.05257536
## 17   3.0   1.0 0.3673913 0.06558360
## 18   4.0   1.0 0.3588344 0.06077079
## 19   0.1   2.0 0.2820537 0.03986144
## 20   0.5   2.0 0.3635060 0.05632654
## 21   1.0   2.0 0.3435245 0.07228906
## 22   2.0   2.0 0.3353839 0.04425879
## 23   3.0   2.0 0.3716004 0.08374793
## 24   4.0   2.0 0.3290934 0.06687542
## 25   0.1   3.0 0.3353839 0.05436896
## 26   0.5   3.0 0.3590657 0.02755441
## 27   1.0   3.0 0.3481961 0.07392996
## 28   2.0   3.0 0.3826549 0.04728425
## 29   3.0   3.0 0.3527752 0.04713723
## 30   4.0   3.0 0.3673913 0.07081131
## 31   0.1   4.0 0.3353839 0.05436896
## 32   0.5   4.0 0.3333025 0.03484510
## 33   1.0   4.0 0.3735893 0.07522130
## 34   2.0   4.0 0.3825624 0.08169074
## 35   3.0   4.0 0.3054117 0.07465731
## 36   4.0   4.0 0.3502775 0.07330135

best.sigmoid = sigmoid.tune$best.model
sigmoid.test = predict(best.sigmoid, newdata=test)
table(sigmoid.test, test$diabetes)

##             
## sigmoid.test neg pos
##          neg 163  57
##          pos  26  54

(163+54)/300

## [1] 0.7233333

confusionMatrix(sigmoid.test, test$diabetes, positive="pos")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 163  57
##        pos  26  54
##                                          
##                Accuracy : 0.7233         
##                  95% CI : (0.669, 0.7732)
##     No Information Rate : 0.63           
##     P-Value [Acc > NIR] : 0.0004011      
##                                          
##                   Kappa : 0.3703         
##  Mcnemar's Test P-Value : 0.0009915      
##                                          
##             Sensitivity : 0.4865         
##             Specificity : 0.8624         
##          Pos Pred Value : 0.6750         
##          Neg Pred Value : 0.7409         
##              Prevalence : 0.3700         
##          Detection Rate : 0.1800         
##    Detection Prevalence : 0.2667         
##       Balanced Accuracy : 0.6745         
##                                          
##        'Positive' Class : pos            
##

confusionMatrix(rbf.test, test$diabetes, positive="pos")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 164  47
##        pos  25  64
##                                           
##                Accuracy : 0.76            
##                  95% CI : (0.7076, 0.8072)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : 1.033e-06       
##                                           
##                   Kappa : 0.4632          
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 0.5766          
##             Specificity : 0.8677          
##          Pos Pred Value : 0.7191          
##          Neg Pred Value : 0.7773          
##              Prevalence : 0.3700          
##          Detection Rate : 0.2133          
##    Detection Prevalence : 0.2967          
##       Balanced Accuracy : 0.7222          
##                                           
##        'Positive' Class : pos             
##

confusionMatrix(tune.test, test$diabetes, positive="pos")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 172  56
##        pos  17  55
##                                          
##                Accuracy : 0.7567         
##                  95% CI : (0.704, 0.8041)
##     No Information Rate : 0.63           
##     P-Value [Acc > NIR] : 1.933e-06      
##                                          
##                   Kappa : 0.4372         
##  Mcnemar's Test P-Value : 8.685e-06      
##                                          
##             Sensitivity : 0.4955         
##             Specificity : 0.9101         
##          Pos Pred Value : 0.7639         
##          Neg Pred Value : 0.7544         
##              Prevalence : 0.3700         
##          Detection Rate : 0.1833         
##    Detection Prevalence : 0.2400         
##       Balanced Accuracy : 0.7028         
##                                          
##        'Positive' Class : pos            
##

#feature selection for SVM
set.seed(123)
rfeCNTL = rfeControl(functions=lrFuncs, method="cv", number=10)
svm.features = rfe(train[,1:8], train[,9],sizes = c(8, 7, 6, 5, 4),
rfeControl = rfeCNTL, method = "svmLinear")

svm.features

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          4   0.7673 0.4451    0.04730  0.1097         
##          5   0.7695 0.4605    0.06477  0.1465         
##          6   0.7652 0.4455    0.05680  0.1366         
##          7   0.7758 0.4686    0.05393  0.1269        *
##          8   0.7737 0.4646    0.05502  0.1283         
## 
## The top 5 variables (out of 7):
##    glucose, mass, pedigree, pressure, age

svm.5 <- svm(diabetes~glucose+mass+pedigree+pressure+age, data=train, kernel="linear")
svm.5.predict <- predict(svm.5, newdata=test[c(2,6,7,3,8)])
table(svm.5.predict, test$diabetes)

##              
## svm.5.predict neg pos
##           neg 166  47
##           pos  23  64

(166+64)/300

## [1] 0.7666667

Machine learning SVM modelling with Pima Indians Diabetes Data

Kushan De Silva

August 4, 2017