---
title: "Machine Learning: Predicting Diabetes using the Pima Indians Data Set"
author: "Mano R"
date: "September 25, 2018"
output: html_document
---

library(neuralnet)   # loaded but not used below; the model is fit with ranger via caret
library(caret)       # createDataPartition(), train(), confusionMatrix()
library(tidyverse)
library(mlbench)     # provides the PimaIndiansDiabetes data set
library(e1071)
library(ggplot2)     # already attached above via tidyverse
data("PimaIndiansDiabetes")
df <- PimaIndiansDiabetes
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
ggplot(df, aes(diabetes, fill = diabetes)) + geom_bar()
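
The bar chart shows the imbalance between the two classes. The same information as counts and proportions (a minimal sketch using base R; output not shown here):

table(df$diabetes)
prop.table(table(df$diabetes))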

# encode the outcome as numeric 0/1 (neg = 0, pos = 1)
df$binary <- ifelse(df$diabetes == 'neg', 0, 1)
str(df)
## 'data.frame':    768 obs. of  10 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
##  $ binary  : num  1 0 1 0 1 0 1 0 1 1 ...
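
Several predictors contain zeros that are physiologically implausible (visible above in pressure, triceps, insulin and mass); in this data set such zeros are usually treated as missing values. A quick count per column (a sketch, column names taken from the str() output above; output not shown):

zero_cols <- c("glucose", "pressure", "triceps", "insulin", "mass")
colSums(df[, zero_cols] == 0)   # number of zero entries per predictor
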
# 70/30 train/test split
rows <- createDataPartition(df$binary, times = 1, p = 0.7, list = FALSE)
train <- df[rows, ]
test  <- df[-rows, ]
dim(train)
## [1] 538  10
dim(test)
## [1] 230  10
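
createDataPartition() stratifies on the outcome it is given; with a numeric vector such as binary it groups the values into quantiles, whereas partitioning on the factor keeps the split stratified by class. A sketch of that alternative (not used below; the seed is only illustrative):

set.seed(123)   # illustrative seed for a reproducible partition
rows <- createDataPartition(df$diabetes, p = 0.7, list = FALSE)
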
str(train)
## 'data.frame':    538 obs. of  10 variables:
##  $ pregnant: num  6 1 0 5 8 10 10 5 7 7 ...
##  $ glucose : num  148 89 137 116 125 168 139 166 100 107 ...
##  $ pressure: num  72 66 40 74 96 74 80 72 0 74 ...
##  $ triceps : num  35 23 35 0 0 0 0 19 0 0 ...
##  $ insulin : num  0 94 168 0 0 0 0 175 0 0 ...
##  $ mass    : num  33.6 28.1 43.1 25.6 0 38 27.1 25.8 30 29.6 ...
##  $ pedigree: num  0.627 0.167 2.288 0.201 0.232 ...
##  $ age     : num  50 21 33 30 54 34 57 51 32 31 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 2 1 2 2 2 ...
##  $ binary  : num  1 0 1 0 1 1 0 1 1 1 ...
# drop the 'diabetes' factor (column 9) so only the numeric 'binary' target remains
train <- train[, -9]
test  <- test[, -9]
# random forest via ranger, tuned with 2-fold cross-validation repeated twice
model <- train(as.factor(binary) ~ ., data = train, method = 'ranger',
               trControl = trainControl(method = 'repeatedcv', number = 2, repeats = 2))
summary(model)
##                           Length Class         Mode     
## predictions               538    factor        numeric  
## num.trees                   1    -none-        numeric  
## num.independent.variables   1    -none-        numeric  
## mtry                        1    -none-        numeric  
## min.node.size               1    -none-        numeric  
## prediction.error            1    -none-        numeric  
## forest                     10    ranger.forest list     
## confusion.matrix            4    table         numeric  
## splitrule                   1    -none-        character
## treetype                    1    -none-        character
## call                        9    -none-        call     
## importance.mode             1    -none-        character
## num.samples                 1    -none-        numeric  
## replace                     1    -none-        logical  
## xNames                      8    -none-        character
## problemType                 1    -none-        character
## tuneValue                   3    data.frame    list     
## obsLevels                   2    -none-        character
## param                       0    -none-        list
model
## Random Forest 
## 
## 538 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 2 times) 
## Summary of sample sizes: 269, 269, 269, 269 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##   2     gini        0.7685874  0.4843646
##   2     extratrees  0.7639405  0.4679269
##   5     gini        0.7704461  0.4925502
##   5     extratrees  0.7713755  0.4925280
##   8     gini        0.7685874  0.4885122
##   8     extratrees  0.7723048  0.4930952
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 8, splitrule =
##  extratrees and min.node.size = 1.
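
To see which predictors drive the forest, ranger can record impurity-based importance at training time, which caret's varImp() then reports. A minimal sketch (the importance argument is passed through to ranger; model_imp is an illustrative name, output not shown):

model_imp <- train(as.factor(binary) ~ ., data = train, method = 'ranger',
                   importance = 'impurity',   # ask ranger to record impurity importance
                   trControl = trainControl(method = 'repeatedcv', number = 2, repeats = 2))
varImp(model_imp)                             # ranked predictor importance
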
pred_train <- predict(model, train)
pred_test  <- predict(model, test)
pred_train
##   [1] 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 1
##  [36] 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0
##  [71] 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0
## [106] 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1
## [141] 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1
## [176] 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 1
## [211] 0 1 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0
## [246] 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1
## [281] 1 1 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0
## [316] 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0
## [351] 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0
## [386] 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0
## [421] 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 1
## [456] 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0
## [491] 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0
## [526] 1 1 0 1 0 0 1 0 0 0 0 1 0
## Levels: 0 1
confusionMatrix(pred_train, as.factor(train$binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 342   0
##          1   0 196
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9932, 1)
##     No Information Rate : 0.6357     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6357     
##          Detection Rate : 0.6357     
##    Detection Prevalence : 0.6357     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 
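
The perfect accuracy above is measured on the same rows the forest was trained on, so it mainly reflects overfitting to the training data rather than generalization; the repeated cross-validation estimate already stored in the caret object is a better guide. A sketch to extract it (output not shown):

getTrainPerf(model)   # resampled accuracy/kappa of the selected model
model$results         # full grid of tuning results
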
confusionMatrix(pred_test, as.factor(test$binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 126  26
##          1  32  46
##                                           
##                Accuracy : 0.7478          
##                  95% CI : (0.6865, 0.8026)
##     No Information Rate : 0.687           
##     P-Value [Acc > NIR] : 0.02578         
##                                           
##                   Kappa : 0.4267          
##                                           
##  Mcnemar's Test P-Value : 0.51148         
##                                           
##             Sensitivity : 0.7975          
##             Specificity : 0.6389          
##          Pos Pred Value : 0.8289          
##          Neg Pred Value : 0.5897          
##              Prevalence : 0.6870          
##          Detection Rate : 0.5478          
##    Detection Prevalence : 0.6609          
##       Balanced Accuracy : 0.7182          
##                                           
##        'Positive' Class : 0               
##
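
Note that caret takes the first factor level ('0', i.e. the non-diabetic class) as the positive class, so the sensitivity and specificity above describe detection of negatives. If the diabetic class is the one of interest, the positive level can be set explicitly; a sketch (output not shown):

confusionMatrix(pred_test, as.factor(test$binary), positive = '1')   # treat the diabetic class as positive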