---
title: "Machine Learning: Predicting Diabetes using Pima Indians Data Set"
author: "Mano R"
date: "September 25, 2018"
output: html_document
---
library(neuralnet)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::compute() masks neuralnet::compute()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(mlbench)
library(e1071)
library(ggplot2)
data("PimaIndiansDiabetes")
df <- PimaIndiansDiabetes
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
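The structure also shows zero values in columns such as pressure, triceps, insulin and mass, where a zero is not physiologically plausible; in this dataset zeros are commonly read as missing values (mlbench ships a cleaned variant, PimaIndiansDiabetes2, with them recoded to NA). A quick count of how many entries are affected, as a minimal check that is not part of the modelling below:

# count zero entries per column; the assumption is that these zeros stand in for missing data
sapply(df[, c('glucose', 'pressure', 'triceps', 'insulin', 'mass')],
       function(x) sum(x == 0))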
ggplot(df, aes(diabetes, fill = diabetes)) + geom_bar()
df$binary <- ifelse(df$diabetes == 'neg', 0, 1)
str(df)
## 'data.frame': 768 obs. of 10 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
## $ binary : num 1 0 1 0 1 0 1 0 1 1 ...
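The bar chart shows the class imbalance (roughly twice as many negative as positive cases), and the new binary column should line up with the original factor. A quick cross-tabulation confirms the encoding:

# sanity check: 'neg' should map to 0 and 'pos' to 1
table(df$diabetes, df$binary)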
rows <- createDataPartition(df$binary, times = 1, p = 0.7, list = FALSE)
train <- df[rows, ]
test  <- df[-rows, ]
dim(train)
## [1] 538 10
dim(test)
## [1] 230 10
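createDataPartition draws the 70/30 split at random, so the exact rows that land in train and test change from run to run. If reproducibility matters, a seed can be fixed before the partition; a minimal sketch (the seed value is arbitrary):

set.seed(2018)  # any fixed value works; chosen here only for illustration
rows <- createDataPartition(df$binary, times = 1, p = 0.7, list = FALSE)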
str(train)
## 'data.frame': 538 obs. of 10 variables:
## $ pregnant: num 6 1 0 5 8 10 10 5 7 7 ...
## $ glucose : num 148 89 137 116 125 168 139 166 100 107 ...
## $ pressure: num 72 66 40 74 96 74 80 72 0 74 ...
## $ triceps : num 35 23 35 0 0 0 0 19 0 0 ...
## $ insulin : num 0 94 168 0 0 0 0 175 0 0 ...
## $ mass : num 33.6 28.1 43.1 25.6 0 38 27.1 25.8 30 29.6 ...
## $ pedigree: num 0.627 0.167 2.288 0.201 0.232 ...
## $ age : num 50 21 33 30 54 34 57 51 32 31 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 2 1 2 2 2 ...
## $ binary : num 1 0 1 0 1 1 0 1 1 1 ...
# drop the original diabetes factor (column 9) and keep only the numeric target
train <- train[, -9]
test  <- test[, -9]
model <- train(as.factor(binary) ~ ., data = train, method = 'ranger',
               trControl = trainControl(method = 'repeatedcv', number = 2, repeats = 2))
summary(model)
## Length Class Mode
## predictions 538 factor numeric
## num.trees 1 -none- numeric
## num.independent.variables 1 -none- numeric
## mtry 1 -none- numeric
## min.node.size 1 -none- numeric
## prediction.error 1 -none- numeric
## forest 10 ranger.forest list
## confusion.matrix 4 table numeric
## splitrule 1 -none- character
## treetype 1 -none- character
## call 9 -none- call
## importance.mode 1 -none- character
## num.samples 1 -none- numeric
## replace 1 -none- logical
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 3 data.frame list
## obsLevels 2 -none- character
## param 0 -none- list
model
## Random Forest
##
## 538 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 2 times)
## Summary of sample sizes: 269, 269, 269, 269
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 2 gini 0.7685874 0.4843646
## 2 extratrees 0.7639405 0.4679269
## 5 gini 0.7704461 0.4925502
## 5 extratrees 0.7713755 0.4925280
## 8 gini 0.7685874 0.4885122
## 8 extratrees 0.7723048 0.4930952
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 8, splitrule =
## extratrees and min.node.size = 1.
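The 2-fold cross-validation repeated twice keeps training fast but gives fairly noisy accuracy estimates, and caret only searched its default grid of mtry/splitrule values. A sketch of a more conventional setup, with 10-fold CV repeated 3 times and an explicit tuning grid (the grid values below are illustrative, not taken from the original run):

# heavier resampling and a hand-picked ranger tuning grid
ctrl <- trainControl(method = 'repeatedcv', number = 10, repeats = 3)
grid <- expand.grid(mtry = c(2, 4, 8),
                    splitrule = c('gini', 'extratrees'),
                    min.node.size = c(1, 5, 10))
model_cv <- train(as.factor(binary) ~ ., data = train, method = 'ranger',
                  trControl = ctrl, tuneGrid = grid)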
pred_train <- predict(model, train)
pred_test  <- predict(model, test)
pred_train
## [1] 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 1
## [36] 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0
## [71] 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0
## [106] 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1
## [141] 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1
## [176] 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 1
## [211] 0 1 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0
## [246] 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1
## [281] 1 1 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0
## [316] 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0
## [351] 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0
## [386] 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0
## [421] 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 1
## [456] 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0
## [491] 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0
## [526] 1 1 0 1 0 0 1 0 0 0 0 1 0
## Levels: 0 1
confusionMatrix(pred_train, as.factor(train$binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 342 0
## 1 0 196
##
## Accuracy : 1
## 95% CI : (0.9932, 1)
## No Information Rate : 0.6357
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6357
## Detection Rate : 0.6357
## Detection Prevalence : 0.6357
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
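Perfect accuracy on the training data is expected rather than impressive: the final forest is scored on the same rows it was fit on, so only the test-set results below say anything about generalisation. For a threshold-free summary of test performance, class probabilities and an ROC curve can be added. A sketch, assuming the pROC package is installed; caret needs factor levels that are valid R names before it will return probabilities, so the target is relabelled and the model refit with classProbs = TRUE:

library(pROC)  # assumption: pROC is installed
train2 <- train
train2$binary <- factor(train2$binary, labels = c('neg', 'pos'))  # 0 -> neg, 1 -> pos
model2 <- train(binary ~ ., data = train2, method = 'ranger',
                trControl = trainControl(method = 'repeatedcv', number = 2,
                                         repeats = 2, classProbs = TRUE))
prob_pos <- predict(model2, test, type = 'prob')[, 'pos']  # P(diabetes) on held-out rows
auc(roc(test$binary, prob_pos))  # area under the test-set ROC curve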
confusionMatrix(pred_test, as.factor(test$binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 126 26
## 1 32 46
##
## Accuracy : 0.7478
## 95% CI : (0.6865, 0.8026)
## No Information Rate : 0.687
## P-Value [Acc > NIR] : 0.02578
##
## Kappa : 0.4267
##
## Mcnemar's Test P-Value : 0.51148
##
## Sensitivity : 0.7975
## Specificity : 0.6389
## Pos Pred Value : 0.8289
## Neg Pred Value : 0.5897
## Prevalence : 0.6870
## Detection Rate : 0.5478
## Detection Prevalence : 0.6609
## Balanced Accuracy : 0.7182
##
## 'Positive' Class : 0
##
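It can also help to see which predictors the forest relies on. ranger only records variable importance when asked, so the model would need to be refit with an importance mode passed through to ranger; a minimal sketch using impurity importance:

model_imp <- train(as.factor(binary) ~ ., data = train, method = 'ranger',
                   importance = 'impurity',  # forwarded to ranger::ranger()
                   trControl = trainControl(method = 'repeatedcv', number = 2, repeats = 2))
varImp(model_imp)  # ranked contribution of each predictor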