## Load libraries
library(stats)
library(dplyr)
library(randomForest)
library(caret)
library(mlbench)
## Inspect data
# Read the diabetes data set from the working directory.
# stringsAsFactors = TRUE keeps the `diabetes` column a factor on R >= 4.0
# (where the read.csv default became FALSE), matching the str() output
# recorded below and the classification code that follows.
diab <- read.csv("diabetes.csv", stringsAsFactors = TRUE)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(diab)
}
# Print the first rows and the column types.
head(diab)
str(diab)
## pregnant glucose pressure triceps insulin mass
##1 6 148 72 35 0 33.6
##2 1 85 66 29 0 26.6
##3 8 183 64 0 0 23.3
##4 1 89 66 23 94 28.1
##5 0 137 40 35 168 43.1
##6 5 116 74 0 0 25.6
## pedigree age diabetes
##1 0.627 50 pos
##2 0.351 31 neg
##3 0.672 32 pos
##4 0.167 21 neg
##5 2.288 33 pos
##6 0.201 30 neg
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288
## $ age : num 50 31 32 21 33 30 26 29 53 54
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
## Split dataset and create random forest model
# Fix the RNG seed so the random split and the fitted forest are reproducible.
# NOTE: the outputs recorded in the comments below came from an unseeded run,
# so the exact numbers will differ from a seeded run.
set.seed(123)
# Randomly label every row 1 (probability 0.7) or 2 (probability 0.3).
index <- sample(2, nrow(diab), replace = TRUE, prob = c(0.7, 0.3))
# Training data
Train <- diab[index == 1, ]
# Testing data
Test <- diab[index == 2, ]
# Random Forest Model: predict `diabetes` from all other columns.
rfm <- randomForest(diabetes ~ ., data = Train)
print(rfm)
## Call:
## randomForest(formula = diabetes ~ ., data = Train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 22.39%
## Confusion matrix:
## neg pos class.error
## neg 301 51 0.1448864
## pos 71 122 0.3678756
# Predict the classes of the training rows with the fitted forest and compare
# them with the true labels. These are the same rows the forest was fit on,
# so this measures fit rather than generalisation.
p1 <- predict(rfm, newdata = Train)
confusionMatrix(data = p1, reference = Train[["diabetes"]])
## Reference
## Prediction neg pos
## neg 352 0
## pos 0 193
## Accuracy : 1
## 95% CI : (0.9933, 1)
## No Information Rate : 0.6459
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 1
## Mcnemar's Test P-Value : NA
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6459
## Detection Rate : 0.6459
## Detection Prevalence : 0.6459
## Balanced Accuracy : 1.0000
## 'Positive' Class : neg
## Tuning
# Resampling scheme: 10-fold cross-validation, repeated 6 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 6)
# Candidate values of mtry to evaluate.
grid <- expand.grid(mtry = c(1, 2, 3, 4, 5, 6))
# Tune the random forest over the mtry grid. The argument must be spelled
# `trControl`; the previous misspelling (`trConrtol`) was silently ignored,
# so train() fell back to its default bootstrap resampling.
model.random.forest <- train(
  diabetes ~ .,
  data = Train,
  method = "rf",
  tuneGrid = grid,
  trControl = control
)
model.random.forest
## 555 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 555, 555, 555, 555, 555, 555, ...
## Resampling results across tuning parameters:
## mtry Accuracy Kappa
## 1 0.7438669 0.3761886
## 2 0.7487793 0.4046578
## 3 0.7478976 0.4058921
## 4 0.7431232 0.3974846
## 5 0.7415836 0.3948964
## 6 0.7415917 0.3949823
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predict the held-out test rows and compare them with the true labels.
# NOTE(review): this scores `rfm` (the forest fit before tuning), not
# `model.random.forest` -- confirm that is intended.
p2 <- predict(rfm, newdata = Test)
confusionMatrix(data = p2, reference = Test[["diabetes"]])
## Confusion Matrix and Statistics
## Reference
## Prediction neg pos
## neg 122 25
## pos 18 46
## Accuracy : 0.7962
## 95% CI : (0.7355, 0.8484)
## No Information Rate : 0.6635
## P-Value [Acc > NIR] : 1.536e-05
## Kappa : 0.5322
## Mcnemar's Test P-Value : 0.3602
## P-Value [Acc > NIR] : 0.008665
## Kappa : 0.4174
## Mcnemar's Test P-Value : 1.000000
## Sensitivity : 0.8714
## Specificity : 0.6479
## Pos Pred Value : 0.8299
## Neg Pred Value : 0.7188
## Prevalence : 0.6635
## Detection Rate : 0.5782
## Detection Prevalence : 0.6967
## Balanced Accuracy : 0.7597
## 'Positive' Class : neg
## Conclusions: 1) The random forest yielded a zero error rate on the training data. 2) Tuning was set up with 10-fold cross-validation repeated 6 times. 3) Six mtry values (1 to 6) were tried. 4) mtry = 2 gave the highest accuracy. 5) Accuracy was 79.6% on the testing dataset.
```