Random Forest Project with Pima Indian Data J-Redden

library(caret)
library(mlbench)
library(randomForest)

# Load the PimaIndiansDiabetes data frame into the workspace.
data(PimaIndiansDiabetes)
# Fix the RNG seed so the random steps that follow are repeatable.
set.seed(8675309)
# Preview the first five observations.
head(PimaIndiansDiabetes, 5L)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos

Create an 80% / 20% split into training and validation datasets

# Split the rows 80/20 using the diabetes outcome column.
# Despite its name, validationIndex holds the row numbers that go to the
# *training* set: it is used as-is for `training` and negated for `validation`.
validationIndex <- createDataPartition(
  y = PimaIndiansDiabetes$diabetes,
  p = 0.8,
  list = FALSE
)
training <- PimaIndiansDiabetes[validationIndex, ]
validation <- PimaIndiansDiabetes[-validationIndex, ]

Train a model and summarize it

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
# The variable deliberately keeps its original name even though it shadows
# caret's trainControl() function; the call is namespaced to make that explicit.
trainControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Tune a random forest on the training rows, selecting by Accuracy.
# ntree is not a caret argument; it is passed through to randomForest().
fit.rf <- train(
  diabetes ~ .,
  data = training,
  method = "rf",
  metric = "Accuracy",
  trControl = trainControl,
  ntree = 2000
)
print(fit.rf)
## Random Forest 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 554, 554, 554, 553, 553, 553, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7609642  0.4610870
##   5     0.7674511  0.4845804
##   8     0.7658822  0.4797141
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# Show the underlying randomForest object that caret refit with the chosen mtry.
print(fit.rf[["finalModel"]])
## 
## Call:
##  randomForest(x = x, y = y, ntree = 2000, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 23.9%
## Confusion matrix:
##     neg pos class.error
## neg 331  69   0.1725000
## pos  78 137   0.3627907

Now test the final model against the validation set

# Score the held-out validation rows with the tuned model.
# Predict through the caret `train` object rather than reaching into
# fit.rf$finalModel with hard-coded column positions (1:8): predict() on the
# train object applies the same formula / pre-processing handling that was used
# during training and selects predictors by name, so it keeps working if the
# column order changes or pre-processing is added later.
finalPredictions <- predict(fit.rf, newdata = validation)
# Cross-tabulate the predicted labels against the true labels.
# NOTE(review): the recorded output reports 'Positive' Class : neg, so the
# Sensitivity figure is the recall for "neg". Pass positive = "pos" if the
# detection rate for diabetes is the quantity of interest.
confusionMatrix(data = finalPredictions, reference = validation$diabetes)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  81  21
##        pos  19  32
##                                           
##                Accuracy : 0.7386          
##                  95% CI : (0.6615, 0.8062)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.01536         
##                                           
##                   Kappa : 0.4175          
##                                           
##  Mcnemar's Test P-Value : 0.87437         
##                                           
##             Sensitivity : 0.8100          
##             Specificity : 0.6038          
##          Pos Pred Value : 0.7941          
##          Neg Pred Value : 0.6275          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5294          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.7069          
##                                           
##        'Positive' Class : neg             
## 
# Plot the caret tuning object (for a `train` fit this should be the resampled
# Accuracy against the candidate mtry values; see ?plot.train).
plot(x = fit.rf)

Standalone model

Now that I have discovered the optimal settings, I am going to create a standalone random forest.

# Train using ALL the training data, outside of caret.
# Re-seed so this forest is reproducible on its own.
set.seed(8675309)
# Use the mtry that the caret tuning run actually selected instead of a
# hard-coded value: the previous literal mtry = 2 did not match the tuning
# result reported for fit.rf (mtry = 5).
# NOTE: any results recorded after this point in the document were produced
# with mtry = 2; re-run the script to refresh them.
standAloneModel <- randomForest(
  diabetes ~ .,
  data = training,
  mtry = fit.rf$bestTune$mtry,
  ntree = 2000
)
print(standAloneModel)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = training, mtry = 2,      ntree = 2000) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 23.41%
## Confusion matrix:
##     neg pos class.error
## neg 341  59   0.1475000
## pos  85 130   0.3953488
# Get predictions for the held-back validation data and check the predictions
# in a confusion matrix.
# The whole validation data frame is passed as newdata: the model was fitted
# with a formula, so predict() picks the predictor columns by name. This avoids
# the hard-coded column positions (1:8), which silently break if the column
# order of the data ever changes.
predictions <- predict(standAloneModel, newdata = validation)
confusionMatrix(data = predictions, reference = validation$diabetes)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  84  22
##        pos  16  31
##                                           
##                Accuracy : 0.7516          
##                  95% CI : (0.6754, 0.8179)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.005891        
##                                           
##                   Kappa : 0.4365          
##                                           
##  Mcnemar's Test P-Value : 0.417304        
##                                           
##             Sensitivity : 0.8400          
##             Specificity : 0.5849          
##          Pos Pred Value : 0.7925          
##          Neg Pred Value : 0.6596          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5490          
##    Detection Prevalence : 0.6928          
##       Balanced Accuracy : 0.7125          
##                                           
##        'Positive' Class : neg             
##