library(caret)
library(mlbench)
library(randomForest)
# Load the Pima Indians diabetes data set (shipped with mlbench) into the
# workspace.
data(PimaIndiansDiabetes)

# Fix the RNG seed so the partition and resampling below are repeatable.
set.seed(8675309)

# Preview the first five rows: eight numeric predictors plus the `diabetes`
# outcome factor.
head(PimaIndiansDiabetes, 5L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
# Draw 80% of the rows, balanced on the diabetes outcome. Despite its name,
# `validationIndex` holds the row numbers of the TRAINING partition; the
# remaining 20% become the held-back validation set.
validationIndex <- createDataPartition(
  y = PimaIndiansDiabetes[["diabetes"]],
  p = 0.8,
  list = FALSE
)
training <- PimaIndiansDiabetes[validationIndex, ]
validation <- PimaIndiansDiabetes[-validationIndex, ]

# Resampling scheme: 10-fold cross-validation repeated 3 times.
# NOTE(review): this variable shares its name with caret's trainControl()
# function. The call still resolves because R skips non-function bindings
# when looking up a function, but a distinct name would be clearer.
trainControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Fit a random forest through caret; candidate mtry values are compared on
# resampled Accuracy, and `ntree` is forwarded to randomForest().
fit.rf <- train(
  diabetes ~ .,
  data = training,
  method = "rf",
  metric = "Accuracy",
  trControl = trainControl,
  ntree = 2000
)

# Show the resampling results for each candidate mtry.
print(fit.rf)
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 554, 554, 554, 553, 553, 553, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7609642 0.4610870
## 5 0.7674511 0.4845804
## 8 0.7658822 0.4797141
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# Show the randomForest object caret refit with the selected mtry, including
# its out-of-bag error estimate and confusion matrix.
print(fit.rf[["finalModel"]])
##
## Call:
## randomForest(x = x, y = y, ntree = 2000, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 23.9%
## Confusion matrix:
## neg pos class.error
## neg 331 69 0.1725000
## pos 78 137 0.3627907
# Score the held-back validation rows with the tuned caret model.
# Predict through the `train` object rather than `fit.rf$finalModel`:
# predict.train() applies the same formula handling (and any pre-processing)
# used during training, and it selects the predictor columns by name, so no
# hard-coded column positions are needed.
finalPredictions <- predict(fit.rf, newdata = validation)

# Compare predictions against the true labels.
# NOTE(review): confusionMatrix() treats the first factor level ("neg") as the
# positive class; pass positive = "pos" if sensitivity/specificity should be
# reported for the diabetic class.
confusionMatrix(data = finalPredictions, reference = validation$diabetes)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 81 21
## pos 19 32
##
## Accuracy : 0.7386
## 95% CI : (0.6615, 0.8062)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.01536
##
## Kappa : 0.4175
##
## Mcnemar's Test P-Value : 0.87437
##
## Sensitivity : 0.8100
## Specificity : 0.6038
## Pos Pred Value : 0.7941
## Neg Pred Value : 0.6275
## Prevalence : 0.6536
## Detection Rate : 0.5294
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.7069
##
## 'Positive' Class : neg
##
# Plot the resampled performance profile of the caret fit across the tuning
# grid.
plot(x = fit.rf)
Now that I have discovered the optimal settings, I am going to create a standalone random forest.
# Train a standalone random forest on ALL of the training data (no
# resampling).
# Re-seed first so the forest itself is reproducible.
set.seed(8675309)

# NOTE(review): the caret tuning run above reported mtry = 5 as the best
# value, yet this fit hard-codes mtry = 2. Confirm which is intended; the
# tuned value is available as fit.rf$bestTune$mtry.
standAloneModel <- randomForest(
  diabetes ~ .,
  data = training,
  mtry = 2,
  ntree = 2000
)

# Show the out-of-bag error estimate and confusion matrix.
print(standAloneModel)
##
## Call:
## randomForest(formula = diabetes ~ ., data = training, mtry = 2, ntree = 2000)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.41%
## Confusion matrix:
## neg pos class.error
## neg 341 59 0.1475000
## pos 85 130 0.3953488
# Get predictions for the held-back validation data and check them in a
# confusion matrix.
# Only the predictor columns are passed in: everything except the `diabetes`
# outcome.
predictions <- predict(
  standAloneModel,
  newdata = validation[, names(validation) != "diabetes"]
)

# NOTE(review): "neg" (the first factor level) is reported as the positive
# class; pass positive = "pos" if the diabetic class should be the reference
# for sensitivity/specificity.
confusionMatrix(data = predictions, reference = validation$diabetes)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 84 22
## pos 16 31
##
## Accuracy : 0.7516
## 95% CI : (0.6754, 0.8179)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.005891
##
## Kappa : 0.4365
##
## Mcnemar's Test P-Value : 0.417304
##
## Sensitivity : 0.8400
## Specificity : 0.5849
## Pos Pred Value : 0.7925
## Neg Pred Value : 0.6596
## Prevalence : 0.6536
## Detection Rate : 0.5490
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.7125
##
## 'Positive' Class : neg
##