#rm(list=ls())
library(randomForest)
library(mlbench)   # provides the PimaIndiansDiabetes data set
library(RCurl)
library(caret)     # data partitioning, model training, confusionMatrix
library(rpart)
library(pROC)      # ROC curve and AUC on the test set
# Note: method = "ranger" below also requires the ranger package to be installed,
# and caTools is called later via caTools::colAUC().
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
str(df)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
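# The outcome is imbalanced (roughly 65% neg vs. 35% pos, consistent with the
# No Information Rate reported later). A quick sketch to check the class
# distribution (output not shown here):
prop.table(table(df$diabetes))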
head(df)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
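# Several predictors (pressure, triceps, insulin, mass, and possibly glucose)
# use 0 as a placeholder for a missing measurement, as visible in the rows
# above. A sketch to count those zeros before modelling (output not shown):
colSums(df[, c("glucose", "pressure", "triceps", "insulin", "mass")] == 0)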
# store row indices for a stratified 70/30 split (createDataPartition preserves the class proportions)
partition <- caret::createDataPartition(y = df$diabetes, times = 1, p = 0.7, list = FALSE)
# create training data set
train_set <- df[partition,]
# create the test set from the remaining 30% by dropping the partition rows
test_set <- df[-partition,]
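# The partition is random, so calling set.seed() before createDataPartition
# makes the split reproducible. A sketch to confirm the ~70/30 split and that
# the class balance is preserved in both sets (output not shown):
nrow(train_set); nrow(test_set)
prop.table(table(train_set$diabetes))
prop.table(table(test_set$diabetes))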
str(train_set)
## 'data.frame': 538 obs. of 9 variables:
## $ pregnant: num 1 0 5 3 10 2 10 10 1 5 ...
## $ glucose : num 89 137 116 78 115 197 168 139 189 166 ...
## $ pressure: num 66 40 74 50 0 70 74 80 60 72 ...
## $ triceps : num 23 35 0 32 0 45 0 0 23 19 ...
## $ insulin : num 94 168 0 88 0 543 0 0 846 175 ...
## $ mass : num 28.1 43.1 25.6 31 35.3 30.5 38 27.1 30.1 25.8 ...
## $ pedigree: num 0.167 2.288 0.201 0.248 0.134 ...
## $ age : num 21 33 30 26 29 53 34 57 59 51 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 1 2 1 2 1 2 2 1 2 2 ...
model_forest <- caret::train(diabetes ~ ., data = train_set,
                             method = "ranger",
                             metric = "ROC",
                             trControl = trainControl(method = "cv", number = 10,
                                                      classProbs = TRUE,
                                                      summaryFunction = twoClassSummary),
                             preProcess = c("center", "scale", "pca"))
model_forest
## Random Forest
##
## 538 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## Pre-processing: centered (8), scaled (8), principal component signal
## extraction (8)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 485, 484, 484, 484, 485, 484, ...
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.8014871 0.8285714 0.5479532
## 2 extratrees 0.8091228 0.8685714 0.5368421
## 5 gini 0.7962824 0.8085714 0.5690058
## 5 extratrees 0.8020134 0.8428571 0.5745614
## 8 gini 0.7408521 0.8000000 0.5087719
## 8 extratrees 0.7481203 0.8095238 0.5263158
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 1.
# cross-validated ROC for the sixth tuning combination (mtry = 8, splitrule = extratrees)
model_forest$results[6, 4]
## [1] 0.7481203
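# Instead of indexing the results table by position, the best-performing row
# can be pulled programmatically from the same fitted object; a sketch
# (output not shown):
best_row <- model_forest$results[which.max(model_forest$results$ROC), ]
best_row[, c("mtry", "splitrule", "min.node.size", "ROC", "Sens", "Spec")]
model_forest$bestTune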
# prediction on Test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix
cm_rf <- confusionMatrix(pred_rf, test_set$diabetes, positive="pos")
# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")
# ROC value
roc_rf <- roc(test_set$diabetes, pred_prob_rf$pos)
# Confusion Matrix for Random Forest Model
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 137 41
## pos 13 39
##
## Accuracy : 0.7652
## 95% CI : (0.705, 0.8184)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.0001394
##
## Kappa : 0.4365
##
## Mcnemar's Test P-Value : 0.0002386
##
## Sensitivity : 0.4875
## Specificity : 0.9133
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.7697
## Prevalence : 0.3478
## Detection Rate : 0.1696
## Detection Prevalence : 0.2261
## Balanced Accuracy : 0.7004
##
## 'Positive' Class : pos
##
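# Sensitivity is low (0.49) at the default 0.5 probability cutoff. A sketch of
# re-classifying the test set with a lower, purely illustrative cutoff of 0.35
# (an assumed value, not tuned) using the probabilities computed above:
pred_rf_035 <- factor(ifelse(pred_prob_rf$pos > 0.35, "pos", "neg"),
                      levels = levels(test_set$diabetes))
confusionMatrix(pred_rf_035, test_set$diabetes, positive = "pos")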
# ROC Value for Random Forest
roc_rf
##
## Call:
## roc.default(response = test_set$diabetes, predictor = pred_prob_rf$pos)
##
## Data: pred_prob_rf$pos in 150 controls (test_set$diabetes neg) < 80 cases (test_set$diabetes pos).
## Area under the curve: 0.8373
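# The pROC object can also be plotted and queried for the threshold that
# maximises sensitivity + specificity (Youden's J); a sketch (plot not shown):
plot(roc_rf, print.auc = TRUE, legacy.axes = TRUE)
coords(roc_rf, "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))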
# AUC - Area under the curve
caTools::colAUC(pred_prob_rf$pos, test_set$diabetes, plotROC = T)

## [,1]
## neg vs. pos 0.8373333
# Collect the test-set metrics for the random forest model; note that the last
# (unlabelled) value in the printed vector is the AUC from roc_rf
result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'],
               cm_rf$byClass['Precision'], cm_rf$byClass['Recall'],
               cm_rf$byClass['F1'], roc_rf$auc)
result_rf
## Sensitivity Specificity Precision Recall F1
## 0.4875000 0.9133333 0.7500000 0.4875000 0.5909091 0.8373333
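# A sketch that stores the same test-set metrics as a one-row data frame,
# which makes side-by-side comparison with other models straightforward
# (the Model label is illustrative):
metrics_rf <- data.frame(Model       = "Random Forest (ranger)",
                         Sensitivity = unname(cm_rf$byClass["Sensitivity"]),
                         Specificity = unname(cm_rf$byClass["Specificity"]),
                         Precision   = unname(cm_rf$byClass["Precision"]),
                         F1          = unname(cm_rf$byClass["F1"]),
                         AUC         = as.numeric(roc_rf$auc))
metrics_rf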