#rm(list=ls())
library(randomForest)
library(mlbench)    # provides the PimaIndiansDiabetes data set
library(RCurl)
library(caret)
library(rpart)
library(pROC)
# Note: caret's method = "ranger" used below also requires the ranger package to be installed.

data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
str(df)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
head(df)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg
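# Quick check (sketch, not part of the original run): class balance of the outcome,
# which createDataPartition() below uses for stratified sampling.
prop.table(table(df$diabetes))
# The split and the CV folds are random, so exact numbers will vary between runs
# unless a seed is set first (e.g. set.seed(123), an illustrative value).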
# store the row indices for a stratified 70/30 split on the outcome
partition <- caret::createDataPartition(y = df$diabetes, times = 1, p = 0.7, list = FALSE)

# create the training data set from the partitioned rows (70% of the data)
train_set <- df[partition, ]

# create the testing data set by dropping the partition rows (the remaining 30%)
test_set <- df[-partition, ]
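
# Sanity check (sketch): the stratified split should keep the neg/pos mix of
# train and test close to that of the full data.
round(rbind(full  = prop.table(table(df$diabetes)),
            train = prop.table(table(train_set$diabetes)),
            test  = prop.table(table(test_set$diabetes))), 3)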

str(train_set)
## 'data.frame':    538 obs. of  9 variables:
##  $ pregnant: num  1 0 5 3 10 2 10 10 1 5 ...
##  $ glucose : num  89 137 116 78 115 197 168 139 189 166 ...
##  $ pressure: num  66 40 74 50 0 70 74 80 60 72 ...
##  $ triceps : num  23 35 0 32 0 45 0 0 23 19 ...
##  $ insulin : num  94 168 0 88 0 543 0 0 846 175 ...
##  $ mass    : num  28.1 43.1 25.6 31 35.3 30.5 38 27.1 30.1 25.8 ...
##  $ pedigree: num  0.167 2.288 0.201 0.248 0.134 ...
##  $ age     : num  21 33 30 26 29 53 34 57 59 51 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 1 2 1 2 1 2 2 1 2 2 ...
# Fit a random forest with caret (method = "ranger"): 10-fold cross-validation,
# class probabilities + twoClassSummary so ROC can be used as the selection metric,
# and center/scale/PCA pre-processing of the predictors.
model_forest <- caret::train(diabetes ~ ., data = train_set,
                             method = "ranger",
                             metric = "ROC",
                             trControl = trainControl(method = "cv", number = 10,
                                                      classProbs = TRUE,
                                                      summaryFunction = twoClassSummary),
                             preProcess = c("center", "scale", "pca"))
model_forest
## Random Forest 
## 
## 538 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## Pre-processing: centered (8), scaled (8), principal component signal
##  extraction (8) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 485, 484, 484, 484, 485, 484, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec     
##   2     gini        0.8014871  0.8285714  0.5479532
##   2     extratrees  0.8091228  0.8685714  0.5368421
##   5     gini        0.7962824  0.8085714  0.5690058
##   5     extratrees  0.8020134  0.8428571  0.5745614
##   8     gini        0.7408521  0.8000000  0.5087719
##   8     extratrees  0.7481203  0.8095238  0.5263158
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 1.
# cross-validated ROC for row 6 of the tuning grid (mtry = 8, splitrule = extratrees)
model_forest$results[6, 4]
## [1] 0.7481203
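# A less brittle alternative (sketch): select by name instead of hard-coded
# row/column indices, e.g. the cross-validated ROC of the best tuning combination.
best_idx <- which.max(model_forest$results$ROC)
model_forest$results[best_idx, c("mtry", "splitrule", "min.node.size", "ROC")]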
# class predictions on the test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix 
cm_rf <- confusionMatrix(pred_rf, test_set$diabetes, positive="pos")

# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")

# ROC value
roc_rf <- roc(test_set$diabetes, pred_prob_rf$pos)

# Confusion Matrix for Random Forest Model
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 137  41
##        pos  13  39
##                                          
##                Accuracy : 0.7652         
##                  95% CI : (0.705, 0.8184)
##     No Information Rate : 0.6522         
##     P-Value [Acc > NIR] : 0.0001394      
##                                          
##                   Kappa : 0.4365         
##                                          
##  Mcnemar's Test P-Value : 0.0002386      
##                                          
##             Sensitivity : 0.4875         
##             Specificity : 0.9133         
##          Pos Pred Value : 0.7500         
##          Neg Pred Value : 0.7697         
##              Prevalence : 0.3478         
##          Detection Rate : 0.1696         
##    Detection Prevalence : 0.2261         
##       Balanced Accuracy : 0.7004         
##                                          
##        'Positive' Class : pos            
## 
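# Cross-check (sketch): recover sensitivity and specificity by hand from the
# confusion-matrix counts, with "pos" as the positive class.
tab <- cm_rf$table                      # rows = predictions, columns = reference
tab["pos", "pos"] / sum(tab[, "pos"])   # sensitivity = TP / (TP + FN)
tab["neg", "neg"] / sum(tab[, "neg"])   # specificity = TN / (TN + FP)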
# ROC Value for Random Forest
roc_rf
## 
## Call:
## roc.default(response = test_set$diabetes, predictor = pred_prob_rf$pos)
## 
## Data: pred_prob_rf$pos in 150 controls (test_set$diabetes neg) < 80 cases (test_set$diabetes pos).
## Area under the curve: 0.8373
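# Optional visual (sketch): plot the test-set ROC curve with pROC; the printed
# AUC should match the value above.
plot(roc_rf, print.auc = TRUE, legacy.axes = TRUE)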
# AUC (area under the ROC curve) via caTools, which also plots the ROC curve
caTools::colAUC(pred_prob_rf$pos, test_set$diabetes, plotROC = TRUE)

##                  [,1]
## neg vs. pos 0.8373333
# Test-set summary: sensitivity, specificity, precision, recall, F1 and AUC (in that order)

result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'],
               cm_rf$byClass['Precision'], cm_rf$byClass['Recall'],
               cm_rf$byClass['F1'], roc_rf$auc)

result_rf
## Sensitivity Specificity   Precision      Recall          F1             
##   0.4875000   0.9133333   0.7500000   0.4875000   0.5909091   0.8373333
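# Tidier alternative (sketch): select the metrics by name and label the AUC
# explicitly, so the last element is not printed with a blank name.
# result_rf_named is an illustrative name, not part of the original code.
result_rf_named <- c(cm_rf$byClass[c("Sensitivity", "Specificity", "Precision", "Recall", "F1")],
                     AUC = as.numeric(roc_rf$auc))
round(result_rf_named, 4)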