# Radhe Radhe
library(randomForest);library(dplyr);library(caTools)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Import the dataset and recode the `class` label as numeric 0 / 1
# (tested_negative -> 0, tested_positive -> 1).

dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Explicit label -> code lookup. Recoding through a lookup (instead of string
# replacement followed by as.numeric()) lets us fail loudly on a label we do
# not know, rather than silently turning it into NA with a coercion warning.
class_codes <- c(tested_negative = 0, tested_positive = 1)
class_labels <- as.character(dataset$class)

# Missing labels are allowed through and stay NA, as before; anything else
# that is not one of the two known labels is an error.
unknown_labels <- setdiff(unique(class_labels), c(names(class_codes), NA))
if (length(unknown_labels) > 0) {
  stop(
    "Unexpected value(s) in column 'class': ",
    paste(unknown_labels, collapse = ", "),
    call. = FALSE
  )
}

# unname() drops the label names carried over from the lookup vector so the
# column is a plain numeric vector.
dataset$class <- unname(class_codes[class_labels])

# Splitting the dataset into the Training set and Test set
#install.packages('caTools')

# Fix the RNG seed so the train/test partition is reproducible between runs.
set.seed(789)

# Logical mask over the rows of `dataset`: TRUE marks a training row
# (SplitRatio = 0.76 of the data), FALSE marks a test row.
split <- sample.split(dataset$class, SplitRatio = 0.76)

# which() keeps only the positions where the condition is TRUE, matching what
# subset() does with its condition.
tran_set <- dataset[which(split == TRUE), , drop = FALSE]
test_set <- dataset[which(split == FALSE), , drop = FALSE]

# Fitting the random forest (as a classifier)
#
# randomForest() picks regression or classification from the type of the
# response. With the numeric 0/1 `class` column it fits a regression forest and
# warns "The response has five or fewer unique values. Are you sure you want
# to do regression?". Converting the response to a factor on a copy of the
# training data makes it fit a classification forest; `tran_set` itself is
# left unchanged.
train_cls <- tran_set
train_cls$class <- factor(train_cls$class, levels = c(0, 1))
rf_pima <- randomForest(
  class ~ .,
  data = train_cls,
  mtry = 8,
  ntree = 171,
  importance = TRUE
)

# Testing the model
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin

# type = "prob" returns one column of class probabilities per factor level;
# keep the column for class "1" so `rf_probs` is a numeric vector in [0, 1].
rf_probs <- predict(rf_pima, newdata = test_set, type = "prob")[, "1"]
rf_pred <- ifelse(rf_probs > 0.5, 1, 0)

# Give both factors the same explicit levels so the call does not depend on
# both classes actually being predicted, and report sensitivity/specificity
# for class "1" (tested_positive) instead of the default first level "0".
# NOTE: any pasted console output that follows was produced before this change
# (regression fit, positive class 0); re-run the script to refresh it.
confusionMatrix(
  factor(rf_pred, levels = c(0, 1)),
  factor(test_set$class, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 99 23
##          1 21 41
##                                           
##                Accuracy : 0.7609          
##                  95% CI : (0.6926, 0.8206)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.0009686       
##                                           
##                   Kappa : 0.469           
##                                           
##  Mcnemar's Test P-Value : 0.8801685       
##                                           
##             Sensitivity : 0.8250          
##             Specificity : 0.6406          
##          Pos Pred Value : 0.8115          
##          Neg Pred Value : 0.6613          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5380          
##    Detection Prevalence : 0.6630          
##       Balanced Accuracy : 0.7328          
##                                           
##        'Positive' Class : 0               
## 
# Overall test-set accuracy, kept as the named "Accuracy" element of the
# confusion-matrix summary. Both factors are built with the same explicit 0/1
# levels so the call does not break (or silently re-level) when the model
# happens to predict only one of the two classes.
ACC_RandomForest <- confusionMatrix(
  factor(rf_pred, levels = c(0, 1)),
  factor(test_set$class, levels = c(0, 1))
)$overall["Accuracy"]

# Random forest graphs
# Draw the two diagnostic plots side by side. par() returns the previous
# settings, which are restored afterwards so that later plots are not forced
# into the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))

# type = 2 selects the node-impurity-based importance measure.
varImpPlot(rf_pima, type = 2, main = "Variable Importance", col = "black")

# Error of the forest as a function of the number of trees grown.
plot(rf_pima, main = "Error vs no. of trees grown")

par(old_par)