## Load libraries
library(stats)
library(dplyr)
library(randomForest)
library(caret)
library(mlbench)
## Inspect data
diab <- read.csv("diabetes.csv")
View(diab)
head(diab)
str(diab)
## pregnant glucose pressure triceps insulin mass
##1 6 148 72 35 0 33.6
##2 1 85 66 29 0 26.6
##3 8 183 64 0 0 23.3
##4 1 89 66 23 94 28.1
##5 0 137 40 35 168 43.1
##6 5 116 74 0 0 25.6
## pedigree age diabetes
##1 0.627 50 pos
##2 0.351 31 neg
##3 0.672 32 pos
##4 0.167 21 neg
##5 2.288 33 pos
##6 0.201 30 neg
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
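As an extra inspection step, summary() shows the range of every variable and makes the zero values visible above (e.g. a blood pressure, triceps or mass of 0, which are commonly placeholders for missing measurements in this dataset) easy to spot:
summary(diab)   # per-variable minimum, quartiles, mean and maximum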
## Split dataset and create random forest model
index = sample(2,nrow(diab),replace = TRUE,prob=c(0.7,0.3))
# Training data
Train = diab[index==1,]
# Testing data
Test = diab[index==2,]
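The sample()-based split above is unstratified and, with no seed set, not reproducible. A hedged alternative using caret's createDataPartition() (already loaded) would preserve the neg/pos proportions; the seed and object names below are illustrative, and the rest of the walkthrough keeps the original Train/Test objects:
set.seed(123)                                                    # illustrative seed for reproducibility
idx <- createDataPartition(diab$diabetes, p = 0.7, list = FALSE) # stratified 70/30 split
Train_strat <- diab[idx, ]                                       # class proportions preserved
Test_strat  <- diab[-idx, ]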
# Random Forest Model
rfm = randomForest(diabetes~., data = Train)
print(rfm)
## Call:
## randomForest(formula = diabetes ~ ., data = Train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 22.39%
## Confusion matrix:
## neg pos class.error
## neg 301 51 0.1448864
## pos 71 122 0.3678756
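The fitted forest can also be inspected before looking at predictions: plot() on a randomForest object draws the OOB error against the number of trees, and importance()/varImpPlot() rank the predictors (the same checks applied to the iris model later on):
plot(rfm)         # OOB and per-class error versus number of trees
importance(rfm)   # mean decrease in Gini for each predictor
varImpPlot(rfm)   # dot chart of the same importance scores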
p1 <- predict(rfm,Train)
confusionMatrix(p1, Train$diabetes)
## Reference
## Prediction neg pos
## neg 352 0
## pos 0 193
## Accuracy : 1
## 95% CI : (0.9933, 1)
## No Information Rate : 0.6459
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 1
## Mcnemar's Test P-Value : NA
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6459
## Detection Rate : 0.6459
## Detection Prevalence : 0.6459
## Balanced Accuracy : 1.0000
## 'Positive' Class : neg
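The perfect score above is expected when predicting back on the rows the forest was trained on. Calling predict() on a randomForest object with no newdata returns the out-of-bag predictions instead, which should line up with the 22.39% OOB error reported earlier (p1_oob is an illustrative name):
p1_oob <- predict(rfm)                   # OOB predictions for the training rows
confusionMatrix(p1_oob, Train$diabetes)  # a more honest training-set estimate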
## Tuning
control <- trainControl(method ="repeatedcv", number = 10, repeats = 6)
grid <- expand.grid(mtry =c(1,2,3,4,5,6))
model.random.forest <- train(diabetes~., data=Train, method="rf", tuneGrid = grid, trControl = control)
model.random.forest
## 555 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 555, 555, 555, 555, 555, 555, ...
## Resampling results across tuning parameters:
## mtry Accuracy Kappa
## 1 0.7438669 0.3761886
## 2 0.7487793 0.4046578
## 3 0.7478976 0.4058921
## 4 0.7431232 0.3974846
## 5 0.7415836 0.3948964
## 6 0.7415917 0.3949823
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
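Cross-validation selected mtry = 2. The tuned caret model can be applied to the test set directly with predict() on the train object; a short sketch (the lines that follow evaluate the original rfm on Test instead):
p_tuned <- predict(model.random.forest, Test)  # uses the final mtry = 2 forest
confusionMatrix(p_tuned, Test$diabetes)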
p2 <- predict(rfm,Test)
confusionMatrix(p2,Test$diabetes)
## Confusion Matrix and Statistics
## Reference
## Prediction neg pos
## neg 122 25
## pos 18 46
## Accuracy : 0.7962
## 95% CI : (0.7355, 0.8484)
## No Information Rate : 0.6635
## P-Value [Acc > NIR] : 1.536e-05
## Kappa : 0.5322
## Mcnemar's Test P-Value : 0.3602
## Sensitivity : 0.8714
## Specificity : 0.6479
## Pos Pred Value : 0.8299
## Neg Pred Value : 0.7188
## Prevalence : 0.6635
## Detection Rate : 0.5782
## Detection Prevalence : 0.6967
## Balanced Accuracy : 0.7597
## 'Positive' Class : neg
## Conclusions
1) The random forest predicted the training data with zero error (100% accuracy), while the out-of-bag error estimate was 22.39%.
2) The model was tuned with 10-fold cross-validation repeated 6 times.
3) Six mtry values (1 to 6) were compared.
4) mtry = 2 gave the highest accuracy and was used for the final model.
5) Accuracy on the testing dataset was 79.6%.
## Load Libraries and Inspect data
# Load Libraries
library(stats)
library(caret)
library(randomForest)
library(mlbench)
# Inspect data
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##1 5.1 3.5 1.4 0.2 setosa
##2 4.9 3.0 1.4 0.2 setosa
##3 4.7 3.2 1.3 0.2 setosa
##4 4.6 3.1 1.5 0.2 setosa
##5 5.0 3.6 1.4 0.2 setosa
##6 5.4 3.9 1.7 0.4 setosa
# Variables
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## Split data into Training and Testing
index <- sample(2,nrow(iris),replace = TRUE, prob=c(0.7,0.3))
#Training data
Training <- iris[index==1, ]
#Testing data
Testing <- iris[index==2, ]
## Create Random Forest Model
rfm = randomForest(Species~., data = Training)
print(rfm)
## randomForest(formula = Species ~ ., data = Training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 2.97%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 35 0 0 0.00000000
## versicolor 0 34 2 0.05555556
## virginica 0 1 29 0.03333333
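mtry for this model could also be tuned directly with randomForest::tuneRF, which searches around the default value and reports the OOB error for each candidate; a minimal sketch, assuming the Training split above (column 5 is Species) and an illustrative seed:
set.seed(123)
tuneRF(x = Training[, -5], y = Training$Species,
       ntreeTry = 500, stepFactor = 1.5, improve = 0.01, trace = TRUE)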
## Rank Variables
importance(rfm)
varImpPlot(rfm)
## MeanDecreaseGini
## Sepal.Length 6.064209
## Sepal.Width 1.479663
## Petal.Length 31.390687
## Petal.Width 27.506897
## Prediction of Training data
# Create Confusion Matrix
pre1 <- predict(rfm,Training)
confusionMatrix(pre1, Training$Species)
## Confusion Matrix and Statistics
## Reference
## Prediction setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 36 0
## virginica 0 0 30
## Overall Statistics
## Accuracy : 1
## 95% CI : (0.9641, 1)
## No Information Rate : 0.3564
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 1
## Mcnemar's Test P-Value : NA
## Statistics by Class:
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.000
## Specificity 1.0000 1.0000 1.000
## Pos Pred Value 1.0000 1.0000 1.000
## Neg Pred Value 1.0000 1.0000 1.000
## Prevalence 0.3465 0.3564 0.297
## Detection Rate 0.3465 0.3564 0.297
## Detection Prevalence 0.3465 0.3564 0.297
## Balanced Accuracy 1.0000 1.0000 1.000
## Prediction Table for Testing data
predicted_table <- predict(rfm, Testing[,-5])
table(observed= Testing[,5], predicted = predicted_table)
## predicted
## observed setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 12 2
## virginica 0 1 19
## Prediction of Testing data
pre2 <- predict(rfm,Testing)
confusionMatrix(pre2,Testing$Species)
## Confusion Matrix and Statistics
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 12 1
## virginica 0 2 19
## Overall Statistics
## Accuracy : 0.9388
## 95% CI : (0.8313, 0.9872)
## No Information Rate : 0.4082
## P-Value [Acc > NIR] : 5.007e-15
## Kappa : 0.9066
## Mcnemar's Test P-Value : NA
## Statistics by Class:
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8571 0.9500
## Specificity 1.0000 0.9714 0.9310
## Pos Pred Value 1.0000 0.9231 0.9048
## Neg Pred Value 1.0000 0.9444 0.9643
## Prevalence 0.3061 0.2857 0.4082
## Detection Rate 0.3061 0.2449 0.3878
## Detection Prevalence 0.3061 0.2653 0.4286
## Balanced Accuracy 1.0000 0.9143 0.9405
## Conclusion
1) The random forest reached a low OOB error rate (2.97%) without any tuning.
2) Petal.Length and Petal.Width are the most important variables by mean decrease in Gini.
3) Sensitivity and specificity for the setosa class are both 1.0 on the testing data, and remain high for the other two classes.