Iris Dataset
# Paul Brown Random Forest Project
# Load Libraries
library(stats)
library(caret)
library(randomForest)
library(mlbench)
library(dplyr)
library(tidyverse)
# Load and Inspect Data
data("iris")
head(iris)
View(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Variables
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
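Before splitting, a quick sanity check (a sketch, not part of the original output) confirms the dataset has no missing values and that the three species are balanced.
# Optional data checks (illustrative; output not shown)
colSums(is.na(iris))   # expect all zeros: iris has no missing values
table(iris$Species)    # expect 50 observations per species
summary(iris)          # ranges and quartiles of the four measurements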
# Split Data into Training and Testing
index <- sample(2,nrow(iris),replace = TRUE, prob=c(0.7,0.3))
# Training data
Training <- iris[index==1, ]
# Testing data
Testing <- iris[index==2, ]
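The sample()-based split above is neither seeded nor stratified, so each run can give slightly different class proportions. A stratified alternative using caret's createDataPartition() is sketched below; the object names (strat_idx, Training_strat, Testing_strat) are illustrative, and the results reported later were not produced with this split.
# Alternative: seeded, stratified 70/30 split with caret (not used for the output below)
set.seed(123)                                               # any fixed seed gives a reproducible split
strat_idx      <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
Training_strat <- iris[strat_idx, ]                         # ~35 rows per species
Testing_strat  <- iris[-strat_idx, ]                        # ~15 rows per species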
# Random Forest Model
rf <- randomForest(Species ~ ., data = Training)
print(rf)
## randomForest(formula = Species ~ ., data = Training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 3.96%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 37 0 0 0.00000000
## versicolor 0 31 2 0.06060606
## virginica 0 2 29 0.06451613
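The model above relies on randomForest's defaults. For reference, the same fit with those defaults written out, plus permutation importance enabled, is sketched here (rf_explicit is an illustrative name, not part of the original script).
# Same model with the default hyperparameters made explicit (illustrative)
rf_explicit <- randomForest(Species ~ ., data = Training,
                            ntree = 500,        # default number of trees
                            mtry = 2,           # default for classification: floor(sqrt(4 predictors)) = 2
                            importance = TRUE)  # also compute permutation importance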
# Importance
importance(rf)
varImpPlot(rf)
## MeanDecreaseGini
## Sepal.Length 6.183899
## Sepal.Width 1.426547
## Petal.Length 30.243872
## Petal.Width 28.581184
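Because rf was fit without importance = TRUE, importance(rf) reports only MeanDecreaseGini. Permutation-based importance can be read from the rf_explicit sketch above.
# Permutation importance (requires importance = TRUE at fit time)
importance(rf_explicit, type = 1)   # type = 1: mean decrease in accuracy
varImpPlot(rf_explicit)             # plots both importance measures side by side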
# Evaluate Model Accuracy on Test Data
# Confusion Matrix
pred_1 <- predict(rf,Testing)
confusionMatrix(pred_1,Testing$Species)
# Confusion Matrix and Statistics (Testing Data)
## Reference
## Prediction setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 15 0
## virginica 0 2 19
## Overall Statistics
## Accuracy : 0.9592
## 95% CI : (0.8602, 0.995)
## No Information Rate : 0.3878
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 0.9379
## Mcnemar's Test P-Value : NA
## Statistics by Class:
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8824           1.0000
## Specificity                 1.0000            1.0000           0.9333
## Pos Pred Value              1.0000            1.0000           0.9048
## Neg Pred Value              1.0000            0.9412           1.0000
## Prevalence                  0.2653            0.3469           0.3878
## Detection Rate              0.2653            0.3061           0.3878
## Detection Prevalence        0.2653            0.3061           0.4286
## Balanced Accuracy           1.0000            0.9412           0.9667
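The reported accuracy can also be checked directly against the predictions with a one-line calculation:
# Quick check of overall test accuracy
mean(pred_1 == Testing$Species)   # 47/49 correct, about 0.9592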
# Error Rate
plot(rf)
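plot(rf) draws the OOB error and one error curve per class against the number of trees, but it does not include a legend; one can be added to the plot just drawn:
# Add a legend to the error-rate plot drawn above
legend("topright",
       legend = colnames(rf$err.rate),    # "OOB" plus the three species
       col = 1:ncol(rf$err.rate),
       lty = 1:ncol(rf$err.rate))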
# Random Search
control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
seed <- 7                          # any fixed value; set.seed() makes the tuning run reproducible
metric <- "Accuracy"               # metric used by train() to select the best mtry
set.seed(seed)
mtry <- sqrt(ncol(Training) - 1)   # randomForest's default mtry for 4 predictors (= 2); random search explores other values
rf_random <- train(Species~., data=Training, method="rf", metric=metric, tuneLength=15, trControl=control)
print(rf_random)
plot(rf_random)
## Random Forest
## 101 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 90, 91, 91, 92, 90, 91, ...
## Resampling results across tuning parameters:
##   mtry  Accuracy   Kappa
##   2     0.9772054  0.9656726
##   3     0.9805387  0.9708008
##   4     0.9906061  0.9858582
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
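A grid search over every possible mtry value (1 through 4 for this dataset) is the usual complement to the random search above; a sketch using the same repeated cross-validation setup follows (control_grid and rf_grid are illustrative names).
# Grid search over mtry (illustrative; output not shown)
control_grid <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "grid")
tunegrid <- expand.grid(mtry = 1:4)        # all candidate values for 4 predictors
set.seed(seed)
rf_grid <- train(Species ~ ., data = Training, method = "rf", metric = metric,
                 tuneGrid = tunegrid, trControl = control_grid)
print(rf_grid)
plot(rf_grid)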
# Predicted vs. Observed Classes (Testing Data)
predicted_table <- predict(rf, Testing[,-5])
table(observed= Testing[,5], predicted = predicted_table)
## predicted
## observed setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 15 2
## virginica 0 0 19
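The overall test accuracy can also be recovered from this cross-tabulation by dividing the diagonal (correct predictions) by the total:
# Accuracy from the cross-tabulation
tab <- table(observed = Testing[, 5], predicted = predicted_table)
sum(diag(tab)) / sum(tab)   # 47/49, about 0.96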