Load Libraries for the Random Forest Project

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(ggplot2)
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.2

Read in the dataset and examine its structure (Lset data)

Lset <- read.csv("D:/otherset.csv")
str(Lset)
## 'data.frame':    10 obs. of  4 variables:
##  $ Left : int  1 0 1 0 0 0 0 1 1 0
##  $ Right: int  45 0 92 18 26 48 41 52 64 80
##  $ Up   : int  24 26 32 41 80 76 92 39 46 50
##  $ Down : int  100 69 46 24 0 32 86 71 65 48
set.seed(1234)
Lset$Left <- as.factor(Lset$Left)
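
Since Lset has only 10 observations, it is worth confirming the class balance of the target before splitting. A minimal sketch in base R:

table(Lset$Left)              # counts for each class of the target
prop.table(table(Lset$Left))  # the same counts as proportions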

Create an 80%/20% split into training and validation datasets

validationIndex <- createDataPartition(Lset$Left,p=0.80, list=FALSE)
validation <- Lset[-validationIndex, ]
Ltrain <- Lset[validationIndex, ]
set.seed(123456)
rf <- randomForest(Left ~., Ltrain, mtry=3, ntree=500)
rf
## 
## Call:
##  randomForest(formula = Left ~ ., data = Ltrain, mtry = 3, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 44.44%
## Confusion matrix:
##   0 1 class.error
## 0 3 2         0.4
## 1 2 2         0.5
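
The OOB error above is estimated from the 9 training rows only. Since a validation set was held out, a quick sketch of scoring it as well (it contains just a single row here, so the result is indicative at best):

val_pred <- predict(rf, validation)   # class prediction(s) for the held-out row(s)
table(val_pred, validation$Left)      # hold-out confusion table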

Evaluate the random forest performance

ctrl <- trainControl(method = "cv", number=3)   # 3-fold cross-validation
grid_rf <- expand.grid(mtry=3)                  # defined here but not passed to train() below, so caret searches its default mtry grid
m_rf <- train(Left~., data=Ltrain, method="rf", metric="Accuracy", trControl=ctrl, ntree=500)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
m_rf
## Random Forest 
## 
## 9 samples
## 3 predictors
## 2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 6, 6, 6 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7777778  0.6000000
##   3     0.7777778  0.4666667
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
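
Note that grid_rf was created above but never passed to train(), so caret searched its default mtry grid (2 and 3). If the intent was to evaluate mtry = 3 only, a sketch of that call follows (m_rf_fixed is just an illustrative name; its results would differ from the output above):

set.seed(123456)
m_rf_fixed <- train(Left~., data=Ltrain, method="rf", metric="Accuracy",
                    trControl=ctrl, tuneGrid=grid_rf, ntree=500)   # illustrative refit with the fixed tuning grid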

Make a prediction

pred <- predict(m_rf, Lset)   # note: Lset includes the 9 training rows, so this table is optimistic
table(pred,Lset$Left)
##     
## pred 0 1
##    0 6 0
##    1 0 4
plot(m_rf)   # cross-validated accuracy across the mtry values tried
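
caret's confusionMatrix() wraps the same cross-tabulation and adds accuracy, kappa and per-class statistics. A sketch on the same predictions:

confusionMatrix(pred, Lset$Left)   # accuracy, kappa and per-class statistics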

Random Forest Project with Iris Dataset

Load packages

library(rpart)
library(caret)
library(ggplot2)
library(randomForest)
# Load the Iris dataset
Irset <- read.csv("D:/irisr.csv") 
set.seed(1000)
Irset$Species <- as.factor(Irset$Species)
str(Irset)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "Setosa","Versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(Irset)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  Setosa
## 2          4.9         3.0          1.4         0.2  Setosa
## 3          4.7         3.2          1.3         0.2  Setosa
## 4          4.6         3.1          1.5         0.2  Setosa
## 5          5.0         3.6          1.4         0.2  Setosa
## 6          5.4         3.9          1.7         0.4  Setosa

Create an 80%/20% split into training and validation datasets

val_base <- createDataPartition(Irset$Species,p=0.80, list=FALSE)
valid <- Irset[-val_base, ]
ir_train <- Irset[val_base, ]
set.seed(1000)
ir_rf <- randomForest(Species~., ir_train, mtry=4, importance=TRUE, ntree=500)
ir_rf
## 
## Call:
##  randomForest(formula = Species ~ ., data = ir_train, mtry = 4,      importance = TRUE, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 4.17%
## Confusion matrix:
##            Setosa Versicolor Virginica class.error
## Setosa         40          0         0       0.000
## Versicolor      0         38         2       0.050
## Virginica       0          3        37       0.075
# Evaluate the importance
importance(ir_rf)
##                Setosa Versicolor Virginica MeanDecreaseAccuracy
## Sepal.Length  0.00000 -0.8245351  3.678355             3.179503
## Sepal.Width   0.00000 -5.0667707  6.164603             1.975852
## Petal.Length 24.36507 33.9329757 29.518567            35.975324
## Petal.Width  22.47154 30.0467381 31.948066            33.638813
##              MeanDecreaseGini
## Sepal.Length        0.7272981
## Sepal.Width         1.0726318
## Petal.Length       40.0725004
## Petal.Width        37.4144365
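
The same importance measures can be plotted directly from the fitted forest; varImpPlot() is part of the randomForest package. A short sketch:

varImpPlot(ir_rf)   # dot plots of MeanDecreaseAccuracy and MeanDecreaseGini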

Evaluate the random forest performance

ir_ctrl <- trainControl(method = "cv", number=4)   # 4-fold control, defined but not used below
irgrid_rf <- expand.grid(mtry=4)                   # also defined but not passed to train()
irm_rf <- train(Species~., data=ir_train, method="rf", metric="Accuracy", trControl=ctrl, ntree=500)   # note: reuses the earlier 3-fold ctrl and caret's default mtry grid
irm_rf
## Random Forest 
## 
## 120 samples
##   4 predictor
##   3 classes: 'Setosa', 'Versicolor', 'Virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79, 80, 81 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9501824  0.9252884
##   3     0.9416354  0.9124679
##   4     0.9416354  0.9124679
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
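
As in the first project, ir_ctrl and irgrid_rf were defined but not passed to train(): trControl=ctrl reuses the earlier 3-fold control and the default mtry grid was searched. A sketch of the presumably intended call (irm_rf2 is just an illustrative name; its results would differ from the output above):

set.seed(1000)
irm_rf2 <- train(Species~., data=ir_train, method="rf", metric="Accuracy",
                 trControl=ir_ctrl, tuneGrid=irgrid_rf, ntree=500)   # 4-fold CV with mtry fixed at 4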

Make a Prediction

pred_iris <- predict(irm_rf,Irset)   # note: Irset includes the 120 training rows, so this table is optimistic
table(pred_iris,Irset$Species)
##             
## pred_iris    Setosa Versicolor Virginica
##   Setosa         50          0         0
##   Versicolor      0         49         2
##   Virginica       0          1        48
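
The table above again scores rows the model was trained on. Scoring only the 30 held-out rows in valid gives a cleaner estimate; a short sketch:

valid_pred <- predict(irm_rf, valid)   # predictions for the held-out rows only
table(valid_pred, valid$Species)       # hold-out confusion table
mean(valid_pred == valid$Species)      # hold-out accuracy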

Visualize the Prediction

plot(Irset)   # scatterplot matrix of the raw iris measurements

plot(irm_rf)  # cross-validated accuracy for each mtry value tried
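
Since ggplot2 is already loaded, the predictions themselves can also be visualised; a sketch plotting the petal dimensions coloured by the predicted class (the Predicted column is added here only for plotting):

Irset$Predicted <- pred_iris   # attach the predictions to the data for plotting
ggplot(Irset, aes(x = Petal.Length, y = Petal.Width, colour = Predicted)) +
  geom_point() +
  labs(title = "Random forest predictions", colour = "Predicted species")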