rm(list = ls())
# Load packages
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the dataset
data(iris)
set.seed(5)
# Create an 80%/20% split for training and test datasets
trainingIndex <- createDataPartition(iris$Species, p=0.80, list=FALSE)
training <- iris[trainingIndex, ]
test <- iris[ -trainingIndex, ]
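createDataPartition() stratifies on the outcome, so each class keeps its 80%/20% proportion. A quick sanity check (a sketch):
# Check the stratified split: 40 of each class for training, 10 for test
table(training$Species)
table(test$Species)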
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=2, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=2)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 3 times)
## Summary of sample sizes: 60, 60, 60, 60, 60, 60, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9222222 0.8833333
## 3 0.9444444 0.9166667
## 4 0.9250000 0.8875000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
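caret searched its default mtry grid above. To control exactly which mtry values are tried, we can pass an explicit tuneGrid (a sketch, reusing the trainControl from above):
# Sketch: search a custom mtry grid instead of caret's default
tuneGrid <- expand.grid(mtry=1:4)
rf.model.grid <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, tuneGrid=tuneGrid, ntree=2)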
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 8 1
## virginica 0 2 9
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7347, 0.9789)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 1.665e-10
##
## Kappa : 0.85
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8000 0.9000
## Specificity 1.0000 0.9500 0.9000
## Pos Pred Value 1.0000 0.8889 0.8182
## Neg Pred Value 1.0000 0.9048 0.9474
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.2667 0.3000
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.8750 0.9000
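The overall accuracy can also be pulled out of the confusionMatrix object programmatically (a sketch):
# Sketch: extract the overall accuracy as a single number
cm <- confusionMatrix(prediction, test$Species)
cm$overall["Accuracy"]  # 0.9 for this split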
Accuracy is 90%. To improve it, we can increase the number of folds, repeats, or trees (ntree). For example:
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=5, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=50)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 96, 96, 96, 96, 96, 96, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9638889 0.9458333
## 3 0.9555556 0.9333333
## 4 0.9638889 0.9458333
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
Because most of the samples in the iris data fall clearly into one of the three classes, we do not really need to increase the number of trees much. Increasing ntree to 500 or 5000 does not improve the accuracy any further.
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=5, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=500)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 96, 96, 96, 96, 96, 96, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9611111 0.9416667
## 3 0.9638889 0.9458333
## 4 0.9611111 0.9416667
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
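One way to confirm that the extra trees add little here is caret's resamples(), which compares the cross-validated metrics of several fits side by side (a sketch; rf.model.50 and rf.model.500 are hypothetical names, since each fit would need to be saved separately instead of overwriting rf.model):
# Sketch: compare the resampling distributions of the two fits
# (hypothetical names; save each train() result under its own name first)
results <- resamples(list(ntree50=rf.model.50, ntree500=rf.model.500))
summary(results)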
# Train a final model on all training data, without k-fold cross-validation
set.seed(5)
finalModel <- randomForest(Species~., data=training, mtry=3, ntree=500)
print(finalModel)
##
## Call:
## randomForest(formula = Species ~ ., data = training, mtry = 3, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 4.17%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 40 0 0 0.000
## versicolor 0 38 2 0.050
## virginica 0 3 37 0.075
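The fitted forest also reports variable importance, which shows which predictors drive the classification (a sketch using finalModel):
# Sketch: mean decrease in Gini impurity for each predictor
importance(finalModel)
varImpPlot(finalModel, main="Variable importance")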
# Make predictions on "new data" using the final model
finalPredictions <- predict(finalModel, test[ , -5])
confusionMatrix(finalPredictions, test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
plot(finalModel)
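plot() on a randomForest object draws the OOB and per-class error rates against the number of trees; a legend makes the curves easier to read (a sketch):
# Sketch: label the OOB and per-class error curves
plot(finalModel, main="Error rates vs. number of trees")
legend("topright", legend=colnames(finalModel$err.rate), col=1:ncol(finalModel$err.rate), lty=1:ncol(finalModel$err.rate))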
The plots below show the variables of the iris dataset with Species as the color coding. Sepal.Width and Sepal.Length do not separate versicolor and virginica clearly, but the other two variables are better and more useful for classification. The 3D plots show that a few samples are difficult to classify, which is why the accuracy did not reach 100%.
library(ggplot2)
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + geom_point()
ggplot(data = iris, mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)) + geom_point()
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = iris$Sepal.Length, y = iris$Sepal.Width, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
plot_ly(x = iris$Petal.Length, y = iris$Sepal.Width, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
plot_ly(x = iris$Sepal.Length, y = iris$Petal.Length, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
rm(list = ls())
# Load packages
library(caret)
library(mlbench)
library(randomForest)
# Load the dataset
ghemri <- read.csv('Dataset3.csv')
set.seed(1)
# Create an 80%/20% split: 8 of the 10 samples for training, 2 for test
trainingIndex <- sample(1:10, 8, replace = FALSE)
training <- ghemri[trainingIndex, ]
test <- ghemri[ -trainingIndex, ]
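Note that sample(1:10, 8) hard-codes the row count of Dataset3.csv; a version that derives it from the data would be safer (a sketch):
# Sketch: an 80% split that does not hard-code the number of rows
trainingIndex <- sample(seq_len(nrow(ghemri)), size = floor(0.8 * nrow(ghemri)))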
# Train a model and summarize the model
set.seed(1)
trainControl <- trainControl(method="repeatedcv", number=4, repeats=3)
rf.model <- train(Down~., data=training, method="rf", metric="RMSE", trControl=trainControl, ntree=5)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(rf.model)
## Random Forest
##
## 8 samples
## 3 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 3 times)
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 33.00492 1 28.83722
## 3 37.61027 1 33.27931
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model$finalModel,test[,-4])
table(prediction,test$Down)
##
## prediction 32 71
## 59.1166666666667 1 1
print("Prediction:")
## [1] "Prediction:"
prediction
## 6 8
## 59.11667 59.11667
print("Actual Values:")
## [1] "Actual Values:"
test$Down
## [1] 32 71
# Calculate and print RMSE
rmse <- sqrt(mean((prediction - test$Down)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 20.9347465021937"
Let's play with the tuning parameters to see if we can improve the fit (i.e., lower the RMSE):
# Train a model and summarize the model
set.seed(1)
trainControl <- trainControl(method="repeatedcv", number=4, repeats=3)
rf.model <- train(Down~., data=training, method="rf", metric="RMSE", trControl=trainControl, ntree=500)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
print(rf.model)
## Random Forest
##
## 8 samples
## 3 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 3 times)
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 35.00634 1 31.23244
## 3 36.49981 1 32.69365
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model$finalModel,test[,-4])
table(prediction,test$Down)
##
## prediction 32 71
## 53.0509666666667 1 0
## 60.8486666666667 0 1
print("Prediction:")
## [1] "Prediction:"
prediction
## 6 8
## 53.05097 60.84867
print("Actual Values:")
## [1] "Actual Values:"
test$Down
## [1] 32 71
# Calculate and print RMSE
rmse <- sqrt(mean((prediction - test$Down)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 16.5256280674224"
The estimates are not perfect, but they are better than before and the RMSE has been reduced. I did not increase the number of folds because we only have 8 samples in the training set, so a higher k value does not make sense.
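With so few samples, leave-one-out cross-validation is the natural alternative to adding folds (a sketch, reusing the same formula and data):
# Sketch: leave-one-out CV, where each of the 8 training samples
# serves as its own validation fold
set.seed(1)
loocvControl <- trainControl(method="LOOCV")
rf.loocv <- train(Down~., data=training, method="rf", metric="RMSE", trControl=loocvControl, ntree=500)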