rm(list = ls())
# Load packages
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the dataset
data(iris)
set.seed(5)
# Create an 80%/20% split for training and test datasets
trainingIndex <- createDataPartition(iris$Species, p=0.80, list=FALSE)
training <- iris[trainingIndex, ]
test <- iris[ -trainingIndex, ]
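createDataPartition() stratifies on the outcome, so each class keeps its 80%/20% proportion. A quick sanity check (a sketch):
# Check the stratified split: 40 of each class for training, 10 for test
table(training$Species)
table(test$Species)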
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=2, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=2)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 3 times)
## Summary of sample sizes: 60, 60, 60, 60, 60, 60, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9222222 0.8833333
## 3 0.9444444 0.9166667
## 4 0.9250000 0.8875000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
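caret searched its default mtry grid above. To control exactly which mtry values are tried, we can pass an explicit tuneGrid (a sketch, reusing the trainControl from above):
# Sketch: search a custom mtry grid instead of caret's default
tuneGrid <- expand.grid(mtry=1:4)
rf.model.grid <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, tuneGrid=tuneGrid, ntree=2)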
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 8 1
## virginica 0 2 9
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7347, 0.9789)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 1.665e-10
##
## Kappa : 0.85
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8000 0.9000
## Specificity 1.0000 0.9500 0.9000
## Pos Pred Value 1.0000 0.8889 0.8182
## Neg Pred Value 1.0000 0.9048 0.9474
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.2667 0.3000
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.8750 0.9000
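The overall accuracy can also be pulled out of the confusionMatrix object programmatically (a sketch):
# Sketch: extract the overall accuracy as a single number
cm <- confusionMatrix(prediction, test$Species)
cm$overall["Accuracy"]  # 0.9 for this split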
Accuracy is 90%. To improve it, we can increase the number of folds, repeats, or trees (ntree). For example:
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=5, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=50)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 96, 96, 96, 96, 96, 96, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9638889 0.9458333
## 3 0.9555556 0.9333333
## 4 0.9638889 0.9458333
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
Because most of the samples in the iris data fall clearly into one of the three classes, we do not really need to increase the number of trees much. Increasing ntree to 500 or 5000 does not improve the accuracy any further.
# Train a model and summarize the model
set.seed(5)
trainControl <- trainControl(method="repeatedcv", number=5, repeats=3)
rf.model <- train(Species~., data=training, method="rf", metric="Accuracy", trControl=trainControl, ntree=500)
print(rf.model)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 96, 96, 96, 96, 96, 96, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9611111 0.9416667
## 3 0.9638889 0.9458333
## 4 0.9611111 0.9416667
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
prediction <- predict(rf.model,test[,-5])
confusionMatrix(prediction,test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
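One way to confirm that the extra trees add little here is caret's resamples(), which compares the cross-validated metrics of several fits side by side (a sketch; rf.model.50 and rf.model.500 are hypothetical names, since each fit would need to be saved separately instead of overwriting rf.model):
# Sketch: compare the resampling distributions of the two fits
# (hypothetical names; save each train() result under its own name first)
results <- resamples(list(ntree50=rf.model.50, ntree500=rf.model.500))
summary(results)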
# Train a final model on all training data, without k-fold cross-validation
set.seed(5)
finalModel <- randomForest(Species~., data=training, mtry=3, ntree=500)
print(finalModel)
##
## Call:
## randomForest(formula = Species ~ ., data = training, mtry = 3, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 4.17%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 40 0 0 0.000
## versicolor 0 38 2 0.050
## virginica 0 3 37 0.075
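The fitted forest also reports variable importance, which shows which predictors drive the classification (a sketch using finalModel):
# Sketch: mean decrease in Gini impurity for each predictor
importance(finalModel)
varImpPlot(finalModel, main="Variable importance")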
# Make predictions on "new data" using the final model
finalPredictions <- predict(finalModel, test[ , -5])
confusionMatrix(finalPredictions, test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
plot(finalModel)
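plot() on a randomForest object draws the OOB and per-class error rates against the number of trees; a legend makes the curves easier to read (a sketch):
# Sketch: label the OOB and per-class error curves
plot(finalModel, main="Error rates vs. number of trees")
legend("topright", legend=colnames(finalModel$err.rate), col=1:ncol(finalModel$err.rate), lty=1:ncol(finalModel$err.rate))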
The plots below show the variables of the iris dataset with Species as the color coding. Sepal.Width and Sepal.Length do not separate versicolor and virginica clearly, but the other two variables are better and more useful for classification. The 3D plots show that a few samples are difficult to classify, which is why the accuracy did not reach 100%.
library(ggplot2)
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + geom_point()
ggplot(data = iris, mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)) + geom_point()
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = iris$Sepal.Length, y = iris$Sepal.Width, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
plot_ly(x = iris$Petal.Length, y = iris$Sepal.Width, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
plot_ly(x = iris$Sepal.Length, y = iris$Petal.Length, z = iris$Petal.Width, type = 'scatter3d', mode = 'markers', color = iris$Species)
rm(list = ls())
# Load packages
library(caret)
library(mlbench)
library(randomForest)
# Load the dataset
ghemri <- read.csv('Dataset3.csv')
set.seed(1)
# Create an 80%/20% split: 8 of the 10 samples for training, 2 for test
trainingIndex <- sample(1:10, 8, replace = FALSE)
training <- ghemri[trainingIndex, ]
test <- ghemri[ -trainingIndex, ]
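Note that sample(1:10, 8) hard-codes the row count of Dataset3.csv; a version that derives it from the data would be safer (a sketch):
# Sketch: an 80% split that does not hard-code the number of rows
trainingIndex <- sample(seq_len(nrow(ghemri)), size = floor(0.8 * nrow(ghemri)))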
# Train a model and summarize the model
set.seed(1)
trainControl <- trainControl(method="repeatedcv", number=4, repeats=3)
rf.model <- train(Down~., data=training, method="rf", metric="RMSE", trControl=trainControl, ntree=5)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(rf.model)
## Random Forest
##
## 8 samples
## 3 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 3 times)
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 33.00492 1 28.83722
## 3 37.61027 1 33.27931
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model$finalModel,test[,-4])
table(prediction,test$Down)
##
## prediction 32 71
## 59.1166666666667 1 1
print("Prediction:")
## [1] "Prediction:"
prediction
## 6 8
## 59.11667 59.11667
print("Actual Values:")
## [1] "Actual Values:"
test$Down
## [1] 32 71
# Calculate and print RMSE
rmse <- sqrt(mean((prediction - test$Down)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 20.9347465021937"
Let's play with the tuning parameters to see if we can improve the fit (i.e., lower the RMSE):
# Train a model and summarize the model
set.seed(1)
trainControl <- trainControl(method="repeatedcv", number=4, repeats=3)
rf.model <- train(Down~., data=training, method="rf", metric="RMSE", trControl=trainControl, ntree=500)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
print(rf.model)
## Random Forest
##
## 8 samples
## 3 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 3 times)
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 35.00634 1 31.23244
## 3 36.49981 1 32.69365
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
prediction <- predict(rf.model$finalModel,test[,-4])
table(prediction,test$Down)
##
## prediction 32 71
## 53.0509666666667 1 0
## 60.8486666666667 0 1
print("Prediction:")
## [1] "Prediction:"
prediction
## 6 8
## 53.05097 60.84867
print("Actual Values:")
## [1] "Actual Values:"
test$Down
## [1] 32 71
# Calculate and print RMSE
rmse <- sqrt(mean((prediction - test$Down)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 16.5256280674224"
The estimates are not perfect, but they are better than before and the RMSE has been reduced. I did not increase the number of folds because we only have 8 samples in the training set, so a higher k value does not make sense.
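With so few samples, leave-one-out cross-validation is the natural alternative to adding folds (a sketch, reusing the same formula and data):
# Sketch: leave-one-out CV, where each of the 8 training samples
# serves as its own validation fold
set.seed(1)
loocvControl <- trainControl(method="LOOCV")
rf.loocv <- train(Down~., data=training, method="rf", metric="RMSE", trControl=loocvControl, ntree=500)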