Importing the necessary packages

library(RCurl)
library(rsample)
library(randomForest)
library(mlbench)
library(caret)
library(doParallel)
library(dplyr)

Random Forest for the Iris Dataset

Importing the dataset

# Download the raw CSV from the UCI repository
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
downloaded <- getURL(urlfile, ssl.verifypeer = FALSE)

# The file has no header row, so read it with header = FALSE
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header = FALSE)
head(dataset)
##    V1  V2  V3  V4          V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
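
Because the UCI file ships without a header row, the columns arrive as V1..V5. If descriptive names are preferred, they can be assigned as sketched below; this step is skipped here, since all of the remaining code refers to V1..V5.

# Optional: descriptive column names (not run; the code below uses V1..V5)
# colnames(dataset) <- c("sepal_length", "sepal_width",
#                        "petal_length", "petal_width", "species")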

Generating the training and testing sets

set.seed(123)

# Split the dataset into training (80%) and testing (20%) sets
split <- initial_split(dataset, prop = 0.8)

# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)

# Convert target variable to a factor (ensuring it's a classification problem)
train_data$V5 <- as.factor(train_data$V5)
test_data$V5 <- as.factor(test_data$V5)

# View the split summary
print(split)
## <Training/Testing/Total>
## <120/30/150>
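
For classification it is often safer to stratify the split on the outcome so that the class proportions match in both partitions. A minimal alternative using initial_split()'s strata argument, not used for the results shown above:

# Stratified alternative: preserve the V5 class proportions in both sets
# split <- initial_split(dataset, prop = 0.8, strata = V5)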

Training the Random Forest and evaluating the results

# Set seed for reproducibility
set.seed(123)

# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = V5 ~ .,                   # V5 holds the species labels
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Predictors sampled per split (rounded internally)
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = V5 ~ ., data = train_data, ntree = 500,      mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.17%
## Confusion matrix:
##                 Iris-setosa Iris-versicolor Iris-virginica class.error
## Iris-setosa              40               0              0  0.00000000
## Iris-versicolor           0              32              3  0.08571429
## Iris-virginica            0               2             43  0.04444444
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)

# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$V5, Predicted = predictions)
print(conf_matrix)
##                  Predicted
## Actual            Iris-setosa Iris-versicolor Iris-virginica
##   Iris-setosa              10               0              0
##   Iris-versicolor           0              14              1
##   Iris-virginica            0               0              5
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 96.67 %"

Random Forest for the Pima Indians Diabetes dataset

Importing the dataset

data(PimaIndiansDiabetes)
dataset <- PimaIndiansDiabetes

# Split the dataset into training (80%) and testing (20%) sets
set.seed(123)  # seed the RNG so the split is reproducible
split <- initial_split(dataset, prop = 0.8)

# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)

Executing the Random Forest algorithm

# Set seed for reproducibility
set.seed(123)

# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = diabetes ~ .,             # diabetes holds the class labels
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Predictors sampled per split (rounded internally)
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = train_data, ntree = 500,      mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 23.29%
## Confusion matrix:
##     neg pos class.error
## neg 324  68   0.1734694
## pos  75 147   0.3378378
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)

# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
##       Predicted
## Actual neg pos
##    neg  88  20
##    pos  20  26
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 74.03 %"

Performing hyperparameter tuning

Activating parallel computing

cl <- makePSOCKcluster(detectCores() - 1)  # Use all but one core
registerDoParallel(cl)

# Define the hyperparameter grid. Note that mtry cannot exceed the number of
# predictors (8 here), so the 10/15/20 settings fail inside the resampling
# loop and surface as the "missing values" warnings below.
tune_grid <- expand.grid(
  mtry = c(2, 5, 10, 15, 20),     # Broad range of mtry
  splitrule = "gini",             # For classification
  min.node.size = c(1, 3, 5, 10)  # Different node sizes
)

# Train model with cross-validation
train_control <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation

# Run the model with hyperparameter tuning
rf_model_tuned <- train(
  diabetes ~ ., 
  data = train_data,
  method = "ranger",  # Efficient Random Forest
  trControl = train_control,
  tuneGrid = tune_grid,
  num.trees = 1000  # Number of trees (adjust as needed)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
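Once tuning finishes, the worker processes should be released; otherwise they linger for the rest of the session. A minimal cleanup step:

# Shut down the parallel workers and return to sequential execution
stopCluster(cl)
registerDoSEQ()
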
# Print best parameters
print(rf_model_tuned$bestTune)
##   mtry splitrule min.node.size
## 8    5      gini            10
# Final model with best hyperparameters
rf_final <- randomForest(
  formula = diabetes ~ .,  
  data = train_data,              
  ntree = 500,                    
  mtry = rf_model_tuned$bestTune$mtry,  
  nodesize = rf_model_tuned$bestTune$min.node.size,                   
  replace = TRUE                   
)

# Make predictions on test data
predictions <- predict(rf_final, newdata = test_data)

# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
##       Predicted
## Actual neg pos
##    neg  88  20
##    pos  18  28
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 75.32 %"

Random Forest for the Wine Quality Dataset

dataset <- read.csv("WineQT.csv")
dataset <- subset(dataset, select = -Id)    # Drop the row-identifier column
dataset$quality <- factor(dataset$quality)  # Treat quality scores as classes

# Split the dataset into training (80%) and testing (20%) sets
set.seed(123)  # seed the RNG so the split is reproducible
split <- initial_split(dataset, prop = 0.8)

# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
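
Wine quality scores are heavily imbalanced, which shapes everything that follows. A quick check of the class counts in the training partition (output not shown):

# Class counts per quality score; the extreme scores are rare
table(train_data$quality)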

Executing the Random Forest algorithm

# Set seed for reproducibility
set.seed(123)

# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = quality ~ .,              # quality holds the class labels
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Predictors sampled per split (rounded internally)
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train_data, ntree = 500,      mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 33.15%
## Confusion matrix:
##   3 4   5   6  7 8 class.error
## 3 0 0   5   1  0 0   1.0000000
## 4 0 0  14  12  0 0   1.0000000
## 5 0 0 292  91  3 0   0.2435233
## 6 0 0  85 268 19 0   0.2795699
## 7 0 0   3  56 51 0   0.5363636
## 8 0 0   0  11  3 0   1.0000000
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)

# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$quality, Predicted = predictions)
print(conf_matrix)
##       Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  0  0  0  0
##      4  0  0  7  0  0  0
##      5  0  0 72 25  0  0
##      6  0  0 24 61  5  0
##      7  0  0  2 19 12  0
##      8  0  0  0  1  1  0
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 63.32 %"