Importing necessary packages
library(RCurl)
library(rsample)
library(randomForest)
library(mlbench)
library(caret)
library(doParallel)
library(dplyr)
Random Forest for the Iris Dataset
Importing the dataset
# Download the raw iris CSV from the UCI repository and read it (no header row)
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
downloaded <- getURL(urlfile, ssl.verifypeer=FALSE)
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header=FALSE)
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
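Because the raw file has no header row, read.csv assigns the generic names V1 through V5, with V5 holding the species label. If descriptive names are preferred, a minimal sketch (names taken from the UCI iris documentation; the rest of this walkthrough keeps the V1..V5 defaults):
# Copy with descriptive names; downstream code continues to use V1..V5
iris_named <- setNames(dataset, c("sepal.length", "sepal.width",
                                  "petal.length", "petal.width", "species"))
head(iris_named, 3)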
Generating training and testing sets
set.seed(123)
# Split the dataset into training (80%) and testing (20%) sets
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
# Convert target variable to a factor (ensuring it's a classification problem)
train_data$V5 <- as.factor(train_data$V5)
test_data$V5 <- as.factor(test_data$V5)
# View the split summary
print(split)
## <Training/Testing/Total>
## <120/30/150>
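initial_split draws rows at random, so class proportions can drift slightly between the partitions. If that matters, rsample's strata argument performs a stratified split; a minimal sketch:
# Stratified alternative: keeps the three species in roughly equal
# proportion across the training and testing sets
strat_split <- initial_split(dataset, prop = 0.8, strata = V5)
table(training(strat_split)$V5)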
Training the Random Forest and getting results
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = V5 ~ .,                   # V5 holds the species labels
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = V5 ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.17%
## Confusion matrix:
## Iris-setosa Iris-versicolor Iris-virginica class.error
## Iris-setosa 40 0 0 0.00000000
## Iris-versicolor 0 32 3 0.08571429
## Iris-virginica 0 2 43 0.04444444
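Beyond the OOB error, randomForest also tracks how much each predictor contributes to the fit; both accessors below are part of the randomForest package:
# Mean decrease in Gini impurity for each predictor
importance(rf_model)
# Dot chart of the same information
varImpPlot(rf_model)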
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$V5, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 10 0 0
## Iris-versicolor 0 14 1
## Iris-virginica 0 0 5
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 96.67 %"
Random Forest for the Pima Indians Diabetes Dataset
Importing the dataset
data(PimaIndiansDiabetes)
dataset <- PimaIndiansDiabetes
# Set a seed so the split is reproducible, then split into training (80%) and testing (20%) sets
set.seed(123)
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
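Unlike iris, this dataset is imbalanced, which is worth confirming before modelling; a quick check of the outcome distribution in the training partition:
# Roughly two negatives for every positive; keep this in mind
# when reading the class errors below
prop.table(table(train_data$diabetes))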
Executing Random Forest algorithm
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = diabetes ~ .,             # diabetes (neg/pos) is the outcome
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = diabetes ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 23.29%
## Confusion matrix:
## neg pos class.error
## neg 324 68 0.1734694
## pos 75 147 0.3378378
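The positive class is misclassified roughly twice as often as the negative class, a typical symptom of the imbalance noted above. One option is randomForest's sampsize argument, which draws class-balanced bootstrap samples; a sketch with an illustrative, untuned per-class count (the model fitted above is kept for the evaluation that follows):
# Sketch: balanced bootstrap samples; 150 per class is illustrative, not tuned
rf_balanced <- randomForest(diabetes ~ ., data = train_data,
                            ntree = 500,
                            sampsize = c(neg = 150, pos = 150))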
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual neg pos
## neg 88 20
## pos 20 26
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 74.03 %"
Performing hyperparameter tuning
Activating parallel computing
cl <- makePSOCKcluster(detectCores() - 1) # Use all but 1 core
registerDoParallel(cl)
# Define hyperparameter grid
tune_grid <- expand.grid(
  mtry = c(2, 5, 10, 15, 20),      # Broad range; values above the 8 available predictors produce the warnings below
  splitrule = "gini",              # For classification
  min.node.size = c(1, 3, 5, 10)   # Different node sizes
)
# Train model with cross-validation
train_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Run the model with hyperparameter tuning
rf_model_tuned <- train(
  diabetes ~ .,
  data = train_data,
  method = "ranger",           # Fast Random Forest implementation
  trControl = train_control,
  tuneGrid = tune_grid,
  num.trees = 1000             # Number of trees (adjust as needed)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
These warnings come from the grid rows where mtry exceeds the 8 available predictors; those fits fail during resampling and yield missing performance values.
# Shut down the parallel workers now that tuning is finished
stopCluster(cl)
registerDoSEQ()
# Print best parameters
print(rf_model_tuned$bestTune)
## mtry splitrule min.node.size
## 8 5 gini 10
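The refit below transfers the winning mtry and min.node.size into randomForest (nodesize is its equivalent of ranger's min.node.size). Note you could also predict straight from the tuned caret object, which already wraps the best ranger fit:
# Alternative: evaluate the caret-wrapped ranger model directly
tuned_preds <- predict(rf_model_tuned, newdata = test_data)
mean(tuned_preds == test_data$diabetes)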
# Final model with best hyperparameters
rf_final <- randomForest(
  formula = diabetes ~ .,
  data = train_data,
  ntree = 500,
  mtry = rf_model_tuned$bestTune$mtry,
  nodesize = rf_model_tuned$bestTune$min.node.size,
  replace = TRUE
)
# Make predictions on test data
predictions <- predict(rf_final, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual neg pos
## neg 88 20
## pos 18 28
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 75.32 %"
Random Forest for the Wine Quality Dataset
Importing the dataset
# Load the wine quality data, drop the Id column, and treat quality as a class label
dataset <- read.csv("WineQT.csv")
dataset <- subset(dataset, select = -Id)
dataset$quality <- factor(dataset$quality)
# Set a seed so the split is reproducible, then split into training (80%) and testing (20%) sets
set.seed(123)
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
Executing Random Forest algorithm
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = quality ~ .,              # quality (grades 3-8) is the class label
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = quality ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 33.15%
## Confusion matrix:
## 3 4 5 6 7 8 class.error
## 3 0 0 5 1 0 0 1.0000000
## 4 0 0 14 12 0 0 1.0000000
## 5 0 0 292 91 3 0 0.2435233
## 6 0 0 85 268 19 0 0.2795699
## 7 0 0 3 56 51 0 0.5363636
## 8 0 0 0 11 3 0 1.0000000
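The 100% class errors for qualities 3, 4, and 8 reflect how rare those grades are: the forest almost never sees them during training. Checking the class counts makes this plain, and a stratified split (via the strata argument, used the same way as earlier) at least keeps the rare grades represented in both partitions:
# Rare grades explain the degenerate rows in the OOB confusion matrix
table(train_data$quality)
# Stratified alternative to the plain split used above
strat_split <- initial_split(dataset, prop = 0.8, strata = quality)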
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$quality, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 0 0 0 0
## 4 0 0 7 0 0 0
## 5 0 0 72 25 0 0
## 6 0 0 24 61 5 0
## 7 0 0 2 19 12 0
## 8 0 0 0 1 1 0
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 63.32 %"