Importing necessary packages
library(RCurl)
library(rsample)
library(randomForest)
library(mlbench)
library(caret)
library(doParallel)
library(dplyr)
Random Forest for the Iris Dataset
Importing the dataset
# Download the raw iris CSV from the UCI repository and read it (no header row)
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
downloaded <- getURL(urlfile, ssl.verifypeer=FALSE)
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header=FALSE)
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
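Because the raw file has no header row, read.csv assigns the generic names V1 through V5, with V5 holding the species label. If descriptive names are preferred, a minimal sketch (names taken from the UCI iris documentation; the rest of this walkthrough keeps the V1..V5 defaults):
# Copy with descriptive names; downstream code continues to use V1..V5
iris_named <- setNames(dataset, c("sepal.length", "sepal.width",
                                  "petal.length", "petal.width", "species"))
head(iris_named, 3)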
Generating training and testing sets
set.seed(123)
# Split the dataset into training (80%) and testing (20%) sets
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
# Convert target variable to a factor (ensuring it's a classification problem)
train_data$V5 <- as.factor(train_data$V5)
test_data$V5 <- as.factor(test_data$V5)
# View the split summary
print(split)
## <Training/Testing/Total>
## <120/30/150>
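initial_split draws rows at random, so class proportions can drift slightly between the partitions. If that matters, rsample's strata argument performs a stratified split; a minimal sketch:
# Stratified alternative: keeps the three species in roughly equal
# proportion across the training and testing sets
strat_split <- initial_split(dataset, prop = 0.8, strata = V5)
table(training(strat_split)$V5)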
Training the Random Forest and getting results
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = V5 ~ .,                   # V5 holds the species labels
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = V5 ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.17%
## Confusion matrix:
## Iris-setosa Iris-versicolor Iris-virginica class.error
## Iris-setosa 40 0 0 0.00000000
## Iris-versicolor 0 32 3 0.08571429
## Iris-virginica 0 2 43 0.04444444
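Beyond the OOB error, randomForest also tracks how much each predictor contributes to the fit; both accessors below are part of the randomForest package:
# Mean decrease in Gini impurity for each predictor
importance(rf_model)
# Dot chart of the same information
varImpPlot(rf_model)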
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$V5, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 10 0 0
## Iris-versicolor 0 14 1
## Iris-virginica 0 0 5
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 96.67 %"
Random Forest for the Pima Indians Diabetes Dataset
Importing the dataset
data(PimaIndiansDiabetes)
dataset <- PimaIndiansDiabetes
# Set a seed so the split is reproducible, then split into training (80%) and testing (20%) sets
set.seed(123)
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
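Unlike iris, this dataset is imbalanced, which is worth confirming before modelling; a quick check of the outcome distribution in the training partition:
# Roughly two negatives for every positive; keep this in mind
# when reading the class errors below
prop.table(table(train_data$diabetes))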
Executing Random Forest algorithm
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = diabetes ~ .,             # diabetes (neg/pos) is the outcome
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = diabetes ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 23.29%
## Confusion matrix:
## neg pos class.error
## neg 324 68 0.1734694
## pos 75 147 0.3378378
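The positive class is misclassified roughly twice as often as the negative class, a typical symptom of the imbalance noted above. One option is randomForest's sampsize argument, which draws class-balanced bootstrap samples; a sketch with an illustrative, untuned per-class count (the model fitted above is kept for the evaluation that follows):
# Sketch: balanced bootstrap samples; 150 per class is illustrative, not tuned
rf_balanced <- randomForest(diabetes ~ ., data = train_data,
                            ntree = 500,
                            sampsize = c(neg = 150, pos = 150))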
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual neg pos
## neg 88 20
## pos 20 26
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 74.03 %"
Performing hyperparameter tuning
Activating parallel computing
cl <- makePSOCKcluster(detectCores() - 1) # Use all but 1 core
registerDoParallel(cl)
# Define hyperparameter grid
tune_grid <- expand.grid(
  mtry = c(2, 5, 10, 15, 20),      # Broad range; values above the 8 available predictors produce the warnings below
  splitrule = "gini",              # For classification
  min.node.size = c(1, 3, 5, 10)   # Different node sizes
)
# Train model with cross-validation
train_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Run the model with hyperparameter tuning
rf_model_tuned <- train(
  diabetes ~ .,
  data = train_data,
  method = "ranger",           # Fast Random Forest implementation
  trControl = train_control,
  tuneGrid = tune_grid,
  num.trees = 1000             # Number of trees (adjust as needed)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
These warnings come from the grid rows where mtry exceeds the 8 available predictors; those fits fail during resampling and yield missing performance values.
# Shut down the parallel workers now that tuning is finished
stopCluster(cl)
registerDoSEQ()
# Print best parameters
print(rf_model_tuned$bestTune)
## mtry splitrule min.node.size
## 8 5 gini 10
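The refit below transfers the winning mtry and min.node.size into randomForest (nodesize is its equivalent of ranger's min.node.size). Note you could also predict straight from the tuned caret object, which already wraps the best ranger fit:
# Alternative: evaluate the caret-wrapped ranger model directly
tuned_preds <- predict(rf_model_tuned, newdata = test_data)
mean(tuned_preds == test_data$diabetes)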
# Final model with best hyperparameters
rf_final <- randomForest(
  formula = diabetes ~ .,
  data = train_data,
  ntree = 500,
  mtry = rf_model_tuned$bestTune$mtry,
  nodesize = rf_model_tuned$bestTune$min.node.size,
  replace = TRUE
)
# Make predictions on test data
predictions <- predict(rf_final, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$diabetes, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual neg pos
## neg 88 20
## pos 18 28
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 75.32 %"
Random Forest for the Wine Quality Dataset
Importing the dataset
# Load the wine quality data, drop the Id column, and treat quality as a class label
dataset <- read.csv("WineQT.csv")
dataset <- subset(dataset, select = -Id)
dataset$quality <- factor(dataset$quality)
# Set a seed so the split is reproducible, then split into training (80%) and testing (20%) sets
set.seed(123)
split <- initial_split(dataset, prop = 0.8)
# Extract training and testing sets
train_data <- training(split)
test_data <- testing(split)
Executing Random Forest algorithm
# Set seed for reproducibility
set.seed(123)
# Fit a Random Forest classification model
rf_model <- randomForest(
  formula = quality ~ .,              # quality (grades 3-8) is the class label
  data = train_data,                  # Training dataset
  ntree = 500,                        # Number of trees
  mtry = sqrt(ncol(train_data) - 1),  # Variables sampled at each split
  nodesize = 5,                       # Minimum size of terminal nodes
  replace = TRUE                      # Use bootstrap sampling
)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = quality ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), nodesize = 5, replace = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 33.15%
## Confusion matrix:
## 3 4 5 6 7 8 class.error
## 3 0 0 5 1 0 0 1.0000000
## 4 0 0 14 12 0 0 1.0000000
## 5 0 0 292 91 3 0 0.2435233
## 6 0 0 85 268 19 0 0.2795699
## 7 0 0 3 56 51 0 0.5363636
## 8 0 0 0 11 3 0 1.0000000
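The 100% class errors for qualities 3, 4, and 8 reflect how rare those grades are: the forest almost never sees them during training. Checking the class counts makes this plain, and a stratified split (via the strata argument, used the same way as earlier) at least keeps the rare grades represented in both partitions:
# Rare grades explain the degenerate rows in the OOB confusion matrix
table(train_data$quality)
# Stratified alternative to the plain split used above
strat_split <- initial_split(dataset, prop = 0.8, strata = quality)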
# Make predictions on test data
predictions <- predict(rf_model, newdata = test_data)
# Compute the confusion matrix
conf_matrix <- table(Actual = test_data$quality, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 0 0 0 0
## 4 0 0 7 0 0 0
## 5 0 0 72 25 0 0
## 6 0 0 24 61 5 0
## 7 0 0 2 19 12 0
## 8 0 0 0 1 1 0
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 63.32 %"