# =========================================================
# Random Forest Classification with Palmer Penguins Dataset
# Target: Species (Adelie, Chinstrap, Gentoo)
# Q: Can we predict the species of a penguin (Adélie, Chinstrap, Gentoo) using simple body measurements such as
# bill length, bill depth, flipper length, and body mass?
# =========================================================
# CLEAR MEMORY
rm(list=ls())
# set working directory
setwd("D:/Personal/Study/MSS/Dissertation/0311 15 Econ 5212 - Dissertation Part-I-M/Random_Forest/palmerpenguin/")
# --- 1) Load required packages ---
# palmerpenguins = dataset package
# randomForest = Random Forest algorithm
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
# --- 2) Load the dataset ---
data("penguins") # load penguins data into memory
head(penguins) # quick preview of the first 6 rows
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 2 more variables: sex <fct>, year <int>
# --- 3) Handle missing values ---
# Random Forest cannot handle NA, so we remove rows with NA
penguins_clean <- na.omit(penguins)
dim(penguins_clean) # check new number of rows and columns
## [1] 333 8
# --- 4) Select target and features ---
# We choose species (factor) as target
# Features: bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
penguins_small <- penguins_clean[, c("species",
"bill_length_mm",
"bill_depth_mm",
"flipper_length_mm",
"body_mass_g")]
# Check target is factor (important for classification)
is.factor(penguins_small$species)
## [1] TRUE
# if showed FALSE, penguins_small$species <- as.factor(penguins_small$species)
# --- 5) Split into training (80%) and testing (20%) sets ---
set.seed(123) # fix random seed for reproducibility
n <- nrow(penguins_small) # number of rows
test_size <- round(0.2 * n) # 20% for testing
test_idx <- sample(1:n, size = test_size) # random row indices for test set
train <- penguins_small[-test_idx, ] # training data (everything except test_idx)
test <- penguins_small[ test_idx, ] # testing data (only test_idx)
dim(train); dim(test) # check train/test sizes
## [1] 266 5
## [1] 67 5
# --- 6) Train Random Forest model ---
# species ~ predictors : formula tells model what to predict
# ntree = number of trees (500 is common)
# mtry = number of variables to try at each split (default is sqrt(p))
# importance = TRUE allows us to see variable importance later
set.seed(123)
rf_model <- randomForest(
species ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g,
data = train,
ntree = 500,
mtry = 2,
importance = TRUE
)
print(rf_model) # shows OOB (out-of-bag) error rate
##
## Call:
## randomForest(formula = species ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g, data = train, ntree = 500, mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 3.01%
## Confusion matrix:
## Adelie Chinstrap Gentoo class.error
## Adelie 108 3 1 0.03571429
## Chinstrap 4 53 0 0.07017544
## Gentoo 0 0 97 0.00000000
# --- 7) Make predictions on test data ---
test_preds <- predict(rf_model, newdata = test) # predict species for test rows
# --- 8) Evaluate model performance ---
cm <- table(Actual = test$species, Predicted = test_preds) # confusion matrix
cm
## Predicted
## Actual Adelie Chinstrap Gentoo
## Adelie 34 0 0
## Chinstrap 0 11 0
## Gentoo 0 1 21
acc <- mean(test_preds == test$species) # accuracy = correct predictions / total
acc
## [1] 0.9850746
# --- 9) Check variable importance ---
importance(rf_model) # numeric measure of feature importance
## Adelie Chinstrap Gentoo MeanDecreaseAccuracy
## bill_length_mm 120.628581 68.66878 11.72367 99.99512
## bill_depth_mm 16.682837 16.14065 35.24585 36.49475
## flipper_length_mm 23.459839 29.53173 38.72649 42.25038
## body_mass_g 5.185614 19.64146 12.62354 18.94373
## MeanDecreaseGini
## bill_length_mm 68.532585
## bill_depth_mm 29.631093
## flipper_length_mm 62.773159
## body_mass_g 9.599709
varImpPlot(rf_model) # simple plot showing important predictors

# --- 10) Predict species for a new penguin (example data) ---
new_penguin <- data.frame(
bill_length_mm = 45,
bill_depth_mm = 17,
flipper_length_mm = 210,
body_mass_g = 4800
)
predict(rf_model, newdata = new_penguin) # model’s prediction
## 1
## Gentoo
## Levels: Adelie Chinstrap Gentoo
# =========================================================
# Export Random Forest Results - Palmer Penguins
# =========================================================
# 1) Confusion Matrix
cm <- table(Actual = test$species, Predicted = test_preds)
write.csv(cm, "confusion_matrix.csv", row.names = TRUE)
# 2) Accuracy
acc <- mean(test_preds == test$species)
write(acc, file = "accuracy.txt")
# 3) Variable Importance
imp <- importance(rf_model)
write.csv(imp, "variable_importance.csv", row.names = TRUE)
# 4) Model Summary
summary_text <- capture.output(rf_model) # capture print output
writeLines(summary_text, "rf_model_summary.txt")
# Optional message
cat("All results exported successfully!\n")
## All results exported successfully!