# =========================================================
# Random Forest Classification with Palmer Penguins Dataset
# Target: Species (Adelie, Chinstrap, Gentoo)
# Q: Can we predict the species of a penguin (Adélie, Chinstrap, Gentoo) using simple body measurements such as
#    bill length, bill depth, flipper length, and body mass?
# =========================================================
# CLEAR MEMORY
rm(list=ls())

# set working directory
setwd("D:/Personal/Study/MSS/Dissertation/0311 15 Econ 5212 - Dissertation Part-I-M/Random_Forest/palmerpenguin/")

# --- 1) Load required packages ---
# palmerpenguins = dataset package
# randomForest   = Random Forest algorithm
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
# --- 2) Load the dataset ---
data("penguins")     # load penguins data into memory
head(penguins)       # quick preview of the first 6 rows
## # A tibble: 6 × 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           NA            NA                  NA          NA
## 5 Adelie  Torgersen           36.7          19.3               193        3450
## 6 Adelie  Torgersen           39.3          20.6               190        3650
## # ℹ 2 more variables: sex <fct>, year <int>
# --- 3) Handle missing values ---
# Random Forest cannot handle NA, so we remove rows with NA
penguins_clean <- na.omit(penguins)  
dim(penguins_clean)   # check new number of rows and columns
## [1] 333   8
# --- 4) Select target and features ---
# We choose species (factor) as target
# Features: bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
penguins_small <- penguins_clean[, c("species",
                                     "bill_length_mm",
                                     "bill_depth_mm",
                                     "flipper_length_mm",
                                     "body_mass_g")]

# Check target is factor (important for classification)
is.factor(penguins_small$species)
## [1] TRUE
# if showed FALSE, penguins_small$species <- as.factor(penguins_small$species)

# --- 5) Split into training (80%) and testing (20%) sets ---
set.seed(123)   # fix random seed for reproducibility
n <- nrow(penguins_small)                      # number of rows
test_size <- round(0.2 * n)                    # 20% for testing
test_idx <- sample(1:n, size = test_size)      # random row indices for test set

train <- penguins_small[-test_idx, ]   # training data (everything except test_idx)
test  <- penguins_small[ test_idx, ]   # testing data (only test_idx)

dim(train); dim(test)   # check train/test sizes
## [1] 266   5
## [1] 67  5
# --- 6) Train Random Forest model ---
# species ~ predictors : formula tells model what to predict
# ntree = number of trees (500 is common)
# mtry  = number of variables to try at each split (default is sqrt(p))
# importance = TRUE allows us to see variable importance later
set.seed(123)
rf_model <- randomForest(
  species ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g,
  data = train,
  ntree = 500,
  mtry = 2,
  importance = TRUE
)

print(rf_model)   # shows OOB (out-of-bag) error rate
## 
## Call:
##  randomForest(formula = species ~ bill_length_mm + bill_depth_mm +      flipper_length_mm + body_mass_g, data = train, ntree = 500,      mtry = 2, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 3.01%
## Confusion matrix:
##           Adelie Chinstrap Gentoo class.error
## Adelie       108         3      1  0.03571429
## Chinstrap      4        53      0  0.07017544
## Gentoo         0         0     97  0.00000000
# --- 7) Make predictions on test data ---
test_preds <- predict(rf_model, newdata = test)  # predict species for test rows

# --- 8) Evaluate model performance ---
cm <- table(Actual = test$species, Predicted = test_preds)  # confusion matrix
cm
##            Predicted
## Actual      Adelie Chinstrap Gentoo
##   Adelie        34         0      0
##   Chinstrap      0        11      0
##   Gentoo         0         1     21
acc <- mean(test_preds == test$species)  # accuracy = correct predictions / total
acc
## [1] 0.9850746
# --- 9) Check variable importance ---
importance(rf_model)     # numeric measure of feature importance
##                       Adelie Chinstrap   Gentoo MeanDecreaseAccuracy
## bill_length_mm    120.628581  68.66878 11.72367             99.99512
## bill_depth_mm      16.682837  16.14065 35.24585             36.49475
## flipper_length_mm  23.459839  29.53173 38.72649             42.25038
## body_mass_g         5.185614  19.64146 12.62354             18.94373
##                   MeanDecreaseGini
## bill_length_mm           68.532585
## bill_depth_mm            29.631093
## flipper_length_mm        62.773159
## body_mass_g               9.599709
varImpPlot(rf_model)     # simple plot showing important predictors

# --- 10) Predict species for a new penguin (example data) ---
new_penguin <- data.frame(
  bill_length_mm    = 45,
  bill_depth_mm     = 17,
  flipper_length_mm = 210,
  body_mass_g       = 4800
)

predict(rf_model, newdata = new_penguin)   # model’s prediction
##      1 
## Gentoo 
## Levels: Adelie Chinstrap Gentoo
# =========================================================
# Export Random Forest Results - Palmer Penguins
# =========================================================

# 1) Confusion Matrix
cm <- table(Actual = test$species, Predicted = test_preds)
write.csv(cm, "confusion_matrix.csv", row.names = TRUE)

# 2) Accuracy
acc <- mean(test_preds == test$species)
write(acc, file = "accuracy.txt")

# 3) Variable Importance
imp <- importance(rf_model)
write.csv(imp, "variable_importance.csv", row.names = TRUE)

# 4) Model Summary
summary_text <- capture.output(rf_model)   # capture print output
writeLines(summary_text, "rf_model_summary.txt")

# Optional message
cat("All results exported successfully!\n")
## All results exported successfully!