Supervised Learning Assignment

Author

Sonit Singh

Question 1.1

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load datasets

################################################
# Do not change the paths in lines 12-13! 
# (data and code must be in the same directory)
aba <- read_csv(file = "abalone.csv")
Rows: 4177 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): sex
dbl (8): length, diameter, height, whole_weight, shucked_weight, viscera_wei...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
codon <- read_csv(file = "codon-usage-all.csv") 
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 13028 Columns: 69
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): Kingdom, SpeciesName
dbl (67): DNAtype, SpeciesID, Ncodons, UUU, UUC, UUA, UUG, CUU, CUC, CUA, CU...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
################################################
# Supervised Learning Assignment - Question 1.1

# Load libraries
library(tidyverse)
library(caret)
Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:purrr':

    lift
library(class)
library(readxl)


# Filtering to include only bacteria and archaea with DNAtype == 0
codon_subset <- codon %>%
  filter(Kingdom %in% c("bct", "arc") & DNAtype == 0)
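
# Sanity check (added sketch): class balance after filtering; bct is
# expected to heavily outnumber arc in this subset
codon_subset %>% count(Kingdom)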

# Select codon frequency columns as predictors and convert all to numeric
predictors <- codon_subset %>%
  select(UUU:UGA) %>%
  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))

# Create the response variable
response <- factor(codon_subset$Kingdom, levels = c("bct", "arc"))

# Combine predictors and response into a single dataframe
model_data <- predictors
model_data$Kingdom <- response

# Split into training and test sets
set.seed(23)
train_index <- createDataPartition(model_data$Kingdom, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data  <- model_data[-train_index, ]

# Remove missing values from both training and test sets
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# Train k-NN model with 10-fold cross-validation; tuneLength = 1 keeps
# this fast by evaluating a single candidate k (k = 5 under caret's default grid)
ctrl <- trainControl(method = "cv", number = 10)

cat("Training k-NN model...\n")
Training k-NN model...
knn_model <- train(
  Kingdom ~ .,
  data = train_data,
  method = "knn",
  trControl = ctrl,
  tuneLength = 1
)

# Predict on test data
preds <- predict(knn_model, newdata = test_data)

# Evaluate performance
conf_mat <- confusionMatrix(preds, test_data$Kingdom, positive = "bct")
print(conf_mat)
Confusion Matrix and Statistics

          Reference
Prediction bct arc
       bct 872  10
       arc   3  27
                                          
               Accuracy : 0.9857          
                 95% CI : (0.9757, 0.9924)
    No Information Rate : 0.9594          
    P-Value [Acc > NIR] : 3.617e-06       
                                          
                  Kappa : 0.7987          
                                          
 Mcnemar's Test P-Value : 0.09609         
                                          
            Sensitivity : 0.9966          
            Specificity : 0.7297          
         Pos Pred Value : 0.9887          
         Neg Pred Value : 0.9000          
             Prevalence : 0.9594          
         Detection Rate : 0.9561          
   Detection Prevalence : 0.9671          
      Balanced Accuracy : 0.8632          
                                          
       'Positive' Class : bct             
                                          

Question 1.2

# Visualize the confusion matrix
library(ggplot2)
library(reshape2)

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
cm_table <- as.table(conf_mat$table)
cm_df <- as.data.frame(cm_table)
colnames(cm_df) <- c("Prediction", "Reference", "Freq")

print(
  ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq), color = "white", size = 5) +
    scale_fill_gradient(low = "steelblue", high = "darkred") +
    labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
    theme_minimal()
)

# Get all cross-validation results
cv_results <- knn_model$results

# Display the full table of accuracy for all k tried
print(cv_results)
  k  Accuracy     Kappa AccuracySD   KappaSD
1 5 0.9868698 0.8277167 0.00790455 0.1078476
# Extract the best accuracy from cross-validation
best_cv_accuracy <- max(cv_results$Accuracy)

# Print the best cross-validation accuracy
cat("Cross-Validation Accuracy:", round(best_cv_accuracy, 4), "\n")
Cross-Validation Accuracy: 0.9869 
Results Comparison

Cross-validation accuracy: 0.9869. Test-set accuracy: 0.9857.

The cross-validation estimate is slightly higher because it averages performance over ten folds, which reduces the variance of the estimate, whereas the test-set figure comes from a single held-out 30% of the data.

The difference between the two is small, which is a good sign: it suggests the model generalizes well rather than overfitting the training data.
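
To make the comparison concrete, the gap can be computed directly from the objects above (a minimal sketch reusing conf_mat and best_cv_accuracy, both defined earlier):

test_accuracy <- as.numeric(conf_mat$overall["Accuracy"])
cat("CV minus test accuracy:", round(best_cv_accuracy - test_accuracy, 4), "\n")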

Question 1.3

# Refit the same model on the same train/test split (only the positive
# class passed to confusionMatrix changes, so the predictions are identical)
knn_model_arc <- train(
  Kingdom ~ .,
  data = train_data,
  method = "knn",
  trControl = ctrl,
  tuneLength = 1
)

# Predict on the same test data
preds_arc <- predict(knn_model_arc, newdata = test_data)

# Evaluate with 'arc' as the positive class now
conf_mat_arc <- confusionMatrix(preds_arc, test_data$Kingdom, positive = "arc")
print(conf_mat_arc)
Confusion Matrix and Statistics

          Reference
Prediction bct arc
       bct 872  10
       arc   3  27
                                          
               Accuracy : 0.9857          
                 95% CI : (0.9757, 0.9924)
    No Information Rate : 0.9594          
    P-Value [Acc > NIR] : 3.617e-06       
                                          
                  Kappa : 0.7987          
                                          
 Mcnemar's Test P-Value : 0.09609         
                                          
            Sensitivity : 0.72973         
            Specificity : 0.99657         
         Pos Pred Value : 0.90000         
         Neg Pred Value : 0.98866         
             Prevalence : 0.04057         
         Detection Rate : 0.02961         
   Detection Prevalence : 0.03289         
      Balanced Accuracy : 0.86315         
                                          
       'Positive' Class : arc             
                                          
# Extract accuracy, precision, and recall
acc_arc <- conf_mat_arc$overall["Accuracy"]
precision_arc <- conf_mat_arc$byClass["Precision"]
recall_arc <- conf_mat_arc$byClass["Recall"]

cat("\nMetrics with 'arc' as positive class:\n")

Metrics with 'arc' as positive class:
cat("Accuracy:", round(acc_arc, 4), "\n")
Accuracy: 0.9857 
cat("Precision:", round(precision_arc, 4), "\n")
Precision: 0.9 
cat("Recall:", round(recall_arc, 4), "\n")
Recall: 0.7297 

Results of changing the positive class from Bacteria to Archaea

The overall accuracy is unchanged, because the underlying predictions are the same; only the class treated as positive differs. The class-specific metrics, however, change substantially.

Recall dropped from 0.9966 (bct positive) to 0.7297 (arc positive), indicating the model is much better at identifying bacteria than archaea.

Precision also dropped, to 0.9000, meaning a larger share of the archaea predictions are false positives.

Overall, the model is biased towards bacteria, most likely because of the strong class imbalance in the data.
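
The imbalance claim is easy to verify from the training data (a quick sketch):

# Raw counts and proportions of each class in the training set
table(train_data$Kingdom)
prop.table(table(train_data$Kingdom))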

Question 1.4

library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'
The following objects are masked from 'package:stats':

    cov, smooth, var
# Get class probabilities for ROC curve
probs_arc <- predict(knn_model_arc, newdata = test_data, type = "prob")

# Create ROC object (arc = positive class)
roc_obj <- roc(response = test_data$Kingdom, predictor = probs_arc[, "arc"], levels = c("bct", "arc"))
Setting direction: controls < cases
# Plot ROC curve
plot(roc_obj, col = "blue", main = "ROC Curve (Positive: arc)")
abline(a = 0, b = 1, lty = 2, col = "gray")

# Get AUROC
auc_value <- auc(roc_obj)
cat("AUROC:", round(auc_value, 4), "\n")
AUROC: 0.9428 

Consistency of AUROC

The ROC curve for the model with archaea as the positive class rises steeply toward the top-left corner, indicating strong discriminative ability.

The area under the ROC curve (AUROC) was 0.9428, which confirms strong model performance.

This is consistent with the test accuracy (0.9857) and precision (0.90) obtained in Q1.3. Note, however, that recall was lower (0.7297): the model separates the classes well overall (high AUROC) but still misses some positive cases (archaea) at the fixed default classification threshold.
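
Because the recall shortfall comes from the fixed 0.5 threshold rather than the ranking itself, pROC can suggest an alternative operating point (a sketch; Youden's J is one common criterion):

# Threshold maximising sensitivity + specificity on the ROC curve
best_thr <- coords(roc_obj, x = "best", best.method = "youden")
print(best_thr)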

Question 2.1

# Load required packages
library(tidyverse)
library(caret)
library(rpart)     # For decision tree
library(rpart.plot)
library(readxl)



# Subset for plants and invertebrates with DNAtype == 0
codon_pln_inv <- codon %>%
  filter(Kingdom %in% c("pln", "inv") & DNAtype == 0)

# Extract predictors and convert to numeric
predictors <- codon_pln_inv %>%
  select(UUU:UGA) %>%
  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))

# Set response variable
response <- factor(codon_pln_inv$Kingdom, levels = c("pln", "inv"))

# Combine data
model_data <- predictors
model_data$Kingdom <- response

# Train/test split
set.seed(42)
train_index <- createDataPartition(model_data$Kingdom, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data  <- model_data[-train_index, ]

# Drop NAs
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# Fit decision tree
tree_model <- train(
  Kingdom ~ .,
  data = train_data,
  method = "rpart"  # default decision tree
)
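
# Added sketch: caret tunes rpart's complexity parameter cp by default;
# the value it selected can be inspected with:
print(tree_model$bestTune)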

# Plot the tree
rpart.plot(tree_model$finalModel)

# Predict on test set
preds_tree <- predict(tree_model, newdata = test_data)

# Evaluate confusion matrix (positive class: invertebrates = "inv")
conf_tree <- confusionMatrix(preds_tree, test_data$Kingdom, positive = "inv")
print(conf_tree)
Confusion Matrix and Statistics

          Reference
Prediction pln inv
       pln 412 110
       inv  44 166
                                          
               Accuracy : 0.7896          
                 95% CI : (0.7583, 0.8186)
    No Information Rate : 0.623           
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.53            
                                          
 Mcnemar's Test P-Value : 1.625e-07       
                                          
            Sensitivity : 0.6014          
            Specificity : 0.9035          
         Pos Pred Value : 0.7905          
         Neg Pred Value : 0.7893          
             Prevalence : 0.3770          
         Detection Rate : 0.2268          
   Detection Prevalence : 0.2869          
      Balanced Accuracy : 0.7525          
                                          
       'Positive' Class : inv             
                                          
# Extract key metrics
cat("\nMetrics with 'inv' as positive class:\n")

Metrics with 'inv' as positive class:
cat("Accuracy:", round(conf_tree$overall["Accuracy"], 4), "\n")
Accuracy: 0.7896 
cat("Precision:", round(conf_tree$byClass["Precision"], 4), "\n")
Precision: 0.7905 
cat("Recall:", round(conf_tree$byClass["Recall"], 4), "\n")
Recall: 0.6014 
library(ggplot2)
library(reshape2)

# Convert table to dataframe for plotting
cm_df <- as.data.frame(conf_tree$table)
colnames(cm_df) <- c("Prediction", "Reference", "Freq")

# Plot confusion matrix (with print)
print(
  ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq), color = "white", size = 5) +
    scale_fill_gradient(low = "lightblue", high = "steelblue") +
    labs(title = "Confusion Matrix (Decision Tree - Positive: inv)",
         x = "Actual Class", y = "Predicted Class") +
    theme_minimal()
)

Question 2.2

# Load libraries
library(tidyverse)
library(caret)
library(ranger)
library(ggplot2)
library(reshape2)
library(randomForest)
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'
The following object is masked from 'package:ranger':

    importance
The following object is masked from 'package:dplyr':

    combine
The following object is masked from 'package:ggplot2':

    margin
# Train/test sets already created in Q2.1: train_data, test_data

# Reset the seed so the CV folds are reproducible; the Q2.1 train/test split is reused
set.seed(42)

# Define cross-validation control
ctrl <- trainControl(method = "cv", number = 10)

# Grid search over two hyperparameters
tune_grid <- expand.grid(
  mtry = c(5, 10, 15),
  splitrule = "gini",
  min.node.size = c(1, 5, 10)
)
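
# (Note: this grid spans 3 mtry values x 3 min.node.size values = 9
# combinations, each evaluated with 10-fold CV.)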

# Train the random forest model using ranger
rf_model <- train(
  Kingdom ~ .,
  data = train_data,
  method = "ranger",
  trControl = ctrl,
  tuneGrid = tune_grid,
  importance = "impurity"
)

Question 2.3

# Show best model and tuning plot
print(rf_model)
Random Forest 

1713 samples
  64 predictor
   2 classes: 'pln', 'inv' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1542, 1541, 1543, 1542, 1541, 1542, ... 
Resampling results across tuning parameters:

  mtry  min.node.size  Accuracy   Kappa    
   5     1             0.9247139  0.8360703
   5     5             0.9165437  0.8186614
   5    10             0.9182708  0.8225216
  10     1             0.9165197  0.8184058
  10     5             0.9182675  0.8226966
  10    10             0.9176861  0.8217107
  15     1             0.9176929  0.8212979
  15     5             0.9176861  0.8215896
  15    10             0.9159384  0.8174404

Tuning parameter 'splitrule' was held constant at a value of gini
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were mtry = 5, splitrule = gini
 and min.node.size = 1.
print(plot(rf_model))  # Helps identify best hyperparameter combo

# Predict on test data
rf_preds <- predict(rf_model, newdata = test_data)

# Evaluate confusion matrix
conf_rf <- confusionMatrix(rf_preds, test_data$Kingdom, positive = "inv")
print(conf_rf)
Confusion Matrix and Statistics

          Reference
Prediction pln inv
       pln 438  35
       inv  18 241
                                          
               Accuracy : 0.9276          
                 95% CI : (0.9064, 0.9453)
    No Information Rate : 0.623           
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.844           
                                          
 Mcnemar's Test P-Value : 0.02797         
                                          
            Sensitivity : 0.8732          
            Specificity : 0.9605          
         Pos Pred Value : 0.9305          
         Neg Pred Value : 0.9260          
             Prevalence : 0.3770          
         Detection Rate : 0.3292          
   Detection Prevalence : 0.3538          
      Balanced Accuracy : 0.9169          
                                          
       'Positive' Class : inv             
                                          
# Extract metrics
accuracy <- conf_rf$overall["Accuracy"]
precision <- conf_rf$byClass["Precision"]
recall <- conf_rf$byClass["Recall"]

cat("\nPerformance (Positive: inv):\n")

Performance (Positive: inv):
cat("Accuracy:", round(accuracy, 4), "\n")
Accuracy: 0.9276 
cat("Precision:", round(precision, 4), "\n")
Precision: 0.9305 
cat("Recall:", round(recall, 4), "\n")
Recall: 0.8732 
# ====== Visualize confusion matrix ======
cm_df <- as.data.frame(conf_rf$table)
colnames(cm_df) <- c("Prediction", "Reference", "Freq")

print(
  ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq), color = "white", size = 5) +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    labs(title = "Confusion Matrix - Random Forest (Positive: inv)",
         x = "Actual", y = "Predicted") +
    theme_minimal()
)

# Access the tuning results from the random forest trained in Q2.2
rf_results <- rf_model$results

Question 2.4

# Plot model performance across the hyperparameter grid
# (mtry vs Accuracy, grouped by min.node.size)
library(ggplot2)
print(
  ggplot(rf_model$results, aes(x = mtry, y = Accuracy, color = factor(min.node.size))) +
    geom_line(linewidth = 1.2) +
    geom_point(size = 3) +
    labs(
      title = "Random Forest Accuracy vs mtry",
      subtitle = "Grouped by min.node.size",
      x = "mtry (Number of predictors tried at each split)",
      y = "Cross-Validation Accuracy",
      color = "min.node.size"
    ) +
    theme_minimal()
)
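
# Numeric companion to the plot (a sketch): the single best grid row
# by cross-validated accuracy
rf_model$results %>% slice_max(Accuracy, n = 1)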

Question 2.5

# Extract variable importance from the final model
var_imp <- varImp(rf_model)

# Print importance values
print(var_imp)
ranger variable importance

  only 20 most important variables shown (out of 64)

    Overall
GGG  100.00
AAA   72.78
CUU   58.63
GAA   49.42
CUC   47.10
GUA   46.15
CUG   44.13
UCU   39.15
UAA   37.54
CCU   33.37
CGU   29.47
GUU   24.11
GUC   22.99
UGU   22.47
AGG   21.63
ACA   20.44
GAG   20.36
UGG   19.52
GGA   18.49
UAC   15.27
# Convert to data frame for plotting
imp_df <- data.frame(
  Feature = rownames(var_imp$importance),
  Importance = var_imp$importance$Overall
)

# Arrange top features
top_imp <- imp_df %>%
  arrange(desc(Importance)) %>%
  slice(1:15)  # Show top 15 most important codons

# Plot
print(
  ggplot(top_imp, aes(x = reorder(Feature, Importance), y = Importance)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(
      title = "Top 15 Variable Importances (Random Forest)",
      x = "Codon",
      y = "Importance"
    ) +
    theme_minimal()
)

Question 3.1

library(GGally)  
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
# Load required packages
library(tidyverse)

# Load the dataset (correct .csv version)
abalone <- read_csv("abalone.csv")  # Make sure abalone.csv is in the same folder as your .qmd
Rows: 4177 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): sex
dbl (8): length, diameter, height, whole_weight, shucked_weight, viscera_wei...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert 'sex' to factor and clean data
abalone$sex <- as.factor(abalone$sex)
abalone <- na.omit(abalone)


# Dataset: 'abalone' is assumed to be already loaded

# 1. Basic structure
glimpse(abalone)
Rows: 4,177
Columns: 9
$ sex            <fct> M, M, F, M, I, I, F, F, M, F, F, M, M, F, F, M, I, F, M…
$ length         <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545,…
$ diameter       <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425,…
$ height         <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125,…
$ whole_weight   <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775,…
$ shucked_weight <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370,…
$ viscera_weight <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415,…
$ shell_weight   <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260,…
$ rings          <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, …
# 2. Plot distribution of 'rings' (target variable)
ggplot(abalone, aes(x = rings)) +
  geom_histogram(binwidth = 1, fill = "darkorange", color = "black") +
  labs(title = "Distribution of Rings (Age Proxy)", x = "Rings", y = "Count") +
  theme_minimal()

# 3. Boxplots of numeric features vs rings (age proxy)
abalone_long <- abalone %>%
  pivot_longer(cols = -c(sex, rings), names_to = "Feature", values_to = "Value")

ggplot(abalone_long, aes(x = rings, y = Value)) +
  geom_boxplot(aes(group = rings), fill = "skyblue", outlier.alpha = 0.3) +
  facet_wrap(~Feature, scales = "free_y") +
  labs(title = "Boxplots: Numeric Features vs Rings", x = "Rings (Age Proxy)", y = "Value") +
  theme_minimal()

# 4. Correlation matrix of numeric predictors
abalone_numeric <- abalone %>%
  select(-sex)

ggpairs(abalone_numeric, columns = 1:8,
        title = "Pairwise Plots and Correlations of Numeric Variables") +
  theme_minimal()
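
# 5. Numeric summary of the pairwise view (a sketch): correlation of
#    each predictor with rings, sorted from strongest to weakest
sort(cor(abalone_numeric)[, "rings"], decreasing = TRUE)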

Question 3.2

# Load required packages
library(tidyverse)
library(caret)
library(rpart)
library(rpart.plot)

# Ensure abalone dataset is loaded
# Use numeric predictors only (remove 'sex')
abalone_numeric <- abalone %>% select(-sex)

# Split into train-test (set.seed for reproducibility)
set.seed(7)
split_index <- createDataPartition(abalone_numeric$rings, p = 0.8, list = FALSE)
train_data <- abalone_numeric[split_index, ]
test_data <- abalone_numeric[-split_index, ]

# Fit regression tree
reg_tree <- rpart(rings ~ ., data = train_data)

# Visualise the tree
rpart.plot(reg_tree, main = "Regression Tree to Predict Abalone Age")

# Predict on test data
pred_rings <- predict(reg_tree, newdata = test_data)

# Evaluate performance
rmse_val <- RMSE(pred_rings, test_data$rings)
r2_val <- R2(pred_rings, test_data$rings)

# Print metrics
cat("Regression Tree Performance:\n")
Regression Tree Performance:
cat("RMSE:", round(rmse_val, 4), "\n")
RMSE: 2.3888 
cat("R²:", round(r2_val, 4), "\n")
R²: 0.454 
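
As a check on caret's helpers (a minimal sketch; caret's R2() defaults to the squared-correlation form, so the manual values should match the printed ones):

resids <- test_data$rings - pred_rings
cat("Manual RMSE:", round(sqrt(mean(resids^2)), 4), "\n")
cat("Manual R²:", round(cor(pred_rings, test_data$rings)^2, 4), "\n")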

Question 3.3

# Load required packages
library(caret)
library(gbm)
Loaded gbm 2.2.2
This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Make sure to use the same train/test data split from Q3.2

# Fit boosted regression tree using caret (with default hyperparameters)
set.seed(7)  # For reproducibility
boost_model <- train(
  rings ~ ., 
  data = train_data,
  method = "gbm",
  verbose = FALSE
)

# Predict on test data
boost_preds <- predict(boost_model, newdata = test_data)

# Evaluate performance
boost_rmse <- RMSE(boost_preds, test_data$rings)
boost_r2 <- R2(boost_preds, test_data$rings)

# Print results
cat("Boosted Regression Tree Performance:\n")
Boosted Regression Tree Performance:
cat("RMSE:", round(boost_rmse, 4), "\n")
RMSE: 2.1724 
cat("R²:", round(boost_r2, 4), "\n")
R²: 0.548 
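
caret tuned gbm over its small default grid here; the settings it settled on can be inspected directly (a sketch):

print(boost_model$bestTune)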

Question 3.4

# Create an empty data frame to store R² values
results <- data.frame(
  Seed = integer(),
  Model = character(),
  R2 = numeric()
)

# Loop over seeds 1 to 10
for (s in 1:10) {
  set.seed(s)
  split <- createDataPartition(abalone$rings, p = 0.8, list = FALSE)
  train_data <- abalone[split, ]
  test_data <- abalone[-split, ]
  
  # --- Regression Tree (rpart) ---
  tree_model <- train(
    rings ~ ., 
    data = train_data,
    method = "rpart"
  )
  tree_preds <- predict(tree_model, newdata = test_data)
  tree_r2 <- R2(tree_preds, test_data$rings)
  
  results <- rbind(results, data.frame(Seed = s, Model = "Regression Tree", R2 = tree_r2))
  
  # --- Boosted Regression Tree (gbm) ---
  boost_model <- train(
    rings ~ ., 
    data = train_data,
    method = "gbm",
    verbose = FALSE
  )
  boost_preds <- predict(boost_model, newdata = test_data)
  boost_r2 <- R2(boost_preds, test_data$rings)
  
  results <- rbind(results, data.frame(Seed = s, Model = "Boosted Tree", R2 = boost_r2))
}
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
(The same warning is emitted once per seed, ten times in total.)
# Plot boxplot comparing R² of the two model types
ggplot(results, aes(x = Model, y = R2, fill = Model)) +
  geom_boxplot() +
  labs(title = "R² Comparison Across 10 Splits",
       y = "R-squared",
       x = "Model Type") +
  theme_minimal()
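
A numeric summary to accompany the boxplot (a sketch using the results frame built above):

results %>%
  group_by(Model) %>%
  summarise(mean_R2 = mean(R2), sd_R2 = sd(R2), .groups = "drop")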