library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.4.1

For this assignment, I will be using the Palmer penguins dataset. In particular, I want to model penguin body mass as a function of bill length and species, then use cross-validation methods to assess the predictive skill of that model.

Loading Data

# from palmerpenguins package
data("penguins")
head(penguins)
## # A tibble: 6 × 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           NA            NA                  NA          NA
## 5 Adelie  Torgersen           36.7          19.3               193        3450
## 6 Adelie  Torgersen           39.3          20.6               190        3650
## # ℹ 2 more variables: sex <fct>, year <int>
# subset data to only include variables of interest, then drop entries with NA fields
peng_data <- penguins |> 
  # select columns of interest
  select(species, bill_length_mm, body_mass_g) |>
  # remove NAs
  drop_na()
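
To confirm the cleaning step, it is worth counting the rows that remain; this is where the 342 penguins referenced later come from.

# sanity check: penguins has 344 rows; dropping the two with missing
# bill length / body mass leaves 342
nrow(peng_data)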

Cross Validation Methods

Holdout

The first and simplest cross-validation approach holds out one third of the original data before training the model, then uses the held-out values to test how well the model predicts data it has never seen. First, I will do this once by manually separating out one third of the data. Then, I will repeat the split multiple times with K-fold cross validation.

# set seed for replication - this time I chose pi because pie
set.seed(314)

# create a vector of indices to categorize 1/3 of points as test and 2/3 as train
n <- nrow(peng_data)
testRows <- sample(1:n, size = n * 1/3, replace = FALSE)

peng_data$test <- FALSE
peng_data$test[testRows] <- TRUE
# training datapoints are now labelled as FALSE, while test datapoints are labelled as TRUE
head(peng_data)
## # A tibble: 6 × 4
##   species bill_length_mm body_mass_g test 
##   <fct>            <dbl>       <int> <lgl>
## 1 Adelie            39.1        3750 TRUE 
## 2 Adelie            39.5        3800 TRUE 
## 3 Adelie            40.3        3250 FALSE
## 4 Adelie            36.7        3450 FALSE
## 5 Adelie            39.3        3650 FALSE
## 6 Adelie            38.9        3625 FALSE
table(peng_data$test)
## 
## FALSE  TRUE 
##   228   114
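
As an aside, this split is purely random with respect to the response. Since caret is already loaded, its createDataPartition() function could instead produce a split stratified by body mass; a minimal, non-destructive sketch of that alternative (shown for reference, not used below):

# stratified 2/3 train / 1/3 test split via caret (alternative, not used below)
# p is the training proportion; list = FALSE returns row indices
trainRows <- createDataPartition(peng_data$body_mass_g, p = 2/3, list = FALSE)
strat_test <- !(seq_len(nrow(peng_data)) %in% trainRows)
table(strat_test)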

With a quick plot, we can see the random selection of test and train points from the whole dataset.

p1 <- ggplot(data = peng_data) +
  geom_point(aes(x = bill_length_mm, y = body_mass_g, 
                 color = species, shape = test))
p1

Next, I make a linear model using the training subset, modeling body mass as a function of bill length and species. Then I use the linear model to predict body mass for the testing subset and compare the predicted values to the actual values.

# subset testing and training sets
test_data <- peng_data[peng_data$test,]
train_data <- peng_data[!peng_data$test,]

# linear model
lm1 <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
summary(lm1)
## 
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + species, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -848.98 -275.62   10.31  229.06 1078.66 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       364.977    357.241   1.022    0.308    
## bill_length_mm     85.444      9.156   9.332  < 2e-16 ***
## speciesChinstrap -823.324    113.232  -7.271 5.90e-12 ***
## speciesGentoo     666.443     97.624   6.827 8.05e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 381 on 224 degrees of freedom
## Multiple R-squared:  0.7767, Adjusted R-squared:  0.7737 
## F-statistic: 259.8 on 3 and 224 DF,  p-value: < 2.2e-16
# predict values for testing subset
test_data$yhat <- predict(lm1, newdata = test_data)

Now we can look at the fit of the model with \(R^2\), root mean square error (RMSE), and mean absolute error (MAE).
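
For predictions \(\hat{y}_i\) and observed values \(y_i\) over the \(n\) test points,

\[ \mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2}, \qquad \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert \hat{y}_i - y_i \rvert, \]

and \(R^2\) here is taken as the squared correlation between \(\hat{y}\) and \(y\).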

rsq <- cor(test_data$yhat, test_data$body_mass_g)^2
rmse <- sqrt(mean((test_data$yhat - test_data$body_mass_g)^2))
mae <- mean(abs(test_data$yhat - test_data$body_mass_g))

rsq
## [1] 0.7925788
rmse
## [1] 366.6112
mae
## [1] 285.7129

This cross-validation method gives us a single point estimate for each of \(R^2\), RMSE, and MAE. By repeating it multiple times with different test and training subsets, K-fold cross validation turns those point estimates into a range of expected values.
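
Since the same three statistics get recomputed for every method below, a small helper function would keep the later loops tidy. This cv_metrics() function is a hypothetical convenience of my own, shown as a sketch rather than used in the code below:

# hypothetical helper: R^2, RMSE, and MAE for observed y and predicted yhat
cv_metrics <- function(y, yhat) {
  c(r2   = cor(yhat, y)^2,
    rmse = sqrt(mean((yhat - y)^2)),
    mae  = mean(abs(yhat - y)))
}
# e.g., cv_metrics(test_data$body_mass_g, test_data$yhat) reproduces the holdout numbers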

K-fold

For K-fold cross validation, the data are split into k groups (folds); each fold serves once as the test set while the remaining folds train the model, so every datapoint is used both as a test point and as a training point. I will use k = 10 to start, then compare with the holdout cross validation I did above.

# set seed for replication
set.seed(314)
# set number of folds k
k <- 10
# pull number of observations n
n <- nrow(peng_data)
# reshuffle rows (and drop the holdout test column) before assigning folds
peng_data2 <- peng_data[sample(n),1:3]
# assign datapoints fold groups
peng_data2$fold <- cut(seq(1, n), breaks = k, labels = FALSE)

# create empty matrix for loop output
peng_kfold <- matrix(nrow = k, ncol = 3)
colnames(peng_kfold) <- c("r2", "rmse", "mae")
for(i in 1:k) {
  # filter test and train datasets 
  train_data <- filter(peng_data2, fold != i)
  test_data <- filter(peng_data2, fold == i)
  
  # build linear model on train dataset
  loop_lm <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
  # predict yhat values for test dataset
  test_data$yhat <- predict(loop_lm, newdata = test_data)
  
  # R^2
  peng_kfold[i,1] <- cor(test_data$yhat, test_data$body_mass_g)^2
  # RMSE
  peng_kfold[i,2] <- sqrt(mean((test_data$yhat - test_data$body_mass_g)^2))
  # MAE
  peng_kfold[i,3] <- mean(abs(test_data$yhat - test_data$body_mass_g))
}

peng_kfold
##              r2     rmse      mae
##  [1,] 0.8106465 356.7285 286.2302
##  [2,] 0.7953530 353.1718 259.2957
##  [3,] 0.7893307 372.7699 306.6330
##  [4,] 0.7164487 429.3371 332.6427
##  [5,] 0.7241469 407.4544 317.7930
##  [6,] 0.7311099 333.4642 264.1582
##  [7,] 0.8027231 326.5941 264.8773
##  [8,] 0.7869622 376.8155 337.0993
##  [9,] 0.7761139 420.2895 326.0232
## [10,] 0.8442983 379.3577 307.6552

From this, we can calculate an interval estimate for each of our test statistics (\(R^2\), RMSE, and MAE) using the mean and standard error across the k folds.

# mean R^2
mean_r2 <- mean(peng_kfold[,1])
# R^2 standard error
se_r2 <- sd(peng_kfold[,1]) / sqrt(k)

# mean Root Mean Square Error
mean_rmse <- mean(peng_kfold[,2])
# RMSE standard error
se_rmse <- sd(peng_kfold[,2]) / sqrt(k)

# mean Mean Absolute Error
mean_mae <- mean(peng_kfold[,3])
# MAE standard error
se_mae <- sd(peng_kfold[,3]) / sqrt(k)

With K-fold cross validation, our linear model has an average \(R^2\) of 0.778 ± 0.013, an average RMSE of 375.6 ± 11.03, and an average MAE of 300.2 ± 9.37 (mean ± standard error).

We could repeat this with different values of k, or reshuffle the original dataset and run the k-fold loop again with new fold assignments.
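
For what it's worth, the caret package loaded at the top automates exactly this kind of resampling. A minimal sketch under that assumption (caret draws its own folds, so the numbers will differ slightly from my manual loop):

# 10-fold CV via caret: train() refits the lm on each fold internally
ctrl <- trainControl(method = "cv", number = 10)
caret_fit <- train(body_mass_g ~ bill_length_mm + species,
                   data = peng_data2, method = "lm", trControl = ctrl)
caret_fit$results # reports RMSE, Rsquared, and MAE with their SDs across folds
# trainControl(method = "LOOCV") gives the leave-one-out variant covered next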

LOOCV

If we increase k all the way to \(n\), we arrive at the third method: Leave One Out Cross Validation (LOOCV). This trains the model on the whole dataset except one point, tests the model on the omitted point, and repeats the process for every point in the data. This can be computationally expensive for large datasets, but for our 342 penguins it is a reasonable loop.

# set seed for replication (not strictly needed here: LOOCV involves no random sampling)
set.seed(314)
# pull number of observations n
n <- nrow(peng_data)

# create empty matrix for loop output
peng_LOOCV <- matrix(nrow = n, ncol = 2)
colnames(peng_LOOCV) <- c("y", "yhat")
for(i in 1:n) {
  # filter test and train datasets 
  train_data <- peng_data[-i,1:3]
  test_data <- peng_data[i,1:3]
  
  # build linear model on train dataset
  loop_lm <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
  
  # record the observed body mass for the held-out point
  peng_LOOCV[i,1] <- test_data$body_mass_g
  # predict yhat for the held-out point
  peng_LOOCV[i,2] <- predict(loop_lm, newdata = test_data)
}

head(peng_LOOCV)
##         y     yhat
## [1,] 3750 3728.739
## [2,] 3800 3765.218
## [3,] 3250 3842.985
## [4,] 3450 3509.919
## [5,] 3650 3747.824
## [6,] 3625 3711.164
# R^2
LOOCV_r2 <- cor(peng_LOOCV[,1], peng_LOOCV[,2])^2
# RMSE
LOOCV_rmse <- sqrt(mean((peng_LOOCV[,2] - peng_LOOCV[,1])^2))
# MAE
LOOCV_mae <- mean(abs(peng_LOOCV[,2] - peng_LOOCV[,1]))
  
LOOCV_r2
## [1] 0.7779887
LOOCV_rmse
## [1] 377.3213
LOOCV_mae
## [1] 301.0585
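
It is also worth noting that for a linear model the LOOCV loop is not strictly necessary: the leave-one-out residual for OLS equals the ordinary residual divided by \(1 - h_{ii}\), where \(h_{ii}\) is the point's leverage. A quick check of this closed form against the loop above:

# closed-form LOOCV for lm: leave-one-out residual = residual / (1 - leverage)
full_lm <- lm(body_mass_g ~ bill_length_mm + species, data = peng_data)
loo_resid <- residuals(full_lm) / (1 - hatvalues(full_lm))
sqrt(mean(loo_resid^2)) # should match LOOCV_rmse above
mean(abs(loo_resid))    # should match LOOCV_mae above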

Given that LOOCV provides a single \(R^2\) value rather than a distribution, I am a little unsure of how to compare the two methods directly. However, all three LOOCV statistics fall well within the K-fold intervals above, so I feel fairly confident in both methods; the table below puts the point estimates side by side.
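
# side-by-side point estimates from holdout, 10-fold (fold means), and LOOCV,
# using the objects already created above
tibble(method = c("holdout", "10-fold (mean)", "LOOCV"),
       r2     = c(rsq, mean_r2, LOOCV_r2),
       rmse   = c(rmse, mean_rmse, LOOCV_rmse),
       mae    = c(mae, mean_mae, LOOCV_mae))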