library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.4.1
For this assignment, I will be using the Palmer penguins dataset. In particular, I want to model penguin body mass as a function of bill length and species and use cross validation methods to assess the skill of said model.
# from palmerpenguins package
data("penguins")
head(penguins)
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 2 more variables: sex <fct>, year <int>
# subset data to only include variables of interest, then drop entries with NA fields
peng_data <- penguins |>
# select columns of interest
select(species, bill_length_mm, body_mass_g) |>
# remove NAs
drop_na()
The first and simplest cross-validation approach excludes one third of the original data before training the model, then uses the excluded values to test how well the model generalizes to data it has not seen (i.e., whether it is over- or under-fitting). First, I will do this once by manually holding out one third of the data. Then, I will do it multiple times with K-fold cross-validation.
# set seed for replication - this time I chose pi because pie
set.seed(314)
# create a vector of indices to categorize 1/3 of points as test and 2/3 as train
n <- nrow(peng_data)
testRows <- sample(1:n, size = n * 1/3, replace = FALSE)
peng_data$test <- FALSE
peng_data$test[testRows] <- TRUE
# training datapoints are now labelled as FALSE, while test datapoints are labelled as TRUE
head(peng_data)
## # A tibble: 6 × 4
## species bill_length_mm body_mass_g test
## <fct> <dbl> <int> <lgl>
## 1 Adelie 39.1 3750 TRUE
## 2 Adelie 39.5 3800 TRUE
## 3 Adelie 40.3 3250 FALSE
## 4 Adelie 36.7 3450 FALSE
## 5 Adelie 39.3 3650 FALSE
## 6 Adelie 38.9 3625 FALSE
table(peng_data$test)
##
## FALSE TRUE
## 228 114
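As an aside, the caret package loaded above has a helper that does a similar job; a minimal sketch, assuming we want roughly the same 1/3 test proportion (createDataPartition() stratifies on the outcome, so its split will not match the purely random one used here). I will keep the manual split for the rest of this section.
# alternative split with caret: createDataPartition() returns row indices
# for the training set, stratified on the outcome (~2/3 train, ~1/3 test)
set.seed(314)
trainRows <- createDataPartition(peng_data$body_mass_g, p = 2/3, list = FALSE)[, 1]
caret_train <- peng_data[trainRows, ]
caret_test <- peng_data[-trainRows, ]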
With a quick plot, we can see the random selection of test and train points from the whole dataset.
p1 <- ggplot(data = peng_data) +
geom_point(aes(x = bill_length_mm, y = body_mass_g,
color = species, shape = test))
p1
Next, I make a linear model using the training subset, modeling body mass as a function of bill length and species. Then I use the linear model to predict body mass for the testing subset and compare the predicted values to the actual values.
# subset testing and training sets
test_data <- peng_data[peng_data$test,]
train_data <- peng_data[!peng_data$test,]
# linear model
lm1 <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
summary(lm1)
##
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + species, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -848.98 -275.62 10.31 229.06 1078.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 364.977 357.241 1.022 0.308
## bill_length_mm 85.444 9.156 9.332 < 2e-16 ***
## speciesChinstrap -823.324 113.232 -7.271 5.90e-12 ***
## speciesGentoo 666.443 97.624 6.827 8.05e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 381 on 224 degrees of freedom
## Multiple R-squared: 0.7767, Adjusted R-squared: 0.7737
## F-statistic: 259.8 on 3 and 224 DF, p-value: < 2.2e-16
# predict values for testing subset
test_data$yhat <- predict(lm1, newdata = test_data)
Now we can look at the fit of the model with \(R^2\), root mean square error (RMSE), and mean absolute error (MAE).
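For reference, with \(y_i\) the observed body mass, \(\hat{y}_i\) the predicted body mass, and \(m\) the number of test points, the two error metrics are

\[
\mathrm{RMSE} = \sqrt{\frac{1}{m}\sum_{i=1}^{m}(\hat{y}_i - y_i)^2}, \qquad
\mathrm{MAE} = \frac{1}{m}\sum_{i=1}^{m}\lvert \hat{y}_i - y_i \rvert,
\]

and \(R^2\) is computed below as the squared correlation between the predicted and observed values.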
rsq <- cor(test_data$yhat, test_data$body_mass_g)^2
rmse <- sqrt(mean((test_data$yhat - test_data$body_mass_g)^2))
mae <- mean(abs(test_data$yhat - test_data$body_mass_g))
rsq
## [1] 0.7925788
rmse
## [1] 366.6112
mae
## [1] 285.7129
This holdout method gives us only a single point estimate for \(R^2\) and for each error metric. By using K-fold cross-validation, we can repeat the procedure with different test and training subsets and get a range of expected values for each statistic.
For K-fold cross-validation, the data are split into k groups (folds). Each fold is held out in turn as the test set while the model is trained on the remaining folds, so every datapoint is used once as a test point and k - 1 times as a training point. I will use k = 10 to start, then compare with the holdout cross-validation I did above.
# set seed for replication
set.seed(314)
# set number of folds k
k <- 10
# pull number of observations n
n <- nrow(peng_data)
# reshuffle penguin data for assigning folds
peng_data2 <- peng_data[sample(n),1:3]
# assign datapoints fold groups
peng_data2$fold <- cut(seq(1, n), breaks = k, labels = FALSE)
# create empty matrix for loop output
peng_kfold <- matrix(nrow = k, ncol = 3)
colnames(peng_kfold) <- c("r2", "rmse", "mae")
for (i in 1:k) {
  # filter test and train datasets
  train_data <- filter(peng_data2, fold != i)
  test_data <- filter(peng_data2, fold == i)
  # build linear model on train dataset
  loop_lm <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
  # predict yhat values for test dataset
  test_data$yhat <- predict(loop_lm, newdata = test_data)
  # R^2
  peng_kfold[i, 1] <- cor(test_data$yhat, test_data$body_mass_g)^2
  # RMSE
  peng_kfold[i, 2] <- sqrt(mean((test_data$yhat - test_data$body_mass_g)^2))
  # MAE
  peng_kfold[i, 3] <- mean(abs(test_data$yhat - test_data$body_mass_g))
}
peng_kfold
## r2 rmse mae
## [1,] 0.8106465 356.7285 286.2302
## [2,] 0.7953530 353.1718 259.2957
## [3,] 0.7893307 372.7699 306.6330
## [4,] 0.7164487 429.3371 332.6427
## [5,] 0.7241469 407.4544 317.7930
## [6,] 0.7311099 333.4642 264.1582
## [7,] 0.8027231 326.5941 264.8773
## [8,] 0.7869622 376.8155 337.0993
## [9,] 0.7761139 420.2895 326.0232
## [10,] 0.8442983 379.3577 307.6552
From this, we can calculate an interval estimate for our test statistics (\(R^2\), RMSE, and MAE) using the mean and standard error across the folds.
# mean R^2
mean_r2 <- mean(peng_kfold[,1])
# R^2 standard error
se_r2 <- sd(peng_kfold[,1]) / sqrt(k)
# mean Root Mean Square Error
mean_rmse <- mean(peng_kfold[,2])
# RMSE standard error
se_rmse <- sd(peng_kfold[,2]) / sqrt(k)
# mean Mean Absolute Error
mean_mae <- mean(peng_kfold[,3])
# MAE standard error
se_mae <- sd(peng_kfold[,3]) / sqrt(k)
With K-fold cross validation, our linear model has an average \(R^2\) of 0.778 +/- 0.013, an average RMSE of 375.6 +/- 11.03, and an average MAE of 300.2 +/- 9.37.
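Those +/- values are one standard error. If we want an explicit interval, one rough option (it assumes the fold estimates are approximately normal and independent, which is only an approximation since the folds share training data) is a t-based interval:
# approximate 95% interval for the mean R^2 across folds (t quantile, k - 1 df)
mean_r2 + c(-1, 1) * qt(0.975, df = k - 1) * se_r2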
We could repeat this for different values of k, or reshuffle the original dataset and run the k-fold loop again.
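Alternatively, the caret package can run the whole k-fold procedure in one call; a minimal sketch (caret draws its own folds internally, so the summary numbers will not match the manual loop exactly):
# k-fold CV with caret: trainControl() sets up the folds and train() refits
# the same linear model within each fold
set.seed(314)
cv_ctrl <- trainControl(method = "cv", number = 10)
caret_cv <- train(body_mass_g ~ bill_length_mm + species,
                  data = peng_data, method = "lm", trControl = cv_ctrl)
caret_cv$results # cross-validated RMSE, Rsquared, and MAE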
If we increase k all the way to \(n\), we arrive at the third method: Leave-One-Out Cross-Validation (LOOCV). This uses the whole dataset except one point to train the model, tests the model on the omitted point, and repeats the process for every point in the data. This can be computationally expensive for large datasets, but for our 342 penguins it is a reasonable loop.
# set seed for replication
set.seed(314)
# pull number of observations n
n <- nrow(peng_data)
# create empty matrix for loop output
peng_LOOCV <- matrix(nrow = n, ncol = 2)
colnames(peng_LOOCV) <- c("y", "yhat")
for (i in 1:n) {
  # filter test and train datasets
  train_data <- peng_data[-i, 1:3]
  test_data <- peng_data[i, 1:3]
  # build linear model on train dataset
  loop_lm <- lm(body_mass_g ~ bill_length_mm + species, data = train_data)
  # record the observed body mass for the held-out point
  peng_LOOCV[i, 1] <- as.numeric(test_data[, 3])
  # predict yhat value for the held-out point
  loop_yhat <- predict(loop_lm, newdata = test_data)
  peng_LOOCV[i, 2] <- loop_yhat[1]
}
head(peng_LOOCV)
## y yhat
## [1,] 3750 3728.739
## [2,] 3800 3765.218
## [3,] 3250 3842.985
## [4,] 3450 3509.919
## [5,] 3650 3747.824
## [6,] 3625 3711.164
# R^2
LOOCV_r2 <- cor(peng_LOOCV[,1], peng_LOOCV[,2])^2
# RMSE
LOOCV_rmse <- sqrt(mean((peng_LOOCV[,2] - peng_LOOCV[,1])^2))
# MAE
LOOCV_mae <- mean(abs(peng_LOOCV[,2] - peng_LOOCV[,1]))
LOOCV_r2
## [1] 0.7779887
LOOCV_rmse
## [1] 377.3213
LOOCV_mae
## [1] 301.0585
Given that LOOCV provides only a single \(R^2\) value rather than a distribution across folds, I am a little unsure how to compare the two approaches directly. However, all three statistics are very similar across methods, so I feel fairly confident in both.
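As a closing aside, for an ordinary least squares model the LOOCV predictions do not actually require refitting the model \(n\) times: the leave-one-out residuals can be recovered from a single fit on the full data using the leverage (hat) values. A minimal sketch, which should reproduce the loop's RMSE and MAE up to floating point error:
# LOOCV shortcut for OLS: leave-one-out residual_i = residual_i / (1 - h_ii)
full_lm <- lm(body_mass_g ~ bill_length_mm + species, data = peng_data)
loo_resid <- residuals(full_lm) / (1 - hatvalues(full_lm))
sqrt(mean(loo_resid^2)) # compare with LOOCV_rmse
mean(abs(loo_resid)) # compare with LOOCV_mae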