Pre-loading

rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>% 
  drop_na()

1

Divide the input dataset into training and testing using the validation set approach (with a 50%/50% split). Use the training set to train and the test set to assess model performance. a. How many examples will be used for training and how many for testing? b. How does the model perform?

# Split data into training and testing sets with a 50%/50% split
num_samples <- nrow(UPDRS_df)
index <- sample(1:num_samples, size = num_samples * 0.5)
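# num_samples * 0.5 = 2937.5 is rounded down by sample(), so the training set
# gets 2937 rows and the test set keeps the remaining 2938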

train <- UPDRS_df[index, ]
test <- UPDRS_df[-index, ]

# Number of training and test examples  
train_num <- nrow(train)
test_num <- nrow(test)

# Define models
model1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = train)
model2 <- lm(motor_UPDRS ~ . - total_UPDRS + age:sex, data = train)
model3 <- lm(motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3), data = train)

# Assess model performance on the test set
mse1 <- mean((test$motor_UPDRS - predict(model1, test))^2)
mse2 <- mean((test$motor_UPDRS - predict(model2, test))^2)
mse3 <- mean((test$motor_UPDRS - predict(model3, test))^2)

cat("Number of training examples:", train_num, "\n")
## Number of training examples: 2937
cat("Number of test examples:", test_num, "\n\n")
## Number of test examples: 2938
cat("Model 1 MSE on Test Set:", mse1, "\n")
## Model 1 MSE on Test Set: 51.14023
cat("Model 2 MSE on Test Set:", mse2, "\n")
## Model 2 MSE on Test Set: 51.11729
cat("Model 3 MSE on Test Set:", mse3, "\n")
## Model 3 MSE on Test Set: 49.90668
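Since part (b) asks how the models perform, it can also help to look at the error on the original UPDRS scale. A minimal sketch, reusing the mse1–mse3 values computed above (the rmse name is just illustrative), converts the test MSEs to RMSEs:

# RMSE is on the same scale as motor_UPDRS, so it is easier to interpret
# than the squared error
rmse <- sqrt(c(model1 = mse1, model2 = mse2, model3 = mse3))
rmse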

2

Use Leave-One-Out Cross-Validation (LOOCV) to test the models. a. How does the model perform?

library(boot)

# Perform LOOCV for each model

# Model 1
formula1 <- motor_UPDRS ~ . - total_UPDRS
cv.error1 <- cv.glm(UPDRS_df, glm(formula1, data=UPDRS_df), K=nrow(UPDRS_df))
mse1 <- cv.error1$delta[1]  # delta[1] is the raw cross-validation estimate of the MSE

# Model 2
formula2 <- motor_UPDRS ~ . - total_UPDRS + age:sex
cv.error2 <- cv.glm(UPDRS_df, glm(formula2, data=UPDRS_df), K=nrow(UPDRS_df))
mse2 <- cv.error2$delta[1]

# Model 3
formula3 <- motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3)
cv.error3 <- cv.glm(UPDRS_df, glm(formula3, data=UPDRS_df), K=nrow(UPDRS_df))
mse3 <- cv.error3$delta[1]

# Print the MSE for each model based on LOOCV
cat("Model 1 LOOCV MSE:", mse1, "\n")
## Model 1 LOOCV MSE: 52.12593
cat("Model 2 LOOCV MSE:", mse2, "\n")
## Model 2 LOOCV MSE: 52.131
cat("Model 3 LOOCV MSE:", mse3, "\n")
## Model 3 LOOCV MSE: 51.06764
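As a cross-check, LOOCV for an ordinary least squares fit can be computed in closed form from a single fit: the leave-one-out residual is the ordinary residual divided by (1 - leverage). A minimal sketch for model 1 (the other formulas work the same way; loocv_mse1_shortcut is just an illustrative name):

# Closed-form LOOCV for OLS: leave-one-out residual = residual / (1 - hat value)
fit1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = UPDRS_df)
h <- hatvalues(fit1)
loocv_mse1_shortcut <- mean((residuals(fit1) / (1 - h))^2)
loocv_mse1_shortcut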

3

Use 5-fold cross-validation to test the models. a. How does the model perform?

# Function to perform 5-fold CV and calculate MSE
perform_cv <- function(formula, data, folds) {
  set.seed(123) # Ensure reproducibility
  cv_results <- cv.glm(data, glm(formula, data=data), K=folds)
  return(cv_results$delta[1])  # Return the MSE
}

# Model formulas
formula1 <- motor_UPDRS ~ . - total_UPDRS
formula2 <- motor_UPDRS ~ . - total_UPDRS + age:sex
formula3 <- motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3)

# Perform 5-fold CV for each model
mse1 <- perform_cv(formula1, UPDRS_df, 5)
mse2 <- perform_cv(formula2, UPDRS_df, 5)
mse3 <- perform_cv(formula3, UPDRS_df, 5)

# Print the MSE for each model based on 5-fold CV
cat("Model 1 5-fold CV MSE:", mse1, "\n")
## Model 1 5-fold CV MSE: 52.10428
cat("Model 2 5-fold CV MSE:", mse2, "\n")
## Model 2 5-fold CV MSE: 52.10761
cat("Model 3 5-fold CV MSE:", mse3, "\n")
## Model 3 5-fold CV MSE: 51.087
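Unlike LOOCV, the 5-fold estimate depends on the random fold assignment. A small sketch (the seed values are arbitrary) shows how much the estimate for model 1 moves when the folds are redrawn:

# Re-run 5-fold CV for model 1 with a few different (arbitrary) seeds
sapply(c(1, 123, 2024), function(s) {
  set.seed(s)
  cv.glm(UPDRS_df, glm(formula1, data = UPDRS_df), K = 5)$delta[1]
})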

4

Compare and comment on the error obtained with each validation approach for each model. (Note: the purpose of this comparison is not to evaluate which validation method should be chosen based on the lowest error; that would be incorrect.)
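To make the comparison easier to read, the estimates reported above can be gathered into a single table (a small sketch; the numbers are copied from the printed outputs, and the column names are just illustrative):

# Test-error estimates reported in the previous sections, gathered for comparison
results <- tribble(
  ~model,    ~validation_set, ~LOOCV,   ~five_fold_CV,
  "Model 1", 51.14023,        52.12593, 52.10428,
  "Model 2", 51.11729,        52.13100, 52.10761,
  "Model 3", 49.90668,        51.06764, 51.08700
)
results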

For model 1, the validation set approach gives the smallest MSE, LOOCV the largest, and 5-fold CV falls in between.

For model 2, the pattern is the same: the validation set approach gives the smallest MSE, LOOCV the largest, and 5-fold CV falls in between.

For model 3, the validation set approach again gives the smallest MSE, but here 5-fold CV is the largest and LOOCV falls in between.

In all cases the differences between the three estimates are small, and model 3 has the lowest estimated test error under every approach. The validation set estimate depends on a single random 50%/50% split, so it is expected to vary more from run to run than the LOOCV and 5-fold estimates.

The code is also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment3