Pre-loading

rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>% 
  drop_na()

1

Divide the input dataset into training and testing using the validation set approach (with a 50%/50% split). Use the training set to train and the test set to assess model performance. a. How many examples will be used for training and how many for testing? b. How does the model perform?

# Split data into training and testing sets with a 50%/50% split
num_samples <- nrow(UPDRS_df)
index <- sample(1:num_samples, size = num_samples * 0.5)
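# num_samples * 0.5 = 2937.5 is rounded down by sample(), so the training set
# gets 2937 rows and the test set keeps the remaining 2938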

train <- UPDRS_df[index, ]
test <- UPDRS_df[-index, ]

# Number of training and test examples  
train_num <- nrow(train)
test_num <- nrow(test)

# Define models
model1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = train)
model2 <- lm(motor_UPDRS ~ . - total_UPDRS + age:sex, data = train)
model3 <- lm(motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3), data = train)

# Assess model performance on the test set
mse1 <- mean((test$motor_UPDRS - predict(model1, test))^2)
mse2 <- mean((test$motor_UPDRS - predict(model2, test))^2)
mse3 <- mean((test$motor_UPDRS - predict(model3, test))^2)

cat("Number of training examples:", train_num, "\n")
## Number of training examples: 2937
cat("Number of test examples:", test_num, "\n\n")
## Number of test examples: 2938
cat("Model 1 MSE on Test Set:", mse1, "\n")
## Model 1 MSE on Test Set: 51.14023
cat("Model 2 MSE on Test Set:", mse2, "\n")
## Model 2 MSE on Test Set: 51.11729
cat("Model 3 MSE on Test Set:", mse3, "\n")
## Model 3 MSE on Test Set: 49.90668
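Since part (b) asks how the models perform, it can also help to look at the error on the original UPDRS scale. A minimal sketch, reusing the mse1–mse3 values computed above (the rmse name is just illustrative), converts the test MSEs to RMSEs:

# RMSE is on the same scale as motor_UPDRS, so it is easier to interpret
# than the squared error
rmse <- sqrt(c(model1 = mse1, model2 = mse2, model3 = mse3))
rmse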

2

Use Leave-One-Out Cross-Validation (LOOCV) to test the models. a. How does the model perform?

library(boot)

# Perform LOOCV for each model

# Model 1
formula1 <- motor_UPDRS ~ . - total_UPDRS
cv.error1 <- cv.glm(UPDRS_df, glm(formula1, data=UPDRS_df), K=nrow(UPDRS_df))
mse1 <- cv.error1$delta[1]  # delta[1] is the raw cross-validation estimate of the MSE

# Model 2
formula2 <- motor_UPDRS ~ . - total_UPDRS + age:sex
cv.error2 <- cv.glm(UPDRS_df, glm(formula2, data=UPDRS_df), K=nrow(UPDRS_df))
mse2 <- cv.error2$delta[1]

# Model 3
formula3 <- motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3)
cv.error3 <- cv.glm(UPDRS_df, glm(formula3, data=UPDRS_df), K=nrow(UPDRS_df))
mse3 <- cv.error3$delta[1]

# Print the MSE for each model based on LOOCV
cat("Model 1 LOOCV MSE:", mse1, "\n")
## Model 1 LOOCV MSE: 52.12593
cat("Model 2 LOOCV MSE:", mse2, "\n")
## Model 2 LOOCV MSE: 52.131
cat("Model 3 LOOCV MSE:", mse3, "\n")
## Model 3 LOOCV MSE: 51.06764
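As a cross-check, LOOCV for an ordinary least squares fit can be computed in closed form from a single fit: the leave-one-out residual is the ordinary residual divided by (1 - leverage). A minimal sketch for model 1 (the other formulas work the same way; loocv_mse1_shortcut is just an illustrative name):

# Closed-form LOOCV for OLS: leave-one-out residual = residual / (1 - hat value)
fit1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = UPDRS_df)
h <- hatvalues(fit1)
loocv_mse1_shortcut <- mean((residuals(fit1) / (1 - h))^2)
loocv_mse1_shortcut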

3

Use 5-fold cross-validation to test the models. a. How does the model perform?

# Function to perform 5-fold CV and calculate MSE
perform_cv <- function(formula, data, folds) {
  set.seed(123) # Ensure reproducibility
  cv_results <- cv.glm(data, glm(formula, data=data), K=folds)
  return(cv_results$delta[1])  # Return the MSE
}

# Model formulas
formula1 <- motor_UPDRS ~ . - total_UPDRS
formula2 <- motor_UPDRS ~ . - total_UPDRS + age:sex
formula3 <- motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3)

# Perform 5-fold CV for each model
mse1 <- perform_cv(formula1, UPDRS_df, 5)
mse2 <- perform_cv(formula2, UPDRS_df, 5)
mse3 <- perform_cv(formula3, UPDRS_df, 5)

# Print the MSE for each model based on 5-fold CV
cat("Model 1 5-fold CV MSE:", mse1, "\n")
## Model 1 5-fold CV MSE: 52.10428
cat("Model 2 5-fold CV MSE:", mse2, "\n")
## Model 2 5-fold CV MSE: 52.10761
cat("Model 3 5-fold CV MSE:", mse3, "\n")
## Model 3 5-fold CV MSE: 51.087
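Unlike LOOCV, the 5-fold estimate depends on the random fold assignment. A small sketch (the seed values are arbitrary) shows how much the estimate for model 1 moves when the folds are redrawn:

# Re-run 5-fold CV for model 1 with a few different (arbitrary) seeds
sapply(c(1, 123, 2024), function(s) {
  set.seed(s)
  cv.glm(UPDRS_df, glm(formula1, data = UPDRS_df), K = 5)$delta[1]
})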

4

Compare and comment on the error obtained with each validation approach for each model. (Note: the purpose of this comparison is not to evaluate which validation method should be chosen based on the lowest error; that would be incorrect.)
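To make the comparison easier to read, the estimates reported above can be gathered into a single table (a small sketch; the numbers are copied from the printed outputs, and the column names are just illustrative):

# Test-error estimates reported in the previous sections, gathered for comparison
results <- tribble(
  ~model,    ~validation_set, ~LOOCV,   ~five_fold_CV,
  "Model 1", 51.14023,        52.12593, 52.10428,
  "Model 2", 51.11729,        52.13100, 52.10761,
  "Model 3", 49.90668,        51.06764, 51.08700
)
results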

For model 1, the validation set approach gives the smallest MSE, LOOCV the largest, and 5-fold CV falls in between.

For model 2, the pattern is the same: the validation set approach gives the smallest MSE, LOOCV the largest, and 5-fold CV falls in between.

For model 3, the validation set approach again gives the smallest MSE, but here 5-fold CV is the largest and LOOCV falls in between.

In all cases the differences between the three estimates are small, and model 3 has the lowest estimated test error under every approach. The validation set estimate depends on a single random 50%/50% split, so it is expected to vary more from run to run than the LOOCV and 5-fold estimates.

The code is also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment3