Pre-loading

rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaps)
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>% 
  select(-total_UPDRS) %>%
  drop_na()

Regression problem:

• Divide the input dataset into training and testing as done in Assignment 1 (80%/20%).

• Build a model to predict motor_UPDRS (reminder: do not include total_UPDRS as input to the model) and measure its performance on the test set.

  1. Use forward stepwise selection to perform feature selection.
     1. Use 5-fold cross-validation on the training set to select the model that gives the best results. Hint: if using the loop seen in class, remember that the jth element is the CV error for the j-variable model.
     2. Based on the CV error, select the model with the optimal number of variables and calculate the MSE on the test set.
# Split data into training and testing sets with an 80%/20% split
num_samples <- nrow(UPDRS_df)
train_indices <- sample(1:num_samples, size = floor(0.8 * num_samples)) # floor() keeps the size integral

train <- UPDRS_df[train_indices, ]
test <- UPDRS_df[-train_indices, ]

# Forward Stepwise Selection using 5-fold cross-validation
set.seed(1) # Ensure reproducibility
folds <- sample(1:5, nrow(train), replace = TRUE) # randomly assign each training row to one of 5 folds
cv_errors <- matrix(NA, 5, ncol(train) - 1) # Exclude the response variable

for (j in 1:5) {
  train_fold <- train[folds != j, ]
  test_fold <- train[folds == j, ]
  
  regfit.fwd <- regsubsets(motor_UPDRS ~ ., data = train_fold, nvmax = ncol(train_fold) - 1, method = "forward")
  
  for (i in 1:(ncol(train_fold) - 1)) {
    coefi <- coef(regfit.fwd, id = i)
    pred <- model.matrix(motor_UPDRS ~ ., data = test_fold)[, names(coefi)] %*% coefi
    cv_errors[j, i] <- mean((test_fold$motor_UPDRS - pred)^2)
  }
}
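The inline model.matrix() prediction above can be factored into the usual predict method for regsubsets objects (the helper from ISLR; it is equivalent to the code in the loop and shown here only as an optional refactoring):

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]]) # recover the formula used in the fit
  mat <- model.matrix(form, newdata)   # design matrix for the new data
  coefi <- coef(object, id = id)       # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi
}

With this helper, the inner loop body reduces to pred <- predict(regfit.fwd, test_fold, id = i).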

# Average CV errors across folds to find the optimal model size
mean_cv_errors <- apply(cv_errors, 2, mean)
optimal_num_vars <- which.min(mean_cv_errors)
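To see how flat the CV curve is around its minimum, it can be plotted directly (an optional diagnostic using the objects computed above):

plot(mean_cv_errors, type = "b",
     xlab = "Number of variables", ylab = "Mean CV MSE")
points(optimal_num_vars, mean_cv_errors[optimal_num_vars],
       col = "red", pch = 19) # highlight the CV-optimal model size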

# Build the model with the optimal number of variables
regfit.best <- regsubsets(motor_UPDRS ~ ., data = train, nvmax = optimal_num_vars, method = "forward")
best_coefs <- coef(regfit.best, id = optimal_num_vars)

# Predict on test set
test_matrix <- model.matrix(motor_UPDRS ~ ., data = test)[, names(best_coefs)]
predictions <- test_matrix %*% best_coefs
mse_test <- mean((test$motor_UPDRS - predictions)^2)

# Output the MSE on the test set and optimal number of variables
print(paste("MSE on Test Set:", mse_test))
## [1] "MSE on Test Set: 52.3465751365149"
print(paste("Optimal Number of Variables:", optimal_num_vars))
## [1] "Optimal Number of Variables: 11"


  1. Apply Lasso.
     1. Optimize the parameter lambda using CV.
     2. Once lambda.min is obtained, calculate the MSE on the test set.
y_train <- train$motor_UPDRS
x_train <- model.matrix(motor_UPDRS ~ . - 1, data = train) # -1 to exclude intercept

y_test <- test$motor_UPDRS
x_test <- model.matrix(motor_UPDRS ~ . - 1, data = test) # -1 to exclude intercept

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
set.seed(1)
cv.out <- cv.glmnet(x_train, y_train, alpha = 1)

plot(cv.out) # cross-validated MSE as a function of log(lambda)

bestlam <- cv.out$lambda.min
lasso.pred <- predict(cv.out, s = bestlam, newx = x_test)
mse_test <- mean((lasso.pred - y_test)^2)
print(paste("MSE on Test Set with Lasso:", mse_test))
## [1] "MSE on Test Set with Lasso: 52.2767733919589"
lasso.coef <- predict(cv.out, type = "coefficients", s = bestlam)
lasso.coef[lasso.coef != 0]
##  [1]  2.715562e+01  1.847651e-01  2.018398e-01 -2.415227e+00  8.901892e-03
##  [6] -4.174177e+04  3.963522e+02  3.082857e+00 -4.857902e+01 -3.329007e+01
## [11]  3.919727e+01 -6.998783e-01 -1.505030e+01 -3.243335e-01 -1.105015e+00
## [16] -2.596496e+01  1.476018e+01
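Note that subsetting the sparse coefficient matrix this way drops the row names, which is why the output above shows unlabelled numbers. A small sketch that keeps the predictor labels:

coef_vec <- as.matrix(lasso.coef)[, 1] # dense, named coefficient vector
coef_vec[coef_vec != 0]                # non-zero coefficients, labelled by predictor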


  1. Apply PLS.
     1. Optimize M using CV.
     2. Calculate the MSE on the test set.
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
pls.fit <- plsr(motor_UPDRS ~ . , data = UPDRS_df, subset = train_indices, scale = TRUE, validation = "CV")

summary(pls.fit)
## Data:    X dimension: 4700 20 
##  Y dimension: 4700 1
## Fit method: kernelpls
## Number of components considered: 20
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           8.122    7.896    7.326    7.259    7.221    7.207    7.203
## adjCV        8.122    7.896    7.326    7.258    7.221    7.206    7.203
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       7.204    7.205    7.206     7.207     7.207     7.207     7.208
## adjCV    7.203    7.204    7.205     7.205     7.205     7.205     7.206
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV        7.211     7.211     7.212     7.211     7.211     7.213     7.213
## adjCV     7.209     7.210     7.210     7.210     7.210     7.211     7.211
## 
## TRAINING: % variance explained
##              1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X             51.406    61.54    67.89    72.21    75.81    80.41    85.47
## motor_UPDRS    5.536    18.86    20.36    21.19    21.55    21.66    21.69
##              8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X              88.70    92.06     94.01     97.40     98.18     98.57     98.85
## motor_UPDRS    21.71    21.72     21.74     21.75     21.75     21.75     21.76
##              15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## X               99.63     99.89     99.95    100.00    100.00    100.00
## motor_UPDRS     21.76     21.76     21.76     21.76     21.77     21.78
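The pls package can also visualize and automate this choice (a sketch; selectNcomp()'s one-sigma rule may suggest a smaller M than the raw CV minimum):

validationplot(pls.fit, val.type = "MSEP")             # CV MSEP vs number of components
selectNcomp(pls.fit, method = "onesigma", plot = TRUE) # smallest M within one SE of the minimum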
# CV RMSEP reaches its minimum (7.203) at 6 components and is essentially
# flat beyond that, so choose M = 6
M <- 6
pls.pred <- predict(pls.fit, newdata = UPDRS_df[-train_indices, ], ncomp = M)
mse_test <- mean((pls.pred - UPDRS_df$motor_UPDRS[-train_indices])^2)
print(paste("MSE on Test Set with PLS, M=", M, ": ", mse_test, sep=""))
## [1] "MSE on Test Set with PLS, M=6: 52.2984111680083"
# Refit on the full dataset (training + test) with the chosen M
pls.fit.all <- plsr(motor_UPDRS ~ ., data = UPDRS_df, scale = TRUE, ncomp = M)
summary(pls.fit.all)
## Data:    X dimension: 5875 20 
##  Y dimension: 5875 1
## Fit method: kernelpls
## Number of components considered: 6
## TRAINING: % variance explained
##              1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X             51.874    61.78    68.27    72.39    76.03    80.69
## motor_UPDRS    5.527    18.91    20.33    21.19    21.55    21.68


  1. Compare and comment on the performance obtained with each model on the test set.
     1. Are you able to get better predictions by applying feature selection and/or dimension reduction as compared to previous assignments?
fit.lm <- lm(motor_UPDRS ~ ., data = train) # plain least squares on all predictors (renamed to avoid masking lm())
mse.lm <- mean((test$motor_UPDRS - predict(fit.lm, test))^2)
cat("Basic linear model MSE on Test Set:", mse.lm, "\n")
## Basic linear model MSE on Test Set: 52.24631
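Collecting the four test MSEs reported above makes the comparison explicit (the numbers are copied from the outputs above; in a longer script each model's MSE would be kept in its own variable rather than overwriting mse_test):

results <- tibble(
  model    = c("Linear regression", "Forward stepwise (11 vars)",
               "Lasso (lambda.min)", "PLS (M = 6)"),
  test_MSE = c(52.24631, 52.34658, 52.27677, 52.29841)
)
results %>% arrange(test_MSE)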

On this particular split, forward stepwise selection gives the worst test-set MSE (about 52.35), the lasso the best (about 52.28), and PLS falls in between (about 52.30).

However, none of them improves on the basic linear model (MSE about 52.25); the differences among all four models are small, at least on this split.

The code is also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment4.