rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaps)
# Load the Parkinson's telemonitoring data; drop total_UPDRS since it must not be used as a predictor
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>%
  select(-total_UPDRS) %>%
  drop_na()
• Divide the input dataset into training and testing as done in Assignment 1 (80%/20%).
• Build a model to predict motor_UPDRS (reminder: do not include total_UPDRS as input to the model) and measure its performance on the test set.
# Split data into training and testing sets with an 80%/20% split
num_samples <- nrow(UPDRS_df)
train_indices <- sample(1:num_samples, size = num_samples * 0.8)
train <- UPDRS_df[train_indices, ]
test <- UPDRS_df[-train_indices, ]
# Forward Stepwise Selection using 5-fold cross-validation
set.seed(1) # Ensure reproducibility
folds <- sample(1:5, nrow(train), replace = TRUE)
cv_errors <- matrix(NA, 5, ncol(train) - 1) # Exclude the response variable
for (j in 1:5) {
  train_fold <- train[folds != j, ]
  test_fold <- train[folds == j, ]
  regfit.fwd <- regsubsets(motor_UPDRS ~ ., data = train_fold, nvmax = ncol(train_fold) - 1, method = "forward")
  for (i in 1:(ncol(train_fold) - 1)) {
    coefi <- coef(regfit.fwd, id = i)
    pred <- model.matrix(motor_UPDRS ~ ., data = test_fold)[, names(coefi)] %*% coefi
    cv_errors[j, i] <- mean((test_fold$motor_UPDRS - pred)^2)
  }
}
# Average CV errors across folds to find the optimal model size
mean_cv_errors <- apply(cv_errors, 2, mean)
optimal_num_vars <- which.min(mean_cv_errors)
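An optional visual check (not part of the original output) is to plot the averaged CV error against model size and mark the minimum:
# Optional check: visualise mean CV error against model size and mark the chosen size
plot(mean_cv_errors, type = "b", xlab = "Number of variables", ylab = "Mean CV MSE")
points(optimal_num_vars, mean_cv_errors[optimal_num_vars], col = "red", pch = 19)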
# Build the model with the optimal number of variables
regfit.best <- regsubsets(motor_UPDRS ~ ., data = train, nvmax = optimal_num_vars, method = "forward")
best_coefs <- coef(regfit.best, id = optimal_num_vars)
# Predict on test set
test_matrix <- model.matrix(motor_UPDRS ~ ., data = test)[, names(best_coefs)]
predictions <- test_matrix %*% best_coefs
mse_test <- mean((test$motor_UPDRS - predictions)^2)
# Output the MSE on the test set and optimal number of variables
print(paste("MSE on Test Set:", mse_test))
## [1] "MSE on Test Set: 52.3465751365149"
print(paste("Optimal Number of Variables:", optimal_num_vars))
## [1] "Optimal Number of Variables: 11"
y_train <- train$motor_UPDRS
x_train <- model.matrix(motor_UPDRS ~ . - 1, data = train) # -1 to exclude intercept
y_test <- test$motor_UPDRS
x_test <- model.matrix(motor_UPDRS ~ . - 1, data = test) # -1 to exclude intercept
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Lasso regression: cv.glmnet uses 10-fold CV by default; alpha = 1 gives the lasso penalty
set.seed(1)
cv.out <- cv.glmnet(x_train, y_train, alpha = 1)
plot(cv.out)
# Use the lambda that minimises the CV error, then predict on the held-out test set
bestlam <- cv.out$lambda.min
lasso.pred <- predict(cv.out, s = bestlam, newx = x_test)
mse_test <- mean((lasso.pred - y_test)^2)
print(paste("MSE on Test Set with Lasso:", mse_test))
## [1] "MSE on Test Set with Lasso: 52.2767733919589"
lasso.coef <- predict(cv.out, type = "coefficients", s = bestlam)
lasso.coef[lasso.coef != 0]
## [1] 2.715562e+01 1.847651e-01 2.018398e-01 -2.415227e+00 8.901892e-03
## [6] -4.174177e+04 3.963522e+02 3.082857e+00 -4.857902e+01 -3.329007e+01
## [11] 3.919727e+01 -6.998783e-01 -1.505030e+01 -3.243335e-01 -1.105015e+00
## [16] -2.596496e+01 1.476018e+01
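The subsetting above drops the variable names because lasso.coef is a sparse matrix; converting it to a named vector first keeps the names attached to the non-zero coefficients (a small convenience, not in the original output):
# Convert the sparse coefficient matrix to a named vector so the non-zero entries keep their names
coef_vec <- as.vector(lasso.coef)
names(coef_vec) <- rownames(lasso.coef)
coef_vec[coef_vec != 0]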
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
# Partial least squares regression, fit on the training rows with standardised predictors and 10-fold CV
set.seed(1)
pls.fit <- plsr(motor_UPDRS ~ . , data = UPDRS_df, subset = train_indices, scale = TRUE, validation = "CV")
summary(pls.fit)
## Data: X dimension: 4700 20
## Y dimension: 4700 1
## Fit method: kernelpls
## Number of components considered: 20
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.122 7.896 7.326 7.259 7.221 7.207 7.203
## adjCV 8.122 7.896 7.326 7.258 7.221 7.206 7.203
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 7.204 7.205 7.206 7.207 7.207 7.207 7.208
## adjCV 7.203 7.204 7.205 7.205 7.205 7.205 7.206
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 7.211 7.211 7.212 7.211 7.211 7.213 7.213
## adjCV 7.209 7.210 7.210 7.210 7.210 7.211 7.211
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 51.406 61.54 67.89 72.21 75.81 80.41 85.47
## motor_UPDRS 5.536 18.86 20.36 21.19 21.55 21.66 21.69
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.70 92.06 94.01 97.40 98.18 98.57 98.85
## motor_UPDRS 21.71 21.72 21.74 21.75 21.75 21.75 21.76
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 99.63 99.89 99.95 100.00 100.00 100.00
## motor_UPDRS 21.76 21.76 21.76 21.76 21.77 21.78
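The number of components can also be read off a validation plot; the CV RMSEP curve flattens at around 6 components, which motivates the choice of M below.
# Plot cross-validated MSEP against the number of PLS components
validationplot(pls.fit, val.type = "MSEP")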
M <- 6 # CV RMSEP stops improving after about 6 components (see summary above)
pls.pred <- predict(pls.fit, newdata = UPDRS_df[-train_indices, ], ncomp = M)
mse_test <- mean((pls.pred - UPDRS_df$motor_UPDRS[-train_indices])^2)
print(paste("MSE on Test Set with PLS, M=", M, ": ", mse_test, sep=""))
## [1] "MSE on Test Set with PLS, M=6: 52.2984111680083"
# Refit using all data
pls.fit.all <- plsr(motor_UPDRS ~ ., data = UPDRS_df, scale = TRUE, ncomp = M)
summary(pls.fit.all)
## Data: X dimension: 5875 20
## Y dimension: 5875 1
## Fit method: kernelpls
## Number of components considered: 6
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 51.874 61.78 68.27 72.39 76.03 80.69
## motor_UPDRS 5.527 18.91 20.33 21.19 21.55 21.68
# Baseline for comparison: ordinary least squares with all predictors
lm.fit <- lm(motor_UPDRS ~ . , data = train)
mse.lm <- mean((test$motor_UPDRS - predict(lm.fit, test))^2)
cat("Basic linear model MSE on Test Set:", mse.lm, "\n")
## Basic linear model MSE on Test Set: 52.24631
In this specific case, forward stepwise selection has the worst test-set performance, the lasso performs best, and PLS falls in between.
However, none of the three methods improves on the basic linear model's test-set prediction (at least for this particular split): all four test MSEs lie within roughly 0.1 of one another.
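For reference, the test MSEs reported above can be gathered into a single table (values copied from the printed output; a small summary, not part of the original analysis):
# Test-set MSEs from the runs above, collected for easy comparison
results <- tibble(
  model = c("Linear regression (all predictors)", "Forward stepwise (11 variables)", "Lasso", "PLS (M = 6)"),
  test_MSE = c(52.246, 52.347, 52.277, 52.298)
)
results %>% arrange(test_MSE)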