# ---- Installing Packages ----
req <- c("data.table","dplyr","ggplot2","glmnet","pls","caret","scales")
to_install <- setdiff(req, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(req, library, character.only = TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: Matrix
## Loaded glmnet 4.1-10
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
set.seed(42)
library(data.table)
# make sure data.table is attached
parkinsons <- fread("c:/Users/HP/OneDrive/Documents/School Work/Statistical Modeling/parkinsons_updrs.data")
head(parkinsons)
## subject# age sex test_time motor_UPDRS total_UPDRS Jitter(%) Jitter(Abs)
## <int> <int> <int> <num> <num> <num> <num> <num>
## 1: 1 72 0 5.6431 28.199 34.398 0.00662 3.380e-05
## 2: 1 72 0 12.6660 28.447 34.894 0.00300 1.680e-05
## 3: 1 72 0 19.6810 28.695 35.389 0.00481 2.462e-05
## 4: 1 72 0 25.6470 28.905 35.810 0.00528 2.657e-05
## 5: 1 72 0 33.6420 29.187 36.375 0.00335 2.014e-05
## 6: 1 72 0 40.6520 29.435 36.870 0.00353 2.290e-05
## Jitter:RAP Jitter:PPQ5 Jitter:DDP Shimmer Shimmer(dB) Shimmer:APQ3
## <num> <num> <num> <num> <num> <num>
## 1: 0.00401 0.00317 0.01204 0.02565 0.230 0.01438
## 2: 0.00132 0.00150 0.00395 0.02024 0.179 0.00994
## 3: 0.00205 0.00208 0.00616 0.01675 0.181 0.00734
## 4: 0.00191 0.00264 0.00573 0.02309 0.327 0.01106
## 5: 0.00093 0.00130 0.00278 0.01703 0.176 0.00679
## 6: 0.00119 0.00159 0.00357 0.02227 0.214 0.01006
## Shimmer:APQ5 Shimmer:APQ11 Shimmer:DDA NHR HNR RPDE DFA
## <num> <num> <num> <num> <num> <num> <num>
## 1: 0.01309 0.01662 0.04314 0.014290 21.640 0.41888 0.54842
## 2: 0.01072 0.01689 0.02982 0.011112 27.183 0.43493 0.56477
## 3: 0.00844 0.01458 0.02202 0.020220 23.047 0.46222 0.54405
## 4: 0.01265 0.01963 0.03317 0.027837 24.445 0.48730 0.57794
## 5: 0.00929 0.01819 0.02036 0.011625 26.126 0.47188 0.56122
## 6: 0.01337 0.02263 0.03019 0.009438 22.946 0.53949 0.57243
## PPE
## <num>
## 1: 0.16006
## 2: 0.10810
## 3: 0.21014
## 4: 0.33277
## 5: 0.19361
## 6: 0.19500
# quick peek at the first few rows
parkinsons
## subject# age sex test_time motor_UPDRS total_UPDRS Jitter(%)
## <int> <int> <int> <num> <num> <num> <num>
## 1: 1 72 0 5.6431 28.199 34.398 0.00662
## 2: 1 72 0 12.6660 28.447 34.894 0.00300
## 3: 1 72 0 19.6810 28.695 35.389 0.00481
## 4: 1 72 0 25.6470 28.905 35.810 0.00528
## 5: 1 72 0 33.6420 29.187 36.375 0.00335
## ---
## 5871: 42 61 0 142.7900 22.485 33.485 0.00406
## 5872: 42 61 0 149.8400 21.988 32.988 0.00297
## 5873: 42 61 0 156.8200 21.495 32.495 0.00349
## 5874: 42 61 0 163.7300 21.007 32.007 0.00281
## 5875: 42 61 0 170.7300 20.513 31.513 0.00282
## Jitter(Abs) Jitter:RAP Jitter:PPQ5 Jitter:DDP Shimmer Shimmer(dB)
## <num> <num> <num> <num> <num> <num>
## 1: 3.380e-05 0.00401 0.00317 0.01204 0.02565 0.230
## 2: 1.680e-05 0.00132 0.00150 0.00395 0.02024 0.179
## 3: 2.462e-05 0.00205 0.00208 0.00616 0.01675 0.181
## 4: 2.657e-05 0.00191 0.00264 0.00573 0.02309 0.327
## 5: 2.014e-05 0.00093 0.00130 0.00278 0.01703 0.176
## ---
## 5871: 3.113e-05 0.00167 0.00168 0.00500 0.01896 0.160
## 5872: 2.469e-05 0.00119 0.00147 0.00358 0.02315 0.215
## 5873: 2.470e-05 0.00152 0.00187 0.00456 0.02499 0.244
## 5874: 2.034e-05 0.00128 0.00151 0.00383 0.01484 0.131
## 5875: 2.110e-05 0.00135 0.00166 0.00406 0.01907 0.171
## Shimmer:APQ3 Shimmer:APQ5 Shimmer:APQ11 Shimmer:DDA NHR HNR
## <num> <num> <num> <num> <num> <num>
## 1: 0.01438 0.01309 0.01662 0.04314 0.014290 21.640
## 2: 0.00994 0.01072 0.01689 0.02982 0.011112 27.183
## 3: 0.00734 0.00844 0.01458 0.02202 0.020220 23.047
## 4: 0.01106 0.01265 0.01963 0.03317 0.027837 24.445
## 5: 0.00679 0.00929 0.01819 0.02036 0.011625 26.126
## ---
## 5871: 0.00973 0.01133 0.01549 0.02920 0.025137 22.369
## 5872: 0.01052 0.01277 0.01904 0.03157 0.011927 22.886
## 5873: 0.01371 0.01456 0.01877 0.04112 0.017701 25.065
## 5874: 0.00693 0.00870 0.01307 0.02078 0.007984 24.422
## 5875: 0.00946 0.01154 0.01470 0.02839 0.008172 23.259
## RPDE DFA PPE
## <num> <num> <num>
## 1: 0.41888 0.54842 0.16006
## 2: 0.43493 0.56477 0.10810
## 3: 0.46222 0.54405 0.21014
## 4: 0.48730 0.57794 0.33277
## 5: 0.47188 0.56122 0.19361
## ---
## 5871: 0.64215 0.55314 0.21367
## 5872: 0.52598 0.56518 0.12621
## 5873: 0.47792 0.57888 0.14157
## 5874: 0.56865 0.56327 0.14204
## 5875: 0.58608 0.57077 0.15336
# ---- 2) Basic cleaning / naming ----
old_names <- names(parkinsons)
# keep original names in case we ever need to compare
names(parkinsons) <- make.names(names(parkinsons), unique = TRUE)
# make column names R-safe (no %,: spaces)
if ("sex" %in% names(parkinsons)) parkinsons$sex <- factor(parkinsons$sex)
# treat sex as a factor if present
stopifnot("total_UPDRS" %in% names(parkinsons))
# fail early if the response column is missing
parkinsons <- na.omit(parkinsons)
# drop any rows with missing values
str(parkinsons)
## Classes 'data.table' and 'data.frame': 5875 obs. of 22 variables:
## $ subject. : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 72 72 72 72 72 72 72 72 72 72 ...
## $ sex : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ test_time : num 5.64 12.67 19.68 25.65 33.64 ...
## $ motor_UPDRS : num 28.2 28.4 28.7 28.9 29.2 ...
## $ total_UPDRS : num 34.4 34.9 35.4 35.8 36.4 ...
## $ Jitter... : num 0.00662 0.003 0.00481 0.00528 0.00335 0.00353 0.00422 0.00476 0.00432 0.00496 ...
## $ Jitter.Abs. : num 3.38e-05 1.68e-05 2.46e-05 2.66e-05 2.01e-05 ...
## $ Jitter.RAP : num 0.00401 0.00132 0.00205 0.00191 0.00093 0.00119 0.00212 0.00226 0.00156 0.00258 ...
## $ Jitter.PPQ5 : num 0.00317 0.0015 0.00208 0.00264 0.0013 0.00159 0.00221 0.00259 0.00207 0.00253 ...
## $ Jitter.DDP : num 0.01204 0.00395 0.00616 0.00573 0.00278 ...
## $ Shimmer : num 0.0256 0.0202 0.0168 0.0231 0.017 ...
## $ Shimmer.dB. : num 0.23 0.179 0.181 0.327 0.176 0.214 0.445 0.212 0.371 0.31 ...
## $ Shimmer.APQ3 : num 0.01438 0.00994 0.00734 0.01106 0.00679 ...
## $ Shimmer.APQ5 : num 0.01309 0.01072 0.00844 0.01265 0.00929 ...
## $ Shimmer.APQ11: num 0.0166 0.0169 0.0146 0.0196 0.0182 ...
## $ Shimmer.DDA : num 0.0431 0.0298 0.022 0.0332 0.0204 ...
## $ NHR : num 0.0143 0.0111 0.0202 0.0278 0.0116 ...
## $ HNR : num 21.6 27.2 23 24.4 26.1 ...
## $ RPDE : num 0.419 0.435 0.462 0.487 0.472 ...
## $ DFA : num 0.548 0.565 0.544 0.578 0.561 ...
## $ PPE : num 0.16 0.108 0.21 0.333 0.194 ...
## - attr(*, ".internal.selfref")=<externalptr>
# structure overview to confirm types
head(parkinsons)
## subject. age sex test_time motor_UPDRS total_UPDRS Jitter...
## <int> <int> <fctr> <num> <num> <num> <num>
## 1: 1 72 0 5.6431 28.199 34.398 0.00662
## 2: 1 72 0 12.6660 28.447 34.894 0.00300
## 3: 1 72 0 19.6810 28.695 35.389 0.00481
## 4: 1 72 0 25.6470 28.905 35.810 0.00528
## 5: 1 72 0 33.6420 29.187 36.375 0.00335
## 6: 1 72 0 40.6520 29.435 36.870 0.00353
## Jitter.Abs. Jitter.RAP Jitter.PPQ5 Jitter.DDP Shimmer Shimmer.dB.
## <num> <num> <num> <num> <num> <num>
## 1: 3.380e-05 0.00401 0.00317 0.01204 0.02565 0.230
## 2: 1.680e-05 0.00132 0.00150 0.00395 0.02024 0.179
## 3: 2.462e-05 0.00205 0.00208 0.00616 0.01675 0.181
## 4: 2.657e-05 0.00191 0.00264 0.00573 0.02309 0.327
## 5: 2.014e-05 0.00093 0.00130 0.00278 0.01703 0.176
## 6: 2.290e-05 0.00119 0.00159 0.00357 0.02227 0.214
## Shimmer.APQ3 Shimmer.APQ5 Shimmer.APQ11 Shimmer.DDA NHR HNR RPDE
## <num> <num> <num> <num> <num> <num> <num>
## 1: 0.01438 0.01309 0.01662 0.04314 0.014290 21.640 0.41888
## 2: 0.00994 0.01072 0.01689 0.02982 0.011112 27.183 0.43493
## 3: 0.00734 0.00844 0.01458 0.02202 0.020220 23.047 0.46222
## 4: 0.01106 0.01265 0.01963 0.03317 0.027837 24.445 0.48730
## 5: 0.00679 0.00929 0.01819 0.02036 0.011625 26.126 0.47188
## 6: 0.01006 0.01337 0.02263 0.03019 0.009438 22.946 0.53949
## DFA PPE
## <num> <num>
## 1: 0.54842 0.16006
## 2: 0.56477 0.10810
## 3: 0.54405 0.21014
## 4: 0.57794 0.33277
## 5: 0.56122 0.19361
## 6: 0.57243 0.19500
# another quick peek
# ---- 3) Train/Test split ----
library(caret)
# we'll use caret to split the data
set.seed(42)
# set seed again for the split
idx <- createDataPartition(parkinsons$total_UPDRS, p = 0.8, list = FALSE)
# stratified split index (80% train)
train <- parkinsons[idx, ]
# training set
test <- parkinsons[-idx, ]
# test set
# ---- 4) Model matrices (for glmnet) ----
# Build a formula that includes all predictors except the response
x_formula <- as.formula(paste("~", paste(setdiff(names(train), "total_UPDRS"), collapse = "+")))
x_train <- model.matrix(x_formula, data = train)[, -1, drop = FALSE]
# design matrix for train (drop intercept col)
x_test <- model.matrix(x_formula, data = test)[, -1, drop = FALSE]
# design matrix for test (drop intercept col)
y_train <- train$total_UPDRS
# numeric response for train
y_test <- test$total_UPDRS
# numeric response for test
# ---- 5) Helper: simple metrics ----
metrics <- function(obs, pred) {
# small function to compute common metrics
rmse <- RMSE(pred, obs)
# root mean squared error
mae <- MAE(pred, obs)
# mean absolute error
r2 <- cor(obs, pred)^2
# squared correlation as a simple R^2
c(RMSE = rmse, MAE = mae, R2 = r2) # return a named numeric vector
}
# =============================================================================
# A) RIDGE (alpha = 0)
# =============================================================================
library(glmnet) # ensure glmnet is attached
set.seed(42) # seed for cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 10, # 10-fold CV for ridge
standardize = TRUE)
plot(cv_ridge) # show CV curve (λ vs. CV error) in Plots pane
title("Ridge CV Curve (alpha = 0)") # add a title on top

ridge_fit <- glmnet(x_train, y_train, alpha = 0, # fit ridge at the best λ from CV
lambda = cv_ridge$lambda.min, standardize = TRUE)
ridge_coef <- coef(ridge_fit) # coefficient vector (including intercept)
ridge_coef_df <- data.frame( # tidy up as a data frame
Feature = rownames(ridge_coef),
Coefficient = as.numeric(ridge_coef)
)
print(head(ridge_coef_df, 15)) # show the first few ridge coefficients
## Feature Coefficient
## 1 (Intercept) 4.226154e+00
## 2 subject. 5.651063e-02
## 3 age 8.832910e-02
## 4 sex1 -1.796332e+00
## 5 test_time 2.695492e-03
## 6 motor_UPDRS 1.083190e+00
## 7 Jitter... -3.913412e+01
## 8 Jitter.Abs. 4.805318e+03
## 9 Jitter.RAP 5.117704e+01
## 10 Jitter.PPQ5 -4.440975e+00
## 11 Jitter.DDP 1.433628e+01
## 12 Shimmer -3.887519e+00
## 13 Shimmer.dB. -5.807030e-01
## 14 Shimmer.APQ3 3.647852e+00
## 15 Shimmer.APQ5 9.023994e+00
ridge_pred_tr <- as.numeric(predict(ridge_fit, newx = x_train)) # fitted values on train
ridge_pred_te <- as.numeric(predict(ridge_fit, newx = x_test)) # predictions on test
ridge_tr_m <- metrics(y_train, ridge_pred_tr) # train metrics
ridge_te_m <- metrics(y_test, ridge_pred_te) # test metrics
print(ridge_te_m) # print ridge test metrics
## RMSE MAE R2
## 3.2685552 2.3710977 0.9125407
ridge_path <- glmnet(x_train, y_train, alpha = 0, standardize = TRUE) # refit across many λ for path plotting
plot(ridge_path, xvar = "lambda", label = FALSE) # ridge coefficient paths vs λ
title("Ridge Coefficient Paths") # add title

ridge_resid <- y_test - ridge_pred_te # residuals on the test set
plot(ridge_pred_te, ridge_resid, pch = 20, # residuals vs fitted to check structure
main = "Ridge: Residuals vs Fitted", xlab = "Fitted", ylab = "Residuals", col = "pink")
abline(h = 0, lty = 2) # reference line at zero

qqnorm(ridge_resid, main = "Ridge: Normal Q-Q", col = "purple") # Q-Q plot for residual normality check
qqline(ridge_resid) # reference line

# =============================================================================
# B) LASSO (alpha = 1)
# =============================================================================
set.seed(42) # seed for cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 10, # 10-fold CV for lasso
standardize = TRUE)
plot(cv_lasso) # show CV curve for lasso in Plots pane
title("LASSO CV Curve (alpha = 1)") # add title

lasso_fit <- glmnet(x_train, y_train, alpha = 1, # fit lasso at best λ
lambda = cv_lasso$lambda.min, standardize = TRUE)
lasso_coef <- coef(lasso_fit) # coefficient vector (includes intercept)
lasso_coef_df <- data.frame(Feature = rownames(lasso_coef), # tidy coefficients
Coefficient = as.numeric(lasso_coef))
print(head(lasso_coef_df, 15)) # show a sample of coefficients
## Feature Coefficient
## 1 (Intercept) 1.938297e+00
## 2 subject. 4.589583e-02
## 3 age 7.294393e-02
## 4 sex1 -1.769383e+00
## 5 test_time 1.895935e-03
## 6 motor_UPDRS 1.206547e+00
## 7 Jitter... -3.341993e+02
## 8 Jitter.Abs. 1.695020e+04
## 9 Jitter.RAP 4.515372e+02
## 10 Jitter.PPQ5 5.187883e+01
## 11 Jitter.DDP 0.000000e+00
## 12 Shimmer -3.029778e+01
## 13 Shimmer.dB. 0.000000e+00
## 14 Shimmer.APQ3 0.000000e+00
## 15 Shimmer.APQ5 9.309207e+01
lasso_selected <- subset(lasso_coef_df, Feature != "(Intercept)" & # keep features with non-zero coeffs
abs(Coefficient) > 0)
print(lasso_selected[order(-abs(lasso_selected$Coefficient)), ][1:20, ]) # show top 20 by absolute size
## Feature Coefficient
## 8 Jitter.Abs. 1.695020e+04
## 9 Jitter.RAP 4.515372e+02
## 7 Jitter... -3.341993e+02
## 15 Shimmer.APQ5 9.309207e+01
## 10 Jitter.PPQ5 5.187883e+01
## 16 Shimmer.APQ11 -4.078055e+01
## 12 Shimmer -3.029778e+01
## 17 Shimmer.DDA -4.856863e+00
## 18 NHR -4.758957e+00
## 22 PPE -3.686316e+00
## 21 DFA -3.680895e+00
## 20 RPDE 2.652370e+00
## 4 sex1 -1.769383e+00
## 6 motor_UPDRS 1.206547e+00
## 19 HNR -7.817605e-02
## 3 age 7.294393e-02
## 2 subject. 4.589583e-02
## 5 test_time 1.895935e-03
## NA <NA> NA
## NA.1 <NA> NA
lasso_pred_tr <- as.numeric(predict(lasso_fit, newx = x_train)) # fitted values on train
lasso_pred_te <- as.numeric(predict(lasso_fit, newx = x_test)) # predictions on test
lasso_tr_m <- metrics(y_train, lasso_pred_tr) # train metrics
lasso_te_m <- metrics(y_test, lasso_pred_te) # test metrics
print(lasso_te_m) # print lasso test metrics
## RMSE MAE R2
## 3.1319922 2.3101629 0.9137848
lasso_path <- glmnet(x_train, y_train, alpha = 1, standardize = TRUE) # refit across λ for path plotting
plot(lasso_path, xvar = "lambda", label = FALSE) # lasso coefficient paths vs λ
title("LASSO Coefficient Paths") # add title

lasso_resid <- y_test - lasso_pred_te # residuals on test
plot(lasso_pred_te, lasso_resid, pch = 20, # residuals vs fitted
main = "LASSO: Residuals vs Fitted", xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2) # reference line

qqnorm(lasso_resid, main = "LASSO: Normal Q-Q") # Q-Q plot
qqline(lasso_resid) # reference line

# C) PLS (pls::plsr)
# =============================================================================
library(pls) # attach pls for plsr()
train_pls <- cbind.data.frame(total_UPDRS = y_train, as.data.frame(x_train)) # build a data.frame for plsr
test_pls <- cbind.data.frame(total_UPDRS = y_test, as.data.frame(x_test)) # same for test
set.seed(42) # seed for CV component selection
pls_cv <- plsr(total_UPDRS ~ ., data = train_pls, scale = TRUE, # fit PLS with scaling and K-fold CV
validation = "CV", segments = 10)
plot(RMSEP(pls_cv), main = "PLS: CV RMSEP by # Components") # visualize CV error across components

rmsep_vals <- drop(RMSEP(pls_cv)$val[1,1,]) # pull CV RMSEP for 0..m comps into a vector
best_comp <- which.min(rmsep_vals) - 1 # index minus 1 = number of components
if (best_comp < 1) best_comp <- 1 # ensure we pick at least 1 component
cat("Best number of PLS components:", best_comp, "\n") # print chosen number of comps
## Best number of PLS components: 14
pls_pred_tr <- as.numeric(predict(pls_cv, ncomp = best_comp, # predict on train set with chosen comps
newdata = train_pls))
pls_pred_te <- as.numeric(predict(pls_cv, ncomp = best_comp, # predict on test set with chosen comps
newdata = test_pls))
pls_tr_m <- metrics(train_pls$total_UPDRS, pls_pred_tr) # train metrics
pls_te_m <- metrics(test_pls$total_UPDRS, pls_pred_te) # test metrics
print(pls_te_m) # print PLS test metrics
## RMSE MAE R2
## 3.1335417 2.3166312 0.9136953
pls_coef <- coef(pls_cv, ncomp = best_comp, intercept = TRUE) # get coefficients including intercept
pls_coef_df <- data.frame(Feature = rownames(pls_coef), # tidy coefficients
Coefficient = as.numeric(pls_coef))
print(head(pls_coef_df, 15)) # show a sample of PLS coefficients
## Feature Coefficient
## 1 (Intercept) 2.2703213
## 2 subject. 0.5738272
## 3 age 0.6578161
## 4 sex1 -0.8420905
## 5 test_time 0.1094232
## 6 motor_UPDRS 9.8276103
## 7 Jitter... -2.1379967
## 8 Jitter.Abs. 0.5960258
## 9 Jitter.RAP 0.8588487
## 10 Jitter.PPQ5 0.1520581
## 11 Jitter.DDP 0.8577049
## 12 Shimmer -0.7315441
## 13 Shimmer.dB. -0.1897086
## 14 Shimmer.APQ3 -0.1845148
## 15 Shimmer.APQ5 1.8911502
pls_resid <- test_pls$total_UPDRS - pls_pred_te # residuals for diagnostic plots
plot(pls_pred_te, pls_resid, pch = 20, # residuals vs fitted
main = paste0("PLS: Residuals vs Fitted (", best_comp, " comps)"),
xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2) # reference line

qqnorm(pls_resid, main = "PLS: Normal Q-Q") # Q-Q plot for residuals
qqline(pls_resid) # reference line

# =============================================================================
# D) Compare models on the test set
# =============================================================================
library(dplyr) # attach dplyr for binding/formatting
comp <- bind_rows( # bind metric rows for each model
data.frame(Model = "Ridge (lambda.min)", t(ridge_te_m)),
data.frame(Model = "LASSO (lambda.min)", t(lasso_te_m)),
data.frame(Model = paste0("PLS (", best_comp, " comps)"), t(pls_te_m))
) %>% mutate(across(where(is.numeric), ~round(., 4))) # round numbers for neat printing
print(comp) # show the comparison table in console
## Model RMSE MAE R2
## 1 Ridge (lambda.min) 3.2686 2.3711 0.9125
## 2 LASSO (lambda.min) 3.1320 2.3102 0.9138
## 3 PLS (14 comps) 3.1335 2.3166 0.9137
# (Optional) quick bar plot of RMSE by model for your report
library(ggplot2) # use ggplot2 for a quick visual
ggplot(comp, aes(x = Model, y = RMSE)) + # RMSE by model
geom_col() + # simple columns
ggtitle("Test RMSE by Model") + # title for context
xlab("") + ylab("RMSE") + # axis labels
theme_minimal() + # clean theme
theme(axis.text.x = element_text(angle = 20, hjust = 1)) # tilt labels for readability
