Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
# Start R and use these commands to load the data:
# Load the necessary libraries
library(AppliedPredictiveModeling)
library(caret)   # nearZeroVar, createDataPartition, preProcess
library(pls)     # plsr, RMSEP
library(glmnet)  # ridge, lasso, and elastic net models
library(MASS)    # lm.ridge
data(permeability)
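A quick look at the loaded objects (a sketch; output omitted) confirms the dimensions described above:
dim(fingerprints) # expect 165 compounds by 1,107 fingerprint predictors
length(permeability) # expect 165 permeability values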
The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
nzv <- nearZeroVar(fingerprints)
str(nzv)
## int [1:719] 7 8 9 10 13 14 17 18 19 22 ...
head(nzv)
## [1] 7 8 9 10 13 14
# Filter low-frequency predictors using nearZeroVar function
nzv <- nearZeroVar(fingerprints)
# Remove predictors with low variance
fingerprints_filtered <- fingerprints[, -nzv]
str(fingerprints_filtered)
## num [1:165, 1:388] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr [1:388] "X1" "X2" "X3" "X4" ...
# Check how many predictors are left for modeling
predictors_left <- ncol(fingerprints_filtered)
print(paste("Number of predictors left:", predictors_left))
## [1] "Number of predictors left: 388"
# Re-check
predictors_left <- fingerprints[,-nearZeroVar(fingerprints)]
predictors_left |>
dim()
## [1] 165 388
###(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
# Filter low-variance predictors
filtered_predictors <- fingerprints[, -nearZeroVar(fingerprints)]
# Set seed for reproducibility and split the data into training and test sets
set.seed(3)
train_test_split <- createDataPartition(permeability, p = 0.7, list = FALSE)
# Create the training and test sets
train.x <- filtered_predictors[train_test_split, ] # Predictors for training
train.y <- permeability[train_test_split] # Response variable for training
test.x <- filtered_predictors[-train_test_split, ] # Predictors for testing
test.y <- permeability[-train_test_split] # Response variable for testing
# Assemble a training data frame (response plus predictors) and train the PLS model with cross-validation
train_data <- data.frame(Response = train.y, train.x)
pls_model <- plsr(Response ~ .,
data = train_data,
validation = "CV", # Cross-validation
ncomp = 20) # Try up to 20 components
# Summary of the model
summary(pls_model)
## Data: X dimension: 117 388
## Y dimension: 117 1
## Fit method: kernelpls
## Number of components considered: 20
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 15.4 12.94 11.82 11.69 11.66 11.66 11.41
## adjCV 15.4 12.93 11.77 11.59 11.50 11.43 11.20
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 11.22 10.97 11.02 11.20 11.54 11.62 11.72
## adjCV 11.01 10.79 10.78 10.92 11.21 11.29 11.34
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 11.75 11.73 11.91 12.05 12.31 12.65 13.08
## adjCV 11.36 11.35 11.51 11.63 11.88 12.19 12.60
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 20.35 34.96 40.24 45.07 49.63 56.91 60.87
## Response 34.49 51.33 60.27 66.48 72.88 75.73 78.27
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 64.99 68.10 70.73 72.48 75.67 77.12 79.05
## Response 80.60 83.25 84.85 86.53 87.45 88.77 89.30
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 81.62 83.48 84.49 85.76 86.91 88.34
## Response 89.69 90.16 90.75 91.11 91.46 91.80
# Tune the PLS model by fitting it with different numbers of components
ncomp_range <- 1:20
press_values <- numeric(length(ncomp_range))
for (i in ncomp_range) {
pls_model <- plsr(Response ~ ., data = train_data, ncomp = i, validation = "CV")
press_values[i] <- min(pls_model$validation$PRESS) # Store PRESS for each component count
}
# Plot PRESS values to visualize performance
plot(ncomp_range, press_values, type = "b", pch = 19,
xlab = "Number of Components", ylab = "PRESS",
main = "PRESS Values for Different Number of Components")
# Identify the optimal number of components based on the minimum PRESS
optimal_ncomp <- which.min(press_values)
cat("Optimal number of latent variables:", optimal_ncomp, "\n")
## Optimal number of latent variables: 18
# Fit the final PLS model with the optimal number of components
final_pls_model <- plsr(Response ~ ., data = train_data, ncomp = optimal_ncomp)
# Predict on the test set
predictions <- predict(final_pls_model, newdata = test.x, ncomp = optimal_ncomp)
head(predictions)
## , , 18 comps
##
## Response
## 2 4.808946
## 3 15.376591
## 8 1.426698
## 11 10.466658
## 22 18.875668
## 24 17.508632
# Estimate R^2 for the PLS model on the test set
predicted <- predict(final_pls_model, newdata = test.x, ncomp = optimal_ncomp)
test_R2 <- cor(test.y, predicted)^2
test_R2
## [1] 0.3384419
The R² value of 0.338 on the test set suggests that the Partial Least Squares (PLS) model explains approximately 33.8% of the variance in the test-set permeability values. This is relatively low explanatory power: a substantial portion of the variance in the test data remains unexplained by the model.
# Output optimal latent variables and corresponding R^2
cat("Optimal number of latent variables:", optimal_ncomp, "\n")
## Optimal number of latent variables: 18
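The exercise also asks for the resampled estimate of R². One way to read it off directly, rather than working through PRESS by hand, is caret's train(), which reports cross-validated RMSE and R² for each number of components. A minimal sketch, assuming the train.x and train.y objects created above and using centering and scaling as the pre-processing step (output not shown):
# Cross-validated PLS tuning via caret (sketch)
set.seed(3)
pls_caret <- train(x = train.x, y = train.y,
method = "pls",
tuneLength = 20,
preProcess = c("center", "scale"),
trControl = trainControl(method = "cv", number = 10))
# Resampled RMSE and R^2 at the selected number of components
subset(pls_caret$results, ncomp == pls_caret$bestTune$ncomp,
select = c(ncomp, RMSE, Rsquared))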
# Define the range of components and re-run the cross-validated search (plsr's CV segments are drawn at random, so a repeat run without resetting the seed can favor a different number of components)
ncomp_range <- 1:20
press_values <- numeric(length(ncomp_range))
# Fit the PLS model with different numbers of components and calculate PRESS
for (i in ncomp_range) {
pls_model <- plsr(Response ~ ., data = train_data, ncomp = i, validation = "CV")
press_values[i] <- min(pls_model$validation$PRESS) # Store the minimum PRESS for each component count
}
# Plot the PRESS values for each component count
plot(
ncomp_range, press_values,
type = "b", pch = 19, col = "blue",
xlab = "Number of Components",
ylab = "PRESS (Prediction Residual Error Sum of Squares)",
main = "PRESS vs. Number of Components in PLS Model"
)
# Fit the PLS model using cross-validation and specify the range of components
pls_model <- plsr(Response ~ ., data = train_data, ncomp = max(ncomp_range), validation = "CV")
# Plot the Root Mean Squared Error of Prediction (RMSEP) for each component
plot(
RMSEP(pls_model),
legendpos = "topright", # Position the legend at the top right
main = "PLS Model Validation Results",
xlab = "Number of Components",
ylab = "RMSEP"
)
# Predict on the test set using the optimal number of components found
y_pred <- predict(pls_model, newdata = test.x, ncomp = optimal_ncomp)
# Calculate R^2 for the PLS model on the test set
pls_R2 <- cor(test.y, y_pred)^2
cat("PLS Model R^2 for the test set:", pls_R2, "\n")
## PLS Model R^2 for the test set: 0.3384419
# Determine the optimal number of components based on minimum PRESS
optimal_ncomp <- which.min(press_values)
cat("Optimal number of latent variables:", optimal_ncomp, "\n")
## Optimal number of latent variables: 8
# Fit the final PLS model with the optimal number of components
final_pls_model <- plsr(Response ~ ., data = train_data, ncomp = optimal_ncomp)
final_pls_model
## Partial least squares regression, fitted with the kernel algorithm.
## Call:
## plsr(formula = Response ~ ., ncomp = optimal_ncomp, data = train_data)
# Plot the validation results
plot(RMSEP(final_pls_model), legendpos = "topright") # Plot RMSEP for the final model
This plot provides a clear, intuitive view of how the RMSE changes with each additional component, helping to visually identify the point where additional components don’t significantly improve the model fit.
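As an alternative to judging the elbow by eye, the pls package provides selectNcomp(), which applies a one-sigma (or randomization) heuristic to the cross-validation curve. A minimal sketch, assuming the cross-validated pls_model fitted above:
# Suggest a parsimonious number of components from the CV results (sketch)
selectNcomp(pls_model, method = "onesigma", plot = TRUE)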
###(d) Predict the response for the test set. What is the test set estimate of R²?
Calculate R² for the test-set predictions:
# Predict the response for the test set using the optimal number of components
predictions <- predict(pls_model, newdata = test.x, ncomp = optimal_ncomp)
# Calculate R^2 for the test set predictions
test_R2 <- cor(test.y, predictions)^2
cat("Test set R^2:", test_R2, "\n")
## Test set R^2: 0.4386452
###(e) Try building other models discussed in this chapter. Do any have better predictive performance?
data(permeability)
set.seed(13)
preProc <- preProcess(train.x, method = c("center", "scale"))
train.x <- predict(preProc, train.x)
test.x <- predict(preProc, test.x)
# Create a data frame for training
train_data <- data.frame(Response = train.y, train.x)
# Ridge Regression with lm.ridge from the MASS Package
ridge_model_mas <- lm.ridge(Response ~ ., data = data.frame(Response = train.y, train.x),
lambda = seq(0, 0.1, length = 100))
# Ridge regression using the glmnet function (the glmnet function uses alpha = 0)
ridge_model <- glmnet(as.matrix(train.x), train.y, alpha = 0, lambda = seq(0, 0.1, length = 100))
# Fit Lasso regression model (uses alpha = 1)
lasso_model <- glmnet(as.matrix(train.x), train.y, alpha = 1, lambda = seq(0, 0.1, length = 100))
#Elastic Net allows for both penalties; you can adjust alpha between 0 (Ridge) and 1 (Lasso):
# Fit Elastic Net model
elastic_net_model <- glmnet(as.matrix(train.x), train.y, alpha = 0.5, lambda = seq(0, 0.1, length = 100))
# Make predictions with Ridge model
ridge_pred <- predict(ridge_model, s = 0.01, newx = as.matrix(test.x))
ridge_pred
## s1
## 2 6.93321160
## 3 11.73993257
## 8 -17.50592061
## 11 -4.78600585
## 22 11.34585609
## 24 -18.42865197
## 25 -30.96591572
## 27 13.09280689
## 35 -4.47286166
## 44 -14.13524015
## 51 3.05114078
## 58 0.61707758
## 60 15.71750069
## 62 11.11903461
## 63 -10.55799787
## 70 10.30975964
## 72 -26.25990607
## 74 29.91911297
## 75 -3.42113177
## 79 30.49347808
## 82 -18.29923252
## 83 -11.62780211
## 86 -32.21163870
## 89 -16.63220004
## 90 0.06978239
## 92 -60.56138945
## 93 23.63770651
## 94 37.84272905
## 96 42.50053653
## 99 1.99425760
## 103 -0.30017992
## 105 -43.39208610
## 111 36.55355553
## 114 -11.14728500
## 121 34.32523284
## 122 27.09006177
## 123 45.13296151
## 124 27.09006177
## 129 12.04667103
## 134 4.99291948
## 136 -4.39485537
## 139 4.04977509
## 143 43.82474414
## 145 -6.99532583
## 150 -9.96612725
## 154 -31.98150498
## 156 36.33071552
## 162 6.71225086
# Make predictions with Lasso model
lasso_pred <- predict(lasso_model, s = 0.01, newx = as.matrix(test.x))
lasso_pred
## s1
## 2 5.8676744
## 3 12.2291536
## 8 -9.4641551
## 11 5.9712244
## 22 17.9290950
## 24 1.0567106
## 25 -14.4924704
## 27 14.5096883
## 35 1.0542362
## 44 -29.6425815
## 51 2.9826098
## 58 0.2568369
## 60 13.3158612
## 62 11.2639142
## 63 -3.3965634
## 70 8.4441296
## 72 -9.2223422
## 74 20.3485243
## 75 -0.3746506
## 79 17.0270139
## 82 -21.9568716
## 83 -26.2413979
## 86 -2.8480548
## 89 -40.1532453
## 90 -11.3768404
## 92 -25.0344502
## 93 6.2135184
## 94 24.4792596
## 96 37.5425270
## 99 1.5474932
## 103 3.7893442
## 105 -56.0701806
## 111 32.6104865
## 114 -6.3115719
## 121 27.8036709
## 122 42.5529569
## 123 44.7542220
## 124 42.5529569
## 129 9.4022448
## 134 7.4219299
## 136 -1.3054683
## 139 -11.1613540
## 143 44.1049979
## 145 -5.6593378
## 150 -10.9585817
## 154 -13.9905752
## 156 36.4989613
## 162 7.9345160
# Make predictions with Elastic Net model
elastic_net_pred <- predict(elastic_net_model, s = 0.01, newx = as.matrix(test.x))
elastic_net_pred
## s1
## 2 6.0464765
## 3 12.0887788
## 8 -15.2834523
## 11 3.6707785
## 22 16.7909918
## 24 -7.8421711
## 25 -22.2701240
## 27 13.1767507
## 35 -1.2457960
## 44 -30.7070647
## 51 2.8575518
## 58 0.5924951
## 60 14.4671649
## 62 10.4942954
## 63 -7.4463559
## 70 9.2114686
## 72 -22.8939417
## 74 24.3904088
## 75 -1.6583148
## 79 22.2000452
## 82 -23.2017637
## 83 -27.7944608
## 86 -0.1123813
## 89 -35.3465159
## 90 -12.9924724
## 92 -26.9399294
## 93 7.8090933
## 94 25.5236856
## 96 39.0750630
## 99 1.8080935
## 103 1.6526664
## 105 -62.8884332
## 111 34.1344619
## 114 -10.1502429
## 121 32.0639478
## 122 37.5701371
## 123 45.4112451
## 124 37.5701371
## 129 10.8048136
## 134 5.9198015
## 136 -1.8126745
## 139 -6.9216927
## 143 43.6356575
## 145 -5.9522979
## 150 -14.4066215
## 154 -20.1314191
## 156 36.1967687
## 162 7.3844036
I will evaluate the model predictions using RMSE and R-squared:
# Function to calculate RMSE
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2))
}
# Calculate RMSE for each model
ridge_rmse <- rmse(test.y, ridge_pred)
lasso_rmse <- rmse(test.y, lasso_pred)
elastic_net_rmse <- rmse(test.y, elastic_net_pred)
# Print RMSE values
cat("Ridge RMSE:", ridge_rmse, "\n")
## Ridge RMSE: 25.77621
cat("Lasso RMSE:", lasso_rmse, "\n")
## Lasso RMSE: 21.23168
cat("Elastic Net RMSE:", elastic_net_rmse, "\n")
## Elastic Net RMSE: 22.88874
Based on the RMSE metric, the Lasso model is the best performer, indicating that it provides the most accurate predictions for the dataset used in this analysis. The Elastic Net model is the second best: it performs better than Ridge but does not achieve the accuracy seen with Lasso. It comes down to the necessary trade-off between bias and variance; Lasso's ability to simplify the model may lead to better generalization, particularly in high-dimensional datasets. However, if interpretability is critical, exploring the coefficients of each model could provide insights into the most influential predictors. Additional evaluation metrics, such as R-squared and mean absolute error (MAE), are needed to complement the RMSE analysis.
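Following up on that point, here is a minimal sketch computing MAE and a sum-of-squares R² (1 - SSE/SST, which can differ slightly from the squared correlation used earlier) for the three sets of predictions above (output not shown):
# MAE and R^2 helpers (sketch)
mae <- function(actual, predicted) mean(abs(actual - predicted))
r2 <- function(actual, predicted) {
1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
}
data.frame(
Model = c("Ridge", "Lasso", "Elastic Net"),
MAE = c(mae(test.y, ridge_pred), mae(test.y, lasso_pred), mae(test.y, elastic_net_pred)),
R2 = c(r2(test.y, ridge_pred), r2(test.y, lasso_pred), r2(test.y, elastic_net_pred))
)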
library(glmnet)
# Fit the Ridge model
ridge_model <- glmnet(train.x, train.y, alpha = 0) # alpha = 0 for Ridge
# Cross-validation to find optimal lambda
set.seed(13)
cv_ridge <- cv.glmnet(train.x, train.y, alpha = 0)
optimal_lambda_ridge <- cv_ridge$lambda.min
# Calculate R^2 for Ridge
ridge_predictions <- predict(ridge_model, s = optimal_lambda_ridge, newx = test.x)
ridge_r_squared <- 1 - sum((test.y - ridge_predictions)^2) / sum((test.y - mean(test.y))^2)
# Print results
cat("Ridge Optimal Lambda:", optimal_lambda_ridge, "\n")
## Ridge Optimal Lambda: 99.96172
cat("Ridge R^2:", ridge_r_squared, "\n")
## Ridge R^2: 0.4189032
The ridge model has an explanatory power of approximately 42% (R² ≈ 0.419): it explains only about 42 percent of the variance in the test set.
####Lasso Regression
# Fit the Lasso model
lasso_model <- glmnet(train.x, train.y, alpha = 1) # alpha = 1 for Lasso
# Cross-validation to find optimal lambda
set.seed(13)
cv_lasso <- cv.glmnet(train.x, train.y, alpha = 1)
optimal_lambda_lasso <- cv_lasso$lambda.min # Optimal lambda
# Calculate R^2 for Lasso
lasso_predictions <- predict(lasso_model, s = optimal_lambda_lasso, newx = test.x)
lasso_r_squared <- 1 - sum((test.y - lasso_predictions)^2) / sum((test.y - mean(test.y))^2)
# Print results
cat("Lasso Optimal Lambda:", optimal_lambda_lasso, "\n")
## Lasso Optimal Lambda: 1.122894
cat("Lasso R^2:", lasso_r_squared, "\n")
## Lasso R^2: 0.461151
The Lasso model has an explanatory power of approximately 46% (R² ≈ 0.461): it explains only about 46 percent of the variance in the test set.
# Fit the Elastic Net model (choose alpha between 0 and 1)
alpha_value <- 0.5
elastic_net_model <- glmnet(train.x, train.y, alpha = alpha_value)
# Cross-validation to find optimal lambda
set.seed(13)
cv_enet <- cv.glmnet(train.x, train.y, alpha = alpha_value)
optimal_lambda_enet <- cv_enet$lambda.min
# Calculate R^2 for Elastic Net
enet_predictions <- predict(elastic_net_model, s = optimal_lambda_enet, newx = test.x)
enet_r_squared <- 1 - sum((test.y - enet_predictions)^2) / sum((test.y - mean(test.y))^2)
# Print results
cat("Elastic Net Optimal Lambda:", optimal_lambda_enet, "\n")
## Elastic Net Optimal Lambda: 2.143713
cat("Elastic Net R^2:", enet_r_squared, "\n")
## Elastic Net R^2: 0.4588529
The Elastic Net model has an explanatory power of approximately 46% (R² ≈ 0.459), essentially on par with Lasso: about 46 percent of the variance in the test set is explained.
###(f) Would you recommend any of your models to replace the permeability laboratory experiment?
ANSWER: The R² values for the selected models are all below 50%, indicating low explanatory power: a significant portion of the variance in the test set is left unexplained by all three models. As a result, they are all poor options for replacing the original laboratory experiment. However, Elastic Net emerges as the more promising choice if one were needed, due to its balance of predictive performance and stability, which suggests that it captures the underlying relationships in the data reasonably well. Overall, with further refinement and tuning, Elastic Net's combined penalties could provide better predictive power than, say, Partial Least Squares (PLS), especially for datasets like the permeability data, which have a small sample size and many predictors. This is in line with Max Kuhn and Kjell Johnson's guidance in Applied Predictive Modeling, where the authors state that such structural imbalances in a dataset make ordinary multiple linear regression or linear discriminant analysis unsuitable as predictive models.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
###(a) Start R and use these commands to load the data:
# Load the chemical manufacturing data
data(ChemicalManufacturingProcess)
# Check for any unusual values
boxplot(ChemicalManufacturingProcess$Yield, main = "Boxplot of Yield")
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
###(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
Impute missing values:
library(AppliedPredictiveModeling)
# Check for total missing values before imputation
total_missing_before <- sum(is.na(ChemicalManufacturingProcess))
total_missing_before
## [1] 106
# Impute Missing Values using an imputation loop
for(i in seq_along(ChemicalManufacturingProcess)) {
if (is.numeric(ChemicalManufacturingProcess[[i]])) {
ChemicalManufacturingProcess[[i]][is.na(ChemicalManufacturingProcess[[i]])] <- mean(ChemicalManufacturingProcess[[i]], na.rm = TRUE)
}
}
# Check for total missing values after imputation
total_missing_after <- sum(is.na(ChemicalManufacturingProcess))
total_missing_after
## [1] 0
# Alternative: imputation using kNN via caret's preProcess
# (note that ChemicalManufacturingProcess has already been mean-imputed above,
#  so this is shown as the approach one would use on the original data)
imputed.knn <- preProcess(ChemicalManufacturingProcess,
method = "knnImpute",
k = round(sqrt(nrow(ChemicalManufacturingProcess)))
)
imputed_data <- predict(imputed.knn, ChemicalManufacturingProcess)
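If I recall caret's behavior correctly, knnImpute also centers and scales the predictors, so imputed_data is on a different scale from the mean-imputed copy above. A quick sanity check that no missing values remain (output not shown):
# Sanity check: the kNN-imputed copy should contain no missing values
sum(is.na(imputed_data))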
###(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
####Understanding the data
library(e1071)
# Compute the skewness across columns.
skewValues <- apply(ChemicalManufacturingProcess, 2, skewness)
head(skewValues)
## Yield BiologicalMaterial01 BiologicalMaterial02
## 0.31095956 0.27331650 0.24412685
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## 0.02851075 1.73231530 0.30400532
# Loop through each numeric column and plot a histogram
numeric_columns <- sapply(ChemicalManufacturingProcess, is.numeric)
par(mfrow = c(1, 1)) # One histogram per plot window
for (col in names(ChemicalManufacturingProcess)[numeric_columns]) {
hist(ChemicalManufacturingProcess[[col]],
main = paste("Histogram of", col),
xlab = col,
col = "skyblue",
border = "black")
}
# Reset layout to default
par(mfrow = c(1, 1))
The histogram plots show that many of the variables are skewed, so I decided to apply a transformation to the data using the BoxCoxTrans function from caret.
# Apply the Box-Cox transformation to all numeric columns
library(dplyr)
transformed_data <- ChemicalManufacturingProcess %>%
mutate(across(where(is.numeric), ~ predict(BoxCoxTrans(.), .)))
# Filter out near-zero-variance predictors from the transformed data
near_zero <- nearZeroVar(transformed_data)
imputed.data <- transformed_data[, -near_zero]
Now we can split the data into a training and a test set.
set.seed(3)
# Split the transformed data into training and test sets
trainIndex <- createDataPartition(transformed_data$Yield, p = 0.75, list = FALSE)
training_set <- transformed_data[trainIndex, ]
test_set <- transformed_data[-trainIndex, ]
# Verify the split
cat("Training set size:", nrow(training_set), "\n")
## Training set size: 132
cat("Test set size:", nrow(test_set), "\n")
## Test set size: 44
I opted for a ridge regression model.
library(tidymodels)
library(parsnip)
# Define a ridge regression model specification
ridge_spec <- linear_reg(penalty = 1, mixture = 0) %>% # Ridge regression
set_engine("glmnet")
# Create a workflow for ridge regression
ridge_workflow <- workflow() %>%
add_model(ridge_spec) %>%
add_formula(Yield ~ .)
# Fit the ridge regression model
ridge_fit <- ridge_workflow %>%
fit(data = training_set)
# Make predictions and evaluate RMSE
ridge_predictions <- predict(ridge_fit, new_data = test_set) %>%
bind_cols(test_set)
# Build the design matrix and response vector for glmnet
y_train <- training_set$Yield
x_train <- as.matrix(training_set[, -which(names(training_set) == "Yield")]) # Exclude the response column
# Fit the ridge regression model using glmnet
ridge_model <- glmnet(x_train, y_train, alpha = 0)
# Cross-validate the penalty for this model
set.seed(3)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
# Plot the cross-validated mean squared error (MSE) against log(lambda)
plot(cv_ridge)
# Plot the coefficient path
plot(ridge_model, xvar = "lambda", label = TRUE)
title("Ridge Regression Coefficient Path")
cv_results <- data.frame(lambda = cv_ridge$lambda,
cvm = cv_ridge$cvm,
cvsd = cv_ridge$cvsd)
# Plot the cross-validation profiles
ggplot(cv_results, aes(x = log(lambda), y = cvm)) +
geom_line() +
geom_ribbon(aes(ymin = cvm - cvsd, ymax = cvm + cvsd), alpha = 0.2) +
labs(title = "Cross-Validation Profiles for Ridge Regression",
x = "Log(Lambda)",
y = "Cross-Validated Mean Squared Error (MSE)") +
theme_minimal()
# Load necessary libraries
library(tidymodels)
library(yardstick) # For metric calculations
# Set seed for reproducibility
set.seed(3)
# Create a recipe for preprocessing
recipe <- recipe(Yield ~ ., data = training_set)
# Create 10-fold cross-validation folds
cv_folds <- vfold_cv(training_set, v = 10)
# Define the ridge regression model specification with tuning
ridge_spec <- linear_reg(penalty = tune(), mixture = 0) %>%
set_engine("glmnet")
# Create a workflow
ridge_workflow <- workflow() %>%
add_model(ridge_spec) %>%
add_recipe(recipe)
# Define a grid of penalty values to tune
penalty_grid <- grid_regular(penalty(), levels = 30)
# Tune the model and save results to ridge_tuning_results
ridge_tuning_results <- tune_grid(
ridge_workflow,
resamples = cv_folds,
grid = penalty_grid
)
# Collect metrics and find the best results
best_results <- ridge_tuning_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
slice_min(mean, n = 1)
# Extract optimal penalty and RMSE
optimal_penalty <- best_results$penalty
optimal_rmse <- best_results$mean
# Print optimal penalty and RMSE
cat("Optimal Penalty:", optimal_penalty, "\n")
## Optimal Penalty: 3.290345e-05
cat("Optimal RMSE from CV:", optimal_rmse, "\n")
## Optimal RMSE from CV: 0.0001237152
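To actually carry the tuned penalty forward (rather than the fixed penalty = 1 used in the next section), one option is to finalize the workflow with the best value and refit. A minimal sketch, assuming the tuning objects above:
# Refit the workflow at the tuned penalty (sketch)
best_penalty <- select_best(ridge_tuning_results, metric = "rmse")
final_ridge_workflow <- finalize_workflow(ridge_workflow, best_penalty)
final_ridge_fit <- fit(final_ridge_workflow, data = training_set)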
###(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
First, let’s set up the cross-validation and store the training RMSE.
library(tidymodels)
set.seed(3)
# Set up cross-validation
cv_folds <- vfold_cv(training_set, v = 10) # 10-fold cross-validation
# Define a ridge regression model specification
ridge_spec <- linear_reg(penalty = 1, mixture = 0) %>% # Ridge regression
set_engine("glmnet")
# Create a workflow for ridge regression
ridge_workflow <- workflow() %>%
add_model(ridge_spec) %>%
add_formula(Yield ~ .)
# Fit the model using cross-validation
cv_results <- fit_resamples(ridge_workflow, resamples = cv_folds)
# Collect metrics, including RMSE
cv_metrics <- collect_metrics(cv_results)
training_rmse_value <- cv_metrics %>%
filter(.metric == "rmse") %>%
summarise(mean_rmse = mean(mean)) %>%
pull(mean_rmse)
cat("Cross-validated training RMSE:", training_rmse_value, "\n")
## Cross-validated training RMSE: 0.0001714473
Now that I have the training_rmse_value, I can proceed with predicting the test set response and calculating the RMSE for the test set as before:
# Fit the ridge regression model on the full training data
ridge_fit <- ridge_workflow %>% fit(data = training_set)
# Predict the response for the test set
ridge_predictions <- predict(ridge_fit, new_data = test_set) %>%
bind_cols(test_set)
# Calculate RMSE manually
test_rmse <- sqrt(mean((ridge_predictions$Yield - ridge_predictions$.pred) ^ 2))
# Display the RMSE for the test set
cat("RMSE on test set (Ridge, manual calculation):", test_rmse, "\n")
## RMSE on test set (Ridge, manual calculation): 0.0002002418
# Compare the training RMSE with the test RMSE
if (test_rmse < training_rmse_value) {
cat("The test set RMSE is lower than the training RMSE, indicating better generalization.\n")
} else if (test_rmse > training_rmse_value) {
cat("The test set RMSE is higher than the training RMSE, indicating potential overfitting.\n")
} else {
cat("The test set RMSE is equal to the training RMSE, indicating similar performance.\n")
}
## The test set RMSE is higher than the training RMSE, indicating potential overfitting.
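Because these RMSE values are on the Box-Cox-transformed Yield scale, they are hard to interpret on their own. A quick sketch of the corresponding test-set R² for the ridge predictions (output not shown):
# Test-set R^2 on the transformed Yield scale (sketch)
test_r2_ridge <- 1 - sum((ridge_predictions$Yield - ridge_predictions$.pred)^2) /
sum((ridge_predictions$Yield - mean(ridge_predictions$Yield))^2)
cat("Test set R^2 (Ridge):", test_r2_ridge, "\n")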
###(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
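The coef_df data frame used below is not constructed anywhere above. Here is a minimal sketch of one way to build it, assuming the glmnet ridge fit ridge_model (on x_train and y_train) and the tuned optimal_penalty from the sections above are the intended source:
# Build a data frame of ridge coefficients at the tuned penalty (sketch)
ridge_coefs <- as.matrix(coef(ridge_model, s = optimal_penalty)) # dense column of coefficients
coef_df <- data.frame(Predictors = rownames(ridge_coefs)[-1], # drop the intercept
Coefficient = ridge_coefs[-1, 1])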
top_vars <- coef_df %>%
arrange(desc(abs(Coefficient))) %>% # Sort by absolute value of coefficients
slice_head(n = 10) # Select the top 10
# Plotting the top 10 variables
ggplot(top_vars, aes(x = reorder(Predictors, abs(Coefficient)), y = abs(Coefficient))) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip coordinates for better readability
labs(title = "Top 10 Variables from Ridge Regression",
x = "Predictors",
y = "Importance Score (Absolute Coefficient)") +
theme_minimal()
The manufacturing process predictors dominate the list, not the biological material predictors.
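A quick tally supports this, assuming the predictor names keep the BiologicalMaterial/ManufacturingProcess prefixes from the original data (output not shown):
# Count how many of the top 10 are process vs. biological predictors (sketch)
table(ifelse(grepl("^ManufacturingProcess", top_vars$Predictors),
"Process", "Biological"))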
top_vars <- coef_df %>%
arrange(desc(abs(Coefficient)^2)) %>% # Sort by squared coefficient magnitude
slice_head(n = 10) # Select the top 10
# Plot the squared coefficients of the top 10 variables with lines
ggplot(top_vars, aes(x = reorder(Predictors, abs(Coefficient)^2), y = abs(Coefficient)^2)) +
geom_segment(aes(xend = Predictors, y = 0, yend = abs(Coefficient)^2), color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue", size = 3) + # Add points at the end of each line
coord_flip() + # Flip coordinates for better readability
labs(title = "Top 10 Variables from Ridge Regression (Squared Coefficients)",
x = "Predictors",
y = "Importance Score (Squared Absolute Coefficient)") +
theme_minimal()
###(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
library(dplyr)
library(corrplot)
library(reshape2)
selected_predictors <- top_vars$Predictors[1:10]
cor_data <- transformed_data %>%
dplyr::select(Yield, all_of(selected_predictors))
# Calculate the correlation matrix
cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")
# Melt the correlation matrix for ggplot
cor_melted <- melt(cor_matrix)
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1),
name = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
hjust = 1, color = "black"),
axis.text.y = element_text(color = "black")) +
labs(title = "Between-Predictor Correlations of Solubility Predictors",
x = "Predictors",
y = "Predictors")
The process predictors have a higher correlation with the response variable Yield than the biological material predictors. Given time and resource constraints, it is therefore recommended to focus improvement efforts on the manufacturing process settings rather than on the biological materials.
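One way to quantify this is to look directly at each top predictor's correlation with Yield. A minimal sketch, assuming the objects above (output not shown):
# Correlation of each top predictor with the response (sketch)
cor_with_yield <- sapply(selected_predictors,
function(p) cor(transformed_data[[p]], transformed_data$Yield,
use = "pairwise.complete.obs"))
round(sort(cor_with_yield, decreasing = TRUE), 3)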