First, ensure the necessary packages are installed and loaded in R.

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Define a function to install packages if not already installed
install_if_missing <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package)
    library(package, character.only = TRUE)
  }
}

# Install 'AppliedPredictiveModeling' and 'caret' if they are missing
install_if_missing("AppliedPredictiveModeling")
## Loading required package: AppliedPredictiveModeling
install_if_missing("caret")
## Loading required package: caret
## Loading required package: lattice
# Load the libraries and data
suppressPackageStartupMessages({
  library(AppliedPredictiveModeling)
  library(caret)
  library(ggplot2)
})

Exercise 7.2

Process the data

library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Fit and predict with KNN Model

# Load required libraries
library(caret)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.2
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use one less core than available
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Train the kNN model in parallel
knnModel <- train(
  x = trainingData$x,              # Feature data
  y = trainingData$y,              # Target variable
  method = "knn",                  # k-Nearest Neighbors method
  preProc = c("center", "scale"),  # Center and scale the data
  tuneLength = 10,                 # Number of hyperparameter combinations
  trControl = trainControl(method = "cv", number = 5)  # Cross-validation
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the final kNN model
print(knnModel)
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 160, 160, 160, 160, 160 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.310218  0.5640866  2.695817
##    7  3.183598  0.6130231  2.575609
##    9  3.133750  0.6548230  2.519030
##   11  3.135078  0.6653443  2.528709
##   13  3.106943  0.6907083  2.503302
##   15  3.081233  0.7116627  2.465985
##   17  3.100552  0.7236138  2.499389
##   19  3.169291  0.7099245  2.562107
##   21  3.192033  0.7134604  2.586728
##   23  3.216710  0.7143357  2.601841
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

Fit and predict with MARS Model

MARS selects the informative predictors (those named X1–X5): the variable importance output below ranks X1–X5 above zero and assigns X6 an importance of zero.

# Load necessary libraries
library(caret)
library(earth)  # For MARS model implementation
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(doParallel)  # For parallel processing

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use one less core than available
cl <- makeCluster(cores)  # Create a parallel cluster
registerDoParallel(cl)  # Register the parallel backend

# Train a MARS model in parallel
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",                # MARS method in caret
  preProc = c("center", "scale"),  # Preprocessing: centering and scaling
  tuneLength = 10,                 # Tune hyperparameters
  trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE)  # Cross-validation
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the model details
print(marsModel)
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 160, 160, 160, 160, 160 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.245901  0.2787321  3.483428
##    3      3.583229  0.4805401  2.881311
##    4      2.657913  0.7084960  2.125487
##    6      2.322011  0.7813465  1.833316
##    7      1.855381  0.8615696  1.450558
##    9      1.781814  0.8724492  1.364755
##   10      1.759272  0.8752761  1.376871
##   12      1.655616  0.8902637  1.281361
##   13      1.681933  0.8890678  1.315516
##   15      1.681012  0.8898265  1.320421
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 1.
# Predict on the test set
marsPred <- predict(marsModel, newdata = testData$x)

# Evaluate performance on the test set
marsPerformance <- postResample(pred = marsPred, obs = testData$y)
print(marsPerformance)
##      RMSE  Rsquared       MAE 
## 1.8136467 0.8677298 1.3911836
# Variable importance for MARS
marsImportance <- varImp(marsModel)
print(marsImportance)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   82.08
## X2   62.79
## X5   38.07
## X3   25.80
## X6    0.00
# Plot variable importance
plot(marsImportance, top = 15)

Fit and predict with SVM Model

# Load necessary libraries
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use one less core than available
cl <- makeCluster(cores)  # Create a parallel cluster
registerDoParallel(cl)  # Register the parallel backend

# Train an SVM model in parallel
svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",           # SVM with radial basis function kernel
  preProc = c("center", "scale"), # Preprocessing: centering and scaling
  tuneLength = 10,                # Evaluate 10 values of cost C (sigma is estimated and held constant)
  trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE)  # Cross-validation
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the model details
print(svmModel)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 160, 160, 160, 160, 160 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.580537  0.7936982  2.061208
##     0.50  2.291391  0.8098871  1.814238
##     1.00  2.103498  0.8286516  1.641077
##     2.00  1.971031  0.8425164  1.530163
##     4.00  1.917953  0.8489094  1.488262
##     8.00  1.919174  0.8490298  1.484001
##    16.00  1.921739  0.8486423  1.484417
##    32.00  1.921739  0.8486423  1.484417
##    64.00  1.921739  0.8486423  1.484417
##   128.00  1.921739  0.8486423  1.484417
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06851516
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06851516 and C = 4.
# Predict on the test set
svmPred <- predict(svmModel, newdata = testData$x)

# Evaluate performance on the test set
svmPerformance <- postResample(pred = svmPred, obs = testData$y)
print(svmPerformance)
##      RMSE  Rsquared       MAE 
## 2.0742936 0.8265221 1.5696844

Fit and predict with ANN Model

# Load necessary libraries
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use one less core than available
cl <- makeCluster(cores)  # Create a parallel cluster
registerDoParallel(cl)  # Register the parallel backend

# Define a tuning grid for the hidden-layer size and weight decay
tuneGrid <- expand.grid(size = c(1, 5, 10), decay = c(0.01, 0.1))

# Train the ANN model with the grid above
annModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneGrid = tuneGrid,
  trace = FALSE,
  trControl = trainControl(method = "cv", number = 5)
)


# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the model details
print(annModel)
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 160, 160, 160, 160, 160 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##    1    0.01   14.29446  0.7282426  13.41621
##    1    0.10   14.29464  0.7512374  13.41641
##    5    0.01   14.29445  0.7105693  13.41620
##    5    0.10   14.29454  0.7518310  13.41631
##   10    0.01   14.29444  0.7125623  13.41619
##   10    0.10   14.29452  0.7476741  13.41628
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 10 and decay = 0.01.
# Predict on the test set
annPred <- predict(annModel, newdata = testData$x)

# Evaluate performance on the test set
annPerformance <- postResample(pred = annPred, obs = testData$y)
print(annPerformance)
##       RMSE   Rsquared        MAE 
## 14.2769345  0.6154719 13.3869171

Evaluation Metrics

After running the models, compare their performances:

Best performing model

  • Best model: MARS gives the best overall performance, with the lowest RMSE and MAE and the highest R-squared.
  • Second best: SVM performs well but trails MARS on every metric.
  • Worst model: ANN performs very poorly here. The RMSE of about 14, close to the mean of the response itself, is the signature of fitting nnet without linout = TRUE: the default logistic output unit confines predictions to (0, 1), so the network cannot track a response that averages roughly 14. A corrected call is sketched below.
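Assuming the missing linout flag is the culprit, the corrected call would look like the following sketch (not re-run here); everything else matches the chunk above.

# Sketch of a corrected ANN fit for regression: linout = TRUE gives nnet a
# linear output unit, so predictions are no longer squashed into (0, 1)
annModelLinear <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneGrid = tuneGrid,             # same size/decay grid as above
  linout = TRUE,                   # essential for regression with nnet
  trace = FALSE,
  maxit = 500,
  trControl = trainControl(method = "cv", number = 5)
)
postResample(predict(annModelLinear, testData$x), testData$y)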

# Combine all results into a data frame
results <- data.frame(
  Model = c("MARS", "SVM", "ANN"),
  RMSE = c(marsPerformance["RMSE"], svmPerformance["RMSE"], annPerformance["RMSE"]),
  Rsquared = c(marsPerformance["Rsquared"], svmPerformance["Rsquared"], annPerformance["Rsquared"]),
  MAE = c(marsPerformance["MAE"], svmPerformance["MAE"], annPerformance["MAE"])
)

# Print the combined results
print(results)
##   Model      RMSE  Rsquared       MAE
## 1  MARS  1.813647 0.8677298  1.391184
## 2   SVM  2.074294 0.8265221  1.569684
## 3   ANN 14.276934 0.6154719 13.386917

Exercise 7.5

Here’s the complete workflow for comparing MARS, KNN, SVM, and ANN models on the chemical manufacturing process data:

Prepare the data

(1) Load the Chemical Manufacturing Process Data

The code loads the ChemicalManufacturingProcess data frame from the AppliedPredictiveModeling package, which will be split into predictors and a response variable (Yield, the percent yield).

  • The data frame contains 176 manufacturing runs and 58 columns: the response Yield (the first column) plus 57 biological and process predictors.
  • processPredictors is created by dropping the last column of the data frame; because Yield is actually the first column, it is still carried along here and is separated from the predictors in step (3) below. The response is also assigned explicitly to yield for reference.
# Load necessary library
if (!require("AppliedPredictiveModeling")) {
  install.packages("AppliedPredictiveModeling")
  library(AppliedPredictiveModeling)
}

# Load the data
data("ChemicalManufacturingProcess")
processPredictors <- ChemicalManufacturingProcess[, -ncol(ChemicalManufacturingProcess)] # Drop the last column; note that Yield is the first column, not the last, and is separated from the predictors later
yield <- ChemicalManufacturingProcess$Yield  # Assign the response explicitly (kept for reference; the train/test split below re-derives it as `response`)
# Review the data
#str(processPredictors)  
#str(yield)              # Response variable: percent yield
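A quick sanity check (a sketch, run after the chunk above) confirms the dimensions, the position of Yield, and the presence of missing values to impute:

dim(ChemicalManufacturingProcess)                      # 176 rows, 58 columns
which(names(ChemicalManufacturingProcess) == "Yield")  # Yield is column 1
sum(is.na(processPredictors))                          # number of cells needing imputation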

(2) Handle Missing Values with Imputation

  • Use caret’s preProcess to fill missing values.

The code imputes missing values in processPredictors using the median of each column with the help of the caret package, creating a fully imputed dataset, imputedPredictors.

# Load necessary libraries
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use one less core than available
cl <- makeCluster(cores)  # Create a parallel cluster
registerDoParallel(cl)  # Register the parallel backend

# Create a preprocessing object to impute missing values with the median
preProcessObj <- preProcess(
  processPredictors,
  method = "medianImpute"
)

# Apply the preprocessing object to impute missing values in the dataset
# (median imputation itself runs sequentially; the parallel backend above is
# mainly useful for the model-training chunks later on)
imputedPredictors <- predict(preProcessObj, processPredictors)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing
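A one-line check (assuming the chunk above has run) that the imputation left no missing values:

sum(is.na(imputedPredictors))  # expected to be 0 after median imputation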

(3) Split Data, Pre-process, and Train a Model

Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

  • The code removes near-zero-variance predictors, separates the response (Yield, the first column) from the predictors, and splits the data into training (80%) and test (20%) sets before the models are trained and evaluated below.
# Identify predictors with near-zero variance
nzv <- nearZeroVar(imputedPredictors)

# Filter out those predictors
filteredPredictors <- imputedPredictors[, -nzv]

# Separate the response variable 'Yield' (first column) from the predictors
response <- filteredPredictors[, 1]  # Extract the Yield column
filteredPredictors <- filteredPredictors[, -1]  # Remove Yield from the predictors

# Check the number of predictors remaining
num_predictors <- ncol(filteredPredictors)
print(paste("Number of predictors left after removing near-zero variance:", num_predictors))
## [1] "Number of predictors left after removing near-zero variance: 55"
# Split the data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(response, p = 0.8, list = FALSE)
trainData <- filteredPredictors[trainIndex, ]  # Training predictors
testData <- filteredPredictors[-trainIndex, ]  # Test predictors
trainYield <- response[trainIndex]  # Training response
testYield <- response[-trainIndex]  # Test response

# Print the number of rows in the training and test sets
print(paste("Training set size:", nrow(trainData)))
## [1] "Training set size: 144"
print(paste("Test set size:", nrow(testData)))
## [1] "Test set size: 32"

Train and Evaluate Models

Train MARS Model

The MARS (Multivariate Adaptive Regression Splines) model successfully identified key predictors for the response variable, achieving optimal performance with an RMSE of 1.4073, R-squared of 0.4525, and MAE of 1.1528 on the independent test set. These results were obtained using the final tuning parameters of nprune = 10 and degree = 1. During cross-validation, the model achieved an R-squared value of 0.5941, highlighting strong performance on resampled training data.

The variable importance analysis revealed the following as the most significant predictors:

  1. ManufacturingProcess32 (100% importance)
  2. ManufacturingProcess17 (56.57% importance)
  3. ManufacturingProcess09 (38.43% importance)
  4. ManufacturingProcess02 (25.70% importance)
  5. BiologicalMaterial02 and BiologicalMaterial03 (both with 18.60% importance)

This model balances interpretability and predictive accuracy, effectively identifying the most influential variables. However, the drop from the cross-validation R-squared (0.594) to the test-set R-squared (0.452) is a reminder to validate the model’s generalization ability on held-out data rather than relying on resampling alone.
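To see exactly which hinge functions survive the pruning, the final earth fit can be inspected directly (a sketch to run after the training chunk below):

# List the retained basis functions and their coefficients for the pruned model
summary(marsModel$finalModel)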

# Load necessary libraries
library(earth)
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Define the tuning grid for degree and nprune
marsGrid <- expand.grid(
  degree = 1:2,  # Allow up to second-degree interactions
  nprune = 2:38  # Number of terms to prune
)

# Fit the MARS model using train() with external resampling
set.seed(100)
marsModel <- train(
  x = trainData,          # Predictor variables
  y = trainYield,         # Response variable
  method = "earth",       # MARS method in caret
  tuneGrid = marsGrid,    # Grid of hyperparameters to explore
  trControl = trainControl(
    method = "cv",        # Cross-validation
    number = 10           # Number of folds
  )
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the tuned model
print(marsModel)
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  55 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 130, 130, 130, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      1.417036  0.4610563  1.1268899
##   1        3      1.270562  0.5426046  1.0152669
##   1        4      1.257285  0.5629884  1.0024011
##   1        5      1.259061  0.5647063  1.0030357
##   1        6      1.282213  0.5374464  1.0278309
##   1        7      1.276699  0.5302778  1.0188633
##   1        8      1.240021  0.5559535  0.9863856
##   1        9      1.203378  0.5882041  0.9721436
##   1       10      1.185759  0.5941380  0.9506741
##   1       11      2.875523  0.5147237  1.4148261
##   1       12      2.889647  0.5044998  1.4125068
##   1       13      2.911860  0.4906354  1.4252568
##   1       14      2.929364  0.4783386  1.4313547
##   1       15      2.935318  0.4798018  1.4315310
##   1       16      2.939199  0.4768222  1.4368428
##   1       17      2.939199  0.4768222  1.4368428
##   1       18      2.939199  0.4768222  1.4368428
##   1       19      2.939199  0.4768222  1.4368428
##   1       20      2.939199  0.4768222  1.4368428
##   1       21      2.939199  0.4768222  1.4368428
##   1       22      2.939199  0.4768222  1.4368428
##   1       23      2.939199  0.4768222  1.4368428
##   1       24      2.939199  0.4768222  1.4368428
##   1       25      2.939199  0.4768222  1.4368428
##   1       26      2.939199  0.4768222  1.4368428
##   1       27      2.939199  0.4768222  1.4368428
##   1       28      2.939199  0.4768222  1.4368428
##   1       29      2.939199  0.4768222  1.4368428
##   1       30      2.939199  0.4768222  1.4368428
##   1       31      2.939199  0.4768222  1.4368428
##   1       32      2.939199  0.4768222  1.4368428
##   1       33      2.939199  0.4768222  1.4368428
##   1       34      2.939199  0.4768222  1.4368428
##   1       35      2.939199  0.4768222  1.4368428
##   1       36      2.939199  0.4768222  1.4368428
##   1       37      2.939199  0.4768222  1.4368428
##   1       38      2.939199  0.4768222  1.4368428
##   2        2      1.372502  0.4735490  1.0990454
##   2        3      1.356502  0.4752559  1.0559439
##   2        4      1.276802  0.5549247  1.0099776
##   2        5      1.226674  0.5890922  0.9743090
##   2        6      1.236372  0.5810537  0.9938101
##   2        7      1.229469  0.5857615  0.9768368
##   2        8      1.223556  0.5912044  0.9861892
##   2        9      1.313564  0.5749585  1.0448934
##   2       10      1.340676  0.5565462  1.0739402
##   2       11      1.467545  0.4878662  1.1288558
##   2       12      1.519976  0.4695515  1.1510186
##   2       13      2.588658  0.4725595  1.4774373
##   2       14      2.648549  0.4769268  1.5012190
##   2       15      2.689316  0.4755803  1.5369717
##   2       16      2.884679  0.4803363  1.5883932
##   2       17      2.644325  0.5018611  1.5085782
##   2       18      2.621212  0.5020664  1.5044503
##   2       19      2.647251  0.4995565  1.4965914
##   2       20      2.507056  0.5047496  1.4801174
##   2       21      2.677305  0.5007535  1.5239616
##   2       22      2.652433  0.4989326  1.5103233
##   2       23      2.636900  0.5021532  1.4952947
##   2       24      2.753441  0.5018200  1.5335271
##   2       25      2.820024  0.5017712  1.5509031
##   2       26      2.934670  0.4886758  1.6113139
##   2       27      2.934670  0.4886758  1.6113139
##   2       28      2.934670  0.4886758  1.6113139
##   2       29      2.934670  0.4886758  1.6113139
##   2       30      2.934670  0.4886758  1.6113139
##   2       31      2.934670  0.4886758  1.6113139
##   2       32      2.934670  0.4886758  1.6113139
##   2       33      2.934670  0.4886758  1.6113139
##   2       34      2.934670  0.4886758  1.6113139
##   2       35      2.934670  0.4886758  1.6113139
##   2       36      2.934670  0.4886758  1.6113139
##   2       37      2.934670  0.4886758  1.6113139
##   2       38      2.934670  0.4886758  1.6113139
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
# Analyze variable importance
marsImportance <- varImp(marsModel)
print(marsImportance)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess17   56.57
## ManufacturingProcess09   38.43
## ManufacturingProcess02   25.70
## BiologicalMaterial02     18.60
## BiologicalMaterial03     18.60
## ManufacturingProcess39   13.33
## ManufacturingProcess22    0.00
# Plot variable importance
plot(marsImportance, top = 20)

# Predict on the test set
marsPredictions <- predict(marsModel, newdata = testData)

# Evaluate performance on the test set
marsPerformance <- postResample(pred = marsPredictions, obs = testYield)
print(marsPerformance)
##      RMSE  Rsquared       MAE 
## 1.4072635 0.4524617 1.1527858

Train k-Nearest Neighbors (KNN) Model

For the k-Nearest Neighbors (k-NN) model with \(k = 3\), the performance metrics indicate the following:

  • The R-squared value from cross-validation is 0.5342, which reflects the model’s ability to explain variance in the training data during resampling.
  • The R-squared value on the independent test set is 0.3350, demonstrating how well the model generalizes to unseen data.

While the cross-validation R-squared is higher, the drop in test set R-squared suggests some level of overfitting or differences in data characteristics between the training and test sets. This highlights the importance of validating model performance on an independent test set to ensure reliability.
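One convenient way to put the resampled and held-out estimates side by side is caret’s getTrainPerf(); a sketch, assuming the training chunk below has been run:

getTrainPerf(knnModel)                                          # cross-validated RMSE/R^2/MAE at k = 3
postResample(predict(knnModel, newdata = testData), testYield)  # test-set estimates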

# Load necessary libraries
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Define the tuning grid for the number of neighbors
knnGrid <- expand.grid(k = 1:20)

# Train the KNN model with caret
set.seed(123)
knnModel <- train(
  x = trainData,                     # Predictor variables
  y = trainYield,                    # Response variable
  method = "knn",                    # KNN method
  preProc = c("center", "scale"),    # Centering and scaling
  tuneGrid = knnGrid,                # Grid of hyperparameters to explore
  trControl = trainControl(
    method = "cv",                   # Cross-validation
    number = 5,                      # Number of folds
    allowParallel = TRUE             # Enable parallel processing during resampling
  )
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the KNN model results
print(knnModel)
## k-Nearest Neighbors 
## 
## 144 samples
##  55 predictor
## 
## Pre-processing: centered (55), scaled (55) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 116, 116, 115, 116, 113 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE      
##    1  1.462027  0.4168436  1.1290106
##    2  1.363514  0.4737576  1.0813046
##    3  1.273494  0.5342453  1.0043078
##    4  1.282900  0.5269851  0.9980377
##    5  1.328725  0.4913713  1.0411439
##    6  1.340986  0.4851244  1.0678474
##    7  1.373854  0.4565705  1.0915708
##    8  1.368188  0.4616640  1.0896447
##    9  1.336792  0.4894488  1.0587541
##   10  1.323616  0.5029194  1.0571762
##   11  1.335415  0.4975159  1.0736199
##   12  1.346281  0.4891835  1.0774769
##   13  1.347502  0.4940762  1.0823816
##   14  1.351050  0.4976379  1.0870508
##   15  1.369857  0.4875060  1.1052929
##   16  1.376861  0.4824909  1.0993577
##   17  1.392951  0.4669743  1.1137737
##   18  1.395873  0.4685117  1.1101456
##   19  1.400597  0.4679007  1.1205462
##   20  1.415488  0.4611594  1.1328655
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
# Predict on the test set
knnPredictions <- predict(knnModel, newdata = testData)

# Evaluate performance
knnPerformance <- postResample(pred = knnPredictions, obs = testYield)
print(knnPerformance)
##      RMSE  Rsquared       MAE 
## 1.5013193 0.3349698 1.1772917

Train Support Vector Machine (SVM) Model

The Support Vector Machine (SVM) model was successfully optimized using cross-validation, with RMSE as the evaluation metric. The optimal model was selected based on the smallest RMSE value during cross-validation, where it achieved a cross-validation R-squared of 0.6750. The final model parameters were sigma = 0.01525 (Gaussian kernel width) and C = 16 (cost parameter).

The performance metrics for the independent test set were:

  • RMSE: 1.2300
  • R-squared: 0.5530
  • MAE: 1.0424

The final SVM model, of type epsilon-Support Vector Regression (eps-svr), used 129 support vectors, which constituted a significant portion of the training set. The model achieved an objective function value of -83.1197 and a very low training error of 0.0093, indicating a well-fitted model.

The cross-validation R-squared (0.675) is noticeably higher than the test-set R-squared (0.553), so the resampling estimate is somewhat optimistic. Even so, the SVM generalizes better than the other nonlinear models fit here, capturing the underlying relationships in the data while keeping overfitting in check.
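The support-vector count quoted above can be pulled straight from the fitted kernlab object; a sketch (run after the chunk below) using kernlab’s SVindex() accessor:

# Fraction of training samples retained as support vectors
length(kernlab::SVindex(svmModel$finalModel)) / nrow(trainData)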

# Load necessary libraries
library(caret)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)
registerDoParallel(cl)

# Define a tuning grid for hyperparameter search
tuneGrid <- expand.grid(
  sigma = seq(0.001, 0.02, length = 5),  # Kernel width parameter
  C = c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2050)  # Cost parameter
)

# Train the SVM model using caret
set.seed(123)
svmModel <- train(
  x = trainData,                # Predictor variables
  y = trainYield,               # Response variable
  method = "svmRadial",         # Radial Basis Function kernel
  preProc = c("center", "scale"), # Automatically center and scale
  tuneGrid = tuneGrid,          # Grid of hyperparameters (sigma and C)
  trControl = trainControl(
    method = "cv",              # Cross-validation
    number = 10,                # 10-fold cross-validation
    allowParallel = TRUE        # Enable parallel processing during resampling
  )
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print the tuned SVM model
print(svmModel)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  55 predictor
## 
## Pre-processing: centered (55), scaled (55) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   sigma    C        RMSE      Rsquared   MAE      
##   0.00100     0.25  1.584724  0.4908850  1.2933921
##   0.00100     0.50  1.472165  0.5227248  1.2035928
##   0.00100     1.00  1.347364  0.5609412  1.1059130
##   0.00100     2.00  1.251732  0.5836637  1.0514179
##   0.00100     4.00  1.207624  0.5962614  1.0144070
##   0.00100     8.00  1.183270  0.6048753  0.9717196
##   0.00100    16.00  1.174018  0.6152566  0.9655921
##   0.00100    32.00  1.213477  0.6058961  1.0149432
##   0.00100    64.00  1.275442  0.5900019  1.0789686
##   0.00100   128.00  1.384748  0.5650622  1.1632247
##   0.00100   256.00  1.494103  0.5574347  1.1884871
##   0.00100   512.00  1.536853  0.5524136  1.1745447
##   0.00100  1024.00  1.429057  0.5634973  1.1021262
##   0.00100  2050.00  1.425374  0.5642553  1.1039291
##   0.00575     0.25  1.374990  0.5565957  1.1242310
##   0.00575     0.50  1.277043  0.5786877  1.0586592
##   0.00575     1.00  1.204491  0.6102910  0.9939477
##   0.00575     2.00  1.150214  0.6344192  0.9410259
##   0.00575     4.00  1.122581  0.6496301  0.9249104
##   0.00575     8.00  1.113802  0.6504783  0.9181936
##   0.00575    16.00  1.131446  0.6428268  0.9289700
##   0.00575    32.00  1.123222  0.6512749  0.9100888
##   0.00575    64.00  1.133427  0.6457598  0.9142969
##   0.00575   128.00  1.133427  0.6457598  0.9142969
##   0.00575   256.00  1.133427  0.6457598  0.9142969
##   0.00575   512.00  1.133427  0.6457598  0.9142969
##   0.00575  1024.00  1.133427  0.6457598  0.9142969
##   0.00575  2050.00  1.133427  0.6457598  0.9142969
##   0.01050     0.25  1.354291  0.5687365  1.1046404
##   0.01050     0.50  1.259387  0.5949316  1.0354559
##   0.01050     1.00  1.179686  0.6325881  0.9593929
##   0.01050     2.00  1.115066  0.6618773  0.9065678
##   0.01050     4.00  1.089756  0.6656807  0.8800131
##   0.01050     8.00  1.103614  0.6577296  0.8819586
##   0.01050    16.00  1.095728  0.6655347  0.8714994
##   0.01050    32.00  1.095600  0.6657098  0.8719145
##   0.01050    64.00  1.095600  0.6657098  0.8719145
##   0.01050   128.00  1.095600  0.6657098  0.8719145
##   0.01050   256.00  1.095600  0.6657098  0.8719145
##   0.01050   512.00  1.095600  0.6657098  0.8719145
##   0.01050  1024.00  1.095600  0.6657098  0.8719145
##   0.01050  2050.00  1.095600  0.6657098  0.8719145
##   0.01525     0.25  1.362528  0.5669301  1.1108764
##   0.01525     0.50  1.264952  0.6000550  1.0309710
##   0.01525     1.00  1.166932  0.6442221  0.9444841
##   0.01525     2.00  1.112524  0.6655948  0.8992675
##   0.01525     4.00  1.101407  0.6628631  0.8780349
##   0.01525     8.00  1.087566  0.6740149  0.8633659
##   0.01525    16.00  1.086919  0.6750417  0.8633120
##   0.01525    32.00  1.086919  0.6750417  0.8633120
##   0.01525    64.00  1.086919  0.6750417  0.8633120
##   0.01525   128.00  1.086919  0.6750417  0.8633120
##   0.01525   256.00  1.086919  0.6750417  0.8633120
##   0.01525   512.00  1.086919  0.6750417  0.8633120
##   0.01525  1024.00  1.086919  0.6750417  0.8633120
##   0.01525  2050.00  1.086919  0.6750417  0.8633120
##   0.02000     0.25  1.383589  0.5615123  1.1271756
##   0.02000     0.50  1.278280  0.5984133  1.0350035
##   0.02000     1.00  1.176618  0.6426173  0.9438365
##   0.02000     2.00  1.131785  0.6585769  0.9076618
##   0.02000     4.00  1.100867  0.6735171  0.8801702
##   0.02000     8.00  1.097831  0.6771105  0.8781162
##   0.02000    16.00  1.097831  0.6771105  0.8781162
##   0.02000    32.00  1.097831  0.6771105  0.8781162
##   0.02000    64.00  1.097831  0.6771105  0.8781162
##   0.02000   128.00  1.097831  0.6771105  0.8781162
##   0.02000   256.00  1.097831  0.6771105  0.8781162
##   0.02000   512.00  1.097831  0.6771105  0.8781162
##   0.02000  1024.00  1.097831  0.6771105  0.8781162
##   0.02000  2050.00  1.097831  0.6771105  0.8781162
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01525 and C = 16.
# Predict on the test set
svmPredictions <- predict(svmModel, newdata = testData)

# Evaluate the performance on the test set
svmPerformance <- postResample(pred = svmPredictions, obs = testYield)
print(svmPerformance)
##     RMSE Rsquared      MAE 
## 1.230027 0.553041 1.042387
# Print the final model details
svmModel$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.01525 
## 
## Number of Support Vectors : 129 
## 
## Objective Function Value : -83.1197 
## Training error : 0.009344

Train Artificial Neural Network (ANN) Model

ANN with Cross-Validation and Model Tuning (caret)

The Artificial Neural Network (ANN) model was trained using cross-validation to identify the optimal combination of hyperparameters for predictive performance. Based on the cross-validation results:

  • Optimal Hyperparameters:
    • Size (number of units in the hidden layer): 1
    • Decay (regularization parameter): 0.10
  • Cross-Validation Performance:
    • RMSE: 1.436522
    • R-squared: 0.4712951
    • MAE: 1.124480
  • Test Set Performance:
    • RMSE: 1.418033
    • R-squared: 0.430970
    • MAE: 1.200945

The model demonstrated effective generalization, with close alignment between the cross-validation and test-set metrics. The R-squared values indicate there is still room for improvement in explaining variance in the response. Notably, resampling favored the smallest network (a single hidden unit) with moderate weight decay, suggesting these data are better served by a simple, well-regularized model than by a larger network; the sketch below shows how to confirm the selected pair.
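A quick way to confirm the size/decay pair caret actually kept (a sketch to run after the training chunk below):

annModel$bestTune   # selected number of hidden units and weight decay
plot(annModel)      # resampling profile across the tuning grid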

# Load necessary libraries
library(caret)
library(doParallel)
library(kableExtra)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Define a tuning grid for hyperparameter search
nnetGrid <- expand.grid(
  .size = c(1:10),              # Number of units in the hidden layer
  .decay = c(0, 0.01, 0.1)      # Weight decay (regularization)
)

# Train the ANN model with caret
set.seed(123)
annModel <- train(
  x = trainData,                # Predictor variables
  y = trainYield,               # Response variable
  method = "nnet",              # ANN model
  preProc = c("center", "scale"), # Automatically center and scale
  tuneGrid = nnetGrid,          # Grid of hyperparameters to explore
  linout = TRUE,                # For regression
  trace = FALSE,                # Suppress printed output
  MaxNWts = 10 * (ncol(trainData) + 1) + 10 + 1, # Weight limit
  maxit = 500,                  # Maximum number of iterations
  trControl = trainControl(
    method = "cv",              # Cross-validation
    number = 5,                 # Number of folds
    allowParallel = TRUE        # Enable parallel processing during resampling
  )
)

# Predict on the test set
annModelPredictions <- predict(annModel, newdata = testData)

# Evaluate performance
performance <- postResample(pred = annModelPredictions, obs = testYield)

performance |>
  kable(caption = "Summary of ANN Performance") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Summary of ANN Performance
x
RMSE 1.418033
Rsquared 0.430970
MAE 1.200945
# Print the final model details
annModel$results |>
  kable(caption = "Summary of ANN Model") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Summary of ANN Model
size decay RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 0.00 1.689734 0.2542072 1.349821 0.1794880 0.1791168 0.1562689
1 0.01 1.561907 0.4187517 1.234869 0.2837755 0.1321660 0.2040772
1 0.10 1.436522 0.4712951 1.124480 0.2718533 0.1698442 0.2328798
2 0.00 1.645134 0.2585107 1.289944 0.2915022 0.2058334 0.2303881
2 0.01 1.867172 0.3413779 1.478404 0.2507002 0.1218222 0.1724573
2 0.10 2.509684 0.2571194 1.994840 0.6941452 0.1144497 0.4945125
3 0.00 2.050785 0.2086117 1.542577 0.8567400 0.1353936 0.5047662
3 0.01 2.555253 0.1740069 1.960693 0.5980771 0.0762876 0.3544072
3 0.10 3.508416 0.1174988 2.389504 1.0200151 0.0716593 0.4171840
4 0.00 2.341756 0.3372945 1.610766 1.2583672 0.1861665 0.5921126
4 0.01 2.821935 0.1277371 2.000191 0.5010514 0.0837186 0.3396468
4 0.10 2.535123 0.2921858 1.766428 0.9007859 0.1299448 0.3690240
5 0.00 3.417779 0.1635364 2.693057 0.7258504 0.1000456 0.6737322
5 0.01 2.790992 0.1837780 2.045475 0.4722601 0.1291922 0.1992326
5 0.10 2.943415 0.1461997 2.207419 0.6393891 0.1113244 0.3677718
6 0.00 3.723598 0.1351910 2.828667 1.2000072 0.1581778 0.8735173
6 0.01 2.587431 0.1541800 1.994730 0.3411992 0.1133942 0.2366837
6 0.10 2.819276 0.1728679 1.950022 0.7970351 0.0786146 0.2363705
7 0.00 3.199451 0.2165521 2.597800 0.8165988 0.1454107 0.7121076
7 0.01 2.677368 0.1903446 2.019389 0.5868093 0.1312342 0.4784302
7 0.10 3.278807 0.1466532 2.159866 1.2302435 0.1720565 0.4866422
8 0.00 4.900561 0.1320959 3.797158 1.0821378 0.1179410 0.9482748
8 0.01 2.454183 0.2284838 1.885210 0.4702424 0.1911357 0.3272997
8 0.10 2.233319 0.3440147 1.713529 0.5282493 0.1256910 0.2499077
9 0.00 5.196023 0.1549238 3.992858 1.4621434 0.1367316 1.2467610
9 0.01 2.721205 0.3134279 2.074909 0.5559672 0.0791680 0.2950494
9 0.10 2.852651 0.1707596 2.096973 0.6316644 0.0873419 0.2651230
10 0.00 7.230668 0.0536463 4.719107 2.9623238 0.0407995 1.7589904
10 0.01 3.403570 0.1862717 2.489796 1.3782233 0.2703826 0.7280932
10 0.10 2.637258 0.1974167 1.998243 0.8516244 0.1312045 0.5239237
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

(a) Which nonlinear regression model gives the optimal resampling and test set performance?

Evaluate Models on the Test Set

  1. Optimal Model:

    • I will compare RMSE and \(R^2\) for MARS, KNN, SVM, and ANN.
    • Select the model with the lowest RMSE and highest \(R^2\) on the test set.
  2. Optimal Model: SVM

The SVM model provides the optimal resampling and test set performance, achieving the lowest RMSE (1.230027), the highest R-squared (0.5530410), and the smallest MAE (1.042387) among the nonlinear regression models.

# Predict on the test set
marsPred <- predict(marsModel, newdata = testData)  # Automatically scaled
knnPred <- predict(knnModel, newdata = testData)    # Automatically scaled
svmPred <- predict(svmModel, newdata = testData)    # Automatically scaled
annPred <- predict(annModel, newdata = testData)    # Automatically scaled

# Calculate performance metrics
marsPerformance <- postResample(pred = marsPred, obs = testYield)
knnPerformance <- postResample(pred = knnPred, obs = testYield)
svmPerformance <- postResample(pred = svmPred, obs = testYield)
annPerformance <- postResample(pred = annPred, obs = testYield)

# Combine results into a single data frame
results <- data.frame(
  Model = c("MARS", "KNN", "SVM", "ANN"),
  RMSE = c(marsPerformance["RMSE"], knnPerformance["RMSE"], svmPerformance["RMSE"], annPerformance["RMSE"]),
  Rsquared = c(marsPerformance["Rsquared"], knnPerformance["Rsquared"], svmPerformance["Rsquared"], annPerformance["Rsquared"]),
  MAE = c(marsPerformance["MAE"], knnPerformance["MAE"], svmPerformance["MAE"], annPerformance["MAE"])
)

# Sort the results by RMSE in ascending order
sorted_results <- results[order(results$RMSE), ]

# Print the sorted results
sorted_results|>
  kable(caption = "Summary of Model Comparison") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Summary of Model Comparison
Model RMSE Rsquared MAE
3 SVM 1.230027 0.5530410 1.042387
1 MARS 1.407264 0.4524617 1.152786
4 ANN 1.418033 0.4309700 1.200945
2 KNN 1.501319 0.3349698 1.177292

(b) Analyze Variable Importance for SVM (the optimal nonlinear regression model)

Permutation-Based Feature Selection

Predictor importance:

  • Highlight which predictors dominate in the best nonlinear model.
  • Determine whether biological or process predictors dominate.

Based on the analysis of the bar chart:

  • ManufacturingProcess predictors: ManufacturingProcess32, ManufacturingProcess37, ManufacturingProcess09, ManufacturingProcess13, ManufacturingProcess36, ManufacturingProcess17, and ManufacturingProcess28.
    • Count: 7
  • BiologicalMaterial predictors: BiologicalMaterial03, BiologicalMaterial05, and BiologicalMaterial06.
    • Count: 3

From this analysis, manufacturing process predictors dominate the list of most important features in the model, as they make up the majority of the top ten important predictors. This suggests that process-related features have a stronger influence on the model’s predictions compared to biological features.
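The tally above can also be done programmatically; a small sketch, assuming (as for ggplot objects generally) that the vip object created below stores its underlying importance table in $data with a Variable column:

# Count process vs. biological predictors among the ten most important variables
top10 <- head(as.character(svmImportance$data$Variable), 10)
table(ifelse(grepl("^Manufacturing", top10), "Process", "Biological"))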

# Load necessary libraries
library(pdp)
## Warning: package 'pdp' was built under R version 4.4.2
library(vip)
## Warning: package 'vip' was built under R version 4.4.2
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Define a custom function for permutation-based feature importance
set.seed(123)
svmImportance <- vip(
  svmModel,
  method = "permute",                # Permutation-based importance
  train = trainData,                 # Correct training predictors
  target = trainYield,               # Correct target variable
  metric = "RMSE",                   # Performance metric (e.g., RMSE)
  nsim = 50,                         # Number of permutations
  pred_wrapper = predict             # Prediction function for the model
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Plot variable importance
plot(svmImportance)

Refit the model using the features selected by permutation-based feature selection

# Subset the training data to include only the 9 features selected above
# Ensure train and test data subsets are consistent
selected_features <- c("ManufacturingProcess32", "ManufacturingProcess37", "ManufacturingProcess09",
                       "ManufacturingProcess13", "ManufacturingProcess36", "ManufacturingProcess28",
                       "BiologicalMaterial03", "BiologicalMaterial05", "BiologicalMaterial06")

trainData_selected <- trainData[, selected_features]
testData_selected <- testData[, selected_features]

tuneGrid <- expand.grid(
  sigma = c(0.01, 0.05, 0.1, 0.15, 0.2),  # Adjust based on reduced feature space
  C = c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64)
)


set.seed(123)
final_svm_model <- train(
  x = trainData_selected,
  y = trainYield,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneGrid = tuneGrid,
  trControl = trainControl(
    method = "cv",
    number = 5,
    allowParallel = TRUE
  )
)


svm_predictions <- predict(final_svm_model, newdata = testData_selected)
svm_performance <- postResample(pred = svm_predictions, obs = testYield)

svm_performance|>
  kable(caption = "SVM Model Performance") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
SVM Model Performance
x
RMSE 1.2476406
Rsquared 0.5540403
MAE 1.0195677

Recursive Feature Elimination (RFE)

The Recursive Feature Elimination (RFE) run ranked the full 55-predictor subset best by resampled RMSE, but its importance ranking identified the following top 5 variables influencing the response:

  1. ManufacturingProcess32
  2. BiologicalMaterial06
  3. ManufacturingProcess13
  4. BiologicalMaterial02
  5. ManufacturingProcess17

Using the identified features, the optimal model achieved the following cross-validation results:

  • RMSE: 1.265
  • R-squared: 0.5388
  • MAE: 1.0085

This suggests that focusing on a handful of key variables keeps the model efficient without a large sacrifice in predictive performance. Among all models evaluated, SVM provided the best overall performance on the independent test set with an RMSE of 1.2300, R-squared of 0.5530, and MAE of 1.0424, followed by MARS and ANN, with KNN last. These results highlight the importance of variable selection and proper model evaluation in achieving optimal predictive accuracy.

Again, from this analysis, manufacturing process predictors dominate the list of most important features in the model.

# Load necessary libraries
library(caret)
library(doParallel)

# Set up parallel processing
cores <- parallel::detectCores() - 1  # Use all but one core
cl <- makeCluster(cores)             # Create a parallel cluster
registerDoParallel(cl)               # Register the parallel backend

# Define control for RFE
control <- rfeControl(
  functions = caretFuncs,             # caret's generic model wrappers (the SVM itself is set via 'method' below)
  method = "cv",                      # Cross-validation method
  number = 5,                         # Number of folds
  allowParallel = TRUE                # Enable parallel processing
)

# Perform RFE with SVM
set.seed(123)
svmRFE <- rfe(
  x = trainData,                 # Predictor variables
  y = trainYield,                 # Target variable
  sizes = c(1:10),                    # Subset sizes to explore
  rfeControl = control,               # RFE control object
  method = "svmRadial"                # SVM with radial basis function kernel
)

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Return to sequential processing

# Print RFE results
print(svmRFE)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD  MAESD Selected
##          1 1.419   0.4176 1.1443 0.1673    0.08145 0.1462         
##          2 1.386   0.4657 1.0643 0.2540    0.16121 0.2067         
##          3 1.420   0.4335 1.1090 0.2579    0.14224 0.1882         
##          4 1.357   0.4719 1.0477 0.3037    0.15805 0.2413         
##          5 1.321   0.4951 1.0512 0.3145    0.16525 0.2423         
##          6 1.280   0.5272 1.0347 0.2737    0.13124 0.2178         
##          7 1.264   0.5393 1.0065 0.3389    0.18226 0.2551         
##          8 1.269   0.5444 1.0049 0.3236    0.18270 0.2553         
##          9 1.240   0.5510 0.9728 0.3801    0.22981 0.2962         
##         10 1.235   0.5599 0.9404 0.3638    0.20580 0.2605         
##         55 1.222   0.5778 0.9425 0.2112    0.10171 0.1252        *
## 
## The top 5 variables (out of 55):
##    ManufacturingProcess32, BiologicalMaterial06, ManufacturingProcess13, BiologicalMaterial02, ManufacturingProcess17
# Plot RFE variable importance
plot(svmRFE, type = c("g", "o"))
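Two caret accessors make the RFE outcome explicit; a sketch (run after the chunk above): predictors() returns the variables in the subset RFE finally selected, and optVariables holds the ranked list behind the top-5 summary.

length(predictors(svmRFE))    # size of the selected subset (55 here)
head(svmRFE$optVariables, 5)  # top-ranked variables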

Refit the model using the features selected by Recursive Feature Elimination

# Subset the training data to only include the selected 5 features
selected_features <- c("ManufacturingProcess32", "BiologicalMaterial06", 
                       "ManufacturingProcess13", "BiologicalMaterial02", 
                       "ManufacturingProcess17")
trainData_selected <- trainData[, selected_features, drop = FALSE]
testData_selected <- testData[, selected_features, drop = FALSE]

# Align Column Names in Test Data
colnames(testData_selected) <- colnames(trainData_selected)


# Train the final SVM model with the selected features
set.seed(123)
final_svm_model <- train(
  x = trainData_selected,
  y = trainYield,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneGrid = expand.grid(sigma = c(0.001, 0.01, 0.1), C = c(1, 2, 4, 8, 16)),
  trControl = trainControl(
    method = "cv",
    number = 5,
    allowParallel = TRUE
  )
)

svm_predictions <- predict(final_svm_model, newdata = testData_selected)
svm_performance <- postResample(pred = svm_predictions, obs = testYield)

svm_performance |>
  kable(caption = "SVM Model Performance") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
SVM Model Performance
x
RMSE 1.2462983
Rsquared 0.5446202
MAE 1.0344052

Compare the original model, the permutation-based feature selection model, and the Recursive Feature Elimination (RFE) model

Updated Comparison of SVM Models

The Support Vector Machine (SVM) models were evaluated using three different approaches to balance predictive accuracy and model simplicity: the original model with all 55 features, a permutation-based feature selection model with 9 features, and a Recursive Feature Elimination (RFE) model with 5 features.

Comparison of SVM Models

  • Original SVM Model (55 features):
    RMSE: 1.2300, R²: 0.5530, MAE: 1.0424, Sigma: 0.01525, C: 16

  • Permutation-Based Model (9 features):
    RMSE: 1.2476, R²: 0.5540, MAE: 1.0196, Sigma: 0.0824, C: 4

  • RFE Model (5 features):
    RMSE: 1.2463, R²: 0.5446, MAE: 1.0344, Sigma: 0.3666, C: 2

Conclusion

The original SVM model achieved the lowest test-set RMSE, although the permutation-based model reached a marginally higher R² and a lower MAE with far fewer predictors. The RFE model provides the simplest alternative (only 5 features) with competitive performance, making it suitable for cases where a parsimonious set of predictors is desired.
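For a side-by-side view, the three variants’ test-set metrics (taken from the outputs above) can be collected into one small data frame; a sketch:

svm_variants <- data.frame(
  Model    = c("Original (55 features)", "Permutation-based (9 features)", "RFE (5 features)"),
  RMSE     = c(1.2300, 1.2476, 1.2463),
  Rsquared = c(0.5530, 0.5540, 0.5446),
  MAE      = c(1.0424, 1.0196, 1.0344)
)
svm_variants[order(svm_variants$RMSE), ]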

Compare the top ten predictors between the optimal linear and nonlinear models

Fit the Linear Model (PLS)

# Load required library
library(caret)

# Define 10-fold cross-validation
control <- trainControl(
  method = "cv",         # Cross-validation
  number = 10,           # 10 folds
  savePredictions = "final", # Save final predictions
  verboseIter = TRUE     # Print progress during training
)

# Train the PLS model
set.seed(123)  # Ensure reproducibility
plsFit <- train(
  x = trainData,         # Predictor variables
  y = trainYield,        # Response variable
  method = "pls",        # Partial Least Squares regression
  tuneLength = 20,       # Explore up to 20 latent components
  trControl = control,   # Cross-validation control
  preProcess = c("center", "scale") # Standardize data
)
## + Fold01: ncomp=20 
## - Fold01: ncomp=20 
## + Fold02: ncomp=20 
## - Fold02: ncomp=20 
## + Fold03: ncomp=20 
## - Fold03: ncomp=20 
## + Fold04: ncomp=20 
## - Fold04: ncomp=20 
## + Fold05: ncomp=20 
## - Fold05: ncomp=20 
## + Fold06: ncomp=20 
## - Fold06: ncomp=20 
## + Fold07: ncomp=20 
## - Fold07: ncomp=20 
## + Fold08: ncomp=20 
## - Fold08: ncomp=20 
## + Fold09: ncomp=20 
## - Fold09: ncomp=20 
## + Fold10: ncomp=20 
## - Fold10: ncomp=20 
## Aggregating results
## Selecting tuning parameters
## Fitting ncomp = 3 on full training set
# Get the optimal number of latent variables
optimal_ncomp <- plsFit$bestTune$ncomp
resampled_R2 <- max(plsFit$results$Rsquared)

# Print key results
cat("Optimal number of latent variables:", optimal_ncomp, "\n")
## Optimal number of latent variables: 3
cat("Resampled estimate of R²:", round(resampled_R2, 4), "\n")
## Resampled estimate of R²: 0.6032
# Plot cross-validated performance
plot(plsFit)

# Predict on the test set
plsPred <- predict(plsFit, newdata = testData)

# Evaluate performance on the test set
testPerformance <- postResample(pred = plsPred, obs = testYield)
cat("Test Set Performance:\n")
## Test Set Performance:
print(testPerformance)
##      RMSE  Rsquared       MAE 
## 1.3891031 0.4610002 1.1561989
# Display variable importance
cat("\nVariable Importance:\n")
## 
## Variable Importance:
varImp(plsFit)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 55)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess17   87.20
## ManufacturingProcess36   87.19
## ManufacturingProcess13   86.58
## ManufacturingProcess09   86.35
## ManufacturingProcess06   69.71
## ManufacturingProcess33   62.42
## BiologicalMaterial06     61.65
## BiologicalMaterial03     60.97
## BiologicalMaterial08     59.90
## BiologicalMaterial02     59.90
## ManufacturingProcess11   56.10
## BiologicalMaterial12     55.03
## BiologicalMaterial11     54.57
## BiologicalMaterial01     52.38
## BiologicalMaterial04     50.61
## ManufacturingProcess28   49.74
## ManufacturingProcess12   46.30
## ManufacturingProcess37   45.97
## BiologicalMaterial10     42.18

Compare Predictors

# Extract variable importance for SVM and PLS
svmImportance <- varImp(svmModel)$importance
plsImportance <- varImp(plsFit)$importance

# Get the top 10 predictors from each model
top_svm <- rownames(svmImportance)[order(-svmImportance[,1])[1:10]]
top_pls <- rownames(plsImportance)[order(-plsImportance[,1])[1:10]]

# Compare the predictors
common_predictors <- intersect(top_svm, top_pls)
unique_svm <- setdiff(top_svm, top_pls)
unique_pls <- setdiff(top_pls, top_svm)

# Display the results
cat("Common Predictors:\n", common_predictors, "\n")
## Common Predictors:
##  ManufacturingProcess32 BiologicalMaterial06 ManufacturingProcess36 BiologicalMaterial03 ManufacturingProcess13 ManufacturingProcess17 ManufacturingProcess09
cat("Unique to SVM:\n", unique_svm, "\n")
## Unique to SVM:
##  ManufacturingProcess31 BiologicalMaterial02 BiologicalMaterial12
cat("Unique to PLS:\n", unique_pls, "\n")
## Unique to PLS:
##  ManufacturingProcess06 ManufacturingProcess33 BiologicalMaterial08

(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model.

To explore the relationships between predictors unique to the optimal nonlinear regression model (SVM) and the response variable (yield), we create scatterplots for these predictors. These plots will help evaluate whether these predictors have intuitive or interpretable relationships with the yield.

The plots reveal intuitive relationships between the predictors and yield:

  1. ManufacturingProcess31: Shows a negative correlation, suggesting higher levels may reduce yield, indicating potential inefficiencies in the process.
  2. BiologicalMaterial02: Positively correlated with yield, likely reflecting its beneficial role in the process.
  3. BiologicalMaterial12: Also positively correlated, suggesting it contributes significantly to yield improvement.

These patterns highlight key areas for optimizing biological inputs and manufacturing processes to enhance yield.

# Identify unique predictors for SVM
unique_svm_predictors <- setdiff(top_svm, common_predictors)

# Create scatterplots for unique SVM predictors
par(mfrow = c(2, 3))  # Adjust layout for multiple plots
for (predictor in unique_svm_predictors) {
  plot(
    trainData[[predictor]], trainYield,
    main = predictor,
    xlab = predictor,
    ylab = "Yield",
    pch = 19, col = "blue"
  )
  abline(lm(trainYield ~ trainData[[predictor]]), col = "red")  # Add regression line
}
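A quick numeric check of the directions described above (a sketch, assuming the objects from the earlier chunks are still in the workspace):

# Pearson correlation between each unique SVM predictor and the training yield
sapply(unique_svm_predictors, function(p) cor(trainData[[p]], trainYield))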