library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Define a function to install packages if not already installed
install_if_missing <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package)
    library(package, character.only = TRUE)
  }
}
# Install 'AppliedPredictiveModeling' and 'caret' if they are missing
install_if_missing("AppliedPredictiveModeling")
## Loading required package: AppliedPredictiveModeling
install_if_missing("caret")
## Loading required package: caret
## Loading required package: lattice
# Load the libraries and data
suppressPackageStartupMessages({
library(AppliedPredictiveModeling)
library(caret)
library(ggplot2)
})
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
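For reference, mlbench.friedman1 simulates Friedman's benchmark equation, y = 10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5 + e with e ~ N(0, sd^2), over ten uniform predictors. Only X1–X5 enter the true model; X6–X10 are pure noise, which is why it is worth checking whether each method learns to ignore them.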
# Load required libraries
library(caret)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.2
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use one less core than available
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Train the kNN model in parallel
knnModel <- train(
x = trainingData$x, # Feature data
y = trainingData$y, # Target variable
method = "knn", # k-Nearest Neighbors method
preProc = c("center", "scale"), # Center and scale the data
tuneLength = 10, # Number of hyperparameter combinations
trControl = trainControl(method = "cv", number = 5) # Cross-validation
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
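The cluster setup/teardown above is repeated verbatim around every model fit below. It could be wrapped in a small helper — a sketch, where with_parallel() is a name introduced here (not a caret or doParallel function) — using on.exit() so the cluster is released even if training fails:
# Evaluate an expression with a doParallel backend registered, always
# tearing the cluster down afterwards (even on error)
with_parallel <- function(expr) {
  cl <- parallel::makeCluster(max(1, parallel::detectCores() - 1))
  doParallel::registerDoParallel(cl)
  on.exit({ parallel::stopCluster(cl); foreach::registerDoSEQ() }, add = TRUE)
  force(expr)
}
# Usage: knnModel <- with_parallel(train(x = trainingData$x, y = trainingData$y, method = "knn", ...))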
# Print the final kNN model
print(knnModel)
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 160, 160, 160, 160, 160
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.310218 0.5640866 2.695817
## 7 3.183598 0.6130231 2.575609
## 9 3.133750 0.6548230 2.519030
## 11 3.135078 0.6653443 2.528709
## 13 3.106943 0.6907083 2.503302
## 15 3.081233 0.7116627 2.465985
## 17 3.100552 0.7236138 2.499389
## 19 3.169291 0.7099245 2.562107
## 21 3.192033 0.7134604 2.586728
## 23 3.216710 0.7143357 2.601841
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
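For regression, postResample reports the root mean squared error, the squared correlation between predictions and observations (as Rsquared), and the mean absolute error; a quick manual check reproduces the same three numbers:
# Manual versions of the postResample metrics
sqrt(mean((knnPred - testData$y)^2))  # RMSE
cor(knnPred, testData$y)^2            # Rsquared (squared correlation)
mean(abs(knnPred - testData$y))       # MAE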
MARS selected the informative predictors (those named X1–X5): as the variable importance output below shows, X1–X5 all receive positive importance while the noise predictors (X6–X10) score zero.
# Load necessary libraries
library(caret)
library(earth) # For MARS model implementation
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(doParallel) # For parallel processing
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use one less core than available
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Train a MARS model in parallel
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth", # MARS method in caret
preProc = c("center", "scale"), # Preprocessing: centering and scaling
tuneLength = 10, # Evaluate 10 values of nprune (degree is held at 1)
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE) # Cross-validation
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the model details
print(marsModel)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 160, 160, 160, 160, 160
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.245901 0.2787321 3.483428
## 3 3.583229 0.4805401 2.881311
## 4 2.657913 0.7084960 2.125487
## 6 2.322011 0.7813465 1.833316
## 7 1.855381 0.8615696 1.450558
## 9 1.781814 0.8724492 1.364755
## 10 1.759272 0.8752761 1.376871
## 12 1.655616 0.8902637 1.281361
## 13 1.681933 0.8890678 1.315516
## 15 1.681012 0.8898265 1.320421
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 1.
# Predict on the test set
marsPred <- predict(marsModel, newdata = testData$x)
# Evaluate performance on the test set
marsPerformance <- postResample(pred = marsPred, obs = testData$y)
print(marsPerformance)
## RMSE Rsquared MAE
## 1.8136467 0.8677298 1.3911836
# Variable importance for MARS
marsImportance <- varImp(marsModel)
print(marsImportance)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.08
## X2 62.79
## X5 38.07
## X3 25.80
## X6 0.00
# Plot variable importance
plot(marsImportance, top = 15)
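To confirm which hinge functions the pruned model actually retained, the underlying earth object can be inspected directly:
# Show the basis functions kept in the final pruned MARS model
summary(marsModel$finalModel)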
# Load necessary libraries
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use one less core than available
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Train an SVM model in parallel
svmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "svmRadial", # SVM with radial basis function kernel
preProc = c("center", "scale"), # Preprocessing: centering and scaling
tuneLength = 10, # Evaluate 10 values of cost C (sigma is estimated once and held constant)
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE) # Cross-validation
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the model details
print(svmModel)
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 160, 160, 160, 160, 160
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.580537 0.7936982 2.061208
## 0.50 2.291391 0.8098871 1.814238
## 1.00 2.103498 0.8286516 1.641077
## 2.00 1.971031 0.8425164 1.530163
## 4.00 1.917953 0.8489094 1.488262
## 8.00 1.919174 0.8490298 1.484001
## 16.00 1.921739 0.8486423 1.484417
## 32.00 1.921739 0.8486423 1.484417
## 64.00 1.921739 0.8486423 1.484417
## 128.00 1.921739 0.8486423 1.484417
##
## Tuning parameter 'sigma' was held constant at a value of 0.06851516
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06851516 and C = 4.
# Predict on the test set
svmPred <- predict(svmModel, newdata = testData$x)
# Evaluate performance on the test set
svmPerformance <- postResample(pred = svmPred, obs = testData$y)
print(svmPerformance)
## RMSE Rsquared MAE
## 2.0742936 0.8265221 1.5696844
# Load necessary libraries
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use one less core than available
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Define a tuning grid for the hidden-layer size and weight decay
tuneGrid <- expand.grid(size = c(1, 5, 10), decay = c(0.01, 0.1))
# Train the model again with the refined grid
annModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneGrid = tuneGrid,
trace = FALSE,
trControl = trainControl(method = "cv", number = 5)
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the model details
print(annModel)
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 160, 160, 160, 160, 160
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.01 14.29446 0.7282426 13.41621
## 1 0.10 14.29464 0.7512374 13.41641
## 5 0.01 14.29445 0.7105693 13.41620
## 5 0.10 14.29454 0.7518310 13.41631
## 10 0.01 14.29444 0.7125623 13.41619
## 10 0.10 14.29452 0.7476741 13.41628
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 10 and decay = 0.01.
# Predict on the test set
annPred <- predict(annModel, newdata = testData$x)
# Evaluate performance on the test set
annPerformance <- postResample(pred = annPred, obs = testData$y)
print(annPerformance)
## RMSE Rsquared MAE
## 14.2769345 0.6154719 13.3869171
After running the models, compare their performances:
Best performing model
- Best Model: MARS gives the best performance overall, with the lowest RMSE and MAE and the highest R-squared.
- Second Best: SVM performs well but trails MARS on all three metrics.
- Worst Model: ANN performs very poorly. Its test RMSE (~14.3) is close to the mean of the response itself, which indicates the network is predicting a nearly constant value; the likely cause is that linout = TRUE was not passed to nnet, so the logistic output unit bounds predictions to [0, 1] rather than providing the linear output regression requires.
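Under that diagnosis — that the near-constant predictions come from nnet's default logistic output unit — a sketch of the refit with a linear output (annModelLinear is a name introduced here):
# Refit the network with a linear output unit, as regression requires
annModelLinear <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneGrid = tuneGrid,
  linout = TRUE,  # without this, nnet's logistic output bounds predictions to [0, 1]
  trace = FALSE,
  trControl = trainControl(method = "cv", number = 5)
)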
# Combine all results into a data frame
results <- data.frame(
Model = c("MARS", "SVM", "ANN"),
RMSE = c(marsPerformance["RMSE"], svmPerformance["RMSE"], annPerformance["RMSE"]),
Rsquared = c(marsPerformance["Rsquared"], svmPerformance["Rsquared"], annPerformance["Rsquared"]),
MAE = c(marsPerformance["MAE"], svmPerformance["MAE"], annPerformance["MAE"])
)
# Print the combined results
print(results)
## Model RMSE Rsquared MAE
## 1 MARS 1.813647 0.8677298 1.391184
## 2 SVM 2.074294 0.8265221 1.569684
## 3 ANN 14.276934 0.6154719 13.386917
Here’s the complete workflow for comparing MARS, KNN, SVM, and ANN models on the chemical manufacturing process data:
The code loads the ChemicalManufacturingProcess dataset from the AppliedPredictiveModeling package and splits it into predictors (processPredictors) and a response variable (yield), which represents the percentage yield. Note that Yield is actually the first column of the data frame, so dropping the last column leaves it inside processPredictors; it is separated from the predictors explicitly in the near-zero-variance step further below.
# Load necessary library
if (!require("AppliedPredictiveModeling")) {
install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
}
# Load the data
data("ChemicalManufacturingProcess")
processPredictors <- ChemicalManufacturingProcess[, -ncol(ChemicalManufacturingProcess)] # Drop the last column (note: Yield is the first column, not the last; it is removed from the predictors later)
yield <- ChemicalManufacturingProcess$Yield # Assign response variable explicitly
# Review the data
#str(processPredictors)
#str(yield) # Response variable: percent yield
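For orientation, the data frame holds 176 manufacturing runs and 58 columns (Yield plus 57 predictors), consistent with the 144/32 training/test split reported below:
dim(ChemicalManufacturingProcess)  # 176 rows, 58 columns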
We use caret's preProcess() to fill in missing values. The code imputes missing values in processPredictors using the median of each column, creating a fully imputed dataset, imputedPredictors.
# Load necessary libraries
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use one less core than available
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Create a preprocessing object to impute missing values with the median
preProcessObj <- preProcess(
processPredictors,
method = "medianImpute"
)
# Apply the preprocessing object to impute missing values in the dataset
imputedPredictors <- predict(preProcessObj, processPredictors)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
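A one-line sanity check confirms the median imputation left no missing values:
# Verify that no NA values remain after imputation
stopifnot(sum(is.na(imputedPredictors)) == 0)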
Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
The code below removes near-zero-variance predictors, separates the response variable, yield, from the predictors, splits the data into training and test sets, tunes a MARS model to predict yield, and evaluates its performance.
# Identify predictors with near-zero variance
nzv <- nearZeroVar(imputedPredictors)
# Filter out those predictors
filteredPredictors <- imputedPredictors[, -nzv]
# Separate the response variable 'Yield' (first column) from the predictors
response <- filteredPredictors[, 1] # Extract the Yield column
filteredPredictors <- filteredPredictors[, -1] # Remove Yield from the predictors
# Check the number of predictors remaining
num_predictors <- ncol(filteredPredictors)
print(paste("Number of predictors left after removing near-zero variance:", num_predictors))
## [1] "Number of predictors left after removing near-zero variance: 55"
# Split the data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(response, p = 0.8, list = FALSE)
trainData <- filteredPredictors[trainIndex, ] # Training predictors
testData <- filteredPredictors[-trainIndex, ] # Test predictors
trainYield <- response[trainIndex] # Training response
testYield <- response[-trainIndex] # Test response
# Print the number of rows in the training and test sets
print(paste("Training set size:", nrow(trainData)))
## [1] "Training set size: 144"
print(paste("Test set size:", nrow(testData)))
## [1] "Test set size: 32"
The MARS (Multivariate Adaptive Regression Splines) model successfully identified key predictors for the response variable, achieving optimal performance with an RMSE of 1.4073, R-squared of 0.4525, and MAE of 1.1528 on the independent test set. These results were obtained using the final tuning parameters of nprune = 10 and degree = 1. During cross-validation, the model achieved an R-squared value of 0.5941, highlighting strong performance on resampled training data.
The variable importance analysis revealed the following as the most significant predictors: ManufacturingProcess32, ManufacturingProcess17, ManufacturingProcess09, ManufacturingProcess02, BiologicalMaterial02, and BiologicalMaterial03.
This model balances interpretability and predictive accuracy, effectively identifying the most influential variables. However, the gap between the test-set R-squared (0.4525) and the cross-validation R-squared (0.5941) underscores the importance of validating the model's generalization ability on held-out data.
# Load necessary libraries
library(earth)
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Define the tuning grid for degree and nprune
marsGrid <- expand.grid(
degree = 1:2, # Allow up to second-degree interactions
nprune = 2:38 # Number of terms to prune
)
# Fit the MARS model using train() with external resampling
set.seed(100)
marsModel <- train(
x = trainData, # Predictor variables
y = trainYield, # Response variable
method = "earth", # MARS method in caret
tuneGrid = marsGrid, # Grid of hyperparameters to explore
trControl = trainControl(
method = "cv", # Cross-validation
number = 10 # Number of folds
)
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the tuned model
print(marsModel)
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 55 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 130, 130, 130, 130, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.417036 0.4610563 1.1268899
## 1 3 1.270562 0.5426046 1.0152669
## 1 4 1.257285 0.5629884 1.0024011
## 1 5 1.259061 0.5647063 1.0030357
## 1 6 1.282213 0.5374464 1.0278309
## 1 7 1.276699 0.5302778 1.0188633
## 1 8 1.240021 0.5559535 0.9863856
## 1 9 1.203378 0.5882041 0.9721436
## 1 10 1.185759 0.5941380 0.9506741
## 1 11 2.875523 0.5147237 1.4148261
## 1 12 2.889647 0.5044998 1.4125068
## 1 13 2.911860 0.4906354 1.4252568
## 1 14 2.929364 0.4783386 1.4313547
## 1 15 2.935318 0.4798018 1.4315310
## 1 16 2.939199 0.4768222 1.4368428
## 1 17 2.939199 0.4768222 1.4368428
## 1 18 2.939199 0.4768222 1.4368428
## 1 19 2.939199 0.4768222 1.4368428
## 1 20 2.939199 0.4768222 1.4368428
## 1 21 2.939199 0.4768222 1.4368428
## 1 22 2.939199 0.4768222 1.4368428
## 1 23 2.939199 0.4768222 1.4368428
## 1 24 2.939199 0.4768222 1.4368428
## 1 25 2.939199 0.4768222 1.4368428
## 1 26 2.939199 0.4768222 1.4368428
## 1 27 2.939199 0.4768222 1.4368428
## 1 28 2.939199 0.4768222 1.4368428
## 1 29 2.939199 0.4768222 1.4368428
## 1 30 2.939199 0.4768222 1.4368428
## 1 31 2.939199 0.4768222 1.4368428
## 1 32 2.939199 0.4768222 1.4368428
## 1 33 2.939199 0.4768222 1.4368428
## 1 34 2.939199 0.4768222 1.4368428
## 1 35 2.939199 0.4768222 1.4368428
## 1 36 2.939199 0.4768222 1.4368428
## 1 37 2.939199 0.4768222 1.4368428
## 1 38 2.939199 0.4768222 1.4368428
## 2 2 1.372502 0.4735490 1.0990454
## 2 3 1.356502 0.4752559 1.0559439
## 2 4 1.276802 0.5549247 1.0099776
## 2 5 1.226674 0.5890922 0.9743090
## 2 6 1.236372 0.5810537 0.9938101
## 2 7 1.229469 0.5857615 0.9768368
## 2 8 1.223556 0.5912044 0.9861892
## 2 9 1.313564 0.5749585 1.0448934
## 2 10 1.340676 0.5565462 1.0739402
## 2 11 1.467545 0.4878662 1.1288558
## 2 12 1.519976 0.4695515 1.1510186
## 2 13 2.588658 0.4725595 1.4774373
## 2 14 2.648549 0.4769268 1.5012190
## 2 15 2.689316 0.4755803 1.5369717
## 2 16 2.884679 0.4803363 1.5883932
## 2 17 2.644325 0.5018611 1.5085782
## 2 18 2.621212 0.5020664 1.5044503
## 2 19 2.647251 0.4995565 1.4965914
## 2 20 2.507056 0.5047496 1.4801174
## 2 21 2.677305 0.5007535 1.5239616
## 2 22 2.652433 0.4989326 1.5103233
## 2 23 2.636900 0.5021532 1.4952947
## 2 24 2.753441 0.5018200 1.5335271
## 2 25 2.820024 0.5017712 1.5509031
## 2 26 2.934670 0.4886758 1.6113139
## 2 27 2.934670 0.4886758 1.6113139
## 2 28 2.934670 0.4886758 1.6113139
## 2 29 2.934670 0.4886758 1.6113139
## 2 30 2.934670 0.4886758 1.6113139
## 2 31 2.934670 0.4886758 1.6113139
## 2 32 2.934670 0.4886758 1.6113139
## 2 33 2.934670 0.4886758 1.6113139
## 2 34 2.934670 0.4886758 1.6113139
## 2 35 2.934670 0.4886758 1.6113139
## 2 36 2.934670 0.4886758 1.6113139
## 2 37 2.934670 0.4886758 1.6113139
## 2 38 2.934670 0.4886758 1.6113139
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
# Analyze variable importance
marsImportance <- varImp(marsModel)
print(marsImportance)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 56.57
## ManufacturingProcess09 38.43
## ManufacturingProcess02 25.70
## BiologicalMaterial02 18.60
## BiologicalMaterial03 18.60
## ManufacturingProcess39 13.33
## ManufacturingProcess22 0.00
# Plot variable importance
plot(marsImportance, top = 20)
# Predict on the test set
marsPredictions <- predict(marsModel, newdata = testData)
# Evaluate performance on the test set
marsPerformance <- postResample(pred = marsPredictions, obs = testYield)
print(marsPerformance)
## RMSE Rsquared MAE
## 1.4072635 0.4524617 1.1527858
For the k-Nearest Neighbors (k-NN) model with k = 3, the performance metrics indicate the following: a cross-validated RMSE of 1.2735 and R-squared of 0.5342, against a test-set RMSE of 1.5013, R-squared of 0.3350, and MAE of 1.1773.
While the cross-validation R-squared is higher, the drop in test-set R-squared suggests some overfitting or differences in data characteristics between the training and test sets. This highlights the importance of validating model performance on an independent test set to ensure reliability.
# Load necessary libraries
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Define the tuning grid for the number of neighbors
knnGrid <- expand.grid(k = 1:20)
# Train the KNN model with caret
set.seed(123)
knnModel <- train(
x = trainData, # Predictor variables
y = trainYield, # Response variable
method = "knn", # KNN method
preProc = c("center", "scale"), # Centering and scaling
tuneGrid = knnGrid, # Grid of hyperparameters to explore
trControl = trainControl(
method = "cv", # Cross-validation
number = 5, # Number of folds
allowParallel = TRUE # Enable parallel processing during resampling
)
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the KNN model results
print(knnModel)
## k-Nearest Neighbors
##
## 144 samples
## 55 predictor
##
## Pre-processing: centered (55), scaled (55)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 116, 116, 115, 116, 113
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.462027 0.4168436 1.1290106
## 2 1.363514 0.4737576 1.0813046
## 3 1.273494 0.5342453 1.0043078
## 4 1.282900 0.5269851 0.9980377
## 5 1.328725 0.4913713 1.0411439
## 6 1.340986 0.4851244 1.0678474
## 7 1.373854 0.4565705 1.0915708
## 8 1.368188 0.4616640 1.0896447
## 9 1.336792 0.4894488 1.0587541
## 10 1.323616 0.5029194 1.0571762
## 11 1.335415 0.4975159 1.0736199
## 12 1.346281 0.4891835 1.0774769
## 13 1.347502 0.4940762 1.0823816
## 14 1.351050 0.4976379 1.0870508
## 15 1.369857 0.4875060 1.1052929
## 16 1.376861 0.4824909 1.0993577
## 17 1.392951 0.4669743 1.1137737
## 18 1.395873 0.4685117 1.1101456
## 19 1.400597 0.4679007 1.1205462
## 20 1.415488 0.4611594 1.1328655
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
# Predict on the test set
knnPredictions <- predict(knnModel, newdata = testData)
# Evaluate performance
knnPerformance <- postResample(pred = knnPredictions, obs = testYield)
print(knnPerformance)
## RMSE Rsquared MAE
## 1.5013193 0.3349698 1.1772917
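To quantify the CV-to-test drop noted above, compare the best cross-validated R-squared with the test-set value; a positive gap indicates degradation on held-out data:
# CV-vs-test Rsquared gap for kNN (positive means worse on the test set)
max(knnModel$results$Rsquared) - knnPerformance["Rsquared"]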
The Support Vector Machine (SVM) model was optimized using cross-validation with RMSE as the evaluation metric. The optimal model achieved a cross-validation R-squared of 0.6750, with final parameters sigma = 0.01525 (Gaussian kernel width) and C = 16 (cost).
The performance metrics for the independent test set were RMSE = 1.2300, R-squared = 0.5530, and MAE = 1.0424.
The final SVM model, of type epsilon-Support Vector Regression (eps-svr), used 129 support vectors, which constituted a significant portion of the training set. The model achieved an objective function value of -83.1197 and a very low training error of 0.0093, indicating a well-fitted model.
The close alignment between the cross-validation R-squared and the test set R-squared underscores the model’s strong generalization performance, effectively capturing the underlying relationships in the data while minimizing overfitting.
# Load necessary libraries
library(caret)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores)
registerDoParallel(cl)
# Define a tuning grid for hyperparameter search
tuneGrid <- expand.grid(
sigma = seq(0.001, 0.02, length = 5), # Kernel width parameter
C = c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2050) # Cost parameter
)
# Train the SVM model using caret
set.seed(123)
svmModel <- train(
x = trainData, # Predictor variables
y = trainYield, # Response variable
method = "svmRadial", # Radial Basis Function kernel
preProc = c("center", "scale"), # Automatically center and scale
tuneGrid = tuneGrid, # Grid of hyperparameters (sigma and C)
trControl = trainControl(
method = "cv", # Cross-validation
number = 10, # 10-fold cross-validation
allowParallel = TRUE # Enable parallel processing during resampling
)
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print the tuned SVM model
print(svmModel)
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 55 predictor
##
## Pre-processing: centered (55), scaled (55)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## sigma C RMSE Rsquared MAE
## 0.00100 0.25 1.584724 0.4908850 1.2933921
## 0.00100 0.50 1.472165 0.5227248 1.2035928
## 0.00100 1.00 1.347364 0.5609412 1.1059130
## 0.00100 2.00 1.251732 0.5836637 1.0514179
## 0.00100 4.00 1.207624 0.5962614 1.0144070
## 0.00100 8.00 1.183270 0.6048753 0.9717196
## 0.00100 16.00 1.174018 0.6152566 0.9655921
## 0.00100 32.00 1.213477 0.6058961 1.0149432
## 0.00100 64.00 1.275442 0.5900019 1.0789686
## 0.00100 128.00 1.384748 0.5650622 1.1632247
## 0.00100 256.00 1.494103 0.5574347 1.1884871
## 0.00100 512.00 1.536853 0.5524136 1.1745447
## 0.00100 1024.00 1.429057 0.5634973 1.1021262
## 0.00100 2050.00 1.425374 0.5642553 1.1039291
## 0.00575 0.25 1.374990 0.5565957 1.1242310
## 0.00575 0.50 1.277043 0.5786877 1.0586592
## 0.00575 1.00 1.204491 0.6102910 0.9939477
## 0.00575 2.00 1.150214 0.6344192 0.9410259
## 0.00575 4.00 1.122581 0.6496301 0.9249104
## 0.00575 8.00 1.113802 0.6504783 0.9181936
## 0.00575 16.00 1.131446 0.6428268 0.9289700
## 0.00575 32.00 1.123222 0.6512749 0.9100888
## 0.00575 64.00 1.133427 0.6457598 0.9142969
## 0.00575 128.00 1.133427 0.6457598 0.9142969
## 0.00575 256.00 1.133427 0.6457598 0.9142969
## 0.00575 512.00 1.133427 0.6457598 0.9142969
## 0.00575 1024.00 1.133427 0.6457598 0.9142969
## 0.00575 2050.00 1.133427 0.6457598 0.9142969
## 0.01050 0.25 1.354291 0.5687365 1.1046404
## 0.01050 0.50 1.259387 0.5949316 1.0354559
## 0.01050 1.00 1.179686 0.6325881 0.9593929
## 0.01050 2.00 1.115066 0.6618773 0.9065678
## 0.01050 4.00 1.089756 0.6656807 0.8800131
## 0.01050 8.00 1.103614 0.6577296 0.8819586
## 0.01050 16.00 1.095728 0.6655347 0.8714994
## 0.01050 32.00 1.095600 0.6657098 0.8719145
## 0.01050 64.00 1.095600 0.6657098 0.8719145
## 0.01050 128.00 1.095600 0.6657098 0.8719145
## 0.01050 256.00 1.095600 0.6657098 0.8719145
## 0.01050 512.00 1.095600 0.6657098 0.8719145
## 0.01050 1024.00 1.095600 0.6657098 0.8719145
## 0.01050 2050.00 1.095600 0.6657098 0.8719145
## 0.01525 0.25 1.362528 0.5669301 1.1108764
## 0.01525 0.50 1.264952 0.6000550 1.0309710
## 0.01525 1.00 1.166932 0.6442221 0.9444841
## 0.01525 2.00 1.112524 0.6655948 0.8992675
## 0.01525 4.00 1.101407 0.6628631 0.8780349
## 0.01525 8.00 1.087566 0.6740149 0.8633659
## 0.01525 16.00 1.086919 0.6750417 0.8633120
## 0.01525 32.00 1.086919 0.6750417 0.8633120
## 0.01525 64.00 1.086919 0.6750417 0.8633120
## 0.01525 128.00 1.086919 0.6750417 0.8633120
## 0.01525 256.00 1.086919 0.6750417 0.8633120
## 0.01525 512.00 1.086919 0.6750417 0.8633120
## 0.01525 1024.00 1.086919 0.6750417 0.8633120
## 0.01525 2050.00 1.086919 0.6750417 0.8633120
## 0.02000 0.25 1.383589 0.5615123 1.1271756
## 0.02000 0.50 1.278280 0.5984133 1.0350035
## 0.02000 1.00 1.176618 0.6426173 0.9438365
## 0.02000 2.00 1.131785 0.6585769 0.9076618
## 0.02000 4.00 1.100867 0.6735171 0.8801702
## 0.02000 8.00 1.097831 0.6771105 0.8781162
## 0.02000 16.00 1.097831 0.6771105 0.8781162
## 0.02000 32.00 1.097831 0.6771105 0.8781162
## 0.02000 64.00 1.097831 0.6771105 0.8781162
## 0.02000 128.00 1.097831 0.6771105 0.8781162
## 0.02000 256.00 1.097831 0.6771105 0.8781162
## 0.02000 512.00 1.097831 0.6771105 0.8781162
## 0.02000 1024.00 1.097831 0.6771105 0.8781162
## 0.02000 2050.00 1.097831 0.6771105 0.8781162
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01525 and C = 16.
# Predict on the test set
svmPredictions <- predict(svmModel, newdata = testData)
# Evaluate the performance on the test set
svmPerformance <- postResample(pred = svmPredictions, obs = testYield)
print(svmPerformance)
## RMSE Rsquared MAE
## 1.230027 0.553041 1.042387
# Print the final model details
svmModel$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.01525
##
## Number of Support Vectors : 129
##
## Objective Function Value : -83.1197
## Training error : 0.009344
ANN with Cross-Validation and Model Tuning (caret)
The Artificial Neural Network (ANN) model was trained using cross-validation to identify the optimal combination of hyperparameters. Based on the cross-validation results, the best configuration was a single hidden unit (size = 1) with weight decay = 0.1, achieving a cross-validated RMSE of 1.4365 and R-squared of 0.4713; on the test set the model reached RMSE = 1.4180, R-squared = 0.4310, and MAE = 1.2009.
The model generalized reasonably well, with close alignment between cross-validation and test-set metrics, but the modest R-squared indicates room for improvement in explaining variance in the response.
# Load necessary libraries
library(caret)
library(doParallel)
library(kableExtra)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Define a tuning grid for hyperparameter search
nnetGrid <- expand.grid(
.size = c(1:10), # Number of units in the hidden layer
.decay = c(0, 0.01, 0.1) # Weight decay (regularization)
)
# Train the ANN model with caret
set.seed(123)
annModel <- train(
x = trainData, # Predictor variables
y = trainYield, # Response variable
method = "nnet", # ANN model
preProc = c("center", "scale"), # Automatically center and scale
tuneGrid = nnetGrid, # Grid of hyperparameters to explore
linout = TRUE, # For regression
trace = FALSE, # Suppress printed output
MaxNWts = 10 * (ncol(trainData) + 1) + 10 + 1, # Weight limit
maxit = 500, # Maximum number of iterations
trControl = trainControl(
method = "cv", # Cross-validation
number = 5, # Number of folds
allowParallel = TRUE # Enable parallel processing during resampling
)
)
# Predict on the test set
annModelPredictions <- predict(annModel, newdata = testData)
# Evaluate performance
performance <- postResample(pred = annModelPredictions, obs = testYield)
performance |>
kable(caption = "Summary of ANN Performance") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
  | x |
---|---|
RMSE | 1.418033 |
Rsquared | 0.430970 |
MAE | 1.200945 |
# Print the final model details
annModel$results |>
kable(caption = "Summary of ANN Model") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
size | decay | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
---|---|---|---|---|---|---|---|
1 | 0.00 | 1.689734 | 0.2542072 | 1.349821 | 0.1794880 | 0.1791168 | 0.1562689 |
1 | 0.01 | 1.561907 | 0.4187517 | 1.234869 | 0.2837755 | 0.1321660 | 0.2040772 |
1 | 0.10 | 1.436522 | 0.4712951 | 1.124480 | 0.2718533 | 0.1698442 | 0.2328798 |
2 | 0.00 | 1.645134 | 0.2585107 | 1.289944 | 0.2915022 | 0.2058334 | 0.2303881 |
2 | 0.01 | 1.867172 | 0.3413779 | 1.478404 | 0.2507002 | 0.1218222 | 0.1724573 |
2 | 0.10 | 2.509684 | 0.2571194 | 1.994840 | 0.6941452 | 0.1144497 | 0.4945125 |
3 | 0.00 | 2.050785 | 0.2086117 | 1.542577 | 0.8567400 | 0.1353936 | 0.5047662 |
3 | 0.01 | 2.555253 | 0.1740069 | 1.960693 | 0.5980771 | 0.0762876 | 0.3544072 |
3 | 0.10 | 3.508416 | 0.1174988 | 2.389504 | 1.0200151 | 0.0716593 | 0.4171840 |
4 | 0.00 | 2.341756 | 0.3372945 | 1.610766 | 1.2583672 | 0.1861665 | 0.5921126 |
4 | 0.01 | 2.821935 | 0.1277371 | 2.000191 | 0.5010514 | 0.0837186 | 0.3396468 |
4 | 0.10 | 2.535123 | 0.2921858 | 1.766428 | 0.9007859 | 0.1299448 | 0.3690240 |
5 | 0.00 | 3.417779 | 0.1635364 | 2.693057 | 0.7258504 | 0.1000456 | 0.6737322 |
5 | 0.01 | 2.790992 | 0.1837780 | 2.045475 | 0.4722601 | 0.1291922 | 0.1992326 |
5 | 0.10 | 2.943415 | 0.1461997 | 2.207419 | 0.6393891 | 0.1113244 | 0.3677718 |
6 | 0.00 | 3.723598 | 0.1351910 | 2.828667 | 1.2000072 | 0.1581778 | 0.8735173 |
6 | 0.01 | 2.587431 | 0.1541800 | 1.994730 | 0.3411992 | 0.1133942 | 0.2366837 |
6 | 0.10 | 2.819276 | 0.1728679 | 1.950022 | 0.7970351 | 0.0786146 | 0.2363705 |
7 | 0.00 | 3.199451 | 0.2165521 | 2.597800 | 0.8165988 | 0.1454107 | 0.7121076 |
7 | 0.01 | 2.677368 | 0.1903446 | 2.019389 | 0.5868093 | 0.1312342 | 0.4784302 |
7 | 0.10 | 3.278807 | 0.1466532 | 2.159866 | 1.2302435 | 0.1720565 | 0.4866422 |
8 | 0.00 | 4.900561 | 0.1320959 | 3.797158 | 1.0821378 | 0.1179410 | 0.9482748 |
8 | 0.01 | 2.454183 | 0.2284838 | 1.885210 | 0.4702424 | 0.1911357 | 0.3272997 |
8 | 0.10 | 2.233319 | 0.3440147 | 1.713529 | 0.5282493 | 0.1256910 | 0.2499077 |
9 | 0.00 | 5.196023 | 0.1549238 | 3.992858 | 1.4621434 | 0.1367316 | 1.2467610 |
9 | 0.01 | 2.721205 | 0.3134279 | 2.074909 | 0.5559672 | 0.0791680 | 0.2950494 |
9 | 0.10 | 2.852651 | 0.1707596 | 2.096973 | 0.6316644 | 0.0873419 | 0.2651230 |
10 | 0.00 | 7.230668 | 0.0536463 | 4.719107 | 2.9623238 | 0.0407995 | 1.7589904 |
10 | 0.01 | 3.403570 | 0.1862717 | 2.489796 | 1.3782233 | 0.2703826 | 0.7280932 |
10 | 0.10 | 2.637258 | 0.1974167 | 1.998243 | 0.8516244 | 0.1312045 | 0.5239237 |
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
Optimal Model: SVM
The SVM model provides the optimal resampling and test set performance, achieving the lowest RMSE (1.230027), the highest R-squared (0.5530410), and the smallest MAE (1.042387) among the nonlinear regression models.
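Beyond the test-set comparison below, caret's resamples() can compare models on their cross-validation distributions. It requires the fits to share a resampling scheme; here only the MARS and SVM models both used 10-fold CV (KNN and ANN used 5-fold), and a strictly paired comparison would also need identical fold indices, so treat this as a sketch:
# Compare cross-validation distributions for the two 10-fold models
cvComparison <- resamples(list(MARS = marsModel, SVM = svmModel))
summary(cvComparison)                  # RMSE/Rsquared/MAE across folds
bwplot(cvComparison, metric = "RMSE")  # side-by-side boxplots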
# Predict on the test set
marsPred <- predict(marsModel, newdata = testData) # Automatically scaled
knnPred <- predict(knnModel, newdata = testData) # Automatically scaled
svmPred <- predict(svmModel, newdata = testData) # Automatically scaled
annPred <- predict(annModel, newdata = testData) # Automatically scaled
# Calculate performance metrics
marsPerformance <- postResample(pred = marsPred, obs = testYield)
knnPerformance <- postResample(pred = knnPred, obs = testYield)
svmPerformance <- postResample(pred = svmPred, obs = testYield)
annPerformance <- postResample(pred = annPred, obs = testYield)
# Combine results into a single data frame
results <- data.frame(
Model = c("MARS", "KNN", "SVM", "ANN"),
RMSE = c(marsPerformance["RMSE"], knnPerformance["RMSE"], svmPerformance["RMSE"], annPerformance["RMSE"]),
Rsquared = c(marsPerformance["Rsquared"], knnPerformance["Rsquared"], svmPerformance["Rsquared"], annPerformance["Rsquared"]),
MAE = c(marsPerformance["MAE"], knnPerformance["MAE"], svmPerformance["MAE"], annPerformance["MAE"])
)
# Sort the results by RMSE in ascending order
sorted_results <- results[order(results$RMSE), ]
# Print the sorted results
sorted_results |>
kable(caption = "Summary of Model Comparison") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
  | Model | RMSE | Rsquared | MAE |
---|---|---|---|---|
3 | SVM | 1.230027 | 0.5530410 | 1.042387 |
1 | MARS | 1.407264 | 0.4524617 | 1.152786 |
4 | ANN | 1.418033 | 0.4309700 | 1.200945 |
2 | KNN | 1.501319 | 0.3349698 | 1.177292 |
Predictor Importance:
- Highlight which predictors dominate in the best nonlinear model.
- Determine whether biological or process predictors dominate.
Based on the analysis of the bar chart:
- Process predictors: ManufacturingProcess32, ManufacturingProcess37, ManufacturingProcess09, ManufacturingProcess13, ManufacturingProcess36, ManufacturingProcess17, and ManufacturingProcess28.
- Biological predictors: BiologicalMaterial03, BiologicalMaterial05, and BiologicalMaterial06.
From this analysis, manufacturing process predictors dominate the list of most important features in the model, as they make up the majority of the top ten important predictors. This suggests that process-related features have a stronger influence on the model’s predictions compared to biological features.
# Load necessary libraries
library(pdp)
## Warning: package 'pdp' was built under R version 4.4.2
library(vip)
## Warning: package 'vip' was built under R version 4.4.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Compute permutation-based feature importance for the SVM model
set.seed(123)
svmImportance <- vip(
svmModel,
method = "permute", # Permutation-based importance
train = trainData, # Correct training predictors
target = trainYield, # Correct target variable
metric = "RMSE", # Performance metric (e.g., RMSE)
nsim = 50, # Number of permutations
pred_wrapper = predict # Prediction function for the model
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Plot variable importance
plot(svmImportance)
# Subset the training data to only include the 9 selected features
# Ensure train and test data subsets are consistent
selected_features <- c("ManufacturingProcess32", "ManufacturingProcess37", "ManufacturingProcess09",
"ManufacturingProcess13", "ManufacturingProcess36", "ManufacturingProcess28",
"BiologicalMaterial03", "BiologicalMaterial05", "BiologicalMaterial06")
trainData_selected <- trainData[, selected_features]
testData_selected <- testData[, selected_features]
tuneGrid <- expand.grid(
sigma = c(0.01, 0.05, 0.1, 0.15, 0.2), # Adjust based on reduced feature space
C = c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64)
)
set.seed(123)
final_svm_model <- train(
x = trainData_selected,
y = trainYield,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = tuneGrid,
trControl = trainControl(
method = "cv",
number = 5,
allowParallel = TRUE
)
)
svm_predictions <- predict(final_svm_model, newdata = testData_selected)
svm_performance <- postResample(pred = svm_predictions, obs = testYield)
svm_performance |>
kable(caption = "SVM Model Performance") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
  | x |
---|---|
RMSE | 1.2476406 |
Rsquared | 0.5540403 |
MAE | 1.0195677 |
The Recursive Feature Elimination (RFE) process identified the top 5 variables influencing the response variable, namely ManufacturingProcess32, BiologicalMaterial06, ManufacturingProcess13, BiologicalMaterial02, and ManufacturingProcess17.
During RFE resampling, the 5-variable subset achieved a cross-validated RMSE of 1.321, R-squared of 0.4951, and MAE of 1.0512.
This demonstrates that focusing on these key variables enhances the model's efficiency without substantially compromising its predictive performance. Among all models evaluated, SVM provided the best overall performance on the independent test set with an RMSE of 1.2300, R-squared of 0.5530, and MAE of 1.0424, followed by the MARS, ANN, and KNN models. These results highlight the importance of variable selection and proper model evaluation in achieving optimal predictive accuracy.
Again, from this analysis, manufacturing process predictors dominate the list of most important features in the model.
# Load necessary libraries
library(caret)
library(doParallel)
# Set up parallel processing
cores <- parallel::detectCores() - 1 # Use all but one core
cl <- makeCluster(cores) # Create a parallel cluster
registerDoParallel(cl) # Register the parallel backend
# Define control for RFE
control <- rfeControl(
functions = caretFuncs, # caretFuncs wraps train(), letting rfe() tune the SVM at each subset size
method = "cv", # Cross-validation method
number = 5, # Number of folds
allowParallel = TRUE # Enable parallel processing
)
# Perform RFE with SVM
set.seed(123)
svmRFE <- rfe(
x = trainData, # Predictor variables
y = trainYield, # Target variable
sizes = c(1:10), # Subset sizes to explore
rfeControl = control, # RFE control object
method = "svmRadial" # SVM with radial basis function kernel
)
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Return to sequential processing
# Print RFE results
print(svmRFE)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 1.419 0.4176 1.1443 0.1673 0.08145 0.1462
## 2 1.386 0.4657 1.0643 0.2540 0.16121 0.2067
## 3 1.420 0.4335 1.1090 0.2579 0.14224 0.1882
## 4 1.357 0.4719 1.0477 0.3037 0.15805 0.2413
## 5 1.321 0.4951 1.0512 0.3145 0.16525 0.2423
## 6 1.280 0.5272 1.0347 0.2737 0.13124 0.2178
## 7 1.264 0.5393 1.0065 0.3389 0.18226 0.2551
## 8 1.269 0.5444 1.0049 0.3236 0.18270 0.2553
## 9 1.240 0.5510 0.9728 0.3801 0.22981 0.2962
## 10 1.235 0.5599 0.9404 0.3638 0.20580 0.2605
## 55 1.222 0.5778 0.9425 0.2112 0.10171 0.1252 *
##
## The top 5 variables (out of 55):
## ManufacturingProcess32, BiologicalMaterial06, ManufacturingProcess13, BiologicalMaterial02, ManufacturingProcess17
# Plot RFE variable importance
plot(svmRFE, type = c("g", "o"))
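Rather than hard-coding the selected variable names (as done just below), they can be extracted from the RFE object directly:
# Programmatically pull the RFE-selected variables (top 5 shown)
head(predictors(svmRFE), 5)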
# Subset the training data to only include the selected 5 features
selected_features <- c("ManufacturingProcess32", "BiologicalMaterial06",
"ManufacturingProcess13", "BiologicalMaterial02",
"ManufacturingProcess17")
trainData_selected <- trainData[, selected_features, drop = FALSE]
testData_selected <- testData[, selected_features, drop = FALSE]
# Align Column Names in Test Data
colnames(testData_selected) <- colnames(trainData_selected)
# Train the final SVM model with the selected features
set.seed(123)
final_svm_model <- train(
x = trainData_selected,
y = trainYield,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = expand.grid(sigma = c(0.001, 0.01, 0.1), C = c(1, 2, 4, 8, 16)),
trControl = trainControl(
method = "cv",
number = 5,
allowParallel = TRUE
)
)
svm_predictions <- predict(final_svm_model, newdata = testData_selected)
svm_performance <- postResample(pred = svm_predictions, obs = testYield)
svm_performance |>
kable(caption = "SVM Model Performance") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
  | x |
---|---|
RMSE | 1.2462983 |
Rsquared | 0.5446202 |
MAE | 1.0344052 |
The Support Vector Machine (SVM) models were evaluated using three different approaches to balance predictive accuracy and model simplicity: the original model with all 55 features, a permutation-based feature-selection model with 9 features, and a Recursive Feature Elimination (RFE) model with 5 features.
Original SVM Model (55 features): RMSE = 1.2300, R² = 0.5530, MAE = 1.0424, sigma = 0.01525, C = 16.
Permutation-Based Model (9 features): RMSE = 1.2476, R² = 0.5540, MAE = 1.0196, sigma = 0.0824, C = 4.
RFE Model (5 features): RMSE = 1.2463, R² = 0.5446, MAE = 1.0344, sigma = 0.3666, C = 2.
The original SVM model achieved the lowest RMSE, although the permutation-based model attained a marginally higher R². The RFE model offers a much simpler alternative with competitive performance, making it suitable when fewer predictors are desired.
# Load required library
library(caret)
# Define 10-fold cross-validation
control <- trainControl(
method = "cv", # Cross-validation
number = 10, # 10 folds
savePredictions = "final", # Save final predictions
verboseIter = TRUE # Print progress during training
)
# Train the PLS model
set.seed(123) # Ensure reproducibility
plsFit <- train(
x = trainData, # Predictor variables
y = trainYield, # Response variable
method = "pls", # Partial Least Squares regression
tuneLength = 20, # Explore up to 20 latent components
trControl = control, # Cross-validation control
preProcess = c("center", "scale") # Standardize data
)
## + Fold01: ncomp=20
## - Fold01: ncomp=20
## + Fold02: ncomp=20
## - Fold02: ncomp=20
## + Fold03: ncomp=20
## - Fold03: ncomp=20
## + Fold04: ncomp=20
## - Fold04: ncomp=20
## + Fold05: ncomp=20
## - Fold05: ncomp=20
## + Fold06: ncomp=20
## - Fold06: ncomp=20
## + Fold07: ncomp=20
## - Fold07: ncomp=20
## + Fold08: ncomp=20
## - Fold08: ncomp=20
## + Fold09: ncomp=20
## - Fold09: ncomp=20
## + Fold10: ncomp=20
## - Fold10: ncomp=20
## Aggregating results
## Selecting tuning parameters
## Fitting ncomp = 3 on full training set
# Get the optimal number of latent variables
optimal_ncomp <- plsFit$bestTune$ncomp
resampled_R2 <- max(plsFit$results$Rsquared)
# Print key results
cat("Optimal number of latent variables:", optimal_ncomp, "\n")
## Optimal number of latent variables: 3
cat("Resampled estimate of R²:", round(resampled_R2, 4), "\n")
## Resampled estimate of R²: 0.6032
# Plot cross-validated performance
plot(plsFit)
# Predict on the test set
plsPred <- predict(plsFit, newdata = testData)
# Evaluate performance on the test set
testPerformance <- postResample(pred = plsPred, obs = testYield)
cat("Test Set Performance:\n")
## Test Set Performance:
print(testPerformance)
## RMSE Rsquared MAE
## 1.3891031 0.4610002 1.1561989
# Display variable importance
cat("\nVariable Importance:\n")
##
## Variable Importance:
varImp(plsFit)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 55)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 87.20
## ManufacturingProcess36 87.19
## ManufacturingProcess13 86.58
## ManufacturingProcess09 86.35
## ManufacturingProcess06 69.71
## ManufacturingProcess33 62.42
## BiologicalMaterial06 61.65
## BiologicalMaterial03 60.97
## BiologicalMaterial08 59.90
## BiologicalMaterial02 59.90
## ManufacturingProcess11 56.10
## BiologicalMaterial12 55.03
## BiologicalMaterial11 54.57
## BiologicalMaterial01 52.38
## BiologicalMaterial04 50.61
## ManufacturingProcess28 49.74
## ManufacturingProcess12 46.30
## ManufacturingProcess37 45.97
## BiologicalMaterial10 42.18
# Extract variable importance for SVM and PLS
svmImportance <- varImp(svmModel)$importance
plsImportance <- varImp(plsFit)$importance
# Get the top 10 predictors from each model
top_svm <- rownames(svmImportance)[order(-svmImportance[,1])[1:10]]
top_pls <- rownames(plsImportance)[order(-plsImportance[,1])[1:10]]
# Compare the predictors
common_predictors <- intersect(top_svm, top_pls)
unique_svm <- setdiff(top_svm, top_pls)
unique_pls <- setdiff(top_pls, top_svm)
# Display the results
cat("Common Predictors:\n", common_predictors, "\n")
## Common Predictors:
## ManufacturingProcess32 BiologicalMaterial06 ManufacturingProcess36 BiologicalMaterial03 ManufacturingProcess13 ManufacturingProcess17 ManufacturingProcess09
cat("Unique to SVM:\n", unique_svm, "\n")
## Unique to SVM:
## ManufacturingProcess31 BiologicalMaterial02 BiologicalMaterial12
cat("Unique to PLS:\n", unique_pls, "\n")
## Unique to PLS:
## ManufacturingProcess06 ManufacturingProcess33 BiologicalMaterial08
To explore the relationships between predictors unique to the optimal nonlinear regression model (SVM) and the response variable (yield), we create scatterplots for these predictors. These plots will help evaluate whether these predictors have intuitive or interpretable relationships with the yield.
The plots reveal broadly interpretable relationships between these predictors and yield.
These patterns highlight key areas for optimizing biological inputs and manufacturing processes to enhance yield.
# Identify unique predictors for SVM
unique_svm_predictors <- setdiff(top_svm, common_predictors)
# Create scatterplots for unique SVM predictors
par(mfrow = c(2, 3)) # Adjust layout for multiple plots
for (predictor in unique_svm_predictors) {
plot(
trainData[[predictor]], trainYield,
main = predictor,
xlab = predictor,
ylab = "Yield",
pch = 19, col = "blue"
)
abline(lm(trainYield ~ trainData[[predictor]]), col = "red") # Add regression line
}
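To attach rough numbers to the scatterplot patterns, the marginal correlation of each of these predictors with yield can also be computed:
# Marginal correlations between the unique SVM predictors and yield
sapply(unique_svm_predictors, function(p) cor(trainData[[p]], trainYield))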