Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.

Excercise:7.2

Friedman (1991) introduced several benchmark data sets create by sim- ulation. One of these simulations used the following nonlinear equation to create data: 22 y = 10sin(πx1x2) + 20(x3 − 0.5) + 10x4 + 5x5 + N(0,σ ) where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simula- tion). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(123)
trainingData <- mlbench.friedman1(200, sd = 1) ## We convert the 'x' data from a matrix to a data frame.## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)

## other methods.
## This creates a list with a vector 'y' and a maxtrix
## of predictors 'x'. Also simulates a large test set to 
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

## Example of Tuning a Model is 
knnmodel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnmodel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.413281  0.5080422  2.728246
##    7  3.273414  0.5527050  2.615315
##    9  3.235373  0.5697592  2.590831
##   11  3.162297  0.6032897  2.536368
##   13  3.154606  0.6140107  2.532940
##   15  3.181223  0.6164334  2.556654
##   17  3.179161  0.6300266  2.557801
##   19  3.205484  0.6317257  2.586990
##   21  3.230044  0.6358811  2.607149
##   23  3.250240  0.6423378  2.625998
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnpred <- predict(knnmodel, newdata = testData$x) 
postResample(pred = knnpred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2089763 0.6460475 2.5710330
# Single-Layer Neural Network with 5 Hidden Neurons
# Train a single-layer neural network model
single_layer_nn <- nnet(
  x = trainingData$x,               # Input features from training dataset
  y = trainingData$y,               # Target output values from training dataset
  size = 5,                         # Number of neurons in the hidden layer
  decay = 0.01,                     # Regularization parameter to reduce overfitting
  linout = TRUE,                    # Use linear output for regression
  trace = FALSE,                    # Suppress training output details
  maxit = 500,                      # Maximum number of iterations for convergence
  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)

# Predict using the single-layer model on test dataset
single_layer_predictions <- predict(single_layer_nn, newdata = testData$x)

# Evaluate model performance: accuracy, RMSE, and R-squared
single_layer_performance <- postResample(pred = single_layer_predictions, obs = testData$y)

# Averaged Neural Network Model with Multiple Repeats

# Train an ensemble neural network model with averaged predictions
averaged_nn_model <- avNNet(
  x = trainingData$x,               # Input features from training dataset
  y = trainingData$y,               # Target output values from training dataset
  size = 5,                         # Number of neurons in each hidden layer
  decay = 0.01,                     # Regularization parameter to reduce overfitting
  repeats = 5,                      # Number of model repeats for ensemble averaging
  linout = TRUE,                    # Use linear output for regression
  trace = FALSE,                    # Suppress training output details
  maxit = 500,                      # Maximum number of iterations for convergence
  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)
## Warning: executing %dopar% sequentially: no parallel backend registered
# Predict using the averaged model on test dataset
averaged_nn_predictions <- predict(averaged_nn_model, newdata = testData$x)

# Evaluate performance of the averaged neural network model
averaged_nn_performance <- postResample(pred = averaged_nn_predictions, obs = testData$y)

# Output the performance metrics for both models
list(Single_Layer_NN_Performance = single_layer_performance,
     Averaged_NN_Performance = averaged_nn_performance)
## $Single_Layer_NN_Performance
##      RMSE  Rsquared       MAE 
## 2.6273853 0.7331068 1.9189496 
## 
## $Averaged_NN_Performance
##      RMSE  Rsquared       MAE 
## 1.9057052 0.8526887 1.3993520
# Regular MARS (Multivariate Adaptive Regression Splines) Model

# Train a basic MARS model on the training data
basic_mars_model <- earth(
  x = trainingData$x,               # Input features from training dataset
  y = trainingData$y                # Target output values from training dataset
)

# Display model summary for interpretation
print(basic_mars_model)
## Selected 11 of 19 terms, and 7 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X4, X2, X1, X5, X3, X8, X9, X6-unused, X7-unused, X10-unused
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 2.474848    RSS 396.483    GRSq 0.890458    RSq 0.91137
summary(basic_mars_model)
## Call: earth(x=trainingData$x, y=trainingData$y)
## 
##                coefficients
## (Intercept)       18.726619
## h(0.594343-X1)    -9.932787
## h(0.5849-X2)     -12.604904
## h(0.423464-X3)    11.522748
## h(X3-0.423464)     5.840845
## h(X3-0.816478)    11.190677
## h(0.915273-X4)    -9.792100
## h(X4-0.915273)    30.182420
## h(X5-0.161809)     4.867564
## h(X8-0.812012)     5.856740
## h(X9-0.784228)    -4.059111
## 
## Selected 11 of 19 terms, and 7 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X4, X2, X1, X5, X3, X8, X9, X6-unused, X7-unused, X10-unused
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 2.474848    RSS 396.483    GRSq 0.890458    RSq 0.91137
# Make predictions with the basic MARS model on the test dataset
basic_mars_predictions <- predict(basic_mars_model, newdata = testData$x)

# Evaluate performance of the basic MARS model: accuracy, RMSE, and R-squared
basic_mars_performance <- postResample(pred = basic_mars_predictions, obs = testData$y)

# Setting up a grid for hyperparameter tuning
mars_tuning_grid <- expand.grid(.degree = 1:2, .nprune = 2:16)  # Degrees of interaction and number of terms

# Set seed for reproducibility in cross-validation
set.seed(1234)

# Tuning the MARS model with cross-validation

# Train a MARS model with cross-validation to find optimal hyperparameters
tuned_mars_model <- train(
  x = trainingData$x,               # Input features from training dataset
  y = trainingData$y,               # Target output values from training dataset
  method = "earth",                 # Specifies MARS model
  tuneGrid = mars_tuning_grid,      # Grid of tuning parameters
  trControl = trainControl(method = "cv") # Cross-validation control
)

# Display the tuned model details
print(tuned_mars_model)
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      3.770992  0.3727524  3.1357545
##   1        3      2.949996  0.6160612  2.2725468
##   1        4      2.482922  0.7307027  2.0031607
##   1        5      2.220126  0.7941821  1.8083169
##   1        6      2.089436  0.8195562  1.7001675
##   1        7      1.649547  0.8867299  1.3209519
##   1        8      1.637781  0.8870078  1.3132005
##   1        9      1.622447  0.8888477  1.2978562
##   1       10      1.623192  0.8881811  1.2989605
##   1       11      1.627285  0.8875157  1.3019940
##   1       12      1.620883  0.8878084  1.2959395
##   1       13      1.620883  0.8878084  1.2959395
##   1       14      1.620883  0.8878084  1.2959395
##   1       15      1.620883  0.8878084  1.2959395
##   1       16      1.620883  0.8878084  1.2959395
##   2        2      3.770992  0.3727524  3.1357545
##   2        3      2.949996  0.6160612  2.2725468
##   2        4      2.482922  0.7307027  2.0031607
##   2        5      2.240208  0.7896959  1.8018649
##   2        6      2.089436  0.8195562  1.7001675
##   2        7      1.649547  0.8867299  1.3209519
##   2        8      1.547759  0.9000767  1.2488714
##   2        9      1.455612  0.9095562  1.1686029
##   2       10      1.366332  0.9228232  1.0896493
##   2       11      1.278749  0.9307460  1.0340444
##   2       12      1.234331  0.9340234  0.9872796
##   2       13      1.252947  0.9332905  1.0157019
##   2       14      1.241224  0.9346076  0.9998393
##   2       15      1.248447  0.9329054  0.9986344
##   2       16      1.252950  0.9327962  1.0016825
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
# Make predictions with the tuned MARS model on the test dataset
tuned_mars_predictions <- predict(tuned_mars_model, newdata = testData$x)

# Evaluate performance of the tuned MARS model
tuned_mars_performance <- postResample(pred = tuned_mars_predictions, obs = testData$y)

# Output performance metrics for both the basic and tuned MARS models
list(Basic_MARS_Performance = basic_mars_performance,
     Tuned_MARS_Performance = tuned_mars_performance)
## $Basic_MARS_Performance
##      RMSE  Rsquared       MAE 
## 1.8593934 0.8627649 1.4270936 
## 
## $Tuned_MARS_Performance
##      RMSE  Rsquared       MAE 
## 1.2051604 0.9406026 0.9555366
# Training SVM with Radial Basis Function (RBF) Kernel
# Train an SVM model with an RBF kernel on the training data
svm_rbf_model <- ksvm(
  x = as.matrix(trainingData$x),    # Input features matrix from training dataset
  y = trainingData$y,               # Target output values from training dataset
  kernel = "rbfdot",                # Use radial basis function kernel
  kpar = "automatic",               # Automatically select kernel parameters
  C = 1,                            # Regularization parameter
  epsilon = 0.1                     # Epsilon parameter for regression tolerance
)

# Make predictions with the RBF SVM model on the test dataset
svm_rbf_predictions <- predict(svm_rbf_model, newdata = testData$x)
# Evaluate performance of the RBF SVM model: accuracy, RMSE, and R-squared
svm_rbf_performance <- postResample(pred = svm_rbf_predictions, obs = testData$y)
# Tuning SVM with Radial Kernel, Centering, and Scaling
# Set up an SVM model with cross-validation, centering, and scaling for tuning
tuned_svm_model <- train(
  x = trainingData$x,               # Input features from training dataset
  y = trainingData$y,               # Target output values from training dataset
  method = "svmRadial",             # SVM with radial basis kernel
  preProc = c("center", "scale"),   # Preprocess data by centering and scaling
  tuneLength = 14,                  # Number of hyperparameter combinations to try
  trControl = trainControl(method = "cv") # Cross-validation control
)

# Display the details of the final tuned model
print(tuned_svm_model$finalModel)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 8 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0533123769579932 
## 
## Number of Support Vectors : 151 
## 
## Objective Function Value : -91.8869 
## Training error : 0.015587
# Make predictions with the tuned SVM model on the test dataset
tuned_svm_predictions <- predict(tuned_svm_model, newdata = testData$x)

# Evaluate performance of the tuned SVM model
tuned_svm_performance <- postResample(pred = tuned_svm_predictions, obs = testData$y)

# Output performance metrics for both the basic and tuned SVM models
list(RBF_SVM_Performance = svm_rbf_performance,
     Tuned_SVM_Performance = tuned_svm_performance)
## $RBF_SVM_Performance
##      RMSE  Rsquared       MAE 
## 2.3383507 0.7836887 1.7842886 
## 
## $Tuned_SVM_Performance
##      RMSE  Rsquared       MAE 
## 2.1476907 0.8123075 1.5942410

When comparing the base models of four distinct non-linear methods—Multivariate Adaptive Regression Splines (MARS), Support Vector Machine (SVM), Neural Network (NN), and K-Nearest Neighbor (KNN)—we find that MARS demonstrates the best baseline performance, achieving an RMSE of 1.8 and an R-squared of 0.86. This suggests that MARS is more effective at capturing the underlying data structure than the other models at this initial level.

In the analysis of the tuned models, MARS continues to lead with an RMSE of 1.28 and an R-squared of 0.93, reinforcing its capability to effectively capture non-linear relationships even after hyperparameter tuning. The rankings of the other three models shift slightly: in the base models, SVM performs better than NN, followed by KNN. However, once tuning is applied, the Neural Network marginally outperforms SVM, indicating that NN gains more from the tuning adjustments in this instance.

Regarding feature selection, MARS inherently identifies and prioritizes the most informative predictors (x1-x5), while excluding less relevant features (x7-x10). This built-in capability to select the most predictive variables enhances model accuracy and increases interpretability. The top five predictors—x1 to x5—show high importance scores, whereas x6 is somewhat less significant, and the other predictors contribute only minimally to the model’s performance.

Excercise 7.5:

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

Using data imputation from previous

set.seed(123)
data(ChemicalManufacturingProcess)

# Apply BoxCox, center, and scale the imputation model to get better imputation values
impute_model <- preProcess(ChemicalManufacturingProcess, 
                           method = c("knnImpute", "BoxCox", "center", "scale"))
imputed_chemicals <- predict(impute_model, ChemicalManufacturingProcess)

chem_train_ind <- createDataPartition(imputed_chemicals$Yield, p = 0.8, list = FALSE)
chem_train <- imputed_chemicals[chem_train_ind, ]
chem_test <- imputed_chemicals[-chem_train_ind, ]

Part a:

Which nonlinear regression model gives the optimal resampling and test set performance?

KNN Model

A knn model with k=11 is chosen as optimal and produces RMSE=0.6873036 and R2=0.5981203

set.seed(123)

knn_chem_model <- train(Yield ~ .,
                  data = chem_train,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
knn_chem_model
## k-Nearest Neighbors 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.7588924  0.4580137  0.5990882
##    7  0.7615983  0.4534715  0.6029981
##    9  0.7650739  0.4518594  0.6077026
##   11  0.7728976  0.4401068  0.6162233
##   13  0.7776275  0.4338687  0.6193723
##   15  0.7836841  0.4270089  0.6289283
##   17  0.7863849  0.4265546  0.6308605
##   19  0.7890962  0.4255567  0.6320916
##   21  0.7931773  0.4238605  0.6376813
##   23  0.7969760  0.4228887  0.6389923
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knn_chem_pred <- predict(knn_chem_model, newdata = chem_test)
postResample(pred = knn_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.7617649 0.4383261 0.6421287
plot(knn_chem_model)

Neural Network The generated neural network has an RMSE=0.6872073 and R2=0.5427729

set.seed(123)
gen_data <- mlbench.friedman1(5000, sd = 1)
gen_df <- data.frame(
  y = gen_data$y,
  x = gen_data$x
)
names(gen_df) <- c("y", paste0("x", 1:10))
head(gen_df)
##           y        x1         x2        x3         x4        x5        x6
## 1 13.473644 0.2875775 0.24862988 0.3105917 0.63683160 0.9911234 0.5040483
## 2  8.287567 0.7883051 0.98946318 0.3245201 0.09213944 0.3022307 0.8851688
## 3 21.378106 0.4089769 0.71712190 0.8702542 0.79230402 0.4337590 0.9408723
## 4 21.051258 0.8830174 0.65172829 0.3286738 0.94234440 0.1605209 0.8034121
## 5 16.675697 0.9404673 0.24055621 0.1257012 0.30269462 0.8230267 0.6601403
## 6  6.724787 0.0455565 0.08679275 0.3562214 0.56300805 0.2080906 0.3284019
##          x7        x8        x9        x10
## 1 0.0878923 0.2144076 0.2014940 0.60919692
## 2 0.7796115 0.5172579 0.9781903 0.02035448
## 3 0.2811673 0.3692681 0.4127125 0.68641761
## 4 0.5496990 0.8934657 0.4753125 0.39645894
## 5 0.1946069 0.7101645 0.0177104 0.61479040
## 6 0.3394398 0.1253853 0.3680780 0.26499289
train_ind <- createDataPartition(gen_df$y, p = 0.8, list = FALSE)
train_data <- gen_df[train_ind, ]
test_data <- gen_df[-train_ind, ]

control <- trainControl(
  method = "cv",
  preProcOptions = list(thresh = 0.75, cutoff = 0.75),
  allowParallel = TRUE
)

grid <- expand.grid(decay = c(0, 0.01, 0.1),
                    size = c(1:10),
                    bag = FALSE)
set.seed(123)
nn_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "avNNet",
  tuneGrid = grid,
  trControl = control,
  preProc = c("center", "scale", "corr"),
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)
nn_chem_model
## Model Averaged Neural Network 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (37), scaled (37), remove (20) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE       Rsquared   MAE      
##   0.00    1    0.7100569  0.5563524  0.5440596
##   0.00    2    2.0450271  0.5486189  1.3589678
##   0.00    3    0.7419776  0.5708658  0.5739003
##   0.00    4    0.8273727  0.4831412  0.6586579
##   0.00    5    0.8559324  0.4837055  0.7131706
##   0.00    6    0.7872071  0.5146799  0.6527727
##   0.00    7    0.7386397  0.5446433  0.6051741
##   0.00    8    0.6901780  0.5995850  0.5718239
##   0.00    9    0.7283031  0.5874339  0.5809482
##   0.00   10    0.7381303  0.5385086  0.5935150
##   0.01    1    0.7450630  0.5233907  0.5843594
##   0.01    2    0.7067621  0.5684759  0.5659910
##   0.01    3    0.7004152  0.6154963  0.5842734
##   0.01    4    0.7254184  0.5950158  0.5896892
##   0.01    5    0.7319819  0.5525507  0.6115600
##   0.01    6    0.7334654  0.5775806  0.5933141
##   0.01    7    0.7324740  0.5526832  0.5807001
##   0.01    8    0.6929219  0.5865870  0.5488956
##   0.01    9    0.6647295  0.6264209  0.5229653
##   0.01   10    0.7059449  0.5783273  0.5510736
##   0.10    1    0.7772416  0.4897152  0.6020630
##   0.10    2    0.6650758  0.6285155  0.5360095
##   0.10    3    0.7178038  0.5929767  0.5806844
##   0.10    4    0.7001799  0.6079494  0.5494031
##   0.10    5    0.6865642  0.6050420  0.5488292
##   0.10    6    0.6551998  0.6319296  0.5254472
##   0.10    7    0.7088824  0.5877615  0.5640455
##   0.10    8    0.6802457  0.6101034  0.5470427
##   0.10    9    0.6697169  0.6290221  0.5366376
##   0.10   10    0.6656332  0.6282083  0.5294630
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 6, decay = 0.1 and bag = FALSE.
nn_chem_pred <- predict(nn_chem_model, newdata = chem_test)
postResample(pred = nn_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.7628976 0.4293435 0.6393363
plot(nn_chem_model)

MARS

set.seed(123)

mars_grid <- expand.grid(degree = 1:5, nprune = 2:15)

mars_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "earth",
  trControl = control,
  tuneGrid = mars_grid
)
mars_chem_model
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.7621118  0.4906890  0.6064278
##   1        3      0.6323257  0.6171623  0.5128873
##   1        4      0.6583632  0.5969889  0.5320321
##   1        5      0.6519735  0.6165006  0.5216628
##   1        6      0.6877929  0.5728255  0.5407436
##   1        7      0.7080069  0.5607620  0.5583900
##   1        8      0.6818369  0.5924789  0.5535884
##   1        9      0.6735862  0.5963033  0.5541177
##   1       10      0.6807735  0.5910463  0.5477761
##   1       11      0.7024670  0.5768218  0.5627467
##   1       12      0.6969277  0.5843421  0.5600702
##   1       13      0.7079254  0.5731459  0.5711444
##   1       14      0.6937551  0.5786094  0.5620798
##   1       15      0.6908159  0.5807306  0.5603381
##   2        2      0.7621413  0.4908307  0.6070541
##   2        3      0.7354225  0.5165008  0.5824443
##   2        4      0.7276720  0.5235000  0.5756566
##   2        5      0.7501465  0.5071710  0.5945529
##   2        6      0.6930557  0.5698183  0.5454546
##   2        7      0.7271372  0.5310383  0.5670769
##   2        8      0.6832487  0.5738050  0.5360324
##   2        9      0.6855948  0.5638344  0.5339212
##   2       10      0.7744518  0.5075680  0.5758361
##   2       11      0.8312710  0.4901822  0.5982835
##   2       12      0.8445185  0.4847349  0.6036923
##   2       13      0.8566268  0.4853131  0.6161969
##   2       14      0.8603744  0.4935273  0.6132537
##   2       15      0.8912680  0.4814599  0.6291927
##   3        2      0.7621118  0.4906890  0.6064278
##   3        3      0.7424477  0.4984068  0.5852355
##   3        4      0.6861449  0.5716586  0.5430798
##   3        5      0.6846093  0.5850094  0.5390157
##   3        6      0.7068949  0.5463858  0.5799532
##   3        7      0.6645660  0.6031164  0.5398710
##   3        8      0.7135935  0.5286837  0.5639673
##   3        9      0.7067804  0.5411635  0.5643315
##   3       10      0.7124391  0.5440297  0.5651158
##   3       11      0.6987911  0.5796929  0.5539759
##   3       12      0.6942083  0.5820513  0.5512771
##   3       13      0.6882416  0.5879539  0.5420081
##   3       14      0.6908933  0.5969142  0.5467769
##   3       15      0.6843873  0.5937411  0.5410936
##   4        2      0.7621118  0.4906890  0.6064278
##   4        3      0.7050915  0.5218064  0.5681924
##   4        4      0.6572655  0.5960170  0.5280621
##   4        5      0.6852291  0.5818228  0.5398516
##   4        6      0.6926565  0.5770203  0.5520966
##   4        7      0.6887199  0.5856274  0.5437541
##   4        8      0.7175431  0.5509878  0.5562073
##   4        9      0.6864761  0.5917609  0.5328572
##   4       10      0.6852887  0.5938170  0.5315121
##   4       11      0.7146774  0.5763203  0.5442798
##   4       12      0.7365103  0.5716742  0.5588410
##   4       13      0.7171026  0.5823801  0.5434432
##   4       14      0.7518837  0.5691038  0.5639828
##   4       15      0.7277940  0.5910639  0.5618234
##   5        2      0.7621118  0.4906890  0.6064278
##   5        3      0.7198190  0.5185049  0.5858252
##   5        4      0.6755461  0.5892435  0.5439119
##   5        5      0.6882159  0.5822779  0.5461870
##   5        6      0.6949300  0.5776154  0.5541851
##   5        7      0.6920664  0.5910170  0.5450179
##   5        8      0.7248814  0.5472777  0.5667749
##   5        9      0.7000950  0.5773643  0.5446700
##   5       10      0.6850288  0.6024703  0.5288247
##   5       11      0.7141039  0.5820812  0.5406995
##   5       12      0.7388682  0.5735372  0.5598934
##   5       13      0.7173654  0.5839099  0.5463137
##   5       14      0.7493001  0.5723814  0.5634080
##   5       15      0.7256314  0.5938435  0.5634040
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
mars_chem_pred <- predict(mars_chem_model, newdata = chem_test)
postResample(pred = mars_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6743883 0.5467767 0.5429865
varImp(mars_chem_model)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32     100
## ManufacturingProcess09       0
plot(mars_chem_model)

Our MARS model has an RMSE=0.6162499 and R2=0.6335505 Interestingly there appear to only be two variables of import, and of those only ManufacturingProcess32 seems to have any weight.

SVM

set.seed(123)
svm_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "svmRadial",
  preProc = c("center", "scale"),
  trControl = control,
  tuneLength = 15
)
svm_chem_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7203398  0.5731679  0.5837392
##      0.50  0.6617693  0.6244659  0.5297320
##      1.00  0.6188577  0.6786222  0.4866437
##      2.00  0.5995705  0.6974463  0.4753289
##      4.00  0.5973357  0.6908469  0.4788951
##      8.00  0.5860151  0.7009013  0.4763383
##     16.00  0.5866685  0.7000360  0.4772034
##     32.00  0.5866685  0.7000360  0.4772034
##     64.00  0.5866685  0.7000360  0.4772034
##    128.00  0.5866685  0.7000360  0.4772034
##    256.00  0.5866685  0.7000360  0.4772034
##    512.00  0.5866685  0.7000360  0.4772034
##   1024.00  0.5866685  0.7000360  0.4772034
##   2048.00  0.5866685  0.7000360  0.4772034
##   4096.00  0.5866685  0.7000360  0.4772034
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0140845
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0140845 and C = 8.
svm_chem_pred <- predict(svm_chem_model, newdata = chem_test)
postResample(pred = svm_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6462616 0.5850084 0.5414635
plot(svm_chem_model)

Part b:

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

varImp(svm_chem_model)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     95.22
## BiologicalMaterial03     87.00
## ManufacturingProcess13   83.26
## ManufacturingProcess17   82.65
## ManufacturingProcess09   82.50
## ManufacturingProcess36   81.25
## ManufacturingProcess31   80.14
## BiologicalMaterial02     77.84
## BiologicalMaterial12     75.18
## ManufacturingProcess06   73.94
## BiologicalMaterial11     61.74
## ManufacturingProcess33   59.39
## ManufacturingProcess11   58.23
## BiologicalMaterial04     57.91
## ManufacturingProcess29   54.86
## BiologicalMaterial09     50.07
## BiologicalMaterial08     48.78
## BiologicalMaterial01     46.96
## ManufacturingProcess30   44.87

The best performing model was the SVM model, and below are the most important predictors Of the top ten predictors seven are manufacturing process variables. The top ten predictors for our SVM model are identical to the top ten for the enet model which was our optimal linear model. Not only is the order identical, but so are the weightings.

Part c:

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

# Calculate variable importance
imp <- varImp(svm_chem_model)

# Get top predictors
top_predictors <- rownames(imp$importance)[order(imp$importance$Overall, decreasing = TRUE)][1:5]

# Generate partial dependence data for all top predictors
partial_data <- lapply(top_predictors, function(pred) {
  partial(svm_chem_model, pred.var = pred, train = chem_train, grid.resolution = 50)
})

# Create ggplot for each partial dependence with adjusted text size
plots <- lapply(seq_along(partial_data), function(i) {
  ggplot(partial_data[[i]], aes_string(x = top_predictors[i], y = "yhat")) +
    geom_line() +
    labs(title = paste("Partial Dependence on", top_predictors[i]),
         x = top_predictors[i],
         y = "Partial Effect on Response") +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 10),  # Title size
      axis.title.x = element_text(size = 8),  # X-axis label size
      axis.title.y = element_text(size = 8),  # Y-axis label size
      axis.text = element_text(size = 7)       # Axis text size
    )
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange and display all plots in a grid
do.call(grid.arrange, plots)

Given that our best model exhibited the same predictor importance as the optimal linear model (which raises some concerns), we will focus on the top five predictors. The plots reveal that the first three predictors are positively correlated with yield, while the last two are negatively correlated. Most relationships appear to be linear, although a slight sinusoidal pattern is noticeable in the first three predictors, particularly in ManufacturingProcess09. The last two predictors demonstrate a decline at the end of the relationship plot, suggesting that their importance decreases sharply beyond a certain threshold.