Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
Friedman (1991) introduced several benchmark data sets create by sim- ulation. One of these simulations used the following nonlinear equation to create data: 22 y = 10sin(πx1x2) + 20(x3 − 0.5) + 10x4 + 5x5 + N(0,σ ) where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simula- tion). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(123)
trainingData <- mlbench.friedman1(200, sd = 1) ## We convert the 'x' data from a matrix to a data frame.## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)
## other methods.
## This creates a list with a vector 'y' and a maxtrix
## of predictors 'x'. Also simulates a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
## Example of Tuning a Model is
knnmodel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnmodel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.413281 0.5080422 2.728246
## 7 3.273414 0.5527050 2.615315
## 9 3.235373 0.5697592 2.590831
## 11 3.162297 0.6032897 2.536368
## 13 3.154606 0.6140107 2.532940
## 15 3.181223 0.6164334 2.556654
## 17 3.179161 0.6300266 2.557801
## 19 3.205484 0.6317257 2.586990
## 21 3.230044 0.6358811 2.607149
## 23 3.250240 0.6423378 2.625998
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnpred <- predict(knnmodel, newdata = testData$x)
postResample(pred = knnpred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2089763 0.6460475 2.5710330
# Single-Layer Neural Network with 5 Hidden Neurons
# Train a single-layer neural network model
single_layer_nn <- nnet(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
size = 5, # Number of neurons in the hidden layer
decay = 0.01, # Regularization parameter to reduce overfitting
linout = TRUE, # Use linear output for regression
trace = FALSE, # Suppress training output details
maxit = 500, # Maximum number of iterations for convergence
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)
# Predict using the single-layer model on test dataset
single_layer_predictions <- predict(single_layer_nn, newdata = testData$x)
# Evaluate model performance: accuracy, RMSE, and R-squared
single_layer_performance <- postResample(pred = single_layer_predictions, obs = testData$y)
# Averaged Neural Network Model with Multiple Repeats
# Train an ensemble neural network model with averaged predictions
averaged_nn_model <- avNNet(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
size = 5, # Number of neurons in each hidden layer
decay = 0.01, # Regularization parameter to reduce overfitting
repeats = 5, # Number of model repeats for ensemble averaging
linout = TRUE, # Use linear output for regression
trace = FALSE, # Suppress training output details
maxit = 500, # Maximum number of iterations for convergence
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)
## Warning: executing %dopar% sequentially: no parallel backend registered
# Predict using the averaged model on test dataset
averaged_nn_predictions <- predict(averaged_nn_model, newdata = testData$x)
# Evaluate performance of the averaged neural network model
averaged_nn_performance <- postResample(pred = averaged_nn_predictions, obs = testData$y)
# Output the performance metrics for both models
list(Single_Layer_NN_Performance = single_layer_performance,
Averaged_NN_Performance = averaged_nn_performance)
## $Single_Layer_NN_Performance
## RMSE Rsquared MAE
## 2.6273853 0.7331068 1.9189496
##
## $Averaged_NN_Performance
## RMSE Rsquared MAE
## 1.9057052 0.8526887 1.3993520
# Regular MARS (Multivariate Adaptive Regression Splines) Model
# Train a basic MARS model on the training data
basic_mars_model <- earth(
x = trainingData$x, # Input features from training dataset
y = trainingData$y # Target output values from training dataset
)
# Display model summary for interpretation
print(basic_mars_model)
## Selected 11 of 19 terms, and 7 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X4, X2, X1, X5, X3, X8, X9, X6-unused, X7-unused, X10-unused
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 2.474848 RSS 396.483 GRSq 0.890458 RSq 0.91137
summary(basic_mars_model)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.726619
## h(0.594343-X1) -9.932787
## h(0.5849-X2) -12.604904
## h(0.423464-X3) 11.522748
## h(X3-0.423464) 5.840845
## h(X3-0.816478) 11.190677
## h(0.915273-X4) -9.792100
## h(X4-0.915273) 30.182420
## h(X5-0.161809) 4.867564
## h(X8-0.812012) 5.856740
## h(X9-0.784228) -4.059111
##
## Selected 11 of 19 terms, and 7 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X4, X2, X1, X5, X3, X8, X9, X6-unused, X7-unused, X10-unused
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 2.474848 RSS 396.483 GRSq 0.890458 RSq 0.91137
# Make predictions with the basic MARS model on the test dataset
basic_mars_predictions <- predict(basic_mars_model, newdata = testData$x)
# Evaluate performance of the basic MARS model: accuracy, RMSE, and R-squared
basic_mars_performance <- postResample(pred = basic_mars_predictions, obs = testData$y)
# Setting up a grid for hyperparameter tuning
mars_tuning_grid <- expand.grid(.degree = 1:2, .nprune = 2:16) # Degrees of interaction and number of terms
# Set seed for reproducibility in cross-validation
set.seed(1234)
# Tuning the MARS model with cross-validation
# Train a MARS model with cross-validation to find optimal hyperparameters
tuned_mars_model <- train(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
method = "earth", # Specifies MARS model
tuneGrid = mars_tuning_grid, # Grid of tuning parameters
trControl = trainControl(method = "cv") # Cross-validation control
)
# Display the tuned model details
print(tuned_mars_model)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 3.770992 0.3727524 3.1357545
## 1 3 2.949996 0.6160612 2.2725468
## 1 4 2.482922 0.7307027 2.0031607
## 1 5 2.220126 0.7941821 1.8083169
## 1 6 2.089436 0.8195562 1.7001675
## 1 7 1.649547 0.8867299 1.3209519
## 1 8 1.637781 0.8870078 1.3132005
## 1 9 1.622447 0.8888477 1.2978562
## 1 10 1.623192 0.8881811 1.2989605
## 1 11 1.627285 0.8875157 1.3019940
## 1 12 1.620883 0.8878084 1.2959395
## 1 13 1.620883 0.8878084 1.2959395
## 1 14 1.620883 0.8878084 1.2959395
## 1 15 1.620883 0.8878084 1.2959395
## 1 16 1.620883 0.8878084 1.2959395
## 2 2 3.770992 0.3727524 3.1357545
## 2 3 2.949996 0.6160612 2.2725468
## 2 4 2.482922 0.7307027 2.0031607
## 2 5 2.240208 0.7896959 1.8018649
## 2 6 2.089436 0.8195562 1.7001675
## 2 7 1.649547 0.8867299 1.3209519
## 2 8 1.547759 0.9000767 1.2488714
## 2 9 1.455612 0.9095562 1.1686029
## 2 10 1.366332 0.9228232 1.0896493
## 2 11 1.278749 0.9307460 1.0340444
## 2 12 1.234331 0.9340234 0.9872796
## 2 13 1.252947 0.9332905 1.0157019
## 2 14 1.241224 0.9346076 0.9998393
## 2 15 1.248447 0.9329054 0.9986344
## 2 16 1.252950 0.9327962 1.0016825
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
# Make predictions with the tuned MARS model on the test dataset
tuned_mars_predictions <- predict(tuned_mars_model, newdata = testData$x)
# Evaluate performance of the tuned MARS model
tuned_mars_performance <- postResample(pred = tuned_mars_predictions, obs = testData$y)
# Output performance metrics for both the basic and tuned MARS models
list(Basic_MARS_Performance = basic_mars_performance,
Tuned_MARS_Performance = tuned_mars_performance)
## $Basic_MARS_Performance
## RMSE Rsquared MAE
## 1.8593934 0.8627649 1.4270936
##
## $Tuned_MARS_Performance
## RMSE Rsquared MAE
## 1.2051604 0.9406026 0.9555366
# Training SVM with Radial Basis Function (RBF) Kernel
# Train an SVM model with an RBF kernel on the training data
svm_rbf_model <- ksvm(
x = as.matrix(trainingData$x), # Input features matrix from training dataset
y = trainingData$y, # Target output values from training dataset
kernel = "rbfdot", # Use radial basis function kernel
kpar = "automatic", # Automatically select kernel parameters
C = 1, # Regularization parameter
epsilon = 0.1 # Epsilon parameter for regression tolerance
)
# Make predictions with the RBF SVM model on the test dataset
svm_rbf_predictions <- predict(svm_rbf_model, newdata = testData$x)
# Evaluate performance of the RBF SVM model: accuracy, RMSE, and R-squared
svm_rbf_performance <- postResample(pred = svm_rbf_predictions, obs = testData$y)
# Tuning SVM with Radial Kernel, Centering, and Scaling
# Set up an SVM model with cross-validation, centering, and scaling for tuning
tuned_svm_model <- train(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
method = "svmRadial", # SVM with radial basis kernel
preProc = c("center", "scale"), # Preprocess data by centering and scaling
tuneLength = 14, # Number of hyperparameter combinations to try
trControl = trainControl(method = "cv") # Cross-validation control
)
# Display the details of the final tuned model
print(tuned_svm_model$finalModel)
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0533123769579932
##
## Number of Support Vectors : 151
##
## Objective Function Value : -91.8869
## Training error : 0.015587
# Make predictions with the tuned SVM model on the test dataset
tuned_svm_predictions <- predict(tuned_svm_model, newdata = testData$x)
# Evaluate performance of the tuned SVM model
tuned_svm_performance <- postResample(pred = tuned_svm_predictions, obs = testData$y)
# Output performance metrics for both the basic and tuned SVM models
list(RBF_SVM_Performance = svm_rbf_performance,
Tuned_SVM_Performance = tuned_svm_performance)
## $RBF_SVM_Performance
## RMSE Rsquared MAE
## 2.3383507 0.7836887 1.7842886
##
## $Tuned_SVM_Performance
## RMSE Rsquared MAE
## 2.1476907 0.8123075 1.5942410
When comparing the base models of four distinct non-linear methods—Multivariate Adaptive Regression Splines (MARS), Support Vector Machine (SVM), Neural Network (NN), and K-Nearest Neighbor (KNN)—we find that MARS demonstrates the best baseline performance, achieving an RMSE of 1.8 and an R-squared of 0.86. This suggests that MARS is more effective at capturing the underlying data structure than the other models at this initial level.
In the analysis of the tuned models, MARS continues to lead with an RMSE of 1.28 and an R-squared of 0.93, reinforcing its capability to effectively capture non-linear relationships even after hyperparameter tuning. The rankings of the other three models shift slightly: in the base models, SVM performs better than NN, followed by KNN. However, once tuning is applied, the Neural Network marginally outperforms SVM, indicating that NN gains more from the tuning adjustments in this instance.
Regarding feature selection, MARS inherently identifies and prioritizes the most informative predictors (x1-x5), while excluding less relevant features (x7-x10). This built-in capability to select the most predictive variables enhances model accuracy and increases interpretability. The top five predictors—x1 to x5—show high importance scores, whereas x6 is somewhat less significant, and the other predictors contribute only minimally to the model’s performance.
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Using data imputation from previous
set.seed(123)
data(ChemicalManufacturingProcess)
# Apply BoxCox, center, and scale the imputation model to get better imputation values
impute_model <- preProcess(ChemicalManufacturingProcess,
method = c("knnImpute", "BoxCox", "center", "scale"))
imputed_chemicals <- predict(impute_model, ChemicalManufacturingProcess)
chem_train_ind <- createDataPartition(imputed_chemicals$Yield, p = 0.8, list = FALSE)
chem_train <- imputed_chemicals[chem_train_ind, ]
chem_test <- imputed_chemicals[-chem_train_ind, ]
Which nonlinear regression model gives the optimal resampling and test set performance?
KNN Model
A knn model with k=11 is chosen as optimal and produces RMSE=0.6873036 and R2=0.5981203
set.seed(123)
knn_chem_model <- train(Yield ~ .,
data = chem_train,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
knn_chem_model
## k-Nearest Neighbors
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7588924 0.4580137 0.5990882
## 7 0.7615983 0.4534715 0.6029981
## 9 0.7650739 0.4518594 0.6077026
## 11 0.7728976 0.4401068 0.6162233
## 13 0.7776275 0.4338687 0.6193723
## 15 0.7836841 0.4270089 0.6289283
## 17 0.7863849 0.4265546 0.6308605
## 19 0.7890962 0.4255567 0.6320916
## 21 0.7931773 0.4238605 0.6376813
## 23 0.7969760 0.4228887 0.6389923
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knn_chem_pred <- predict(knn_chem_model, newdata = chem_test)
postResample(pred = knn_chem_pred, obs = chem_test$Yield)
## RMSE Rsquared MAE
## 0.7617649 0.4383261 0.6421287
plot(knn_chem_model)
Neural Network The generated neural network has an
RMSE=0.6872073 and R2=0.5427729
set.seed(123)
gen_data <- mlbench.friedman1(5000, sd = 1)
gen_df <- data.frame(
y = gen_data$y,
x = gen_data$x
)
names(gen_df) <- c("y", paste0("x", 1:10))
head(gen_df)
## y x1 x2 x3 x4 x5 x6
## 1 13.473644 0.2875775 0.24862988 0.3105917 0.63683160 0.9911234 0.5040483
## 2 8.287567 0.7883051 0.98946318 0.3245201 0.09213944 0.3022307 0.8851688
## 3 21.378106 0.4089769 0.71712190 0.8702542 0.79230402 0.4337590 0.9408723
## 4 21.051258 0.8830174 0.65172829 0.3286738 0.94234440 0.1605209 0.8034121
## 5 16.675697 0.9404673 0.24055621 0.1257012 0.30269462 0.8230267 0.6601403
## 6 6.724787 0.0455565 0.08679275 0.3562214 0.56300805 0.2080906 0.3284019
## x7 x8 x9 x10
## 1 0.0878923 0.2144076 0.2014940 0.60919692
## 2 0.7796115 0.5172579 0.9781903 0.02035448
## 3 0.2811673 0.3692681 0.4127125 0.68641761
## 4 0.5496990 0.8934657 0.4753125 0.39645894
## 5 0.1946069 0.7101645 0.0177104 0.61479040
## 6 0.3394398 0.1253853 0.3680780 0.26499289
train_ind <- createDataPartition(gen_df$y, p = 0.8, list = FALSE)
train_data <- gen_df[train_ind, ]
test_data <- gen_df[-train_ind, ]
control <- trainControl(
method = "cv",
preProcOptions = list(thresh = 0.75, cutoff = 0.75),
allowParallel = TRUE
)
grid <- expand.grid(decay = c(0, 0.01, 0.1),
size = c(1:10),
bag = FALSE)
set.seed(123)
nn_chem_model <- train(
Yield ~ .,
data = chem_train,
method = "avNNet",
tuneGrid = grid,
trControl = control,
preProc = c("center", "scale", "corr"),
linout = TRUE,
trace = FALSE,
maxit = 500
)
nn_chem_model
## Model Averaged Neural Network
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (37), scaled (37), remove (20)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.7100569 0.5563524 0.5440596
## 0.00 2 2.0450271 0.5486189 1.3589678
## 0.00 3 0.7419776 0.5708658 0.5739003
## 0.00 4 0.8273727 0.4831412 0.6586579
## 0.00 5 0.8559324 0.4837055 0.7131706
## 0.00 6 0.7872071 0.5146799 0.6527727
## 0.00 7 0.7386397 0.5446433 0.6051741
## 0.00 8 0.6901780 0.5995850 0.5718239
## 0.00 9 0.7283031 0.5874339 0.5809482
## 0.00 10 0.7381303 0.5385086 0.5935150
## 0.01 1 0.7450630 0.5233907 0.5843594
## 0.01 2 0.7067621 0.5684759 0.5659910
## 0.01 3 0.7004152 0.6154963 0.5842734
## 0.01 4 0.7254184 0.5950158 0.5896892
## 0.01 5 0.7319819 0.5525507 0.6115600
## 0.01 6 0.7334654 0.5775806 0.5933141
## 0.01 7 0.7324740 0.5526832 0.5807001
## 0.01 8 0.6929219 0.5865870 0.5488956
## 0.01 9 0.6647295 0.6264209 0.5229653
## 0.01 10 0.7059449 0.5783273 0.5510736
## 0.10 1 0.7772416 0.4897152 0.6020630
## 0.10 2 0.6650758 0.6285155 0.5360095
## 0.10 3 0.7178038 0.5929767 0.5806844
## 0.10 4 0.7001799 0.6079494 0.5494031
## 0.10 5 0.6865642 0.6050420 0.5488292
## 0.10 6 0.6551998 0.6319296 0.5254472
## 0.10 7 0.7088824 0.5877615 0.5640455
## 0.10 8 0.6802457 0.6101034 0.5470427
## 0.10 9 0.6697169 0.6290221 0.5366376
## 0.10 10 0.6656332 0.6282083 0.5294630
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 6, decay = 0.1 and bag = FALSE.
nn_chem_pred <- predict(nn_chem_model, newdata = chem_test)
postResample(pred = nn_chem_pred, obs = chem_test$Yield)
## RMSE Rsquared MAE
## 0.7628976 0.4293435 0.6393363
plot(nn_chem_model)
MARS
set.seed(123)
mars_grid <- expand.grid(degree = 1:5, nprune = 2:15)
mars_chem_model <- train(
Yield ~ .,
data = chem_train,
method = "earth",
trControl = control,
tuneGrid = mars_grid
)
mars_chem_model
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7621118 0.4906890 0.6064278
## 1 3 0.6323257 0.6171623 0.5128873
## 1 4 0.6583632 0.5969889 0.5320321
## 1 5 0.6519735 0.6165006 0.5216628
## 1 6 0.6877929 0.5728255 0.5407436
## 1 7 0.7080069 0.5607620 0.5583900
## 1 8 0.6818369 0.5924789 0.5535884
## 1 9 0.6735862 0.5963033 0.5541177
## 1 10 0.6807735 0.5910463 0.5477761
## 1 11 0.7024670 0.5768218 0.5627467
## 1 12 0.6969277 0.5843421 0.5600702
## 1 13 0.7079254 0.5731459 0.5711444
## 1 14 0.6937551 0.5786094 0.5620798
## 1 15 0.6908159 0.5807306 0.5603381
## 2 2 0.7621413 0.4908307 0.6070541
## 2 3 0.7354225 0.5165008 0.5824443
## 2 4 0.7276720 0.5235000 0.5756566
## 2 5 0.7501465 0.5071710 0.5945529
## 2 6 0.6930557 0.5698183 0.5454546
## 2 7 0.7271372 0.5310383 0.5670769
## 2 8 0.6832487 0.5738050 0.5360324
## 2 9 0.6855948 0.5638344 0.5339212
## 2 10 0.7744518 0.5075680 0.5758361
## 2 11 0.8312710 0.4901822 0.5982835
## 2 12 0.8445185 0.4847349 0.6036923
## 2 13 0.8566268 0.4853131 0.6161969
## 2 14 0.8603744 0.4935273 0.6132537
## 2 15 0.8912680 0.4814599 0.6291927
## 3 2 0.7621118 0.4906890 0.6064278
## 3 3 0.7424477 0.4984068 0.5852355
## 3 4 0.6861449 0.5716586 0.5430798
## 3 5 0.6846093 0.5850094 0.5390157
## 3 6 0.7068949 0.5463858 0.5799532
## 3 7 0.6645660 0.6031164 0.5398710
## 3 8 0.7135935 0.5286837 0.5639673
## 3 9 0.7067804 0.5411635 0.5643315
## 3 10 0.7124391 0.5440297 0.5651158
## 3 11 0.6987911 0.5796929 0.5539759
## 3 12 0.6942083 0.5820513 0.5512771
## 3 13 0.6882416 0.5879539 0.5420081
## 3 14 0.6908933 0.5969142 0.5467769
## 3 15 0.6843873 0.5937411 0.5410936
## 4 2 0.7621118 0.4906890 0.6064278
## 4 3 0.7050915 0.5218064 0.5681924
## 4 4 0.6572655 0.5960170 0.5280621
## 4 5 0.6852291 0.5818228 0.5398516
## 4 6 0.6926565 0.5770203 0.5520966
## 4 7 0.6887199 0.5856274 0.5437541
## 4 8 0.7175431 0.5509878 0.5562073
## 4 9 0.6864761 0.5917609 0.5328572
## 4 10 0.6852887 0.5938170 0.5315121
## 4 11 0.7146774 0.5763203 0.5442798
## 4 12 0.7365103 0.5716742 0.5588410
## 4 13 0.7171026 0.5823801 0.5434432
## 4 14 0.7518837 0.5691038 0.5639828
## 4 15 0.7277940 0.5910639 0.5618234
## 5 2 0.7621118 0.4906890 0.6064278
## 5 3 0.7198190 0.5185049 0.5858252
## 5 4 0.6755461 0.5892435 0.5439119
## 5 5 0.6882159 0.5822779 0.5461870
## 5 6 0.6949300 0.5776154 0.5541851
## 5 7 0.6920664 0.5910170 0.5450179
## 5 8 0.7248814 0.5472777 0.5667749
## 5 9 0.7000950 0.5773643 0.5446700
## 5 10 0.6850288 0.6024703 0.5288247
## 5 11 0.7141039 0.5820812 0.5406995
## 5 12 0.7388682 0.5735372 0.5598934
## 5 13 0.7173654 0.5839099 0.5463137
## 5 14 0.7493001 0.5723814 0.5634080
## 5 15 0.7256314 0.5938435 0.5634040
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
mars_chem_pred <- predict(mars_chem_model, newdata = chem_test)
postResample(pred = mars_chem_pred, obs = chem_test$Yield)
## RMSE Rsquared MAE
## 0.6743883 0.5467767 0.5429865
varImp(mars_chem_model)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100
## ManufacturingProcess09 0
plot(mars_chem_model)
Our MARS model has an RMSE=0.6162499 and R2=0.6335505 Interestingly
there appear to only be two variables of import, and of those only
ManufacturingProcess32 seems to have any weight.
SVM
set.seed(123)
svm_chem_model <- train(
Yield ~ .,
data = chem_train,
method = "svmRadial",
preProc = c("center", "scale"),
trControl = control,
tuneLength = 15
)
svm_chem_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.7203398 0.5731679 0.5837392
## 0.50 0.6617693 0.6244659 0.5297320
## 1.00 0.6188577 0.6786222 0.4866437
## 2.00 0.5995705 0.6974463 0.4753289
## 4.00 0.5973357 0.6908469 0.4788951
## 8.00 0.5860151 0.7009013 0.4763383
## 16.00 0.5866685 0.7000360 0.4772034
## 32.00 0.5866685 0.7000360 0.4772034
## 64.00 0.5866685 0.7000360 0.4772034
## 128.00 0.5866685 0.7000360 0.4772034
## 256.00 0.5866685 0.7000360 0.4772034
## 512.00 0.5866685 0.7000360 0.4772034
## 1024.00 0.5866685 0.7000360 0.4772034
## 2048.00 0.5866685 0.7000360 0.4772034
## 4096.00 0.5866685 0.7000360 0.4772034
##
## Tuning parameter 'sigma' was held constant at a value of 0.0140845
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0140845 and C = 8.
svm_chem_pred <- predict(svm_chem_model, newdata = chem_test)
postResample(pred = svm_chem_pred, obs = chem_test$Yield)
## RMSE Rsquared MAE
## 0.6462616 0.5850084 0.5414635
plot(svm_chem_model)
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
varImp(svm_chem_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 95.22
## BiologicalMaterial03 87.00
## ManufacturingProcess13 83.26
## ManufacturingProcess17 82.65
## ManufacturingProcess09 82.50
## ManufacturingProcess36 81.25
## ManufacturingProcess31 80.14
## BiologicalMaterial02 77.84
## BiologicalMaterial12 75.18
## ManufacturingProcess06 73.94
## BiologicalMaterial11 61.74
## ManufacturingProcess33 59.39
## ManufacturingProcess11 58.23
## BiologicalMaterial04 57.91
## ManufacturingProcess29 54.86
## BiologicalMaterial09 50.07
## BiologicalMaterial08 48.78
## BiologicalMaterial01 46.96
## ManufacturingProcess30 44.87
The best performing model was the SVM model, and below are the most important predictors Of the top ten predictors seven are manufacturing process variables. The top ten predictors for our SVM model are identical to the top ten for the enet model which was our optimal linear model. Not only is the order identical, but so are the weightings.
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
# Calculate variable importance
imp <- varImp(svm_chem_model)
# Get top predictors
top_predictors <- rownames(imp$importance)[order(imp$importance$Overall, decreasing = TRUE)][1:5]
# Generate partial dependence data for all top predictors
partial_data <- lapply(top_predictors, function(pred) {
partial(svm_chem_model, pred.var = pred, train = chem_train, grid.resolution = 50)
})
# Create ggplot for each partial dependence with adjusted text size
plots <- lapply(seq_along(partial_data), function(i) {
ggplot(partial_data[[i]], aes_string(x = top_predictors[i], y = "yhat")) +
geom_line() +
labs(title = paste("Partial Dependence on", top_predictors[i]),
x = top_predictors[i],
y = "Partial Effect on Response") +
theme_minimal() +
theme(
plot.title = element_text(size = 10), # Title size
axis.title.x = element_text(size = 8), # X-axis label size
axis.title.y = element_text(size = 8), # Y-axis label size
axis.text = element_text(size = 7) # Axis text size
)
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange and display all plots in a grid
do.call(grid.arrange, plots)
Given that our best model exhibited the same predictor importance as the optimal linear model (which raises some concerns), we will focus on the top five predictors. The plots reveal that the first three predictors are positively correlated with yield, while the last two are negatively correlated. Most relationships appear to be linear, although a slight sinusoidal pattern is noticeable in the first three predictors, particularly in ManufacturingProcess09. The last two predictors demonstrate a decline at the end of the relationship plot, suggesting that their importance decreases sharply beyond a certain threshold.