Required Libraries

library(tidyverse)
library(kableExtra)
library(janitor)
library(mlbench)
library(caret) ## KNN
library(latex2exp)
library(earth) ## MARS
library(AppliedPredictiveModeling)
library(ggcorrplot)

7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.

trainingData$x <- data.frame(trainingData$x)

## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
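
As a quick sanity check (our addition, not part of the exercise), we can recompute the noiseless part of the Friedman equation from the simulated predictors and confirm that the leftover variation in y is just the N(0, 1) noise term:

## Recompute the deterministic Friedman signal; data.frame() named the
## simulated columns X1-X10, so we can refer to them directly.
signal <- with(trainingData$x,
               10 * sin(pi * X1 * X2) + 20 * (X3 - 0.5)^2 + 10 * X4 + 5 * X5)

## The correlation should be near 1 and the residual SD near sigma = 1.
cor(signal, trainingData$y)
sd(trainingData$y - signal)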

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

KNN

k-Nearest Neighbors


set.seed(42)

knnModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = "knn", 
                  preProc = c("center", "scale"), 
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.593384  0.4866962  2.925730
##    7  3.450129  0.5309533  2.812117
##    9  3.374966  0.5603358  2.738616
##   11  3.344853  0.5811160  2.699411
##   13  3.288017  0.6137613  2.651344
##   15  3.291558  0.6243500  2.656925
##   17  3.293868  0.6348015  2.660223
##   19  3.287441  0.6500167  2.651429
##   21  3.290406  0.6578647  2.661403
##   23  3.295990  0.6670376  2.674025
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
knn_results <-
  knnModel$results |>
  filter(k == 19)

kbl(knn_results,
    caption = "KNN Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
KNN Training Model
k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
19 3.287441 0.6500167 2.651429 0.2962589 0.0646111 0.2795915
top_pred <- 
  varImp(knnModel)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Top 5 Predictors
Overall
X4 100.00000
X1 95.50471
X2 89.61865
X5 45.21698
X3 29.93295
Note:
KNN Model
knnPred <- predict(knnModel, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

knn_metrics <- postResample(pred = knnPred, obs = testData$y)
knn_metrics <- as_tibble(as.list(knn_metrics))

kbl(knn_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Prediction Metrics
RMSE Rsquared MAE
3.228683 0.6871735 2.593973
Note:
KNN Model

MARS

Multivariate Adaptive Regression Splines


set.seed(42)

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)

marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.476579  0.2277382  3.7356067
##   1        3      3.553760  0.4883923  2.8477328
##   1        4      2.693919  0.7206389  2.1654581
##   1        5      2.466322  0.7609079  1.9586776
##   1        6      2.360291  0.7853958  1.8438467
##   1        7      1.855162  0.8634450  1.4472161
##   1        8      1.709378  0.8862307  1.3044347
##   1        9      1.622279  0.8998138  1.2631306
##   1       10      1.616540  0.9008842  1.2649581
##   1       11      1.646626  0.8961273  1.2803571
##   1       12      1.674181  0.8923909  1.2855550
##   1       13      1.664137  0.8948943  1.2645698
##   1       14      1.651457  0.8956602  1.2630422
##   1       15      1.646502  0.8960189  1.2580649
##   1       16      1.646502  0.8960189  1.2580649
##   2        2      4.476579  0.2277382  3.7356067
##   2        3      3.553760  0.4883923  2.8477328
##   2        4      2.693919  0.7206389  2.1654581
##   2        5      2.391966  0.7756076  1.9090990
##   2        6      2.333309  0.7881029  1.8043481
##   2        7      1.885029  0.8607341  1.4635248
##   2        8      1.743065  0.8797282  1.3691428
##   2        9      1.640348  0.8899848  1.3033033
##   2       10      1.437179  0.9154582  1.1344466
##   2       11      1.387157  0.9206236  1.0618170
##   2       12      1.260926  0.9363945  0.9719606
##   2       13      1.249483  0.9372768  0.9627787
##   2       14      1.237039  0.9404698  0.9591283
##   2       15      1.236968  0.9419557  0.9678294
##   2       16      1.245441  0.9402940  0.9647491
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
mars_results <-
  marsTuned$results |>
  filter(degree == 2, nprune == 15)

kbl(mars_results,
    caption = "MARS Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
MARS Training Model
degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
2 15 1.236968 0.9419557 0.9678294 0.1780041 0.0228732 0.17343
mars_top_pred <- 
  varImp(marsTuned)$importance |>
  arrange(desc(Overall))

kbl(head(mars_top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Top 5 Predictors
Overall
X1 100.00000
X4 75.23592
X2 48.72974
X5 15.51884
X3 0.00000
Note:
MARS Model
marsPred <- predict(marsTuned, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

mars_metrics <- postResample(pred = marsPred, obs = testData$y)
mars_metrics <- as_tibble(as.list(mars_metrics))

kbl(mars_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Prediction Metrics
RMSE Rsquared MAE
1.158995 0.9460418 0.925023
Note:
MARS Model


SVM

Support Vector Machines

set.seed(42)

svmRTuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.494595  0.7977557  2.005513
##      0.50  2.216555  0.8156746  1.766512
##      1.00  2.020950  0.8402258  1.608446
##      2.00  1.945841  0.8514850  1.536883
##      4.00  1.894326  0.8609361  1.504003
##      8.00  1.886268  0.8620782  1.494837
##     16.00  1.879360  0.8627289  1.491156
##     32.00  1.879360  0.8627289  1.491156
##     64.00  1.879360  0.8627289  1.491156
##    128.00  1.879360  0.8627289  1.491156
##    256.00  1.879360  0.8627289  1.491156
##    512.00  1.879360  0.8627289  1.491156
##   1024.00  1.879360  0.8627289  1.491156
##   2048.00  1.879360  0.8627289  1.491156
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06533796
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06533796 and C = 16.
svm_results <-
  svmRTuned$results |>
  filter(round(sigma, 8) == 0.06533796, C == 16)

kbl(svm_results,
    caption = "SVM Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
SVM Training Model
sigma C RMSE Rsquared MAE RMSESD RsquaredSD MAESD
0.065338 16 1.87936 0.8627289 1.491156 0.3480277 0.0531924 0.2883053
top_pred <- 
  varImp(svmRTuned)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Top 5 Predictors
Overall
X4 100.00000
X1 95.50471
X2 89.61865
X5 45.21698
X3 29.93295
Note:
SVM Model
svmPred <- predict(svmRTuned, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

svm_metrics <- postResample(pred = svmPred, obs = testData$y)
svm_metrics <- as_tibble(as.list(svm_metrics))

kbl(svm_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Prediction Metrics
RMSE Rsquared MAE
2.079382 0.824766 1.57968
Note:
SVM Model

Results

Our MARS model gives the best performance on both the resampled training results and the test set predictions: its RMSE and \(R^2\) are far better than those of the other two models (see the comparison table below). MARS also selected the informative predictors. In the importance chart below it ranks X1, X4, X2, and X5 highest, while X3 carries no importance in the final model, so the only informative predictor it effectively drops is the relatively weak quadratic X3 term. As an aside, the KNN and SVM importance tables above are identical because neither model has a built-in importance measure; caret's varImp() falls back to the same model-free filter for both, sketched below.
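
A minimal sketch of that fallback, reusing the training objects above (output omitted):

## filterVarImp() is the model-free measure varImp() falls back to when a
## model (like knn or svmRadial) has no built-in importance method; with
## nonpara = TRUE it scores each predictor by a loess fit against y.
filterVarImp(x = trainingData$x, y = trainingData$y, nonpara = TRUE) |>
  arrange(desc(Overall)) |>
  head(5)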

append_tables <- function(train_table, pred_table, model_name){
  
  train_table <-
    train_table |>
    select(RMSE, Rsquared) |>
    rename(Train_RMSE = RMSE,
           Train_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)

  pred_table <-
    pred_table |>
    select(RMSE, Rsquared) |>
    rename(Pred_RMSE = RMSE,
           Pred_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)
  
  
    return(train_table |>
              left_join(pred_table, join_by(Model)))
}

knn_table <- append_tables(knn_results, knn_metrics, "KNN")
mars_table <- append_tables(mars_results, mars_metrics, "MARS")
svm_table <- append_tables(svm_results, svm_metrics, "SVM")
model_table <- 
  bind_rows(knn_table, mars_table, svm_table)

kbl(model_table,
    caption = "Model Comparisons",
    digits=4) |>
  kable_classic(full_width = F, html_font = "Cambria")
Model Comparisons
Model  Train_RMSE  Train_Rsq  Pred_RMSE  Pred_Rsq
KNN        3.2874     0.6500     3.2287    0.6872
MARS       1.2370     0.9420     1.1590    0.9460
SVM        1.8794     0.8627     2.0794    0.8248
mars_top_pred |>
  ggplot(aes(y=reorder(row.names(mars_top_pred), Overall), x=Overall, label=round(Overall,2))) +
  geom_bar(stat='identity', fill="#F28E2B") +
  geom_text(position = position_stack(vjust = 0.5),
            size=3,
            fontface = "bold",
            colour="black") +
  theme_minimal() +
  labs(y="", title = "MARS Predictors by Model Importance") +
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"))
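
To confirm the selection from the earth fit itself rather than caret's scaled importances, we could also query the final model directly; a brief sketch (output omitted):

## evimp() reports only the predictors retained by the pruned MARS model,
## so the non-informative X6-X10 should not appear in its output.
earth::evimp(marsTuned$finalModel)

## summary() lists the hinge functions kept in the final model.
summary(marsTuned$finalModel)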

7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

  1. Which nonlinear regression model gives the optimal resampling and test set performance?

Data Preparation

data(ChemicalManufacturingProcess)

chem <- ChemicalManufacturingProcess
kbl(chem[1:5, 1:10],
    caption = "Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(chem), " x ", ncol(chem))))
Chemical Manufacturing Process
Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
38.00 6.25 49.58 56.97 12.74 19.51 43.73 100 16.66 11.44
42.44 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
42.03 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
41.42 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
42.49 7.47 63.33 72.25 14.02 17.91 54.66 100 18.22 12.80
Dimensions:
176 x 58

We can see which variables have missing values below. Afterwards, we can use a KNN imputation method to replace these NA values and confirm that we have zero missing values.

missing <- 
  chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  rename(variable = name, 
         total = value) |>
  mutate(pct = round(total/nrow(chem), 4)) |>
  arrange(desc(pct), variable) |>
  mutate(variable = fct_reorder(variable, pct))

kbl(missing,
    caption = "Missing Values") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  kable_styling(latex_options = "HOLD_position")
Missing Values
variable total pct
ManufacturingProcess03 15 0.0852
ManufacturingProcess11 10 0.0568
ManufacturingProcess10 9 0.0511
ManufacturingProcess25 5 0.0284
ManufacturingProcess26 5 0.0284
ManufacturingProcess27 5 0.0284
ManufacturingProcess28 5 0.0284
ManufacturingProcess29 5 0.0284
ManufacturingProcess30 5 0.0284
ManufacturingProcess31 5 0.0284
ManufacturingProcess33 5 0.0284
ManufacturingProcess34 5 0.0284
ManufacturingProcess35 5 0.0284
ManufacturingProcess36 5 0.0284
ManufacturingProcess02 3 0.0170
ManufacturingProcess06 2 0.0114
ManufacturingProcess01 1 0.0057
ManufacturingProcess04 1 0.0057
ManufacturingProcess05 1 0.0057
ManufacturingProcess07 1 0.0057
ManufacturingProcess08 1 0.0057
ManufacturingProcess12 1 0.0057
ManufacturingProcess14 1 0.0057
ManufacturingProcess22 1 0.0057
ManufacturingProcess23 1 0.0057
ManufacturingProcess24 1 0.0057
ManufacturingProcess40 1 0.0057
ManufacturingProcess41 1 0.0057
missing |>
  ggplot(aes(x=variable, y=pct)) +
  geom_bar(stat='identity', fill="#4E79A7") +
  coord_flip() +
  theme_bw() +
  labs(x = "") +
  geom_text(aes(label = paste0(round(pct*100, 2), "%")), colour = "white", hjust = 1, size=3, fontface = "bold") +
  scale_y_continuous(labels = scales::percent)

set.seed(42)

knn_impute <- preProcess(chem, "knnImpute", k=5)
impute_chem <- predict(knn_impute, chem)

impute_missing <-
  impute_chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  nrow()

print(paste0("Missing Values: ", impute_missing > 0))
## [1] "Missing Values: FALSE"

We will filter out near-zero-variance predictors using the nearZeroVar function. We will then be able to build our nonlinear models.

nzv_predictors <- nearZeroVar(impute_chem |> select(-Yield),
                              names = TRUE, saveMetrics = FALSE)

filtered_chem <-
  as_tibble(impute_chem) |>
  select(-all_of(nzv_predictors)) |>
  clean_names()

kbl(filtered_chem[1:5, 1:10],
    caption = "Filtered Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(filtered_chem), " x ", ncol(filtered_chem))))
Filtered Chemical Manufacturing Process
yield biological_material01 biological_material02 biological_material03 biological_material04 biological_material05 biological_material06 biological_material08 biological_material09 biological_material10
-1.1792673 -0.2261036 -1.514098 -2.683036 0.2201765 0.4941942 -1.382888 -1.233131 -3.3962895 1.1005296
1.2263678 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
1.0042258 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
0.6737219 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
1.2534583 1.4827653 1.893939 1.135948 0.9414412 -0.3734185 1.534835 1.071310 -0.1205678 0.4162193
Dimensions:
176 x 57
set.seed(42)

train_index <- createDataPartition(filtered_chem$yield, p = 0.80, list = FALSE)

train_df <- filtered_chem[train_index, ]
train_x <- train_df |>
  select(-yield) |>
  as.data.frame()
train_y <- as.numeric(train_df$yield)

test_df <- filtered_chem[-train_index, ]
test_x <- test_df |>
  select(-yield) |>
  as.data.frame()
test_y <- as.numeric(test_df$yield)
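
One caveat before fitting: the KNN model below uses caret's default bootstrap resampling while MARS and SVM use 10-fold cross-validation, so their resampled metrics are not computed on identical resamples. A sketch of one way to make them directly comparable (not used in the runs below) is to pass a shared set of folds to every train() call:

## createFolds() with returnTrain = TRUE yields training-row indices that
## trainControl() can reuse, so every model sees the exact same CV folds
## and resamples() can compare them fairly.
set.seed(42)
cv_folds <- createFolds(train_y, k = 10, returnTrain = TRUE)
shared_ctrl <- trainControl(method = "cv", index = cv_folds)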

KNN

k-Nearest Neighbors


set.seed(42)

knnModel <- train(x = train_x, y = train_y, 
                  method = "knn", 
                  preProc = c("center", "scale"), 
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8215308  0.3207346  0.6521219
##    7  0.8055626  0.3350894  0.6473780
##    9  0.7993434  0.3468390  0.6491551
##   11  0.7948818  0.3594389  0.6471427
##   13  0.8003748  0.3539355  0.6533868
##   15  0.7990797  0.3600081  0.6546080
##   17  0.8040805  0.3571988  0.6608149
##   19  0.8065744  0.3551302  0.6620227
##   21  0.8090859  0.3567177  0.6639259
##   23  0.8146464  0.3506468  0.6710669
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knn_results <-
  knnModel$results |>
  filter(k == 11)

kbl(knn_results,
    caption = "KNN Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
KNN Training Model
k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
11 0.7948818 0.3594389 0.6471427 0.0638573 0.0721586 0.0444572
top_pred <- 
  varImp(knnModel)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Top 10 Predictors
Overall
manufacturing_process32 100.00000
manufacturing_process13 93.81970
manufacturing_process09 89.92917
manufacturing_process17 88.19815
biological_material06 82.60809
biological_material03 79.43762
manufacturing_process36 73.84749
biological_material12 72.36048
manufacturing_process06 69.00493
manufacturing_process11 62.33657
Note:
KNN Model
knnPred <- predict(knnModel, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

knn_metrics <- postResample(pred = knnPred, obs = test_y)
knn_metrics <- as_tibble(as.list(knn_metrics))

kbl(knn_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Prediction Metrics
RMSE Rsquared MAE
0.6800894 0.6760131 0.5026363
Note:
KNN Model

MARS

Multivariate Adaptive Regression Splines


set.seed(42)

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)

marsTuned <- train(train_x, train_y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.7695069  0.4220518  0.5984336
##   1        3      0.6768030  0.5256092  0.5403361
##   1        4      0.6245993  0.6027065  0.4990512
##   1        5      0.6615419  0.5563684  0.5349329
##   1        6      0.6605250  0.5613775  0.5284659
##   1        7      0.6548434  0.5832657  0.5275124
##   1        8      0.6451181  0.6045384  0.5197220
##   1        9      0.6374083  0.6156973  0.5091047
##   1       10      0.6457879  0.6041529  0.5182521
##   1       11      0.6599379  0.5886736  0.5353378
##   1       12      0.6493610  0.6024656  0.5211062
##   1       13      0.6586947  0.5938681  0.5231964
##   1       14      0.6607319  0.5945327  0.5248909
##   1       15      0.6663452  0.5909608  0.5336365
##   1       16      0.6667444  0.5854095  0.5326528
##   2        2      0.7695069  0.4220518  0.5984336
##   2        3      0.6536650  0.5696119  0.5257628
##   2        4      0.6609252  0.5861703  0.5381869
##   2        5      0.6488079  0.6109076  0.5240895
##   2        6      0.6509261  0.6047239  0.5101911
##   2        7      0.6536639  0.6051220  0.5157193
##   2        8      0.6480840  0.6141045  0.5226038
##   2        9      0.6523284  0.6172441  0.5168820
##   2       10      0.6445892  0.6187437  0.5023031
##   2       11      0.6532795  0.6173548  0.5105779
##   2       12      0.6779349  0.6076352  0.5293236
##   2       13      0.6575654  0.6177892  0.5163227
##   2       14      0.6617909  0.6061550  0.5242065
##   2       15      0.6571651  0.6128942  0.5187193
##   2       16      0.6685920  0.6013302  0.5310939
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
mars_results <-
  marsTuned$results |>
  filter(degree == 1, nprune == 4)

kbl(mars_results,
    caption = "MARS Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
MARS Training Model
degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 4 0.6245993 0.6027065 0.4990512 0.1061586 0.1334267 0.0822903
mars_top_pred <- 
  varImp(marsTuned)$importance |>
  arrange(desc(Overall))

kbl(head(mars_top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Top 10 Predictors
Overall
manufacturing_process32 100
manufacturing_process09 0
Note:
MARS Model
marsPred <- predict(marsTuned, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

mars_metrics <- postResample(pred = marsPred, obs = test_y)
mars_metrics <- as_tibble(as.list(mars_metrics))

kbl(mars_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Prediction Metrics
RMSE Rsquared MAE
0.6509033 0.6890094 0.5099939
Note:
MARS Model


SVM

Support Vector Machines

set.seed(42)

svmRTuned <- train(train_x, train_y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7595696  0.4827820  0.6261061
##      0.50  0.6978859  0.5322223  0.5743776
##      1.00  0.6339877  0.6039829  0.5167883
##      2.00  0.6037287  0.6372236  0.4860754
##      4.00  0.6032620  0.6356415  0.4873033
##      8.00  0.5846226  0.6518356  0.4732665
##     16.00  0.5846632  0.6519478  0.4732776
##     32.00  0.5846632  0.6519478  0.4732776
##     64.00  0.5846632  0.6519478  0.4732776
##    128.00  0.5846632  0.6519478  0.4732776
##    256.00  0.5846632  0.6519478  0.4732776
##    512.00  0.5846632  0.6519478  0.4732776
##   1024.00  0.5846632  0.6519478  0.4732776
##   2048.00  0.5846632  0.6519478  0.4732776
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01496228
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01496228 and C = 8.
svm_results <-
  svmRTuned$results |>
  filter(round(sigma, 8) == 0.01496228, C == 8)

kbl(svm_results,
    caption = "SVM Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
SVM Training Model
sigma C RMSE Rsquared MAE RMSESD RsquaredSD MAESD
0.0149623 8 0.5846226 0.6518356 0.4732665 0.0958087 0.171438 0.0898937
svm_top_pred <- 
  varImp(svmRTuned)$importance |>
  arrange(desc(Overall))

kbl(head(svm_top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Top 10 Predictors
Overall
manufacturing_process32 100.00000
manufacturing_process13 93.81970
manufacturing_process09 89.92917
manufacturing_process17 88.19815
biological_material06 82.60809
biological_material03 79.43762
manufacturing_process36 73.84749
biological_material12 72.36048
manufacturing_process06 69.00493
manufacturing_process11 62.33657
Note:
SVM Model
svmPred <- predict(svmRTuned, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

svm_metrics <- postResample(pred = svmPred, obs = test_y)
svm_metrics <- as_tibble(as.list(svm_metrics))

kbl(svm_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Prediction Metrics
RMSE Rsquared MAE
0.6808684 0.6256048 0.4918045
Note:
SVM Model

Results

Our SVM model gives the best resampling performance, with a training \(R^2 = 0.6518\); MARS is just below it at \(R^2 = 0.6027\). On the test set, however, MARS comes out ahead with \(R^2 = 0.6890\) and the lowest RMSE, while the SVM drops to \(R^2 = 0.6256\).

append_tables <- function(train_table, pred_table, model_name){
  
  train_table <-
    train_table |>
    select(RMSE, Rsquared) |>
    rename(Train_RMSE = RMSE,
           Train_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)

  pred_table <-
    pred_table |>
    select(RMSE, Rsquared) |>
    rename(Pred_RMSE = RMSE,
           Pred_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)
  
  
    return(train_table |>
              left_join(pred_table, join_by(Model)))
}

knn_table <- append_tables(knn_results, knn_metrics, "KNN")
mars_table <- append_tables(mars_results, mars_metrics, "MARS")
svm_table <- append_tables(svm_results, svm_metrics, "SVM")
model_table <- 
  bind_rows(knn_table, mars_table, svm_table)

kbl(model_table,
    caption = "Model Comparisons",
    digits=4) |>
  kable_classic(full_width = F, html_font = "Cambria")
Model Comparisons
Model  Train_RMSE  Train_Rsq  Pred_RMSE  Pred_Rsq
KNN        0.7949     0.3594     0.6801    0.6760
MARS       0.6246     0.6027     0.6509    0.6890
SVM        0.5846     0.6518     0.6809    0.6256
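
As a supplementary view (our addition, reusing the knnPred, marsPred, and svmPred objects above), we can plot the test set predictions against the observed yields; points hugging the dashed identity line indicate better fits:

## Stack the three prediction vectors into one long data frame.
pred_df <- bind_rows(
  tibble(Model = "KNN",  obs = test_y, pred = as.numeric(knnPred)),
  tibble(Model = "MARS", obs = test_y, pred = as.numeric(marsPred)),
  tibble(Model = "SVM",  obs = test_y, pred = as.numeric(svmPred)))

## Observed vs predicted yield, one panel per model.
pred_df |>
  ggplot(aes(x = obs, y = pred)) +
  geom_point(colour = "#4E79A7", alpha = 0.6) +
  geom_abline(linetype = "dashed") +
  facet_wrap(~ Model) +
  theme_bw() +
  labs(x = "Observed Yield (scaled)", y = "Predicted Yield",
       title = "Test Set Predictions by Model")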
  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

The process variables still dominate the top ten list of important predictors. Comparing to the optimal linear model from Exercise 6.3, the top four process predictors here also appeared in that model's top five. However, more biological materials reach the top ten with our SVM than with the linear model.

svm_top_pred |>
  arrange(desc(Overall)) |>
  head(10) %>%
  ggplot(aes(y=reorder(row.names(.), Overall), x=Overall, label=round(Overall,2))) +
  geom_bar(stat='identity', fill="#F28E2B") +
  geom_text(position = position_stack(vjust = 0.5),
            size=3,
            fontface = "bold",
            colour="black") +
  theme_minimal() +
  labs(y="", title = "SVM Predictors by Model Importance") +
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"))

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

We can see the strength of the relationships between yield and the top predictors. manufacturing_process32, the most important predictor in our model, shows a moderately strong positive correlation with yield (0.61). In general, the top predictors have moderately strong relationships with the response yield, in both directions. If we can manage these specific predictors in the chemical manufacturing process, we may be able to increase total yield.

filtered_pred <-
  rownames_to_column(head(svm_top_pred, 10))

pred_corr <-
  filtered_chem |>
  select(yield, all_of(filtered_pred$rowname))

q <- cor(pred_corr)

ggcorrplot(q, type = "upper", outline.color = "white",
           ggtheme = theme_classic,
           colors = c("#4E79A7", "white", "#F28E2B"),
           lab = TRUE, show.legend = F, tl.cex = 5, lab_size = 3)
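
To complement the correlation matrix, a sketch of the raw relationships (reusing pred_corr from above): yield plotted against each of the top ten predictors with a linear trend overlaid:

## Pivot the predictors into long form so each one gets its own panel.
pred_corr |>
  pivot_longer(-yield, names_to = "predictor") |>
  ggplot(aes(x = value, y = yield)) +
  geom_point(colour = "#4E79A7", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "#F28E2B") +
  facet_wrap(~ predictor, scales = "free_x") +
  theme_bw() +
  labs(x = "", title = "Yield vs Top SVM Predictors")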