Required Libraries

library(tidyverse)
library(kableExtra)
library(janitor)
library(mlbench)
library(caret) ## KNN
library(latex2exp)
library(earth) ## MARS
library(AppliedPredictiveModeling)
library(ggcorrplot)

7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.

trainingData$x <- data.frame(trainingData$x)

## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
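
As a quick sanity check (our addition, not part of the exercise), we can recompute the noiseless part of the Friedman equation from the simulated predictors and confirm that the leftover variation in y is just the N(0, 1) noise term:

## Recompute the deterministic Friedman signal; data.frame() named the
## simulated columns X1-X10, so we can refer to them directly.
signal <- with(trainingData$x,
               10 * sin(pi * X1 * X2) + 20 * (X3 - 0.5)^2 + 10 * X4 + 5 * X5)

## The correlation should be near 1 and the residual SD near sigma = 1.
cor(signal, trainingData$y)
sd(trainingData$y - signal)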

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

KNN

k-Nearest Neighbors


set.seed(42)

knnModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = "knn", 
                  preProc = c("center", "scale"), 
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.593384  0.4866962  2.925730
##    7  3.450129  0.5309533  2.812117
##    9  3.374966  0.5603358  2.738616
##   11  3.344853  0.5811160  2.699411
##   13  3.288017  0.6137613  2.651344
##   15  3.291558  0.6243500  2.656925
##   17  3.293868  0.6348015  2.660223
##   19  3.287441  0.6500167  2.651429
##   21  3.290406  0.6578647  2.661403
##   23  3.295990  0.6670376  2.674025
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
knn_results <-
  knnModel$results |>
  filter(k == 19)

kbl(knn_results,
    caption = "KNN Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
KNN Training Model
k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
19 3.287441 0.6500167 2.651429 0.2962589 0.0646111 0.2795915
top_pred <- 
  varImp(knnModel)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Top 5 Predictors
Overall
X4 100.00000
X1 95.50471
X2 89.61865
X5 45.21698
X3 29.93295
Note:
KNN Model
knnPred <- predict(knnModel, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

knn_metrics <- postResample(pred = knnPred, obs = testData$y)
knn_metrics <- as_tibble(as.list(knn_metrics))

kbl(knn_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Prediction Metrics
RMSE Rsquared MAE
3.228683 0.6871735 2.593973
Note:
KNN Model

MARS

Multivariate Adaptive Regression Splines


set.seed(42)

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)

marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.476579  0.2277382  3.7356067
##   1        3      3.553760  0.4883923  2.8477328
##   1        4      2.693919  0.7206389  2.1654581
##   1        5      2.466322  0.7609079  1.9586776
##   1        6      2.360291  0.7853958  1.8438467
##   1        7      1.855162  0.8634450  1.4472161
##   1        8      1.709378  0.8862307  1.3044347
##   1        9      1.622279  0.8998138  1.2631306
##   1       10      1.616540  0.9008842  1.2649581
##   1       11      1.646626  0.8961273  1.2803571
##   1       12      1.674181  0.8923909  1.2855550
##   1       13      1.664137  0.8948943  1.2645698
##   1       14      1.651457  0.8956602  1.2630422
##   1       15      1.646502  0.8960189  1.2580649
##   1       16      1.646502  0.8960189  1.2580649
##   2        2      4.476579  0.2277382  3.7356067
##   2        3      3.553760  0.4883923  2.8477328
##   2        4      2.693919  0.7206389  2.1654581
##   2        5      2.391966  0.7756076  1.9090990
##   2        6      2.333309  0.7881029  1.8043481
##   2        7      1.885029  0.8607341  1.4635248
##   2        8      1.743065  0.8797282  1.3691428
##   2        9      1.640348  0.8899848  1.3033033
##   2       10      1.437179  0.9154582  1.1344466
##   2       11      1.387157  0.9206236  1.0618170
##   2       12      1.260926  0.9363945  0.9719606
##   2       13      1.249483  0.9372768  0.9627787
##   2       14      1.237039  0.9404698  0.9591283
##   2       15      1.236968  0.9419557  0.9678294
##   2       16      1.245441  0.9402940  0.9647491
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
mars_results <-
  marsTuned$results |>
  filter(degree == 2, nprune == 15)

kbl(mars_results,
    caption = "MARS Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
MARS Training Model
degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
2 15 1.236968 0.9419557 0.9678294 0.1780041 0.0228732 0.17343
mars_top_pred <- 
  varImp(marsTuned)$importance |>
  arrange(desc(Overall))

kbl(head(mars_top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Top 5 Predictors
Overall
X1 100.00000
X4 75.23592
X2 48.72974
X5 15.51884
X3 0.00000
Note:
MARS Model
marsPred <- predict(marsTuned, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

mars_metrics <- postResample(pred = marsPred, obs = testData$y)
mars_metrics <- as_tibble(as.list(mars_metrics))

kbl(mars_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Prediction Metrics
RMSE Rsquared MAE
1.158995 0.9460418 0.925023
Note:
MARS Model


SVM

Support Vector Machines

set.seed(42)

svmRTuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.494595  0.7977557  2.005513
##      0.50  2.216555  0.8156746  1.766512
##      1.00  2.020950  0.8402258  1.608446
##      2.00  1.945841  0.8514850  1.536883
##      4.00  1.894326  0.8609361  1.504003
##      8.00  1.886268  0.8620782  1.494837
##     16.00  1.879360  0.8627289  1.491156
##     32.00  1.879360  0.8627289  1.491156
##     64.00  1.879360  0.8627289  1.491156
##    128.00  1.879360  0.8627289  1.491156
##    256.00  1.879360  0.8627289  1.491156
##    512.00  1.879360  0.8627289  1.491156
##   1024.00  1.879360  0.8627289  1.491156
##   2048.00  1.879360  0.8627289  1.491156
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06533796
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06533796 and C = 16.
svm_results <-
  svmRTuned$results |>
  filter(round(sigma, 8) == 0.06533796, C == 16)

kbl(svm_results,
    caption = "SVM Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
SVM Training Model
sigma C RMSE Rsquared MAE RMSESD RsquaredSD MAESD
0.065338 16 1.87936 0.8627289 1.491156 0.3480277 0.0531924 0.2883053
top_pred <- 
  varImp(svmRTuned)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 5),
    caption = "Top 5 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Top 5 Predictors
Overall
X4 100.00000
X1 95.50471
X2 89.61865
X5 45.21698
X3 29.93295
Note:
SVM Model
svmPred <- predict(svmRTuned, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values

svm_metrics <- postResample(pred = svmPred, obs = testData$y)
svm_metrics <- as_tibble(as.list(svm_metrics))

kbl(svm_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Prediction Metrics
RMSE Rsquared MAE
2.079382 0.824766 1.57968
Note:
SVM Model

Results

Our MARS model gives the best performance on both the resampled training results and the test set predictions: its RMSE and \(R^2\) are far better than those of the other two models (see the comparison table below). MARS also selected the informative predictors. In the importance chart below it ranks X1, X4, X2, and X5 highest, while X3 carries no importance in the final model, so the only informative predictor it effectively drops is the relatively weak quadratic X3 term. As an aside, the KNN and SVM importance tables above are identical because neither model has a built-in importance measure; caret's varImp() falls back to the same model-free filter for both, sketched below.
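
A minimal sketch of that fallback, reusing the training objects above (output omitted):

## filterVarImp() is the model-free measure varImp() falls back to when a
## model (like knn or svmRadial) has no built-in importance method; with
## nonpara = TRUE it scores each predictor by a loess fit against y.
filterVarImp(x = trainingData$x, y = trainingData$y, nonpara = TRUE) |>
  arrange(desc(Overall)) |>
  head(5)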

append_tables <- function(train_table, pred_table, model_name){
  
  train_table <-
    train_table |>
    select(RMSE, Rsquared) |>
    rename(Train_RMSE = RMSE,
           Train_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)

  pred_table <-
    pred_table |>
    select(RMSE, Rsquared) |>
    rename(Pred_RMSE = RMSE,
           Pred_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)
  
  
    return(train_table |>
              left_join(pred_table, join_by(Model)))
}

knn_table <- append_tables(knn_results, knn_metrics, "KNN")
mars_table <- append_tables(mars_results, mars_metrics, "MARS")
svm_table <- append_tables(svm_results, svm_metrics, "SVM")
model_table <- 
  bind_rows(knn_table, mars_table, svm_table)

kbl(model_table,
    caption = "Model Comparisons",
    digits=4) |>
  kable_classic(full_width = F, html_font = "Cambria")
Model Comparisons
Model  Train_RMSE  Train_Rsq  Pred_RMSE  Pred_Rsq
KNN        3.2874     0.6500     3.2287    0.6872
MARS       1.2370     0.9420     1.1590    0.9460
SVM        1.8794     0.8627     2.0794    0.8248
mars_top_pred |>
  ggplot(aes(y=reorder(row.names(mars_top_pred), Overall), x=Overall, label=round(Overall,2))) +
  geom_bar(stat='identity', fill="#F28E2B") +
  geom_text(position = position_stack(vjust = 0.5),
            size=3,
            fontface = "bold",
            colour="black") +
  theme_minimal() +
  labs(y="", title = "MARS Predictors by Model Importance") +
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"))
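
To confirm the selection from the earth fit itself rather than caret's scaled importances, we could also query the final model directly; a brief sketch (output omitted):

## evimp() reports only the predictors retained by the pruned MARS model,
## so the non-informative X6-X10 should not appear in its output.
earth::evimp(marsTuned$finalModel)

## summary() lists the hinge functions kept in the final model.
summary(marsTuned$finalModel)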

7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

  1. Which nonlinear regression model gives the optimal resampling and test set performance?

Data Preparation

data(ChemicalManufacturingProcess)

chem <- ChemicalManufacturingProcess
kbl(chem[1:5, 1:10],
    caption = "Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(chem), " x ", ncol(chem))))
Chemical Manufacturing Process
Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
38.00 6.25 49.58 56.97 12.74 19.51 43.73 100 16.66 11.44
42.44 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
42.03 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
41.42 8.01 60.97 67.48 14.65 19.36 53.14 100 19.04 12.55
42.49 7.47 63.33 72.25 14.02 17.91 54.66 100 18.22 12.80
Dimensions:
176 x 58

We can see which variables have missing values below. Afterwards, we can use a KNN imputation method to replace these NA values and confirm that we have zero missing values.

missing <- 
  chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  rename(variable = name, 
         total = value) |>
  mutate(pct = round(total/nrow(chem), 4)) |>
  arrange(desc(pct), variable) |>
  mutate(variable = fct_reorder(variable, pct))

kbl(missing,
    caption = "Missing Values") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  kable_styling(latex_options = "HOLD_position")
Missing Values
variable total pct
ManufacturingProcess03 15 0.0852
ManufacturingProcess11 10 0.0568
ManufacturingProcess10 9 0.0511
ManufacturingProcess25 5 0.0284
ManufacturingProcess26 5 0.0284
ManufacturingProcess27 5 0.0284
ManufacturingProcess28 5 0.0284
ManufacturingProcess29 5 0.0284
ManufacturingProcess30 5 0.0284
ManufacturingProcess31 5 0.0284
ManufacturingProcess33 5 0.0284
ManufacturingProcess34 5 0.0284
ManufacturingProcess35 5 0.0284
ManufacturingProcess36 5 0.0284
ManufacturingProcess02 3 0.0170
ManufacturingProcess06 2 0.0114
ManufacturingProcess01 1 0.0057
ManufacturingProcess04 1 0.0057
ManufacturingProcess05 1 0.0057
ManufacturingProcess07 1 0.0057
ManufacturingProcess08 1 0.0057
ManufacturingProcess12 1 0.0057
ManufacturingProcess14 1 0.0057
ManufacturingProcess22 1 0.0057
ManufacturingProcess23 1 0.0057
ManufacturingProcess24 1 0.0057
ManufacturingProcess40 1 0.0057
ManufacturingProcess41 1 0.0057
missing |>
  ggplot(aes(x=variable, y=pct)) +
  geom_bar(stat='identity', fill="#4E79A7") +
  coord_flip() +
  theme_bw() +
  labs(x = "") +
  geom_text(aes(label = paste0(round(pct*100, 2), "%")), colour = "white", hjust = 1, size=3, fontface = "bold") +
  scale_y_continuous(labels = scales::percent)

set.seed(42)

knn_impute <- preProcess(chem, "knnImpute", k=5)
impute_chem <- predict(knn_impute, chem)

impute_missing <-
  impute_chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  nrow()

print(paste0("Missing Values: ", impute_missing > 0))
## [1] "Missing Values: FALSE"

We will filter out near-zero-variance predictors using the nearZeroVar function. We will then be able to build our nonlinear models.

nzv_predictors <- nearZeroVar(impute_chem |> select(-Yield),
                              names = TRUE, saveMetrics = FALSE)

filtered_chem <-
  as_tibble(impute_chem) |>
  select(-all_of(nzv_predictors)) |>
  clean_names()

kbl(filtered_chem[1:5, 1:10],
    caption = "Filtered Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(filtered_chem), " x ", ncol(filtered_chem))))
Filtered Chemical Manufacturing Process
yield biological_material01 biological_material02 biological_material03 biological_material04 biological_material05 biological_material06 biological_material08 biological_material09 biological_material10
-1.1792673 -0.2261036 -1.514098 -2.683036 0.2201765 0.4941942 -1.382888 -1.233131 -3.3962895 1.1005296
1.2263678 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
1.0042258 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
0.6737219 2.2391498 1.308996 -0.056235 1.2964386 0.4128555 1.129077 2.282619 -0.7227225 1.1005296
1.2534583 1.4827653 1.893939 1.135948 0.9414412 -0.3734185 1.534835 1.071310 -0.1205678 0.4162193
Dimensions:
176 x 57
set.seed(42)

train_index <- createDataPartition(filtered_chem$yield, p = 0.80, list = FALSE)

train_df <- filtered_chem[train_index, ]
train_x <- train_df |>
  select(-yield) |>
  as.data.frame()
train_y <- as.numeric(train_df$yield)

test_df <- filtered_chem[-train_index, ]
test_x <- test_df |>
  select(-yield) |>
  as.data.frame()
test_y <- as.numeric(test_df$yield)
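
One caveat before fitting: the KNN model below uses caret's default bootstrap resampling while MARS and SVM use 10-fold cross-validation, so their resampled metrics are not computed on identical resamples. A sketch of one way to make them directly comparable (not used in the runs below) is to pass a shared set of folds to every train() call:

## createFolds() with returnTrain = TRUE yields training-row indices that
## trainControl() can reuse, so every model sees the exact same CV folds
## and resamples() can compare them fairly.
set.seed(42)
cv_folds <- createFolds(train_y, k = 10, returnTrain = TRUE)
shared_ctrl <- trainControl(method = "cv", index = cv_folds)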

KNN

k-Nearest Neighbors


set.seed(42)

knnModel <- train(x = train_x, y = train_y, 
                  method = "knn", 
                  preProc = c("center", "scale"), 
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8215308  0.3207346  0.6521219
##    7  0.8055626  0.3350894  0.6473780
##    9  0.7993434  0.3468390  0.6491551
##   11  0.7948818  0.3594389  0.6471427
##   13  0.8003748  0.3539355  0.6533868
##   15  0.7990797  0.3600081  0.6546080
##   17  0.8040805  0.3571988  0.6608149
##   19  0.8065744  0.3551302  0.6620227
##   21  0.8090859  0.3567177  0.6639259
##   23  0.8146464  0.3506468  0.6710669
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knn_results <-
  knnModel$results |>
  filter(k == 11)

kbl(knn_results,
    caption = "KNN Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
KNN Training Model
k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
11 0.7948818 0.3594389 0.6471427 0.0638573 0.0721586 0.0444572
top_pred <- 
  varImp(knnModel)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Top 10 Predictors
Overall
manufacturing_process32 100.00000
manufacturing_process13 93.81970
manufacturing_process09 89.92917
manufacturing_process17 88.19815
biological_material06 82.60809
biological_material03 79.43762
manufacturing_process36 73.84749
biological_material12 72.36048
manufacturing_process06 69.00493
manufacturing_process11 62.33657
Note:
KNN Model
knnPred <- predict(knnModel, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

knn_metrics <- postResample(pred = knnPred, obs = test_y)
knn_metrics <- as_tibble(as.list(knn_metrics))

kbl(knn_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("KNN Model")
Prediction Metrics
RMSE Rsquared MAE
0.6800894 0.6760131 0.5026363
Note:
KNN Model

MARS

Multivariate Adaptive Regression Splines


set.seed(42)

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)

marsTuned <- train(train_x, train_y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.7695069  0.4220518  0.5984336
##   1        3      0.6768030  0.5256092  0.5403361
##   1        4      0.6245993  0.6027065  0.4990512
##   1        5      0.6615419  0.5563684  0.5349329
##   1        6      0.6605250  0.5613775  0.5284659
##   1        7      0.6548434  0.5832657  0.5275124
##   1        8      0.6451181  0.6045384  0.5197220
##   1        9      0.6374083  0.6156973  0.5091047
##   1       10      0.6457879  0.6041529  0.5182521
##   1       11      0.6599379  0.5886736  0.5353378
##   1       12      0.6493610  0.6024656  0.5211062
##   1       13      0.6586947  0.5938681  0.5231964
##   1       14      0.6607319  0.5945327  0.5248909
##   1       15      0.6663452  0.5909608  0.5336365
##   1       16      0.6667444  0.5854095  0.5326528
##   2        2      0.7695069  0.4220518  0.5984336
##   2        3      0.6536650  0.5696119  0.5257628
##   2        4      0.6609252  0.5861703  0.5381869
##   2        5      0.6488079  0.6109076  0.5240895
##   2        6      0.6509261  0.6047239  0.5101911
##   2        7      0.6536639  0.6051220  0.5157193
##   2        8      0.6480840  0.6141045  0.5226038
##   2        9      0.6523284  0.6172441  0.5168820
##   2       10      0.6445892  0.6187437  0.5023031
##   2       11      0.6532795  0.6173548  0.5105779
##   2       12      0.6779349  0.6076352  0.5293236
##   2       13      0.6575654  0.6177892  0.5163227
##   2       14      0.6617909  0.6061550  0.5242065
##   2       15      0.6571651  0.6128942  0.5187193
##   2       16      0.6685920  0.6013302  0.5310939
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
mars_results <-
  marsTuned$results |>
  filter(degree == 1, nprune == 4)

kbl(mars_results,
    caption = "MARS Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
MARS Training Model
degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 4 0.6245993 0.6027065 0.4990512 0.1061586 0.1334267 0.0822903
mars_top_pred <- 
  varImp(marsTuned)$importance |>
  arrange(desc(Overall))

kbl(head(mars_top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Top 10 Predictors
Overall
manufacturing_process32 100
manufacturing_process09 0
Note:
MARS Model
marsPred <- predict(marsTuned, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

mars_metrics <- postResample(pred = marsPred, obs = test_y)
mars_metrics <- as_tibble(as.list(mars_metrics))

kbl(mars_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("MARS Model")
Prediction Metrics
RMSE Rsquared MAE
0.6509033 0.6890094 0.5099939
Note:
MARS Model


SVM

Support Vector Machines

set.seed(42)

svmRTuned <- train(train_x, train_y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7595696  0.4827820  0.6261061
##      0.50  0.6978859  0.5322223  0.5743776
##      1.00  0.6339877  0.6039829  0.5167883
##      2.00  0.6037287  0.6372236  0.4860754
##      4.00  0.6032620  0.6356415  0.4873033
##      8.00  0.5846226  0.6518356  0.4732665
##     16.00  0.5846632  0.6519478  0.4732776
##     32.00  0.5846632  0.6519478  0.4732776
##     64.00  0.5846632  0.6519478  0.4732776
##    128.00  0.5846632  0.6519478  0.4732776
##    256.00  0.5846632  0.6519478  0.4732776
##    512.00  0.5846632  0.6519478  0.4732776
##   1024.00  0.5846632  0.6519478  0.4732776
##   2048.00  0.5846632  0.6519478  0.4732776
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01496228
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01496228 and C = 8.
svm_results <-
  svmRTuned$results |>
  filter(round(sigma, 8) == 0.01496228, C == 8)

kbl(svm_results,
    caption = "SVM Training Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
SVM Training Model
sigma C RMSE Rsquared MAE RMSESD RsquaredSD MAESD
0.0149623 8 0.5846226 0.6518356 0.4732665 0.0958087 0.171438 0.0898937
svm_top_pred <- 
  varImp(svmRTuned)$importance |>
  arrange(desc(Overall))

kbl(head(svm_top_pred, 10),
    caption = "Top 10 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Top 10 Predictors
Overall
manufacturing_process32 100.00000
manufacturing_process13 93.81970
manufacturing_process09 89.92917
manufacturing_process17 88.19815
biological_material06 82.60809
biological_material03 79.43762
manufacturing_process36 73.84749
biological_material12 72.36048
manufacturing_process06 69.00493
manufacturing_process11 62.33657
Note:
SVM Model
svmPred <- predict(svmRTuned, newdata = test_x)

## The function 'postResample' can be used to get the test set
## performance values

svm_metrics <- postResample(pred = svmPred, obs = test_y)
svm_metrics <- as_tibble(as.list(svm_metrics))

kbl(svm_metrics,
    caption = "Prediction Metrics") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote("SVM Model")
Prediction Metrics
RMSE Rsquared MAE
0.6808684 0.6256048 0.4918045
Note:
SVM Model

Results

Our SVM model gives the best resampling performance, with a training \(R^2 = 0.6518\); MARS is just below it at \(R^2 = 0.6027\). On the test set, however, MARS comes out ahead with \(R^2 = 0.6890\) and the lowest RMSE, while the SVM drops to \(R^2 = 0.6256\).

append_tables <- function(train_table, pred_table, model_name){
  
  train_table <-
    train_table |>
    select(RMSE, Rsquared) |>
    rename(Train_RMSE = RMSE,
           Train_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)

  pred_table <-
    pred_table |>
    select(RMSE, Rsquared) |>
    rename(Pred_RMSE = RMSE,
           Pred_Rsq = Rsquared) |>
    mutate(Model = model_name) |>
    relocate(Model)
  
  
    return(train_table |>
              left_join(pred_table, join_by(Model)))
}

knn_table <- append_tables(knn_results, knn_metrics, "KNN")
mars_table <- append_tables(mars_results, mars_metrics, "MARS")
svm_table <- append_tables(svm_results, svm_metrics, "SVM")
model_table <- 
  bind_rows(knn_table, mars_table, svm_table)

kbl(model_table,
    caption = "Model Comparisons",
    digits=4) |>
  kable_classic(full_width = F, html_font = "Cambria")
Model Comparisons
Model  Train_RMSE  Train_Rsq  Pred_RMSE  Pred_Rsq
KNN        0.7949     0.3594     0.6801    0.6760
MARS       0.6246     0.6027     0.6509    0.6890
SVM        0.5846     0.6518     0.6809    0.6256
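
As a supplementary view (our addition, reusing the knnPred, marsPred, and svmPred objects above), we can plot the test set predictions against the observed yields; points hugging the dashed identity line indicate better fits:

## Stack the three prediction vectors into one long data frame.
pred_df <- bind_rows(
  tibble(Model = "KNN",  obs = test_y, pred = as.numeric(knnPred)),
  tibble(Model = "MARS", obs = test_y, pred = as.numeric(marsPred)),
  tibble(Model = "SVM",  obs = test_y, pred = as.numeric(svmPred)))

## Observed vs predicted yield, one panel per model.
pred_df |>
  ggplot(aes(x = obs, y = pred)) +
  geom_point(colour = "#4E79A7", alpha = 0.6) +
  geom_abline(linetype = "dashed") +
  facet_wrap(~ Model) +
  theme_bw() +
  labs(x = "Observed Yield (scaled)", y = "Predicted Yield",
       title = "Test Set Predictions by Model")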
  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

The process variables still dominate the top ten list of important predictors. Comparing to the optimal linear model from Exercise 6.3, the top four process predictors here also appeared in that model's top five. However, more biological materials reach the top ten with our SVM than with the linear model.

svm_top_pred |>
  arrange(desc(Overall)) |>
  head(10) %>%
  ggplot(aes(y=reorder(row.names(.), Overall), x=Overall, label=round(Overall,2))) +
  geom_bar(stat='identity', fill="#F28E2B") +
  geom_text(position = position_stack(vjust = 0.5),
            size=3,
            fontface = "bold",
            colour="black") +
  theme_minimal() +
  labs(y="", title = "SVM Predictors by Model Importance") +
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"))

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

We can see the strength of the relationships between yield and the top predictors. manufacturing_process32, the most important predictor in our model, shows a moderately strong positive correlation with yield (0.61). In general, the top predictors have moderately strong relationships with the response yield, in both directions. If we can manage these specific predictors in the chemical manufacturing process, we may be able to increase total yield.

filtered_pred <-
  rownames_to_column(head(svm_top_pred, 10))

pred_corr <-
  filtered_chem |>
  select(yield, all_of(filtered_pred$rowname))

q <- cor(pred_corr)

ggcorrplot(q, type = "upper", outline.color = "white",
           ggtheme = theme_classic,
           colors = c("#4E79A7", "white", "#F28E2B"),
           lab = TRUE, show.legend = F, tl.cex = 5, lab_size = 3)
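
To complement the correlation matrix, a sketch of the raw relationships (reusing pred_corr from above): yield plotted against each of the top ten predictors with a linear trend overlaid:

## Pivot the predictors into long form so each one gets its own panel.
pred_corr |>
  pivot_longer(-yield, names_to = "predictor") |>
  ggplot(aes(x = value, y = yield)) +
  geom_point(colour = "#4E79A7", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "#F28E2B") +
  facet_wrap(~ predictor, scales = "free_x") +
  theme_bw() +
  labs(x = "", title = "Yield vs Top SVM Predictors")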