Questions 7.2 and 7.5 from the Kuhn& Johnson book

library(caret)
library(corrplot)
library(dplyr)
library(data.table)
library(e1071)
library(earth)
library(fable)
library(forecast)
library(fpp2)
library(fpp3)
library(ggplot2)
library(GGally)
library(knitr)
library(kernlab)
library(latex2exp) 
library(mlbench)
library(MASS)
library(purrr)
library(pls)
library(patchwork)
library(RANN) 
library(stats)
library(tsibble)
library(tidyr)

——————————————————————————–

(7.2) Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data:

\(y = 10sin(πx_1x_2) + 20(x_3-0.5)^2 + 10x_4 +5x_5 + N(0,σ^2)\)

where the \(x\) values are random variables uniformly distributed between \([0, 1]\) (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData = mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x = data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data.
Store results for comparison
results <- data.frame(matrix(vector(), 0, 3,
                dimnames = list(c(), c("RMSE","Rsquared","MAE"))),
                stringsAsFactors = FALSE)
KNN
model_knn <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "knn",
                   preProc = c("center", "scale"), 
                   tuneLength = 10)

model_knn
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# Store predictions for accuracy against test data
pred_knn <- predict(model_knn, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_knn, obs = testData$y))

Check the closest neighbors for each predictor by looking at Euclidean distance between observations.

MARS Regression
library(earth)
control <- trainControl(method = "cv")
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
model_mars <- train(trainingData$x, 
                    trainingData$y,
                    method = "earth",
                    tuneGrid = marsGrid,
                    preProcess = c("center", "scale"),
                    tuneLength = 10,
                    trControl = control)
model_mars
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.462296  0.2176253  3.697979
##   1        3      3.720663  0.4673821  2.949121
##   1        4      2.680039  0.7094916  2.123848
##   1        5      2.333538  0.7781559  1.856629
##   1        6      2.367933  0.7754329  1.901509
##   1        7      1.809983  0.8656526  1.414985
##   1        8      1.692656  0.8838936  1.333678
##   1        9      1.704958  0.8845683  1.339517
##   1       10      1.688559  0.8842495  1.309838
##   1       11      1.669043  0.8886165  1.296522
##   1       12      1.645066  0.8892796  1.271981
##   1       13      1.655570  0.8886896  1.271232
##   1       14      1.666354  0.8879143  1.285545
##   1       15      1.666354  0.8879143  1.285545
##   2        2      4.440854  0.2204755  3.686796
##   2        3      3.697203  0.4714312  2.938566
##   2        4      2.664266  0.7149235  2.119566
##   2        5      2.313371  0.7837374  1.852172
##   2        6      2.335796  0.7875253  1.841919
##   2        7      1.833248  0.8623489  1.461538
##   2        8      1.695822  0.8883658  1.329030
##   2        9      1.555106  0.9028532  1.221365
##   2       10      1.497805  0.9088251  1.158054
##   2       11      1.419280  0.9207646  1.139722
##   2       12      1.326566  0.9315939  1.066200
##   2       13      1.266877  0.9354482  1.002983
##   2       14      1.256694  0.9349307  1.006273
##   2       15      1.311401  0.9316487  1.039213
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
# Store predictions for accuracy against test data
pred_mars <- predict(model_mars, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_mars, obs = testData$y))

MARS Regression (multivariate adaptive regression splines) is a non-parametric regression technique that automatically captures non-linearity and interaction between predictors.

Neural Network
nnet_Grid <- expand.grid(.decay = c(0, 0.01, .1), .size = c(1:10), .bag = FALSE)
nnet_maxnwts <- 5 * (ncol(trainingData$x) + 1) + 5 + 1
model_nnet <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "avNNet",
                    preProcess = c("center", "scale"),
                    tuneGrid = nnet_Grid,
                    trControl = control,
                    linout = TRUE,
                    trace = FALSE,
                    MaxNWts = nnet_maxnwts,
                    maxit = 500)
model_nnet
## Model Averaged Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0.00    1    2.409360  0.7626158  1.918956
##   0.00    2    2.495497  0.7466800  1.998154
##   0.00    3    2.042870  0.8276737  1.620556
##   0.00    4    2.218146  0.8058404  1.608860
##   0.00    5    2.290307  0.7791256  1.785130
##   0.00    6         NaN        NaN       NaN
##   0.00    7         NaN        NaN       NaN
##   0.00    8         NaN        NaN       NaN
##   0.00    9         NaN        NaN       NaN
##   0.00   10         NaN        NaN       NaN
##   0.01    1    2.441045  0.7589837  1.927773
##   0.01    2    2.408586  0.7646177  1.924190
##   0.01    3    2.094390  0.8207054  1.639698
##   0.01    4    2.019234  0.8369700  1.608611
##   0.01    5    2.186633  0.8126958  1.765935
##   0.01    6         NaN        NaN       NaN
##   0.01    7         NaN        NaN       NaN
##   0.01    8         NaN        NaN       NaN
##   0.01    9         NaN        NaN       NaN
##   0.01   10         NaN        NaN       NaN
##   0.10    1    2.451052  0.7561366  1.935306
##   0.10    2    2.439995  0.7533865  1.925580
##   0.10    3    2.142421  0.8119015  1.697562
##   0.10    4    2.021443  0.8367619  1.613081
##   0.10    5    2.058645  0.8303731  1.641058
##   0.10    6         NaN        NaN       NaN
##   0.10    7         NaN        NaN       NaN
##   0.10    8         NaN        NaN       NaN
##   0.10    9         NaN        NaN       NaN
##   0.10   10         NaN        NaN       NaN
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0.01 and bag = FALSE.
# Store predictions for accuracy against test data
pred_nnet <- predict(model_nnet, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_nnet, obs = testData$y))

Input layer goes through many hidden layers, where weights are randomized, resampled, and adjusted to the output layer.

SVM
model_svm <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 10,
                   trControl = control)
model_svm
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.485843  0.8015980  1.997058
##     0.50  2.217552  0.8191439  1.783734
##     1.00  2.038394  0.8395080  1.621050
##     2.00  1.931284  0.8537910  1.510868
##     4.00  1.875643  0.8624807  1.471216
##     8.00  1.873494  0.8639409  1.479643
##    16.00  1.886621  0.8626959  1.496953
##    32.00  1.886621  0.8626959  1.496953
##    64.00  1.886621  0.8626959  1.496953
##   128.00  1.886621  0.8626959  1.496953
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0623323
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0623323 and C = 8.
# Store predictions for accuracy against test data
svm_pred <- predict(model_svm, newdata = testData$x)
results <- rbind(results, postResample(pred = svm_pred, obs = testData$y))

Support Vector Machines are used for regression or classification by making a hyper-plane decision boundary in some \(n\)-dimensional space.

Which models appear to give the best performance? Does MARS select the informative predictors (those named \(x_1\)\(x_5\))?
colnames(results) <- c("RMSE","R-Squared","MAE")
results$Model <- c("KNN", "MARS", "Neural Network", "SVM")
results <- results %>% relocate("Model") %>% arrange(RMSE)
results
##            Model     RMSE R-Squared      MAE
## 1           MARS 1.277999 0.9338365 1.014707
## 2            SVM 2.051520 0.8294511 1.556470
## 3 Neural Network 2.061504 0.8305890 1.561138
## 4            KNN 3.204059 0.6819919 2.568346
# Check if it selects x_1 to x_5
varImp(model_mars)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.40
## X2   49.00
## X5   15.72
## X3    0.00

MARS gives the best overall performance, by having the greatest \(R^2\) and lowest errors, and it selects the informative predictors, although with varying importance levels.

——————————————————————————–

(7.5) Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Load chemical data
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

################################################################################
# Impute missing values
CMP_impute <- ChemicalManufacturingProcess %>%
  preProcess("knnImpute")

CMP_complete <- predict(CMP_impute, ChemicalManufacturingProcess)

################################################################################
# Remove near zero Variance predictors
CMP_complete <- CMP_complete[, -nearZeroVar(CMP_complete)]

################################################################################
# Train Test Split
set.seed(12345)

index <- createDataPartition(CMP_complete$Yield, p = .8, list = FALSE)

# Train
train_predictors <- CMP_complete[index, -1]  
train_yield <- CMP_complete[index, 1]

# Test
test_predictors <- CMP_complete[-index, -1]
test_yield <- CMP_complete[-index, 1]
KNN
model_chem_knn <- train(x=train_predictors, 
                        y=train_yield,
                        method = "knn",
                        preProc = c("center", "scale"),
                        tuneLength = 10)
MARS Regression
control <- trainControl(method = "cv")
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)

model_chem_mars <- train(x=train_predictors, 
                         y=train_yield,
                         method = "earth",
                         tuneGrid = marsGrid,
                         preProcess = c("center", "scale"),
                         tuneLength = 10,
                         trControl = control)
Neural Network
nnet_Grid <- expand.grid(.decay = c(0, 0.01, .1), .size = c(1:10), .bag = FALSE)
nnet_maxnwts <- 5 * (ncol(train_predictors) + 1) + 5 + 1

model_chem_nnet <- train(x=train_predictors, 
                         y=train_yield,
                         method = "avNNet",
                         preProcess = c("center", "scale"),
                         tuneGrid = nnet_Grid,
                         trControl = control,
                         linout = TRUE,
                         trace = FALSE,
                         MaxNWts = nnet_maxnwts,
                         maxit = 500)
SVM
control <- trainControl(method = "cv")

model_chem_svm <- train(x=train_predictors, 
                   y=train_yield,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 10,
                   trControl = control)

(a) Which nonlinear regression model gives the optimal resampling and test set performance?

results_chem <- data.frame(matrix(vector(), 0, 3,
                dimnames = list(c(), c("RMSE","Rsquared","MAE"))),
                stringsAsFactors = FALSE)

pred_chem_knn  <- predict(model_chem_knn, newdata = test_predictors)
pred_chem_mars <- predict(model_chem_mars, newdata = test_predictors)
pred_chem_nnet <- predict(model_chem_nnet, newdata = test_predictors)
pred_chem_svm  <- predict(model_chem_svm, newdata = test_predictors)

results_chem <- rbind(results_chem, postResample(pred = pred_chem_knn, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_mars, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_nnet, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_svm, obs = test_yield))

colnames(results_chem) <- c("RMSE","R-Squared","MAE")
results_chem$Model <- c("KNN", "MARS", "Neural Network", "SVM")
results_chem <- results_chem %>% relocate("Model") %>% arrange(RMSE)
results_chem
##            Model      RMSE R-Squared       MAE
## 1            SVM 0.5818004 0.6237450 0.4734947
## 2 Neural Network 0.6205936 0.6156185 0.5148025
## 3            KNN 0.6294354 0.5890334 0.5464538
## 4           MARS 0.8650264 0.3076008 0.6402553

For the chemical manufacturing process data, the best-performing model is SVM.

(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear model?

varImp(model_chem_svm)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   84.32
## ManufacturingProcess09   79.70
## ManufacturingProcess17   74.30
## BiologicalMaterial06     68.60
## ManufacturingProcess36   64.77
## BiologicalMaterial03     64.43
## BiologicalMaterial02     59.70
## BiologicalMaterial12     58.79
## ManufacturingProcess06   58.19
## ManufacturingProcess31   56.21
## BiologicalMaterial11     48.83
## ManufacturingProcess11   48.23
## ManufacturingProcess30   46.69
## BiologicalMaterial08     45.14
## BiologicalMaterial04     44.95
## ManufacturingProcess29   42.66
## ManufacturingProcess33   38.27
## BiologicalMaterial01     35.50
## ManufacturingProcess12   35.30
plot(varImp(model_chem_svm, top = 20))

For the chemical manufacturing process data, the most important predictors in the SVM model are the manufacturing process variables. When comparing to the results of assignment 7, problem 6.3f, I can see that alot of variables actually shifted around, and that the top 10 predictors are still 6 manufacturing processes and 4 biological processes. With the exception of BiologicalMaterial02, the other three (BM06, BM03, BM12) all went up in terms of importance.

(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

library(corrplot)
library(dplyr)
library(tidyselect)

top_predictors <- varImp(model_chem_svm)$importance %>%
  arrange(-Overall) %>%
  head(10)

# Extract top predictor names 
top_predictor_names <- rownames(top_predictors)

# Get correlations
cor_matrix <- CMP_complete %>%
  dplyr::select(all_of(c("Yield", top_predictor_names))) %>%
  cor()

# Corrplot
corrplot::corrplot(cor_matrix,
                   method = "color",
                   type = "upper",
                   addCoef.col = "black",
                   tl.cex = 0.8,
                   tl.col = "black",
                   diag = FALSE)

print(top_predictor_names)
##  [1] "ManufacturingProcess13" "ManufacturingProcess32" "ManufacturingProcess09"
##  [4] "ManufacturingProcess17" "BiologicalMaterial06"   "ManufacturingProcess36"
##  [7] "BiologicalMaterial03"   "BiologicalMaterial02"   "BiologicalMaterial12"  
## [10] "ManufacturingProcess06"

top_predictors <- CMP_complete %>% select(‘ManufacturingProcess13’, ‘ManufacturingProcess32’, ‘ManufacturingProcess09’, ‘ManufacturingProcess17’,‘BiologicalMaterial06’, ‘ManufacturingProcess36’, ‘BiologicalMaterial03’, ‘BiologicalMaterial02’, ‘BiologicalMaterial12’, ‘ManufacturingProcess06’, ‘Yield’)