Load Packages

library(AppliedPredictiveModeling)
library(caret)
library(corrplot)
library(earth)
library(e1071)
library(elasticnet)
library(grplasso)
library(kernlab)
library(lars)
library(MASS)
library(mlbench)
library(nnet)
library(penalized)
library(pls)
library(stats)
library(tidyverse)
library(VIM)

Exercise 2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data: $y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)$, where the $x$ values are random variables uniformly distributed on [0, 1] (five other non-informative variables are also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

# Simulate the Friedman benchmark: 200 training observations, noise sd = 1.
set.seed(81)
trainingData <- mlbench.friedman1(n = 200, sd = 1)
# Store the predictors as a data frame rather than the raw simulation output.
trainingData$x <- data.frame(trainingData$x)

# Plot each predictor against the response.
featurePlot(x = trainingData$x, y = trainingData$y)

# Draw a large test set (5000 observations) from the same simulation so that
# test-set performance can be estimated precisely.
testData <- mlbench.friedman1(n = 5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

# k-nearest neighbours baseline. Predictors are centred and scaled before
# fitting, and tuneLength = 10 asks caret for ten candidate tuning values.
# The argument is spelled `preProcess` in full instead of relying on R's
# partial matching of `preProc`.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)

knnPred <- predict(knnModel, newdata = testData$x)

# Test-set performance of the tuned KNN model.
postResample(pred = knnPred, obs = testData$y)
##     RMSE Rsquared      MAE 
## 3.348767 0.624878 2.707009

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

# Model-averaged neural network (avNNet), tuned with 10-fold cross-validation
# over weight decay and the number of hidden units.
set.seed(82)
ctrl <- trainControl(method = "cv", number = 10)
nnet_grid <- expand.grid(decay = c(0, 0.01, 0.1), size = 1:10, bag = FALSE)
# Weight budget for the largest network in the grid: H * (P + 1) + H + 1 for
# H hidden units and P predictors. H is read from the grid so the bound stays
# correct if the grid is changed.
nnet_max_size <- max(nnet_grid$size)
nnet_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "avNNet",
  tuneGrid = nnet_grid,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  MaxNWts = nnet_max_size * (ncol(trainingData$x) + 1) + nnet_max_size + 1,
  maxit = 500
)
nnet_pred <- predict(nnet_model, newdata = testData$x)
# Test-set performance of the tuned network.
postResample(pred = nnet_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0922308 0.8267906 1.6124033
varImp(nnet_model)
## loess r-squared variable importance
## 
##     Overall
## X4  100.000
## X1   41.858
## X2   39.015
## X5   33.470
## X3   15.448
## X8   15.050
## X10   7.249
## X6    3.797
## X9    1.179
## X7    0.000
# MARS model (method = "earth"). No trControl is supplied, so caret's default
# resampling scheme is used; tuneLength = 10 requests ten candidate tuning
# values. `preProcess` is spelled out rather than partially matched.
set.seed(83)
mars_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
mars_pred <- predict(mars_model, newdata = testData$x)
# Test-set performance of the tuned MARS model.
postResample(pred = mars_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.7955370 0.8749859 1.3722738
# Importance as reported for the earth fit; only X1-X5 are listed.
varImp(mars_model)
## earth variable importance
## 
##    Overall
## X4  100.00
## X1   62.87
## X2   45.11
## X5   18.67
## X3    0.00
# Radial-basis SVM. No trControl is supplied, so caret's default resampling
# scheme is used; tuneLength = 10 requests ten candidate tuning values.
# `preProcess` is spelled out rather than partially matched.
set.seed(84)
svm_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
svm_pred <- predict(svm_model, newdata = testData$x)
# Test-set performance of the tuned SVM.
postResample(pred = svm_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0730089 0.8317222 1.6154641
# NOTE(review): this table is headed "loess r-squared variable importance"
# and matches the one printed for the neural network, so it appears to be a
# model-independent score - confirm against the caret varImp documentation.
varImp(svm_model)
## loess r-squared variable importance
## 
##     Overall
## X4  100.000
## X1   41.858
## X2   39.015
## X5   33.470
## X3   15.448
## X8   15.050
## X10   7.249
## X6    3.797
## X9    1.179
## X7    0.000

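As seen from the results above, the MARS model gives the best test-set performance, with an RMSE of 1.7955370, an R^2 of 0.8749859, and an MAE of 1.3722738: the lowest errors and the highest R^2 of all four models. The MARS model does indeed select the informative predictors (X1–X5), ranked in the order shown above, and none of the non-informative predictors (X6–X10) appear in its importance list; note, however, that X3 receives an importance score of 0.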

Exercise 5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

# Load the chemical manufacturing data, impute missing values with 5-nearest
# neighbours, and drop near-zero-variance columns.
data(ChemicalManufacturingProcess)
# imp_var = FALSE stops kNN() from appending its "<name>_imp" indicator
# columns, so the original columns are returned without relying on a
# hard-coded column count. select() is qualified because MASS, which is also
# attached, exports a function of the same name.
cmp <- ChemicalManufacturingProcess %>%
  kNN(k = 5, imp_var = FALSE) %>%
  dplyr::select(!nearZeroVar(.))

# Seed once for the split and for all Exercise 5 model fits that follow.
set.seed(85)

# 10-fold cross-validation, shared by every model in this exercise.
ctrl1 <- trainControl(method = "cv", number = 10)
# Hold out 20% of the rows as a test set, partitioning on Yield.
split <- createDataPartition(y = cmp$Yield, p = 0.8, list = FALSE)
train.data <- cmp[split, ]
test.data <- cmp[-split, ]

# KNN on the chemical manufacturing data: centred and scaled predictors,
# ten candidate tuning values, resampled with the shared 10-fold CV control.
knn_model1 <- train(
  Yield ~ .,
  data = train.data,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl1
)
knn_pred1 <- predict(knn_model1, newdata = test.data)
# Test-set performance.
postResample(pred = knn_pred1, obs = test.data$Yield)
##      RMSE  Rsquared       MAE 
## 1.5407310 0.3995876 1.2454375
# Model-averaged neural network on the chemical manufacturing data, tuned
# over weight decay and the number of hidden units with 10-fold CV.
nnet_grid1 <- expand.grid(decay = c(0, 0.01, 0.1), size = 1:10, bag = FALSE)
# Weight budget for the largest network in the grid: H * (P + 1) + H + 1.
# train.data still contains the response, so Yield is excluded when counting
# the P predictors; H is read from the grid rather than hard-coded.
nnet1_max_size <- max(nnet_grid1$size)
nnet1_n_predictors <- ncol(train.data) - 1
nnet_model1 <- train(
  Yield ~ .,
  data = train.data,
  method = "avNNet",
  tuneGrid = nnet_grid1,
  trControl = ctrl1,
  preProcess = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  MaxNWts = nnet1_max_size * (nnet1_n_predictors + 1) + nnet1_max_size + 1,
  maxit = 500
)
nnet_pred1 <- predict(nnet_model1, newdata = test.data)
# Test-set performance.
postResample(pred = nnet_pred1, obs = test.data$Yield)
##      RMSE  Rsquared       MAE 
## 1.4904717 0.4294837 1.1301685
# MARS (earth) on the chemical manufacturing data: centred and scaled
# predictors, ten candidate tuning values, shared 10-fold CV control.
mars_model1 <- train(
  Yield ~ .,
  data = train.data,
  method = "earth",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl1
)
mars_pred1 <- predict(mars_model1, newdata = test.data)
# Test-set performance.
postResample(pred = mars_pred1, obs = test.data$Yield)
##      RMSE  Rsquared       MAE 
## 1.3937965 0.5013424 1.1221161
# Radial-basis SVM on the chemical manufacturing data: centred and scaled
# predictors, ten candidate tuning values, shared 10-fold CV control.
svm_model1 <- train(
  Yield ~ .,
  data = train.data,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl1
)
svm_pred1 <- predict(svm_model1, newdata = test.data)
# Test-set performance.
postResample(pred = svm_pred1, obs = test.data$Yield)
##      RMSE  Rsquared       MAE 
## 1.2625926 0.6915330 0.9494307

Which nonlinear regression model gives the optimal resampling and test set performance?

As can be seen in the results above, after running the KNN, neural network, MARS, and SVM models, the SVM model had the best test-set performance, with an RMSE of 1.2625926, an R^2 of 0.6915330, and an MAE of 0.9494307.

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

# Variable importance for the best nonlinear model (SVM).
# NOTE(review): this printout and the ridge printout further down are both
# headed "loess r-squared variable importance" and list identical scores, so
# they appear to be a model-independent filter score, not a model-specific
# ranking - confirm against the caret varImp documentation before drawing
# conclusions from their agreement.
varImp(svm_model1)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   88.02
## BiologicalMaterial06     80.62
## ManufacturingProcess36   76.93
## ManufacturingProcess17   76.22
## ManufacturingProcess09   73.66
## ManufacturingProcess31   67.43
## BiologicalMaterial02     65.15
## BiologicalMaterial12     64.23
## BiologicalMaterial03     63.28
## ManufacturingProcess06   54.74
## ManufacturingProcess33   53.22
## ManufacturingProcess29   48.11
## ManufacturingProcess11   46.12
## BiologicalMaterial04     42.82
## BiologicalMaterial11     42.22
## BiologicalMaterial08     38.75
## BiologicalMaterial01     36.12
## ManufacturingProcess30   34.48
## BiologicalMaterial09     31.08
# Ridge regression as the linear reference model: 25 penalty values between
# 0.001 and 0.5, selected by cross-validated R^2. Argument names are spelled
# out in full (`length.out`, `preProcess`) instead of being partially matched.
ridge_grid1 <- data.frame(lambda = seq(0.001, 0.5, length.out = 25))
ridge_model1 <- train(
  Yield ~ .,
  data = train.data,
  method = "ridge",
  metric = "Rsquared",
  tuneGrid = ridge_grid1,
  trControl = ctrl1,
  preProcess = c("center", "scale")
)

varImp(ridge_model1)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   88.02
## BiologicalMaterial06     80.62
## ManufacturingProcess36   76.93
## ManufacturingProcess17   76.22
## ManufacturingProcess09   73.66
## ManufacturingProcess31   67.43
## BiologicalMaterial02     65.15
## BiologicalMaterial12     64.23
## BiologicalMaterial03     63.28
## ManufacturingProcess06   54.74
## ManufacturingProcess33   53.22
## ManufacturingProcess29   48.11
## ManufacturingProcess11   46.12
## BiologicalMaterial04     42.82
## BiologicalMaterial11     42.22
## BiologicalMaterial08     38.75
## BiologicalMaterial01     36.12
## ManufacturingProcess30   34.48
## BiologicalMaterial09     31.08

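The first of the two tables above shows which predictors are most important for the SVM nonlinear regression model built previously. Among the top 10 predictors, the process variables dominate the list: six are manufacturing process variables and four are biological material variables. Compared to the top 10 predictors from the optimal linear model (the ridge model seen above), the top 10 important predictors are exactly the same and appear in the same order; they are only slightly reordered when compared to the results from the previous chapter’s exercise. Note that both tables are labelled "loess r-squared variable importance" and contain identical scores, so the agreement between the two models is expected rather than informative.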

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

# Correlation matrix of the ten highest-ranked predictors together with Yield,
# drawn as a correlation plot.
cmp_cor <- cmp %>%
  select(all_of(c(
    "ManufacturingProcess32", "ManufacturingProcess13",
    "BiologicalMaterial06", "ManufacturingProcess36",
    "ManufacturingProcess17", "ManufacturingProcess09",
    "ManufacturingProcess31", "BiologicalMaterial02",
    "BiologicalMaterial12", "BiologicalMaterial03",
    "Yield"
  ))) %>%
  cor()
corrplot::corrplot(cmp_cor)

As was seen when we previously assessed the ChemicalManufacturingProcess data set, the correlations between the predictors and yield give us intuition about which processes and materials are most strongly correlated with yield and may therefore lead to higher outcomes. The correlations among the predictors themselves show which variables are highly correlated with each other, indicating which ones could be removed and which ones deserve more focus in order to achieve better results.