library(AppliedPredictiveModeling)
library(dplyr)
library(forecast)
library(ggplot2)
library(tidyr)
library(mice)
library(corrplot)
library(MASS)
library(mlbench)
library(caret)
library(earth)
library(RANN)

Exercises 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

y = 10 sin(π x₁x₂) + 20(x₃ − 0.5)² + 10x₄ + 5x₅ + N(0, σ²)

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(700)
trainingData = mlbench.friedman1(200, sd = 1)   # list with matrix 'x' and vector 'y'
trainingData$x = data.frame(trainingData$x)     # matrix -> data frame (adds column names)
featurePlot(trainingData$x, trainingData$y, col="darkblue")

mlbench.friedman1 creates a list with a vector ‘y’ and a matrix of predictors ‘x’. We convert the ‘x’ data from a matrix to a data frame; one reason is that this gives the columns names. We also simulate a large test set to estimate the true error rate with good precision:

testData = mlbench.friedman1(5000, sd = 1)
testData$x = data.frame(testData$x)

Tune several models on these data. For example:

knnModel <- train(x = trainingData$x, y = trainingData$y,
                  method = "knn", 
                  preProc = c("center", "scale"), 
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.592164  0.4700127  2.871999
##    7  3.521376  0.4958189  2.820022
##    9  3.515259  0.5070750  2.813598
##   11  3.498618  0.5253673  2.812452
##   13  3.506894  0.5367038  2.814692
##   15  3.516859  0.5528522  2.821580
##   17  3.521950  0.5693887  2.824534
##   19  3.543040  0.5776504  2.842053
##   21  3.553574  0.5835731  2.854556
##   23  3.570695  0.5921952  2.881045
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knnPred <- predict(knnModel, newdata = testData$x)
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.3162685 0.6485782 2.6437241
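
The tuning profile across k can be plotted directly from the train object (caret provides a ggplot method for train objects):

ggplot(knnModel)   # RMSE vs. number of neighbors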

MARS

set.seed(701)

# Default MARS fit (earth's built-in forward pass and pruning, no tuning grid)
marsFit <- earth(trainingData$x, trainingData$y)

marsPred <- predict(marsFit, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.8290283 0.8682575 1.4246109

MARS Tuned

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

set.seed(702)

marsTuned <- train(trainingData$x, trainingData$y, method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTunePred <- predict(marsTuned, newdata = testData$x)
postResample(pred = marsTunePred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.3589618 0.9272836 1.0668592
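
The settings chosen by cross-validation can be read off the train object (exact values depend on the seed):

marsTuned$bestTune   # selected interaction degree and number of retained terms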

SVM Tuned. A radial-basis-function SVM is tuned over 15 cost values with 10-fold cross-validation.

set.seed(703)

svmRTuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 15,
                   trControl = trainControl(method = "cv"))

svmPred <- predict(svmRTuned, newdata = testData$x)
postResample(svmPred, testData$y)
##      RMSE  Rsquared       MAE 
## 2.0150962 0.8418664 1.5510843
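
Collecting the test-set metrics into one table makes the comparison easier; a small summary sketch using the predictions computed above:

results <- rbind(KNN        = postResample(knnPred, testData$y),
                 MARS       = postResample(marsPred, testData$y),
                 MARS_Tuned = postResample(marsTunePred, testData$y),
                 SVM_Tuned  = postResample(svmPred, testData$y))
round(results, 4)   # RMSE, Rsquared, and MAE side by side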

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

Answer: The tuned MARS model gives the best performance, with an R-squared of 0.9272836 on the test set. The MARS model does select the informative predictors X1–X5.
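
To verify which predictors the tuned MARS model actually retained, inspect its variable importance (a quick check; exact rankings depend on the seed):

varImp(marsTuned)             # caret's importance for the final earth model
evimp(marsTuned$finalModel)   # earth's own importance measure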

Exercises 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

Load the data:

data(ChemicalManufacturingProcess)

Impute missing values with caret's knnImpute, remove the near-zero-variance columns using base R's df[rows, columns] indexing, then split the data 80/20:

imputed_data <- preProcess(ChemicalManufacturingProcess, "knnImpute")  # also centers/scales
full_data <- predict(imputed_data, ChemicalManufacturingProcess)
low_values <- nearZeroVar(full_data)       # near-zero-variance columns
chem_data <- full_data[,-low_values]       # drop them with base R indexing
index_chem <- createDataPartition(chem_data$Yield, p=.8, list=F)
train_chem <- chem_data[index_chem,]
test_chem <- chem_data[-index_chem,]

  a. Which nonlinear regression model gives the optimal resampling and test set performance?

KNN

set.seed(800)
knnModel <- train(Yield~.,
                  data = train_chem,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)

knnPred <- predict(knnModel,  test_chem)
postResample(pred = knnPred, obs = test_chem$Yield)
##      RMSE  Rsquared       MAE 
## 0.8472828 0.3562812 0.6960110

Tuned Neural Network

set.seed(801)
# Grid over weight decay and hidden-layer size; .bag = FALSE averages over random starts only
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10), .bag = FALSE)

nnetTune <- train(Yield~.,
                  data = train_chem,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = trainControl(method = "cv"),
                  linout = TRUE, trace = FALSE,
                  # weight cap sized for the largest candidate network (size = 10)
                  MaxNWts = 10 * (ncol(train_chem) + 1) + 10 + 1,
                  maxit = 100)
nnetPred <- predict(nnetTune, test_chem)
postResample(pred = nnetPred, obs = test_chem$Yield)
##      RMSE  Rsquared       MAE 
## 0.5767047 0.6950333 0.4667609

MARS Tuned

set.seed(802)
marsTuned_chem <- train(Yield~.,
                        data = train_chem,
                        method = "earth",
                        tuneGrid = marsGrid,   # reuse the grid from Exercise 7.2
                        trControl = trainControl(method = "cv"))

marsTunePred_chem <- predict(marsTuned_chem,  test_chem)
postResample(marsTunePred_chem, test_chem$Yield)
##      RMSE  Rsquared       MAE 
## 0.7252408 0.5495472 0.5710779

SVM Tuned

set.seed(803)
svmTuned_chem <- train(Yield~.,
                       data = train_chem,
                       method = "svmRadial",
                       tuneLength = 15,
                       trControl = trainControl(method = "cv"))

svmTunePred_chem <- predict(svmTuned_chem,  test_chem)
postResample(svmTunePred_chem, test_chem$Yield)
##      RMSE  Rsquared       MAE 
## 0.5569832 0.7377903 0.4498412

Answer: The tuned SVM model gives the optimal resampling and test set performance, with RMSE: 0.5569832 and R-squared: 0.7377903.
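
The cross-validated resampling distributions can also be compared with caret's resamples(). Each model above was tuned with its own seed, so the folds differ between models; for a strict comparison the models would share a single trainControl with a fixed index. As a rough check:

resamp <- resamples(list(NNet = nnetTune, MARS = marsTuned_chem, SVM = svmTuned_chem))
summary(resamp)   # resampled RMSE, Rsquared, and MAE for each model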

  b. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

varImp(svmTuned_chem)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## BiologicalMaterial06    100.00
## ManufacturingProcess32   99.33
## ManufacturingProcess13   83.87
## ManufacturingProcess31   74.14
## BiologicalMaterial02     73.66
## ManufacturingProcess36   68.88
## BiologicalMaterial03     68.73
## BiologicalMaterial12     65.69
## ManufacturingProcess17   62.22
## ManufacturingProcess09   58.46
## ManufacturingProcess29   57.65
## BiologicalMaterial04     55.58
## ManufacturingProcess33   54.86
## ManufacturingProcess06   49.92
## BiologicalMaterial01     48.40
## ManufacturingProcess02   46.87
## BiologicalMaterial11     43.62
## BiologicalMaterial08     42.90
## ManufacturingProcess11   41.68
## BiologicalMaterial09     35.20
plot(varImp(svmTuned_chem), top=10)

Answer: The three most important predictors are BiologicalMaterial06, ManufacturingProcess32, and ManufacturingProcess13. Six of the top ten predictors are ManufacturingProcess variables, so the process predictors still dominate the list. The top predictor did change, however: in the previous linear model it was ManufacturingProcess13, while in this model it is BiologicalMaterial06.
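
To make the comparison with the linear model concrete, the overlap of the two top-ten lists can be computed directly. Here plsTune_chem is a hypothetical name standing in for the optimal linear model object from Exercise 6.3:

top10 <- function(fit) {
  imp <- varImp(fit)$importance   # data frame with an Overall column
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
intersect(top10(svmTuned_chem), top10(plsTune_chem))   # plsTune_chem is hypothetical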

  c. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

corr_vals <- chem_data %>%
  dplyr::select(Yield, BiologicalMaterial06, ManufacturingProcess32,
                ManufacturingProcess13, ManufacturingProcess31, BiologicalMaterial02,
                ManufacturingProcess36, BiologicalMaterial03, BiologicalMaterial12,
                ManufacturingProcess17, ManufacturingProcess09)

corr_plot_vals <- cor(corr_vals)

corrplot.mixed(corr_plot_vals, tl.col = 'blue', tl.cex = 0.8, tl.pos = 'lt',  upper = "number", lower="ellipse",  number.cex = 0.6)

Answer: The correlation plot shows that the Yield response variable does not have strong correlations with many of the important predictors. The strongest positive correlation (0.95) is between BiologicalMaterial02 and BiologicalMaterial06, and the strongest negative correlation (-0.79) is between ManufacturingProcess32 and ManufacturingProcess36. Overall, the strongest correlations involve the biological materials.
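
Scatter plots of Yield against each top predictor show the shape of these relationships more directly than a correlation matrix; a quick sketch with the tidyr and ggplot2 packages loaded above:

corr_vals %>%
  pivot_longer(-Yield, names_to = "predictor", values_to = "value") %>%
  ggplot(aes(x = value, y = Yield)) +
  geom_point(alpha = 0.5, color = "darkblue") +
  geom_smooth(method = "loess", se = FALSE) +   # loess smooth to expose nonlinearity
  facet_wrap(~ predictor, scales = "free_x")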