Libraries

library(tidyverse)
library(caret)
library(earth)
library(knitr)
library(pls)
library(glmnet)
library(Amelia)
library(mice)
library(psych)
library(corrplot)

Exercise 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)

where the \(x\) values are random variables uniformly distributed on \([0,1]\) (the simulation also creates five other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'.  Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
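
To make the generating equation concrete, here is a quick sketch (my own check, not part of the book code) that recomputes the noiseless signal for the first training point; it should land within roughly one noise standard deviation (sd = 1) of trainingData$y[1]:

# Recompute the signal part of the Friedman equation for row 1
x1 <- trainingData$x[1, ]
10 * sin(pi * x1$X1 * x1$X2) + 20 * (x1$X3 - 0.5)^2 + 10 * x1$X4 + 5 * x1$X5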

Tune several models on these data. For example:

KNN Model

library(caret)
knnModel <- train(x = trainingData$x, 
                  y = trainingData$y,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
result_knn <- postResample(pred = knnPred, obs = testData$y)
result_knn
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

Neural Networks

nn_grid <- expand.grid(decay = c(0, 0.01, 0.1),
                       size = 1:10,
                       bag = FALSE)

nn_model <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "avNNet",
                  preProcess = c("center", "scale"),
                  tuneGrid = nn_grid,
                  trControl = trainControl(method = "cv"),
                  linout = TRUE,   # linear output units for regression
                  trace = FALSE,
                  # cap on weights: 10 hidden units * (10 inputs + 1) + 10 + 1
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nn_model
## Model Averaged Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0.00    1    2.434845  0.7683498  1.921367
##   0.00    2    2.497822  0.7558233  1.993325
##   0.00    3    2.037885  0.8419795  1.609413
##   0.00    4    1.900063  0.8584928  1.529545
##   0.00    5    2.176661  0.8092998  1.628603
##   0.00    6    2.743381  0.7255103  1.988222
##   0.00    7    3.496229  0.6401273  2.493454
##   0.00    8    4.034891  0.5941657  2.749735
##   0.00    9    4.221796  0.5137164  2.800450
##   0.00   10    4.682342  0.5848908  2.818883
##   0.01    1    2.437231  0.7689665  1.934978
##   0.01    2    2.510986  0.7596191  1.988260
##   0.01    3    1.999944  0.8419567  1.555751
##   0.01    4    2.003357  0.8445288  1.549723
##   0.01    5    2.104801  0.8296459  1.664982
##   0.01    6    2.314704  0.7997307  1.857949
##   0.01    7    2.341101  0.8072335  1.872758
##   0.01    8    2.205611  0.8163107  1.748153
##   0.01    9    2.262921  0.8146166  1.776693
##   0.01   10    2.453311  0.7709666  1.981977
##   0.10    1    2.450897  0.7652309  1.942945
##   0.10    2    2.489399  0.7606443  1.997060
##   0.10    3    2.200693  0.8155496  1.786599
##   0.10    4    2.059322  0.8432340  1.651716
##   0.10    5    2.189025  0.8133603  1.729453
##   0.10    6    2.215091  0.8128993  1.757966
##   0.10    7    2.209521  0.8196474  1.786772
##   0.10    8    2.317124  0.8010433  1.826655
##   0.10    9    2.286711  0.7928430  1.849002
##   0.10   10    2.238560  0.8113030  1.787851
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.
nn_pred <- predict(nn_model, newdata = testData$x)
result_nn <- postResample(pred = nn_pred, obs = testData$y)
result_nn
##     RMSE Rsquared      MAE 
## 2.496722 0.784618 1.685182
varImp(nn_model)
## loess r-squared variable importance
## 
##      Overall
## X4  100.0000
## X1   95.5047
## X2   89.6186
## X5   45.2170
## X3   29.9330
## X9    6.3299
## X10   5.5182
## X8    3.2527
## X6    0.8884
## X7    0.0000

MARS: Multivariate Adaptive Regression Splines

mars_grid <- expand.grid(.degree = 1:2,
                         .nprune = 2:15)

# tuneLength is ignored when a tuneGrid is supplied, so it is omitted here
mars_model <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "earth",
                    preProcess = c("center", "scale"),
                    tuneGrid = mars_grid)
mars_model
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.456198  0.2137427  3.656603
##   1        3      3.710794  0.4575656  2.964039
##   1        4      2.812657  0.6890668  2.245613
##   1        5      2.483519  0.7553108  1.986493
##   1        6      2.351915  0.7827773  1.873954
##   1        7      1.979841  0.8415897  1.568660
##   1        8      1.898749  0.8571450  1.490094
##   1        9      1.825444  0.8677663  1.422307
##   1       10      1.804445  0.8715281  1.401342
##   1       11      1.814366  0.8708521  1.403212
##   1       12      1.814148  0.8711081  1.406672
##   1       13      1.805964  0.8725645  1.401427
##   1       14      1.814806  0.8710622  1.408247
##   1       15      1.825854  0.8699861  1.415226
##   2        2      4.451233  0.2150330  3.649946
##   2        3      3.683521  0.4624456  2.949529
##   2        4      2.807360  0.6886212  2.238406
##   2        5      2.441942  0.7618319  1.952288
##   2        6      2.336803  0.7830504  1.853624
##   2        7      2.006705  0.8377829  1.584936
##   2        8      1.907203  0.8529641  1.496002
##   2        9      1.757742  0.8757059  1.404230
##   2       10      1.694435  0.8836014  1.356540
##   2       11      1.572794  0.9006788  1.259079
##   2       12      1.549429  0.9020481  1.220577
##   2       13      1.490619  0.9112893  1.184305
##   2       14      1.527004  0.9059230  1.198241
##   2       15      1.579746  0.8992902  1.223095
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
mars_pred <- predict(mars_model, newdata = testData$x)
result_mars <- postResample(pred = mars_pred, obs = testData$y)
result_mars
##      RMSE  Rsquared       MAE 
## 1.3227340 0.9291489 1.0524686
varImp(mars_model)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.40
## X2   49.00
## X5   15.72
## X3    0.00
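
The pruned model itself can be inspected directly. As a quick sketch, summary() on the underlying earth fit lists the hinge functions that survived pruning; only terms built from the informative predictors should appear:

# Show the hinge functions retained in the final pruned MARS model
summary(mars_model$finalModel)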

SVM Model

SVM_model <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))
SVM_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.525464  0.7876823  2.007994
##     0.50  2.260891  0.8039989  1.788189
##     1.00  2.064308  0.8297627  1.622225
##     2.00  1.969286  0.8418120  1.534184
##     4.00  1.892373  0.8543854  1.490401
##     8.00  1.879069  0.8559798  1.494280
##    16.00  1.893412  0.8534906  1.508484
##    32.00  1.893412  0.8534906  1.508484
##    64.00  1.893412  0.8534906  1.508484
##   128.00  1.893412  0.8534906  1.508484
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06401541
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06401541 and C = 8.
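
Note that caret did not tune sigma; it held it at an analytic estimate. As a sketch of where that value comes from, kernlab's sigest() returns quantile-based estimates of the inverse kernel width (the exact number will differ, since caret averages quantiles of this output computed on the pre-processed predictors):

library(kernlab)
# Quantile (10th, 50th, 90th percentile) estimates of the RBF inverse width
sigest(as.matrix(trainingData$x), scaled = TRUE)
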
SVM_pred <- predict(SVM_model, newdata = testData$x)
result_SVM <- postResample(pred = SVM_pred, obs = testData$y)
result_SVM
##      RMSE  Rsquared       MAE 
## 2.0583739 0.8283495 1.5622198
varImp(SVM_model)
## loess r-squared variable importance
## 
##      Overall
## X4  100.0000
## X1   95.5047
## X2   89.6186
## X5   45.2170
## X3   29.9330
## X9    6.3299
## X10   5.5182
## X8    3.2527
## X6    0.8884
## X7    0.0000

Summary

knn  <- data.frame(t(result_knn))  %>% mutate(Model = "KNN")
nn   <- data.frame(t(result_nn))   %>% mutate(Model = "Neural Network")
mars <- data.frame(t(result_mars)) %>% mutate(Model = "MARS")
svm  <- data.frame(t(result_SVM))  %>% mutate(Model = "SVM")

all_result <- rbind(knn, nn, mars, svm) %>%
  select(Model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE) %>%
  kable()

all_result
Model            RMSE      Rsquared   MAE
MARS             1.322734  0.9291489  1.052469
SVM              2.058374  0.8283495  1.562220
Neural Network   2.496722  0.7846180  1.685182
KNN              3.204060  0.6819919  2.568346

The MARS model appears to give the best performance, with the lowest test-set RMSE and highest R-squared, and its variable importance output shows that it selected the informative predictors (those named X1–X5) while ignoring the noise variables.
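
As a double-check, caret's predictors() helper lists the predictors that appear in at least one term of the final model; for this fit it should return only X1 through X5 (a minimal sketch):

# Predictors actually used by the final MARS model
predictors(mars_model)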

Exercise 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

I will reuse the data, imputation, data-splitting, and pre-processing steps from homework 7 (Exercise 6.3), as instructed above.

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176  58

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

Imputation

table(is.na(ChemicalManufacturingProcess))
## 
## FALSE  TRUE 
## 10102   106
missmap(ChemicalManufacturingProcess)

We can see there are 106 missing values in the dataset (around 1% of all entries). I will impute them using the mice package, which tends to give better imputations than the simpler techniques mentioned in the book.

chem_imputed <- mice(ChemicalManufacturingProcess, print = F, m = 1) %>% complete()
## Warning: Number of logged events: 135
missmap(chem_imputed)

There are no more NAs in the dataset.
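
A quick numeric confirmation of the same thing:

# Should return 0 after imputation
sum(is.na(chem_imputed))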

Pre-processing and Data Splitting

tooHighCor <- findCorrelation(cor(chem_imputed), 0.90)
chem_filtered <- chem_imputed[, -tooHighCor]
dim(chem_filtered)
## [1] 176  48

In these data, several predictors had pairwise absolute correlations larger than 0.90, so a high-correlation filter was used to remove the most redundant predictors from the data. In the end, 10 predictors were eliminated for this reason.
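
To see exactly which columns were removed, we can index into the original data with the tooHighCor vector computed above:

# Names of the 10 columns dropped by the 0.90 correlation filter
colnames(chem_imputed)[tooHighCor]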

Let’s check histograms of all the variables for skewness.

multi.hist(chem_filtered[1:9])

multi.hist(chem_filtered[10:18])

multi.hist(chem_filtered[19:27])

multi.hist(chem_filtered[28:36])

multi.hist(chem_filtered[37:48])

I will pre-process within train() using preProc = c("center", "scale") to put all predictors on a common scale. Note that centering and scaling do not correct the skewness seen above; a Box-Cox or Yeo-Johnson transformation would be needed for that.
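
If we wanted to address the skewness directly, a hedged sketch (not used for the results reported below) would add a Yeo-Johnson step ahead of centering and scaling:

# Sketch only: Yeo-Johnson reduces skewness before centering/scaling
pp <- preProcess(select(chem_filtered, -Yield),
                 method = c("YeoJohnson", "center", "scale"))
chem_transformed <- predict(pp, select(chem_filtered, -Yield))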

# Set seed for reproducibility; avoid reusing the name 'train',
# which is also caret's main fitting function
set.seed(43)
train_idx <- createDataPartition(chem_filtered$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- chem_filtered[train_idx, ]
test_df <- chem_filtered[-train_idx, ]

Part (a)

Which nonlinear regression model gives the optimal resampling and test set performance?

# KNN Model
knn_model <- train(
  Yield ~ ., data = train_df,
  method = "knn",
  # center/scale must go through preProcess; they are not train() arguments
  preProcess = c("center", "scale"),
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
knn_pred <- predict(knn_model, newdata = test_df)
result_knn <- postResample(pred = knn_pred, obs = test_df$Yield)

# MARS Model

mars_grid <- expand.grid(.degree = 1:2, .nprune = 2:15)
# tuneLength is ignored when a tuneGrid is supplied, so it is omitted
mars_model <- train(
  Yield ~ ., data = train_df, method = "earth",
  preProcess = c("center", "scale"),
  tuneGrid = mars_grid,
  trControl = trainControl("cv", number = 10)
)

mars_pred <- predict(mars_model, newdata = test_df)
result_mars <- postResample(pred = mars_pred, obs = test_df$Yield)

# SVM Model

SVM_model <- train(
  Yield ~ ., data = train_df, method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv"),
  tuneLength = 25
)

SVM_pred <- predict(SVM_model, newdata = test_df)
result_SVM <- postResample(pred = SVM_pred, obs = test_df$Yield)

# Neural Network Model

nn_grid <- expand.grid(decay = c(0, 0.01, 0.1),
                       size = 1:10,
                       bag = FALSE)

# ncol(train_df) counts Yield as well, which here plays the role of the
# bias term: 10 * (47 predictors + 1) + 10 + 1 = 491 weights at most
nn_maxnwts <- 10 * ncol(train_df) + 10 + 1

nn_model <- train(
  Yield ~ ., data = train_df, method = "avNNet",
  preProcess = c("center", "scale"),
  tuneGrid = nn_grid,
  trControl = trainControl(method = "cv"),
  linout = TRUE,
  trace = FALSE,
  MaxNWts = nn_maxnwts,
  maxit = 500
)

nn_pred <- predict(nn_model, newdata = test_df)
result_nn <- postResample(pred = nn_pred, obs = test_df$Yield)
knn  <- data.frame(t(result_knn))  %>% mutate(Model = "KNN")
nn   <- data.frame(t(result_nn))   %>% mutate(Model = "Neural Network")
mars <- data.frame(t(result_mars)) %>% mutate(Model = "MARS")
svm  <- data.frame(t(result_SVM))  %>% mutate(Model = "SVM")

all_result <- rbind(knn, nn, mars, svm) %>%
  select(Model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE) %>%
  kable()

all_result
Model            RMSE      Rsquared   MAE
SVM              1.172245  0.5896795  0.8541953
MARS             1.243938  0.5358244  0.9606100
KNN              1.280641  0.5141833  1.0945478
Neural Network   1.489858  0.4089787  1.1576915

We can see from the table above that the SVM model performed best, with the lowest test-set RMSE.
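
Since all four models were tuned with 10-fold cross-validation, their resampling performance can be compared directly as well. A sketch using caret's resamples() helper (fold assignments differ across models because no common index was set):

# Side-by-side resampling distributions for the four models
resamp <- resamples(list(KNN = knn_model, MARS = mars_model,
                         SVM = SVM_model, NNet = nn_model))
summary(resamp)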

Part (b)

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

Let’s check the most important predictors in the optimal nonlinear regression model, which is the SVM in our case. (For kernel SVMs, caret falls back to a model-free importance measure, the loess R-squared noted in the output below.)

varImp(SVM_model)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 47)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     86.37
## ManufacturingProcess13   81.44
## BiologicalMaterial03     71.18
## ManufacturingProcess17   66.65
## ManufacturingProcess36   65.59
## ManufacturingProcess09   64.70
## ManufacturingProcess06   53.78
## ManufacturingProcess33   52.24
## BiologicalMaterial11     51.54
## ManufacturingProcess11   47.28
## BiologicalMaterial08     45.54
## ManufacturingProcess30   43.74
## BiologicalMaterial01     39.21
## BiologicalMaterial09     29.91
## ManufacturingProcess20   28.06
## ManufacturingProcess27   24.29
## ManufacturingProcess35   24.08
## ManufacturingProcess12   23.59
## ManufacturingProcess15   19.78

Process variables dominate the list: 7 of the top 10 important predictors are manufacturing process variables.

Let’s get the important predictors from the optimal linear model (PLS) from homework 7.

pls_model <- train(
  Yield ~ ., data = train_df, method = "pls",
  preProc = c("center", "scale"),
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
# predictions
pls_predictions <- predict(pls_model, test_df)
# Performance metrics
results <- data.frame(RMSE = caret::RMSE(pls_predictions, test_df$Yield),
           Rsquared = caret::R2(pls_predictions, test_df$Yield))
results 
##       RMSE Rsquared
## 1 2.186425 0.256877
varImp(pls_model)
## pls variable importance
## 
##   only 20 most important variables shown (out of 47)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   76.12
## ManufacturingProcess36   75.74
## ManufacturingProcess13   75.20
## BiologicalMaterial06     71.53
## ManufacturingProcess33   67.96
## ManufacturingProcess17   67.85
## BiologicalMaterial03     64.55
## BiologicalMaterial08     62.49
## BiologicalMaterial01     57.35
## BiologicalMaterial11     56.96
## ManufacturingProcess06   55.60
## ManufacturingProcess11   52.22
## ManufacturingProcess12   44.23
## ManufacturingProcess28   42.53
## ManufacturingProcess04   41.19
## ManufacturingProcess34   37.43
## ManufacturingProcess30   37.38
## BiologicalMaterial10     35.80
## ManufacturingProcess02   33.07

The PLS model’s most important predictors are very similar to the SVM model’s, and process variables again dominate the list.
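
The overlap can be made explicit by ranking each model's importance scores and intersecting the top-10 lists (a short sketch):

svm_imp <- varImp(SVM_model)$importance
pls_imp <- varImp(pls_model)$importance
# Predictors that appear in both models' top-10 lists
intersect(rownames(svm_imp)[order(-svm_imp$Overall)][1:10],
          rownames(pls_imp)[order(-pls_imp$Overall)][1:10])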

Part (c)

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

Comparing the two top-20 importance lists, the following predictors appear in only one of the lists; let’s explore their relationship with Yield:

  • BiologicalMaterial10
  • ManufacturingProcess28
  • ManufacturingProcess34
ggplot(train_df, aes(BiologicalMaterial10, Yield)) +
  geom_point()

ggplot(train_df, aes(ManufacturingProcess28, Yield)) +
  geom_point()

ggplot(train_df, aes(ManufacturingProcess34, Yield)) +
  geom_point()

BiologicalMaterial10 shows a mild positive relationship with the response variable. I don’t see any clear relationship for the other two predictors.
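
A rough numeric check of these visual impressions, computing the marginal correlation of each predictor with Yield in the training set (a small sketch):

# Marginal correlations with Yield
sapply(c("BiologicalMaterial10", "ManufacturingProcess28",
         "ManufacturingProcess34"),
       function(p) cor(train_df[[p]], train_df$Yield))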