suppressPackageStartupMessages({
  library(caret)
  library(pls)
  library(VIM)
  library(AppliedPredictiveModeling)
  library(dplyr)
  library(tidyr)
})

7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]

where the \(x\) values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The mlbench package contains a function called mlbench.friedman1 that simulates these data.
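
For intuition, the generating equation can also be written directly in R. The sketch below only mirrors the formula above; mlbench.friedman1, used next, is the canonical implementation.

# sketch of the Friedman #1 generator: X1-X5 are informative, X6-X10 pure noise
friedman1_sketch <- function(n, sd = 1) {
  x <- matrix(runif(n * 10), ncol = 10)
  y <- 10 * sin(pi * x[, 1] * x[, 2]) + 20 * (x[, 3] - 0.5)^2 +
    10 * x[, 4] + 5 * x[, 5] + rnorm(n, sd = sd)
  list(x = x, y = y)
}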

library(mlbench)
set.seed(1989)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

library(caret)
set.seed(1989)
knnModel_1 <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel_1
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.673263  0.4914916  2.950154
##    7  3.611938  0.5171046  2.910604
##    9  3.613619  0.5261556  2.948502
##   11  3.614597  0.5355296  2.945352
##   13  3.602264  0.5509632  2.939492
##   15  3.604075  0.5670811  2.955562
##   17  3.609379  0.5781473  2.959968
##   19  3.612490  0.5927001  2.973828
##   21  3.615889  0.6065100  2.976307
##   23  3.624355  0.6147256  2.989350
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnPred_1 <- predict(knnModel_1, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred_1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.1629975 0.6561682 2.5411946

Neural Network

# convert the simulated list into one data frame (columns x.X1..x.X10 plus y)
trainingData <- as.data.frame(trainingData)
# flag predictors with pairwise correlations above 0.75; integer(0) means none
tooHigh <- findCorrelation(cor(trainingData[-11]), cutoff = 0.75)
tooHigh
## integer(0)
library(nnet)
set.seed(1989)

nnetModel_1 <- nnet(trainingData[1:10], trainingData$y,
                    size = 5,
                    decay = 0.01,
                    linout = TRUE,
                    trace = FALSE,
                    maxit = 500)
summary(nnetModel_1)
## a 10-5-1 network with 61 weights
## options were - linear output units  decay=0.01
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##    1.35   -0.87   -0.81   -6.86    0.91   -1.80    0.67   -3.35   -0.08   -1.02 
## i10->h1 
##    0.22 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##   -4.10    2.26    1.74    1.19    2.56    1.13   -0.08    0.03   -0.07   -0.20 
## i10->h2 
##    0.06 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##  -14.37   -3.59   21.49    9.01   -1.57    2.93    5.13   -2.83    2.33    5.50 
## i10->h3 
##   -0.10 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4  i9->h4 
##  -18.24   11.79   10.11   -0.26   -1.43   -0.52   -1.09    0.82   -1.05    0.45 
## i10->h4 
##    0.90 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5  i9->h5 
##   -4.80    0.14   -5.55  -18.95    6.17    5.69   -2.34    2.95    2.38    4.88 
## i10->h5 
##    0.27 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##   0.26  14.31  19.35   3.71 -12.07   5.61
nnetPred_1 <- predict(nnetModel_1, newdata = testData$x)
postResample(pred = nnetPred_1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.2891126 0.7984834 1.7669998
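
The nnet fit above uses fixed size and decay values. As a minimal sketch, the same network could instead be tuned with caret, mirroring the avNNet approach used in 7.5 below (this grid and the bootstrap resampling defaults are assumptions, not what was run here):

# hypothetical caret tuning of the single-network fit over size and decay
nnetGrid_1 <- expand.grid(size = 1:5, decay = c(0, 0.01, 0.1))
set.seed(1989)
nnetTune_1 <- train(x = trainingData[1:10], y = trainingData$y,
                    method = "nnet", tuneGrid = nnetGrid_1,
                    preProc = c("center", "scale"),
                    linout = TRUE, trace = FALSE, maxit = 500)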

MARS

suppressPackageStartupMessages(library(earth))  # also attaches Formula, plotmo, and plotrix
set.seed(1989)

marsModel_1 <- earth(trainingData[1:10], trainingData$y)

summary(marsModel_1)
## Call: earth(x=trainingData[1:10], y=trainingData$y)
## 
##                   coefficients
## (Intercept)           6.497745
## h(x.X1-0.422926)     -7.772862
## h(0.779509-x.X1)    -12.426485
## h(x.X2-0.34687)      -3.028095
## h(0.659947-x.X2)    -12.830365
## h(x.X3-0.0681259)    49.362553
## h(0.510262-x.X3)     55.423704
## h(x.X3-0.510262)    -45.518326
## h(x.X3-0.773716)     13.836121
## h(0.948905-x.X4)    -11.421149
## h(0.867854-x.X5)     -5.572901
## 
## Selected 11 of 16 terms, and 5 of 10 predictors
## Termination condition: Reached nk 21
## Importance: x.X4, x.X2, x.X1, x.X5, x.X3, x.X6-unused, x.X7-unused, ...
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 3.013579    RSS 482.7905    GRSq 0.8852932    RSq 0.9071912
marsPred_1 <- predict(marsModel_1, newdata = testData$x)
postResample(pred = marsPred_1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.7805275 0.8742564 1.4034468
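
Each h(.) term in the summary above is a hinge function, \(h(u) = \max(0, u)\), so the fitted MARS surface is a sum of piecewise-linear basis functions split at the knots shown (for example, 0.422926 for X1). A one-line illustration:

# the hinge function underlying each MARS basis term, e.g. h(x.X1 - 0.422926)
h <- function(u) pmax(0, u)
h(c(-0.2, 0, 0.3))  # 0.0 0.0 0.3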

Support Vector Machines

suppressPackageStartupMessages(library(kernlab))
set.seed(1989)

svmModel_1 <- ksvm(x = as.matrix(trainingData[1:10]), y = trainingData$y,
                   kernel = "rbfdot", kpar = "automatic",
                   C = 1, epsilon = 0.1)
svmModel_1
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0636554327040298 
## 
## Number of Support Vectors : 163 
## 
## Objective Function Value : -43.9197 
## Training error : 0.086662
svmPred_1 <- predict(svmModel_1, newdata = testData$x)
postResample(pred = svmPred_1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.2753581 0.7863505 1.7573860

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?

The most accurate model is MARS, which does select the informative predictors X1 through X5 and leaves X6 through X10 unused. Ranked by test-set \(R^2\), the models are: MARS (0.87), the neural network (0.80), the support vector machine (0.79), and k-nearest neighbors (0.66).
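
One way to see this ranking at a glance is to stack the postResample() results computed above:

# test-set metrics for all four models in 7.2, sorted by R^2
results_7_2 <- rbind(
  MARS = postResample(marsPred_1, testData$y),
  nnet = postResample(nnetPred_1, testData$y),
  SVM  = postResample(svmPred_1,  testData$y),
  kNN  = postResample(knnPred_1,  testData$y)
)
results_7_2[order(results_7_2[, "Rsquared"], decreasing = TRUE), ]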

7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

data(ChemicalManufacturingProcess)
# impute missing values with k-nearest neighbors (k = 5); note that VIM::kNN()
# also appends logical *_imp indicator columns unless imp_var = FALSE, and the
# near-zero-variance filter below removes most of them
ChemicalManufacturingProcess <- kNN(ChemicalManufacturingProcess, k=5)

# remove predictors with near zero variance
cmp_near_zero_var <- nearZeroVar(ChemicalManufacturingProcess)
ChemicalManufacturingProcess <- ChemicalManufacturingProcess[,-cmp_near_zero_var]

# center and scale (applied to the full data, including Yield, before splitting,
# so all reported errors are in standardized units)
ChemicalManufacturingProcess <- as.data.frame(scale(ChemicalManufacturingProcess))

#create train/test split
set.seed(1989)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.80, list=FALSE)

cmp_train <- ChemicalManufacturingProcess[trainingRows,]
cmp_test <- ChemicalManufacturingProcess[-trainingRows,]

Neural Network

# check the chemical training predictors for high pairwise correlations
# (the result is not used below)
tooHigh <- findCorrelation(cor(cmp_train[-1]), cutoff = 0.75)
suppressPackageStartupMessages(library(doParallel))  # also attaches foreach, iterators, parallel
num_cores <- detectCores()

cl <- makeCluster(num_cores - 1)  
registerDoParallel(cl)
train_control <- trainControl(method = "cv", number = 5, allowParallel = TRUE)
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                         .size = c(1:10),
                         .bag = FALSE)
set.seed(1989)
nnetTune <- train(cmp_train[-1], cmp_train$Yield,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = train_control,
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(cmp_train[-1]) + 1) + 10 + 1,
                  maxit = 500)
nnetTune$bestTune
##    size decay   bag
## 27    7   0.1 FALSE
nnetPred_2 <- predict(nnetTune, newdata = cmp_test[-1])
postResample(pred = nnetPred_2, obs = cmp_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.7763265 0.5407323 0.6048169

MARS

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(1989)
marsTuned <- train(cmp_train[-1], cmp_train$Yield,
                    method = "earth",
                    tuneGrid = marsGrid,
                    trControl = train_control)
summary(marsTuned)
## Call: earth(x=data.frame[144,59], y=c(1.226,1.004,0...), keepxy=TRUE, degree=2,
##             nprune=5)
## 
##                                                                          coefficients
## (Intercept)                                                                -0.6151759
## h(0.72417-ManufacturingProcess09)                                          -0.4124863
## h(ManufacturingProcess30-0.244377)                                          0.6272757
## h(ManufacturingProcess32- -1.198)                                           0.6308404
## h(0.0450647-ManufacturingProcess17) * h(0.244377-ManufacturingProcess30)    0.9258183
## 
## Selected 5 of 50 terms, and 4 of 59 predictors (nprune=5)
## Termination condition: RSq changed by less than 0.001 at 50 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 3 1
## GCV 0.3531445    RSS 43.38037    GRSq 0.6239723    RSq 0.6747247
marsPred_2 <- predict(marsTuned, newdata = cmp_test[-1])
postResample(pred = marsPred_2, obs = cmp_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.8140276 0.4853746 0.6099804

Support Vector Machines

set.seed(1989)
svmRTuned <- train(cmp_train[-1], cmp_train$Yield,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = train_control)
svmPred_2 <- predict(svmRTuned, newdata = cmp_test[-1])
postResample(pred = svmPred_2, obs = cmp_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.7858505 0.5738751 0.6200308

k-Nearest Neighbors

set.seed(1989)
knnModel_2 <- train(x = cmp_train[-1],
                  y = cmp_train$Yield,
                  method = "knn",
                  tuneLength = 10)
knnModel_2
## k-Nearest Neighbors 
## 
## 144 samples
##  59 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.7776339  0.4094798  0.6191288
##    7  0.7721779  0.4124874  0.6197428
##    9  0.7741258  0.4145199  0.6216486
##   11  0.7740105  0.4208849  0.6223671
##   13  0.7788807  0.4193827  0.6278057
##   15  0.7774926  0.4319648  0.6228322
##   17  0.7843950  0.4304737  0.6283642
##   19  0.7898261  0.4309829  0.6332886
##   21  0.7951545  0.4274195  0.6363676
##   23  0.8023689  0.4216891  0.6436688
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knnPred_2 <- predict(knnModel_2, newdata = cmp_test[-1])
postResample(pred = knnPred_2, obs = cmp_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.8472461 0.5004522 0.6683126

a.)

Which nonlinear regression model gives the optimal resampling and test set performance?

On the test set, the support vector machine gives the best \(R^2\) (0.57), followed by the neural network (0.54), k-nearest neighbors (0.50), and MARS (0.49). The neural network's test RMSE (0.776) is slightly lower than the SVM's (0.786), but the SVM explains the most variance in yield.
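
As in 7.2, the test-set metrics computed above can be stacked into a single comparison table:

# test-set metrics for all four models in 7.5, sorted by R^2
results_7_5 <- rbind(
  SVM  = postResample(svmPred_2,  cmp_test$Yield),
  nnet = postResample(nnetPred_2, cmp_test$Yield),
  kNN  = postResample(knnPred_2,  cmp_test$Yield),
  MARS = postResample(marsPred_2, cmp_test$Yield)
)
results_7_5[order(results_7_5[, "Rsquared"], decreasing = TRUE), ]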

b.)

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

varImp(svmRTuned)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 59)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   91.04
## BiologicalMaterial06     83.27
## BiologicalMaterial03     78.82
## ManufacturingProcess09   77.17
## ManufacturingProcess36   72.61
## ManufacturingProcess17   70.90
## ManufacturingProcess31   65.67
## BiologicalMaterial12     63.15
## ManufacturingProcess11   58.65
## ManufacturingProcess06   56.17
## BiologicalMaterial02     50.42
## BiologicalMaterial09     47.54
## BiologicalMaterial11     47.19
## ManufacturingProcess33   45.24
## BiologicalMaterial04     41.47
## ManufacturingProcess30   37.83
## BiologicalMaterial08     36.27
## BiologicalMaterial01     34.80
## ManufacturingProcess12   33.63

The most important predictors are manufacturing processes 32 and 13 and biological materials 6 and 3. Manufacturing processes dominate the top of the list (seven of the top ten shown). Proportionally, however, biological materials are better represented: 9 of the 12 biological materials appear among the 20 predictors shown, versus only 11 of the 38 manufacturing processes.

Eight of the top ten predictors also appear in the top ten of the optimal linear (partial least squares) model from Homework 7: manufacturing processes 32, 13, 9, 36, 17, and 11, and biological materials 6 and 3. In both models, manufacturing process 32 is the most important predictor.
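
A sketch of how the overlap could be computed directly, assuming the Homework 7 PLS fit (called plsTuned here, a hypothetical name; it is not refit in this document) were still in the workspace:

# top-ten importance rankings from each model
imp_svm <- varImp(svmRTuned)$importance
top10_svm <- rownames(imp_svm)[order(imp_svm$Overall, decreasing = TRUE)][1:10]
imp_pls <- varImp(plsTuned)$importance  # plsTuned: hypothetical PLS fit from HW7
top10_pls <- rownames(imp_pls)[order(imp_pls$Overall, decreasing = TRUE)][1:10]
intersect(top10_svm, top10_pls)  # predictors common to both top-ten lists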

c.)

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

The predictors within the top ten of the optimal nonlinear regression model that are not in the optimal linear regression model are manufacturing process 31 and biological material 12.

library(ggplot2)
ggplot(cmp_train, aes(x = ManufacturingProcess31, y = Yield)) + 
  geom_point() + 
  geom_smooth(method = "loess", color = "blue") + 
  labs(title = "Relationship between Manufacturing Process 31 and Yield")
## `geom_smooth()` using formula = 'y ~ x'

Manufacturing Process 31 appears to have a negative influence on yield.

ggplot(cmp_train, aes(x = BiologicalMaterial12, y = Yield)) + 
  geom_point() + 
  geom_smooth(method = "loess", color = "blue") + 
  labs(title = "Relationship between Biological Material 12 and Yield")
## `geom_smooth()` using formula = 'y ~ x'

Biological Material 12 appears to have a positive association with yield over the standardized range from 0 to about 1.5, and a negative one beyond that.
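
As a quick numeric check of these visual impressions, simple correlations on the training split:

cor(cmp_train$ManufacturingProcess31, cmp_train$Yield)
# note: a single correlation understates BiologicalMaterial12's nonmonotonic pattern
cor(cmp_train$BiologicalMaterial12, cmp_train$Yield)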

# shut down the parallel workers registered earlier
stopCluster(cl)
registerDoSEQ()