7.2

Data prep

library(mlbench)
library(caret)

set.seed(200) # for reproducibility

# train and test set data are setup in accordance with Kuhn and Johnson provided code
trainingData <- mlbench.friedman1(200, sd=1)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x,trainingData$y)

testData <- mlbench.friedman1(5000, sd=1)
testData$x <- data.frame(testData$x)

Tune Models

Four types of non-linear regression models will be built:
* K-nearest neighbors (KNN) * Support Vector Machines (SPV) * Neural Networks * Multivariate Adaptive Regression Splines (MARS)

For each, model parameters will be tuned using cross validation and pre-processing from the caret package.

KNN

For k-nearest neighbors, the tuning parameter is k, the number of observations used in distance calculation optimization. Based on RMSE, 13 neighbors was determined to be optimal for the model.

# train model 
set.seed(2021)
knnModel <- train(x=trainingData$x,
                  y=trainingData$y,
                  method="knn",
                  preProc = c("center","scale"),
                  tuneLength = 10)

# plot neighbors
ggplot(knnModel)

# make predictions on test set data
knnPred <- predict(knnModel$finalModel , newdata = testData$x)

# get accuracy metrics of test set data 
postResample(pred=knnPred, obs=testData$y)
##      RMSE  Rsquared       MAE 
## 6.0419338 0.4716198 4.9490727

Support Vector Machine

In SVM the tuning parameter, sigma is estimated from a grid of cost values (length 10) using cross validation. A radial kernel method was used. The final values used for the model were sigma = 0.06026524 and C = 16

set.seed(2021)
svmTuned <- train(x=trainingData$x,
                  y=trainingData$y,
                  method="svmRadial",
                  preProc = c("center","scale"),
                  tuneLength = 10,
                  trControl=trainControl(method="cv"))
svmTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.489005  0.8102424  1.992541
##     0.50  2.276983  0.8219804  1.818488
##     1.00  2.093621  0.8455647  1.669413
##     2.00  1.995275  0.8583611  1.568835
##     4.00  1.917723  0.8662980  1.490901
##     8.00  1.842144  0.8753789  1.450055
##    16.00  1.840550  0.8764514  1.462855
##    32.00  1.840686  0.8764312  1.462988
##    64.00  1.840686  0.8764312  1.462988
##   128.00  1.840686  0.8764312  1.462988
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06026524
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06026524 and C = 16.
plot(svmTuned)

svmPred <- predict(svmTuned, newdata = testData$x)
postResample(pred=svmPred, obs=testData$y)
##      RMSE  Rsquared       MAE 
## 2.0663539 0.8268499 1.5697228

Neural Networks

The final values used for the model were size = 3, decay = 0

set.seed(2021)
# remove highly correlated parameters
tooHigh <- findCorrelation(cor(trainingData$x), cutoff=.75)
trainXnnet <- trainingData$x[,-tooHigh]
testXnnet <- testData$x[,-tooHigh]

# create grid of pairwise combinations of tuning parameters
nnetGrid <- expand.grid(.decay=c(0,0.01,.1),
                        .size=c(1:10),
                        .bag=F)

ctrl <- trainControl(method="cv", number=10)
nnetTune <- train(x=trainingData$x, trainingData$y,
                  method="avNNet",
                  tuneGrid=nnetGrid,
                  trControl=ctrl,
                  preProc = c("center", "scale"),
                  linout=T,
                  trace=F,
                  MaxNWts = 10 * (ncol(trainingData$x)+1) + 10 +1,
                  maxit=300
                  )
## Warning: executing %dopar% sequentially: no parallel backend registered
nnetPred <- predict(nnetTune, newdata = testData$x)
postResample(pred=nnetPred, obs=testData$y)
##      RMSE  Rsquared       MAE 
## 2.1967632 0.8085611 1.6295679

MARS

The final values used for the model were degree=2, nprune=17

library(earth)
## Warning: package 'earth' was built under R version 4.0.5
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.0.5
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 4.0.5
set.seed(2021)
# trainingDataX_scale <- scale(trainingData$x, center = T, scale=T)
# testDataX_scale <- scale(testData$x, center = T, scale=T)
# trainingDataY_scale <- scale(trainingData$y, center = T, scale=T)
# testDataY_scale <- scale(testData$y, center = T, scale=T)

marsGrid <- expand.grid(.degree=1:2, .nprune=2:38)

marsTuned <- train(trainingData$x, trainingData$y,
                   preProcess = c("center", "scale"),
                   method="earth",
                   tuneGrid = marsGrid,
                   trControl=trainControl(method="cv")
                   )

summary(marsTuned)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
##             nprune=17)
## 
##                                   coefficients
## (Intercept)                          22.042898
## h(0.507267-X1)                       -4.261526
## h(X1-0.507267)                        2.661009
## h(0.325504-X2)                       -5.256624
## h(-0.216741-X3)                       2.909153
## h(X3- -0.216741)                      1.657839
## h(0.953812-X4)                       -2.810030
## h(X4-0.953812)                        2.833141
## h(1.17878-X5)                        -1.503531
## h(-0.951872-X1) * h(X2-0.325504)     -3.617965
## h(X1- -0.951872) * h(X2-0.325504)    -1.505955
## h(X1-0.507267) * h(X2- -0.798188)    -2.198535
## h(0.606835-X1) * h(0.325504-X2)       1.980041
## h(0.325504-X2) * h(X3-0.795427)       2.143490
## h(X2-0.325504) * h(X3- -0.917499)     1.340520
## h(X2-0.325504) * h(-0.917499-X3)      4.483181
## 
## Selected 16 of 21 terms, and 5 of 10 predictors (nprune=17)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 7
## GCV 1.642635    RSS 214.2181    GRSq 0.9333083    RSq 0.9560751
marsPred <- predict(marsTuned, newdata = testData$x)
postResample(pred=marsPred, obs=testData$y)
##      RMSE  Rsquared       MAE 
## 1.2793868 0.9343367 1.0091132

In terms of RMSE, Rsquared and MAE, the MARS model performed the best. In of variable importance to the model, the MARS output has X1, X4, X2, X5, X3 as the most important, in that order, and were the only variables selected.

7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

Missing values were present in some of the process predictors, and were imputed with a k-nearest neighbor imputation algorithm form the DMwR package.

library(AppliedPredictiveModeling)
library(DMwR)
library(dplyr)

data("ChemicalManufacturingProcess")

# impute with KNN from DMwR package
ChemicalManufacturingProcess <-  knnImputation(ChemicalManufacturingProcess, k=3)

# convert to dataframe: target is the first variable
chem_df <- ChemicalManufacturingProcess %>% 
  as.data.frame()

# function removes sparse variables with no predictive power
chem_df <- chem_df[,-nearZeroVar(chem_df)]

# preprocess: center and scale
chem_df <- scale(chem_df,center=T,scale=T) %>% 
  as.data.frame()

# Train/Test Split
set.seed(2021)
trainIndex <- createDataPartition(chem_df$Yield , p = .8) %>% 
  unlist()
trainY <- chem_df[ trainIndex, 'Yield']
trainX  <- chem_df[trainIndex, -1]
testY <- chem_df[-trainIndex, 'Yield']
testX <- chem_df[-trainIndex, -1]

a.

Which model gives the optimal resampling and test set performance?

Four non-linear regression models are tuned, built, and evaluated based on accuracy metrics of test predictive power which are output below. The SVM model had the best performance and is considered to be the best model for this dataset. The final values used for the model were sigma = 0.01374538 and C = 16, with an R-squared value of nearly 0.60.

Neural Net

tooHigh <- findCorrelation(cor(trainX), cutoff=.75)
trainXnnet <- trainX[,-tooHigh]
testXnnet <- testX[,-tooHigh]

nnetGrid <- expand.grid(.decay=c(0,0.01,.1),
                        .size=c(1:10),
                        .bag=F)

ctrl <- trainControl(method="cv", number=10)
nnetTune <- train(x=trainX, trainY,
                  method="avNNet",
                  tuneGrid=nnetGrid,
                  trControl=ctrl,
                  preProc = c("center", "scale"),
                  linout=T,
                  trace=F,
                  MaxNWts = 10 * (ncol(trainX)+1) + 10 +1,
                  maxit=100
                  )

nnetPred <- predict(nnetTune, newdata = testX)
postResample(pred=nnetPred, obs=testY)
##      RMSE  Rsquared       MAE 
## 0.6169057 0.5714694 0.5224510

KNN

knnModel <- train(x=trainX ,
                  y=trainY,
                  method="knn",
                  preProc = c("center","scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8030031  0.4049326  0.6334621
##    7  0.8050819  0.4016342  0.6363268
##    9  0.8027220  0.4053764  0.6358452
##   11  0.7975178  0.4208843  0.6328776
##   13  0.7971246  0.4246893  0.6329816
##   15  0.8006581  0.4218811  0.6355447
##   17  0.8029276  0.4230091  0.6353964
##   19  0.7999436  0.4328688  0.6320980
##   21  0.8033300  0.4330551  0.6343342
##   23  0.8034847  0.4387375  0.6344384
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnPred <- predict(knnModel, newdata = testX)
postResample(pred=knnPred, obs=testY)
##      RMSE  Rsquared       MAE 
## 0.6290612 0.5208582 0.5662823

SVM

svmTuned <- train(x=trainX,
                  y=trainY,
                  method="svmRadial",
                  preProc = c("center","scale"),
                  tuneLength = 10,
                  trControl=trainControl(method="cv"))
svmTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 130, 129, 131, 129, 128, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE       Rsquared   MAE      
##     0.25  0.7435830  0.5480574  0.5972377
##     0.50  0.6757240  0.5962838  0.5451832
##     1.00  0.6137731  0.6525283  0.4919169
##     2.00  0.5796716  0.6909718  0.4589531
##     4.00  0.5613477  0.7076823  0.4417404
##     8.00  0.5541046  0.7133746  0.4444288
##    16.00  0.5514067  0.7184713  0.4422590
##    32.00  0.5514067  0.7184713  0.4422590
##    64.00  0.5514067  0.7184713  0.4422590
##   128.00  0.5514067  0.7184713  0.4422590
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01374538
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01374538 and C = 16.
svmPred <- predict(svmTuned, newdata = testX)
postResample(pred=svmPred, obs=testY)
##      RMSE  Rsquared       MAE 
## 0.5849225 0.5971161 0.4962676

MARS

marsGrid <- expand.grid(.degree=1:3, .nprune=2:20)

marsTuned <- train(trainX, trainY,
                   preProcess = c("center", "scale"),
                   method="earth",
                   tuneGrid = marsGrid,
                   trControl=trainControl(method="cv")
                   )


summary(marsTuned)
## Call: earth(x=data.frame[144,56], y=c(1.226,1.004,0...), keepxy=TRUE, degree=1,
##             nprune=16)
## 
##                                      coefficients
## (Intercept)                            -0.7227732
## h(1.17267-BiologicalMaterial03)        -0.1639689
## h(1.16079-ManufacturingProcess04)      -0.2169926
## h(-1.0195-ManufacturingProcess09)      -0.4525759
## h(ManufacturingProcess09- -1.0195)      0.3376934
## h(-0.908266-ManufacturingProcess13)     1.2073150
## h(ManufacturingProcess15- -0.710248)    0.2175139
## h(ManufacturingProcess22-0.793942)     -0.2926231
## h(-1.19593-ManufacturingProcess32)     -1.4136512
## h(ManufacturingProcess32- -1.19593)     0.8378956
## h(-1.00096-ManufacturingProcess33)      1.1294725
## h(ManufacturingProcess33- -1.00096)    -0.3400981
## h(0.154327-ManufacturingProcess35)      0.1671941
## h(0.0619141-ManufacturingProcess39)    -0.1220007
## h(ManufacturingProcess39-0.0619141)    -1.1541907
## 
## Selected 15 of 21 terms, and 10 of 56 predictors (nprune=16)
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 14 (additive model)
## GCV 0.3142457    RSS 28.86041    GRSq 0.7003567    RSq 0.8062114
marsPred <- predict(marsTuned, newdata = testX)
postResample(pred=marsPred, obs=testY)
##      RMSE  Rsquared       MAE 
## 0.7508385 0.5030175 0.5880714
#summary(mars)
#plotmo(mars)

b.

Which predictors are most important in the optimal model? Do either biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

For the SVM model manufacturing variables are the most important but there are some biological as well. This is different from the previous assignment where a linear model was built, and manufacturing variables dominated.

varImp(svmTuned)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   90.69
## BiologicalMaterial06     84.99
## ManufacturingProcess17   84.04
## ManufacturingProcess09   79.56
## ManufacturingProcess36   76.55
## BiologicalMaterial03     74.39
## BiologicalMaterial12     72.50
## ManufacturingProcess06   65.46
## BiologicalMaterial02     63.85
## BiologicalMaterial04     58.90
## BiologicalMaterial11     57.76
## BiologicalMaterial08     56.94
## ManufacturingProcess29   55.10
## ManufacturingProcess11   53.39
## ManufacturingProcess31   53.02
## BiologicalMaterial09     43.60
## BiologicalMaterial01     43.37
## ManufacturingProcess30   43.15
## ManufacturingProcess33   42.34

c. 

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

Process 32 has a positive relationship with Yield, but the yield improvements may have limiting returns after a threshold. Process 13 has a negative impact on yield. This information could be valuable in guiding future process runs so that yield can be improved.

library(corrplot)

top2 <- ChemicalManufacturingProcess %>% 
  dplyr::select(Yield, ManufacturingProcess32, ManufacturingProcess13)

top2 %>% ggplot(aes(x=ManufacturingProcess32,y=Yield)) + geom_point()

top2 %>% ggplot(aes(x=ManufacturingProcess13,y=Yield)) + geom_point()

corrplot(cor(top2),order="hclust")