Libraries Used:

library(caret)
library(nnet)
library(earth)
library(kernlab)
library(mlbench)
library(AppliedPredictiveModeling)
library(RANN)
library(dplyr)
library(ggplot2)

Question 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data: \[ y = 10\sin(\pi x_{1} x_{2}) + 20(x_{3} - 0.5)^{2} + 10x_{4} + 5x_{5} + N(0, \sigma^{2}) \]

Where the x values are random variables uniformly distributed between [0, 1] (five additional non-informative variables are also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using 
featurePlot(trainingData$x, trainingData$y)

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulates a large test set to 
## estimate the true error rate with good precision:

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

## Example of tuning a model:
knnmodel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnmodel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnpred <- predict(knnmodel, newdata = testData$x)
## the function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnpred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
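
The resampled RMSE profile across the candidate k values can also be inspected graphically with caret's plot method for train objects (plot omitted here):

# Visualize the bootstrap RMSE profile across k.
plot(knnmodel)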

Different Models

Neural Networks
# Single-hidden-layer neural network with 5 hidden units

nnetfit <- nnet(trainingData$x, trainingData$y,
                size = 5,
                decay = 0.01,
                linout = TRUE,
                trace = FALSE,
                maxit = 500,
                MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)

nnet1_pred <- predict(nnetfit, newdata = testData$x)
postResample(pred = nnet1_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.5244877 0.7621849 1.8722728
# Averaged Neural Network Model
nnetavg <- avNNet(trainingData$x, trainingData$y,
                  size = 5,
                  decay = 0.01,
                  repeats = 5,
                  linout = TRUE,
                  trace = FALSE,
                  maxit = 500,
                  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)


nnet2_pred <- predict(nnetavg, newdata = testData$x)
postResample(pred = nnet2_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0404770 0.8331947 1.5329094
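
Both networks above use fixed size and decay values. As a sketch of how these could instead be tuned through caret, the grid below uses illustrative candidate values (assumptions for demonstration, not choices taken from this analysis):

# Sketch: cross-validated tuning of the averaged network over a small
# grid of hidden-unit counts and weight-decay values (illustrative).
nnetGrid <- expand.grid(size = c(3, 5, 7),
                        decay = c(0, 0.01, 0.1),
                        bag = FALSE)
nnettuned <- train(trainingData$x, trainingData$y,
                   method = "avNNet",
                   tuneGrid = nnetGrid,
                   preProc = c("center", "scale"),
                   linout = TRUE,
                   trace = FALSE,
                   maxit = 500,
                   MaxNWts = 7 * (ncol(trainingData$x) + 1) + 7 + 1,
                   trControl = trainControl(method = "cv"))
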
Multivariate Adaptive Regression Splines (MARS)
# Regular MARS Model
marsfit <- earth(trainingData$x, trainingData$y)
marsfit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
summary(marsfit)
## Call: earth(x=trainingData$x, y=trainingData$y)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
mars_pred1 <- predict(marsfit, newdata = testData$x)
postResample(pred = mars_pred1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.8136467 0.8677298 1.3911836
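
The shape of each selected hinge function can also be inspected visually; the plotmo package is attached when earth is loaded (plots omitted here):

# Plot the contribution of each selected term to the fitted surface.
plotmo(marsfit)
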
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)
set.seed(334)

# Tuning MARS based on Cross-Validation
marstuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marstuned
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.215043  0.2958829  3.4715006
##   1        3      3.596326  0.4925289  2.9105998
##   1        4      2.685802  0.7205596  2.1332921
##   1        5      2.457939  0.7645088  1.9835490
##   1        6      2.408785  0.7936804  1.9029015
##   1        7      1.946327  0.8555392  1.5609017
##   1        8      1.780400  0.8811200  1.4047687
##   1        9      1.688583  0.8901135  1.3192783
##   1       10      1.682062  0.8909456  1.3250403
##   1       11      1.646291  0.8973754  1.2914038
##   1       12      1.656040  0.8952787  1.2772892
##   1       13      1.653213  0.8953591  1.2818334
##   1       14      1.680246  0.8916559  1.3003684
##   1       15      1.680656  0.8915064  1.2995570
##   1       16      1.680656  0.8915064  1.2995570
##   2        2      4.215043  0.2958829  3.4715006
##   2        3      3.594757  0.4933080  2.9114298
##   2        4      2.683231  0.7210424  2.1310233
##   2        5      2.449310  0.7637838  1.9578090
##   2        6      2.333471  0.7894762  1.8489458
##   2        7      1.897656  0.8619660  1.5248216
##   2        8      1.719145  0.8846421  1.3595734
##   2        9      1.473660  0.9158201  1.1974286
##   2       10      1.380194  0.9257455  1.0992083
##   2       11      1.272798  0.9368827  1.0053695
##   2       12      1.227547  0.9419180  0.9692761
##   2       13      1.230587  0.9413298  0.9725188
##   2       14      1.202656  0.9445373  0.9394496
##   2       15      1.190089  0.9458320  0.9283424
##   2       16      1.189953  0.9456648  0.9346434
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
mars_pred2 <- predict(marstuned, newdata = testData$x)
postResample(pred = mars_pred2, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.1492504 0.9471145 0.9158382
Support Vector Machines
# Training SVM using a radial basis function kernel
svmfit <- ksvm(x = as.matrix(trainingData$x), y = trainingData$y,
               kernel = "rbfdot", kpar = "automatic",
               C = 1, epsilon = 0.1)

svmr_pred1 <- predict(svmfit, newdata = testData$x)
postResample(pred = svmr_pred1, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.2566786 0.7990135 1.7223733
# Tuned SVM with Centering and Scaling
svmrtuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmrtuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.494175  0.8024728  1.993835
##      0.50  2.243663  0.8155481  1.777142
##      1.00  2.057937  0.8395877  1.624516
##      2.00  1.950295  0.8508529  1.534357
##      4.00  1.888448  0.8562437  1.482902
##      8.00  1.876818  0.8564835  1.490836
##     16.00  1.886530  0.8552723  1.499634
##     32.00  1.886530  0.8552723  1.499634
##     64.00  1.886530  0.8552723  1.499634
##    128.00  1.886530  0.8552723  1.499634
##    256.00  1.886530  0.8552723  1.499634
##    512.00  1.886530  0.8552723  1.499634
##   1024.00  1.886530  0.8552723  1.499634
##   2048.00  1.886530  0.8552723  1.499634
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0668599
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0668599 and C = 8.
svmrtuned$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 8 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.066859902035286 
## 
## Number of Support Vectors : 153 
## 
## Objective Function Value : -67.2203 
## Training error : 0.008897
svmr_pred2 <- predict(svmrtuned, newdata = testData$x)
postResample(pred = svmr_pred2, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0715241 0.8262178 1.5731231

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?

As shown above, for every type of nonlinear model except KNN we fit both a base model and a tuned model. Comparing just the base model performances across the four types, Multivariate Adaptive Regression Splines (MARS) has the strongest base performance, with an RMSE of 1.813 and an R-squared of 0.8677. Among the tuned models, MARS again performed best, at an RMSE of 1.1493 and an R-squared of 0.9471. Interestingly, among the other three, the next best performing base model is the Support Vector Machine, then the Neural Network, and finally K-Nearest Neighbors; among the tuned models, however, the Neural Network slightly edged out the Support Vector Machine.
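
These test-set numbers can be collected side by side for easier comparison; the sketch below simply reuses the prediction vectors computed earlier.

# Gather the test-set metrics for each tuned model into one table,
# ordered from lowest (best) RMSE to highest.
results <- rbind(KNN  = postResample(knnpred,    testData$y),
                 NNet = postResample(nnet2_pred, testData$y),
                 MARS = postResample(mars_pred2, testData$y),
                 SVM  = postResample(svmr_pred2, testData$y))
results[order(results[, "RMSE"]), ]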

MARS does select the informative predictors X1-X5; the unused predictors are X7-X10. The informative predictors also rank highest in importance, with all five at the top of the list and X6 (the only non-informative predictor retained) lagging behind them.
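
This can be confirmed directly from the tuned model's variable importance; the call below mirrors the importance ordering printed by earth above (output omitted):

# Variable importance for the tuned MARS model; X7-X10 should not appear.
varImp(marstuned)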


Question 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

A.

data("ChemicalManufacturingProcess")
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
imputed <- predict(impute, ChemicalManufacturingProcess)
X <- dplyr::select(imputed, -Yield)
y <- imputed$Yield

set.seed(22)
index <- createDataPartition(y, p = .8, list = FALSE)
train_X <- X[index, ] %>% as.matrix()
test_X <- X[-index, ] %>% as.matrix()
train_y <- y[index]
test_y <- y[-index]
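
As a quick sanity check (a one-line sketch), the kNN imputation should have removed all missing values:

# The imputed data set should contain no remaining NAs.
sum(is.na(imputed))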
K-Nearest Neighbor
knnmodel_2 <- train(x = train_X, y = train_y,
                    method = "knn",
                    preProc = c("center", "scale"),
                    tuneLength = 10)
knnpred2 <- predict(knnmodel_2, newdata = test_X)
postResample(pred = knnpred2, obs = test_y)
##      RMSE  Rsquared       MAE 
## 0.6514929 0.4499752 0.5493612
Multivariate Adaptive Regression Splines
marsGrid2 <- expand.grid(.degree = 1:2, .nprune = 2:58)
# Tune MARS by cross-validation over the wider grid defined above
marstuned2 <- train(x = train_X, y = train_y,
                    method = "earth",
                    tuneGrid = marsGrid2,
                    trControl = trainControl(method = "cv"))
marspred2 <- predict(marstuned2, newdata = test_X)
postResample(pred = marspred2, obs = test_y)
##      RMSE  Rsquared       MAE 
## 0.6384544 0.4975974 0.4921832
Support Vector Machines
svmrtuned2 <- train(x = train_X, y = train_y,
                    method = "svmRadial",
                    preProc = c("center", "scale"),
                    tuneLength = 14,
                    trControl = trainControl(method = "cv"))

svmrpred2 <- predict(svmrtuned2, newdata = test_X)
postResample(pred = svmrpred2, obs = test_y)
##      RMSE  Rsquared       MAE 
## 0.6449523 0.4399861 0.5114197
Neural Network
nnetavg2 <- avNNet(x = train_X, y = train_y,
                   size = 5,
                   decay = 0.01,
                   repeats = 5,
                   linout = TRUE,
                   trace = FALSE,
                   maxit = 500,
                   MaxNWts = 5 * (ncol(train_X) + 1) + 5 + 1)

nnetpred2 <- predict(nnetavg2, newdata = test_X)
postResample(pred = nnetpred2, obs = test_y)
##      RMSE  Rsquared       MAE 
## 0.8143599 0.4626198 0.6549063
Which nonlinear regression model gives the optimal resampling and test set performance?

The nonlinear regression model that gave the best resampling and test set performance was MARS, at an RMSE of 0.6385 and an R-squared of 0.4976. It only narrowly beat out the rest: KNN and the Support Vector Machine performed similarly, while the Neural Network had a noticeably higher RMSE even though its R-squared was higher than the other two.
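
For the resampling side of the comparison, caret's resamples() can summarize the cross-validated results of the train objects. A minimal sketch: marstuned2 and svmrtuned2 both used 10-fold CV and can be combined directly, while knnmodel_2 used the default bootstrap and would need trControl = trainControl(method = "cv") to be included. Note also that the folds were drawn independently here, so the comparison is approximate unless a common seed is set before each train() call.

# Summarize cross-validated RMSE, R-squared, and MAE for the CV-tuned models.
summary(resamples(list(MARS = marstuned2, SVM = svmrtuned2)))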


B.

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

varImp(marstuned2)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   47.83
## ManufacturingProcess13    0.00
# Refit MARS with earth directly to obtain a fuller importance ranking
marstuned3 <- earth(x = train_X, y = train_y)
imp1 <- varImp(marstuned3)
imp1
##                           Overall
## ManufacturingProcess32 100.000000
## ManufacturingProcess09  65.120510
## ManufacturingProcess13  35.988820
## ManufacturingProcess39  23.602022
## ManufacturingProcess22  23.211861
## ManufacturingProcess28  21.953670
## BiologicalMaterial12    21.541077
## BiologicalMaterial03    19.654206
## ManufacturingProcess01  15.680530
## ManufacturingProcess33   8.971403

The top 10 predictors are mainly manufacturing process variables, with ManufacturingProcess32, 09, and 13 at the top overall, and the only biological material predictors sitting toward the bottom of the list. Comparing this to the top 10 predictors from the linear model, MP32, MP09, MP13, and MP33 are shared between the two, but the nonlinear model retains only two biological material predictors compared to the linear model's three.
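
The overlap could be computed directly as sketched below; linear_top10 is a hypothetical placeholder for the top-ten predictor names from the Exercise 6.3 linear model, which is not refit in this document.

# Top-ten MARS predictors by importance (from the earth refit above).
mars_top10 <- rownames(imp1)[order(imp1$Overall, decreasing = TRUE)][1:10]
# linear_top10 <- ...  # hypothetical: top-ten names from the 6.3 linear model
# intersect(mars_top10, linear_top10)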


C.

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

The biological predictors appear to be positively correlated with yield, while the manufacturing process predictors show a mix of correlation directions. Notably, among the top predictors of the nonlinear model, MP32 and MP09 are positively correlated with yield while MP13 is negatively correlated.

top_pred <- c("ManufacturingProcess32", "ManufacturingProcess09", "ManufacturingProcess13",
                    "ManufacturingProcess39", "ManufacturingProcess22", "ManufacturingProcess28",
                    "BiologicalMaterial12", "BiologicalMaterial03", "ManufacturingProcess01",
                    "ManufacturingProcess33")


for (i in top_pred) {
  plot_data <- data.frame(X = X[[i]], Y = y)

  # aes_string() is deprecated in recent ggplot2; the plotting columns
  # have fixed names here, so plain aes() works directly.
  p <- ggplot(plot_data, aes(x = X, y = Y)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
    labs(title = paste("Relationship between", i, "and Yield"),
         x = i, y = "Yield") +
    theme_minimal()

  print(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## (message repeated for each of the ten plots; the per-predictor
## scatterplots against Yield are omitted here)
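
As a numeric check on the plotted trends, the sign of each top predictor's correlation with yield can be computed directly (a minimal sketch reusing the imputed data from part A):

# Correlation of each top predictor with yield; the signs should match
# the direction of the smoothed trends in the plots above.
sapply(top_pred, function(i) cor(X[[i]], y))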