# Load libraries 

library(caret)
library(tidyverse)
library(earth)
library(kernlab)
library(nnet)
library(mlbench)
library(AppliedPredictiveModeling)

EXERCISE 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\]

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200,sd = 1)

## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)

## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
# Correlations among the predictors and the response
corr <- cor(cbind(trainingData$x, tibble(y = trainingData$y)))
corrplot::corrplot(corr)
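
As a sanity check, the deterministic part of the equation above can be recomputed by hand from the simulated predictors (a sketch assuming the default X1-X10 column names produced by data.frame()):

# Recompute the noise-free part of the Friedman (1991) equation
X <- trainingData$x
y_det <- 10 * sin(pi * X$X1 * X$X2) + 20 * (X$X3 - 0.5)^2 + 10 * X$X4 + 5 * X$X5

# The simulated y should differ from y_det only by the N(0, 1) noise term,
# so this correlation should be close to 1
cor(y_det, trainingData$y)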

Tune several models on these data.

KNN Example from the book:

set.seed(201)

knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.663055  0.4845252  2.973509
##    7  3.544987  0.5222393  2.888308
##    9  3.444614  0.5597118  2.813217
##   11  3.447524  0.5703913  2.808627
##   13  3.408807  0.5942498  2.780438
##   15  3.395975  0.6124595  2.773349
##   17  3.394028  0.6239658  2.763966
##   19  3.389454  0.6354978  2.759273
##   21  3.402952  0.6423762  2.778907
##   23  3.412867  0.6483598  2.792254
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
knnPred <- predict(knnModel, newdata = testData$x)

## The function 'postResample' can be used to get the test set

## performance values
knn_metrics <- postResample(pred = knnPred, obs = testData$y)

knn_metrics
##      RMSE  Rsquared       MAE 
## 3.2286834 0.6871735 2.5939727

The best model chosen from the training data used 19 nearest neighbors. This model had a resampled RMSE of 3.39 and an R-squared of 0.6355. Predicting on the test data produced an R-squared of 0.6871, slightly better than what was achieved during resampling.
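
A quick way to place the two estimates side by side, using the objects created above:

# Resampled performance for the chosen k
knnModel$bestTune
subset(knnModel$results, k == knnModel$bestTune$k)[, c("k", "RMSE", "Rsquared")]

# Test-set performance
knn_metrics["Rsquared"]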

NEURAL NETWORK

# Fit a basic neural network with parameters for size and decay set
set.seed(202) 
nnetFit <- nnet(trainingData$x, trainingData$y,
                # Number of hidden units
                size = 5,
                # Weight decay value
                decay = 0.01,
                linout = TRUE,
                ## Reduce the amount of printed output
                trace = FALSE,
                ## Expand the number of iterations to find
                ## parameter estimates..
                maxit = 500,
                ## and the number of parameters used by the model
                MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)

The 10-5-1 network refers to a neural network with 10 inputs, 5 hidden units, and a single output, for which 61 weights were estimated.
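
The weight count follows from the network dimensions; a quick check of the arithmetic:

# A single-hidden-layer network has size * (p + 1) hidden-layer weights
# (one bias per hidden unit) plus size + 1 output-layer weights
p <- ncol(trainingData$x)     # 10 predictors
size <- 5                     # hidden units
size * (p + 1) + size + 1     # 5 * 11 + 5 + 1 = 61 weights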

summary(nnetFit)
## a 10-5-1 network with 61 weights
## options were - linear output units  decay=0.01
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##   30.73  -14.22  -12.29    2.79  -10.21  -12.97    4.77   10.51   -3.66   -8.23 
## i10->h1 
##   -0.58 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##  -15.90   16.38    4.89    0.47    9.05   -3.57    3.89   -4.06   18.41   -3.45 
## i10->h2 
##    2.80 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##   -3.87    1.58    0.57   -0.04    3.34    3.45   -2.25    1.75   -0.41    1.78 
## i10->h3 
##    1.25 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4  i9->h4 
##   -2.16    2.88   22.46    0.88    3.73   -1.16    0.10   -2.25   -8.51   -4.94 
## i10->h4 
##   -2.35 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5  i9->h5 
##   -5.05   21.53  -17.50   -1.32   16.22    2.30   17.74   10.03    1.12  -12.62 
## i10->h5 
##   -6.55 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##   1.27  -3.57   3.74   9.86   4.51   3.73
varImp(nnetFit)
##       Overall
## X1  15.558532
## X2  17.611755
## X3   1.485776
## X4  13.867493
## X5   9.410363
## X6   8.442635
## X7   8.754029
## X8  10.570163
## X9   9.615566
## X10  4.683688
nnetPred <- predict(nnetFit, testData$x)

## performance values
nn_metrics <- postResample(pred = nnetPred, obs = testData$y)

nn_metrics
##      RMSE  Rsquared       MAE 
## 3.0020014 0.6499896 2.3696503

NEURAL NETWORK 2 USING train()

Another neural network was fitted using train(), with 10-fold cross-validation and a grid search to identify the best values for decay and size.

set.seed(100)

nnetGrid <- expand.grid(decay = c(0, 0.0001, 0.001, 0.01, 0.1),
                        size = 1:5)

ctrl <- trainControl(method = "cv", number = 10)

nnetFit2 <- train(trainingData$x, trainingData$y,
                  method = "nnet",
                  ## Automatically standardize data prior to modeling
                  ## and prediction
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  metric = "Rsquared",
                  ## Use cross-validation with parameters set in ctrl
                  trControl = ctrl,
                  tuneGrid = nnetGrid,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)

nnetFit2
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0e+00  1     2.607237  0.7127469  2.096428
##   0e+00  2     2.548956  0.7335340  2.035750
##   0e+00  3     2.503043  0.7579204  2.029461
##   0e+00  4     2.399673  0.7600243  1.924916
##   0e+00  5     2.684662  0.7126869  2.099095
##   1e-04  1     2.597490  0.7144538  2.107682
##   1e-04  2     2.603926  0.7200340  2.069772
##   1e-04  3     2.308446  0.7814813  1.830308
##   1e-04  4     2.259230  0.7854771  1.789155
##   1e-04  5     3.609989  0.6738037  2.588714
##   1e-03  1     2.388277  0.7597428  1.889526
##   1e-03  2     2.549721  0.7331913  2.048162
##   1e-03  3     2.119573  0.8095134  1.709940
##   1e-03  4     2.311796  0.7975198  1.857224
##   1e-03  5     2.528018  0.7591689  2.024323
##   1e-02  1     2.385599  0.7602740  1.887921
##   1e-02  2     2.616377  0.7253723  2.094091
##   1e-02  3     2.505821  0.7506812  1.998530
##   1e-02  4     2.628755  0.7287019  2.110121
##   1e-02  5     2.496664  0.7615431  1.981277
##   1e-01  1     2.393940  0.7596468  1.894140
##   1e-01  2     2.644669  0.7149218  2.110677
##   1e-01  3     2.434552  0.7643552  1.930961
##   1e-01  4     2.477422  0.7523673  1.950687
##   1e-01  5     2.504647  0.7535966  1.981100
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.001.
summary(nnetFit2)
## a 10-3-1 network with 37 weights
## options were - linear output units  decay=0.001
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##    0.88    0.61    0.60   -0.56    0.59    0.21   -0.15   -0.01    0.03   -0.07 
## i10->h1 
##   -0.08 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##  -35.34   13.15   13.36  -13.87   21.26   11.99    3.71   -0.78   -0.80    6.63 
## i10->h2 
##   -2.31 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##   -2.62   -0.16   -0.31    2.02    0.06    0.26    0.04    0.06   -0.04    0.08 
## i10->h3 
##    0.17 
##   b->o  h1->o  h2->o  h3->o 
##  -1.71  19.76   3.60  13.81
# Use the final best model 
nnetPred2 <- predict(nnetFit2, testData$x)

## performance values
nn2_metrics <- postResample(pred = nnetPred2, obs = testData$y)

nn2_metrics
##      RMSE  Rsquared       MAE 
## 2.3768141 0.7784505 1.7883484

The neural net fitted via grid search and cross-validation identified a model with 10 predictors, 3 hidden units, and 37 weights. The model did not perform quite as well on the test data: the resampled R-squared was 0.8095 versus 0.7784 on the test set, which may reflect mild overfitting to the training data. With cross-validation, this second neural network performed better than the first.
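
To make the comparison concrete, the resampled estimate for the winning parameter combination can be pulled from the train object and set against the test-set value:

# Resampled R-squared for the chosen size/decay combination
merge(nnetFit2$bestTune, nnetFit2$results)[, c("size", "decay", "Rsquared")]

# Test-set R-squared
nn2_metrics["Rsquared"]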

MARS

set.seed(203)

# Create a parameter tuning grid 
marsGrid <- expand.grid(degree = 1:4, nprune = seq(5, 50, by = 5))

marsFit <- train(trainingData$x, trainingData$y, 
                 method = "earth", 
                 metric = "Rsquared",
                 trControl = ctrl,
                 tuneGrid = marsGrid)

summary(marsFit)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=3,
##             nprune=15)
## 
##                                 coefficients
## (Intercept)                        21.218097
## h(0.621722-X1)                    -15.691434
## h(X1-0.621722)                      9.066076
## h(0.601063-X2)                    -18.220627
## h(X2-0.601063)                     10.428191
## h(X3-0.281766)                      3.419830
## h(0.447442-X3)                     12.201226
## h(X3-0.606015)                      7.875117
## h(0.734892-X4)                    -10.033343
## h(X4-0.734892)                      9.894681
## h(0.850094-X5)                     -5.385087
## h(0.218266-X1) * h(X2-0.601063)   -58.628609
## h(X1-0.218266) * h(X2-0.601063)   -29.226178
## h(X1-0.621722) * h(X2-0.295997)   -25.768101
## h(0.649253-X1) * h(0.601063-X2)    26.940533
## 
## Selected 15 of 18 terms, and 5 of 10 predictors (nprune=15)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 10 4
## GCV 1.618197    RSS 217.6151    GRSq 0.9343005    RSq 0.9553786
marsFit
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        5      2.374157  0.7744430  1.870326
##   1       10      1.655024  0.8877220  1.281697
##   1       15      1.633327  0.8904661  1.261820
##   1       20      1.633327  0.8904661  1.261820
##   1       25      1.633327  0.8904661  1.261820
##   1       30      1.633327  0.8904661  1.261820
##   1       35      1.633327  0.8904661  1.261820
##   1       40      1.633327  0.8904661  1.261820
##   1       45      1.633327  0.8904661  1.261820
##   1       50      1.633327  0.8904661  1.261820
##   2        5      2.413660  0.7682677  1.907182
##   2       10      1.395304  0.9224492  1.124020
##   2       15      1.240538  0.9400268  1.002271
##   2       20      1.255784  0.9380147  1.008117
##   2       25      1.255784  0.9380147  1.008117
##   2       30      1.255784  0.9380147  1.008117
##   2       35      1.255784  0.9380147  1.008117
##   2       40      1.255784  0.9380147  1.008117
##   2       45      1.255784  0.9380147  1.008117
##   2       50      1.255784  0.9380147  1.008117
##   3        5      2.413660  0.7682677  1.907182
##   3       10      1.394865  0.9227581  1.113340
##   3       15      1.234752  0.9411538  1.004014
##   3       20      1.248352  0.9393078  1.007118
##   3       25      1.248352  0.9393078  1.007118
##   3       30      1.248352  0.9393078  1.007118
##   3       35      1.248352  0.9393078  1.007118
##   3       40      1.248352  0.9393078  1.007118
##   3       45      1.248352  0.9393078  1.007118
##   3       50      1.248352  0.9393078  1.007118
##   4        5      2.413660  0.7682677  1.907182
##   4       10      1.394865  0.9227581  1.113340
##   4       15      1.234752  0.9411538  1.004014
##   4       20      1.248352  0.9393078  1.007118
##   4       25      1.248352  0.9393078  1.007118
##   4       30      1.248352  0.9393078  1.007118
##   4       35      1.248352  0.9393078  1.007118
##   4       40      1.248352  0.9393078  1.007118
##   4       45      1.248352  0.9393078  1.007118
##   4       50      1.248352  0.9393078  1.007118
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 15 and degree = 3.
varImp(marsFit)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.73
## X5   15.52
## X3    0.00
ggplot(marsFit)

# Use the final best model 
marsPred <- predict(marsFit, testData$x)

## performance values
mars_metrics <- postResample(pred = marsPred, obs = testData$y)

mars_metrics
##      RMSE  Rsquared       MAE 
## 1.1589948 0.9460418 0.9250230

A MARS model was fit using grid search and cross-validation, with R-squared being the metric used to determine the best model. The final model selected 15 of 18 terms and 5 of 10 predictors (nprune = 15), with X1, X4, X2, X5, and X3 selected in that order of importance. The fitted model explained 94.11% of the variability in the response during resampling and did slightly better on the test data, with an R-squared of 0.9460.
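
The h(.) terms in the model summary are MARS hinge functions, h(x) = max(0, x); a small sketch of how one of the listed basis functions behaves:

# The term h(0.621722 - X1) is active only when X1 < 0.621722
h <- function(x) pmax(0, x)
x1 <- c(0.2, 0.62, 0.9)
h(0.621722 - x1)   # 0.421722 0.001722 0.000000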

SVM

set.seed(204)

svmFit <- train(trainingData$x, trainingData$y,
                method = "svmRadial",
                preProc = c("center", "scale"),
                tuneLength = 14,
                trControl = ctrl)

# View output
svmFit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.484463  0.7995413  1.986294
##      0.50  2.212816  0.8203388  1.767826
##      1.00  2.031772  0.8405654  1.603824
##      2.00  1.921412  0.8549609  1.503827
##      4.00  1.819381  0.8688953  1.411205
##      8.00  1.782927  0.8743623  1.377246
##     16.00  1.786039  0.8744254  1.386926
##     32.00  1.786039  0.8744254  1.386926
##     64.00  1.786039  0.8744254  1.386926
##    128.00  1.786039  0.8744254  1.386926
##    256.00  1.786039  0.8744254  1.386926
##    512.00  1.786039  0.8744254  1.386926
##   1024.00  1.786039  0.8744254  1.386926
##   2048.00  1.786039  0.8744254  1.386926
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06127708
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06127708 and C = 8.
# View final model
svmFit$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 8 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0612770781663944 
## 
## Number of Support Vectors : 154 
## 
## Objective Function Value : -75.3495 
## Training error : 0.009332
# Use the final best model 
svmPred <- predict(svmFit, testData$x)

## performance values
svm_metrics <- postResample(pred = svmPred, obs = testData$y)

svm_metrics
##      RMSE  Rsquared       MAE 
## 2.0475100 0.8300953 1.5531906

Which models appear to give the best performance?

models <- c("KNN", "NEURAL_NETWORK", "MARS", "SVM")
values <- c(knn_metrics["Rsquared"], nn2_metrics["Rsquared"], mars_metrics["Rsquared"], svm_metrics["Rsquared"])

tibble(Models = models, R_squared = values) %>% arrange(desc(R_squared))
## # A tibble: 4 × 2
##   Models         R_squared
##   <chr>              <dbl>
## 1 MARS               0.946
## 2 SVM                0.830
## 3 NEURAL_NETWORK     0.778
## 4 KNN                0.687

The model that had the highest R-squared was the Multivariate Adaptive Regression Splines at 0.946. This model explained 94.6% of the variability in the response variable.
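
RMSE tells the same story; a quick check using the test-set metrics already computed:

# Rank the models by test-set RMSE (smaller is better)
tibble(Models = models,
       RMSE = c(knn_metrics["RMSE"], nn2_metrics["RMSE"],
                mars_metrics["RMSE"], svm_metrics["RMSE"])) %>%
  arrange(RMSE)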

Does MARS select the informative predictors (those named X1–X5)?

Yes, the MARS model selected all five informative predictors (X1-X5) and none of the noise variables, judging X1 to be the most important.

EXERCISE 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

# Load library
library(tidymodels)

# Load data 
data("ChemicalManufacturingProcess")

chemical_df <- ChemicalManufacturingProcess

# head(chemical_df)
# Find number of missing values in each predictor
chemical_df %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(cols = colnames(chemical_df), names_to = "Variable", values_to = "Counts") %>%
  filter(Counts > 0)
## # A tibble: 28 × 2
##    Variable               Counts
##    <chr>                   <int>
##  1 ManufacturingProcess01      1
##  2 ManufacturingProcess02      3
##  3 ManufacturingProcess03     15
##  4 ManufacturingProcess04      1
##  5 ManufacturingProcess05      1
##  6 ManufacturingProcess06      2
##  7 ManufacturingProcess07      1
##  8 ManufacturingProcess08      1
##  9 ManufacturingProcess10      9
## 10 ManufacturingProcess11     10
## # ℹ 18 more rows

EDA

# Find predictors with an absolute correlation of at least 0.5 with Yield

yield_corr <- cor(chemical_df[, 2:ncol(chemical_df)], chemical_df$Yield)

corr_tibble <- tibble(Predictor = rownames(yield_corr),
                      Corr_coef = yield_corr[, 1])

corr_tibble %>% filter(Corr_coef <= -0.5 | Corr_coef >= 0.5)
## # A tibble: 3 × 2
##   Predictor              Corr_coef
##   <chr>                      <dbl>
## 1 ManufacturingProcess09     0.503
## 2 ManufacturingProcess13    -0.504
## 3 ManufacturingProcess32     0.608
# Impute the median of each predictor for its missing values

for(i in 1:ncol(chemical_df)) {
  chemical_df[ , i][is.na(chemical_df[ , i])] <- median(chemical_df[ , i], na.rm=TRUE)
}
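
For reference, caret offers an equivalent, more idiomatic route via preProcess(); a sketch applied to the original (un-imputed) data, with imp and chem_imputed as scratch names:

# medianImpute replaces each missing value with its column median
imp <- preProcess(ChemicalManufacturingProcess, method = "medianImpute")
chem_imputed <- predict(imp, ChemicalManufacturingProcess)
sum(is.na(chem_imputed))   # 0
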
# Get zero variance predictors 
nzv_labels <- nearZeroVar(chemical_df, names = TRUE)

print(paste0("The predictor/s with near zero variance are: ", nzv_labels))
## [1] "The predictor/s with near zero variance are: BiologicalMaterial07"
# Filter out the zero variance predictors 
filtered_chemical_df <- as_tibble(chemical_df) %>% select(-all_of(nzv_labels))
set.seed(300)

# Split data into train and test sets
chemical_split <- initial_split(filtered_chemical_df, 
                            prop = 0.7)

# Create the training data
chemical_train <- chemical_split %>%
  training()

chemXtrain <- data.frame(chemical_train %>% select(-Yield))
chemYtrain <- chemical_train %>% select(Yield)

# Create the test data
chemical_test <- chemical_split %>% 
  testing()

chemXtest <- data.frame(chemical_test %>% select(-Yield))
chemYtest <- chemical_test %>% select(Yield)

Train Models

Four different nonlinear regression strategies will be used to build models. Each model will be trained on the training data, and performance will be evaluated and compared using predictions on the test set.

# Will use 10 fold cross validation as in previous exercise
ctrl <- trainControl(method = "cv", number = 10)

KNN

set.seed(301)

knnChemical <- train(x = chemXtrain,
                     y = chemYtrain$Yield,
                     method = "knn",
                     preProc = c("center", "scale"),
                     trControl = ctrl,
                     metric = "Rsquared",
                     tuneLength = 10)

knnChemical
## k-Nearest Neighbors 
## 
## 123 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 111, 111, 110, 110, 111, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.397233  0.4940931  1.131294
##    7  1.412603  0.4719315  1.123157
##    9  1.461348  0.4259835  1.188005
##   11  1.434232  0.4519084  1.170092
##   13  1.422837  0.4692733  1.168444
##   15  1.442009  0.4576442  1.172183
##   17  1.463912  0.4555665  1.185686
##   19  1.475629  0.4474194  1.199788
##   21  1.467981  0.4750504  1.195504
##   23  1.476938  0.4619071  1.204509
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knnChemical)

# Use the final best model 
knnPredChemical <- predict(knnChemical, chemXtest)

## performance values
knnChemical_metrics <- postResample(pred = knnPredChemical, obs = chemYtest$Yield)

knnChemical_metrics
##      RMSE  Rsquared       MAE 
## 1.1829298 0.5965649 0.8871321

The KNN model did a poor job of capturing the variability in the response variable, Yield.

MARS

set.seed(302)

# Create a parameter tuning grid 
marsGrid <- expand.grid(degree = 1:4, nprune = seq(5, 50, by = 5))

marsChemicalFit <- train(x = chemXtrain,
                      y = chemYtrain$Yield,
                      method = "earth",
                      # Use R-squared to find the best model
                      metric = "Rsquared",
                      # Use 10-fold cross-validation and grid search
                      trControl = ctrl,
                      tuneGrid = marsGrid)

summary(marsChemicalFit)
## Call: earth(x=data.frame[123,56], y=c(38.95,38.44,4...), keepxy=TRUE, degree=4,
##             nprune=10)
## 
##                                                                                       coefficients
## (Intercept)                                                                              37.930007
## h(44.09-ManufacturingProcess09)                                                          -0.742552
## h(ManufacturingProcess09-44.09)                                                           0.439954
## h(33.3-ManufacturingProcess13)                                                            2.213490
## h(ManufacturingProcess32-152)                                                             0.223687
## ManufacturingProcess26 * h(61-ManufacturingProcess33)                                     0.002139
## h(207.3-ManufacturingProcess06) * h(ManufacturingProcess09-44.09)                        -0.258805
## h(ManufacturingProcess13-33.3) * h(0.7-ManufacturingProcess43)                           -2.472565
## ManufacturingProcess26 * h(61-ManufacturingProcess33) * ManufacturingProcess36           -0.097280
## BiologicalMaterial04 * h(10.6-ManufacturingProcess28) * h(ManufacturingProcess32-152)     0.001364
## 
## Selected 10 of 46 terms, and 10 of 56 predictors (nprune=10)
## Termination condition: GRSq -10 at 46 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 4 3 2
## GCV 1.062752    RSS 85.54071    GRSq 0.6975312    RSq 0.7988097
marsChemicalFit
## Multivariate Adaptive Regression Spline 
## 
## 123 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 111, 110, 111, 111, 110, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        5      1.316534  0.5038886  1.095417
##   1       10      1.335827  0.5081493  1.072217
##   1       15      1.415144  0.4817017  1.147743
##   1       20      1.412757  0.4874074  1.144313
##   1       25      1.412757  0.4874074  1.144313
##   1       30      1.412757  0.4874074  1.144313
##   1       35      1.412757  0.4874074  1.144313
##   1       40      1.412757  0.4874074  1.144313
##   1       45      1.412757  0.4874074  1.144313
##   1       50      1.412757  0.4874074  1.144313
##   2        5      1.246913  0.5776938  1.047440
##   2       10      1.324101  0.5686925  1.054947
##   2       15      1.512324  0.5419897  1.122435
##   2       20      1.647405  0.4971650  1.188040
##   2       25      1.666719  0.4909862  1.215407
##   2       30      1.666719  0.4909862  1.215407
##   2       35      1.666719  0.4909862  1.215407
##   2       40      1.666719  0.4909862  1.215407
##   2       45      1.666719  0.4909862  1.215407
##   2       50      1.666719  0.4909862  1.215407
##   3        5      1.315531  0.5193487  1.047427
##   3       10      1.509136  0.5532722  1.164606
##   3       15      1.590378  0.5351941  1.189340
##   3       20      1.608669  0.5167416  1.228182
##   3       25      1.632211  0.5020129  1.213308
##   3       30      1.630696  0.4998450  1.208556
##   3       35      1.630696  0.4998450  1.208556
##   3       40      1.630696  0.4998450  1.208556
##   3       45      1.630696  0.4998450  1.208556
##   3       50      1.630696  0.4998450  1.208556
##   4        5      1.278208  0.5409620  1.056750
##   4       10      1.424388  0.5799493  1.078559
##   4       15      1.572460  0.5380332  1.178461
##   4       20      1.626354  0.4997918  1.244416
##   4       25      1.653682  0.4923258  1.219965
##   4       30      1.649475  0.4912986  1.215747
##   4       35      1.649475  0.4912986  1.215747
##   4       40      1.649475  0.4912986  1.215747
##   4       45      1.649475  0.4912986  1.215747
##   4       50      1.649475  0.4912986  1.215747
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 10 and degree = 4.
# Retrieve variable importance rankings
varImp(marsChemicalFit)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   63.93
## ManufacturingProcess13   42.72
## ManufacturingProcess28   24.73
## BiologicalMaterial04     24.73
## ManufacturingProcess43   20.11
## ManufacturingProcess33   12.23
## ManufacturingProcess26   12.23
## ManufacturingProcess06    7.53
## ManufacturingProcess36    0.00
# Plot 
ggplot(marsChemicalFit)

The ranking of features by importance makes sense, as the top three predictors had the highest correlations with the response variable.
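
This can be checked against the correlation table computed earlier:

# Correlations with Yield for the top three MARS predictors
corr_tibble %>%
  filter(Predictor %in% c("ManufacturingProcess32", "ManufacturingProcess09",
                          "ManufacturingProcess13"))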

# Use the final best model 
marsChemicalPred <- predict(marsChemicalFit, chemXtest)

## performance values
marsChemical_metrics <- postResample(pred = marsChemicalPred, obs = chemYtest$Yield)

marsChemical_metrics
##      RMSE  Rsquared       MAE 
## 1.1131686 0.6302952 0.8278500

NEURAL NET

set.seed(303)

# Set parameters
nnetGrid <- expand.grid(decay = c(0, 0.0001, 0.001, 0.01, 0.1),
                        size = 1:5)

nnetChemicalFit <- train(chemXtrain, chemYtrain$Yield,
                         method = "nnet",
                         ## Automatically standardize data prior to modeling
                         ## and prediction
                         preProc = c("center", "scale"),
                         linout = TRUE,
                         trace = FALSE,
                         metric = "Rsquared",
                         ## Use cross-validation with parameters set in ctrl
                         trControl = ctrl,
                         tuneGrid = nnetGrid,
                         MaxNWts = 10 * (ncol(chemXtrain) + 1) + 10 + 1,
                         maxit = 500)

nnetChemicalFit
## Neural Network 
## 
## 123 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 111, 110, 110, 111, 111, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0e+00  1     1.703514  0.2597174  1.380961
##   0e+00  2     1.722955  0.2193736  1.418749
##   0e+00  3     1.743161  0.2469934  1.406113
##   0e+00  4     2.886874  0.2857376  2.256411
##   0e+00  5     3.548444  0.2090359  2.813092
##   1e-04  1     1.618337  0.3864089  1.305114
##   1e-04  2     1.692357  0.3499415  1.339765
##   1e-04  3     1.613197  0.3574490  1.315076
##   1e-04  4     1.935835  0.3700008  1.519958
##   1e-04  5     3.322728  0.2785144  2.587177
##   1e-03  1     1.989049  0.2718451  1.553984
##   1e-03  2     1.750163  0.4052093  1.399875
##   1e-03  3     2.217212  0.3820242  1.678624
##   1e-03  4     2.378709  0.3192322  1.910201
##   1e-03  5     3.051805  0.3058756  2.378399
##   1e-02  1     1.679888  0.4367466  1.337996
##   1e-02  2     2.573273  0.3566771  1.980118
##   1e-02  3     2.305264  0.2926005  1.861456
##   1e-02  4     2.916294  0.2754428  2.310904
##   1e-02  5     3.229173  0.1810854  2.342558
##   1e-01  1     1.872232  0.4167089  1.424454
##   1e-01  2     2.112293  0.3176324  1.600205
##   1e-01  3     2.636771  0.2446606  2.021290
##   1e-01  4     2.561239  0.2591982  1.921910
##   1e-01  5     2.206744  0.3353673  1.730974
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.01.
ggplot(nnetChemicalFit)

# Use the final best model 
nnetChemicalPred <- predict(nnetChemicalFit, chemXtest)

## performance values
nnChemical_metrics <- postResample(pred = nnetChemicalPred, obs = chemYtest$Yield)

nnChemical_metrics
##      RMSE  Rsquared       MAE 
## 1.6329030 0.3762549 1.3240861

SVM

set.seed(304)

svmChemicalFit <- train(chemXtrain, chemYtrain$Yield,
                        method = "svmRadial",
                        preProc = c("center", "scale"),
                        tuneLength = 14,
                        trControl = ctrl,
                        metric = "Rsquared")

# View output
svmChemicalFit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 123 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 110, 111, 111, 111, 111, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE      
##      0.25  1.457806  0.5073477  1.1940027
##      0.50  1.354669  0.5443848  1.1056640
##      1.00  1.252397  0.5975845  1.0405274
##      2.00  1.179359  0.6393431  0.9792180
##      4.00  1.156255  0.6522999  0.9500740
##      8.00  1.146292  0.6636211  0.9264818
##     16.00  1.143445  0.6658972  0.9237502
##     32.00  1.143445  0.6658972  0.9237502
##     64.00  1.143445  0.6658972  0.9237502
##    128.00  1.143445  0.6658972  0.9237502
##    256.00  1.143445  0.6658972  0.9237502
##    512.00  1.143445  0.6658972  0.9237502
##   1024.00  1.143445  0.6658972  0.9237502
##   2048.00  1.143445  0.6658972  0.9237502
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01419071
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01419071 and C = 16.
# Use the final best model 
svmChemicalPred <- predict(svmChemicalFit, chemXtest)

## performance values
svmChemical_metrics <- postResample(pred = svmChemicalPred, obs = chemYtest$Yield)

svmChemical_metrics
##      RMSE  Rsquared       MAE 
## 1.0764559 0.6378348 0.8492403

Model Performance

models <- c("KNN", "NEURAL_NETWORK", "MARS", "SVM")
values <- c(knnChemical_metrics["Rsquared"], nnChemical_metrics["Rsquared"], marsChemical_metrics["Rsquared"], svmChemical_metrics["Rsquared"])

tibble(Models = models, R_squared = values) %>% arrange(desc(R_squared))
## # A tibble: 4 × 2
##   Models         R_squared
##   <chr>              <dbl>
## 1 SVM                0.638
## 2 MARS               0.630
## 3 KNN                0.597
## 4 NEURAL_NETWORK     0.376

Questions

a. Which nonlinear regression model gives the optimal re-sampling and test set performance?

The model that gives the optimal resampling and test set performance is the SVM model. The fitted SVM model explained 66.59% of the variability in Yield during resampling and performed similarly on the test data, with an R-squared of 0.6378 (63.78% of the variability explained). The final SVM model with a radial basis function kernel used sigma = 0.01419071 and cost C = 16. The cost parameter controls the regularization strength: it determines how heavily the model is penalized for training points whose residuals fall outside the epsilon-insensitive band. Sigma controls how far the influence of a single training example reaches; smaller values let each example influence predictions over a wider range of distances.
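
kernlab parameterizes the radial basis kernel as k(x, x') = exp(-sigma * ||x - x'||^2), so a smaller sigma decays more slowly with distance; a minimal sketch of that effect at the tuned value:

# RBF kernel value between two points a fixed distance apart
rbf <- function(x1, x2, sigma) exp(-sigma * sum((x1 - x2)^2))

x <- rep(0, 56); z <- rep(1, 56)   # squared distance of 56
rbf(x, z, sigma = 0.01419071)      # ~0.45: influence reaches far
rbf(x, z, sigma = 1)               # ~0: influence is purely local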

b. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

# Find the importance rankings
var_imp <- varImp(svmChemicalFit)
var_imp
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   95.40
## ManufacturingProcess17   77.32
## BiologicalMaterial06     66.70
## ManufacturingProcess09   63.30
## BiologicalMaterial03     60.83
## BiologicalMaterial12     60.67
## ManufacturingProcess06   59.87
## BiologicalMaterial02     59.80
## ManufacturingProcess36   59.59
## ManufacturingProcess31   54.33
## BiologicalMaterial04     46.49
## BiologicalMaterial09     45.78
## ManufacturingProcess30   39.93
## ManufacturingProcess11   39.76
## BiologicalMaterial08     39.56
## ManufacturingProcess29   39.49
## BiologicalMaterial11     38.75
## BiologicalMaterial01     37.31
## ManufacturingProcess33   37.04
# Extract only the top 10
top10 <- var_imp$importance %>% arrange(desc(Overall)) %>% head(10)

Within the top 20 most important features, 11 of 20 are manufacturing process variables and the rest are biological materials. In Exercise 6.3 an elastic net model was fitted. The nonlinear SVM regression model and the elastic net model shared 8 predictors in their respective top-ten lists (a quick way to compute this overlap is sketched after the list), including:

  • ManufacturingProcess09
  • ManufacturingProcess32
  • ManufacturingProcess06
  • ManufacturingProcess17
  • ManufacturingProcess13
  • ManufacturingProcess36
  • BiologicalMaterial02
  • BiologicalMaterial03
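
A sketch of computing that overlap; since the Exercise 6.3 elastic net model is not refit here, enet_top10 is a stand-in vector holding the shared names listed above:

# Stand-in for the elastic net's top-ten predictor names from Exercise 6.3
enet_top10 <- c("ManufacturingProcess09", "ManufacturingProcess32",
                "ManufacturingProcess06", "ManufacturingProcess17",
                "ManufacturingProcess13", "ManufacturingProcess36",
                "BiologicalMaterial02", "BiologicalMaterial03")

# Predictors ranked in the top ten by both models
intersect(rownames(top10), enet_top10)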

c. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

# Scatter plots 
featurePlot(chemical_df[,rownames(top10)], chemical_df$Yield)

# Correlation plot
chemical_corr <- cor(chemical_df[, append(rownames(top10), "Yield")])
corrplot::corrplot(chemical_corr, method = "number")

Visualizing the relationships between the top 10 features and the response variable Yield, some patterns are easier to spot than others. In general, the biological materials have a positive relationship with Yield, which the correlation plot confirms. The strongest positive correlation with the response was with ManufacturingProcess32, and the strongest negative correlation was with ManufacturingProcess36.
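
Those two extremes can be read directly off the correlation matrix computed above:

# Correlations of the top-ten predictors with Yield, strongest positive first
sort(chemical_corr["Yield", rownames(top10)], decreasing = TRUE)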