Load libraries

library(tidyverse)
library(fpp3)
library(caret)
library(RANN)
library(mlbench)
library(nnet)
library(earth)

Exercises 7.2

Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data: where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
#create training data
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

# plot the data
featurePlot(trainingData$x, trainingData$y)

#create test data
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

#knn model with preprocessing
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
#predict the model using knn model on test dataset
knnPred <- predict(knnModel, newdata = testData$x)
# evaluate performance model using postResample
knn_performance <- postResample(pred = knnPred, obs = testData$y)
print(knn_performance)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

MARS model provides the best performance since it has the highest rsquared. MARS did select the informative predictors X1-X5 first with X1 and X4 with teh highest importance numbers.

# Multivariate Adaptive Regression Splines (MARS)
set.seed(200)
# Train a MARS model
mars_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",           
  tuneLength = 10,              
  trControl = trainControl(method = "cv", number = 10))

# print MARS model 
print(mars_model)
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.188280  0.3042527  3.460689
##    3      3.551182  0.4999832  2.837116
##    4      2.653143  0.7167280  2.128222
##    6      2.295006  0.7754603  1.853199
##    7      1.771950  0.8611767  1.391357
##    9      1.609816  0.8837307  1.299705
##   10      1.635035  0.8798236  1.309436
##   12      1.571561  0.8898750  1.253077
##   13      1.567577  0.8906927  1.250795
##   15      1.571673  0.8909652  1.245508
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 1.
#summary MARS model
summary(mars_model)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
##             nprune=13)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors (nprune=13)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
#prediction using mars model
mars_prediction <- predict(mars_model, newdata = testData$x)
# performance using postResample
mars_performance <- postResample(pred = mars_prediction, obs = testData$y)
print(mars_performance)
##      RMSE  Rsquared       MAE 
## 1.8136467 0.8677298 1.3911836
summary(mars_model$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
##             nprune=13)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors (nprune=13)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
mars_final <- mars_model$finalModel
varImp_mars <- evimp(mars_final)
print(varImp_mars)
##    nsubsets   gcv    rss
## X1       11 100.0  100.0
## X4       10  84.2   83.7
## X2        9  67.2   66.5
## X5        8  45.4   45.4
## X3        7  34.6   35.0
## X6        4  11.9   14.2
#Neural Network model 
 set.seed(200)
#nnet model 
nnet_model <- nnet( 
  x = trainingData$x, 
  y = trainingData$y,
  size = 5,
  decay = 0.01,
  lineout = TRUE,
  trace = FALSE,
  maxit = 500, 
  MaxNWts = 5 * (ncol(trainingData$x) + 1 + 5 +1))

# predictions using the neural network model
nnet_prediction <- predict(nnet_model, newdata = testData$x)

# performance using postResample
nn_performance <-postResample(pred = nnet_prediction, obs = testData$y)
print(nn_performance)
##      RMSE  Rsquared       MAE 
## 14.276935  0.288859 13.386918
#Support Vector Machines
 set.seed(200)
#nnet model 
svm_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial", 
  preProc = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv"))

#print svm 
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.525164  0.7810576  2.010680
##      0.50  2.270567  0.7944850  1.794902
##      1.00  2.099319  0.8155594  1.659342
##      2.00  2.005858  0.8302852  1.578799
##      4.00  1.934650  0.8435677  1.528373
##      8.00  1.915672  0.8475581  1.528584
##     16.00  1.923960  0.8463016  1.536051
##     32.00  1.923960  0.8463016  1.536051
##     64.00  1.923960  0.8463016  1.536051
##    128.00  1.923960  0.8463016  1.536051
##    256.00  1.923960  0.8463016  1.536051
##    512.00  1.923960  0.8463016  1.536051
##   1024.00  1.923960  0.8463016  1.536051
##   2048.00  1.923960  0.8463016  1.536051
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
# predictions using the neural network model
svm_prediction <- predict(svm_model, newdata = testData$x)

# performance using postResample
svm_performance <- postResample(pred = svm_prediction, obs = testData$y)
print(svm_performance)
##      RMSE  Rsquared       MAE 
## 2.0541197 0.8290353 1.5586411
svm_model$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 8 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0629932410345396 
## 
## Number of Support Vectors : 152 
## 
## Objective Function Value : -72.63 
## Training error : 0.009177
# Compare performances
performance_results <- rbind(
  KNN = knn_performance,
  MARS = mars_performance,
  SVM = svm_performance,
  NeuralNetwork = nn_performance)

print(performance_results)
##                    RMSE  Rsquared       MAE
## KNN            3.204059 0.6819919  2.568346
## MARS           1.813647 0.8677298  1.391184
## SVM            2.054120 0.8290353  1.558641
## NeuralNetwork 14.276935 0.2888590 13.386918

Exercise 7.3

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

  1. Which nonlinear regression model gives the optimal resampling and test set performance?

The MARS model has the optiomal performance since it has the lowest RMSE and highest rsquared.

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
#separate out predictor and response
predictor <- ChemicalManufacturingProcess[,-1]
response <- ChemicalManufacturingProcess$Yield

# imputation and pre-process
imputation<- preProcess(predictor, method = c("center", "scale", "knnImpute"))
pred_che <- predict(imputation, predictor)
set.seed(123)  
#split data
trainindex <- createDataPartition(response, p = 0.8, list = FALSE)
#create training and test set
train_response <- response[trainindex]  # train response
test_response <- response[-trainindex]  # test response
train_predictor <- pred_che[trainindex,] # train predictor
test_predictor <- pred_che[-trainindex,] # Test predictor
# KNN Model
knn_model <- train(
  x = train_predictor,
  y = train_response,
  method = "knn",
  tuneLength = 10)
print(knn_model)
## k-Nearest Neighbors 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.404328  0.4362944  1.120535
##    7  1.421869  0.4273089  1.140008
##    9  1.427747  0.4198231  1.152886
##   11  1.426694  0.4234220  1.153227
##   13  1.439947  0.4139886  1.164513
##   15  1.440685  0.4169647  1.167104
##   17  1.449785  0.4128142  1.170281
##   19  1.456103  0.4091016  1.174277
##   21  1.463941  0.4060540  1.179770
##   23  1.471751  0.4028304  1.185189
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knn_pred <- predict(knn_model, newdata = test_predictor)
knn_performances <- postResample(pred = knn_pred, obs = test_response)
print(knn_performances)
##      RMSE  Rsquared       MAE 
## 1.4058635 0.4203612 1.1572500
# Neural Network Model 
 set.seed(200)
#nnet model 
nnet_model2 <- nnet( 
  x = train_predictor, 
  y = train_response,
  size = 5,
  decay = 0.01,
  lineout = TRUE,
  trace = FALSE,
  maxit = 500, 
  MaxNWts = 5 * (ncol(train_predictor) + 1 + 5 +1))

# predictions using the neural network model
nnet_prediction2 <- predict(nnet_model2, newdata = testData$x)

# performance using postResample
nn_performances <-postResample(pred = nnet_prediction2, obs = testData$y)
print(nn_performances)
##         RMSE     Rsquared          MAE 
## 14.277987503  0.000167585 13.387911203
# Multivariate Adaptive Regression Splines (MARS)
set.seed(200)
# Train a MARS model
mars_model2 <- train(
  x = train_predictor,
  y = train_response,
  method = "earth",           
  tuneLength = 10,              
  trControl = trainControl(method = "cv", number = 10))

# print MARS model 
print(mars_model2)
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE      
##    2      1.422973  0.4663591  1.1152847
##    4      1.214079  0.5968870  0.9765712
##    6      1.280296  0.5458428  1.0198042
##    8      1.298122  0.5328182  1.0511910
##   10      1.337865  0.5363937  1.0626002
##   13      1.413456  0.4921121  1.1144966
##   15      1.419967  0.5082017  1.0872101
##   17      1.417457  0.5161495  1.0820099
##   19      1.418712  0.5156862  1.0845296
##   22      1.418712  0.5156862  1.0845296
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
#summary MARS model
summary(mars_model2)
## Call: earth(x=data.frame[144,57], y=c(42.44,42.03,4...), keepxy=TRUE, degree=1,
##             nprune=4)
## 
##                                      coefficients
## (Intercept)                             39.561204
## h(-1.38676-ManufacturingProcess13)       3.489821
## h(ManufacturingProcess17- -0.916315)    -0.699822
## h(ManufacturingProcess32- -0.827442)     1.330065
## 
## Selected 4 of 27 terms, and 3 of 57 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 27 terms
## Importance: ManufacturingProcess32, ManufacturingProcess17, ...
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 1.455657    RSS 189.7308    GRSq 0.5784267    RSq 0.6130613
#prediction using mars model
mars_prediction2 <- predict(mars_model2, newdata = test_predictor)
# performance using postResample
mars_performances <- postResample(pred = mars_prediction2, obs = test_response)
print(mars_performances)
##      RMSE  Rsquared       MAE 
## 1.0995748 0.6604107 0.9190909
#Support Vector Machines
 set.seed(200)
#nnet model 
svm_model2 <- train(
  x = train_predictor,
  y = train_response,
  method = "svmRadial", 
  preProc = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv"))

#print svm 
print(svm_model2)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE      
##      0.25  1.368998  0.5420547  1.1209359
##      0.50  1.267554  0.5811406  1.0415826
##      1.00  1.196243  0.6298251  0.9759420
##      2.00  1.149146  0.6632057  0.9416576
##      4.00  1.126931  0.6692433  0.9147001
##      8.00  1.102570  0.6793603  0.9035725
##     16.00  1.102223  0.6794431  0.9034194
##     32.00  1.102223  0.6794431  0.9034194
##     64.00  1.102223  0.6794431  0.9034194
##    128.00  1.102223  0.6794431  0.9034194
##    256.00  1.102223  0.6794431  0.9034194
##    512.00  1.102223  0.6794431  0.9034194
##   1024.00  1.102223  0.6794431  0.9034194
##   2048.00  1.102223  0.6794431  0.9034194
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01578546
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01578546 and C = 16.
# predictions using the neural network model
svm_prediction2 <- predict(svm_model2, newdata = test_predictor)

# performance using postResample
svm_performances <- postResample(pred = svm_prediction2, obs = test_response)
print(svm_performances)
##      RMSE  Rsquared       MAE 
## 1.2289765 0.5534509 1.0363530
svm_model2$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0157854649377838 
## 
## Number of Support Vectors : 126 
## 
## Objective Function Value : -78.2082 
## Training error : 0.009227
# Compare performances
performance_results2 <- rbind(
  KNN = knn_performances,
  MARS = mars_performances,
  SVM = svm_performances,
  NeuralNetwork = nn_performances)

print(performance_results2)
##                    RMSE    Rsquared        MAE
## KNN            1.405864 0.420361195  1.1572500
## MARS           1.099575 0.660410699  0.9190909
## SVM            1.228976 0.553450940  1.0363530
## NeuralNetwork 14.277988 0.000167585 13.3879112
  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

The most important predictors for the mars model are ManufacturingProcess32, ManufacturingProcess17, ManufacturingProcess13. The important predictor for linear model was ManufacturingProcess37, ManufacturingProcess29, ManufacturingProcess32. The process variables are dominating the list as they are listed above the biological variables. On the mars model is shows ManufacturingProcess32 as number one but on the linear one it was number 3. The mars model had less predictors than the linear model.

# Get variable importance for MARS model
mars_model3 <- mars_model2$finalModel 
mars_import <- varImp(mars_model3)
print(mars_import)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess17  55.97625
## ManufacturingProcess13  28.37452
# linear model
linear_model <- train(
  x = train_predictor,
  y = train_response,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10))

# importance for the linear model
linear_importance <- varImp(linear_model)
print(linear_importance)
## lm variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess37  100.00
## ManufacturingProcess29   88.14
## ManufacturingProcess32   72.81
## ManufacturingProcess45   71.20
## BiologicalMaterial05     62.70
## ManufacturingProcess43   61.09
## ManufacturingProcess04   60.95
## ManufacturingProcess28   60.77
## ManufacturingProcess06   51.67
## BiologicalMaterial03     48.16
## BiologicalMaterial09     45.96
## ManufacturingProcess33   45.87
## ManufacturingProcess31   45.24
## ManufacturingProcess27   44.82
## ManufacturingProcess08   41.44
## ManufacturingProcess20   39.30
## ManufacturingProcess09   39.10
## ManufacturingProcess18   37.56
## ManufacturingProcess38   35.76
## ManufacturingProcess05   35.13
  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

There is a positve relationship between ManufacturingProcess32 and Yield as one increases the other increasing by seeing the dots moving on an upward trend. The some of the predictors have a relationship with yield but not as great as Manufacturingprocess32. The relationship is not very clear on some such as ManufacturingProcess37 and ManufacturingProcess29.

# Plot relationships between MARS-unique predictors and response
#side by side chart
par(mfrow = c(1, 3))

plot(train_predictor$ManufacturingProcess32, train_response, 
     main = "ManufacturingProcess32 vs Yield", 
     xlab = "ManufacturingProcess32", 
     ylab = "Yield",
     pch = 16, col = "blue")

plot(train_predictor$ManufacturingProcess17, train_response, 
     main = "ManufacturingProcess17 vs Yield", 
     xlab = "ManufacturingProcess17", 
     ylab = "Yield",
     pch = 16, col = "red")

plot(train_predictor$ManufacturingProcess13, train_response, 
     main = "ManufacturingProcess13 vs Yield", 
     xlab = "ManufacturingProcess13", 
     ylab = "Yield",
     pch = 16, col = "green")

plot(train_predictor$ManufacturingProcess37, train_response, 
     main = "ManufacturingProcess37 vs Yield", 
     xlab = "ManufacturingProcess37", 
     ylab = "Yield",
     pch = 16, col = "purple")

plot(train_predictor$ManufacturingProcess29, train_response, 
     main = "ManufacturingProcess29 vs Yield", 
     xlab = "ManufacturingProcess29", 
     ylab = "Yield",
     pch = 16, col = "orange")