library(tidyverse)
library(mlbench)
library(earth)
library(RSNNS)
library(corrplot)
library(caret)
set.seed(200)
training_data<- mlbench.friedman1(200, sd =1)
training_data$x<- data.frame(training_data$x)
featurePlot(training_data$x, training_data$y)

#create a test dataset:
test_data<- mlbench.friedman1(5000, sd = 1)
test_data$x<- data.frame(test_data$x)

#define the 10 fold cross-validation method:
cv<-trainControl(method = 'cv', number = 10)

7.2 Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

ANS: MARS is the best fitted model with R^2 around 0.93 and the RMSE 1.28, which is the lowest. According to the importance map, MARS ranked X1, 4, 2 ,5 ,3 as the most important predictors but X3 had 0 importance. Therefore, we can conclude that X1-5 are all being utilized.

**KNN example from the textbook, which will be used as the benchmark:

knn_tune<- caret::train(x = training_data$x,
                 y = training_data$y,
                 method = 'knn',
                 preProcess = c('scale', 'center'),
                 tuneLength = 10)

To predict with KNN:

knn_pred<- predict(knn_tune, test_data$x)
postResample(knn_pred, test_data$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Neural network with RSNNS:

rsnns_grid<- expand.grid(.size = 5:20)

rsnns_tune<- caret::train(x = training_data$x,
                   y = training_data$y,
                   method = 'mlp',
                   preProcess = c('scale', 'center'),
                   trControl = cv,
                   tuneGrid = rsnns_grid,
                   linout = TRUE,
                   maxit = 500,
                   MaxNWts = 1000,
                   learnFuncParams = c(0.01, 0.9)
                   )

RSNNS prediction:

rsnns_pred<- predict(rsnns_tune, test_data$x)
postResample(rsnns_pred, test_data$y)
##      RMSE  Rsquared       MAE 
## 2.4920704 0.7482272 1.9299780

MARS:

#MARS tuning:
set.seed(123)
mars_grid<- expand.grid(.degree = 1:3,
                        .nprune = 2:20)

mars_tune<- caret::train(x = training_data$x,
                  y = training_data$y,
                  method = 'earth',
                  trControl = cv,
                  tuneGrid = mars_grid,
                  preProcess = c('scale', 'center'))
plot(varImp(mars_tune))

MARS prediction:

mars_predict<- predict(mars_tune, test_data$x)
postResample(mars_predict, test_data$y)
##      RMSE  Rsquared       MAE 
## 1.2779993 0.9338365 1.0147070

SVM:

svm_radial<- caret::train(x = training_data$x,
                          y = training_data$y,
                          method = 'svmRadial',
                          tuneLength = 10,
                          trControl = cv)

svm_poly<- caret::train(x = training_data$x,
                          y = training_data$y,
                          method = 'svmPoly',
                          tuneLength = 3,
                          trControl = cv)

SVM prediction:

svm_rad_pred<- predict(svm_radial, test_data$x)
svm_poly_pred<- predict(svm_radial, test_data$x)
postResample(svm_rad_pred, test_data$y)
##      RMSE  Rsquared       MAE 
## 2.0782039 0.8249488 1.5787446
postResample(svm_poly_pred, test_data$y)
##      RMSE  Rsquared       MAE 
## 2.0782039 0.8249488 1.5787446

To put all models side by side comparison:

list(knn = postResample(knn_pred, test_data$y),
     rsnns = postResample(rsnns_pred, test_data$y),
     MARS = postResample(mars_predict, test_data$y),
     svm_rad = postResample(svm_rad_pred, test_data$y),
     svm_poly = postResample(svm_poly_pred, test_data$y))
## $knn
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461 
## 
## $rsnns
##      RMSE  Rsquared       MAE 
## 2.4920704 0.7482272 1.9299780 
## 
## $MARS
##      RMSE  Rsquared       MAE 
## 1.2779993 0.9338365 1.0147070 
## 
## $svm_rad
##      RMSE  Rsquared       MAE 
## 2.0782039 0.8249488 1.5787446 
## 
## $svm_poly
##      RMSE  Rsquared       MAE 
## 2.0782039 0.8249488 1.5787446

7.5 Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

  1. Which nonlinear regression model gives the optimal re-sampling and test set performance?

    Ans: The SVM radial model has the best re-sampling and test validation performance. They are RMSE = 0.638, Rsquared=0.644; RMSE = 0.552;Rsquared =0.65 for re-sample and test validation, respectively.

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

Train split the data:

set.seed(123)
#checking for missing data:
is.na(ChemicalManufacturingProcess) %>% sum() #106 missing data
## [1] 106
#impute the data with knn:
imputed_data<- preProcess(ChemicalManufacturingProcess, method = 'knnImpute')

#apply to the dataset:
imputed_chem<- predict(imputed_data, ChemicalManufacturingProcess)
imputed_chem %>% is.na() %>% sum() # 0 missing data
## [1] 0
#spliting data in 70-30:
chem_train_idx<- createDataPartition(imputed_chem$Yield, p = 0.7, list=FALSE)

#training dataset
chem_train_x<- imputed_chem[chem_train_idx, ] %>% select(-Yield)
chem_train_y<- imputed_chem$Yield[chem_train_idx]

#testing dataset:
chem_test_x<- imputed_chem[-chem_train_idx, ] %>% select(-Yield)
chem_test_y<- imputed_chem$Yield[-chem_train_idx]

KNN:

set.seed(43)
knn_model<- caret::train(x = chem_train_x,
                  y = chem_train_y,
                  method = 'knn',
                  preProcess = c('scale', 'center'),
                  trControl = cv,
                  tuneGrid = data.frame(.k = 1:20))

#to check the re-sampling metrics:
knn_model
## k-Nearest Neighbors 
## 
## 124 samples
##  57 predictor
## 
## Pre-processing: scaled (57), centered (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    1  0.8205099  0.4593838  0.6286764
##    2  0.7523686  0.4983340  0.5778125
##    3  0.7287325  0.5294176  0.5935537
##    4  0.7399352  0.5123103  0.5813488
##    5  0.7701000  0.4668749  0.6194218
##    6  0.7820163  0.4564448  0.6242010
##    7  0.7919033  0.4503019  0.6329623
##    8  0.7954740  0.4371723  0.6354999
##    9  0.7896281  0.4427387  0.6295239
##   10  0.7919164  0.4359477  0.6313449
##   11  0.7851989  0.4476922  0.6221146
##   12  0.7952866  0.4359111  0.6334504
##   13  0.7943265  0.4324838  0.6325867
##   14  0.7978386  0.4270360  0.6415137
##   15  0.8022131  0.4221110  0.6450661
##   16  0.8005464  0.4289696  0.6440362
##   17  0.7996381  0.4326691  0.6388798
##   18  0.7993412  0.4342193  0.6422833
##   19  0.8051389  0.4316030  0.6475247
##   20  0.8079171  0.4268165  0.6502997
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
#to predict:
knn_model_pred<- predict(knn_model, chem_test_x)
postResample(knn_model_pred, chem_test_y)
##      RMSE  Rsquared       MAE 
## 0.5746305 0.6049632 0.4763410

SVM radial:

set.seed(43)
#tuning parameters:
svm_tune<- expand.grid(
  sigma = c(0.01, 0.1, 1),
  C = c(1:10))

svm_model<- caret::train(
  x = chem_train_x,
  y = chem_train_y,
  method = 'svmRadial',
  trControl = cv,
  tuneGrid = svm_tune,
  preProcess = c('scale', 'center')
  )

#to check the re-sampling metrics:
svm_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 124 samples
##  57 predictor
## 
## Pre-processing: scaled (57), centered (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ... 
## Resampling results across tuning parameters:
## 
##   sigma  C   RMSE       Rsquared    MAE      
##   0.01    1  0.6711945  0.61797405  0.5395421
##   0.01    2  0.6514612  0.63395417  0.5224476
##   0.01    3  0.6458360  0.63495711  0.5110088
##   0.01    4  0.6436582  0.63452534  0.5084002
##   0.01    5  0.6406828  0.63691692  0.5065740
##   0.01    6  0.6402824  0.63899121  0.5036939
##   0.01    7  0.6394369  0.64101029  0.5013607
##   0.01    8  0.6392954  0.64208360  0.5003111
##   0.01    9  0.6388491  0.64226495  0.4992261
##   0.01   10  0.6376887  0.64357761  0.4973097
##   0.10    1  0.9428680  0.33045731  0.7426450
##   0.10    2  0.9313877  0.34632928  0.7374551
##   0.10    3  0.9304864  0.34807281  0.7370064
##   0.10    4  0.9304796  0.34807332  0.7369998
##   0.10    5  0.9304796  0.34807332  0.7369998
##   0.10    6  0.9304796  0.34807332  0.7369998
##   0.10    7  0.9304796  0.34807332  0.7369998
##   0.10    8  0.9304796  0.34807332  0.7369998
##   0.10    9  0.9304796  0.34807332  0.7369998
##   0.10   10  0.9304796  0.34807332  0.7369998
##   1.00    1  1.0289303  0.10447804  0.8149096
##   1.00    2  1.0283211  0.09865463  0.8223000
##   1.00    3  1.0285000  0.09915705  0.8228371
##   1.00    4  1.0285044  0.09915761  0.8229235
##   1.00    5  1.0285044  0.09915761  0.8229235
##   1.00    6  1.0285044  0.09915761  0.8229235
##   1.00    7  1.0285044  0.09915761  0.8229235
##   1.00    8  1.0285044  0.09915761  0.8229235
##   1.00    9  1.0285044  0.09915761  0.8229235
##   1.00   10  1.0285044  0.09915761  0.8229235
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01 and C = 10.
#to predict:
svm_model_pred<- predict(svm_model, chem_test_x)
postResample(svm_model_pred, chem_test_y)
##      RMSE  Rsquared       MAE 
## 0.5518020 0.6530168 0.4514493

MARS:

set.seed(43)
mars_grid2<- expand.grid(.degree = 1:3,
                         .nprune = 2:20)

mars_model<- caret::train(
  x = chem_train_x,
  y = chem_train_y,
  method = 'earth',
  trControl = cv,
  tuneGrid = mars_grid2,
  preProcess = c('scale', 'center')
  )
#to check the re-sampling metrics:
mars_model$finalModel
## Selected 9 of 47 terms, and 8 of 57 predictors (nprune=9)
## Termination condition: GRSq -10 at 47 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 4 4
## GCV 0.3054605    RSS 26.13412    GRSq 0.716068    RSq 0.8008966
#to predict:
mars_model_pred<- predict(mars_model, chem_test_x)
postResample(mars_model_pred, chem_test_y)
##      RMSE  Rsquared       MAE 
## 0.6040282 0.6794115 0.5080937

neural network:

set.seed(43)
rsnns_grid2<- expand.grid(.size = 5:20)

rsnns_model<- caret::train(x = chem_train_x,
                   y = chem_train_y,
                   method = 'mlp',
                   preProcess = c('scale', 'center'),
                   trControl = cv,
                   tuneGrid = rsnns_grid2,
                   linout = TRUE,
                   maxit = 500,
                   MaxNWts = 1000,
                   learnFuncParams = c(0.01, 0.9)
                   )
rsnns_model
## Multi-Layer Perceptron 
## 
## 124 samples
##  57 predictor
## 
## Pre-processing: scaled (57), centered (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ... 
## Resampling results across tuning parameters:
## 
##   size  RMSE       Rsquared   MAE      
##    5    0.7386158  0.5599023  0.6121461
##    6    0.6692738  0.6229918  0.5481855
##    7    0.7385661  0.5316902  0.6159301
##    8    0.6842188  0.5903394  0.5625240
##    9    0.6894243  0.5800230  0.5666227
##   10    0.6929524  0.5898268  0.5740712
##   11    0.7400959  0.5399855  0.6191846
##   12    0.7200079  0.5438850  0.5993379
##   13    0.6813056  0.5857365  0.5574185
##   14    0.7165098  0.5491966  0.5811928
##   15    0.7057504  0.5501162  0.5940637
##   16    0.6767010  0.6063028  0.5593269
##   17    0.6454775  0.6188008  0.5306226
##   18    0.6756315  0.5856233  0.5576466
##   19    0.6670559  0.5976512  0.5546076
##   20    0.6719511  0.5883250  0.5516515
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was size = 17.
RSNNS_model_pred<- predict(rsnns_model, chem_test_x)
postResample(RSNNS_model_pred, chem_test_y)
##      RMSE  Rsquared       MAE 
## 0.6861379 0.5133632 0.5577056

All residuals combined:

list(knn = postResample(knn_model_pred, chem_test_y),
     rsnns = postResample(RSNNS_model_pred, chem_test_y),
     MARS = postResample(mars_model_pred, chem_test_y),
     svm_rad = postResample(svm_model_pred, chem_test_y))
## $knn
##      RMSE  Rsquared       MAE 
## 0.5746305 0.6049632 0.4763410 
## 
## $rsnns
##      RMSE  Rsquared       MAE 
## 0.6861379 0.5133632 0.5577056 
## 
## $MARS
##      RMSE  Rsquared       MAE 
## 0.6040282 0.6794115 0.5080937 
## 
## $svm_rad
##      RMSE  Rsquared       MAE 
## 0.5518020 0.6530168 0.4514493
  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

    Ans: Variable 13, 17, 32, 09, 06, 03, 36, 06, 12, 02 are the most important variables in the non-linear model. The ManufacturingProcess variables predominate the chart. The top ten linear regression model (PLS) and the SVM model overlaps in 13, 17, 32, 09, 06, 03, 36. The only unique predictor in the SVM model is the Biomaterial12.

plot(varImp(svm_model), 
     top = 10,
     main='SVM-radial importance')

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

    ANS: The only predictor unique to the SVM model is the BioMaterial 12. It has a positive coefficient to the response variable, meaning with increases in BioMaterial12 the response variable will also increase proportionally. However, the importance of it is ranked #9. MP 13 has the strongest correlation to the response variable according to SVM. These correlation coefficient plots help us to visualize and further narrow down what predictors can help to either boost or reduce the effect of the response variable. In the corrplot shown below, the Biomaterial predictors all have positive correlations to the Yield whereas Manufacturing has a varying degree of correlation between positive and negative correlations. This might be an indication that BioMaterial has an overall positive correlation to the Yield than the manufacturing predictors.

We will check the top predictor correlations

#Select important predictor names:
top_pre_names<- varImp(svm_model)$importance %>% 
  as.data.frame() %>% 
  rownames_to_column('predictors') %>% 
  slice_max(order_by = Overall, n = 10) %>% 
  pull(predictors)

#select the column of the predictors and the response variable column:
correlations<- imputed_chem %>% 
  select(c('Yield', top_pre_names)) %>% 
  cor()
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(top_pre_names)
## 
##   # Now:
##   data %>% select(all_of(top_pre_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#visualizing the predictor correlations:
corrplot(correlations, order = 'hclust')