library(tidyverse)
library(mlbench)
library(earth)
library(RSNNS)
library(corrplot)
library(caret)
set.seed(200)
training_data<- mlbench.friedman1(200, sd =1)
training_data$x<- data.frame(training_data$x)
featurePlot(training_data$x, training_data$y)
#create a test dataset:
test_data<- mlbench.friedman1(5000, sd = 1)
test_data$x<- data.frame(test_data$x)
#define the 10 fold cross-validation method:
cv<-trainControl(method = 'cv', number = 10)
7.2 Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
**KNN example from the textbook, which will be used as the benchmark:
knn_tune<- caret::train(x = training_data$x,
y = training_data$y,
method = 'knn',
preProcess = c('scale', 'center'),
tuneLength = 10)
To predict with KNN:
knn_pred<- predict(knn_tune, test_data$x)
postResample(knn_pred, test_data$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
Neural network with RSNNS:
rsnns_grid<- expand.grid(.size = 5:20)
rsnns_tune<- caret::train(x = training_data$x,
y = training_data$y,
method = 'mlp',
preProcess = c('scale', 'center'),
trControl = cv,
tuneGrid = rsnns_grid,
linout = TRUE,
maxit = 500,
MaxNWts = 1000,
learnFuncParams = c(0.01, 0.9)
)
RSNNS prediction:
rsnns_pred<- predict(rsnns_tune, test_data$x)
postResample(rsnns_pred, test_data$y)
## RMSE Rsquared MAE
## 2.4920704 0.7482272 1.9299780
MARS:
#MARS tuning:
set.seed(123)
mars_grid<- expand.grid(.degree = 1:3,
.nprune = 2:20)
mars_tune<- caret::train(x = training_data$x,
y = training_data$y,
method = 'earth',
trControl = cv,
tuneGrid = mars_grid,
preProcess = c('scale', 'center'))
plot(varImp(mars_tune))
MARS prediction:
mars_predict<- predict(mars_tune, test_data$x)
postResample(mars_predict, test_data$y)
## RMSE Rsquared MAE
## 1.2779993 0.9338365 1.0147070
SVM:
svm_radial<- caret::train(x = training_data$x,
y = training_data$y,
method = 'svmRadial',
tuneLength = 10,
trControl = cv)
svm_poly<- caret::train(x = training_data$x,
y = training_data$y,
method = 'svmPoly',
tuneLength = 3,
trControl = cv)
SVM prediction:
svm_rad_pred<- predict(svm_radial, test_data$x)
svm_poly_pred<- predict(svm_radial, test_data$x)
postResample(svm_rad_pred, test_data$y)
## RMSE Rsquared MAE
## 2.0782039 0.8249488 1.5787446
postResample(svm_poly_pred, test_data$y)
## RMSE Rsquared MAE
## 2.0782039 0.8249488 1.5787446
To put all models side by side comparison:
list(knn = postResample(knn_pred, test_data$y),
rsnns = postResample(rsnns_pred, test_data$y),
MARS = postResample(mars_predict, test_data$y),
svm_rad = postResample(svm_rad_pred, test_data$y),
svm_poly = postResample(svm_poly_pred, test_data$y))
## $knn
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
##
## $rsnns
## RMSE Rsquared MAE
## 2.4920704 0.7482272 1.9299780
##
## $MARS
## RMSE Rsquared MAE
## 1.2779993 0.9338365 1.0147070
##
## $svm_rad
## RMSE Rsquared MAE
## 2.0782039 0.8249488 1.5787446
##
## $svm_poly
## RMSE Rsquared MAE
## 2.0782039 0.8249488 1.5787446
7.5 Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Which nonlinear regression model gives the optimal re-sampling and test set performance?
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
Train split the data:
set.seed(123)
#checking for missing data:
is.na(ChemicalManufacturingProcess) %>% sum() #106 missing data
## [1] 106
#impute the data with knn:
imputed_data<- preProcess(ChemicalManufacturingProcess, method = 'knnImpute')
#apply to the dataset:
imputed_chem<- predict(imputed_data, ChemicalManufacturingProcess)
imputed_chem %>% is.na() %>% sum() # 0 missing data
## [1] 0
#spliting data in 70-30:
chem_train_idx<- createDataPartition(imputed_chem$Yield, p = 0.7, list=FALSE)
#training dataset
chem_train_x<- imputed_chem[chem_train_idx, ] %>% select(-Yield)
chem_train_y<- imputed_chem$Yield[chem_train_idx]
#testing dataset:
chem_test_x<- imputed_chem[-chem_train_idx, ] %>% select(-Yield)
chem_test_y<- imputed_chem$Yield[-chem_train_idx]
KNN:
set.seed(43)
knn_model<- caret::train(x = chem_train_x,
y = chem_train_y,
method = 'knn',
preProcess = c('scale', 'center'),
trControl = cv,
tuneGrid = data.frame(.k = 1:20))
#to check the re-sampling metrics:
knn_model
## k-Nearest Neighbors
##
## 124 samples
## 57 predictor
##
## Pre-processing: scaled (57), centered (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 0.8205099 0.4593838 0.6286764
## 2 0.7523686 0.4983340 0.5778125
## 3 0.7287325 0.5294176 0.5935537
## 4 0.7399352 0.5123103 0.5813488
## 5 0.7701000 0.4668749 0.6194218
## 6 0.7820163 0.4564448 0.6242010
## 7 0.7919033 0.4503019 0.6329623
## 8 0.7954740 0.4371723 0.6354999
## 9 0.7896281 0.4427387 0.6295239
## 10 0.7919164 0.4359477 0.6313449
## 11 0.7851989 0.4476922 0.6221146
## 12 0.7952866 0.4359111 0.6334504
## 13 0.7943265 0.4324838 0.6325867
## 14 0.7978386 0.4270360 0.6415137
## 15 0.8022131 0.4221110 0.6450661
## 16 0.8005464 0.4289696 0.6440362
## 17 0.7996381 0.4326691 0.6388798
## 18 0.7993412 0.4342193 0.6422833
## 19 0.8051389 0.4316030 0.6475247
## 20 0.8079171 0.4268165 0.6502997
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
#to predict:
knn_model_pred<- predict(knn_model, chem_test_x)
postResample(knn_model_pred, chem_test_y)
## RMSE Rsquared MAE
## 0.5746305 0.6049632 0.4763410
SVM radial:
set.seed(43)
#tuning parameters:
svm_tune<- expand.grid(
sigma = c(0.01, 0.1, 1),
C = c(1:10))
svm_model<- caret::train(
x = chem_train_x,
y = chem_train_y,
method = 'svmRadial',
trControl = cv,
tuneGrid = svm_tune,
preProcess = c('scale', 'center')
)
#to check the re-sampling metrics:
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 57 predictor
##
## Pre-processing: scaled (57), centered (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## sigma C RMSE Rsquared MAE
## 0.01 1 0.6711945 0.61797405 0.5395421
## 0.01 2 0.6514612 0.63395417 0.5224476
## 0.01 3 0.6458360 0.63495711 0.5110088
## 0.01 4 0.6436582 0.63452534 0.5084002
## 0.01 5 0.6406828 0.63691692 0.5065740
## 0.01 6 0.6402824 0.63899121 0.5036939
## 0.01 7 0.6394369 0.64101029 0.5013607
## 0.01 8 0.6392954 0.64208360 0.5003111
## 0.01 9 0.6388491 0.64226495 0.4992261
## 0.01 10 0.6376887 0.64357761 0.4973097
## 0.10 1 0.9428680 0.33045731 0.7426450
## 0.10 2 0.9313877 0.34632928 0.7374551
## 0.10 3 0.9304864 0.34807281 0.7370064
## 0.10 4 0.9304796 0.34807332 0.7369998
## 0.10 5 0.9304796 0.34807332 0.7369998
## 0.10 6 0.9304796 0.34807332 0.7369998
## 0.10 7 0.9304796 0.34807332 0.7369998
## 0.10 8 0.9304796 0.34807332 0.7369998
## 0.10 9 0.9304796 0.34807332 0.7369998
## 0.10 10 0.9304796 0.34807332 0.7369998
## 1.00 1 1.0289303 0.10447804 0.8149096
## 1.00 2 1.0283211 0.09865463 0.8223000
## 1.00 3 1.0285000 0.09915705 0.8228371
## 1.00 4 1.0285044 0.09915761 0.8229235
## 1.00 5 1.0285044 0.09915761 0.8229235
## 1.00 6 1.0285044 0.09915761 0.8229235
## 1.00 7 1.0285044 0.09915761 0.8229235
## 1.00 8 1.0285044 0.09915761 0.8229235
## 1.00 9 1.0285044 0.09915761 0.8229235
## 1.00 10 1.0285044 0.09915761 0.8229235
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01 and C = 10.
#to predict:
svm_model_pred<- predict(svm_model, chem_test_x)
postResample(svm_model_pred, chem_test_y)
## RMSE Rsquared MAE
## 0.5518020 0.6530168 0.4514493
MARS:
set.seed(43)
mars_grid2<- expand.grid(.degree = 1:3,
.nprune = 2:20)
mars_model<- caret::train(
x = chem_train_x,
y = chem_train_y,
method = 'earth',
trControl = cv,
tuneGrid = mars_grid2,
preProcess = c('scale', 'center')
)
#to check the re-sampling metrics:
mars_model$finalModel
## Selected 9 of 47 terms, and 8 of 57 predictors (nprune=9)
## Termination condition: GRSq -10 at 47 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 4 4
## GCV 0.3054605 RSS 26.13412 GRSq 0.716068 RSq 0.8008966
#to predict:
mars_model_pred<- predict(mars_model, chem_test_x)
postResample(mars_model_pred, chem_test_y)
## RMSE Rsquared MAE
## 0.6040282 0.6794115 0.5080937
neural network:
set.seed(43)
rsnns_grid2<- expand.grid(.size = 5:20)
rsnns_model<- caret::train(x = chem_train_x,
y = chem_train_y,
method = 'mlp',
preProcess = c('scale', 'center'),
trControl = cv,
tuneGrid = rsnns_grid2,
linout = TRUE,
maxit = 500,
MaxNWts = 1000,
learnFuncParams = c(0.01, 0.9)
)
rsnns_model
## Multi-Layer Perceptron
##
## 124 samples
## 57 predictor
##
## Pre-processing: scaled (57), centered (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 110, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## size RMSE Rsquared MAE
## 5 0.7386158 0.5599023 0.6121461
## 6 0.6692738 0.6229918 0.5481855
## 7 0.7385661 0.5316902 0.6159301
## 8 0.6842188 0.5903394 0.5625240
## 9 0.6894243 0.5800230 0.5666227
## 10 0.6929524 0.5898268 0.5740712
## 11 0.7400959 0.5399855 0.6191846
## 12 0.7200079 0.5438850 0.5993379
## 13 0.6813056 0.5857365 0.5574185
## 14 0.7165098 0.5491966 0.5811928
## 15 0.7057504 0.5501162 0.5940637
## 16 0.6767010 0.6063028 0.5593269
## 17 0.6454775 0.6188008 0.5306226
## 18 0.6756315 0.5856233 0.5576466
## 19 0.6670559 0.5976512 0.5546076
## 20 0.6719511 0.5883250 0.5516515
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was size = 17.
RSNNS_model_pred<- predict(rsnns_model, chem_test_x)
postResample(RSNNS_model_pred, chem_test_y)
## RMSE Rsquared MAE
## 0.6861379 0.5133632 0.5577056
All residuals combined:
list(knn = postResample(knn_model_pred, chem_test_y),
rsnns = postResample(RSNNS_model_pred, chem_test_y),
MARS = postResample(mars_model_pred, chem_test_y),
svm_rad = postResample(svm_model_pred, chem_test_y))
## $knn
## RMSE Rsquared MAE
## 0.5746305 0.6049632 0.4763410
##
## $rsnns
## RMSE Rsquared MAE
## 0.6861379 0.5133632 0.5577056
##
## $MARS
## RMSE Rsquared MAE
## 0.6040282 0.6794115 0.5080937
##
## $svm_rad
## RMSE Rsquared MAE
## 0.5518020 0.6530168 0.4514493
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
plot(varImp(svm_model),
top = 10,
main='SVM-radial importance')
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
We will check the top predictor correlations
#Select important predictor names:
top_pre_names<- varImp(svm_model)$importance %>%
as.data.frame() %>%
rownames_to_column('predictors') %>%
slice_max(order_by = Overall, n = 10) %>%
pull(predictors)
#select the column of the predictors and the response variable column:
correlations<- imputed_chem %>%
select(c('Yield', top_pre_names)) %>%
cor()
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(top_pre_names)
##
## # Now:
## data %>% select(all_of(top_pre_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#visualizing the predictor correlations:
corrplot(correlations, order = 'hclust')