Questions 7.2 and 7.5 from the Kuhn& Johnson book
library(caret)
library(corrplot)
library(dplyr)
library(data.table)
library(e1071)
library(earth)
library(fable)
library(forecast)
library(fpp2)
library(fpp3)
library(ggplot2)
library(GGally)
library(knitr)
library(kernlab)
library(latex2exp)
library(mlbench)
library(MASS)
library(purrr)
library(pls)
library(patchwork)
library(RANN)
library(stats)
library(tsibble)
library(tidyr)
\(y = 10sin(πx_1x_2) + 20(x_3-0.5)^2 + 10x_4 +5x_5 + N(0,σ^2)\)
where the \(x\) values are random variables uniformly distributed between \([0, 1]\) (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData = mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x = data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
results <- data.frame(matrix(vector(), 0, 3,
dimnames = list(c(), c("RMSE","Rsquared","MAE"))),
stringsAsFactors = FALSE)
model_knn <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
model_knn
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# Store predictions for accuracy against test data
pred_knn <- predict(model_knn, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_knn, obs = testData$y))
Check the closest neighbors for each predictor by looking at Euclidean distance between observations.
library(earth)
control <- trainControl(method = "cv")
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
model_mars <- train(trainingData$x,
trainingData$y,
method = "earth",
tuneGrid = marsGrid,
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
model_mars
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.462296 0.2176253 3.697979
## 1 3 3.720663 0.4673821 2.949121
## 1 4 2.680039 0.7094916 2.123848
## 1 5 2.333538 0.7781559 1.856629
## 1 6 2.367933 0.7754329 1.901509
## 1 7 1.809983 0.8656526 1.414985
## 1 8 1.692656 0.8838936 1.333678
## 1 9 1.704958 0.8845683 1.339517
## 1 10 1.688559 0.8842495 1.309838
## 1 11 1.669043 0.8886165 1.296522
## 1 12 1.645066 0.8892796 1.271981
## 1 13 1.655570 0.8886896 1.271232
## 1 14 1.666354 0.8879143 1.285545
## 1 15 1.666354 0.8879143 1.285545
## 2 2 4.440854 0.2204755 3.686796
## 2 3 3.697203 0.4714312 2.938566
## 2 4 2.664266 0.7149235 2.119566
## 2 5 2.313371 0.7837374 1.852172
## 2 6 2.335796 0.7875253 1.841919
## 2 7 1.833248 0.8623489 1.461538
## 2 8 1.695822 0.8883658 1.329030
## 2 9 1.555106 0.9028532 1.221365
## 2 10 1.497805 0.9088251 1.158054
## 2 11 1.419280 0.9207646 1.139722
## 2 12 1.326566 0.9315939 1.066200
## 2 13 1.266877 0.9354482 1.002983
## 2 14 1.256694 0.9349307 1.006273
## 2 15 1.311401 0.9316487 1.039213
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
# Store predictions for accuracy against test data
pred_mars <- predict(model_mars, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_mars, obs = testData$y))
MARS Regression (multivariate adaptive regression splines) is a non-parametric regression technique that automatically captures non-linearity and interaction between predictors.
nnet_Grid <- expand.grid(.decay = c(0, 0.01, .1), .size = c(1:10), .bag = FALSE)
nnet_maxnwts <- 5 * (ncol(trainingData$x) + 1) + 5 + 1
model_nnet <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
preProcess = c("center", "scale"),
tuneGrid = nnet_Grid,
trControl = control,
linout = TRUE,
trace = FALSE,
MaxNWts = nnet_maxnwts,
maxit = 500)
model_nnet
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.409360 0.7626158 1.918956
## 0.00 2 2.495497 0.7466800 1.998154
## 0.00 3 2.042870 0.8276737 1.620556
## 0.00 4 2.218146 0.8058404 1.608860
## 0.00 5 2.290307 0.7791256 1.785130
## 0.00 6 NaN NaN NaN
## 0.00 7 NaN NaN NaN
## 0.00 8 NaN NaN NaN
## 0.00 9 NaN NaN NaN
## 0.00 10 NaN NaN NaN
## 0.01 1 2.441045 0.7589837 1.927773
## 0.01 2 2.408586 0.7646177 1.924190
## 0.01 3 2.094390 0.8207054 1.639698
## 0.01 4 2.019234 0.8369700 1.608611
## 0.01 5 2.186633 0.8126958 1.765935
## 0.01 6 NaN NaN NaN
## 0.01 7 NaN NaN NaN
## 0.01 8 NaN NaN NaN
## 0.01 9 NaN NaN NaN
## 0.01 10 NaN NaN NaN
## 0.10 1 2.451052 0.7561366 1.935306
## 0.10 2 2.439995 0.7533865 1.925580
## 0.10 3 2.142421 0.8119015 1.697562
## 0.10 4 2.021443 0.8367619 1.613081
## 0.10 5 2.058645 0.8303731 1.641058
## 0.10 6 NaN NaN NaN
## 0.10 7 NaN NaN NaN
## 0.10 8 NaN NaN NaN
## 0.10 9 NaN NaN NaN
## 0.10 10 NaN NaN NaN
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0.01 and bag = FALSE.
# Store predictions for accuracy against test data
pred_nnet <- predict(model_nnet, newdata = testData$x)
results <- rbind(results, postResample(pred = pred_nnet, obs = testData$y))
Input layer goes through many hidden layers, where weights are randomized, resampled, and adjusted to the output layer.
model_svm <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
model_svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.485843 0.8015980 1.997058
## 0.50 2.217552 0.8191439 1.783734
## 1.00 2.038394 0.8395080 1.621050
## 2.00 1.931284 0.8537910 1.510868
## 4.00 1.875643 0.8624807 1.471216
## 8.00 1.873494 0.8639409 1.479643
## 16.00 1.886621 0.8626959 1.496953
## 32.00 1.886621 0.8626959 1.496953
## 64.00 1.886621 0.8626959 1.496953
## 128.00 1.886621 0.8626959 1.496953
##
## Tuning parameter 'sigma' was held constant at a value of 0.0623323
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0623323 and C = 8.
# Store predictions for accuracy against test data
svm_pred <- predict(model_svm, newdata = testData$x)
results <- rbind(results, postResample(pred = svm_pred, obs = testData$y))
Support Vector Machines are used for regression or classification by making a hyper-plane decision boundary in some \(n\)-dimensional space.
colnames(results) <- c("RMSE","R-Squared","MAE")
results$Model <- c("KNN", "MARS", "Neural Network", "SVM")
results <- results %>% relocate("Model") %>% arrange(RMSE)
results
## Model RMSE R-Squared MAE
## 1 MARS 1.277999 0.9338365 1.014707
## 2 SVM 2.051520 0.8294511 1.556470
## 3 Neural Network 2.061504 0.8305890 1.561138
## 4 KNN 3.204059 0.6819919 2.568346
# Check if it selects x_1 to x_5
varImp(model_mars)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.40
## X2 49.00
## X5 15.72
## X3 0.00
MARS gives the best overall performance, by having the greatest \(R^2\) and lowest errors, and it selects the informative predictors, although with varying importance levels.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
################################################################################
# Impute missing values
CMP_impute <- ChemicalManufacturingProcess %>%
preProcess("knnImpute")
CMP_complete <- predict(CMP_impute, ChemicalManufacturingProcess)
################################################################################
# Remove near zero Variance predictors
CMP_complete <- CMP_complete[, -nearZeroVar(CMP_complete)]
################################################################################
# Train Test Split
set.seed(12345)
index <- createDataPartition(CMP_complete$Yield, p = .8, list = FALSE)
# Train
train_predictors <- CMP_complete[index, -1]
train_yield <- CMP_complete[index, 1]
# Test
test_predictors <- CMP_complete[-index, -1]
test_yield <- CMP_complete[-index, 1]
model_chem_knn <- train(x=train_predictors,
y=train_yield,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
control <- trainControl(method = "cv")
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
model_chem_mars <- train(x=train_predictors,
y=train_yield,
method = "earth",
tuneGrid = marsGrid,
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
nnet_Grid <- expand.grid(.decay = c(0, 0.01, .1), .size = c(1:10), .bag = FALSE)
nnet_maxnwts <- 5 * (ncol(train_predictors) + 1) + 5 + 1
model_chem_nnet <- train(x=train_predictors,
y=train_yield,
method = "avNNet",
preProcess = c("center", "scale"),
tuneGrid = nnet_Grid,
trControl = control,
linout = TRUE,
trace = FALSE,
MaxNWts = nnet_maxnwts,
maxit = 500)
control <- trainControl(method = "cv")
model_chem_svm <- train(x=train_predictors,
y=train_yield,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
(a) Which nonlinear regression model gives the optimal resampling and test set performance?
results_chem <- data.frame(matrix(vector(), 0, 3,
dimnames = list(c(), c("RMSE","Rsquared","MAE"))),
stringsAsFactors = FALSE)
pred_chem_knn <- predict(model_chem_knn, newdata = test_predictors)
pred_chem_mars <- predict(model_chem_mars, newdata = test_predictors)
pred_chem_nnet <- predict(model_chem_nnet, newdata = test_predictors)
pred_chem_svm <- predict(model_chem_svm, newdata = test_predictors)
results_chem <- rbind(results_chem, postResample(pred = pred_chem_knn, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_mars, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_nnet, obs = test_yield))
results_chem <- rbind(results_chem, postResample(pred = pred_chem_svm, obs = test_yield))
colnames(results_chem) <- c("RMSE","R-Squared","MAE")
results_chem$Model <- c("KNN", "MARS", "Neural Network", "SVM")
results_chem <- results_chem %>% relocate("Model") %>% arrange(RMSE)
results_chem
## Model RMSE R-Squared MAE
## 1 SVM 0.5818004 0.6237450 0.4734947
## 2 Neural Network 0.6205936 0.6156185 0.5148025
## 3 KNN 0.6294354 0.5890334 0.5464538
## 4 MARS 0.8650264 0.3076008 0.6402553
For the chemical manufacturing process data, the best-performing model is SVM.
(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear model?
varImp(model_chem_svm)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 84.32
## ManufacturingProcess09 79.70
## ManufacturingProcess17 74.30
## BiologicalMaterial06 68.60
## ManufacturingProcess36 64.77
## BiologicalMaterial03 64.43
## BiologicalMaterial02 59.70
## BiologicalMaterial12 58.79
## ManufacturingProcess06 58.19
## ManufacturingProcess31 56.21
## BiologicalMaterial11 48.83
## ManufacturingProcess11 48.23
## ManufacturingProcess30 46.69
## BiologicalMaterial08 45.14
## BiologicalMaterial04 44.95
## ManufacturingProcess29 42.66
## ManufacturingProcess33 38.27
## BiologicalMaterial01 35.50
## ManufacturingProcess12 35.30
plot(varImp(model_chem_svm, top = 20))
For the chemical manufacturing process data, the most important predictors in the SVM model are the manufacturing process variables. When comparing to the results of assignment 7, problem 6.3f, I can see that alot of variables actually shifted around, and that the top 10 predictors are still 6 manufacturing processes and 4 biological processes. With the exception of BiologicalMaterial02, the other three (BM06, BM03, BM12) all went up in terms of importance.
(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
library(corrplot)
library(dplyr)
library(tidyselect)
top_predictors <- varImp(model_chem_svm)$importance %>%
arrange(-Overall) %>%
head(10)
# Extract top predictor names
top_predictor_names <- rownames(top_predictors)
# Get correlations
cor_matrix <- CMP_complete %>%
dplyr::select(all_of(c("Yield", top_predictor_names))) %>%
cor()
# Corrplot
corrplot::corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.cex = 0.8,
tl.col = "black",
diag = FALSE)
print(top_predictor_names)
## [1] "ManufacturingProcess13" "ManufacturingProcess32" "ManufacturingProcess09"
## [4] "ManufacturingProcess17" "BiologicalMaterial06" "ManufacturingProcess36"
## [7] "BiologicalMaterial03" "BiologicalMaterial02" "BiologicalMaterial12"
## [10] "ManufacturingProcess06"
top_predictors <- CMP_complete %>% select(‘ManufacturingProcess13’, ‘ManufacturingProcess32’, ‘ManufacturingProcess09’, ‘ManufacturingProcess17’,‘BiologicalMaterial06’, ‘ManufacturingProcess36’, ‘BiologicalMaterial03’, ‘BiologicalMaterial02’, ‘BiologicalMaterial12’, ‘ManufacturingProcess06’, ‘Yield’)