7.2. Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data: y = 10 sin(πx1x2) + 20(x3 − 0.5)2 + 10x4 + 5x5 + N(0, σ2) where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
#KNN
library(caret)
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn",
preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
#MARS
library(earth)
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
marsModel <- train(x = trainingData$x, y = trainingData$y, method = "earth", tuneLength = 10)
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.383438 0.2405683 3.597961
## 3 3.645469 0.4745962 2.930453
## 4 2.727602 0.7035031 2.184240
## 6 2.331605 0.7835496 1.833420
## 7 1.976830 0.8421599 1.562591
## 9 1.804342 0.8683110 1.410395
## 10 1.787676 0.8711960 1.386944
## 12 1.821005 0.8670619 1.419893
## 13 1.858688 0.8617344 1.445459
## 15 1.871033 0.8607099 1.457618
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
marsPred <- predict(marsModel, newdata = testData$x)
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.3.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
model = neuralnet(
y ~ .,
data = trainingData,
hidden = c(5, 5),
linear.output = FALSE
)
nnPred <- predict(model, newdata = testData$x)
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.3.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
svmModel <- train(x = trainingData$x, y = trainingData$y, method = "svmRadial", tuneLength = 10)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.605094 0.7645344 2.068745
## 0.50 2.383375 0.7824840 1.866226
## 1.00 2.261899 0.7977780 1.760744
## 2.00 2.184218 0.8098223 1.697706
## 4.00 2.156007 0.8134055 1.673019
## 8.00 2.143625 0.8152672 1.669552
## 16.00 2.141395 0.8156733 1.668470
## 32.00 2.141315 0.8156906 1.668308
## 64.00 2.141315 0.8156906 1.668308
## 128.00 2.141315 0.8156906 1.668308
##
## Tuning parameter 'sigma' was held constant at a value of 0.06269697
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06269697 and C = 32.
svmPred <- predict(svmModel, newdata = testData$x)
svmPred <- predict(svmModel, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0729792 0.8257729 1.5746258
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
as.data.frame(rbind(
knn = postResample(pred = knnPred, obs = testData$y),
mars = postResample(pred = predict(marsModel, newdata = testData$x), obs = testData$y),
nn = postResample(pred = predict(model, newdata = testData$x), obs = testData$y),
svm = postResample(pred = svmPred, obs = testData$y)
) )
## RMSE Rsquared MAE
## knn 3.204059 0.68199191 2.568346
## mars 1.776575 0.87269996 1.358367
## nn 14.276930 0.01587806 13.386911
## svm 2.072979 0.82577292 1.574626
The MARS model has the best performance across all three metrics. It has the lowest RMSE and MAE, and the highest R-squared.
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
The mars model does select the informative predictors. The top 5 predictors are X1, X2, X3, X4, and X5.
7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
library(caret)
set.seed(200)
data("ChemicalManufacturingProcess")
pre_process_model <- preProcess(ChemicalManufacturingProcess,
method = c("BoxCox", "knnImpute", "center", "scale"))
processed_data <- predict(pre_process_model, ChemicalManufacturingProcess)
processed_data$Yield <- ChemicalManufacturingProcess$Yield
train_indices <- sample(seq_len(nrow(processed_data)), size = floor(0.85 * nrow(processed_data)))
train_data <- processed_data[train_indices, ]
test_data <- processed_data[-train_indices, ]
knn_model <- train(x = train_data[, -1], y = train_data$Yield, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
knn_model
## k-Nearest Neighbors
##
## 149 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 149, 149, 149, 149, 149, 149, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.355320 0.4107029 1.066563
## 7 1.329797 0.4302767 1.045199
## 9 1.324941 0.4390097 1.046048
## 11 1.344199 0.4237490 1.069445
## 13 1.343844 0.4310569 1.073263
## 15 1.357754 0.4248850 1.085066
## 17 1.384597 0.4029840 1.104304
## 19 1.391522 0.4013591 1.107286
## 21 1.397203 0.4016020 1.107565
## 23 1.406312 0.3971716 1.115442
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
knn_pred <- predict(knn_model, newdata = test_data[, -1])
mars_Model <- train(x = train_data[, -1], y = train_data$Yield, method = "earth", tuneLength = 10)
mars_Model
## Multivariate Adaptive Regression Spline
##
## 149 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 149, 149, 149, 149, 149, 149, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 1.485273 0.3243963 1.164782
## 3 1.329175 0.4628585 1.063063
## 5 1.611336 0.4683647 1.092185
## 7 2.285255 0.4293546 1.211014
## 8 2.425555 0.4193465 1.235860
## 10 2.835064 0.4000325 1.319180
## 12 3.162953 0.3716991 1.402142
## 13 3.180676 0.3571767 1.406669
## 15 3.630383 0.3334311 1.479677
## 17 4.050221 0.3151682 1.545117
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
mars_pred <- predict(mars_Model, newdata = test_data[, -1])
library(neuralnet)
model = neuralnet(
Yield ~ .,
data = train_data,
hidden = c(5, 5),
linear.output = FALSE
)
nn_pred <- predict(model, newdata = test_data[, -1])
svm_model <- train(x = train_data[, -1], y = train_data$Yield, method = "svmRadial", tuneLength = 10)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 149 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 149, 149, 149, 149, 149, 149, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.417383 0.4738932 1.1124693
## 0.50 1.308322 0.5290979 1.0197167
## 1.00 1.233713 0.5649867 0.9579879
## 2.00 1.179980 0.5917616 0.9125943
## 4.00 1.162134 0.5986932 0.9020097
## 8.00 1.160849 0.5991925 0.9001530
## 16.00 1.160561 0.5994268 0.8998274
## 32.00 1.160561 0.5994268 0.8998274
## 64.00 1.160561 0.5994268 0.8998274
## 128.00 1.160561 0.5994268 0.8998274
##
## Tuning parameter 'sigma' was held constant at a value of 0.01567433
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01567433 and C = 16.
svm_pred <- predict(svm_model, newdata = test_data[, -1])
as.data.frame(rbind(
knn = postResample(pred = knn_pred, obs = test_data$Yield),
mars = postResample(pred = mars_pred, obs = test_data$Yield),
nn = postResample(pred = nn_pred, obs = test_data$Yield),
svm = postResample(pred = svm_pred, obs = test_data$Yield)
) )
## RMSE Rsquared MAE
## knn 1.578356 0.4705665 1.4018519
## mars 1.249131 0.6706054 0.9052919
## nn 39.813943 0.1436116 39.7559269
## svm 1.419354 0.6100339 1.2341218
The MARS model has the best performance across all three metrics. It has the lowest RMSE and MAE, and the highest R-squared.
varImp(mars_Model)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100
## ManufacturingProcess13 0
preProcess <- preProcess(ChemicalManufacturingProcess,
method = c("BoxCox", "knnImpute", "center", "scale"))
predPreProcess <- predict(preProcess, ChemicalManufacturingProcess)
varImp_mars <- varImp(mars_Model)
top_predictors <- varImp_mars$importance %>%
as.data.frame() %>%
rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
head(10) %>%
select(Predictor)
variables <- top_predictors$Predictor
featurePlot(
x = predPreProcess[, variables],
y = predPreProcess$Yield,
plot = "scatter"
)
There seems to be a clear trend in ManufacturingProcess32 but there isn’t a clear trend in the other predictors. However we can see that they are all clusters.