##7.2 Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data: y = 10 sin(πx1x2) + 20(x3 − 0.5)2 + 10x4 + 5x5 + N(0, σ2) where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
set.seed(200)
trainingData = mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x = data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData = mlbench.friedman1(5000, sd = 1)
testData$x = data.frame(testData$x)
Tune several models on these data. For example:
library(caret)
knnModel <- train(x = trainingData$x, y = trainingData$y,method = "knn",preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
mars_model<- train(x = trainingData$x, y = trainingData$y, method = "earth",preProcess = c("center", "scale"), tuneLength = 10)
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.2.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.2.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.2.3
mars_pred <- predict(mars_model, newdata = testData$x)
postResample(pred = mars_pred, obs = testData$y)
## RMSE Rsquared MAE
## 1.776575 0.872700 1.358367
varImp(mars_model)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
The MARS model has a lower RMSE and higher Rsquared than KNN and only selected X1-X5 as its predictors. It appears to be a better fit.
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.2.3
library(RANN)
## Warning: package 'RANN' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(ChemicalManufacturingProcess)
set.seed(1234)
knn_pre <- preProcess(ChemicalManufacturingProcess, "knnImpute")
knn_pred <- predict(knn_pre, ChemicalManufacturingProcess)
knn_pred <- knn_pred %>% select_at(vars(-one_of(nearZeroVar(., names = TRUE))))
training_data <- createDataPartition(knn_pred$Yield, times = 1, p = 0.8, list = FALSE)
train_data <- knn_pred[training_data, ]
test_data <- knn_pred[-training_data, ]
knn_model <- train(Yield ~ ., data = train_data, method = "knn", center = TRUE, scale = TRUE, trControl = trainControl("cv", number = 10),tuneLength = 10)
knn_model
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 128, 131, 130, 130, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7237610 0.5219385 0.6000154
## 7 0.7298434 0.5103148 0.6038800
## 9 0.7401387 0.4957407 0.6110017
## 11 0.7218398 0.5368142 0.5865431
## 13 0.7379855 0.5135531 0.6008255
## 15 0.7487626 0.5025050 0.6110746
## 17 0.7555478 0.5016355 0.6163655
## 19 0.7515744 0.5160700 0.6101203
## 21 0.7582183 0.5071037 0.6173736
## 23 0.7639953 0.5085869 0.6198149
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knn_predict <- predict(knn_model, test_data)
knn_results <- data.frame(t(postResample(pred = knn_predict, obs = test_data$Yield)))
knn_results
## RMSE Rsquared MAE
## 1 0.6994187 0.3547056 0.605811
mars_grid <- expand.grid(degree = c(1:2), nprune = c(2:10))
mars_model <- train(Yield ~ ., data = train_data, method = "earth",tuneGrid = mars_grid,trControl = trainControl("cv", number = 10),tuneLength = 25)
mars_model
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 132, 130, 130, 128, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7789194 0.4401603 0.6179795
## 1 3 0.6804064 0.5664303 0.5508993
## 1 4 0.6761158 0.5806914 0.5507581
## 1 5 0.6402040 0.6279732 0.5322559
## 1 6 0.6352661 0.6320216 0.5239903
## 1 7 0.6247490 0.6317278 0.5165607
## 1 8 0.6808449 0.5746501 0.5453058
## 1 9 0.6732789 0.5797236 0.5383759
## 1 10 0.6896934 0.5680038 0.5520418
## 2 2 0.7868154 0.4294405 0.6324954
## 2 3 0.6941249 0.5473943 0.5581296
## 2 4 0.7336219 0.5028495 0.5860609
## 2 5 0.6901801 0.5706026 0.5656028
## 2 6 0.7004739 0.5619804 0.5713038
## 2 7 0.6838762 0.5872813 0.5443752
## 2 8 0.7222177 0.5471597 0.5674303
## 2 9 0.7437381 0.5380753 0.5681492
## 2 10 0.7273905 0.5412025 0.5678327
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 7 and degree = 1.
mars_pred2 <- predict(mars_model, test_data)
#postResample(pred = mars_pred2, obs = testData$y)
The Mars model appears to perform better than KNN with a higher rsquared value and lower RMSE.
varImp(mars_model, 10)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 59.823
## ManufacturingProcess13 17.294
## ManufacturingProcess01 2.199
## ManufacturingProcess42 0.000
It returns 7 predictors with the manufacturing process variables dominating the predictors, with 32 being the highest in the model.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
train_data %>% select(c('ManufacturingProcess32','ManufacturingProcess13','ManufacturingProcess09','ManufacturingProcess13','ManufacturingProcess39','ManufacturingProcess04','ManufacturingProcess01','Yield')) %>% cor() %>% corrplot()
Manufacturing process 32 and 9 have stronger positive correlations with
yield, while 13 has a stronger negative relationship to yield. We can
see this in the plots, below, as well.
library(ggplot2)
predictors <- c('ManufacturingProcess32', 'ManufacturingProcess13', 'ManufacturingProcess09', 'ManufacturingProcess39', 'ManufacturingProcess04', 'ManufacturingProcess01')
plot_data <- lapply(predictors, function(pred) {
ggplot(train_data, aes_string(x = pred, y = "Yield")) +
geom_point() +
labs(x = pred, y = "Yield", title = paste(pred, "vs. Yield"))
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(grobs = plot_data, ncol = 2)