library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mlbench)
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
Exercises from Chapter 7 of textbook Applied Predictive Modeling by Kuhn & Johnson
\(y = 10 sin(\pi x_1x_2) + 20(x_3 − 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)
where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData = mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x = data.frame(trainingData$x)
## Look at the data using
caret::featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData = mlbench.friedman1(5000, sd = 1)
testData$x = data.frame(testData$x)
For example:
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.305255 0.5721163 2.754163
## 7 3.149006 0.6185350 2.606181
## 9 3.088199 0.6674249 2.510615
## 11 3.071514 0.6867903 2.506869
## 13 3.067138 0.6963304 2.491396
## 15 3.120643 0.6955263 2.523599
## 17 3.107764 0.7108114 2.526434
## 19 3.124437 0.7189758 2.539485
## 21 3.156001 0.7136684 2.575904
## 23 3.179459 0.7172603 2.601060
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
svmModel = train(x = trainingData$x,
y = trainingData$y,
method = "svmLinear",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
svmModel
## Support Vector Machines with Linear Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.47687 0.7581852 2.009267
##
## Tuning parameter 'C' was held constant at a value of 1
marsModel = train(x = trainingData$x,
y = trainingData$y,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.1.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.1.3
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 4.1.3
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.271039 0.2646347 3.510925
## 3 3.557398 0.5046228 2.868580
## 4 2.535634 0.7550509 2.052441
## 6 2.166315 0.8246091 1.692498
## 7 1.712166 0.8903920 1.375257
## 9 1.610272 0.9014910 1.274234
## 10 1.604024 0.9021884 1.281600
## 12 1.579312 0.9005116 1.255295
## 13 1.598632 0.8998340 1.273442
## 15 1.590488 0.8999411 1.271184
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 1.
The mars model was able to perform the best (out of MARS, SVM and KNN) with an RMSE of ~1.6 MARS was able to select the most informative predictors (X1-X5) along with X6. X7-X10 were unused
marsModel$finalModel
## Selected 12 of 18 terms, and 6 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
data(ChemicalManufacturingProcess)
chem_manuf = ChemicalManufacturingProcess
knn = preProcess(chem_manuf, method = c('knnImpute'))
chem_imputed = predict(knn,chem_manuf)
set.seed(2022)
index = sample(round(dim(chem_imputed)[1]*.70))
chem_train = chem_imputed[index,]
chem_test = chem_imputed[-index,]
chem.knnModel <- train(Yield ~ .,
data = chem_train,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
RMSE(chem_test$Yield, predict(chem.knnModel, chem_test))
## [1] 0.9146806
chem.svmModel <- train(Yield ~ .,
data = chem_train,
method = "svmLinear",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
RMSE(chem_test$Yield, predict(chem.svmModel, chem_test))
## [1] 1.658391
chem.marsModel <- train(Yield ~ .,
data = chem_train,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method="cv", number=10, savePredictions = TRUE))
RMSE(chem_test$Yield, predict(chem.marsModel, chem_test))
## [1] 1.160259
knn model performed the best on the test set for chemical manufacturing data with RMSE of ~0.9
Manufacturing process variables dominate the list. While different processes are more valuable, the split between manufacturing and biological are similar as manufacturing still tops the list
top_features = varImp(chem.knnModel)$importance%>%
arrange(-Overall)%>%
head(10)
top_features
## Overall
## ManufacturingProcess13 100.00000
## ManufacturingProcess17 96.43335
## ManufacturingProcess09 88.48917
## ManufacturingProcess32 77.31591
## BiologicalMaterial06 75.26298
## ManufacturingProcess06 66.51854
## BiologicalMaterial12 64.77298
## ManufacturingProcess36 62.77863
## BiologicalMaterial11 61.49259
## ManufacturingProcess11 60.78457
Most of the top features have a correlation (Positive or Negative) with the yield. The top biological material features have positive relationships while manufacturing processes are mixed
top_feature_names = rownames(top_features)
chem_x = chem_imputed[c(top_feature_names)]
chem_y = chem_imputed$Yield
caret::featurePlot(chem_x, chem_y)