set.seed(200)
train_raw <- mlbench.friedman1(200, sd = 1)
train_tbl <- as_tibble(train_raw$x) %>% mutate(y = train_raw$y)
test_raw <- mlbench.friedman1(5000, sd = 1)
test_tbl <- as_tibble(test_raw$x) %>% mutate(y = test_raw$y)
ctrl_73 <- trainControl(method = "boot", number = 25)
set.seed(200)
models_73 <- list(
lm = train(y ~ ., data = train_tbl, method = "lm", trControl = ctrl_73),
knn = train(x = select(train_tbl, -y), y = train_tbl$y, method = "knn",
preProcess = c("center", "scale"), tuneLength = 10, trControl = ctrl_73),
rf = train(x = select(train_tbl, -y), y = train_tbl$y, method = "rf",
tuneLength = 6, trControl = ctrl_73),
gbm = train(x = select(train_tbl, -y), y = train_tbl$y, method = "gbm",
verbose = FALSE, tuneLength = 10, trControl = ctrl_73),
mars = train(x = select(train_tbl, -y), y = train_tbl$y, method = "earth",
tuneLength = 10, trControl = ctrl_73)
)
resamp_73 <- resamples(models_73)
summary(resamp_73, metric = "RMSE")
##
## Call:
## summary.resamples(object = resamp_73, metric = "RMSE")
##
## Models: lm, knn, rf, gbm, mars
## Number of resamples: 25
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lm 2.224576 2.455752 2.563656 2.566445 2.641031 3.017223 0
## knn 2.691713 3.061574 3.221026 3.192447 3.372437 3.463380 0
## rf 2.268889 2.534871 2.630665 2.665158 2.816626 3.055263 0
## gbm 1.499965 1.830989 1.867223 1.898588 2.061178 2.267877 0
## mars 1.448880 1.609288 1.769573 1.750453 1.826205 2.098961 0
perf_73 <- purrr::map_dfr(models_73, function(m){
tibble(model = m$method,
t(postResample(predict(m, select(test_tbl,-y)), test_tbl$y)))
})
perf_73
## # A tibble: 5 × 2
## model t(postResample(predict(m, select(test_tbl, -y)), test_tbl$…¹ [,2] [,3]
## <chr> <dbl> <dbl> <dbl>
## 1 lm 2.70 0.708 2.06
## 2 knn 3.23 0.687 2.59
## 3 rf 2.42 0.792 1.91
## 4 gbm 1.80 0.870 1.40
## 5 earth 1.78 0.873 1.36
## # ℹ abbreviated name:
## # ¹`t(postResample(predict(m, select(test_tbl, -y)), test_tbl$y))`[,1]
varImp(models_73$mars, scale = TRUE)
## earth variable importance
##
## Overall
## V1 100.00
## V4 82.78
## V2 64.18
## V5 40.21
## V3 28.14
## V6 0.00
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
yield <- ChemicalManufacturingProcess[[1]]
X_raw <- ChemicalManufacturingProcess[, -1]
pp75 <- preProcess(X_raw, method = c("knnImpute", "center", "scale"))
X_imp <- predict(pp75, X_raw)
set.seed(123)
train_idx <- createDataPartition(yield, p = .80, list = FALSE)
chem_train <- X_imp[train_idx, ]
chem_test <- X_imp[-train_idx, ]
y_train <- yield[train_idx]
y_test <- yield[-train_idx]
ctrl_75 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
set.seed(123)
mod_gbm75 <- train(x = chem_train, y = y_train, method = "gbm",
tuneLength = 10, trControl = ctrl_75, verbose = FALSE)
mod_gbm75$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 40 500 4 0.1 10
min(mod_gbm75$results$RMSE)
## [1] 1.084469
postResample(predict(mod_gbm75, chem_test), y_test)
## RMSE Rsquared MAE
## 1.1875241 0.5848691 0.9315644
imp75 <- varImp(mod_gbm75, scale = TRUE)
imp_tbl <- imp75$importance %>% rownames_to_column("predictor") %>% arrange(desc(Overall))
imp_tbl[1:10, ]
## predictor Overall
## 1 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess13 39.12838
## 3 BiologicalMaterial12 28.30443
## 4 ManufacturingProcess17 21.06549
## 5 ManufacturingProcess31 17.45174
## 6 BiologicalMaterial03 16.46576
## 7 ManufacturingProcess09 11.59259
## 8 BiologicalMaterial11 11.27808
## 9 BiologicalMaterial06 11.16601
## 10 ManufacturingProcess14 10.73486
The loess curves for the ten most influential variables all show clear, actionable shapes: several process sensors (e.g., reactor temperature, agitator speed) rise to an optimum and then plateau or even drop, while others (such as catalyst pH and dissolved‑oxygen level) display U‑ or threshold patterns where yield deteriorates sharply outside a narrow band. These nonlinear regimes suggest that most of the lost yield stems from operating just a little too hot, too fast, or too far from the ideal pH/oxygen window.By tightening control limits around the peak zones, adding alarms for excursions, and running focused DoE studies on the small set of high‑leverage factors, engineers can lock the process into its sweet spot, cut unnecessary energy or reagent use, and steadily raise average yield in future production runs.
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.5.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] earth_5.3.4 plotmo_3.6.4
## [3] plotrix_3.8-4 Formula_1.2-5
## [5] gridExtra_2.3 doParallel_1.0.17
## [7] iterators_1.0.14 foreach_1.5.2
## [9] gbm_2.2.2 AppliedPredictiveModeling_1.1-7
## [11] mlbench_2.1-6 caret_7.0-1
## [13] lattice_0.22-6 lubridate_1.9.3
## [15] forcats_1.0.0 stringr_1.5.1
## [17] dplyr_1.1.4 purrr_1.0.2
## [19] readr_2.1.5 tidyr_1.3.1
## [21] tibble_3.2.1 ggplot2_3.5.1
## [23] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2
## [4] fastmap_1.2.0 RANN_2.6.2 pROC_1.18.5
## [7] digest_0.6.37 rpart_4.1.23 timechange_0.3.0
## [10] lifecycle_1.0.4 cluster_2.1.6 survival_3.6-4
## [13] magrittr_2.0.3 compiler_4.4.1 rlang_1.1.5
## [16] sass_0.4.9 tools_4.4.1 utf8_1.2.4
## [19] yaml_2.3.10 data.table_1.16.0 knitr_1.48
## [22] labeling_0.4.3 plyr_1.8.9 withr_3.0.1
## [25] nnet_7.3-19 grid_4.4.1 stats4_4.4.1
## [28] fansi_1.0.6 colorspace_2.1-1 future_1.34.0
## [31] globals_0.16.3 scales_1.3.0 MASS_7.3-60.2
## [34] cli_3.6.4 ellipse_0.5.0 rmarkdown_2.28
## [37] generics_0.1.3 rstudioapi_0.16.0 future.apply_1.11.3
## [40] reshape2_1.4.4 tzdb_0.5.0 cachem_1.1.0
## [43] splines_4.4.1 vctrs_0.6.5 hardhat_1.4.1
## [46] Matrix_1.7-0 jsonlite_1.8.9 hms_1.1.3
## [49] listenv_0.9.1 gower_1.0.2 jquerylib_0.1.4
## [52] recipes_1.2.1 glue_1.7.0 parallelly_1.43.0
## [55] codetools_0.2-20 stringi_1.8.4 gtable_0.3.5
## [58] rpart.plot_3.1.2 munsell_0.5.1 CORElearn_1.57.3.1
## [61] pillar_1.9.0 htmltools_0.5.8.1 randomForest_4.7-1.2
## [64] ipred_0.9-15 lava_1.8.1 R6_2.5.1
## [67] evaluate_0.24.0 highr_0.11 bslib_0.8.0
## [70] class_7.3-22 Rcpp_1.0.13 nlme_3.1-164
## [73] prodlim_2024.06.25 mgcv_1.9-1 xfun_0.47
## [76] pkgconfig_2.0.3 ModelMetrics_1.2.2.2