Question 7.3

(a) Data Simulation

set.seed(200)
train_raw <- mlbench.friedman1(200, sd = 1)
train_tbl <- as_tibble(train_raw$x) %>% mutate(y = train_raw$y)

test_raw  <- mlbench.friedman1(5000, sd = 1)
test_tbl  <- as_tibble(test_raw$x) %>% mutate(y = test_raw$y)

(b) Model Training & Resampled Performance

ctrl_73 <- trainControl(method = "boot", number = 25)

set.seed(200)
models_73 <- list(
  lm   = train(y ~ ., data = train_tbl, method = "lm", trControl = ctrl_73),
  knn  = train(x = select(train_tbl, -y), y = train_tbl$y, method = "knn",
               preProcess = c("center", "scale"), tuneLength = 10, trControl = ctrl_73),
  rf   = train(x = select(train_tbl, -y), y = train_tbl$y, method = "rf",
               tuneLength = 6, trControl = ctrl_73),
  gbm  = train(x = select(train_tbl, -y), y = train_tbl$y, method = "gbm",
               verbose = FALSE, tuneLength = 10, trControl = ctrl_73),
  mars = train(x = select(train_tbl, -y), y = train_tbl$y, method = "earth",
               tuneLength = 10, trControl = ctrl_73)
)

Resampled RMSE

resamp_73 <- resamples(models_73)
summary(resamp_73, metric = "RMSE")

## 
## Call:
## summary.resamples(object = resamp_73, metric = "RMSE")
## 
## Models: lm, knn, rf, gbm, mars 
## Number of resamples: 25 
## 
## RMSE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## lm   2.224576 2.455752 2.563656 2.566445 2.641031 3.017223    0
## knn  2.691713 3.061574 3.221026 3.192447 3.372437 3.463380    0
## rf   2.268889 2.534871 2.630665 2.665158 2.816626 3.055263    0
## gbm  1.499965 1.830989 1.867223 1.898588 2.061178 2.267877    0
## mars 1.448880 1.609288 1.769573 1.750453 1.826205 2.098961    0

(c) Test‑Set Performance

perf_73 <- purrr::map_dfr(models_73, function(m){
  tibble(model = m$method,
         t(postResample(predict(m, select(test_tbl,-y)), test_tbl$y)))
})
perf_73

## # A tibble: 5 × 2
##   model t(postResample(predict(m, select(test_tbl, -y)), test_tbl$…¹  [,2]  [,3]
##   <chr>                                                        <dbl> <dbl> <dbl>
## 1 lm                                                            2.70 0.708  2.06
## 2 knn                                                           3.23 0.687  2.59
## 3 rf                                                            2.42 0.792  1.91
## 4 gbm                                                           1.80 0.870  1.40
## 5 earth                                                         1.78 0.873  1.36
## # ℹ abbreviated name:
## #   ¹`t(postResample(predict(m, select(test_tbl, -y)), test_tbl$y))`[,1]

(d) MARS Variable Importance

varImp(models_73$mars, scale = TRUE)

## earth variable importance
## 
##    Overall
## V1  100.00
## V4   82.78
## V2   64.18
## V5   40.21
## V3   28.14
## V6    0.00

Question 7.5

(a) Data Import, Imputation & Split

library(AppliedPredictiveModeling)      
data("ChemicalManufacturingProcess") 

yield <- ChemicalManufacturingProcess[[1]]
X_raw <- ChemicalManufacturingProcess[, -1]

pp75 <- preProcess(X_raw, method = c("knnImpute", "center", "scale"))
X_imp <- predict(pp75, X_raw)

set.seed(123)
train_idx <- createDataPartition(yield, p = .80, list = FALSE)
chem_train <- X_imp[train_idx, ]
chem_test  <- X_imp[-train_idx, ]
y_train    <- yield[train_idx]
y_test     <- yield[-train_idx]

(b) Model Tuning – GBM

ctrl_75 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

set.seed(123)
mod_gbm75 <- train(x = chem_train, y = y_train, method = "gbm",
                   tuneLength = 10, trControl = ctrl_75, verbose = FALSE)
mod_gbm75$bestTune

##    n.trees interaction.depth shrinkage n.minobsinnode
## 40     500                 4       0.1             10

Optimal Resampled Metric

min(mod_gbm75$results$RMSE)

## [1] 1.084469

(c) Test‑Set Performance

postResample(predict(mod_gbm75, chem_test), y_test)

##      RMSE  Rsquared       MAE 
## 1.1875241 0.5848691 0.9315644

(d) Variable Importance

imp75 <- varImp(mod_gbm75, scale = TRUE)
imp_tbl <- imp75$importance %>% rownames_to_column("predictor") %>% arrange(desc(Overall))
imp_tbl[1:10, ]

##                 predictor   Overall
## 1  ManufacturingProcess32 100.00000
## 2  ManufacturingProcess13  39.12838
## 3    BiologicalMaterial12  28.30443
## 4  ManufacturingProcess17  21.06549
## 5  ManufacturingProcess31  17.45174
## 6    BiologicalMaterial03  16.46576
## 7  ManufacturingProcess09  11.59259
## 8    BiologicalMaterial11  11.27808
## 9    BiologicalMaterial06  11.16601
## 10 ManufacturingProcess14  10.73486

(e) Top Predictor Relationships

(f)

The loess curves for the ten most influential variables all show clear, actionable shapes: several process sensors (e.g., reactor temperature, agitator speed) rise to an optimum and then plateau or even drop, while others (such as catalyst pH and dissolved‑oxygen level) display U‑ or threshold patterns where yield deteriorates sharply outside a narrow band. These nonlinear regimes suggest that most of the lost yield stems from operating just a little too hot, too fast, or too far from the ideal pH/oxygen window.By tightening control limits around the peak zones, adding alarms for excursions, and running focused DoE studies on the small set of high‑leverage factors, engineers can lock the process into its sweet spot, cut unnecessary energy or reagent use, and steadily raise average yield in future production runs.

## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.5.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] earth_5.3.4                     plotmo_3.6.4                   
##  [3] plotrix_3.8-4                   Formula_1.2-5                  
##  [5] gridExtra_2.3                   doParallel_1.0.17              
##  [7] iterators_1.0.14                foreach_1.5.2                  
##  [9] gbm_2.2.2                       AppliedPredictiveModeling_1.1-7
## [11] mlbench_2.1-6                   caret_7.0-1                    
## [13] lattice_0.22-6                  lubridate_1.9.3                
## [15] forcats_1.0.0                   stringr_1.5.1                  
## [17] dplyr_1.1.4                     purrr_1.0.2                    
## [19] readr_2.1.5                     tidyr_1.3.1                    
## [21] tibble_3.2.1                    ggplot2_3.5.1                  
## [23] tidyverse_2.0.0                
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] fastmap_1.2.0        RANN_2.6.2           pROC_1.18.5         
##  [7] digest_0.6.37        rpart_4.1.23         timechange_0.3.0    
## [10] lifecycle_1.0.4      cluster_2.1.6        survival_3.6-4      
## [13] magrittr_2.0.3       compiler_4.4.1       rlang_1.1.5         
## [16] sass_0.4.9           tools_4.4.1          utf8_1.2.4          
## [19] yaml_2.3.10          data.table_1.16.0    knitr_1.48          
## [22] labeling_0.4.3       plyr_1.8.9           withr_3.0.1         
## [25] nnet_7.3-19          grid_4.4.1           stats4_4.4.1        
## [28] fansi_1.0.6          colorspace_2.1-1     future_1.34.0       
## [31] globals_0.16.3       scales_1.3.0         MASS_7.3-60.2       
## [34] cli_3.6.4            ellipse_0.5.0        rmarkdown_2.28      
## [37] generics_0.1.3       rstudioapi_0.16.0    future.apply_1.11.3 
## [40] reshape2_1.4.4       tzdb_0.5.0           cachem_1.1.0        
## [43] splines_4.4.1        vctrs_0.6.5          hardhat_1.4.1       
## [46] Matrix_1.7-0         jsonlite_1.8.9       hms_1.1.3           
## [49] listenv_0.9.1        gower_1.0.2          jquerylib_0.1.4     
## [52] recipes_1.2.1        glue_1.7.0           parallelly_1.43.0   
## [55] codetools_0.2-20     stringi_1.8.4        gtable_0.3.5        
## [58] rpart.plot_3.1.2     munsell_0.5.1        CORElearn_1.57.3.1  
## [61] pillar_1.9.0         htmltools_0.5.8.1    randomForest_4.7-1.2
## [64] ipred_0.9-15         lava_1.8.1           R6_2.5.1            
## [67] evaluate_0.24.0      highr_0.11           bslib_0.8.0         
## [70] class_7.3-22         Rcpp_1.0.13          nlme_3.1-164        
## [73] prodlim_2024.06.25   mgcv_1.9-1           xfun_0.47           
## [76] pkgconfig_2.0.3      ModelMetrics_1.2.2.2

Homework 8

Amish Rasheed

2025-04-20

Question 7.3

(a) Data Simulation

(b) Model Training & Resampled Performance

Resampled RMSE

(c) Test‑Set Performance

(d) MARS Variable Importance

Question 7.5

(a) Data Import, Imputation & Split

(b) Model Tuning – GBM

Optimal Resampled Metric

(c) Test‑Set Performance

(d) Variable Importance

(e) Top Predictor Relationships

(f)