Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before, and train several tree-based models:
Part A
Which tree-based regression model gives the optimal resampling and test set performance?
Cubist
```r
library(caret)

# Tune over committees 1-10 and neighbors 1, 3, 5, 7, 9 with 10-fold CV
cube_param <- expand.grid(committees = seq(1, 10, by = 1),
                          neighbors = seq(1, 9, by = 2))
cube_ctrl <- trainControl(method = "cv", number = 10)

# Separate the response from the predictors
d1 <- chem_train %>% select(Yield)
d2 <- chem_train %>% select(-Yield)

m_cube <- train(x = d2, y = d1$Yield, method = "cubist",
                trControl = cube_ctrl,
                tuneGrid = cube_param,
                verbose = FALSE)
```
```r
m_cube$bestTune
```

```
##    committees neighbors
## 27          6         3
```
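The test set metrics below look like yardstick output; a minimal sketch of how they might have been computed, assuming a held-out `chem_test` data frame from the earlier splitting step:

```r
library(yardstick)

# Predict on the test set with the tuned Cubist model and score with
# yardstick; chem_test is assumed from the earlier data-splitting step
cube_res <- tibble(
  Yield = chem_test$Yield,
  .pred = predict(m_cube, newdata = select(chem_test, -Yield))
)
metric_set(rmse, rsq)(cube_res, truth = Yield, estimate = .pred)
```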
| .metric | .estimator | .estimate |
|---------|------------|-----------|
| rmse    | standard   | 1.2645557 |
| rsq     | standard   | 0.5274334 |
Random Forest
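The random forest tuning code is not shown; a sketch that would produce a summary like the one below, assuming the same `chem_rec` recipe and `folds` resamples used in the XGBoost workflow (the grid resolution is a guess):

```r
# Tune mtry, trees, and min_n on a regular grid; mtry's upper bound is
# finalized from the number of predictors in the training data
rf_spec <- rand_forest(mode = "regression",
                       mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_engine("ranger")
rf_grid <- grid_regular(finalize(mtry(), select(chem_train, -Yield)),
                        trees(), min_n(), levels = 10)
rf_wf <- workflow() %>% add_model(rf_spec) %>% add_recipe(chem_rec)
rf_fit <- rf_wf %>% tune_grid(resamples = folds, grid = rf_grid)
rf_fit %>% show_best(metric = "rmse")
```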
```
## # A tibble: 5 x 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>
## 1    57   223     6 rmse    standard    1.13    10  0.0972 Model012
## 2    57  1555     2 rmse    standard    1.14    10  0.0991 Model008
## 3    57  2000     2 rmse    standard    1.14    10  0.0956 Model010
## 4    57  1333     2 rmse    standard    1.14    10  0.0953 Model007
## 5    57  1777     2 rmse    standard    1.14    10  0.0959 Model009
```
| .metric | .estimator | .estimate |
|---------|------------|-----------|
| rmse    | standard   | 1.1850286 |
| rsq     | standard   | 0.6062937 |
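The test set numbers above are consistent with `last_fit()` on the finalized workflow; a sketch, assuming `chem_split` is the initial train/test split object:

```r
# Refit the best resampled configuration on the full training set and
# score it once on the held-out test set
best_rf <- select_best(rf_fit, metric = "rmse")
rf_final <- finalize_workflow(rf_wf, best_rf) %>% last_fit(chem_split)
collect_metrics(rf_final)
```

The same pattern would produce the XGBoost test metrics further down.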
XGBoost
```r
# Boosted trees: tune mtry, trees, learn_rate, and loss_reduction while
# fixing min_n, tree_depth, the subsample rate, and early stopping
xgb <- boost_tree(mode = "regression",
                  mtry = tune(), trees = tune(),
                  min_n = 10, tree_depth = 8,
                  learn_rate = tune(),
                  loss_reduction = tune(),
                  sample_size = .60,
                  stop_iter = 3) %>%
  set_engine("xgboost")

# Regular grid; mtry's upper bound is finalized from the predictor count
xgb_tune <- grid_regular(finalize(mtry(), select(chem_train, -Yield)),
                         trees(),
                         learn_rate(), loss_reduction(),
                         levels = 10)

xgb_wf <- workflow() %>% add_model(xgb) %>% add_recipe(chem_rec)
xgb_fit <- xgb_wf %>% tune_grid(resamples = folds, grid = xgb_tune)
```
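The ranking below is the sort of output `show_best()` prints for the fitted grid (the display call itself is not shown, so this is an assumption):

```r
xgb_fit %>% show_best(metric = "rmse")
```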
```
## # A tibble: 5 x 10
##    mtry trees learn_rate loss_reduction .metric .estimator  mean     n std_err
##   <int> <int>      <dbl>          <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1     7  1111        0.1    0.000000681 rmse    standard    1.03    10  0.0606
## 2     7  1333        0.1    0.000000681 rmse    standard    1.03    10  0.0606
## 3     7  1555        0.1    0.000000681 rmse    standard    1.03    10  0.0606
## 4     7  1777        0.1    0.000000681 rmse    standard    1.03    10  0.0606
## 5     7  2000        0.1    0.000000681 rmse    standard    1.03    10  0.0606
## # ... with 1 more variable: .config <chr>
```
| .metric | .estimator | .estimate |
|---------|------------|-----------|
| rmse    | standard   | 1.278147  |
| rsq     | standard   | 0.521448  |
Based on resampling, the XGBoost model was the top performer, with a mean CV RMSE of about 1.03 versus 1.13 for the random forest, although the random forest posted the lower test set RMSE (1.185 versus 1.278).
Part B
Which predictors are the most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
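One way to extract the importance ranking is the `vip` package applied to the finalized XGBoost fit; a sketch, where `xgb_final` is an assumed `last_fit()` result for the tuned workflow:

```r
library(vip)

# Pull the underlying xgboost fit out of the finalized workflow and
# plot the ten most important predictors
xgb_final %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)
```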

Similar to the earlier exercises, ManufacturingProcess32 was the most important predictor. The top 10 predictors overlap with the top 10 from the optimal linear and nonlinear models, but the lists are not identical.
Part C
Plot the optimal single tree with the distribution of yield in the terminals. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
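A sketch of one way to produce such a plot, using `rpart` for the single tree and `partykit` for the terminal-node distributions (the original plotting code is not shown):

```r
library(rpart)
library(partykit)

# Fit a single regression tree on the training data; plotting the party
# representation draws a boxplot of Yield in each terminal node
m_tree <- rpart(Yield ~ ., data = chem_train)
plot(as.party(m_tree))
```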

The single-tree plot shows results similar to the XGBoost model: ManufacturingProcess32 was at the top of both. Additionally, BiologicalMaterial12 and BiologicalMaterial06 were prominent in both the XGBoost model and the single tree, so both process and biological predictors show clear relationships with yield.