Tree Model Homework

8.1,8.2,8.3,8.7

8.1

recreate simulated data.

library(mlbench)
library(tidymodels)
cores<-parallel::detectCores()
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
sim_split<-initial_split(simulated)
sim_train<-training(sim_split)
sim_test<-training(sim_split)

Fit a random forest model to all of the predictors, then estimate the variable importance

library(tidymodels)
library(vip)
library(tidyverse)

sim_rec<-recipe(y~., data = simulated)

rf_spec<-
  rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
  set_mode('regression')%>%
  set_engine('ranger')
rf_recipe<-
  recipe(y~., data = sim_train)
rf_workflow<-
  workflow()%>%
  add_model(rf_spec)%>%
  add_recipe(rf_recipe)
#show what will be tuned
extract_parameter_set_dials(rf_spec)

## Collection of 2 parameters for tuning
## 
##  identifier  type    object
##        mtry  mtry nparam[?]
##       min_n min_n nparam[+]
## 
## Model parameters needing finalization:
##    # Randomly Selected Predictors ('mtry')
## 
## See `?dials::finalize` or `?dials::update.parameters` for more information.

Tune The model.

#create a validation set from the training data (faster than resampling i think)
val_set <- validation_split(sim_train, 
                            prop = 0.80)
rf_res<-
  rf_workflow%>%
  tune_grid(val_set,
            grid=5,
            control = control_grid(save_pred =TRUE),
            metrics = metric_set(rmse))
last_rf_spec<-
  rand_forest(mtry = 8, min_n  = 7, trees = 1000)%>%
  set_engine('ranger', num.threads = cores, importance = 'impurity')%>%
  set_mode('regression')
last_rf_workflow<-
  rf_workflow%>%
  update_model(last_rf_spec)
#fit the model
last_rf_fit<-
  last_rf_workflow%>%
  last_fit(sim_split)


vi_<-vi_model(extract_fit_parsnip(last_rf_fit))
vi_%>%arrange(desc(Importance))

## # A tibble: 10 x 2
##    Variable Importance
##    <chr>         <dbl>
##  1 V1            974. 
##  2 V4            948. 
##  3 V2            860. 
##  4 V5            260. 
##  5 V3            133. 
##  6 V7             99.1
##  7 V6             96.0
##  8 V10            62.6
##  9 V8             58.9
## 10 V9             49.2

The model did not significantly use the uninformative predictors as they are an order of magnitude below the informative predictors

Add an additional predictor that is highly correlated with one of the informative predictors

First I’ll add the mutated column and resplit the data

set.seed(200)
simulated_mod <- mlbench.friedman1(200, sd = 1)
simulated_mod <- cbind(simulated_mod$x, simulated_mod$y)
simulated_mod <- as.data.frame(simulated_mod)%>%mutate(V1_mod = V1+rnorm(200)*.1)
colnames(simulated_mod)[ncol(simulated_mod)] <- "y"
sim_mod_split<-initial_split(simulated_mod)
sim_mod_train<-training(sim_mod_split)
sim_mod_test<-training(sim_mod_split)

Remaking the model with the additional Predictor

mod_rec<-
  recipe(y~., data = sim_mod_train)
mod_spec<-
  rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
  set_mode('regression')%>%
  set_engine('ranger')
mod_workflow<-
  workflow()%>%
  add_model(rf_spec)%>%
  add_recipe(mod_rec)
mod_res<-
  rf_workflow%>%
  tune_grid(val_set,
            grid=5,
            control = control_grid(save_pred =TRUE),
            metrics = metric_set(rmse))
mod_res%>%show_best()

## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1     5     6 rmse    standard    2.62     1      NA Preprocessor1_Model3
## 2     9    17 rmse    standard    2.71     1      NA Preprocessor1_Model1
## 3     4    19 rmse    standard    2.90     1      NA Preprocessor1_Model2
## 4     8    30 rmse    standard    2.94     1      NA Preprocessor1_Model4
## 5     2    37 rmse    standard    3.61     1      NA Preprocessor1_Model5

After Tuning the Model, lets check the predictors.

last_mod_spec<-
  rand_forest(mtry = 6, min_n  = 8, trees = 1000)%>%
  set_engine('ranger', num.threads = cores, importance = 'impurity')%>%
  set_mode('regression')
last_mod_workflow<-
  mod_workflow%>%
  update_model(last_mod_spec)
#fit the model
last_mod_fit<-
  last_mod_workflow%>%
  last_fit(sim_mod_split)


vi_mod<-vi_model(extract_fit_parsnip(last_mod_fit))
vi_mod%>%arrange(desc(Importance))

## # A tibble: 11 x 2
##    Variable Importance
##    <chr>         <dbl>
##  1 V1            9.98 
##  2 V11           1.38 
##  3 V7            0.302
##  4 V10           0.290
##  5 V6            0.270
##  6 V2            0.257
##  7 V8            0.253
##  8 V4            0.247
##  9 V3            0.230
## 10 V5            0.222
## 11 V9            0.196

The importance score did change, but all the scores dropped by an order of magnitude and the New variable seems to be sapping the percent importance of the main predictor.

fit a model with conditional inference trees

library(party)

ctrl<-cforest_control(mtry = ncol(simulated)-1)
rf_spec<-cforest(y~., data = simulated, controls = ctrl)

vip(rf_spec)

No, the order is not the same. In the conditional inference tree, V4 is the most important.

repeat this process with boosted trees and cubist, check the variable importance

Tuning and Training the models

library(rules)

## Warning: package 'rules' was built under R version 4.1.3

## 
## Attaching package: 'rules'

## The following object is masked from 'package:dials':
## 
##     max_rules

library(Cubist)

## Warning: package 'Cubist' was built under R version 4.1.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.1.2

library(baguette)

## Warning: package 'baguette' was built under R version 4.1.3

cubist_spec<-
  cubist_rules(committees = tune(), neighbors = tune(), max_rules = tune())%>%
  set_engine('Cubist')%>%
  set_mode('regression')
xgb_spec<-
  boost_tree(
    trees = 1000,
    min_n = tune(),
    mtry = tune(),
    learn_rate = .01
  )%>%
  set_engine('xgboost')%>%
  set_mode('regression')
rf_spec<-
  rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
  set_mode('regression')%>%
  set_engine('ranger' , importance = 'impurity')

norm_rec<-
  rf_recipe%>%
  step_normalize(all_predictors())

all_wflw<-
  workflow_set(
    preproc = list(simple = rf_recipe),
    models = list(rand_forest = rf_spec, boost = xgb_spec,cubist = cubist_spec),
    cross = TRUE
  )



#control grid
grid_ctrl<-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )
grid_results<-
all_wflw%>%
  workflow_map(
    seed = 1403,
    resamples = val_set,
    grid = 5,
    control = grid_ctrl
  )

## i Creating pre-processing data to finalize unknown parameter: mtry
## i Creating pre-processing data to finalize unknown parameter: mtry

## Warning: package 'xgboost' was built under R version 4.1.3

best_boost<-extract_workflow_set_result(grid_results,'simple_boost')%>%select_best(metric = 'rmse')
best_rf<-pull_workflow_set_result(grid_results,'simple_rand_forest')%>%select_best()

## Warning: `pull_workflow_set_result()` was deprecated in workflowsets 0.1.0.
## Please use `extract_workflow_set_result()` instead.

## Warning: No value of `metric` was given; metric 'rmse' will be used.

best_rf<-pull_workflow_set_result(grid_results,'simple_cubist')%>%select_best()

## Warning: `pull_workflow_set_result()` was deprecated in workflowsets 0.1.0.
## Please use `extract_workflow_set_result()` instead.
## No value of `metric` was given; metric 'rmse' will be used.

boost mod

best_boost<-extract_workflow_set_result(grid_results,'simple_boost')%>%select_best(metric = 'rmse')
boost_test_results<-
  grid_results%>%
  extract_workflow('simple_boost')%>%
  finalize_workflow(best_boost)%>%
  last_fit(split = sim_split)


vi_boost<-vi_model(extract_fit_parsnip(boost_test_results))
boost_vi_table<-vi_boost%>%arrange(desc(Importance))

rand_forest

best_boost<-extract_workflow_set_result(grid_results,'simple_rand_forest')%>%select_best(metric = 'rmse')
boost_test_results<-
  grid_results%>%
  extract_workflow('simple_rand_forest')%>%
  finalize_workflow(best_boost)%>%
  last_fit(split = sim_split)


vi_rand_forest<-vi_model(extract_fit_parsnip(boost_test_results))
rf_vi_table<-vi_rand_forest%>%arrange(desc(Importance))

cubist

best_boost<-extract_workflow_set_result(grid_results,'simple_cubist')%>%select_best(metric = 'rmse')
boost_test_results<-
  grid_results%>%
  extract_workflow('simple_cubist')%>%
  finalize_workflow(best_boost)%>%
  last_fit(split = sim_split)


vi_cubist<-vi_model(extract_fit_parsnip(boost_test_results))
cubist_vi_table<-vi_cubist%>%arrange(desc(Importance))

cubist_vi<-cbind(as.data.frame(cubist_vi_table),mod = 'cubist', rank = seq(from=1, to =10))
boost_vi<-cbind(as.data.frame(boost_vi_table), mod = 'boost',rank = seq(from=1, to =10))
rf_vi<-cbind(as.data.frame(rf_vi_table), mod = 'rf',  rank = seq(from=1, to =10))
vi_table<-rbind(cubist_vi, boost_vi, rf_vi)%>%select(-Importance)
group_means<-vi_table%>%
  group_by(Variable)%>%
  summarize(mean = mean(rank))

ggplot(vi_table, aes(x = factor(Variable, level = c('V1', 'V2', 'V3', 'V4','V5','V6','V7','V8','V9','V10')), y =rank, color = mod))+
  geom_point(alpha = .3 , position=position_jitter(h=0.15,w=0.15))+
  scale_y_continuous(breaks=seq(1,10,by=1))+
  ggtitle('Variable Rank by Model')+
  labs(y = "Rank", x = "Variable")

The above plot is the variable rank by model type.

8.2 Use a simulation to show tree bias with different granularities.

Granularity references how “pixelated” the data in predictor is. How many ‘grains’ are used to paint the picture of a continuous phenomena is a good example. Low granularity might mean that a variable ranging from 1 to 10 only has three levels of values in its range, while a highly granular variable might have 1000 ‘levels’ in between 0 and 10.

Tree models suffer from selection bias that having a higher number of distinct values, or ‘grains’ are vafored over “granular” (in this case meaning a lower number of distinct values) data.

Lets create 4 variables, where only two are predictors of the response. there will be 2 granular variables and two non-granular variables.

cont<-seq(1,10000, by=.01)
granular<-seq(1,10000, by =100)

p_cont<-sample(cont, 1000, replace = TRUE)
cont<-sample(cont, 1000, replace = TRUE)
p_gran<-sample(granular, 1000, replace = TRUE)
gran<-sample(granular, 1000, replace = TRUE)


y<-p_cont+p_gran+rnorm(1000)
dat<-data.frame(p_cont, cont,p_gran, gran,y)

train test data

# recall how to split
sim_split<-initial_split(dat)
sim_train<-training(sim_split)
sim_test<-testing(sim_split)
sim_folds<-vfold_cv(sim_train, v = 3 )

rf_rec<-
  recipe(y~., data = sim_train)

#makes a prepped version of the data if you have transformations

tune_spec<-decision_tree( cost_complexity = tune(), tree_depth = tune(), min_n = tune())%>%
  set_mode('regression')%>%
  set_engine('rpart')
ml_wflow<-
  workflow()%>%
  add_recipe(rf_rec)%>%
  add_model(tune_spec)
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
#tune
set.seed(234)
tree_rs<-ml_wflow%>%
  tune_grid(
  resamples = sim_folds,
  grid = 4, 
  metrics = metric_set(rmse, rsq, mae, mape)
  
)
best_params<-
  tree_rs%>%
  select_best(metric = 'rmse')%>%as.data.frame()
#refit using entire training set

reg_res<-
  ml_wflow%>%
  finalize_workflow(best_params)%>%
  last_fit(sim_split)
  
vi_cubist<-vi_model(extract_fit_parsnip(reg_res))
cubist_vi_table<-vi_cubist%>%arrange(desc(Importance))
cubist_vi_table

## # A tibble: 4 x 2
##   Variable  Importance
##   <chr>          <dbl>
## 1 p_gran   6973897781.
## 2 p_cont   6225687678.
## 3 cont      526678371.
## 4 gran      406667813.

I was expecting the importance of cont to be higher, but if we look at the variable importance between p_cont and p_gran, the model views p_cont as more significant even though they are equally important to the model.

8.3

These are the basic steps of a gradient boosting machine

given loss function (squared error for regression) and a weak learner (like regression trees)
find an additive model that minimizes the loss function.
initialize with best guess resposne (ex: mean of response in regression)
gradient (residual) is calculated and model is fit to the residuals to minimize the loss function
current model is added to the previous model
keep going for user-specified number of iterations

learning rate:

is what percentage of variance the posterior modeling step will use to make its next model. If this is set to one, it will make a model that will overfit to the residuals of the prior model. bagging fraction:

A boosting model will select a fraction of the training data, then the remaining steps of the current iteration are only based on that particular sample of the data.

effects of the learning rate

the high learning rate model will use the predictor most correlated with the response, as its root node, and then overfit the residuals very quickly. This will cause the tree to use less predictors in aggregate and therefore boost the overall importance of the nodes closest to the root node.

effects of the bagging fraction

The lower bagging fraction model will select much smaller subsets of the data, increasing the impact of outliers, causing the parallel trees to be more disparate and therefore spreading out the importance of the predictors.

These two factors both have the effect of exaggerating eachother. the Left model learns slowly and tries to work on bags that look very different, while the right model overfits on bags that look a lot more similar.

I would err towards the low learning rate and low bagging fraction model, The bagging fraction will most likely decrease predictive power, but the effect of overfitting on the training data set will likely have more of an impact.

Interaction depth is the number of iterations the weak learners will permute through. It will exaggerate overfitting in the right model and potentially correct for undersampling and slow learning in the left model.

8.7

Which tree-based regression gives the optimal resampling and test performance

I will use a simple validation set instead of cross validation due to the number of models we are testing.

library(AppliedPredictiveModeling)

## Warning: package 'AppliedPredictiveModeling' was built under R version 4.1.2

data("ChemicalManufacturingProcess")
dat<-ChemicalManufacturingProcess%>%na.omit()
chem_split<-initial_split(dat)
chem_train<-training(chem_split)
chem_test<-testing(chem_split)

set.seed(1502)
chem_val <- validation_split(chem_train, 
                            prop = 0.80)

chem_rec<-recipe(Yield~., data =chem_train )%>%
  step_nzv()%>%
  step_impute_knn()%>%
  step_normalize()

create model specifications

library(rules)
library(baguette)
bag_cart_spec<-
  bag_tree()%>%
  set_engine('rpart', times =50L)%>%
  set_mode('regression')
cubist_spec<-
  cubist_rules(committees = tune(), neighbors = tune(), max_rules = tune())%>%
  set_engine('Cubist')%>%
  set_mode('regression')
xgb_spec<-
  boost_tree(
    trees = 1000,
    min_n = tune(),
    mtry = tune(),
    learn_rate = .01
  )%>%
  set_engine('xgboost')%>%
  set_mode('regression')
rf_spec<-
  rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
  set_mode('regression')%>%
  set_engine('ranger' , importance = 'impurity')

norm_rec<-
  rf_recipe%>%
  step_normalize(all_predictors())

all_wflw<-
  workflow_set(
    preproc = list(simple = chem_rec),
    models = list(bagged = bag_cart_spec, rand_forest = rf_spec, boost = xgb_spec,cubist = cubist_spec),
    cross = TRUE
  )



#control grid
grid_ctrl<-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )
grid_results<-
all_wflw%>%
  workflow_map(
    seed = 1403,
    resamples = chem_val,
    grid = 5,
    control = grid_ctrl
  )


grid_results%>%
  rank_results()%>%
  filter(.metric == 'rmse')%>%
  select(model, .config, rmse = mean, rank)

## # A tibble: 16 x 4
##    model        .config               rmse  rank
##    <chr>        <chr>                <dbl> <int>
##  1 boost_tree   Preprocessor1_Model3 0.990     1
##  2 cubist_rules Preprocessor1_Model4 1.04      2
##  3 cubist_rules Preprocessor1_Model3 1.07      3
##  4 rand_forest  Preprocessor1_Model3 1.07      4
##  5 cubist_rules Preprocessor1_Model2 1.11      5
##  6 cubist_rules Preprocessor1_Model1 1.13      6
##  7 rand_forest  Preprocessor1_Model1 1.13      7
##  8 boost_tree   Preprocessor1_Model2 1.14      8
##  9 bag_tree     Preprocessor1_Model1 1.15      9
## 10 boost_tree   Preprocessor1_Model1 1.17     10
## 11 boost_tree   Preprocessor1_Model5 1.20     11
## 12 rand_forest  Preprocessor1_Model2 1.20     12
## 13 rand_forest  Preprocessor1_Model5 1.20     13
## 14 boost_tree   Preprocessor1_Model4 1.22     14
## 15 rand_forest  Preprocessor1_Model4 1.24     15
## 16 cubist_rules Preprocessor1_Model5 1.26     16

8.7

Which tree-based regression gives the optimal resampling and test performance

I will use a simple validation set instead of cross validation due to the number of models we are testing.

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dat<-ChemicalManufacturingProcess%>%na.omit()
chem_split<-initial_split(dat)
chem_train<-training(chem_split)
chem_test<-testing(chem_split)

set.seed(1502)
chem_val <- validation_split(chem_train, 
                            prop = 0.80)

chem_rec<-recipe(Yield~., data =chem_train )%>%
  step_nzv()%>%
  step_impute_knn()%>%
  step_normalize()

create model specifications

library(rules)
library(baguette)
bag_cart_spec<-
  bag_tree()%>%
  set_engine('rpart', times =50L)%>%
  set_mode('regression')
cubist_spec<-
  cubist_rules(committees = tune(), neighbors = tune(), max_rules = tune())%>%
  set_engine('Cubist')%>%
  set_mode('regression')
xgb_spec<-
  boost_tree(
    trees = 1000,
    min_n = tune(),
    mtry = tune(),
    learn_rate = .01
  )%>%
  set_engine('xgboost')%>%
  set_mode('regression')
rf_spec<-
  rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
  set_mode('regression')%>%
  set_engine('ranger' , importance = 'impurity')

norm_rec<-
  rf_recipe%>%
  step_normalize(all_predictors())

all_wflw<-
  workflow_set(
    preproc = list(simple = chem_rec),
    models = list(bagged = bag_cart_spec, rand_forest = rf_spec, boost = xgb_spec,cubist = cubist_spec),
    cross = TRUE
  )



#control grid
grid_ctrl<-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )
grid_results<-
all_wflw%>%
  workflow_map(
    seed = 1403,
    resamples = chem_val,
    grid = 5,
    control = grid_ctrl
  )


grid_results%>%
  rank_results()%>%
  filter(.metric == 'rmse')%>%
  select(model, .config, rmse = mean, rank)

## # A tibble: 16 x 4
##    model        .config               rmse  rank
##    <chr>        <chr>                <dbl> <int>
##  1 cubist_rules Preprocessor1_Model1 0.915     1
##  2 cubist_rules Preprocessor1_Model4 0.981     2
##  3 cubist_rules Preprocessor1_Model2 1.00      3
##  4 cubist_rules Preprocessor1_Model5 1.02      4
##  5 boost_tree   Preprocessor1_Model1 1.05      5
##  6 cubist_rules Preprocessor1_Model3 1.10      6
##  7 rand_forest  Preprocessor1_Model3 1.12      7
##  8 boost_tree   Preprocessor1_Model3 1.14      8
##  9 boost_tree   Preprocessor1_Model5 1.15      9
## 10 bag_tree     Preprocessor1_Model1 1.15     10
## 11 rand_forest  Preprocessor1_Model1 1.16     11
## 12 boost_tree   Preprocessor1_Model4 1.20     12
## 13 rand_forest  Preprocessor1_Model2 1.22     13
## 14 rand_forest  Preprocessor1_Model4 1.28     14
## 15 rand_forest  Preprocessor1_Model5 1.30     15
## 16 boost_tree   Preprocessor1_Model2 1.30     16

The Cubist Rules Do the best on the OOB data.

Performance

Below is the performance of the best model.

best_results<-
  grid_results%>%
  extract_workflow_set_result('simple_cubist')%>%
  select_best(metric = 'rmse')

cubist_test_results<-
  grid_results%>%
  extract_workflow('simple_cubist')%>%
  finalize_workflow(best_results)%>%
  last_fit(split = chem_split)
collect_metrics(cubist_test_results)

## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.727 Preprocessor1_Model1
## 2 rsq     standard       0.832 Preprocessor1_Model1

cubist_test_results %>% 
   collect_predictions() %>% 
   ggplot(aes(x = Yield, y = .pred)) + 
   geom_abline(color = "gray50", lty = 2) + 
   geom_point(alpha = 0.5) + 
   coord_obs_pred() + 
   labs(x = "observed", y = "predicted")

Which are the most important predictors?

vi_mod<-vi_model(extract_fit_parsnip(cubist_test_results))
vi_mod%>%arrange(desc(Importance))

## # A tibble: 57 x 2
##    Variable               Importance
##    <chr>                       <dbl>
##  1 ManufacturingProcess13       34.5
##  2 ManufacturingProcess32       32  
##  3 BiologicalMaterial03         31  
##  4 ManufacturingProcess09       27  
##  5 BiologicalMaterial05         13  
##  6 ManufacturingProcess26       11.5
##  7 ManufacturingProcess33       10  
##  8 BiologicalMaterial06          9  
##  9 BiologicalMaterial02          8  
## 10 ManufacturingProcess15        6.5
## # ... with 47 more rows

The top 4 predictors are manufacturing process. They compare favorably to my top Nonlinear MARS model, which were Manufacturing processes 32,09, 13 and 01.

However, the RMSE and Rsquared were better for the cubist model.

I have committed to learning the tidymodels framework, and after some significant research I haven’t been able to plot the terminal nodes of my model. Cubist Rules is a model tree where the terminal nodes contain regression models based on the predictors used in the previous splits, so I would expect them to be regression models using the most important predictors on some subsets of the data.