8.1, 8.2, 8.3, 8.7
Recreate the simulated data.
library(mlbench)
library(tidymodels)
cores<-parallel::detectCores()
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
sim_split<-initial_split(simulated)
sim_train<-training(sim_split)
sim_test<-testing(sim_split)
a
Fit a random forest model to all of the predictors, then estimate the variable importance scores.
library(tidymodels)
library(vip)
library(tidyverse)
sim_rec<-recipe(y~., data = simulated)
rf_spec<-
rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
set_mode('regression')%>%
set_engine('ranger')
rf_recipe<-
recipe(y~., data = sim_train)
rf_workflow<-
workflow()%>%
add_model(rf_spec)%>%
add_recipe(rf_recipe)
#show what will be tuned
extract_parameter_set_dials(rf_spec)
## Collection of 2 parameters for tuning
##
## identifier type object
## mtry mtry nparam[?]
## min_n min_n nparam[+]
##
## Model parameters needing finalization:
## # Randomly Selected Predictors ('mtry')
##
## See `?dials::finalize` or `?dials::update.parameters` for more information.
Tune the model.
#create a validation set from the training data (faster than full cross-validation)
val_set <- validation_split(sim_train,
prop = 0.80)
rf_res<-
rf_workflow%>%
tune_grid(val_set,
grid=5,
control = control_grid(save_pred =TRUE),
metrics = metric_set(rmse))
last_rf_spec<-
rand_forest(mtry = 8, min_n = 7, trees = 1000)%>%
set_engine('ranger', num.threads = cores, importance = 'impurity')%>%
set_mode('regression')
last_rf_workflow<-
rf_workflow%>%
update_model(last_rf_spec)
#fit the model
last_rf_fit<-
last_rf_workflow%>%
last_fit(sim_split)
vi_<-vi_model(extract_fit_parsnip(last_rf_fit))
vi_%>%arrange(desc(Importance))
## # A tibble: 10 x 2
## Variable Importance
## <chr> <dbl>
## 1 V1 974.
## 2 V4 948.
## 3 V2 860.
## 4 V5 260.
## 5 V3 133.
## 6 V7 99.1
## 7 V6 96.0
## 8 V10 62.6
## 9 V8 58.9
## 10 V9 49.2
The model did not make significant use of the uninformative predictors (V6-V10): their importance scores are an order of magnitude below those of the informative predictors.
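For a quick visual check (a small sketch, using the vip package already loaded above), the same importances can be plotted directly from the fitted workflow:
#plot the impurity-based importances of the final random forest fit
last_rf_fit%>%
extract_fit_parsnip()%>%
vip(num_features = 10)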
b
Add an additional predictor that is highly correlated with one of the informative predictors
First I'll add the correlated column and re-split the data.
set.seed(200)
simulated_mod <- mlbench.friedman1(200, sd = 1)
simulated_mod <- cbind(simulated_mod$x, simulated_mod$y)
simulated_mod <- as.data.frame(simulated_mod)
colnames(simulated_mod)[ncol(simulated_mod)] <- "y"
#rename the response first, then add a predictor highly correlated with V1
simulated_mod <- simulated_mod%>%mutate(V1_mod = V1+rnorm(200)*.1)
sim_mod_split<-initial_split(simulated_mod)
sim_mod_train<-training(sim_mod_split)
sim_mod_test<-testing(sim_mod_split)
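As a quick check (a small sketch) that the new column really is highly correlated with V1:
#correlation between the original predictor and its noisy copy
cor(simulated_mod$V1, simulated_mod$V1_mod)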
Remaking the model with the additional predictor.
mod_rec<-
recipe(y~., data = sim_mod_train)
mod_spec<-
rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
set_mode('regression')%>%
set_engine('ranger')
mod_workflow<-
workflow()%>%
add_model(mod_spec)%>%
add_recipe(mod_rec)
#validation set built from the modified training data
mod_val_set <- validation_split(sim_mod_train,
prop = 0.80)
mod_res<-
mod_workflow%>%
tune_grid(mod_val_set,
grid=5,
control = control_grid(save_pred =TRUE),
metrics = metric_set(rmse))
mod_res%>%show_best(metric = 'rmse')
## # A tibble: 5 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 5 6 rmse standard 2.62 1 NA Preprocessor1_Model3
## 2 9 17 rmse standard 2.71 1 NA Preprocessor1_Model1
## 3 4 19 rmse standard 2.90 1 NA Preprocessor1_Model2
## 4 8 30 rmse standard 2.94 1 NA Preprocessor1_Model4
## 5 2 37 rmse standard 3.61 1 NA Preprocessor1_Model5
After tuning the model, let's check the predictor importances.
last_mod_spec<-
rand_forest(mtry = 6, min_n = 8, trees = 1000)%>%
set_engine('ranger', num.threads = cores, importance = 'impurity')%>%
set_mode('regression')
last_mod_workflow<-
mod_workflow%>%
update_model(last_mod_spec)
#fit the model
last_mod_fit<-
last_mod_workflow%>%
last_fit(sim_mod_split)
vi_mod<-vi_model(extract_fit_parsnip(last_mod_fit))
vi_mod%>%arrange(desc(Importance))
## # A tibble: 11 x 2
## Variable Importance
## <chr> <dbl>
## 1 V1 9.98
## 2 V11 1.38
## 3 V7 0.302
## 4 V10 0.290
## 5 V6 0.270
## 6 V2 0.257
## 7 V8 0.253
## 8 V4 0.247
## 9 V3 0.230
## 10 V5 0.222
## 11 V9 0.196
The importance scores did change: all of the scores dropped by roughly an order of magnitude, and the new correlated variable appears to be sapping importance from the main predictor.
c
Fit a model with conditional inference trees.
library(party)
ctrl<-cforest_control(mtry = ncol(simulated)-1)
cfor_fit<-cforest(y~., data = simulated, controls = ctrl)
vip(cfor_fit)
No, the order is not the same. In the conditional inference tree, V4 is the most important.
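party can also compute a conditional permutation importance that adjusts for correlated predictors; a minimal sketch using the cforest fit above (this can be slow on larger data):
#conditional permutation importance; adjusts for correlations among predictors
party::varimp(cfor_fit, conditional = TRUE)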
d
Repeat this process with boosted trees and Cubist, then check the variable importance.
library(rules)
##
## Attaching package: 'rules'
## The following object is masked from 'package:dials':
##
## max_rules
library(Cubist)
## Loading required package: lattice
library(baguette)
cubist_spec<-
cubist_rules(committees = tune(), neighbors = tune(), max_rules = tune())%>%
set_engine('Cubist')%>%
set_mode('regression')
xgb_spec<-
boost_tree(
trees = 1000,
min_n = tune(),
mtry = tune(),
learn_rate = .01
)%>%
set_engine('xgboost')%>%
set_mode('regression')
rf_spec<-
rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
set_mode('regression')%>%
set_engine('ranger' , importance = 'impurity')
norm_rec<-
rf_recipe%>%
step_normalize(all_predictors())
all_wflw<-
workflow_set(
preproc = list(simple = rf_recipe),
models = list(rand_forest = rf_spec, boost = xgb_spec,cubist = cubist_spec),
cross = TRUE
)
#control grid
grid_ctrl<-
control_grid(
save_pred = TRUE,
parallel_over = 'everything',
save_workflow = TRUE
)
grid_results<-
all_wflw%>%
workflow_map(
seed = 1403,
resamples = val_set,
grid = 5,
control = grid_ctrl
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## i Creating pre-processing data to finalize unknown parameter: mtry
best_boost<-extract_workflow_set_result(grid_results,'simple_boost')%>%select_best(metric = 'rmse')
best_rf<-extract_workflow_set_result(grid_results,'simple_rand_forest')%>%select_best(metric = 'rmse')
best_cubist<-extract_workflow_set_result(grid_results,'simple_cubist')%>%select_best(metric = 'rmse')
Boosted tree model
boost_test_results<-
grid_results%>%
extract_workflow('simple_boost')%>%
finalize_workflow(best_boost)%>%
last_fit(split = sim_split)
vi_boost<-vi_model(extract_fit_parsnip(boost_test_results))
boost_vi_table<-vi_boost%>%arrange(desc(Importance))
Random forest model
rf_test_results<-
grid_results%>%
extract_workflow('simple_rand_forest')%>%
finalize_workflow(best_rf)%>%
last_fit(split = sim_split)
vi_rand_forest<-vi_model(extract_fit_parsnip(rf_test_results))
rf_vi_table<-vi_rand_forest%>%arrange(desc(Importance))
Cubist model
cubist_sim_results<-
grid_results%>%
extract_workflow('simple_cubist')%>%
finalize_workflow(best_cubist)%>%
last_fit(split = sim_split)
vi_cubist<-vi_model(extract_fit_parsnip(cubist_sim_results))
cubist_vi_table<-vi_cubist%>%arrange(desc(Importance))
cubist_vi<-cbind(as.data.frame(cubist_vi_table),mod = 'cubist', rank = seq(from=1, to =10))
boost_vi<-cbind(as.data.frame(boost_vi_table), mod = 'boost',rank = seq(from=1, to =10))
rf_vi<-cbind(as.data.frame(rf_vi_table), mod = 'rf', rank = seq(from=1, to =10))
vi_table<-rbind(cubist_vi, boost_vi, rf_vi)%>%select(-Importance)
group_means<-vi_table%>%
group_by(Variable)%>%
summarize(mean = mean(rank))
ggplot(vi_table, aes(x = factor(Variable, level = c('V1', 'V2', 'V3', 'V4','V5','V6','V7','V8','V9','V10')), y =rank, color = mod))+
geom_point(alpha = .3 , position=position_jitter(h=0.15,w=0.15))+
scale_y_continuous(breaks=seq(1,10,by=1))+
ggtitle('Variable Rank by Model')+
labs(y = "Rank", x = "Variable")
The above plot shows the variable rank by model type.
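Since group_means was computed above, it can also be printed to compare the average rank of each variable across the three models (a small sketch; output not shown):
#average rank per variable across the cubist, boosted, and random forest models
group_means%>%arrange(mean)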
8.2
Granularity refers to how finely a predictor is measured: how many 'grains' are used to paint the picture of a continuous phenomenon. A low-granularity variable ranging from 1 to 10 might take only three distinct values in that range, while a highly granular variable might take 1,000 distinct values between 0 and 10.
Tree models suffer from a selection bias: predictors with a higher number of distinct values are favored over predictors with fewer distinct values, even when both are equally related to the response.
Let's create four variables, where only two are predictors of the response: two coarse, low-granularity variables (p_gran and gran below) and two fine-grained continuous variables (p_cont and cont).
cont<-seq(1,10000, by=.01)
granular<-seq(1,10000, by =100)
p_cont<-sample(cont, 1000, replace = TRUE)
cont<-sample(cont, 1000, replace = TRUE)
p_gran<-sample(granular, 1000, replace = TRUE)
gran<-sample(granular, 1000, replace = TRUE)
y<-p_cont+p_gran+rnorm(1000)
dat<-data.frame(p_cont, cont,p_gran, gran,y)
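A quick way to see the difference in granularity is to count the distinct values in each column (a small check; output not shown):
#the coarse variables should have far fewer distinct values than the continuous ones
sapply(dat[, c('p_cont', 'cont', 'p_gran', 'gran')], dplyr::n_distinct)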
Train/test split
# recall how to split
sim_split<-initial_split(dat)
sim_train<-training(sim_split)
sim_test<-testing(sim_split)
sim_folds<-vfold_cv(sim_train, v = 3 )
rf_rec<-
recipe(y~., data = sim_train)
#makes a prepped version of the data if you have transformations
tune_spec<-decision_tree( cost_complexity = tune(), tree_depth = tune(), min_n = tune())%>%
set_mode('regression')%>%
set_engine('rpart')
ml_wflow<-
workflow()%>%
add_recipe(rf_rec)%>%
add_model(tune_spec)
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
#tune
set.seed(234)
tree_rs<-ml_wflow%>%
tune_grid(
resamples = sim_folds,
grid = 4,
metrics = metric_set(rmse, rsq, mae, mape)
)
best_params<-
tree_rs%>%
select_best(metric = 'rmse')%>%as.data.frame()
#refit using entire training set
reg_res<-
ml_wflow%>%
finalize_workflow(best_params)%>%
last_fit(sim_split)
vi_tree<-vi_model(extract_fit_parsnip(reg_res))
tree_vi_table<-vi_tree%>%arrange(desc(Importance))
tree_vi_table
## # A tibble: 4 x 2
## Variable Importance
## <chr> <dbl>
## 1 p_gran 6973897781.
## 2 p_cont 6225687678.
## 3 cont 526678371.
## 4 gran 406667813.
I was expecting the importance of cont to be higher relative to gran. Between the two informative predictors, p_gran and p_cont receive similar importance scores, which makes sense since they contribute equally to the response; among the uninformative predictors, the fine-grained cont does rank above the coarse gran, consistent with the selection bias toward predictors with more distinct values.
8.3
a
These are two of the key tuning parameters of a gradient boosting machine:
learning rate:
The fraction of each new tree's predicted contribution that is added to the running model. If this is set to one, each step fully fits the residuals of the previous model and the ensemble overfits quickly.
bagging fraction:
At each iteration, a boosting model selects a random fraction of the training data, and the tree built in that iteration is based only on that sample.
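As an illustration only (a minimal sketch with hypothetical values, not the settings from the figure), both knobs have direct parsnip arguments when using the xgboost engine: learn_rate for the learning rate and sample_size for the bagging fraction.
#hypothetical spec: small learning rate, half of the training rows sampled per tree
gbm_like_spec<-
boost_tree(trees = 1000, learn_rate = 0.1, sample_size = 0.5)%>%
set_engine('xgboost')%>%
set_mode('regression')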
Effects of the learning rate
The high-learning-rate model will use the predictor most correlated with the response as its root node and then fit the residuals very quickly. This causes the ensemble to use fewer predictors in aggregate and therefore concentrates importance on the predictors chosen closest to the root node.
Effects of the bagging fraction
The model with the lower bagging fraction selects much smaller subsets of the data, increasing the influence of individual observations; the resulting trees are more disparate, which spreads importance across more predictors.
These two factors exaggerate each other: the left model learns slowly and works on bags that look very different from one another, while the right model overfits on bags that look much more similar.
b
I would err toward the model with the low learning rate and low bagging fraction. The small bagging fraction will probably cost some predictive power, but the other model's overfitting on the training set is likely to have a bigger impact on new samples.
c
Interaction depth is the maximum depth of each individual tree, i.e. how many splits each weak learner is allowed to make. Increasing it would exaggerate the overfitting in the right-hand model, and could partially compensate for the small bags and slow learning in the left-hand model; in general, deeper trees also tend to spread importance across more predictors.
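In parsnip this roughly corresponds to the tree_depth argument of boost_tree(); a minimal sketch with a hypothetical value:
#hypothetical spec: deeper weak learners (tree_depth maps to xgboost's max_depth)
deep_spec<-
boost_tree(trees = 1000, tree_depth = 6, learn_rate = 0.1)%>%
set_engine('xgboost')%>%
set_mode('regression')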
8.7
a
Which tree-based regression model gives the optimal resampling and test set performance?
I will use a simple validation set instead of cross-validation due to the number of models we are testing.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dat<-ChemicalManufacturingProcess%>%na.omit()
chem_split<-initial_split(dat)
chem_train<-training(chem_split)
chem_test<-testing(chem_split)
set.seed(1502)
chem_val <- validation_split(chem_train,
prop = 0.80)
chem_rec<-recipe(Yield~., data = chem_train)%>%
step_nzv(all_predictors())%>%
step_impute_knn(all_predictors())%>%
step_normalize(all_predictors())
Create the model specifications
library(rules)
library(baguette)
bag_cart_spec<-
bag_tree()%>%
set_engine('rpart', times =50L)%>%
set_mode('regression')
cubist_spec<-
cubist_rules(committees = tune(), neighbors = tune(), max_rules = tune())%>%
set_engine('Cubist')%>%
set_mode('regression')
xgb_spec<-
boost_tree(
trees = 1000,
min_n = tune(),
mtry = tune(),
learn_rate = .01
)%>%
set_engine('xgboost')%>%
set_mode('regression')
rf_spec<-
rand_forest(mtry = tune(), min_n=tune(), trees = 1000 )%>%
set_mode('regression')%>%
set_engine('ranger' , importance = 'impurity')
all_wflw<-
workflow_set(
preproc = list(simple = chem_rec),
models = list(bagged = bag_cart_spec, rand_forest = rf_spec, boost = xgb_spec,cubist = cubist_spec),
cross = TRUE
)
#control grid
grid_ctrl<-
control_grid(
save_pred = TRUE,
parallel_over = 'everything',
save_workflow = TRUE
)
grid_results<-
all_wflw%>%
workflow_map(
seed = 1403,
resamples = chem_val,
grid = 5,
control = grid_ctrl
)
grid_results%>%
rank_results()%>%
filter(.metric == 'rmse')%>%
select(model, .config, rmse = mean, rank)
## # A tibble: 16 x 4
## model .config rmse rank
## <chr> <chr> <dbl> <int>
## 1 cubist_rules Preprocessor1_Model1 0.915 1
## 2 cubist_rules Preprocessor1_Model4 0.981 2
## 3 cubist_rules Preprocessor1_Model2 1.00 3
## 4 cubist_rules Preprocessor1_Model5 1.02 4
## 5 boost_tree Preprocessor1_Model1 1.05 5
## 6 cubist_rules Preprocessor1_Model3 1.10 6
## 7 rand_forest Preprocessor1_Model3 1.12 7
## 8 boost_tree Preprocessor1_Model3 1.14 8
## 9 boost_tree Preprocessor1_Model5 1.15 9
## 10 bag_tree Preprocessor1_Model1 1.15 10
## 11 rand_forest Preprocessor1_Model1 1.16 11
## 12 boost_tree Preprocessor1_Model4 1.20 12
## 13 rand_forest Preprocessor1_Model2 1.22 13
## 14 rand_forest Preprocessor1_Model4 1.28 14
## 15 rand_forest Preprocessor1_Model5 1.30 15
## 16 boost_tree Preprocessor1_Model2 1.30 16
The Cubist model does best on the held-out validation data.
Below is the test set performance of the best model.
best_results<-
grid_results%>%
extract_workflow_set_result('simple_cubist')%>%
select_best(metric = 'rmse')
cubist_test_results<-
grid_results%>%
extract_workflow('simple_cubist')%>%
finalize_workflow(best_results)%>%
last_fit(split = chem_split)
collect_metrics(cubist_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.727 Preprocessor1_Model1
## 2 rsq standard 0.832 Preprocessor1_Model1
cubist_test_results %>%
collect_predictions() %>%
ggplot(aes(x = Yield, y = .pred)) +
geom_abline(color = "gray50", lty = 2) +
geom_point(alpha = 0.5) +
coord_obs_pred() +
labs(x = "observed", y = "predicted")
b
Which are the most important predictors?
vi_mod<-vi_model(extract_fit_parsnip(cubist_test_results))
vi_mod%>%arrange(desc(Importance))
## # A tibble: 57 x 2
## Variable Importance
## <chr> <dbl>
## 1 ManufacturingProcess13 34.5
## 2 ManufacturingProcess32 32
## 3 BiologicalMaterial03 31
## 4 ManufacturingProcess09 27
## 5 BiologicalMaterial05 13
## 6 ManufacturingProcess26 11.5
## 7 ManufacturingProcess33 10
## 8 BiologicalMaterial06 9
## 9 BiologicalMaterial02 8
## 10 ManufacturingProcess15 6.5
## # ... with 47 more rows
Three of the top four predictors are manufacturing processes (13, 32, and 09). This compares favorably to my nonlinear MARS model, whose top predictors were ManufacturingProcess 32, 09, 13, and 01.
However, the RMSE and R-squared were better for the Cubist model.
c
I have committed to learning the tidymodels framework, and after some significant research I haven't found a way to plot the terminal nodes of this model. Cubist is a rule-based model in which each rule's terminal node contains a linear regression model built from the predictors used in the preceding splits, so I would expect the terminal nodes to be linear models of the most important predictors fit on subsets of the data.
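One workaround (a sketch; output not shown) is to pull the underlying Cubist object out of the parsnip fit and print its summary, which lists each rule together with its terminal linear model:
#the raw Cubist engine object lives in the $fit slot of the parsnip model
cubist_engine<-extract_fit_parsnip(cubist_test_results)$fit
summary(cubist_engine)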