Read in the train.csv data.
set.seed).Read in the fallmembershipreport_20192020.xlsx data.
Attending School ID, School Name, and all columns that represent the race/ethnicity percentages for the schools (there is example code in recent class slides).Join the two data sets.
If you have accessed outside data to help increase the performance of your models for the final project (e.g., NCES), you can read in and join those data as well.
git <- "~/GitHub/machinelearning"
data <- import(path(git, "data/train.csv")) %>%
sample_frac(.01) %>%
mutate(ncessch = as.factor(ncessch),
enrl_grd = as.factor(enrl_grd))
bonus_data <- import(path(git, "bonus_data.csv")) %>%
mutate(enrl_grd = as.factor(enrl_grd),
ncessch = as.factor(ncessch))
race <- readxl::read_xlsx(path(git, "/data/fallmembershipreport_20192020.xlsx"), 4) %>%
select(attnd_schl_inst_id = `Attending School ID`,
sch_name = `School Name`,
contains("%")) %>%
clean_names()
names(race) <- gsub("x2019_20_percent", "p", names(race))
data <- data %>%
left_join(bonus_data) %>%
left_join(race)
Split joined data from above into a training set and test set, stratified by the outcome score.
Use 10-fold CV to resample the training set, stratified by score.
set.seed(5000)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
cv_split <- vfold_cv(data_train)
Create one recipe to prepare your data for CART, bagged tree, and random forest models.
This lab could potentially serve as a template for your Premilinary Fit 2, or your final model prediction for the Final Project, so consider applying what might be your best model formula and the necessary preprocessing steps.
rec <- recipe(score ~ ., data_train) %>%
step_mutate(tst_dt = as.numeric(lubridate::mdy_hms(tst_dt)),
lang_cd = case_when(lang_cd == "S" ~ "S", TRUE ~ "E")) %>%
update_role(contains("id"), st_schid, ncessch, new_role = "id") %>%
step_novel(all_nominal(), -all_outcomes()) %>%
step_unknown(all_nominal()) %>%
step_rm(contains("bnch"), total_n, locale, title1_status, classification) %>%
step_mutate(wealth_prop = log(wealth_prop)) %>%
step_medianimpute(all_numeric(),
-all_outcomes(),
-has_role("id")) %>%
step_normalize(
all_numeric(),
-all_outcomes(),
-has_role("id")) %>%
step_dummy(all_nominal(), -has_role("id")) %>%
step_nzv(all_predictors(), freq_cut = 95/5) %>%
step_interact(terms = ~lat:lon) %>%
step_ns(tst_dt, deg_free = 4)
parsnip CART model using{rpart} for the estimation, tuning the cost complexity and minimum \(n\) for a terminal node.cart_tune <- decision_tree() %>%
set_mode("regression") %>%
set_args(cost_complexity = tune(),
min_n = tune()) %>%
set_engine("rpart")
workflow object that combines your recipe and your parsnip objects.cart_wflow <- workflow() %>%
add_recipe(rec) %>%
add_model(cart_tune)
tune_gridgrid = 10 to choose 10 grid points automaticallymetrics argument, please include rmse, rsq, and huber_loss{tictoc}, or you could do something like:cl <- (parallel::detectCores())
doParallel::registerDoParallel(cl)
tic()
#code to fit model
cart_grid <- tune_grid(cart_wflow,
cv_split,
grid = 10,
metrics = yardstick::metric_set(rmse, rsq, huber_loss))
toc()
## 9.266 sec elapsed
show_best(cart_grid, metric = "rmse", n = 5)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 rmse standard 109. 10 2.16
show_best(cart_grid, metric = "rsq", n = 5)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 rsq standard 0.216 10 0.0220
show_best(cart_grid, metric = "huber_loss", n = 5)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 huber_loss standard 83.6 10 1.36
parsnip bagged tree model using{baguette}cost_complexity and min_nbag_tune <- bag_tree() %>%
set_mode("regression") %>%
set_args(cost_complexity = tune(),
min_n = tune()) %>%
set_engine("rpart", times = 10)
workflow object that combines your recipe and your bagged tree model specification.bag_wflow <- workflow() %>%
add_recipe(rec) %>%
add_model(bag_tune)
tune_gridgrid = 10 to choose 10 grid points automaticallymetrics argument, please include rmse, rsq, and huber_losscontrol argument, please include extract = function(x) extract_model(x) to extract the model from each fit{baguette} is optimized to run in parallel with the {future} package. Consider using {future} to speed up processing time (see the class slides)start_rf <- Sys.time()
bag_grid <- tune_grid(bag_wflow,
cv_split,
grid = 10,
metrics = yardstick::metric_set(rmse, rsq, huber_loss),
control = control_resamples(extract = function(x) extract_model(x)))
end_rf <- Sys.time()
end_rf - start_rf
## Time difference of 3.061337 mins
show_best(bag_grid, metric = "rmse", n = 1)
## # A tibble: 1 x 8
## cost_complexity min_n .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00000000690 32 rmse standard 99.0 10 1.55 Model08
show_best(bag_grid, metric = "rsq", n = 1)
## # A tibble: 1 x 8
## cost_complexity min_n .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00000000690 32 rsq standard 0.299 10 0.0168 Model08
show_best(bag_grid, metric = "huber_loss", n = 1)
## # A tibble: 1 x 8
## cost_complexity min_n .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00000774 36 huber_loss standard 76.0 10 0.954 Model03
bag_roots function below. Apply this function to the extracted bagged tree models from the previous step. This will output the feature at the root node for each of the decision trees fit.bag_roots <- function(x){
x %>%
select(.extracts) %>%
unnest(cols = c(.extracts)) %>%
mutate(models = map(.extracts,
~.x$model_df)) %>%
select(-.extracts) %>%
unnest(cols = c(models)) %>%
mutate(root = map_chr(model,
~as.character(.x$fit$frame[1, 1]))) %>%
select(root)
}
roots <- bag_roots(bag_grid)
ggplot(roots, aes(x = root)) +
geom_bar() +
coord_flip()

parsnip random forest model using {ranger}importance = "permutation" argument to run variable importancerfrst_tune <- rand_forest() %>%
set_mode("regression") %>%
set_args(trees = 1000) %>%
set_engine("ranger",
num.threads = 4,
importance = "permutation")
workflow object that combines your recipe and your random forest model specification.rfrst_wflow <-workflow() %>%
add_recipe(rec) %>%
add_model(rfrst_tune)
metrics argument, please include rmse, rsq, and huber_losscontrol argument, please include extract = function(x) x to extract the workflow from each fitstart_rf <- Sys.time()
rfrst_grid <- fit_resamples(rfrst_wflow,
cv_split,
metrics = yardstick::metric_set(rmse, rsq, huber_loss),
control = control_resamples(extract = function(x) x))
end_rf <- Sys.time()
end_rf - start_rf
## Time difference of 43.37589 secs
show_best(rfrst_grid, metric = "rmse", n = 1)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 rmse standard 99.6 10 1.81
show_best(rfrst_grid, metric = "rsq", n = 1)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 rsq standard 0.291 10 0.0241
show_best(rfrst_grid, metric = "huber_loss", n = 1)
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 huber_loss standard 76.5 10 1.15
collect_metrics(rfrst_grid)
## # A tibble: 3 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 huber_loss standard 76.5 10 1.15
## 2 rmse standard 99.6 10 1.81
## 3 rsq standard 0.291 10 0.0241
rf_roots function to the results of your random forest model to output the feature at the root node for each of the decision trees fit in your random forest model.rf_tree_roots <- function(x){
map_chr(1:1000,
~ranger::treeInfo(x, tree = .)[1, "splitvarName"])
}
rf_roots <- function(x){
x %>%
select(.extracts) %>%
unnest(cols = c(.extracts)) %>%
mutate(fit = map(.extracts,
~.x$fit$fit$fit),
oob_rmse = map_dbl(fit,
~sqrt(.x$prediction.error)),
roots = map(fit,
~rf_tree_roots(.))
) %>%
select(roots) %>%
unnest(cols = c(roots))
}
rf_roots(rfrst_grid) %>%
ggplot(aes(x = roots)) +
geom_bar() +
coord_flip()

The random forest model is purposefully ommiting random variables when running new models to create more variation in the root node. Bagging is only creating variation in its sampling process and averaging across the different resampled models.
fit function to your random forest workflow object and your full training data. In class we talked about the idea that bagged tree and random forest models use resampling, and one could use the OOB prediction error provided by the models to estimate model performance.sqrt() of this value to get the rmse. Or you can extract it by running: sqrt(fit-object-name-here$fit$fit$fit$prediction.error).tic()
rfrst_fit <- fit(rfrst_wflow,
data = data_train)
toc()
## 4.273 sec elapsed
pluck(sqrt(rfrst_fit$fit$fit$fit$prediction.error))
## [1] 99.98177
The OOB estimate is very similar the the mean RSME in the CV random forest. The 10-fold helps reduce variance in the training data so it is more likely to approximate the test data set.
Consider the four models you fit: (a) decision tree, (b) bagged tree, (c) random forest fit on resamples, and (d) random forest fit on the training data. Which model would you use for your final fit? Please consider the performance metrics as well as the run time, and briefly explain your decision.
The decision tree demonstrated the best fit, but I am somewhat concerned that it might be overfitting to the data. I would be more likely to trust the random forest model based on its lower runtime compared to the bagged tree and slightly lower RMSE.