I’ve been publishing screencasts demonstrating how to use the tidymodels framework, from first steps in modeling to how to tune more complex models. Today, I’m using this week’s TidyTuesday dataset on The Office to show how to build a lasso regression model and choose regularization parameters!
library(tidyverse)
## -- Attaching packages -------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages ------------------- tidymodels 0.1.0 --
## v broom 0.5.6 v rsample 0.0.6
## v dials 0.0.6 v tune 0.1.0
## v infer 0.5.1 v workflows 0.1.1
## v parsnip 0.1.1 v yardstick 0.0.6
## v recipes 0.1.12
## -- Conflicts ---------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dials::margin() masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
library(scales)
library(ggthemes)
theme_set(theme_light())
ratings_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv")
## Parsed with column specification:
## cols(
## season = col_double(),
## episode = col_double(),
## title = col_character(),
## imdb_rating = col_double(),
## total_votes = col_double(),
## air_date = col_date(format = "")
## )
remove_regex <- "[:punct:]|[:digit:]|parts|part|the|and"
office_ratings <- ratings_raw %>%
transmute(
episode_name = str_to_lower(title),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name),
imdb_rating
)
office_info <- schrute::theoffice %>%
mutate(
season = as.numeric(season),
episode = as.numeric(episode),
episode_name = str_to_lower(episode_name),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name)
) %>%
select(season, episode, episode_name, director, writer, character)
office_info
## # A tibble: 55,130 x 6
## season episode episode_name director writer character
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 2 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Jim
## 3 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 4 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Jim
## 5 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 6 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 7 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 8 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Pam
## 9 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 10 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Pam
## # ... with 55,120 more rows
We are going to use several different kinds of features for modeling. First, let’s find out how many times characters speak per episode.
characters <- office_info %>%
count(episode_name, character) %>%
add_count(character, wt = n, name = 'character_count') %>%
filter(character_count > 800) %>%
select(-character_count) %>%
pivot_wider(
names_from = character,
values_from = n,
values_fill = list(n = 0)
)
characters
## # A tibble: 185 x 16
## episode_name Andy Angela Darryl Dwight Jim Kelly Kevin Michael Oscar Pam
## <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 a benihana ~ 28 37 3 61 44 5 14 108 1 57
## 2 aarm 44 39 30 87 89 0 30 0 28 34
## 3 after hours 20 11 14 60 55 8 4 0 10 15
## 4 alliance 0 7 0 47 49 0 3 68 14 22
## 5 angry y 53 7 5 16 19 13 9 0 7 29
## 6 baby shower 13 13 9 35 27 2 4 79 3 25
## 7 back from v~ 3 4 6 22 25 0 5 70 0 33
## 8 banker 1 2 0 17 0 0 2 44 0 5
## 9 basketball 0 3 15 25 21 0 1 104 2 14
## 10 beach games 18 8 0 38 22 9 5 105 5 23
## # ... with 175 more rows, and 5 more variables: Phyllis <int>, Ryan <int>,
## # Toby <int>, Erin <int>, Jan <int>
Next, let’s find which directors and writers are involved in each episode. I’m choosing here to combine directors and writers into one category for modeling, for a simpler model, since these are often the same individuals.
creators <- office_info %>%
distinct(episode_name, director, writer) %>%
pivot_longer(director:writer, names_to = 'role', values_to = 'person') %>%
separate_rows(person, sep = ';') %>%
add_count(person) %>%
filter(n > 10) %>%
distinct(episode_name, person) %>%
mutate(person_value = 1) %>%
pivot_wider(
names_from = person,
values_from = person_value,
values_fill = list(person_value = 0)
)
creators
## # A tibble: 135 x 14
## episode_name `Ken Kwapis` `Greg Daniels` `B.J. Novak` `Paul Lieberste~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 pilot 1 1 0 0
## 2 diversity d~ 1 0 1 0
## 3 health care 0 0 0 1
## 4 basketball 0 1 0 0
## 5 hot girl 0 0 0 0
## 6 dundies 0 1 0 0
## 7 sexual hara~ 1 0 1 0
## 8 office olym~ 0 0 0 0
## 9 fire 1 0 1 0
## 10 halloween 0 1 0 0
## # ... with 125 more rows, and 9 more variables: `Mindy Kaling` <dbl>, `Paul
## # Feig` <dbl>, `Gene Stupnitsky` <dbl>, `Lee Eisenberg` <dbl>, `Jennifer
## # Celotta` <dbl>, `Randall Einhorn` <dbl>, `Brent Forrester` <dbl>, `Jeffrey
## # Blitz` <dbl>, `Justin Spitzer` <dbl>
Next, let’s find the season and episode number for each episode, and then finally let’s put it all together into one dataset for modeling.
office <- office_info %>%
distinct(season, episode, episode_name) %>%
inner_join(characters) %>%
inner_join(creators) %>%
inner_join(office_ratings %>%
select(episode_name, imdb_rating)) %>%
janitor::clean_names()
## Joining, by = "episode_name"
## Joining, by = "episode_name"
## Joining, by = "episode_name"
office
## # A tibble: 135 x 32
## season episode episode_name andy angela darryl dwight jim kelly kevin
## <dbl> <dbl> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 pilot 0 1 0 29 36 0 1
## 2 1 2 diversity d~ 0 4 0 17 25 2 8
## 3 1 3 health care 0 5 0 62 42 0 6
## 4 1 5 basketball 0 3 15 25 21 0 1
## 5 1 6 hot girl 0 3 0 28 55 0 5
## 6 2 1 dundies 0 1 1 32 32 7 1
## 7 2 2 sexual hara~ 0 2 9 11 16 0 6
## 8 2 3 office olym~ 0 6 0 55 55 0 9
## 9 2 4 fire 0 17 0 65 51 4 5
## 10 2 5 halloween 0 13 0 33 30 3 2
## # ... with 125 more rows, and 22 more variables: michael <int>, oscar <int>,
## # pam <int>, phyllis <int>, ryan <int>, toby <int>, erin <int>, jan <int>,
## # ken_kwapis <dbl>, greg_daniels <dbl>, b_j_novak <dbl>,
## # paul_lieberstein <dbl>, mindy_kaling <dbl>, paul_feig <dbl>,
## # gene_stupnitsky <dbl>, lee_eisenberg <dbl>, jennifer_celotta <dbl>,
## # randall_einhorn <dbl>, brent_forrester <dbl>, jeffrey_blitz <dbl>,
## # justin_spitzer <dbl>, imdb_rating <dbl>
There are lots of great examples of EDA on Twitter; I especially encourage you to check out the screencast of my coauthor Dave, which is similar in spirit to the modeling I am showing here and includes more EDA. Just for kicks, let’s look at one graph.
office %>%
ggplot(aes(x = episode, y = imdb_rating, fill = as.factor(episode))) +
geom_boxplot(show.legend = FALSE)
Ratings are higher for episodes later in the season. What else is associated with higher ratings? Let’s use lasso regression to find out!
We can start by loading the tidymodels metapackage and splitting our data into training and testing sets.
library(tidymodels)
office_split <- initial_split(office, strata = season)
office_train <- training(office_split)
office_test <- testing(office_split)
Then, we build a recipe for data preprocessing.
- First, we must tell the recipe() what our model is going to be (using a formula here) and what our training data is.
- Next, we update the role for episode_name, since this is a variable we might like to keep around for convenience as an identifier for rows but is not a predictor or outcome.
- Next, we remove any numeric variables that have zero variance.
- As a last step, we normalize (center and scale) the numeric variables.
The object office_rec below is a recipe that has not been trained on data yet (for example, the centering and scaling have not been calculated), while office_prep is an object that has been trained on data. The reason I use strings_as_factors = FALSE here is that my ID column episode_name is of type character (as opposed to, say, integers).
office_rec <- recipe(imdb_rating ~ ., data = office_train) %>%
update_role(episode_name, new_role = 'id') %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric(), -all_outcomes())
office_prep <- office_rec %>%
prep(strings_as_factors = FALSE)
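If you want to double-check what the preprocessing actually did to the training data (which columns step_zv() dropped, how the numeric predictors were centered and scaled), you can juice() the prepped recipe. This is just a quick sanity check, not part of the modeling pipeline itself:
office_prep %>%
  juice() %>%
  glimpse()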
Now it’s time to specify and then fit our models. Here I set up one model specification for lasso regression; I picked a value for penalty (sort of randomly) and I set mixture = 1 for lasso. I am using a workflow() in this example for convenience; these are objects that can help you manage modeling pipelines more easily, with pieces that fit together like Lego blocks. You can fit() a workflow, much like you can fit a model, and then you can pull out the fit object and tidy() it.
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine('glmnet')
wf <- workflow() %>% add_recipe(office_rec)
lasso_fit <- wf %>% add_model(lasso_spec) %>% fit(data = office_train)
lasso_fit %>% pull_workflow_fit() %>% tidy()
## # A tibble: 1,597 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 8.40 0.197 0
## 2 (Intercept) 2 8.40 0.180 0.0254
## 3 michael 2 0.0176 0.180 0.0254
## 4 (Intercept) 3 8.40 0.164 0.0465
## 5 michael 3 0.0336 0.164 0.0465
## 6 (Intercept) 4 8.40 0.149 0.0721
## 7 angela 4 0.00736 0.149 0.0721
## 8 michael 4 0.0478 0.149 0.0721
## 9 (Intercept) 5 8.40 0.136 0.0994
## 10 angela 5 0.0199 0.136 0.0994
## # ... with 1,587 more rows
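Just as a quick detour, the tidied glmnet output above already contains the whole regularization path, so we can sketch how the coefficients shrink toward zero as the penalty grows using nothing but the tibble we just printed:
lasso_fit %>%
  pull_workflow_fit() %>%
  tidy() %>%
  filter(term != '(Intercept)') %>%
  ggplot(aes(lambda, estimate, group = term)) +
  geom_line(alpha = 0.6) +
  scale_x_log10() +
  labs(x = 'penalty (lambda)', y = 'coefficient estimate')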
So we fit one lasso model, but how do we know the right regularization parameter penalty? We can figure that out using resampling and tuning the model. Let’s build a set of bootstrap resamples, set penalty = tune() in our model specification, and use penalty() to set up an appropriate grid for this kind of regularization model.
set.seed(1234)
office_boot <- bootstraps(office_train, strata = season)
tune_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
set_engine('glmnet')
lambda_grid <- grid_regular(penalty(), levels = 50)
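By default, penalty() covers a wide range on the log10 scale (you can see penalties all the way down to 1e-10 in the results below). If you ever want a narrower or coarser grid, you can set the range yourself; the values here are only an illustration, not what I used:
grid_regular(
  penalty(range = c(-5, 0)), # range is in log10 units, so this spans 1e-5 to 1
  levels = 30
)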
Now it’s time to tune the grid, using our workflow object.
doParallel::registerDoParallel()
set.seed(2020)
lasso_grid <- tune_grid(
wf %>% add_model(tune_spec),
resamples = office_boot,
grid = lambda_grid
)
What results did we get?
lasso_grid %>% collect_metrics()
## # A tibble: 100 x 6
## penalty .metric .estimator mean n std_err
## <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1.00e-10 rmse standard 0.601 25 0.0156
## 2 1.00e-10 rsq standard 0.115 25 0.0198
## 3 1.60e-10 rmse standard 0.601 25 0.0156
## 4 1.60e-10 rsq standard 0.115 25 0.0198
## 5 2.56e-10 rmse standard 0.601 25 0.0156
## 6 2.56e-10 rsq standard 0.115 25 0.0198
## 7 4.09e-10 rmse standard 0.601 25 0.0156
## 8 4.09e-10 rsq standard 0.115 25 0.0198
## 9 6.55e-10 rmse standard 0.601 25 0.0156
## 10 6.55e-10 rsq standard 0.115 25 0.0198
## # ... with 90 more rows
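If you just want the top candidates rather than the full table, show_best() from tune will sort them by the metric you ask for:
lasso_grid %>%
  show_best(metric = 'rmse')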
That’s nice, but I would rather see a visualization of performance with the regularization parameter.
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_errorbar(aes(
    ymin = mean - std_err,
    ymax = mean + std_err
  ),
  alpha = 0.5
  ) +
  geom_line(size = 1.5) +
  facet_wrap(~.metric, scales = 'free', nrow = 2) +
  scale_x_log10() +
  theme(legend.position = 'none')
## Warning: Removed 3 row(s) containing missing values (geom_path).
This is a great way to see that regularization helps this modeling a lot. We have a couple of options for choosing our final parameter, such as select_by_pct_loss() or select_by_one_std_err(), but for now let’s stick with just picking the lowest RMSE. After we have that parameter, we can finalize our workflow, i.e. update it with this value.
lowest_rmse <- lasso_grid %>% select_best('rmse', maximize = FALSE)
## Warning: The `maximize` argument is no longer needed. This value was ignored.
final_lasso <- finalize_workflow(
wf %>% add_model(tune_spec),
lowest_rmse
)
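If we had wanted a more parsimonious model instead, a sketch of the one-standard-error approach would look something like this, where desc(penalty) tells tune that a larger penalty counts as a simpler model:
lasso_grid %>%
  select_by_one_std_err(desc(penalty), metric = 'rmse')
We could then pass that result to finalize_workflow() in exactly the same way.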
We can then fit this finalized workflow on our training data. While we’re at it, let’s see what the most important variables are using the vip package.
library(vip)
## Warning: package 'vip' was built under R version 4.0.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_lasso %>%
fit(office_train)%>%
pull_workflow_fit() %>%
vi(lambda = lowest_rmse$penalty) %>%
mutate(
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)
) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0,0)) +
labs(y = NULL)
And then, finally, let’s return to our test data. The tune package has a function last_fit() which is nice for situations when you have tuned and finalized a model or workflow and want to fit it one last time on your training data and evaluate it on your testing data. You only have to pass this function your finalized model/workflow and your split.
last_fit(
final_lasso,
office_split
) %>%
collect_metrics()
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.437
## 2 rsq standard 0.254
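The object returned by last_fit() also stores the predictions on the test set, so we can pull those out with collect_predictions() and, for example, plot predicted vs. observed ratings. Here I’m assigning the result to a new object (last_lasso is just a name I’m making up for this sketch):
last_lasso <- last_fit(final_lasso, office_split)

last_lasso %>%
  collect_predictions() %>%
  ggplot(aes(imdb_rating, .pred)) +
  geom_abline(lty = 2, color = 'gray50') +
  geom_point(alpha = 0.7) +
  labs(x = 'IMDB rating', y = 'Predicted rating')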