LASSO regression using tidymodels for The Office (Based on Silge, 2020)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
ratings_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv")
## Parsed with column specification:
## cols(
## season = col_double(),
## episode = col_double(),
## title = col_character(),
## imdb_rating = col_double(),
## total_votes = col_double(),
## air_date = col_date(format = "")
## )
remove_regex <- "[:punct:]|[:digit:]|parts |part |the |and"
office_ratings <- ratings_raw %>%
transmute(
episode_name = str_to_lower(title),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name),
imdb_rating
)
office_info <- schrute::theoffice %>%
mutate(
season = as.numeric(season),
episode = as.numeric(episode),
episode_name = str_to_lower(episode_name),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name)
) %>%
select(season, episode, episode_name, director, writer, character)
office_info
characters <- office_info %>%
count(episode_name, character) %>%
add_count(character, wt = n, name = "character_count") %>%
filter(character_count > 800) %>%
select(-character_count) %>%
pivot_wider(
names_from = character,
values_from = n,
values_fill = list(n = 0)
)
characters
creators <- office_info %>%
distinct(episode_name, director, writer) %>%
pivot_longer(director:writer, names_to = "role", values_to = "person") %>%
separate_rows(person, sep = ";") %>%
add_count(person) %>%
filter(n > 10) %>%
distinct(episode_name, person) %>%
mutate(person_value = 1) %>%
pivot_wider(
names_from = person,
values_from = person_value,
values_fill = list(person_value = 0)
)
creators
office <- office_info %>%
distinct(season, episode, episode_name) %>%
inner_join(characters) %>%
inner_join(creators) %>%
inner_join(office_ratings %>%
select(episode_name, imdb_rating)) %>%
janitor::clean_names()
## Joining, by = "episode_name"
## Joining, by = "episode_name"
## Joining, by = "episode_name"
office
office %>%
ggplot(aes(episode, imdb_rating, fill = as.factor(episode))) +
geom_boxplot(show.legend = FALSE)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
## ✓ broom 0.5.5 ✓ rsample 0.0.5
## ✓ dials 0.0.4 ✓ tune 0.0.1
## ✓ infer 0.5.1 ✓ workflows 0.1.0
## ✓ parsnip 0.0.5 ✓ yardstick 0.0.5
## ✓ recipes 0.1.9
## ── Conflicts ───────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dials::margin() masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x recipes::yj_trans() masks scales::yj_trans()
office_split <- initial_split(office, strata = season)
office_train <- training(office_split)
office_test <- testing(office_split)
office_rec <- recipe(imdb_rating ~ ., data = office_train) %>%
update_role(episode_name, new_role = "ID") %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric(), -all_outcomes())
office_prep <- office_rec %>%
prep(strings_as_factors = FALSE)
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine("glmnet")
wf <- workflow() %>%
add_recipe(office_rec)
lasso_fit <- wf %>%
add_model(lasso_spec) %>%
fit(data = office_train)
lasso_fit %>%
pull_workflow_fit() %>%
tidy()
set.seed(1234)
office_boot <- bootstraps(office_train, strata = season)
tune_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
lambda_grid <- grid_regular(penalty(), levels = 50)
doParallel::registerDoParallel()
set.seed(2020)
lasso_grid <- tune_grid(
wf %>% add_model(tune_spec),
resamples = office_boot,
grid = lambda_grid
)
lasso_grid %>%
collect_metrics()
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_errorbar(aes(
ymin = mean - std_err,
ymax = mean + std_err
),
alpha = 0.5
) +
geom_line(size = 1.5) +
facet_wrap(~.metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
## Warning: Removed 3 row(s) containing missing values (geom_path).
lowest_rmse <- lasso_grid %>%
select_best("rmse", maximize = FALSE)
final_lasso <- finalize_workflow(
wf %>% add_model(tune_spec),
lowest_rmse
)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_lasso %>%
fit(office_train) %>%
pull_workflow_fit() %>%
vi(lambda = lowest_rmse$penalty) %>%
mutate(
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)
) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
last_fit(
final_lasso,
office_split
) %>%
collect_metrics()
The most important factors for the rating of an episode of The Office are the work of Greg Daniel as writer, the lines of Michael Scott, and the time of the season: last episodes of each season has a higer rating.