Explore The Office
Alright, cut the crap show me the code!
# Data Manipulation
library(tidyverse)
library(lubridate)
# Models
library(tidymodels)
# Theme
library(extrafont)
loadfonts(device = "win")
theme_plex <- function(base_size = 11,
strip_text_size = 12,
strip_text_margin = 5,
subtitle_size = 13,
subtitle_margin = 10,
plot_title_size = 16,
plot_title_margin = 10,
...) {
ret <- ggplot2::theme_minimal(base_family = "Comic Sans MS",
base_size = base_size, ...)
ret$strip.text <- ggplot2::element_text(
hjust = 0, size = strip_text_size,
margin = ggplot2::margin(b = strip_text_margin),
family = "Comic Sans MS-Medium"
)
ret$plot.subtitle <- ggplot2::element_text(
hjust = 0, size = subtitle_size,
margin = ggplot2::margin(b = subtitle_margin),
family = "Comic Sans MS"
)
ret$plot.title <- ggplot2::element_text(
hjust = 0, size = plot_title_size,
margin = ggplot2::margin(b = plot_title_margin),
family = "Comic Sans MS-Bold"
)
ret
}
theme_set(theme_plex())Data Gathering & Preprocessing
ratings_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv")
remove_regex <- "[:punct:]|[:digit:]|parts |part |the |and"
# Rating df
office_ratings <- ratings_raw %>%
transmute( # transmute() only keep features we create
episode_name = str_to_lower(title),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name),
imdb_rating
)
# install.packages("schrute")
office_info <- schrute::theoffice %>%
mutate(
air_date = ymd(air_date),
season = as.numeric(season),
episode = as.numeric(episode),
# Same thing like before
episode_name = str_to_lower(episode_name),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name)
) %>%
select(season, episode, episode_name, director, writer, character)
office_info## # A tibble: 55,130 x 6
## season episode episode_name director writer character
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 2 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Jim
## 3 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 4 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Jim
## 5 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 6 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 7 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 8 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Pam
## 9 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Michael
## 10 1 1 pilot Ken Kwapis Ricky Gervais;Stephen Merch~ Pam
## # ... with 55,120 more rows
Feature Engineering
- How many times characters speak per episode
characters <- office_info %>%
# Count character speaking frequency
count(episode_name, character) %>%
# Filter the characters that involve the most during the entire series
add_count(character, wt = n, name = "character_count") %>%
filter(character_count > 800) %>%
select(-character_count) %>%
# Pivot into matrix for modeling purpose
pivot_wider(names_from = character, values_from = n, values_fill = list(n = 0))
characters## # A tibble: 185 x 16
## episode_name Andy Angela Darryl Dwight Jim Kelly Kevin Michael Oscar Pam
## <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 a benihana ~ 28 37 3 61 44 5 14 108 1 57
## 2 aarm 44 39 30 87 89 0 30 0 28 34
## 3 after hours 20 11 14 60 55 8 4 0 10 15
## 4 alliance 0 7 0 47 49 0 3 68 14 22
## 5 angry y 53 7 5 16 19 13 9 0 7 29
## 6 baby shower 13 13 9 35 27 2 4 79 3 25
## 7 back from v~ 3 4 6 22 25 0 5 70 0 33
## 8 banker 1 2 0 17 0 0 2 44 0 5
## 9 basketball 0 3 15 25 21 0 1 104 2 14
## 10 beach games 18 8 0 38 22 9 5 105 5 23
## # ... with 175 more rows, and 5 more variables: Phyllis <int>, Ryan <int>,
## # Toby <int>, Erin <int>, Jan <int>
- Which directors and writers are involved in each episode
creators <- office_info %>%
distinct(episode_name, director, writer) %>% # Highlight 1
pivot_longer(director:writer, names_to = "role", values_to = "person") %>%
separate_rows(person, sep = ";") %>% # Highlight 2
add_count(person) %>% # count their occurrence
filter(n > 10) %>% # filter step 1
distinct(episode_name, person) %>% # filter step 2
mutate(person_value = 1) %>%
pivot_wider( # Dummy matrix
names_from = person,
values_from = person_value,
values_fill = list(person_value = 0)
)
creators## # A tibble: 135 x 14
## episode_name `Ken Kwapis` `Greg Daniels` `B.J. Novak` `Paul Lieberste~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 pilot 1 1 0 0
## 2 diversity d~ 1 0 1 0
## 3 health care 0 0 0 1
## 4 basketball 0 1 0 0
## 5 hot girl 0 0 0 0
## 6 dundies 0 1 0 0
## 7 sexual hara~ 1 0 1 0
## 8 office olym~ 0 0 0 0
## 9 fire 1 0 1 0
## 10 halloween 0 1 0 0
## # ... with 125 more rows, and 9 more variables: `Mindy Kaling` <dbl>, `Paul
## # Feig` <dbl>, `Gene Stupnitsky` <dbl>, `Lee Eisenberg` <dbl>, `Jennifer
## # Celotta` <dbl>, `Randall Einhorn` <dbl>, `Brent Forrester` <dbl>, `Jeffrey
## # Blitz` <dbl>, `Justin Spitzer` <dbl>
- Put it all together
office <- office_info %>%
# Distinct the season and episode
distinct(season, episode, episode_name) %>%
# Join the features
inner_join(characters) %>%
inner_join(creators) %>%
inner_join(office_ratings) %>%
# Clean the colnames
janitor::clean_names()## Joining, by = "episode_name"
## Joining, by = "episode_name"
## Joining, by = "episode_name"
## # A tibble: 136 x 32
## season episode episode_name andy angela darryl dwight jim kelly kevin
## <dbl> <dbl> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 pilot 0 1 0 29 36 0 1
## 2 1 2 diversity d~ 0 4 0 17 25 2 8
## 3 1 3 health care 0 5 0 62 42 0 6
## 4 1 5 basketball 0 3 15 25 21 0 1
## 5 1 6 hot girl 0 3 0 28 55 0 5
## 6 2 1 dundies 0 1 1 32 32 7 1
## 7 2 2 sexual hara~ 0 2 9 11 16 0 6
## 8 2 3 office olym~ 0 6 0 55 55 0 9
## 9 2 4 fire 0 17 0 65 51 4 5
## 10 2 5 halloween 0 13 0 33 30 3 2
## # ... with 126 more rows, and 22 more variables: michael <int>, oscar <int>,
## # pam <int>, phyllis <int>, ryan <int>, toby <int>, erin <int>, jan <int>,
## # ken_kwapis <dbl>, greg_daniels <dbl>, b_j_novak <dbl>,
## # paul_lieberstein <dbl>, mindy_kaling <dbl>, paul_feig <dbl>,
## # gene_stupnitsky <dbl>, lee_eisenberg <dbl>, jennifer_celotta <dbl>,
## # randall_einhorn <dbl>, brent_forrester <dbl>, jeffrey_blitz <dbl>,
## # justin_spitzer <dbl>, imdb_rating <dbl>
office %>%
ggplot(aes(episode, imdb_rating)) +
geom_boxplot(aes(fill = as.factor(episode))) + # Put the group factor here
geom_smooth(color = "orange", size = 1.2, alpha = 0.4) +
labs(y = "IMDB Rating") +
ggtitle("Ratings variation in episodes",
subtitle = "Ratings are higher for episodes later in the season. ") +
theme(legend.position = "None")## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Lasso Regression
Split training & testing sets
Notice that we use strata = season to make sure we get even samples from each season.
Feature engineering recipe
office_rec <- recipe(imdb_rating ~., data = office_train) %>%
update_role(episode_name, new_role = "ID") %>% # Keep the episode_name as id, exclude it from training
step_zv(all_numeric(), -all_outcomes()) %>% # Exclude the outcome since the outcome is numeric as well
step_normalize(all_numeric(), -all_outcomes()) # Center and scale the predictors
office_prep <- office_rec %>%
prep(strings_as_factors = F) # Since we kept the id
office_prep## Data Recipe
##
## Inputs:
##
## role #variables
## ID 1
## outcome 1
## predictor 30
##
## Training data contained 103 data points and no missing data.
##
## Operations:
##
## Zero variance filter removed no terms [trained]
## Centering and scaling for season, episode, andy, angela, darryl, ... [trained]
Fit the model
# mixture = 1 --> Lasso;
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine("glmnet")
# Set up a workflow
wf <- workflow() %>%
add_recipe(office_rec)
# Fit
lasso_fit <- wf %>%
add_model(lasso_spec) %>%
fit(data = office_train)
lasso_fit %>%
pull_workflow_fit() %>%
tidy()## # A tibble: 1,598 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 8.39 0.194 0
## 2 (Intercept) 2 8.39 0.177 0.0334
## 3 jim 2 0.00616 0.177 0.0334
## 4 greg_daniels 2 0.0179 0.177 0.0334
## 5 (Intercept) 3 8.39 0.161 0.0873
## 6 jim 3 0.0223 0.161 0.0873
## 7 michael 3 0.00949 0.161 0.0873
## 8 greg_daniels 3 0.0346 0.161 0.0873
## 9 (Intercept) 4 8.39 0.147 0.135
## 10 jim 4 0.0367 0.147 0.135
## # ... with 1,588 more rows
Fine tune the parameters
How to define the best set of parameters. Here we only have one, that is the penalty. We build a set of bootstrap resamples and set penalty = tune(). We can use a function penalty() to set up an appropriate grid for this kind of regularization model, so we don’t need to choose values by ourselves. Notice that we use the parallelcomputation to boost the speed. The doParallel::registerDoParallel() function will activate all of our cores.
## # Bootstrap sampling using stratification
## # A tibble: 25 x 2
## splits id
## <named list> <chr>
## 1 <split [103/34]> Bootstrap01
## 2 <split [103/37]> Bootstrap02
## 3 <split [103/39]> Bootstrap03
## 4 <split [103/40]> Bootstrap04
## 5 <split [103/35]> Bootstrap05
## 6 <split [103/38]> Bootstrap06
## 7 <split [103/39]> Bootstrap07
## 8 <split [103/32]> Bootstrap08
## 9 <split [103/37]> Bootstrap09
## 10 <split [103/34]> Bootstrap10
## # ... with 15 more rows
# Tune Model
tune_spec <- linear_reg(penalty = tune(), mixture = 1) %>% # Highlight
set_engine("glmnet")
# Parameter Grid
lambda_grid <- grid_regular(penalty(), levels = 50) # Highlight
# Tune the grid, using the same workflow
doParallel::registerDoParallel() # Use all the cores
set.seed(1024)
lasso_grid <- tune_grid(
# Model & Workflow
wf %>% add_model(tune_spec),
# Resamples
resamples = office_boot,
# Parameter grid
grid = lambda_grid
)
# Results
lasso_grid %>%
collect_metrics()## # A tibble: 100 x 6
## penalty .metric .estimator mean n std_err
## <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1.00e-10 rmse standard 0.517 25 0.0152
## 2 1.00e-10 rsq standard 0.241 25 0.0234
## 3 1.60e-10 rmse standard 0.517 25 0.0152
## 4 1.60e-10 rsq standard 0.241 25 0.0234
## 5 2.56e-10 rmse standard 0.517 25 0.0152
## 6 2.56e-10 rsq standard 0.241 25 0.0234
## 7 4.09e-10 rmse standard 0.517 25 0.0152
## 8 4.09e-10 rsq standard 0.241 25 0.0234
## 9 6.55e-10 rmse standard 0.517 25 0.0152
## 10 6.55e-10 rsq standard 0.241 25 0.0234
## # ... with 90 more rows
# Best parameter --> returns a tibble with parameter
lowest_rmse <- lasso_grid %>%
select_best("rmse", maximize = FALSE)The result is stored in lasso_grid %>% collect_metrics(), now we visualize it to see how the parameter alters the metrics.
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_errorbar(aes(
ymin = mean - std_err,
ymax = mean + std_err
),
alpha = 0.5
) +
geom_line(size = 1.5) +
facet_wrap(~.metric, scales = "free", nrow = 2) + # Present trend on same sacle
scale_x_log10() + # Step 2
theme(legend.position = "none")Finalize our workflow
After we find the best parameter, we update that value to our model.
final_lasso <- finalize_workflow(
# Workflow
wf %>% add_model(tune_spec),
# Parameter
lowest_rmse
)
final_lasso## == Workflow ================================================================================================
## Preprocessor: Recipe
## Model: linear_reg()
##
## -- Preprocessor --------------------------------------------------------------------------------------------
## 2 Recipe Steps
##
## * step_zv()
## * step_normalize()
##
## -- Model ---------------------------------------------------------------------------------------------------
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.0372759372031494
## mixture = 1
##
## Computational engine: glmnet
Now let us fit the finalized workflow on the training data, and check out the feature importance.
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_lasso %>%
fit(office_train) %>%
pull_workflow_fit() %>%
vi(lambda = lowest_rmse$penalty) %>% # Return tibble with variable, importance, and sign
mutate(
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance) # Reorder by importance
) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0,0)) +
labs(y = NULL)## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.476
## 2 rsq standard 0.0576
That’s it! Stay safe at home!
Reference
Inspired by Julia’s great work!