Explore The Office

Alright, cut the crap show me the code!

# Data Manipulation
library(tidyverse)
library(lubridate)
# Models
library(tidymodels)
# Theme
library(extrafont)
loadfonts(device = "win")
theme_plex <- function(base_size = 11,
                       strip_text_size = 12,
                       strip_text_margin = 5,
                       subtitle_size = 13,
                       subtitle_margin = 10,
                       plot_title_size = 16,
                       plot_title_margin = 10,
                       ...) {
    ret <- ggplot2::theme_minimal(base_family = "Comic Sans MS",
                                  base_size = base_size, ...)
    ret$strip.text <- ggplot2::element_text(
        hjust = 0, size = strip_text_size,
        margin = ggplot2::margin(b = strip_text_margin),
        family = "Comic Sans MS-Medium"
    )
    ret$plot.subtitle <- ggplot2::element_text(
        hjust = 0, size = subtitle_size,
        margin = ggplot2::margin(b = subtitle_margin),
        family = "Comic Sans MS"
    )
    ret$plot.title <- ggplot2::element_text(
        hjust = 0, size = plot_title_size,
        margin = ggplot2::margin(b = plot_title_margin),
        family = "Comic Sans MS-Bold"
    )
    ret
}
theme_set(theme_plex())

Data Gathering & Preprocessing

ratings_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv")

remove_regex <- "[:punct:]|[:digit:]|parts |part |the |and"
# Rating df
office_ratings <- ratings_raw %>%
  transmute(                     # transmute() only keep features we create
    episode_name = str_to_lower(title),
    episode_name = str_remove_all(episode_name, remove_regex),
    episode_name = str_trim(episode_name),
    imdb_rating
  )
# install.packages("schrute")
office_info <- schrute::theoffice %>%
  mutate(
    air_date = ymd(air_date),
    season = as.numeric(season),
    episode = as.numeric(episode),
    # Same thing like before
    episode_name = str_to_lower(episode_name),
    episode_name = str_remove_all(episode_name, remove_regex),
    episode_name = str_trim(episode_name)
  ) %>%
  select(season, episode, episode_name, director, writer, character)
office_info

## # A tibble: 55,130 x 6
##    season episode episode_name director   writer                       character
##     <dbl>   <dbl> <chr>        <chr>      <chr>                        <chr>    
##  1      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
##  2      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Jim      
##  3      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
##  4      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Jim      
##  5      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
##  6      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
##  7      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
##  8      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Pam      
##  9      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Michael  
## 10      1       1 pilot        Ken Kwapis Ricky Gervais;Stephen Merch~ Pam      
## # ... with 55,120 more rows

Feature Engineering

How many times characters speak per episode

characters <- office_info %>%
  # Count character speaking frequency
  count(episode_name, character) %>%
  # Filter the characters that involve the most during the entire series
  add_count(character, wt = n, name = "character_count") %>%
  filter(character_count > 800) %>%
  select(-character_count) %>%
  # Pivot into matrix for modeling purpose 
  pivot_wider(names_from = character, values_from = n, values_fill = list(n = 0))
characters

## # A tibble: 185 x 16
##    episode_name  Andy Angela Darryl Dwight   Jim Kelly Kevin Michael Oscar   Pam
##    <chr>        <int>  <int>  <int>  <int> <int> <int> <int>   <int> <int> <int>
##  1 a benihana ~    28     37      3     61    44     5    14     108     1    57
##  2 aarm            44     39     30     87    89     0    30       0    28    34
##  3 after hours     20     11     14     60    55     8     4       0    10    15
##  4 alliance         0      7      0     47    49     0     3      68    14    22
##  5 angry y         53      7      5     16    19    13     9       0     7    29
##  6 baby shower     13     13      9     35    27     2     4      79     3    25
##  7 back from v~     3      4      6     22    25     0     5      70     0    33
##  8 banker           1      2      0     17     0     0     2      44     0     5
##  9 basketball       0      3     15     25    21     0     1     104     2    14
## 10 beach games     18      8      0     38    22     9     5     105     5    23
## # ... with 175 more rows, and 5 more variables: Phyllis <int>, Ryan <int>,
## #   Toby <int>, Erin <int>, Jan <int>

Which directors and writers are involved in each episode

creators <- office_info %>%
  distinct(episode_name, director, writer) %>% # Highlight 1
  pivot_longer(director:writer, names_to = "role", values_to = "person") %>%
  separate_rows(person, sep = ";") %>%         # Highlight 2
  add_count(person) %>%              # count their occurrence 
  filter(n > 10) %>%                 # filter step 1
  distinct(episode_name, person) %>% # filter step 2
  mutate(person_value = 1) %>%  
  pivot_wider(                # Dummy matrix
    names_from = person,
    values_from = person_value,
    values_fill = list(person_value = 0)
  )
creators

## # A tibble: 135 x 14
##    episode_name `Ken Kwapis` `Greg Daniels` `B.J. Novak` `Paul Lieberste~
##    <chr>               <dbl>          <dbl>        <dbl>            <dbl>
##  1 pilot                   1              1            0                0
##  2 diversity d~            1              0            1                0
##  3 health care             0              0            0                1
##  4 basketball              0              1            0                0
##  5 hot girl                0              0            0                0
##  6 dundies                 0              1            0                0
##  7 sexual hara~            1              0            1                0
##  8 office olym~            0              0            0                0
##  9 fire                    1              0            1                0
## 10 halloween               0              1            0                0
## # ... with 125 more rows, and 9 more variables: `Mindy Kaling` <dbl>, `Paul
## #   Feig` <dbl>, `Gene Stupnitsky` <dbl>, `Lee Eisenberg` <dbl>, `Jennifer
## #   Celotta` <dbl>, `Randall Einhorn` <dbl>, `Brent Forrester` <dbl>, `Jeffrey
## #   Blitz` <dbl>, `Justin Spitzer` <dbl>

Put it all together

office <- office_info %>%
  # Distinct the season and episode
  distinct(season, episode, episode_name) %>%
  # Join the features
  inner_join(characters) %>%
  inner_join(creators) %>%
  inner_join(office_ratings) %>%
  # Clean the colnames
  janitor::clean_names()

## Joining, by = "episode_name"
## Joining, by = "episode_name"
## Joining, by = "episode_name"

office

## # A tibble: 136 x 32
##    season episode episode_name  andy angela darryl dwight   jim kelly kevin
##     <dbl>   <dbl> <chr>        <int>  <int>  <int>  <int> <int> <int> <int>
##  1      1       1 pilot            0      1      0     29    36     0     1
##  2      1       2 diversity d~     0      4      0     17    25     2     8
##  3      1       3 health care      0      5      0     62    42     0     6
##  4      1       5 basketball       0      3     15     25    21     0     1
##  5      1       6 hot girl         0      3      0     28    55     0     5
##  6      2       1 dundies          0      1      1     32    32     7     1
##  7      2       2 sexual hara~     0      2      9     11    16     0     6
##  8      2       3 office olym~     0      6      0     55    55     0     9
##  9      2       4 fire             0     17      0     65    51     4     5
## 10      2       5 halloween        0     13      0     33    30     3     2
## # ... with 126 more rows, and 22 more variables: michael <int>, oscar <int>,
## #   pam <int>, phyllis <int>, ryan <int>, toby <int>, erin <int>, jan <int>,
## #   ken_kwapis <dbl>, greg_daniels <dbl>, b_j_novak <dbl>,
## #   paul_lieberstein <dbl>, mindy_kaling <dbl>, paul_feig <dbl>,
## #   gene_stupnitsky <dbl>, lee_eisenberg <dbl>, jennifer_celotta <dbl>,
## #   randall_einhorn <dbl>, brent_forrester <dbl>, jeffrey_blitz <dbl>,
## #   justin_spitzer <dbl>, imdb_rating <dbl>

office %>%
  ggplot(aes(episode, imdb_rating)) +
  geom_boxplot(aes(fill = as.factor(episode))) + # Put the group factor here
  geom_smooth(color = "orange", size = 1.2, alpha = 0.4) +
  labs(y = "IMDB Rating") +
  ggtitle("Ratings variation in episodes",
          subtitle = "Ratings are higher for episodes later in the season. ") +
  theme(legend.position = "None")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Lasso Regression

Split training & testing sets

Notice that we use strata = season to make sure we get even samples from each season.

office_split <- initial_split(office, strata = season) # Highlight
office_train <- training(office_split)
office_test <- testing(office_split)

Feature engineering recipe

office_rec <- recipe(imdb_rating ~., data = office_train) %>%
  update_role(episode_name, new_role = "ID") %>% # Keep the episode_name as id, exclude it from training
  step_zv(all_numeric(), -all_outcomes()) %>%    # Exclude the outcome since the outcome is numeric as well
  step_normalize(all_numeric(), -all_outcomes()) # Center and scale the predictors

office_prep <- office_rec %>%
  prep(strings_as_factors = F) # Since we kept the id
office_prep

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          1
##    outcome          1
##  predictor         30
## 
## Training data contained 103 data points and no missing data.
## 
## Operations:
## 
## Zero variance filter removed no terms [trained]
## Centering and scaling for season, episode, andy, angela, darryl, ... [trained]

Fit the model

# mixture = 1 --> Lasso; 
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")
# Set up a workflow
wf <- workflow() %>%
  add_recipe(office_rec)
# Fit
lasso_fit <- wf %>%
  add_model(lasso_spec) %>%
  fit(data = office_train)
lasso_fit %>%
  pull_workflow_fit() %>%
  tidy()

## # A tibble: 1,598 x 5
##    term          step estimate lambda dev.ratio
##    <chr>        <dbl>    <dbl>  <dbl>     <dbl>
##  1 (Intercept)      1  8.39     0.194    0     
##  2 (Intercept)      2  8.39     0.177    0.0334
##  3 jim              2  0.00616  0.177    0.0334
##  4 greg_daniels     2  0.0179   0.177    0.0334
##  5 (Intercept)      3  8.39     0.161    0.0873
##  6 jim              3  0.0223   0.161    0.0873
##  7 michael          3  0.00949  0.161    0.0873
##  8 greg_daniels     3  0.0346   0.161    0.0873
##  9 (Intercept)      4  8.39     0.147    0.135 
## 10 jim              4  0.0367   0.147    0.135 
## # ... with 1,588 more rows

Fine tune the parameters

How to define the best set of parameters. Here we only have one, that is the penalty. We build a set of bootstrap resamples and set penalty = tune(). We can use a function penalty() to set up an appropriate grid for this kind of regularization model, so we don’t need to choose values by ourselves. Notice that we use the parallelcomputation to boost the speed. The doParallel::registerDoParallel() function will activate all of our cores.

# Bootstrap Resampling
set.seed(1024)
(office_boot <- bootstraps(office_train, strata = season))

## # Bootstrap sampling using stratification 
## # A tibble: 25 x 2
##    splits           id         
##    <named list>     <chr>      
##  1 <split [103/34]> Bootstrap01
##  2 <split [103/37]> Bootstrap02
##  3 <split [103/39]> Bootstrap03
##  4 <split [103/40]> Bootstrap04
##  5 <split [103/35]> Bootstrap05
##  6 <split [103/38]> Bootstrap06
##  7 <split [103/39]> Bootstrap07
##  8 <split [103/32]> Bootstrap08
##  9 <split [103/37]> Bootstrap09
## 10 <split [103/34]> Bootstrap10
## # ... with 15 more rows

# Tune Model
tune_spec <- linear_reg(penalty = tune(), mixture = 1) %>% # Highlight
  set_engine("glmnet")
# Parameter Grid
lambda_grid <- grid_regular(penalty(), levels = 50)        # Highlight
# Tune the grid, using the same workflow
doParallel::registerDoParallel()                           # Use all the cores
set.seed(1024)
lasso_grid <- tune_grid(
  # Model & Workflow
  wf %>% add_model(tune_spec),
  # Resamples
  resamples = office_boot,
  # Parameter grid
  grid = lambda_grid
)
# Results
lasso_grid %>%
  collect_metrics()

## # A tibble: 100 x 6
##     penalty .metric .estimator  mean     n std_err
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
##  1 1.00e-10 rmse    standard   0.517    25  0.0152
##  2 1.00e-10 rsq     standard   0.241    25  0.0234
##  3 1.60e-10 rmse    standard   0.517    25  0.0152
##  4 1.60e-10 rsq     standard   0.241    25  0.0234
##  5 2.56e-10 rmse    standard   0.517    25  0.0152
##  6 2.56e-10 rsq     standard   0.241    25  0.0234
##  7 4.09e-10 rmse    standard   0.517    25  0.0152
##  8 4.09e-10 rsq     standard   0.241    25  0.0234
##  9 6.55e-10 rmse    standard   0.517    25  0.0152
## 10 6.55e-10 rsq     standard   0.241    25  0.0234
## # ... with 90 more rows

# Best parameter --> returns a tibble with parameter
lowest_rmse <- lasso_grid %>%
  select_best("rmse", maximize = FALSE)

The result is stored in lasso_grid %>% collect_metrics(), now we visualize it to see how the parameter alters the metrics.

lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_errorbar(aes(
    ymin = mean - std_err,
    ymax = mean + std_err
  ),
  alpha = 0.5
  ) +
  geom_line(size = 1.5) +
  facet_wrap(~.metric, scales = "free", nrow = 2) + # Present trend on same sacle 
  scale_x_log10() +                                 # Step 2
  theme(legend.position = "none")

Finalize our workflow

After we find the best parameter, we update that value to our model.

final_lasso <- finalize_workflow(
  # Workflow
  wf %>% add_model(tune_spec),
  # Parameter
  lowest_rmse
)
final_lasso

## == Workflow ================================================================================================
## Preprocessor: Recipe
## Model: linear_reg()
## 
## -- Preprocessor --------------------------------------------------------------------------------------------
## 2 Recipe Steps
## 
## * step_zv()
## * step_normalize()
## 
## -- Model ---------------------------------------------------------------------------------------------------
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.0372759372031494
##   mixture = 1
## 
## Computational engine: glmnet

Now let us fit the finalized workflow on the training data, and check out the feature importance.

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

final_lasso %>%
  fit(office_train) %>%
  pull_workflow_fit() %>%
  vi(lambda = lowest_rmse$penalty) %>% # Return tibble with variable, importance, and sign
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance) # Reorder by importance
  ) %>%
  ggplot(aes(x  = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0,0)) +
  labs(y = NULL)

last_fit(
  final_lasso,
  office_split
) %>%
  collect_metrics()

## # A tibble: 2 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.476 
## 2 rsq     standard      0.0576

That’s it! Stay safe at home!