We are going to predict the Imdb rating for the Office episodes. To do so, we will use a lasso regression,
It is a regularisation technique. Regularizations are techniques used to reduce the error by fitting a function appropriately on the given training set and avoid overfitting. […] In machine learning, you will have a fair idea that regularization penalizes the coefficients. In deep learning, it actually penalizes the weight matrices of the nodes.
Why do we use it?
library(tidyverse)
library(tidytuesdayR)
theme_set(theme_light())
office_data <- tt_load("2020-03-17")
##
## Downloading file 1 of 1: `office_ratings.csv`
ratings_raw <- office_data$office_ratings
# prepare preprocess to join office data from tidytuesday challenge, to the
# office data from the library schrute
remove_regex <- "[:punct:]|[:digit:]|parts |part |the |and" # some episodes are
# part 1 and others are 1 and then there is another on which is 2
office_ratings <- ratings_raw %>%
transmute(
episode_name = str_to_lower(title),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name),
imdb_rating
)
office_info <- schrute::theoffice %>%
mutate(
season = as.numeric(season),
episode = as.numeric(episode),
episode_name = str_to_lower(episode_name),
episode_name = str_remove_all(episode_name, remove_regex),
episode_name = str_trim(episode_name)
) %>%
select(season, episode, episode_name, director, writer, character)
Now we are going to build the dataset to start modeling:
# let's see how many line do we have for each character for each episode
characters <- office_info %>%
count(episode_name, character) %>%
add_count(character, wt = n, name = "character_count") %>%
filter(character_count > 800) %>%
select(-character_count) %>%
pivot_wider(
names_from = character,
values_from = n,
values_fill = list(n = 0)
)
Let’s focus on the writer and the director:
creators <- office_info %>%
distinct(episode_name, director, writer) %>%
pivot_longer(director:writer, names_to = "role", values_to = "person") %>%
separate_rows(person, sep = ";") %>%
add_count(person) %>%
filter(n > 10) %>%
distinct(episode_name, person) %>%
mutate(person_value = 1) %>%
pivot_wider(
names_from = person,
values_from = person_value,
values_fill = list(person_value = 0)
)
Now we are going to join the information:
office <- office_info %>%
distinct(season, episode, episode_name) %>%
inner_join(characters) %>%
inner_join(creators) %>%
inner_join(office_ratings %>%
select(episode_name, imdb_rating)) %>%
janitor::clean_names()
Show some visualizations for the ratings of the different seasons:
office %>%
ggplot(aes(season, imdb_rating, fill = as.factor(season))) +
geom_boxplot(show.legend = FALSE)
And let’s see of there is a high ratings at the end of the seasons:
office %>%
ggplot(aes(episode, imdb_rating, fill = as.factor(episode))) +
geom_boxplot(show.legend = FALSE)
What we want to understand if the appearance of an specific actor it is making the episode more highly to have a high rating.
library(tidymodels)
# let's do our data split
office_split <- initial_split(office, strata = season) # we want to have the same proportion of episodes from different seasons in the test that in the train
office_train <- training(office_split)
office_test <- testing(office_split)
# small test
# now it is time to write the recipe
office_rec <- recipe(imdb_rating ~ ., data = office_train) %>%
# we want to keep the episode name, as we are going to use it as id, but
# we don't want to use it to predict
update_role(episode_name, new_role = "ID") %>%
# Will remove variables that contains only a single value, so removing
# 0 variance
step_zv(all_numeric(), -all_outcomes()) %>%
# now we are going to normalise by center and scaling, which is required in
# lasso regression
step_normalize(all_numeric(), -all_outcomes())
# prepare the recipe
office_prep <- office_rec %>%
prep(strings_as_factors = FALSE)
Time to build our model:
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine("glmnet")
# A workflow will help me to put all the pieces together
wf <- workflow() %>%
add_recipe(office_rec)
lasso_fit <- wf %>%
add_model(lasso_spec) %>%
fit(data = office_train)
lasso_fit %>%
# get the actual results
pull_workflow_fit() %>%
tidy() %>%
arrange(dev.ratio)
## # A tibble: 1,616 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 8.34 0.233 0
## 2 (Intercept) 2 8.34 0.212 0.0321
## 3 michael 2 0.0208 0.212 0.0321
## 4 (Intercept) 3 8.34 0.193 0.0587
## 5 michael 3 0.0397 0.193 0.0587
## 6 (Intercept) 4 8.34 0.176 0.0808
## 7 michael 4 0.0569 0.176 0.0808
## 8 (Intercept) 5 8.34 0.160 0.0992
## 9 michael 5 0.0726 0.160 0.0992
## 10 (Intercept) 6 8.34 0.146 0.123
## # … with 1,606 more rows