Goal: Build a regression model to predict the average movie rating (vote_average) using the horror_movies dataset. The data are read directly from the TidyTuesday GitHub repository (2022-11-01 release) in the import step below.

Import Data

# Read the TidyTuesday (2022-11-01) horror movies CSV directly from GitHub.
horror_movies <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-11-01/horror_movies.csv"
)

# Summarise every column (type, missingness, distribution) before cleaning.
skimr::skim(horror_movies)
Data summary
Name horror_movies
Number of rows 32540
Number of columns 20
_______________________
Column type frequency:
character 10
Date 1
logical 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
original_title 0 1.00 1 191 0 30296 0
title 0 1.00 1 191 0 29563 0
original_language 0 1.00 2 2 0 97 0
overview 1286 0.96 1 1000 0 31020 0
tagline 19835 0.39 1 237 0 12513 0
poster_path 4474 0.86 30 32 0 28048 0
status 0 1.00 7 15 0 4 0
backdrop_path 18995 0.42 29 32 0 13536 0
genre_names 0 1.00 6 144 0 772 0
collection_name 30234 0.07 4 56 0 815 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 0 1 1950-01-01 2022-12-31 2012-12-09 10999

Variable type: logical

skim_variable n_missing complete_rate mean count
adult 0 1 0 FAL: 32540

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 445910.83 305744.67 17 146494.8 426521.00 707534.00 1033095.00 ▇▆▆▅▅
popularity 0 1.00 4.01 37.51 0 0.6 0.84 2.24 5088.58 ▇▁▁▁▁
vote_count 0 1.00 62.69 420.89 0 0.0 2.00 11.00 16900.00 ▇▁▁▁▁
vote_average 0 1.00 3.34 2.88 0 0.0 4.00 5.70 10.00 ▇▂▆▃▁
budget 0 1.00 543126.59 4542667.81 0 0.0 0.00 0.00 200000000.00 ▇▁▁▁▁
revenue 0 1.00 1349746.73 14430479.15 0 0.0 0.00 0.00 701842551.00 ▇▁▁▁▁
runtime 0 1.00 62.14 41.00 0 14.0 80.00 91.00 683.00 ▇▁▁▁▁
collection 30234 0.07 481534.88 324498.16 656 155421.0 471259.00 759067.25 1033032.00 ▇▅▅▅▅
# Build the modelling table from the raw horror_movies data.
# NOTE(review): `collection` is not dropped below, and the skim summary reports
# it as missing for 30234 of 32540 rows, so na.omit() keeps only rows with a
# non-missing `collection` -- confirm this is intended.
data <- horror_movies %>%

    # Drop the text/path columns that contain missing values, then remove any
    # row that still has a missing value
    select(-c(
        overview, tagline, poster_path, backdrop_path, collection_name
    )) %>%
    na.omit() %>%

    # Split the comma-separated genre list into one row per movie-genre pair
    separate_rows(genre_names, sep = ", ") %>%

    # Keep released movies only
    filter(status == "Released") %>%

    mutate(
        # Log-transform the target; the + 1 keeps zero ratings finite
        vote_average = log(vote_average + 1),

        # Store every character column as a factor
        across(where(is.character), as.factor)
    )

Explore Data

Identify good predictors.

genre_names

# Distribution of the log-transformed rating within each genre.
ggplot(data, aes(x = vote_average, y = genre_names)) +
    geom_boxplot()

popularity

# Log-transformed rating (x axis) against popularity (y axis), one point per row.
ggplot(data, aes(x = vote_average, y = popularity)) +
    geom_point()

EDA shortcut

# Step 1: Prepare data
# The id and release date columns are left out before binarizing.
data_binarized_tbl <- data %>%
    select(-c(id, release_date)) %>%
    binarize()

# Inspect the resulting 0/1 indicator columns
glimpse(data_binarized_tbl)
## Rows: 4,949
## Columns: 49
## $ `original_title__Scooby-Doo!_Frankencreepy`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `original_title__-OTHER`                        <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `title__Scooby-Doo!_Frankencreepy`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `title__-OTHER`                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__cn                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__de                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__en                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__es                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__id                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ja                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__th                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `original_language__-OTHER`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `popularity__-Inf_1.55`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__1.55_4.882                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__4.882_13.017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__13.017_Inf                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_5`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__5_36                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__36_205                              <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_count__205_Inf                             <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `vote_average__-Inf_1.66770682055808`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.66770682055808_1.85629799036563 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.97408102602201 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_average__1.97408102602201_Inf              <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `budget__-Inf_7e+05`                            <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+05_Inf`                             <dbl> 0, 0, 1, 1, 1, 1, 1, 1…
## $ revenue__0                                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `revenue__-OTHER`                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_81`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__81_90                                  <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__90_99                                  <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ runtime__99_Inf                                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ genre_names__Action                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Animation                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror                             <dbl> 1, 0, 1, 0, 0, 1, 0, 0…
## $ genre_names__Mystery                            <dbl> 0, 0, 0, 1, 0, 0, 1, 0…
## $ genre_names__Science_Fiction                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller                           <dbl> 0, 1, 0, 0, 1, 0, 0, 1…
## $ genre_names__TV_Movie                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `genre_names__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `collection__-Inf_133352`                       <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ collection__133352_459212                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__459212_744915                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__744915_Inf                          <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
# Step 2: Correlate
# Every bin is correlated with the top vote_average bin (1.974 and above on
# the log scale), which serves as the target column.
data_corr_tbl <- correlate(
    data_binarized_tbl,
    vote_average__1.97408102602201_Inf
)

data_corr_tbl
## # A tibble: 49 × 3
##    feature      bin                               correlation
##    <fct>        <chr>                                   <dbl>
##  1 vote_average 1.97408102602201_Inf                    1    
##  2 vote_average 1.66770682055808_1.85629799036563      -0.335
##  3 vote_average -Inf_1.66770682055808                  -0.330
##  4 vote_average 1.85629799036563_1.97408102602201      -0.324
##  5 popularity   13.017_Inf                              0.243
##  6 revenue      0                                      -0.233
##  7 revenue      -OTHER                                  0.233
##  8 vote_count   205_Inf                                 0.214
##  9 runtime      99_Inf                                  0.196
## 10 vote_count   5_36                                   -0.123
## # ℹ 39 more rows
# Step 3: Plot
# Draw the correlation funnel from the bin-level correlations.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Preprocess Data

Build Models

Evaluate Models

Make Predictions