Goal: predict the average rating (`vote_average`) of horror movies. The data are the TidyTuesday horror movies dataset (2022-11-01).

Import Data

# Read the TidyTuesday horror-movie data (2022-11-01) straight from GitHub
horror_movies <- read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the TidyTuesday IKEA data (2020-11-03) straight from GitHub
# NOTE(review): in the code visible here `ikea` is only passed to skim();
# it does not feed the horror-movie analysis - confirm it is still needed
ikea <- read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv"
)
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Print a per-column summary of the horror-movie data
# (column types, missing-value counts, distribution statistics)
skim(horror_movies)
Data summary
Name horror_movies
Number of rows 32540
Number of columns 20
_______________________
Column type frequency:
character 10
Date 1
logical 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
original_title 0 1.00 1 191 0 30296 0
title 0 1.00 1 191 0 29563 0
original_language 0 1.00 2 2 0 97 0
overview 1286 0.96 1 1000 0 31020 0
tagline 19835 0.39 1 237 0 12513 0
poster_path 4474 0.86 30 32 0 28048 0
status 0 1.00 7 15 0 4 0
backdrop_path 18995 0.42 29 32 0 13536 0
genre_names 0 1.00 6 144 0 772 0
collection_name 30234 0.07 4 56 0 815 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 0 1 1950-01-01 2022-12-31 2012-12-09 10999

Variable type: logical

skim_variable n_missing complete_rate mean count
adult 0 1 0 FAL: 32540

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 445910.83 305744.67 17 146494.8 426521.00 707534.00 1033095.00 ▇▆▆▅▅
popularity 0 1.00 4.01 37.51 0 0.6 0.84 2.24 5088.58 ▇▁▁▁▁
vote_count 0 1.00 62.69 420.89 0 0.0 2.00 11.00 16900.00 ▇▁▁▁▁
vote_average 0 1.00 3.34 2.88 0 0.0 4.00 5.70 10.00 ▇▂▆▃▁
budget 0 1.00 543126.59 4542667.81 0 0.0 0.00 0.00 200000000.00 ▇▁▁▁▁
revenue 0 1.00 1349746.73 14430479.15 0 0.0 0.00 0.00 701842551.00 ▇▁▁▁▁
runtime 0 1.00 62.14 41.00 0 14.0 80.00 91.00 683.00 ▇▁▁▁▁
collection 30234 0.07 481534.88 324498.16 656 155421.0 471259.00 759067.25 1033032.00 ▇▅▅▅▅
# Print a per-column summary of the IKEA data
# (column types, missing-value counts, distribution statistics)
skim(ikea)
Data summary
Name ikea
Number of rows 3694
Number of columns 14
_______________________
Column type frequency:
character 7
logical 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 3 27 0 607 0
category 0 1 4 36 0 17 0
old_price 0 1 4 13 0 365 0
link 0 1 52 163 0 2962 0
other_colors 0 1 2 3 0 2 0
short_description 0 1 3 63 0 1706 0
designer 0 1 3 1261 0 381 0

Variable type: logical

skim_variable n_missing complete_rate mean count
sellable_online 0 1 0.99 TRU: 3666, FAL: 28

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1.00 1846.50 1066.51 0 923.25 1846.5 2769.75 3693 ▇▇▇▇▇
item_id 0 1.00 48632396.79 28887094.10 58487 20390574.00 49288078.0 70403572.75 99932615 ▇▇▇▇▇
price 0 1.00 1078.21 1374.65 3 180.90 544.7 1429.50 9585 ▇▁▁▁▁
depth 1463 0.60 54.38 29.96 1 38.00 47.0 60.00 257 ▇▃▁▁▁
height 988 0.73 101.68 61.10 1 67.00 83.0 124.00 700 ▇▂▁▁▁
width 589 0.84 104.47 71.13 1 60.00 80.0 140.00 420 ▇▅▂▁▁
# Analysis table derived from the raw horror-movie data
data <- horror_movies %>%

    # Missing values, step 1: remove the two collection columns
    # (the skim() summary reports them as only 7% complete)
    select(-c(collection, collection_name)) %>%

    # Missing values, step 2: drop every row that still contains an NA
    # NOTE(review): this also removes rows whose only NA is in tagline,
    # poster_path or backdrop_path (32,540 rows in, 7,449 rows out), yet
    # those columns are discarded again before binarize() - confirm that
    # losing these rows is intended
    na.omit()

    # Log-transforming positively skewed variables: NOT APPLICABLE

Explore Data

Identify good predictors of `vote_average`.

budget

# Scatter plot: rating on the x-axis, budget on the y-axis
ggplot(data, aes(x = vote_average, y = budget)) +
    geom_point()

runtime

# Scatter plot: rating on the x-axis, runtime on the y-axis
ggplot(data, aes(x = vote_average, y = runtime)) +
    geom_point()

overview

# Which words in a movie's overview go with the highest average rating?
data %>%

    # Split each overview into one row per word
    unnest_tokens(word, overview) %>%

    # Mean rating and number of occurrences for every word
    group_by(word) %>%
    summarise(
        vote_average = mean(vote_average),
        n            = n()
    ) %>%
    ungroup() %>%

    # Keep words seen more than 10 times and drop tokens containing digits
    filter(n > 10) %>%
    filter(!str_detect(word, "\\d")) %>%

    # Top 20 words by mean rating
    slice_max(vote_average, n = 20) %>%

    # Dot plot with the words ordered by their mean rating
    ggplot(aes(x = vote_average, y = fct_reorder(word, vote_average))) +
    geom_point() +
    labs(y = "Words in Overview")

EDA shortcut

# Step 1: Prepare data
# Remove the free-text, image-path and date columns, then binarize the rest
# into 0/1 indicator columns
data_binarized_tbl <- data %>%
    select(
        -c(overview, title, original_title, poster_path,
           backdrop_path, tagline, release_date)
    ) %>%
    binarize()

glimpse(data_binarized_tbl)
## Rows: 7,449
## Columns: 52
## $ `id__-Inf_40145`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__40145_171045                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__171045_542713                                <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__542713_Inf                                   <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ original_language__en                            <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ original_language__es                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__fr                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__it                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__ja                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__ko                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__th                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `original_language__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `popularity__-Inf_1.541`                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__1.541_3.623                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__3.623_8.634                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__8.634_Inf                            <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `vote_count__-Inf_6`                             <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_count__6_26                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_count__26_108                               <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ vote_count__108_Inf                              <dbl> 1, 1, 1, 1, 1, 1, 0, …
## $ `vote_average__-Inf_4.2`                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_average__4.2_5.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_average__5.3_6.1                            <dbl> 0, 0, 0, 0, 1, 0, 0, …
## $ vote_average__6.1_Inf                            <dbl> 1, 1, 1, 1, 0, 1, 1, …
## $ `budget__-Inf_5000`                              <dbl> 1, 1, 0, 0, 0, 0, 1, …
## $ budget__5000_Inf                                 <dbl> 0, 0, 1, 1, 1, 1, 0, …
## $ revenue__0                                       <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `revenue__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 0, …
## $ `runtime__-Inf_81`                               <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ runtime__81_89                                   <dbl> 0, 0, 0, 0, 1, 0, 0, …
## $ runtime__89_96                                   <dbl> 0, 1, 0, 0, 0, 0, 1, …
## $ runtime__96_Inf                                  <dbl> 1, 0, 1, 1, 0, 1, 0, …
## $ status__Released                                 <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `status__-OTHER`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Science_Fiction`   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Thriller`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror,_Science_Fiction`   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror,_Thriller`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Crime,_Horror,_Thriller`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror`                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror,_Mystery`            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror,_Thriller`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Fantasy,_Horror`                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names__Horror                              <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery`                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery,_Thriller`         <dbl> 0, 0, 1, 0, 1, 0, 0, …
## $ `genre_names__Horror,_Science_Fiction`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Science_Fiction,_Thriller` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Thriller`                  <dbl> 1, 0, 0, 1, 0, 0, 0, …
## $ `genre_names__-OTHER`                            <dbl> 0, 1, 0, 0, 0, 1, 1, …
# Step 2: Correlate
# Correlate every binary feature with the top rating bin (vote_average 6.1 to Inf)
data_corr_tbl <- correlate(data_binarized_tbl, target = vote_average__6.1_Inf)

data_corr_tbl
## # A tibble: 52 × 3
##    feature      bin       correlation
##    <fct>        <chr>           <dbl>
##  1 vote_average 6.1_Inf         1    
##  2 vote_average 4.2_5.3        -0.332
##  3 vote_average -Inf_4.2       -0.318
##  4 vote_average 5.3_6.1        -0.311
##  5 vote_count   108_Inf         0.263
##  6 popularity   8.634_Inf       0.234
##  7 revenue      -OTHER          0.211
##  8 revenue      0              -0.211
##  9 runtime      96_Inf          0.179
## 10 vote_count   6_26           -0.155
## # ℹ 42 more rows
# Step 3: Plot
# Draw the correlation funnel from the feature/bin correlations
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Preprocess Data

Build Models

Evaluate Models

Make Predictions