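Goal: predict the average rating (`vote_average`) of horror movies. The data come from the TidyTuesday 2022-11-01 horror movies dataset, which is read from GitHub in the Import Data section.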

Import Data

# Load the TidyTuesday (2022-11-01) horror movies data straight from GitHub.
horror_movies <- read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)

## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the TidyTuesday (2020-11-03) IKEA data from GitHub.
# NOTE(review): in the visible part of this document only `skim(ikea)` uses
# this table; the stated goal concerns horror movies -- confirm it is needed.
ikea <- read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv"
)

## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Summarise every column of the movie data (missingness, uniqueness, spread).
skim(data = horror_movies)

Data summary
Name	horror_movies
Number of rows	32540
Number of columns	20
_______________________
Column type frequency:
character	10
Date	1
logical	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
original_title	0	1.00	1	191	30296
title	0	1.00	1	191	29563
original_language	0	1.00	2	2	97
overview	1286	0.96	1	1000	31020
tagline	19835	0.39	1	237	12513
poster_path	4474	0.86	30	32	28048
status	0	1.00	7	15	4
backdrop_path	18995	0.42	29	32	13536
genre_names	0	1.00	6	144	772
collection_name	30234	0.07	4	56	815

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
release_date	0	1	1950-01-01	2022-12-31	2012-12-09	10999

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
adult	0	1	0	FAL: 32540

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	445910.83	305744.67	17	146494.8	426521.00	707534.00	1033095.00	▇▆▆▅▅
popularity	0	1.00	4.01	37.51	0	0.6	0.84	2.24	5088.58	▇▁▁▁▁
vote_count	0	1.00	62.69	420.89	0	0.0	2.00	11.00	16900.00	▇▁▁▁▁
vote_average	0	1.00	3.34	2.88	0	0.0	4.00	5.70	10.00	▇▂▆▃▁
budget	0	1.00	543126.59	4542667.81	0	0.0	0.00	0.00	200000000.00	▇▁▁▁▁
revenue	0	1.00	1349746.73	14430479.15	0	0.0	0.00	0.00	701842551.00	▇▁▁▁▁
runtime	0	1.00	62.14	41.00	0	14.0	80.00	91.00	683.00	▇▁▁▁▁
collection	30234	0.07	481534.88	324498.16	656	155421.0	471259.00	759067.25	1033032.00	▇▅▅▅▅

# Summarise every column of the IKEA data (missingness, uniqueness, spread).
skim(data = ikea)

Data summary
Name	ikea
Number of rows	3694
Number of columns	14
_______________________
Column type frequency:
character	7
logical	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
name	1	3	27	607
category	1	4	36	17
old_price	1	4	13	365
link	1	52	163	2962
other_colors	1	2	3	2
short_description	1	3	63	1706
designer	1	3	1261	381

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
sellable_online	0	1	0.99	TRU: 3666, FAL: 28

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
…1	0	1.00	1846.50	1066.51	0	923.25	1846.5	2769.75	3693	▇▇▇▇▇
item_id	0	1.00	48632396.79	28887094.10	58487	20390574.00	49288078.0	70403572.75	99932615	▇▇▇▇▇
price	0	1.00	1078.21	1374.65	3	180.90	544.7	1429.50	9585	▇▁▁▁▁
depth	1463	0.60	54.38	29.96	1	38.00	47.0	60.00	257	▇▃▁▁▁
height	988	0.73	101.68	61.10	1	67.00	83.0	124.00	700	▇▂▁▁▁
width	589	0.84	104.47	71.13	1	60.00	80.0	140.00	420	▇▅▂▁▁

# Build the analysis table from the raw movie data.
# NOTE(review): na.omit() removes a row when ANY remaining column is missing
# (overview, tagline, poster_path and backdrop_path all have missing values in
# the skim summary). The glimpse() output further down reports 7,449 rows
# versus 32,540 originally -- confirm that this much row loss is intended.
data <- na.omit(
    # collection and collection_name are only ~7% complete, so the columns are
    # dropped instead of the rows that lack them.
    select(horror_movies, -c(collection, collection_name))
)

# Log-transforming positively skewed variables: not applicable here.

Explore Data

Identify good predictors of the average rating.

budget

# Scatter plot: average rating on the x-axis, budget on the y-axis.
ggplot(data, aes(x = vote_average, y = budget)) +
    geom_point()

runtime

# Scatter plot: average rating on the x-axis, runtime on the y-axis.
ggplot(data, aes(x = vote_average, y = runtime)) +
    geom_point()

overview

data %>%

    # Split each overview into one row per word
    unnest_tokens(output = word, input = overview) %>%

    # Mean rating and number of occurrences of every word
    group_by(word) %>%
    summarise(
        vote_average = mean(vote_average),
        n_uses       = n()
    ) %>%
    ungroup() %>%

    # Keep words seen more than 10 times that contain no digits,
    # then keep the 20 with the highest mean rating
    filter(n_uses > 10 & !str_detect(word, "\\d")) %>%
    slice_max(order_by = vote_average, n = 20) %>%

    # Order the words by mean rating so the y-axis is sorted
    mutate(word = fct_reorder(word, vote_average)) %>%

    # Plot mean rating per word
    ggplot(aes(x = vote_average, y = word)) +
    geom_point() +
    labs(y = "Words in Overview")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-overview, -title, -original_title, -poster_path, -backdrop_path, -tagline, -release_date) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,449
## Columns: 52
## $ `id__-Inf_40145`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__40145_171045                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__171045_542713                                <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ id__542713_Inf                                   <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ original_language__en                            <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ original_language__es                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__fr                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__it                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__ja                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__ko                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ original_language__th                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `original_language__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `popularity__-Inf_1.541`                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__1.541_3.623                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__3.623_8.634                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ popularity__8.634_Inf                            <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `vote_count__-Inf_6`                             <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_count__6_26                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_count__26_108                               <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ vote_count__108_Inf                              <dbl> 1, 1, 1, 1, 1, 1, 0, …
## $ `vote_average__-Inf_4.2`                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_average__4.2_5.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ vote_average__5.3_6.1                            <dbl> 0, 0, 0, 0, 1, 0, 0, …
## $ vote_average__6.1_Inf                            <dbl> 1, 1, 1, 1, 0, 1, 1, …
## $ `budget__-Inf_5000`                              <dbl> 1, 1, 0, 0, 0, 0, 1, …
## $ budget__5000_Inf                                 <dbl> 0, 0, 1, 1, 1, 1, 0, …
## $ revenue__0                                       <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `revenue__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 0, …
## $ `runtime__-Inf_81`                               <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ runtime__81_89                                   <dbl> 0, 0, 0, 0, 1, 0, 0, …
## $ runtime__89_96                                   <dbl> 0, 1, 0, 0, 0, 0, 1, …
## $ runtime__96_Inf                                  <dbl> 1, 0, 1, 1, 0, 1, 0, …
## $ status__Released                                 <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `status__-OTHER`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Science_Fiction`   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Thriller`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror,_Science_Fiction`   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror,_Thriller`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Crime,_Horror,_Thriller`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror`                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror,_Mystery`            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror,_Thriller`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Fantasy,_Horror`                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names__Horror                              <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery`                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery,_Thriller`         <dbl> 0, 0, 1, 0, 1, 0, 0, …
## $ `genre_names__Horror,_Science_Fiction`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Science_Fiction,_Thriller` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `genre_names__Horror,_Thriller`                  <dbl> 1, 0, 0, 1, 0, 0, 0, …
## $ `genre_names__-OTHER`                            <dbl> 0, 1, 0, 0, 0, 1, 1, …

# Step 2: Correlate
# Correlate every binary column with the top rating bin
# (the vote_average__6.1_Inf column).
data_corr_tbl <- correlate(data_binarized_tbl, target = vote_average__6.1_Inf)

data_corr_tbl

## # A tibble: 52 × 3
##    feature      bin       correlation
##    <fct>        <chr>           <dbl>
##  1 vote_average 6.1_Inf         1    
##  2 vote_average 4.2_5.3        -0.332
##  3 vote_average -Inf_4.2       -0.318
##  4 vote_average 5.3_6.1        -0.311
##  5 vote_count   108_Inf         0.263
##  6 popularity   8.634_Inf       0.234
##  7 revenue      -OTHER          0.211
##  8 revenue      0              -0.211
##  9 runtime      96_Inf          0.179
## 10 vote_count   6_26           -0.155
## # ℹ 42 more rows

# Step 3: Plot
# Draw the correlation funnel for the table computed in Step 2.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Apply 1: Horror Movies

2024-09-12

Import Data

Explore Data

Preprocess Data

Build Models

Evaluate Models

Make Predictions