Apply to Your Data 1: Horror Movies

# Import Data:

# Download the TidyTuesday (2022-11-01) horror movies CSV and keep the result
# under both names used in this analysis: `horror_movies` and `horror`.
horror_movies <- readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv'
)
horror <- horror_movies

## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Summarise every column (type, missingness, distribution) before cleaning.
# Called directly rather than through a pipe so the summary reports the
# data name as `horror`.
skimr::skim(data = horror)

Data summary
Name	horror
Number of rows	32540
Number of columns	20
_______________________
Column type frequency:
character	10
Date	1
logical	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
original_title	0	1.00	1	191	30296
title	0	1.00	1	191	29563
original_language	0	1.00	2	2	97
overview	1286	0.96	1	1000	31020
tagline	19835	0.39	1	237	12513
poster_path	4474	0.86	30	32	28048
status	0	1.00	7	15	4
backdrop_path	18995	0.42	29	32	13536
genre_names	0	1.00	6	144	772
collection_name	30234	0.07	4	56	815

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
release_date	0	1	1950-01-01	2022-12-31	2012-12-09	10999

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
adult	0	1	0	FAL: 32540

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	445910.83	305744.67	17	146494.8	426521.00	707534.00	1033095.00	▇▆▆▅▅
popularity	0	1.00	4.01	37.51	0	0.6	0.84	2.24	5088.58	▇▁▁▁▁
vote_count	0	1.00	62.69	420.89	0	0.0	2.00	11.00	16900.00	▇▁▁▁▁
vote_average	0	1.00	3.34	2.88	0	0.0	4.00	5.70	10.00	▇▂▆▃▁
budget	0	1.00	543126.59	4542667.81	0	0.0	0.00	0.00	200000000.00	▇▁▁▁▁
revenue	0	1.00	1349746.73	14430479.15	0	0.0	0.00	0.00	701842551.00	▇▁▁▁▁
runtime	0	1.00	62.14	41.00	0	14.0	80.00	91.00	683.00	▇▁▁▁▁
collection	30234	0.07	481534.88	324498.16	656	155421.0	471259.00	759067.25	1033032.00	▇▅▅▅▅

# Build the modelling table from the raw horror data.
data <- horror %>%

    # Drop columns that are not carried forward: tagline is about 61% missing
    # (see the skim summary); release_date is complete but is excluded as well.
    select(-tagline, -release_date) %>%

    # Remove rows where budget or revenue is 0.
    # NOTE(review): presumably 0 is a placeholder for "unknown" - confirm.
    filter(budget != 0, revenue != 0) %>%

    # Keep complete cases only. This also removes every film with a missing
    # collection / collection_name, poster_path, backdrop_path or overview.
    na.omit() %>%

    # log(0) is -Inf, so unrated films (vote_average == 0) are removed before
    # the transform; otherwise the target would contain non-finite values.
    filter(vote_average > 0) %>%

    # Log transform the target variable
    mutate(vote_average = log(vote_average))

Goal: Build a regression model to predict the average movie rating (vote_average) using the horror_movies dataset.

# Explore Data:

Identify Good Predictors.

Budget

# Scatter plot of budget (log10 y-axis) against the log-transformed rating
ggplot(data, aes(x = vote_average, y = budget)) +
    geom_point() +
    scale_y_log10()

# One horizontal box per distinct runtime value, showing the spread of the
# log-transformed rating within that runtime.
ggplot(data, aes(x = vote_average, y = as.factor(runtime))) +
    geom_boxplot()

Correlation Plot

# Step 1: Prepare Data
# Remove the identifier and free-text columns, then binarize what is left
# into 0/1 indicator columns.
data_binarize_tbl <- binarize(
    select(data, -c(id, original_title, title, overview))
)

# Inspect the resulting indicator columns
glimpse(data_binarize_tbl)

## Rows: 390
## Columns: 76
## $ original_language__en                                     <dbl> 1, 1, 1, 1, …
## $ original_language__es                                     <dbl> 0, 0, 0, 0, …
## $ original_language__hi                                     <dbl> 0, 0, 0, 0, …
## $ original_language__ja                                     <dbl> 0, 0, 0, 0, …
## $ original_language__ko                                     <dbl> 0, 0, 0, 0, …
## $ `original_language__-OTHER`                               <dbl> 0, 0, 0, 0, …
## $ `poster_path__/11tOuxWiGFzL60bVwoiF9SSoMLc.jpg`           <dbl> 0, 0, 0, 0, …
## $ `poster_path__-OTHER`                                     <dbl> 1, 1, 1, 1, …
## $ `popularity__-Inf_14.8335`                                <dbl> 0, 0, 0, 0, …
## $ popularity__14.8335_27.89                                 <dbl> 0, 0, 0, 0, …
## $ popularity__27.89_51.0185                                 <dbl> 0, 0, 0, 0, …
## $ popularity__51.0185_Inf                                   <dbl> 1, 1, 1, 1, …
## $ `vote_count__-Inf_424`                                    <dbl> 1, 0, 0, 0, …
## $ vote_count__424_1095.5                                    <dbl> 0, 1, 0, 0, …
## $ vote_count__1095.5_2517.5                                 <dbl> 0, 0, 1, 1, …
## $ vote_count__2517.5_Inf                                    <dbl> 0, 0, 0, 0, …
## $ `vote_average__-Inf_1.7227665977411`                      <dbl> 0, 0, 0, 0, …
## $ vote_average__1.7227665977411_1.80828877117927            <dbl> 1, 0, 0, 0, …
## $ vote_average__1.80828877117927_1.88706964903238           <dbl> 0, 0, 0, 0, …
## $ vote_average__1.88706964903238_Inf                        <dbl> 0, 1, 1, 1, …
## $ `budget__-Inf_2500000`                                    <dbl> 0, 0, 0, 0, …
## $ budget__2500000_8150000                                   <dbl> 0, 0, 0, 0, …
## $ `budget__8150000_1.9e+07`                                 <dbl> 0, 1, 0, 0, …
## $ `budget__1.9e+07_Inf`                                     <dbl> 1, 0, 1, 1, …
## $ `revenue__-Inf_10341767.5`                                <dbl> 1, 0, 0, 0, …
## $ revenue__10341767.5_33900000                              <dbl> 0, 1, 0, 0, …
## $ revenue__33900000_91132596.5                              <dbl> 0, 0, 0, 0, …
## $ revenue__91132596.5_Inf                                   <dbl> 0, 0, 1, 1, …
## $ `runtime__-Inf_89`                                        <dbl> 1, 0, 0, 0, …
## $ runtime__89_96                                            <dbl> 0, 0, 0, 0, …
## $ runtime__96_104.75                                        <dbl> 0, 0, 0, 0, …
## $ runtime__104.75_Inf                                       <dbl> 0, 1, 1, 1, …
## $ `backdrop_path__/114yPZmKgsQVTSt0BDx5WagBOvW.jpg`         <dbl> 0, 0, 0, 0, …
## $ `backdrop_path__-OTHER`                                   <dbl> 1, 1, 1, 1, …
## $ `genre_names__Action,_Adventure,_Horror,_Science_Fiction` <dbl> 0, 0, 0, 0, …
## $ `genre_names__Action,_Fantasy,_Horror`                    <dbl> 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Science_Fiction`            <dbl> 0, 0, 0, 0, …
## $ `genre_names__Action,_Horror,_Thriller`                   <dbl> 0, 0, 0, 0, …
## $ `genre_names__Adventure,_Horror,_Thriller`                <dbl> 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Fantasy,_Horror`                    <dbl> 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror`                             <dbl> 0, 0, 0, 0, …
## $ `genre_names__Comedy,_Horror,_Science_Fiction`            <dbl> 0, 0, 0, 0, …
## $ `genre_names__Crime,_Horror,_Thriller`                    <dbl> 0, 0, 0, 0, …
## $ `genre_names__Drama,_Horror,_Thriller`                    <dbl> 0, 0, 0, 0, …
## $ `genre_names__Fantasy,_Horror`                            <dbl> 0, 0, 0, 0, …
## $ genre_names__Horror                                       <dbl> 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery`                            <dbl> 0, 0, 0, 0, …
## $ `genre_names__Horror,_Mystery,_Thriller`                  <dbl> 1, 1, 0, 1, …
## $ `genre_names__Horror,_Science_Fiction`                    <dbl> 0, 0, 0, 0, …
## $ `genre_names__Horror,_Science_Fiction,_Thriller`          <dbl> 0, 0, 0, 0, …
## $ `genre_names__Horror,_Thriller`                           <dbl> 0, 0, 1, 0, …
## $ `genre_names__-OTHER`                                     <dbl> 0, 0, 0, 0, …
## $ `collection__-Inf_12263`                                  <dbl> 0, 0, 0, 1, …
## $ collection__12263_111751                                  <dbl> 1, 0, 1, 0, …
## $ collection__111751_355090.5                               <dbl> 0, 0, 0, 0, …
## $ collection__355090.5_Inf                                  <dbl> 0, 1, 0, 0, …
## $ `collection_name__[REC]_Collection`                       <dbl> 0, 0, 0, 0, …
## $ collection_name__A_Nightmare_on_Elm_Street_Collection     <dbl> 0, 0, 0, 0, …
## $ `collection_name__Child's_Play_Collection`                <dbl> 0, 0, 0, 0, …
## $ collection_name__Evil_Dead_Collection                     <dbl> 0, 0, 0, 0, …
## $ collection_name__Final_Destination_Collection             <dbl> 0, 0, 0, 0, …
## $ collection_name__Friday_the_13th_Collection               <dbl> 0, 0, 0, 0, …
## $ collection_name__Halloween_Collection                     <dbl> 0, 0, 1, 0, …
## $ collection_name__Hellraiser_Collection                    <dbl> 0, 0, 0, 0, …
## $ collection_name__Insidious_Collection                     <dbl> 0, 0, 0, 0, …
## $ collection_name__Jaws_Collection                          <dbl> 0, 0, 0, 0, …
## $ collection_name__Jeepers_Creepers_Collection              <dbl> 1, 0, 0, 0, …
## $ collection_name__Living_Dead_Collection                   <dbl> 0, 0, 0, 0, …
## $ collection_name__Paranormal_Activity_Collection           <dbl> 0, 0, 0, 0, …
## $ collection_name__Resident_Evil_Collection                 <dbl> 0, 0, 0, 0, …
## $ collection_name__Saw_Collection                           <dbl> 0, 0, 0, 0, …
## $ collection_name__Scream_Collection                        <dbl> 0, 0, 0, 1, …
## $ collection_name__Texas_Chainsaw_Massacre_Collection       <dbl> 0, 0, 0, 0, …
## $ collection_name__The_Exorcist_Collection                  <dbl> 0, 0, 0, 0, …
## $ collection_name__The_Purge_Collection                     <dbl> 0, 0, 0, 0, …
## $ `collection_name__-OTHER`                                 <dbl> 0, 1, 0, 0, …

# Step 2: Correlate
# Use the highest vote_average bin as the target for the correlation.
# NOTE(review): the target name embeds a bin edge computed from the data, so
# it presumably has to be updated whenever the upstream rows change - confirm.
data_corr_tbl <- correlate(
    data_binarize_tbl,
    target = vote_average__1.88706964903238_Inf
)

# Step 3: Plot
# Draw the correlation funnel from the correlation table built in Step 2
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Apply to Your Data 1: Horror Movies

Stephen Morris

2024-02-07