# Explore data

library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Binarize every column except `id` and `title`: binarize() bins numeric
# columns into ranges and one-hot encodes categorical columns, producing the
# 0/1 indicator table that is passed to correlate() further down.
# NOTE(review): an earlier comment here mentioned a `vote_log` column, but no
# such column appears in the binarized output; the `vote_average` bin edges
# (about 1.67-1.97) suggest it may already hold log-scaled values -- confirm
# against the upstream preparation of `horror_df`.
data_binarized_tbl <- horror_df %>%
  select(-c(id, title)) %>%
  binarize()

# Inspect the generated indicator columns and their types.
glimpse(data_binarized_tbl)
## Rows: 4,949
## Columns: 35
## $ `popularity__-Inf_1.55`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__1.55_4.882                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__4.882_13.017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__13.017_Inf                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_5`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__5_36                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__36_205                              <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_count__205_Inf                             <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `vote_average__-Inf_1.66770682055808`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.66770682055808_1.85629799036563 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.97408102602201 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_average__1.97408102602201_Inf              <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `budget__-Inf_7e+05`                            <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+05_Inf`                             <dbl> 0, 0, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_81`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__81_90                                  <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__90_99                                  <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ runtime__99_Inf                                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ genre_names__Action                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Animation                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror                             <dbl> 1, 0, 1, 0, 0, 1, 0, 0…
## $ genre_names__Mystery                            <dbl> 0, 0, 0, 1, 0, 0, 1, 0…
## $ genre_names__Science_Fiction                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller                           <dbl> 0, 1, 0, 0, 1, 0, 0, 1…
## $ genre_names__TV_Movie                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `genre_names__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `collection__-Inf_133352`                       <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ collection__133352_459212                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__459212_744915                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__744915_Inf                          <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
# List the indicator column names produced by binarize().
names(data_binarized_tbl)
##  [1] "popularity__-Inf_1.55"                          
##  [2] "popularity__1.55_4.882"                         
##  [3] "popularity__4.882_13.017"                       
##  [4] "popularity__13.017_Inf"                         
##  [5] "vote_count__-Inf_5"                             
##  [6] "vote_count__5_36"                               
##  [7] "vote_count__36_205"                             
##  [8] "vote_count__205_Inf"                            
##  [9] "vote_average__-Inf_1.66770682055808"            
## [10] "vote_average__1.66770682055808_1.85629799036563"
## [11] "vote_average__1.85629799036563_1.97408102602201"
## [12] "vote_average__1.97408102602201_Inf"             
## [13] "budget__-Inf_7e+05"                             
## [14] "budget__7e+05_Inf"                              
## [15] "runtime__-Inf_81"                               
## [16] "runtime__81_90"                                 
## [17] "runtime__90_99"                                 
## [18] "runtime__99_Inf"                                
## [19] "genre_names__Action"                            
## [20] "genre_names__Adventure"                         
## [21] "genre_names__Animation"                         
## [22] "genre_names__Comedy"                            
## [23] "genre_names__Crime"                             
## [24] "genre_names__Drama"                             
## [25] "genre_names__Fantasy"                           
## [26] "genre_names__Horror"                            
## [27] "genre_names__Mystery"                           
## [28] "genre_names__Science_Fiction"                   
## [29] "genre_names__Thriller"                          
## [30] "genre_names__TV_Movie"                          
## [31] "genre_names__-OTHER"                            
## [32] "collection__-Inf_133352"                        
## [33] "collection__133352_459212"                      
## [34] "collection__459212_744915"                      
## [35] "collection__744915_Inf"
# Correlate every binarized feature with the top `vote_average` bin.
#
# The target column is looked up by name pattern instead of being hard-coded:
# binarize() embeds the data-dependent cut point in the column name (e.g.
# `vote_average__1.97408102602201_Inf`), so a literal name stops matching as
# soon as the underlying data changes.
target_col <- grep(
  "^vote_average__.*_Inf$",
  names(data_binarized_tbl),
  value = TRUE
)

# Fail early with a clear message if the expected bin is missing or ambiguous.
if (length(target_col) != 1L) {
  stop(
    "Expected exactly one top `vote_average` bin, found ",
    length(target_col), ".",
    call. = FALSE
  )
}

# correlate() takes `target` as an unquoted column, so the looked-up name is
# converted to a symbol and injected with `!!`.
data_corr_tbl <- data_binarized_tbl %>%
  correlate(target = !!as.name(target_col))

data_corr_tbl
## # A tibble: 35 × 3
##    feature      bin                               correlation
##    <fct>        <chr>                                   <dbl>
##  1 vote_average 1.97408102602201_Inf                    1    
##  2 vote_average 1.66770682055808_1.85629799036563      -0.335
##  3 vote_average -Inf_1.66770682055808                  -0.330
##  4 vote_average 1.85629799036563_1.97408102602201      -0.324
##  5 popularity   13.017_Inf                              0.243
##  6 vote_count   205_Inf                                 0.214
##  7 runtime      99_Inf                                  0.196
##  8 vote_count   5_36                                   -0.123
##  9 popularity   -Inf_1.55                              -0.115
## 10 popularity   1.55_4.882                             -0.109
## # ℹ 25 more rows
# Draw the correlation funnel for the target bin. ggrepel leaves points
# unlabeled when their labels would overlap too much, so not every bin is
# annotated in the plot.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Preprocess data

# Build models

# Evaluate models

# Make predictions