# Explore data
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Binarize every feature except the identifier columns (id, title).
# binarize() expands each remaining column into 0/1 indicator columns named
# <feature>__<bin-or-level>.
# NOTE(review): an earlier comment here mentioned a `vote_log` column; no such
# column shows up in the recorded output -- confirm whether it is expected.
data_binarized_tbl <- horror_df %>%
  select(-c(id, title)) %>%
  binarize()

# Inspect the generated indicator columns.
glimpse(data_binarized_tbl)
## Rows: 4,949
## Columns: 35
## $ `popularity__-Inf_1.55` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__1.55_4.882 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__4.882_13.017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__13.017_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_5` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__5_36 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__36_205 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_count__205_Inf <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `vote_average__-Inf_1.66770682055808` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.66770682055808_1.85629799036563 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.97408102602201 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_average__1.97408102602201_Inf <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `budget__-Inf_7e+05` <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+05_Inf` <dbl> 0, 0, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_81` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__81_90 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__90_99 <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ runtime__99_Inf <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ genre_names__Action <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Animation <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror <dbl> 1, 0, 1, 0, 0, 1, 0, 0…
## $ genre_names__Mystery <dbl> 0, 0, 0, 1, 0, 0, 1, 0…
## $ genre_names__Science_Fiction <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller <dbl> 0, 1, 0, 0, 1, 0, 0, 1…
## $ genre_names__TV_Movie <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `genre_names__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `collection__-Inf_133352` <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ collection__133352_459212 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__459212_744915 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__744915_Inf <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
# List the generated indicator column names; the target passed to correlate()
# further down must be one of these.
names(data_binarized_tbl)
## [1] "popularity__-Inf_1.55"
## [2] "popularity__1.55_4.882"
## [3] "popularity__4.882_13.017"
## [4] "popularity__13.017_Inf"
## [5] "vote_count__-Inf_5"
## [6] "vote_count__5_36"
## [7] "vote_count__36_205"
## [8] "vote_count__205_Inf"
## [9] "vote_average__-Inf_1.66770682055808"
## [10] "vote_average__1.66770682055808_1.85629799036563"
## [11] "vote_average__1.85629799036563_1.97408102602201"
## [12] "vote_average__1.97408102602201_Inf"
## [13] "budget__-Inf_7e+05"
## [14] "budget__7e+05_Inf"
## [15] "runtime__-Inf_81"
## [16] "runtime__81_90"
## [17] "runtime__90_99"
## [18] "runtime__99_Inf"
## [19] "genre_names__Action"
## [20] "genre_names__Adventure"
## [21] "genre_names__Animation"
## [22] "genre_names__Comedy"
## [23] "genre_names__Crime"
## [24] "genre_names__Drama"
## [25] "genre_names__Fantasy"
## [26] "genre_names__Horror"
## [27] "genre_names__Mystery"
## [28] "genre_names__Science_Fiction"
## [29] "genre_names__Thriller"
## [30] "genre_names__TV_Movie"
## [31] "genre_names__-OTHER"
## [32] "collection__-Inf_133352"
## [33] "collection__133352_459212"
## [34] "collection__459212_744915"
## [35] "collection__744915_Inf"
# Correlate every binarized feature with the top vote_average bin.
# The bin edges embedded in the column name are computed from the data by
# binarize() (e.g. "vote_average__1.97408102602201_Inf"), so the column is
# looked up by pattern instead of being typed out: a hard-coded label stops
# matching as soon as the underlying data changes.
target_col <- grep(
  "^vote_average__.*_Inf$",
  names(data_binarized_tbl),
  value = TRUE
)
# Fail loudly if the expected top bin is missing or ambiguous.
stopifnot(length(target_col) == 1L)

data_corr_tbl <- data_binarized_tbl %>%
  correlate(target = !!as.name(target_col))

# Print the feature/bin correlations against the target.
data_corr_tbl
## # A tibble: 35 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 vote_average 1.97408102602201_Inf 1
## 2 vote_average 1.66770682055808_1.85629799036563 -0.335
## 3 vote_average -Inf_1.66770682055808 -0.330
## 4 vote_average 1.85629799036563_1.97408102602201 -0.324
## 5 popularity 13.017_Inf 0.243
## 6 vote_count 205_Inf 0.214
## 7 runtime 99_Inf 0.196
## 8 vote_count 5_36 -0.123
## 9 popularity -Inf_1.55 -0.115
## 10 popularity 1.55_4.882 -0.109
## # ℹ 25 more rows
# Draw the correlation funnel for the chosen target bin.
# ggrepel may leave some points unlabeled when their labels overlap.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Preprocess data
# Build models
# Evaluate models
# Make predictions