Explore Data

horror_movies %>% ggplot(aes(genre_names, popularity)) + scale_y_log10() + geom_point()
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

horror_movies %>% ggplot(aes(popularity, as.factor(vote_average))) + geom_boxplot()

horror_movies %>%
    unnest_tokens(output = word, input = title) %>%
    filter(!str_detect(word, "\\d")) %>%
    count(word, wt = popularity, sort = TRUE) %>%
    filter(n > 10) %>%
    slice_max(order_by = n, n = 20) %>%
    ggplot(aes(n, fct_reorder(word, n))) +
    geom_point(color = "blue") +
    labs(x = "Popularity Weight", y = "Words in Title", title = "Most Popular Words in Horror Movie Titles") +
    theme_minimal()

Model Setup

xgboost_recipe <- recipe(vote_log ~ ., data = data_train) %>%
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  step_other(genre_names) %>%
  step_dummy(genre_names, one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors())

# Check transformed dataset
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 1,844
## Columns: 110
## $ id                           <dbl> -0.5835434, -0.5852877, -0.5429932, -0.84…
## $ popularity                   <dbl> -0.09137700, -0.05081380, -0.08644133, 0.…
## $ vote_count                   <dbl> -0.31863235, 0.09639389, -0.32687791, 0.0…
## $ vote_average                 <dbl> 0.47082623, 0.14664503, 0.33189143, 1.304…
## $ collection                   <dbl> -0.40541840, -0.82537485, -1.09812930, -1…
## $ vote_log                     <dbl> 1.902108, 1.791759, 1.856298, 2.140066, 1…
## $ tfidf_title_1                <dbl> -0.06516277, -0.06516277, -0.06516277, -0…
## $ tfidf_title_2                <dbl> 1.1326826, -0.3259927, -0.3259927, -0.325…
## $ tfidf_title_3                <dbl> -0.1884525, -0.1884525, -0.1884525, -0.18…
## $ tfidf_title_4                <dbl> -0.120108, -0.120108, -0.120108, -0.12010…
## $ tfidf_title_5                <dbl> -0.1007076, -0.1007076, -0.1007076, -0.10…
## $ tfidf_title_6                <dbl> -0.08292948, -0.08292948, -0.08292948, -0…
## $ tfidf_title_a                <dbl> -0.1271264, -0.1271264, -0.1271264, -0.12…
## $ tfidf_title_alien            <dbl> -0.06075259, -0.06075259, -0.06075259, -0…
## $ tfidf_title_all              <dbl> -0.06417432, -0.06417432, -0.06417432, -0…
## $ tfidf_title_amityville       <dbl> -0.0809767, -0.0809767, -0.0809767, -0.08…
## $ tfidf_title_and              <dbl> -0.09768483, -0.09768483, -0.09768483, -0…
## $ tfidf_title_angel            <dbl> -0.06082742, -0.06082742, -0.06082742, -0…
## $ tfidf_title_atta             <dbl> -0.1646822, -0.1646822, -0.1646822, -0.16…
## $ tfidf_title_attack           <dbl> -0.07948741, -0.07948741, -0.07948741, -0…
## $ tfidf_title_back             <dbl> -0.06290499, -0.06290499, -0.06290499, -0…
## $ tfidf_title_beast            <dbl> -0.07928465, -0.07928465, -0.07928465, -0…
## $ tfidf_title_beginning        <dbl> -0.06057836, -0.06057836, -0.06057836, -0…
## $ tfidf_title_blood            <dbl> -0.1293686, -0.1293686, -0.1293686, -0.12…
## $ tfidf_title_bloody           <dbl> -0.06635186, -0.06635186, -0.06635186, -0…
## $ tfidf_title_bong             <dbl> -0.06769927, -0.06769927, -0.06769927, -0…
## $ tfidf_title_camp             <dbl> -0.08656444, -0.08656444, -0.08656444, -0…
## $ tfidf_title_chainsaw         <dbl> -0.05839193, -0.05839193, -0.05839193, -0…
## $ tfidf_title_children         <dbl> -0.07517554, -0.07517554, -0.07517554, -0…
## $ tfidf_title_corn             <dbl> -0.06846723, -0.06846723, -0.06846723, -0…
## $ tfidf_title_curse            <dbl> -0.1166804, -0.1166804, -0.1166804, -0.11…
## $ tfidf_title_dark             <dbl> -0.06938116, -0.06938116, -0.06938116, -0…
## $ tfidf_title_darkness         <dbl> -0.0765481, -0.0765481, -0.0765481, -0.07…
## $ tfidf_title_dead             <dbl> -0.1338139, -0.1338139, -0.1338139, -0.13…
## $ tfidf_title_death            <dbl> -0.114540, 4.920304, -0.114540, -0.114540…
## $ tfidf_title_demon            <dbl> -0.06000443, -0.06000443, -0.06000443, -0…
## $ tfidf_title_demons           <dbl> -0.05795978, -0.05795978, -0.05795978, -0…
## $ tfidf_title_devil            <dbl> -0.1047804, -0.1047804, -0.1047804, -0.10…
## $ tfidf_title_dracula          <dbl> -0.08879108, -0.08879108, -0.08879108, -0…
## $ tfidf_title_eizou            <dbl> -0.09202731, -0.09202731, -0.09202731, -0…
## $ tfidf_title_evil             <dbl> -0.1472014, -0.1472014, -0.1472014, -0.14…
## $ tfidf_title_faces            <dbl> -0.07454034, -0.07454034, -0.07454034, -0…
## $ tfidf_title_fear             <dbl> -0.08044309, -0.08044309, -0.08044309, -0…
## $ tfidf_title_file             <dbl> -0.05400971, -0.05400971, -0.05400971, -0…
## $ tfidf_title_final            <dbl> -0.07945224, -0.07945224, -0.07945224, -0…
## $ tfidf_title_forest           <dbl> -0.05374135, -0.05374135, -0.05374135, -0…
## $ tfidf_title_from             <dbl> -0.1095897, -0.1095897, -0.1095897, -0.10…
## $ tfidf_title_fuuin            <dbl> -0.09202731, -0.09202731, -0.09202731, -0…
## $ tfidf_title_ghost            <dbl> -0.1290547, -0.1290547, -0.1290547, -0.12…
## $ tfidf_title_halloween        <dbl> -0.09227065, -0.09227065, -0.09227065, -0…
## $ tfidf_title_haunted          <dbl> -0.08897714, -0.08897714, -0.08897714, -0…
## $ tfidf_title_haunting         <dbl> -0.06486899, -0.06486899, -0.06486899, -0…
## $ tfidf_title_hell             <dbl> -0.07546232, -0.07546232, -0.07546232, -0…
## $ tfidf_title_hellraiser       <dbl> -0.06687132, -0.06687132, -0.06687132, -0…
## $ tfidf_title_high             <dbl> -0.06564481, -0.06564481, -0.06564481, -0…
## $ tfidf_title_honto            <dbl> -0.1646822, -0.1646822, -0.1646822, -0.16…
## $ tfidf_title_horror           <dbl> -0.1044226, -0.1044226, -0.1044226, -0.10…
## $ tfidf_title_house            <dbl> -0.1291866, -0.1291866, -0.1291866, -0.12…
## $ tfidf_title_ii               <dbl> -0.192339, -0.192339, -0.192339, -0.19233…
## $ tfidf_title_iii              <dbl> -0.1373777, -0.1373777, -0.1373777, -0.13…
## $ tfidf_title_in               <dbl> -0.1275767, -0.1275767, -0.1275767, -0.12…
## $ tfidf_title_iv               <dbl> -0.08670574, -0.08670574, -0.08670574, -0…
## $ tfidf_title_killer           <dbl> -0.1020256, -0.1020256, -0.1020256, -0.10…
## $ tfidf_title_last             <dbl> -0.06057478, -0.06057478, -0.06057478, -0…
## $ tfidf_title_legend           <dbl> -0.09095487, -0.09095487, -0.09095487, -0…
## $ tfidf_title_living           <dbl> -0.07275189, -0.07275189, -0.07275189, -0…
## $ tfidf_title_man              <dbl> -0.07969301, -0.07969301, -0.07969301, -0…
## $ tfidf_title_massacre         <dbl> -0.1017334, -0.1017334, -0.1017334, -0.10…
## $ tfidf_title_master           <dbl> -0.08590297, -0.08590297, -0.08590297, -0…
## $ tfidf_title_me               <dbl> -0.06897686, -0.06897686, -0.06897686, -0…
## $ tfidf_title_mr               <dbl> -0.0640164, -0.0640164, -0.0640164, -0.06…
## $ tfidf_title_my               <dbl> -0.07019799, -0.07019799, -0.07019799, -0…
## $ tfidf_title_new              <dbl> -0.06507755, -0.06507755, -0.06507755, -0…
## $ tfidf_title_ni               <dbl> -0.138583, -0.138583, -0.138583, -0.13858…
## $ tfidf_title_night            <dbl> -0.1445632, -0.1445632, -0.1445632, -0.14…
## $ tfidf_title_nightmare        <dbl> -0.06754714, -0.06754714, -0.06754714, -0…
## $ tfidf_title_no               <dbl> -0.1673844, -0.1673844, -0.1673844, -0.16…
## $ tfidf_title_noroi            <dbl> -0.1691135, -0.1691135, -0.1691135, -0.16…
## $ tfidf_title_of               <dbl> 1.7432111, 1.7432111, 6.0175742, -0.39397…
## $ tfidf_title_on               <dbl> -0.09871148, -0.09871148, -0.09871148, -0…
## $ tfidf_title_part             <dbl> -0.1046969, -0.1046969, -0.1046969, -0.10…
## $ tfidf_title_pop              <dbl> -0.07347115, -0.07347115, -0.07347115, -0…
## $ tfidf_title_psycho           <dbl> -0.06473675, -0.06473675, -0.06473675, -0…
## $ tfidf_title_puppet           <dbl> -0.07726184, -0.07726184, -0.07726184, -0…
## $ tfidf_title_rattle           <dbl> -0.0727781, -0.0727781, -0.0727781, -0.07…
## $ tfidf_title_return           <dbl> -0.1141155, -0.1141155, -0.1141155, -0.11…
## $ tfidf_title_returns          <dbl> -0.06716826, -0.06716826, -0.06716826, -0…
## $ tfidf_title_revenge          <dbl> -0.08819256, -0.08819256, -0.08819256, -0…
## $ tfidf_title_roll             <dbl> -0.07639719, -0.07639719, -0.07639719, -0…
## $ tfidf_title_shake            <dbl> -0.0727781, -0.0727781, -0.0727781, -0.07…
## $ tfidf_title_shark            <dbl> -0.08910989, -0.08910989, -0.08910989, -0…
## $ tfidf_title_story            <dbl> -0.07172151, -0.07172151, -0.07172151, -0…
## $ tfidf_title_tales            <dbl> -0.094582, -0.094582, -0.094582, -0.09458…
## $ tfidf_title_terror           <dbl> 6.34128001, -0.08557607, -0.08557607, -0.…
## $ tfidf_title_the              <dbl> -0.5911553, 0.5309017, -0.5911553, -0.591…
## $ tfidf_title_to               <dbl> -0.08410205, -0.08410205, -0.08410205, -0…
## $ tfidf_title_tokyo            <dbl> -0.07695152, -0.07695152, -0.07695152, -0…
## $ tfidf_title_troublesome      <dbl> -0.0796682, -0.0796682, -0.0796682, -0.07…
## $ tfidf_title_v                <dbl> -0.07320981, -0.07320981, -0.07320981, -0…
## $ tfidf_title_vampire          <dbl> -0.08349751, -0.08349751, -0.08349751, 16…
## $ tfidf_title_video            <dbl> -0.1379243, -0.1379243, -0.1379243, -0.13…
## $ tfidf_title_vol              <dbl> -0.09689733, -0.09689733, -0.09689733, -0…
## $ tfidf_title_volume           <dbl> -0.0729593, -0.0729593, -0.0729593, -0.07…
## $ tfidf_title_vs               <dbl> -0.1068558, -0.1068558, -0.1068558, -0.10…
## $ tfidf_title_witchcraft       <dbl> -0.07941886, -0.07941886, -0.07941886, -0…
## $ tfidf_title_zombie           <dbl> -0.08946271, -0.08946271, -0.08946271, -0…
## $ genre_names_Comedy..Horror   <dbl> -0.3318356, -0.3318356, -0.3318356, -0.33…
## $ genre_names_Horror           <dbl> 1.4132552, 1.4132552, -0.7072026, -0.7072…
## $ genre_names_Horror..Thriller <dbl> -0.334847, -0.334847, -0.334847, -0.33484…
## $ genre_names_other            <dbl> -0.9346175, -0.9346175, 1.0693762, 1.0693…
# Specify model
xgboost_spec <- boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and model into a workflow
xgboost_workflow <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

# Tune hyperparameters
set.seed(344)
xgboost_tune <- tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry

Evaluate Model

# Check tuning results
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator    mean     n std_err .config 
##   <int> <int> <int>      <dbl> <chr>   <chr>        <dbl> <int>   <dbl> <chr>   
## 1    65   768    12    0.112   rmse    standard   0.00924     5 0.00132 Preproc…
## 2    86  1524    23    0.0836  rmse    standard   0.0133      5 0.00143 Preproc…
## 3    35  1104    28    0.00484 rmse    standard   0.0268      5 0.00168 Preproc…
## 4     7  1613    36    0.0290  rmse    standard   0.0423      5 0.00282 Preproc…
## 5    97   162     7    0.00108 rmse    standard   1.07        5 0.00367 Preproc…
# Finalize model with best parameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow, tune::select_best(xgboost_tune, metric = "rmse"))

# Fit on training data and test on test data
data_fit <- tune::last_fit(xgboost_fw, data_split)

# Collect metrics
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard     0.00763 Preprocessor1_Model1
## 2 rsq     standard     1.00    Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
    ggplot(aes(vote_log, .pred)) +
    geom_point(alpha = 0.3, color = "midnightblue") +
    geom_abline(lty = 2, color = "grey50") +
    coord_fixed()