Model Setup
# Preprocessing recipe for the XGBoost regression on vote_log.
#
# vote_average is removed before any other step: the vote_log values shown by
# glimpse() (1.902108, 1.791759, 1.856298, ...) match log(6.7), log(6),
# log(6.4), ..., i.e. the outcome appears to be log(vote_average). Keeping
# that column as a predictor would leak the target into the model.
#
# NOTE(review): id is still treated as a predictor by `vote_log ~ .`; confirm
# that is intended or give it a non-predictor role.
# NOTE(review): re-render the document after this change; any output captured
# with the earlier recipe still lists vote_average (110 columns).
xgboost_recipe <- recipe(vote_log ~ ., data = data_train) %>%
  # Drop the leaking column so it never reaches the model.
  step_rm(vote_average) %>%
  # Title text -> tokens -> keep at most 100 tokens -> tf-idf features.
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  # Pool infrequent genre combinations into "other", then one-hot encode.
  step_other(genre_names) %>%
  step_dummy(genre_names, one_hot = TRUE) %>%
  # Centre and scale every numeric predictor (the outcome is left as is).
  step_normalize(all_numeric_predictors())
# Check transformed dataset: estimate the recipe on its training data and
# inspect the processed columns.
# bake(new_data = NULL) returns the processed training set; it is the
# recommended replacement for the superseded juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 1,844
## Columns: 110
## $ id <dbl> -0.5835434, -0.5852877, -0.5429932, -0.84…
## $ popularity <dbl> -0.09137700, -0.05081380, -0.08644133, 0.…
## $ vote_count <dbl> -0.31863235, 0.09639389, -0.32687791, 0.0…
## $ vote_average <dbl> 0.47082623, 0.14664503, 0.33189143, 1.304…
## $ collection <dbl> -0.40541840, -0.82537485, -1.09812930, -1…
## $ vote_log <dbl> 1.902108, 1.791759, 1.856298, 2.140066, 1…
## $ tfidf_title_1 <dbl> -0.06516277, -0.06516277, -0.06516277, -0…
## $ tfidf_title_2 <dbl> 1.1326826, -0.3259927, -0.3259927, -0.325…
## $ tfidf_title_3 <dbl> -0.1884525, -0.1884525, -0.1884525, -0.18…
## $ tfidf_title_4 <dbl> -0.120108, -0.120108, -0.120108, -0.12010…
## $ tfidf_title_5 <dbl> -0.1007076, -0.1007076, -0.1007076, -0.10…
## $ tfidf_title_6 <dbl> -0.08292948, -0.08292948, -0.08292948, -0…
## $ tfidf_title_a <dbl> -0.1271264, -0.1271264, -0.1271264, -0.12…
## $ tfidf_title_alien <dbl> -0.06075259, -0.06075259, -0.06075259, -0…
## $ tfidf_title_all <dbl> -0.06417432, -0.06417432, -0.06417432, -0…
## $ tfidf_title_amityville <dbl> -0.0809767, -0.0809767, -0.0809767, -0.08…
## $ tfidf_title_and <dbl> -0.09768483, -0.09768483, -0.09768483, -0…
## $ tfidf_title_angel <dbl> -0.06082742, -0.06082742, -0.06082742, -0…
## $ tfidf_title_atta <dbl> -0.1646822, -0.1646822, -0.1646822, -0.16…
## $ tfidf_title_attack <dbl> -0.07948741, -0.07948741, -0.07948741, -0…
## $ tfidf_title_back <dbl> -0.06290499, -0.06290499, -0.06290499, -0…
## $ tfidf_title_beast <dbl> -0.07928465, -0.07928465, -0.07928465, -0…
## $ tfidf_title_beginning <dbl> -0.06057836, -0.06057836, -0.06057836, -0…
## $ tfidf_title_blood <dbl> -0.1293686, -0.1293686, -0.1293686, -0.12…
## $ tfidf_title_bloody <dbl> -0.06635186, -0.06635186, -0.06635186, -0…
## $ tfidf_title_bong <dbl> -0.06769927, -0.06769927, -0.06769927, -0…
## $ tfidf_title_camp <dbl> -0.08656444, -0.08656444, -0.08656444, -0…
## $ tfidf_title_chainsaw <dbl> -0.05839193, -0.05839193, -0.05839193, -0…
## $ tfidf_title_children <dbl> -0.07517554, -0.07517554, -0.07517554, -0…
## $ tfidf_title_corn <dbl> -0.06846723, -0.06846723, -0.06846723, -0…
## $ tfidf_title_curse <dbl> -0.1166804, -0.1166804, -0.1166804, -0.11…
## $ tfidf_title_dark <dbl> -0.06938116, -0.06938116, -0.06938116, -0…
## $ tfidf_title_darkness <dbl> -0.0765481, -0.0765481, -0.0765481, -0.07…
## $ tfidf_title_dead <dbl> -0.1338139, -0.1338139, -0.1338139, -0.13…
## $ tfidf_title_death <dbl> -0.114540, 4.920304, -0.114540, -0.114540…
## $ tfidf_title_demon <dbl> -0.06000443, -0.06000443, -0.06000443, -0…
## $ tfidf_title_demons <dbl> -0.05795978, -0.05795978, -0.05795978, -0…
## $ tfidf_title_devil <dbl> -0.1047804, -0.1047804, -0.1047804, -0.10…
## $ tfidf_title_dracula <dbl> -0.08879108, -0.08879108, -0.08879108, -0…
## $ tfidf_title_eizou <dbl> -0.09202731, -0.09202731, -0.09202731, -0…
## $ tfidf_title_evil <dbl> -0.1472014, -0.1472014, -0.1472014, -0.14…
## $ tfidf_title_faces <dbl> -0.07454034, -0.07454034, -0.07454034, -0…
## $ tfidf_title_fear <dbl> -0.08044309, -0.08044309, -0.08044309, -0…
## $ tfidf_title_file <dbl> -0.05400971, -0.05400971, -0.05400971, -0…
## $ tfidf_title_final <dbl> -0.07945224, -0.07945224, -0.07945224, -0…
## $ tfidf_title_forest <dbl> -0.05374135, -0.05374135, -0.05374135, -0…
## $ tfidf_title_from <dbl> -0.1095897, -0.1095897, -0.1095897, -0.10…
## $ tfidf_title_fuuin <dbl> -0.09202731, -0.09202731, -0.09202731, -0…
## $ tfidf_title_ghost <dbl> -0.1290547, -0.1290547, -0.1290547, -0.12…
## $ tfidf_title_halloween <dbl> -0.09227065, -0.09227065, -0.09227065, -0…
## $ tfidf_title_haunted <dbl> -0.08897714, -0.08897714, -0.08897714, -0…
## $ tfidf_title_haunting <dbl> -0.06486899, -0.06486899, -0.06486899, -0…
## $ tfidf_title_hell <dbl> -0.07546232, -0.07546232, -0.07546232, -0…
## $ tfidf_title_hellraiser <dbl> -0.06687132, -0.06687132, -0.06687132, -0…
## $ tfidf_title_high <dbl> -0.06564481, -0.06564481, -0.06564481, -0…
## $ tfidf_title_honto <dbl> -0.1646822, -0.1646822, -0.1646822, -0.16…
## $ tfidf_title_horror <dbl> -0.1044226, -0.1044226, -0.1044226, -0.10…
## $ tfidf_title_house <dbl> -0.1291866, -0.1291866, -0.1291866, -0.12…
## $ tfidf_title_ii <dbl> -0.192339, -0.192339, -0.192339, -0.19233…
## $ tfidf_title_iii <dbl> -0.1373777, -0.1373777, -0.1373777, -0.13…
## $ tfidf_title_in <dbl> -0.1275767, -0.1275767, -0.1275767, -0.12…
## $ tfidf_title_iv <dbl> -0.08670574, -0.08670574, -0.08670574, -0…
## $ tfidf_title_killer <dbl> -0.1020256, -0.1020256, -0.1020256, -0.10…
## $ tfidf_title_last <dbl> -0.06057478, -0.06057478, -0.06057478, -0…
## $ tfidf_title_legend <dbl> -0.09095487, -0.09095487, -0.09095487, -0…
## $ tfidf_title_living <dbl> -0.07275189, -0.07275189, -0.07275189, -0…
## $ tfidf_title_man <dbl> -0.07969301, -0.07969301, -0.07969301, -0…
## $ tfidf_title_massacre <dbl> -0.1017334, -0.1017334, -0.1017334, -0.10…
## $ tfidf_title_master <dbl> -0.08590297, -0.08590297, -0.08590297, -0…
## $ tfidf_title_me <dbl> -0.06897686, -0.06897686, -0.06897686, -0…
## $ tfidf_title_mr <dbl> -0.0640164, -0.0640164, -0.0640164, -0.06…
## $ tfidf_title_my <dbl> -0.07019799, -0.07019799, -0.07019799, -0…
## $ tfidf_title_new <dbl> -0.06507755, -0.06507755, -0.06507755, -0…
## $ tfidf_title_ni <dbl> -0.138583, -0.138583, -0.138583, -0.13858…
## $ tfidf_title_night <dbl> -0.1445632, -0.1445632, -0.1445632, -0.14…
## $ tfidf_title_nightmare <dbl> -0.06754714, -0.06754714, -0.06754714, -0…
## $ tfidf_title_no <dbl> -0.1673844, -0.1673844, -0.1673844, -0.16…
## $ tfidf_title_noroi <dbl> -0.1691135, -0.1691135, -0.1691135, -0.16…
## $ tfidf_title_of <dbl> 1.7432111, 1.7432111, 6.0175742, -0.39397…
## $ tfidf_title_on <dbl> -0.09871148, -0.09871148, -0.09871148, -0…
## $ tfidf_title_part <dbl> -0.1046969, -0.1046969, -0.1046969, -0.10…
## $ tfidf_title_pop <dbl> -0.07347115, -0.07347115, -0.07347115, -0…
## $ tfidf_title_psycho <dbl> -0.06473675, -0.06473675, -0.06473675, -0…
## $ tfidf_title_puppet <dbl> -0.07726184, -0.07726184, -0.07726184, -0…
## $ tfidf_title_rattle <dbl> -0.0727781, -0.0727781, -0.0727781, -0.07…
## $ tfidf_title_return <dbl> -0.1141155, -0.1141155, -0.1141155, -0.11…
## $ tfidf_title_returns <dbl> -0.06716826, -0.06716826, -0.06716826, -0…
## $ tfidf_title_revenge <dbl> -0.08819256, -0.08819256, -0.08819256, -0…
## $ tfidf_title_roll <dbl> -0.07639719, -0.07639719, -0.07639719, -0…
## $ tfidf_title_shake <dbl> -0.0727781, -0.0727781, -0.0727781, -0.07…
## $ tfidf_title_shark <dbl> -0.08910989, -0.08910989, -0.08910989, -0…
## $ tfidf_title_story <dbl> -0.07172151, -0.07172151, -0.07172151, -0…
## $ tfidf_title_tales <dbl> -0.094582, -0.094582, -0.094582, -0.09458…
## $ tfidf_title_terror <dbl> 6.34128001, -0.08557607, -0.08557607, -0.…
## $ tfidf_title_the <dbl> -0.5911553, 0.5309017, -0.5911553, -0.591…
## $ tfidf_title_to <dbl> -0.08410205, -0.08410205, -0.08410205, -0…
## $ tfidf_title_tokyo <dbl> -0.07695152, -0.07695152, -0.07695152, -0…
## $ tfidf_title_troublesome <dbl> -0.0796682, -0.0796682, -0.0796682, -0.07…
## $ tfidf_title_v <dbl> -0.07320981, -0.07320981, -0.07320981, -0…
## $ tfidf_title_vampire <dbl> -0.08349751, -0.08349751, -0.08349751, 16…
## $ tfidf_title_video <dbl> -0.1379243, -0.1379243, -0.1379243, -0.13…
## $ tfidf_title_vol <dbl> -0.09689733, -0.09689733, -0.09689733, -0…
## $ tfidf_title_volume <dbl> -0.0729593, -0.0729593, -0.0729593, -0.07…
## $ tfidf_title_vs <dbl> -0.1068558, -0.1068558, -0.1068558, -0.10…
## $ tfidf_title_witchcraft <dbl> -0.07941886, -0.07941886, -0.07941886, -0…
## $ tfidf_title_zombie <dbl> -0.08946271, -0.08946271, -0.08946271, -0…
## $ genre_names_Comedy..Horror <dbl> -0.3318356, -0.3318356, -0.3318356, -0.33…
## $ genre_names_Horror <dbl> 1.4132552, 1.4132552, -0.7072026, -0.7072…
## $ genre_names_Horror..Thriller <dbl> -0.334847, -0.334847, -0.334847, -0.33484…
## $ genre_names_other <dbl> -0.9346175, -0.9346175, 1.0693762, 1.0693…
# Model specification: boosted trees for regression on the xgboost engine.
# All four hyperparameters are placeholders to be filled in by tuning.
xgboost_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")
# Bundle the model specification and the preprocessing recipe into a single
# workflow object so they are fitted and tuned together.
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)
# Hyperparameter tuning: evaluate 5 candidate parameter sets on the
# cross-validation resamples. The seed is fixed right before tuning so the
# generated grid is reproducible.
set.seed(344)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry