Import Data

# Load the TidyTuesday (2021-03-02) Super Bowl ads data straight from GitHub.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)

# Per-column overview: types, missingness and basic distribution statistics.
skimr::skim(youtube)

Data summary
Name	youtube
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

# Build the modelling table from the raw ads data.
data <- youtube %>%
  # Drop URL / metadata columns and the other engagement counts ...
  select(
    -c(
      superbowl_ads_dot_com_url, youtube_url, kind, etag, channel_title,
      comment_count, thumbnail, published_at, description, dislike_count,
      favorite_count, view_count
    )
  ) %>%
  # ... then discard every row that still contains a missing value.
  na.omit() %>%
  mutate(
    # log(x + 1) stays finite for videos with zero likes
    like_count = log(like_count + 1),
    category_id = as.factor(category_id),
    # Character columns become factors; `title` stays text for tokenizing
    across(where(is.character) & !title, as.factor),
    across(where(is.logical), as.factor)
  )

Explore Data

Identify good predictors.

funny

# Scatter the log-scaled like count against the `funny` flag.
ggplot(data, aes(x = like_count, y = funny)) +
  geom_point()

Title words

data %>%
  # Split each title into one row per word
  unnest_tokens(output = word, input = title) %>%
  # Average like_count (log scale) per word, plus the word's occurrence count
  group_by(word) %>%
  summarise(
    like_count = mean(like_count),
    n = n()
  ) %>%
  ungroup() %>%
  # One point per word, ordered by its average like_count
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +
  # The y axis shows tokenized title words, not brands
  labs(y = "Title words")

EDA Shortcut

# Step 1: Prepare Data
# Drop the identifier and free-text columns, then let binarize() turn the
# remaining columns into 0/1 indicator columns.
data_binarized_tbl <- binarize(select(data, -id, -title))

glimpse(data_binarized_tbl)

## Rows: 225
## Columns: 43
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlate
# The target is the highest like_count bin. Its column name embeds the bin
# edge produced by binarize(), so it must match the binarized table exactly.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  like_count__6.26909628370626_Inf
)

data_corr_tbl

## # A tibble: 43 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 like_count  6.26909628370626_Inf                    1    
##  2 like_count  -Inf_2.99573227355399                  -0.339
##  3 like_count  4.87519732320115_6.26909628370626      -0.331
##  4 like_count  2.99573227355399_4.87519732320115      -0.327
##  5 brand       Doritos                                 0.281
##  6 brand       NFL                                     0.250
##  7 brand       Bud_Light                              -0.212
##  8 year        2015_Inf                                0.202
##  9 year        -Inf_2005                              -0.193
## 10 category_id 1                                       0.133
## # ℹ 33 more rows

# Step 3: Plot
# Draw the correlation funnel from the feature/bin correlations.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split Data

# Split into train and test dataset
# (the seed is fixed so the random partition is reproducible)
set.seed(1234)
data_split <- data %>% rsample::initial_split()
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Further split training dataset for cross-validation
# (separate seed so the fold assignment is reproducible on its own)
set.seed(2345)
data_cv <- data_train %>% rsample::vfold_cv()
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10

# Print template xgboost code (recipe, model spec, workflow, tuning call) for
# this formula; the template is a starting point that is adapted by hand.
library(usemodels)
usemodels::use_xgboost(formula = like_count ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify Recipe
xgboost_recipe <-
  recipe(like_count ~ ., data = data_train) %>%
  # Keep `id` in the data but take it out of the predictor set
  recipes::update_role(id, new_role = "id variables") %>%
  # Turn `title` into tf-idf features over at most 100 tokens
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  # One-hot encode every remaining nominal predictor
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Remove predictors that have zero variance
  step_zv(all_predictors())
  

# Preview the preprocessed training data. bake(new_data = NULL) returns the
# processed training set and replaces the superseded juice().
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 168
## Columns: 140
## $ year                        <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id                          <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count                  <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tfidf_title_2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2002            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2009            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2010            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2013            <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2014            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2018            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2020            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_44              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_a               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ad              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ads             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_advertisement   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_all             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_baby            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bears           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_beer            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_best            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6346645…
## $ tfidf_title_bestbuds        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_big             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bowl            <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606…
## $ tfidf_title_britney         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bud             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_budweiser       <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000…
## $ tfidf_title_camry           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_car             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cedric          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cindy           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_coca            <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000…
## $ tfidf_title_coke            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_cola            <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000…
## $ tfidf_title_commercial      <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000…
## $ tfidf_title_commercials     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cool            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crash           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_date            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_diet            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_dilly           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dog             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_dogs            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_doritos         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_e               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_elantra         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_etrade          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_extended        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_factory         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_fantasy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_featuring       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_flavor          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_fly             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_full            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_funny           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_game            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_genesis         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_girlfriend      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_great           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_happiness       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hd              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_horse           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hyundai         <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_in              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_is              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_island          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_it              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_kia             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_light           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_lighta          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_love            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_new             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_nfl             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.4810423…
## $ tfidf_title_of              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_official        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_on              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788…
## $ tfidf_title_one             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_pepsi           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_puppy           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_spot            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_starring        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_super           <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606…
## $ tfidf_title_superbowl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the             <dbl> 0.0000000, 0.6931472, 0.0000000, 0.2970631…
## $ tfidf_title_toyota          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_trade           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_tv              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_usa             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_version         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_vs              <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_winner          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_x               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_xliv            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788…
## $ brand_Bud.Light             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser             <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola             <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE.                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE.                 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE.  <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE.            <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE.             <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE.            <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE.             <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE.               <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE.                <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE.              <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE.               <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE.              <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE.               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ category_id_X1              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17             <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24             <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Specify Model
# Boosted-tree regressor on the xgboost engine; every listed hyperparameter is
# left as tune() so that its value is chosen during tuning.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and model using workflow
# Start from an empty workflow, then attach the preprocessing and the model.
xgboost_workflow <- workflow()
xgboost_workflow <- add_recipe(xgboost_workflow, xgboost_recipe)
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

# Tune hyperparameters
# Evaluate 5 candidate parameter sets on the cross-validation folds.
set.seed(89984)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)

Apply 1

Alden Dimick

2025-02-18

Import Data

Explore Data

Build Models