download.file(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv",
  destfile = "youtube.csv",
  mode = "wb"
)

youtube <- read.csv("youtube.csv")

data <- youtube %>%
  mutate(
    title = replace_na(title, "missing"),
    description = replace_na(description, "missing"),
    brand = replace_na(brand, "missing"),
    channel_title = replace_na(channel_title, "missing"),
    funny = replace_na(funny, FALSE),
    show_product_quickly = replace_na(show_product_quickly, FALSE),
    patriotic = replace_na(patriotic, FALSE),
    celebrity = replace_na(celebrity, FALSE),
    danger = replace_na(danger, FALSE),
    animals = replace_na(animals, FALSE),
    use_sex = replace_na(use_sex, FALSE),
    view_count = replace_na(view_count, median(view_count, na.rm = TRUE))
  ) %>%
  filter(like_count > 0) %>%
  mutate(
    like_count = log(like_count),
    year = as.factor(year),
    brand = as.factor(brand),
    channel_title = as.factor(channel_title),
    funny = as.factor(funny),
    show_product_quickly = as.factor(show_product_quickly),
    patriotic = as.factor(patriotic),
    celebrity = as.factor(celebrity),
    danger = as.factor(danger),
    animals = as.factor(animals),
    use_sex = as.factor(use_sex)
  )

#EDA

data %>%
  ggplot(aes(like_count)) +
  geom_histogram(bins = 15)

tidy_youtube <- data %>%
  unnest_tokens(word, title)

tidy_youtube %>%
  count(word, sort = TRUE)
##               word   n
## 1       commercial 119
## 2             bowl  98
## 3            super  98
## 4              bud  54
## 5            light  49
## 6        budweiser  37
## 7               ad  30
## 8              the  27
## 9            pepsi  22
## 10         hyundai  21
## 11         doritos  19
## 12            cola  18
## 13            coca  17
## 14       superbowl  14
## 15            2014  13
## 16             kia  13
## 17            2013  11
## 18              hd  11
## 19               a  10
## 20            coke  10
## 21          toyota  10
## 22            2010   9
## 23            2012   9
## 24            2018   9
## 25            2019   9
## 26            game   8
## 27        official   8
## 28             dog   7
## 29               e   7
## 30             nfl   7
## 31           trade   7
## 32              tv   7
## 33            2009   6
## 34            2015   6
## 35            baby   6
## 36             big   6
## 37          cedric   6
## 38             new   6
## 39           xliii   6
## 40            2001   5
## 41            2005   5
## 42            2007   5
## 43            2020   5
## 44              44   5
## 45           crash   5
## 46            diet   5
## 47        extended   5
## 48           funny   5
## 49             usa   5
## 50              vs   5
## 51          winner   5
## 52            xliv   5
## 53             ads   4
## 54          etrade   4
## 55         fantasy   4
## 56              is   4
## 57          island   4
## 58          lighta   4
## 59        starring   4
## 60            with   4
## 61            2000   3
## 62            2002   3
## 63            2006   3
## 64            2016   3
## 65             and   3
## 66      clydesdale   3
## 67     commercials   3
## 68            cool   3
## 69       exclusive   3
## 70         genesis   3
## 71      girlfriend   3
## 72       happiness   3
## 73         legends   3
## 74            love   3
## 75           meter   3
## 76          monkey   3
## 77              of   3
## 78              on   3
## 79             one   3
## 80           puppy   3
## 81            spot   3
## 82            team   3
## 83         version   3
## 84               x   3
## 85             xli   3
## 86             you   3
## 87            2008   2
## 88            2011   2
## 89            2017   2
## 90   advertisement   2
## 91             all   2
## 92          babies   2
## 93          battle   2
## 94           bears   2
## 95            beer   2
## 96            best   2
## 97        bestbuds   2
## 98           black   2
## 99            body   2
## 100        britney   2
## 101          camry   2
## 102            car   2
## 103          cindy   2
## 104          crown   2
## 105           date   2
## 106          dilly   2
## 107           dogs   2
## 108           down   2
## 109        elantra   2
## 110           epic   2
## 111        factory   2
## 112      featuring   2
## 113         flavor   2
## 114            fly   2
## 115            for   2
## 116             ft   2
## 117           full   2
## 118           good   2
## 119          great   2
## 120       halftime   2
## 121          horse   2
## 122             in   2
## 123         inside   2
## 124             it   2
## 125         jackie   2
## 126        journey   2
## 127           life   2
## 128            max   2
## 129           next   2
## 130           open   2
## 131         optima   2
## 132          party   2
## 133        respect   2
## 134           ride   2
## 135           save   2
## 136         sonata   2
## 137        sorento   2
## 138         spears   2
## 139             to   2
## 140            tvc   2
## 141             up   2
## 142          video   2
## 143            way   2
## 144           what   2
## 145           when   2
## 146           xlix   2
## 147          xlvii   2
## 148           xxxv   2
## 149          xxxvi   2
## 150           zero   2
## 151             03   1
## 152            100   1
## 153           2003   1
## 154    2008genesis   1
## 155            209   1
## 156              4   1
## 157             42   1
## 158             43   1
## 159           720p   1
## 160             90   1
## 161        ability   1
## 162         advert   1
## 163          again   1
## 164            ali   1
## 165        allowed   1
## 166       american   1
## 167            ant   1
## 168         anthem   1
## 169            are   1
## 170           argh   1
## 171      assurance   1
## 172         attack   1
## 173         avatar   1
## 174     backstreet   1
## 175            bad   1
## 176           ball   1
## 177        barbers   1
## 178      beautiful   1
## 179        because   1
## 180          beige   1
## 181         better   1
## 182          blaze   1
## 183            bmw   1
## 184            bob   1
## 185           bold   1
## 186         border   1
## 187           born   1
## 188         bosses   1
## 189            bot   1
## 190           bots   1
## 191           boys   1
## 192          brain   1
## 193         breath   1
## 194          brett   1
## 195         bridge   1
## 196        brosnan   1
## 197    brotherhood   1
## 198          brown   1
## 199         bubbly   1
## 200          buble   1
## 201          bubly   1
## 202       budlight   1
## 203           bush   1
## 204             by   1
## 205            cab   1
## 206       campaign   1
## 207            can   1
## 208          cards   1
## 209         carlos   1
## 210         carson   1
## 211         casket   1
## 212          catch   1
## 213    celebration   1
## 214   celebrations   1
## 215           chan   1
## 216         chance   1
## 217          chase   1
## 218       checkout   1
## 219         cheese   1
## 220    chessmaster   1
## 221          chimp   1
## 222          choir   1
## 223    christopher   1
## 224          clown   1
## 225    clydesdales   1
## 226          cobie   1
## 227        cockato   1
## 228         collar   1
## 229           come   1
## 230 commercial.avi   1
## 231     coronation   1
## 232          coupe   1
## 233          court   1
## 234         cowboy   1
## 235           crab   1
## 236       crawford   1
## 237          crazy   1
## 238          crews   1
## 239         crunch   1
## 240        crystal   1
## 241           cult   1
## 242            dad   1
## 243          dad's   1
## 244          daddy   1
## 245           dale   1
## 246      dalmatian   1
## 247  deprogramming   1
## 248       detector   1
## 249            dew   1
## 250           doin   1
## 251          doing   1
## 252           dole   1
## 253         donkey   1
## 254         dorito   1
## 255      dorrito's   1
## 256         double   1
## 257          dream   1
## 258      earnhardt   1
## 259       elevator   1
## 260        elliott   1
## 261         energy   1
## 262    entertainer   1
## 263        eternal   1
## 264        excited   1
## 265         family   1
## 266           fans   1
## 267    fashionista   1
## 268          favre   1
## 269             fe   1
## 270           feat   1
## 271           feel   1
## 272         fergus   1
## 273          fetch   1
## 274      financial   1
## 275         finger   1
## 276           fire   1
## 277          first   1
## 278       football   1
## 279      forgotten   1
## 280          forte   1
## 281          frank   1
## 282        freeman   1
## 283         fridge   1
## 284         friend   1
## 285           from   1
## 286          garry   1
## 287    generations   1
## 288        getaway   1
## 289        getting   1
## 290           girl   1
## 291           goat   1
## 292           goes   1
## 293          going   1
## 294       greatest   1
## 295         groove   1
## 296            guy   1
## 297          happy   1
## 298           hard   1
## 299         harris   1
## 300           hart   1
## 301           have   1
## 302           hawk   1
## 303         hayley   1
## 304            hbo   1
## 305          heist   1
## 306        hendrix   1
## 307         heroes   1
## 308         hero’s   1
## 309     highlander   1
## 310          hiker   1
## 311           hill   1
## 312          hitch   1
## 313           hope   1
## 314            hot   1
## 315            how   1
## 316             hq   1
## 317           hulk   1
## 318          hurts   1
## 319         hybrid   1
## 320          hyped   1
## 321              i   1
## 322            i'm   1
## 323             if   1
## 324        instant   1
## 325           it's   1
## 326            its   1
## 327           jimi   1
## 328          jonah   1
## 329             jr   1
## 330         justin   1
## 331       kasparov   1
## 332          kevin   1
## 333            kid   1
## 334           king   1
## 335         king's   1
## 336         knight   1
## 337           lamb   1
## 338         landry   1
## 339       language   1
## 340            law   1
## 341             li   1
## 342            lii   1
## 343           liii   1
## 344            lil   1
## 345          lines   1
## 346       lipstick   1
## 347       listener   1
## 348            liv   1
## 349           live   1
## 350           lost   1
## 351        machine   1
## 352          magic   1
## 353         magnus   1
## 354           make   1
## 355         malone   1
## 356            man   1
## 357          man's   1
## 358          mango   1
## 359         martin   1
## 360         master   1
## 361         matrix   1
## 362       mccarthy   1
## 363           mean   1
## 364       medieval   1
## 365        meeting   1
## 366        melissa   1
## 367         mencia   1
## 368       mercedes   1
## 369        michael   1
## 370         middle   1
## 371           mine   1
## 372           mini   1
## 373          money   1
## 374           moon   1
## 375           more   1
## 376         morgan   1
## 377       morpheus   1
## 378         mother   1
## 379         motion   1
## 380       mountain   1
## 381          mouse   1
## 382        muppets   1
## 383          music   1
## 384        musical   1
## 385             my   1
## 386            nas   1
## 387        ne_bear   1
## 388          never   1
## 389           nice   1
## 390          ninja   1
## 391           niro   1
## 392             no   1
## 393            now   1
## 394          nsync   1
## 395          obese   1
## 396           odds   1
## 397        offical   1
## 398         office   1
## 399             ok   1
## 400            old   1
## 401       optimism   1
## 402       original   1
## 403           otto   1
## 404            out   1
## 405           pahk   1
## 406           pain   1
## 407      paintball   1
## 408          paper   1
## 409        parties   1
## 410           pass   1
## 411         pencil   1
## 412            pep   1
## 413        perfect   1
## 414           pick   1
## 415         pierce   1
## 416           pigs   1
## 417        pinball   1
## 418          pitch   1
## 419         planet   1
## 420       platinum   1
## 421       playdate   1
## 422          polar   1
## 423      political   1
## 424  possibilities   1
## 425           post   1
## 426         post's   1
## 427     postystore   1
## 428          power   1
## 429            pug   1
## 430          quits   1
## 431         quoris   1
## 432          ranch   1
## 433         rapper   1
## 434           rav4   1
## 435            ray   1
## 436           real   1
## 437        referee   1
## 438         reggie   1
## 439     reinvented   1
## 440         replay   1
## 441         rescue   1
## 442           rock   1
## 443           roof   1
## 444       roommate   1
## 445            run   1
## 446      ryanville   1
## 447              s   1
## 448           sale   1
## 449            sam   1
## 450      sanctuary   1
## 451          santa   1
## 452          satin   1
## 453            saw   1
## 454      scientist   1
## 455       scissors   1
## 456       scorsese   1
## 457           seal   1
## 458           seat   1
## 459         secret   1
## 460            see   1
## 461          sells   1
## 462         seltos   1
## 463          sense   1
## 464            she   1
## 465         sheets   1
## 466          shock   1
## 467          short   1
## 468           show   1
## 469          shows   1
## 470          siege   1
## 471         sitter   1
## 472          sixth   1
## 473          skier   1
## 474       skydiver   1
## 475           slap   1
## 476         sleigh   1
## 477          sling   1
## 478           slow   1
## 479          smaht   1
## 480       smulders   1
## 481          socks   1
## 482      something   1
## 483            spa   1
## 484          space   1
## 485          stand   1
## 486           star   1
## 487         steven   1
## 488         stevie   1
## 489       stranded   1
## 490       streaker   1
## 491      strongman   1
## 492         subway   1
## 493           suck   1
## 494          sugar   1
## 495   superstition   1
## 496          supra   1
## 497              t   1
## 498         tacoma   1
## 499           talk   1
## 500          tears   1
## 501           tech   1
## 502     television   1
## 503         tennis   1
## 504          terry   1
## 505           than   1
## 506         themed   1
## 507           then   1
## 508          these   1
## 509           this   1
## 510        thrones   1
## 511     timberlake   1
## 512           time   1
## 513       timeline   1
## 514           toni   1
## 515      touchdown   1
## 516        touches   1
## 517          tough   1
## 518       training   1
## 519           trap   1
## 520         trojan   1
## 521           troy   1
## 522          truck   1
## 523           true   1
## 524          truth   1
## 525         tundra   1
## 526          turbo   1
## 527          twist   1
## 528        twisted   1
## 529          tyler   1
## 530        typical   1
## 531       unknowns   1
## 532         upside   1
## 533         vision   1
## 534         walken   1
## 535         wassup   1
## 536        watcher   1
## 537           wave   1
## 538          wazoo   1
## 539        whassup   1
## 540          wheel   1
## 541          who’s   1
## 542           wine   1
## 543         wizard   1
## 544         wonder   1
## 545          wrong   1
## 546         xlviii   1
## 547          xxxiv   1
## 548        xxxviii   1
## 549           yoga   1
## 550       yourself   1

#vis

tidy_youtube %>%
  group_by(word) %>%
  summarise(
    n = n(),
    like_count = mean(like_count)
  ) %>%
  ggplot(aes(n, like_count)) +
  geom_hline(yintercept = mean(data$like_count), lty = 2, color = "gray50", size = 1.5) +
  geom_jitter(color = "midnightblue", alpha = 0.7) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = "top", hjust = "left") +
  scale_x_log10()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#build models

set.seed(123)
youtube_split <- initial_split(data, strata = like_count)
youtube_train <- training(youtube_split)
youtube_test <- testing(youtube_split)

set.seed(234)
youtube_folds <- vfold_cv(youtube_train, strata = like_count)
youtube_rec <- recipe(
  like_count ~ title + description + brand + channel_title + year +
    funny + show_product_quickly + patriotic + celebrity +
    danger + animals + use_sex + view_count,
  data = youtube_train
) %>%
  step_tokenize(title, description) %>%
  step_tokenfilter(title, max_tokens = 25) %>%
  step_tokenfilter(description, max_tokens = 25) %>%
  step_tf(title, description) %>%
  step_dummy(all_nominal_predictors())

prep(youtube_rec) %>% bake(new_data = NULL)
## # A tibble: 160 × 260
##    view_count like_count tf_title_2010 tf_title_2012 tf_title_2013 tf_title_2014
##         <int>      <dbl>         <int>         <int>         <int>         <int>
##  1        782      1.95              0             0             0             0
##  2       3805      2.20              1             0             0             0
##  3       4302      3.09              0             0             1             0
##  4        301      0.693             0             0             0             0
##  5       3667      1.95              0             0             1             0
##  6      14927      2.94              1             0             0             0
##  7       5264      2.30              0             0             0             0
##  8       1171      1.61              0             0             1             0
##  9        350      0                 0             0             0             0
## 10       3900      2.64              0             0             0             1
## # ℹ 150 more rows
## # ℹ 254 more variables: tf_title_ad <int>, tf_title_big <int>,
## #   tf_title_bowl <int>, tf_title_bud <int>, tf_title_budweiser <int>,
## #   tf_title_coca <int>, tf_title_coke <int>, tf_title_cola <int>,
## #   tf_title_commercial <int>, tf_title_dog <int>, tf_title_doritos <int>,
## #   tf_title_game <int>, tf_title_hd <int>, tf_title_hyundai <int>,
## #   tf_title_kia <int>, tf_title_light <int>, tf_title_pepsi <int>, …
rf_spec <- rand_forest(trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

svm_spec <- svm_linear() %>%
  set_engine("LiblineaR") %>%
  set_mode("regression")

#workflow

svm_wf <- workflow(youtube_rec, svm_spec)
rf_wf <- workflow(youtube_rec, rf_spec)

#eval

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = youtube_folds,
  control = contrl_preds
)

ranger_rs <- fit_resamples(
  rf_wf,
  resamples = youtube_folds,
  control = contrl_preds
)

#metrics

collect_metrics(svm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config        
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
## 1 rmse    standard   7.16     10  1.83   pre0_mod0_post0
## 2 rsq     standard   0.455    10  0.0310 pre0_mod0_post0
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config        
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
## 1 rmse    standard   1.74     10  0.0785 pre0_mod0_post0
## 2 rsq     standard   0.641    10  0.0336 pre0_mod0_post0

#visuale

bind_rows(
  collect_predictions(svm_rs) %>% mutate(mod = "SVM"),
  collect_predictions(ranger_rs) %>% mutate(mod = "ranger")
) %>%
  ggplot(aes(like_count, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray50", size = 1.2) +
  geom_jitter(width = 0.5, alpha = 0.5) +
  facet_wrap(vars(mod)) +
  coord_fixed()

final_fitted <- last_fit(svm_wf, youtube_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config        
##   <chr>   <chr>          <dbl> <chr>          
## 1 rmse    standard       5.10  pre0_mod0_post0
## 2 rsq     standard       0.316 pre0_mod0_post0
final_wf <- extract_workflow(final_fitted)

final_wf %>%
  tidy() %>%
  filter(term != "Bias") %>%
  group_by(estimate > 0) %>%
  slice_max(abs(estimate), n = 10) %>%
  ungroup() %>%
  mutate(term = str_remove(term, "tf_title_|tf_description_")) %>%
  ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
  geom_col(alpha = 0.8) +
  labs(y = NULL, fill = "More from...")