download.file(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv",
destfile = "youtube.csv",
mode = "wb"
)
youtube <- read.csv("youtube.csv")
data <- youtube %>%
mutate(
title = replace_na(title, "missing"),
description = replace_na(description, "missing"),
brand = replace_na(brand, "missing"),
channel_title = replace_na(channel_title, "missing"),
funny = replace_na(funny, FALSE),
show_product_quickly = replace_na(show_product_quickly, FALSE),
patriotic = replace_na(patriotic, FALSE),
celebrity = replace_na(celebrity, FALSE),
danger = replace_na(danger, FALSE),
animals = replace_na(animals, FALSE),
use_sex = replace_na(use_sex, FALSE),
view_count = replace_na(view_count, median(view_count, na.rm = TRUE))
) %>%
filter(like_count > 0) %>%
mutate(
like_count = log(like_count),
year = as.factor(year),
brand = as.factor(brand),
channel_title = as.factor(channel_title),
funny = as.factor(funny),
show_product_quickly = as.factor(show_product_quickly),
patriotic = as.factor(patriotic),
celebrity = as.factor(celebrity),
danger = as.factor(danger),
animals = as.factor(animals),
use_sex = as.factor(use_sex)
)
#EDA
data %>%
ggplot(aes(like_count)) +
geom_histogram(bins = 15)
tidy_youtube <- data %>%
unnest_tokens(word, title)
tidy_youtube %>%
count(word, sort = TRUE)
## word n
## 1 commercial 119
## 2 bowl 98
## 3 super 98
## 4 bud 54
## 5 light 49
## 6 budweiser 37
## 7 ad 30
## 8 the 27
## 9 pepsi 22
## 10 hyundai 21
## 11 doritos 19
## 12 cola 18
## 13 coca 17
## 14 superbowl 14
## 15 2014 13
## 16 kia 13
## 17 2013 11
## 18 hd 11
## 19 a 10
## 20 coke 10
## 21 toyota 10
## 22 2010 9
## 23 2012 9
## 24 2018 9
## 25 2019 9
## 26 game 8
## 27 official 8
## 28 dog 7
## 29 e 7
## 30 nfl 7
## 31 trade 7
## 32 tv 7
## 33 2009 6
## 34 2015 6
## 35 baby 6
## 36 big 6
## 37 cedric 6
## 38 new 6
## 39 xliii 6
## 40 2001 5
## 41 2005 5
## 42 2007 5
## 43 2020 5
## 44 44 5
## 45 crash 5
## 46 diet 5
## 47 extended 5
## 48 funny 5
## 49 usa 5
## 50 vs 5
## 51 winner 5
## 52 xliv 5
## 53 ads 4
## 54 etrade 4
## 55 fantasy 4
## 56 is 4
## 57 island 4
## 58 lighta 4
## 59 starring 4
## 60 with 4
## 61 2000 3
## 62 2002 3
## 63 2006 3
## 64 2016 3
## 65 and 3
## 66 clydesdale 3
## 67 commercials 3
## 68 cool 3
## 69 exclusive 3
## 70 genesis 3
## 71 girlfriend 3
## 72 happiness 3
## 73 legends 3
## 74 love 3
## 75 meter 3
## 76 monkey 3
## 77 of 3
## 78 on 3
## 79 one 3
## 80 puppy 3
## 81 spot 3
## 82 team 3
## 83 version 3
## 84 x 3
## 85 xli 3
## 86 you 3
## 87 2008 2
## 88 2011 2
## 89 2017 2
## 90 advertisement 2
## 91 all 2
## 92 babies 2
## 93 battle 2
## 94 bears 2
## 95 beer 2
## 96 best 2
## 97 bestbuds 2
## 98 black 2
## 99 body 2
## 100 britney 2
## 101 camry 2
## 102 car 2
## 103 cindy 2
## 104 crown 2
## 105 date 2
## 106 dilly 2
## 107 dogs 2
## 108 down 2
## 109 elantra 2
## 110 epic 2
## 111 factory 2
## 112 featuring 2
## 113 flavor 2
## 114 fly 2
## 115 for 2
## 116 ft 2
## 117 full 2
## 118 good 2
## 119 great 2
## 120 halftime 2
## 121 horse 2
## 122 in 2
## 123 inside 2
## 124 it 2
## 125 jackie 2
## 126 journey 2
## 127 life 2
## 128 max 2
## 129 next 2
## 130 open 2
## 131 optima 2
## 132 party 2
## 133 respect 2
## 134 ride 2
## 135 save 2
## 136 sonata 2
## 137 sorento 2
## 138 spears 2
## 139 to 2
## 140 tvc 2
## 141 up 2
## 142 video 2
## 143 way 2
## 144 what 2
## 145 when 2
## 146 xlix 2
## 147 xlvii 2
## 148 xxxv 2
## 149 xxxvi 2
## 150 zero 2
## 151 03 1
## 152 100 1
## 153 2003 1
## 154 2008genesis 1
## 155 209 1
## 156 4 1
## 157 42 1
## 158 43 1
## 159 720p 1
## 160 90 1
## 161 ability 1
## 162 advert 1
## 163 again 1
## 164 ali 1
## 165 allowed 1
## 166 american 1
## 167 ant 1
## 168 anthem 1
## 169 are 1
## 170 argh 1
## 171 assurance 1
## 172 attack 1
## 173 avatar 1
## 174 backstreet 1
## 175 bad 1
## 176 ball 1
## 177 barbers 1
## 178 beautiful 1
## 179 because 1
## 180 beige 1
## 181 better 1
## 182 blaze 1
## 183 bmw 1
## 184 bob 1
## 185 bold 1
## 186 border 1
## 187 born 1
## 188 bosses 1
## 189 bot 1
## 190 bots 1
## 191 boys 1
## 192 brain 1
## 193 breath 1
## 194 brett 1
## 195 bridge 1
## 196 brosnan 1
## 197 brotherhood 1
## 198 brown 1
## 199 bubbly 1
## 200 buble 1
## 201 bubly 1
## 202 budlight 1
## 203 bush 1
## 204 by 1
## 205 cab 1
## 206 campaign 1
## 207 can 1
## 208 cards 1
## 209 carlos 1
## 210 carson 1
## 211 casket 1
## 212 catch 1
## 213 celebration 1
## 214 celebrations 1
## 215 chan 1
## 216 chance 1
## 217 chase 1
## 218 checkout 1
## 219 cheese 1
## 220 chessmaster 1
## 221 chimp 1
## 222 choir 1
## 223 christopher 1
## 224 clown 1
## 225 clydesdales 1
## 226 cobie 1
## 227 cockato 1
## 228 collar 1
## 229 come 1
## 230 commercial.avi 1
## 231 coronation 1
## 232 coupe 1
## 233 court 1
## 234 cowboy 1
## 235 crab 1
## 236 crawford 1
## 237 crazy 1
## 238 crews 1
## 239 crunch 1
## 240 crystal 1
## 241 cult 1
## 242 dad 1
## 243 dad's 1
## 244 daddy 1
## 245 dale 1
## 246 dalmatian 1
## 247 deprogramming 1
## 248 detector 1
## 249 dew 1
## 250 doin 1
## 251 doing 1
## 252 dole 1
## 253 donkey 1
## 254 dorito 1
## 255 dorrito's 1
## 256 double 1
## 257 dream 1
## 258 earnhardt 1
## 259 elevator 1
## 260 elliott 1
## 261 energy 1
## 262 entertainer 1
## 263 eternal 1
## 264 excited 1
## 265 family 1
## 266 fans 1
## 267 fashionista 1
## 268 favre 1
## 269 fe 1
## 270 feat 1
## 271 feel 1
## 272 fergus 1
## 273 fetch 1
## 274 financial 1
## 275 finger 1
## 276 fire 1
## 277 first 1
## 278 football 1
## 279 forgotten 1
## 280 forte 1
## 281 frank 1
## 282 freeman 1
## 283 fridge 1
## 284 friend 1
## 285 from 1
## 286 garry 1
## 287 generations 1
## 288 getaway 1
## 289 getting 1
## 290 girl 1
## 291 goat 1
## 292 goes 1
## 293 going 1
## 294 greatest 1
## 295 groove 1
## 296 guy 1
## 297 happy 1
## 298 hard 1
## 299 harris 1
## 300 hart 1
## 301 have 1
## 302 hawk 1
## 303 hayley 1
## 304 hbo 1
## 305 heist 1
## 306 hendrix 1
## 307 heroes 1
## 308 hero’s 1
## 309 highlander 1
## 310 hiker 1
## 311 hill 1
## 312 hitch 1
## 313 hope 1
## 314 hot 1
## 315 how 1
## 316 hq 1
## 317 hulk 1
## 318 hurts 1
## 319 hybrid 1
## 320 hyped 1
## 321 i 1
## 322 i'm 1
## 323 if 1
## 324 instant 1
## 325 it's 1
## 326 its 1
## 327 jimi 1
## 328 jonah 1
## 329 jr 1
## 330 justin 1
## 331 kasparov 1
## 332 kevin 1
## 333 kid 1
## 334 king 1
## 335 king's 1
## 336 knight 1
## 337 lamb 1
## 338 landry 1
## 339 language 1
## 340 law 1
## 341 li 1
## 342 lii 1
## 343 liii 1
## 344 lil 1
## 345 lines 1
## 346 lipstick 1
## 347 listener 1
## 348 liv 1
## 349 live 1
## 350 lost 1
## 351 machine 1
## 352 magic 1
## 353 magnus 1
## 354 make 1
## 355 malone 1
## 356 man 1
## 357 man's 1
## 358 mango 1
## 359 martin 1
## 360 master 1
## 361 matrix 1
## 362 mccarthy 1
## 363 mean 1
## 364 medieval 1
## 365 meeting 1
## 366 melissa 1
## 367 mencia 1
## 368 mercedes 1
## 369 michael 1
## 370 middle 1
## 371 mine 1
## 372 mini 1
## 373 money 1
## 374 moon 1
## 375 more 1
## 376 morgan 1
## 377 morpheus 1
## 378 mother 1
## 379 motion 1
## 380 mountain 1
## 381 mouse 1
## 382 muppets 1
## 383 music 1
## 384 musical 1
## 385 my 1
## 386 nas 1
## 387 ne_bear 1
## 388 never 1
## 389 nice 1
## 390 ninja 1
## 391 niro 1
## 392 no 1
## 393 now 1
## 394 nsync 1
## 395 obese 1
## 396 odds 1
## 397 offical 1
## 398 office 1
## 399 ok 1
## 400 old 1
## 401 optimism 1
## 402 original 1
## 403 otto 1
## 404 out 1
## 405 pahk 1
## 406 pain 1
## 407 paintball 1
## 408 paper 1
## 409 parties 1
## 410 pass 1
## 411 pencil 1
## 412 pep 1
## 413 perfect 1
## 414 pick 1
## 415 pierce 1
## 416 pigs 1
## 417 pinball 1
## 418 pitch 1
## 419 planet 1
## 420 platinum 1
## 421 playdate 1
## 422 polar 1
## 423 political 1
## 424 possibilities 1
## 425 post 1
## 426 post's 1
## 427 postystore 1
## 428 power 1
## 429 pug 1
## 430 quits 1
## 431 quoris 1
## 432 ranch 1
## 433 rapper 1
## 434 rav4 1
## 435 ray 1
## 436 real 1
## 437 referee 1
## 438 reggie 1
## 439 reinvented 1
## 440 replay 1
## 441 rescue 1
## 442 rock 1
## 443 roof 1
## 444 roommate 1
## 445 run 1
## 446 ryanville 1
## 447 s 1
## 448 sale 1
## 449 sam 1
## 450 sanctuary 1
## 451 santa 1
## 452 satin 1
## 453 saw 1
## 454 scientist 1
## 455 scissors 1
## 456 scorsese 1
## 457 seal 1
## 458 seat 1
## 459 secret 1
## 460 see 1
## 461 sells 1
## 462 seltos 1
## 463 sense 1
## 464 she 1
## 465 sheets 1
## 466 shock 1
## 467 short 1
## 468 show 1
## 469 shows 1
## 470 siege 1
## 471 sitter 1
## 472 sixth 1
## 473 skier 1
## 474 skydiver 1
## 475 slap 1
## 476 sleigh 1
## 477 sling 1
## 478 slow 1
## 479 smaht 1
## 480 smulders 1
## 481 socks 1
## 482 something 1
## 483 spa 1
## 484 space 1
## 485 stand 1
## 486 star 1
## 487 steven 1
## 488 stevie 1
## 489 stranded 1
## 490 streaker 1
## 491 strongman 1
## 492 subway 1
## 493 suck 1
## 494 sugar 1
## 495 superstition 1
## 496 supra 1
## 497 t 1
## 498 tacoma 1
## 499 talk 1
## 500 tears 1
## 501 tech 1
## 502 television 1
## 503 tennis 1
## 504 terry 1
## 505 than 1
## 506 themed 1
## 507 then 1
## 508 these 1
## 509 this 1
## 510 thrones 1
## 511 timberlake 1
## 512 time 1
## 513 timeline 1
## 514 toni 1
## 515 touchdown 1
## 516 touches 1
## 517 tough 1
## 518 training 1
## 519 trap 1
## 520 trojan 1
## 521 troy 1
## 522 truck 1
## 523 true 1
## 524 truth 1
## 525 tundra 1
## 526 turbo 1
## 527 twist 1
## 528 twisted 1
## 529 tyler 1
## 530 typical 1
## 531 unknowns 1
## 532 upside 1
## 533 vision 1
## 534 walken 1
## 535 wassup 1
## 536 watcher 1
## 537 wave 1
## 538 wazoo 1
## 539 whassup 1
## 540 wheel 1
## 541 who’s 1
## 542 wine 1
## 543 wizard 1
## 544 wonder 1
## 545 wrong 1
## 546 xlviii 1
## 547 xxxiv 1
## 548 xxxviii 1
## 549 yoga 1
## 550 yourself 1
#vis
tidy_youtube %>%
group_by(word) %>%
summarise(
n = n(),
like_count = mean(like_count)
) %>%
ggplot(aes(n, like_count)) +
geom_hline(yintercept = mean(data$like_count), lty = 2, color = "gray50", size = 1.5) +
geom_jitter(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = "top", hjust = "left") +
scale_x_log10()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#build models
set.seed(123)
youtube_split <- initial_split(data, strata = like_count)
youtube_train <- training(youtube_split)
youtube_test <- testing(youtube_split)
set.seed(234)
youtube_folds <- vfold_cv(youtube_train, strata = like_count)
youtube_rec <- recipe(
like_count ~ title + description + brand + channel_title + year +
funny + show_product_quickly + patriotic + celebrity +
danger + animals + use_sex + view_count,
data = youtube_train
) %>%
step_tokenize(title, description) %>%
step_tokenfilter(title, max_tokens = 25) %>%
step_tokenfilter(description, max_tokens = 25) %>%
step_tf(title, description) %>%
step_dummy(all_nominal_predictors())
prep(youtube_rec) %>% bake(new_data = NULL)
## # A tibble: 160 × 260
## view_count like_count tf_title_2010 tf_title_2012 tf_title_2013 tf_title_2014
## <int> <dbl> <int> <int> <int> <int>
## 1 782 1.95 0 0 0 0
## 2 3805 2.20 1 0 0 0
## 3 4302 3.09 0 0 1 0
## 4 301 0.693 0 0 0 0
## 5 3667 1.95 0 0 1 0
## 6 14927 2.94 1 0 0 0
## 7 5264 2.30 0 0 0 0
## 8 1171 1.61 0 0 1 0
## 9 350 0 0 0 0 0
## 10 3900 2.64 0 0 0 1
## # ℹ 150 more rows
## # ℹ 254 more variables: tf_title_ad <int>, tf_title_big <int>,
## # tf_title_bowl <int>, tf_title_bud <int>, tf_title_budweiser <int>,
## # tf_title_coca <int>, tf_title_coke <int>, tf_title_cola <int>,
## # tf_title_commercial <int>, tf_title_dog <int>, tf_title_doritos <int>,
## # tf_title_game <int>, tf_title_hd <int>, tf_title_hyundai <int>,
## # tf_title_kia <int>, tf_title_light <int>, tf_title_pepsi <int>, …
rf_spec <- rand_forest(trees = 500) %>%
set_engine("ranger") %>%
set_mode("regression")
svm_spec <- svm_linear() %>%
set_engine("LiblineaR") %>%
set_mode("regression")
#workflow
svm_wf <- workflow(youtube_rec, svm_spec)
rf_wf <- workflow(youtube_rec, rf_spec)
#eval
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = youtube_folds,
control = contrl_preds
)
ranger_rs <- fit_resamples(
rf_wf,
resamples = youtube_folds,
control = contrl_preds
)
#metrics
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 7.16 10 1.83 pre0_mod0_post0
## 2 rsq standard 0.455 10 0.0310 pre0_mod0_post0
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.74 10 0.0785 pre0_mod0_post0
## 2 rsq standard 0.641 10 0.0336 pre0_mod0_post0
#visuale
bind_rows(
collect_predictions(svm_rs) %>% mutate(mod = "SVM"),
collect_predictions(ranger_rs) %>% mutate(mod = "ranger")
) %>%
ggplot(aes(like_count, .pred, color = id)) +
geom_abline(lty = 2, color = "gray50", size = 1.2) +
geom_jitter(width = 0.5, alpha = 0.5) +
facet_wrap(vars(mod)) +
coord_fixed()
final_fitted <- last_fit(svm_wf, youtube_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 5.10 pre0_mod0_post0
## 2 rsq standard 0.316 pre0_mod0_post0
final_wf <- extract_workflow(final_fitted)
final_wf %>%
tidy() %>%
filter(term != "Bias") %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tf_title_|tf_description_")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8) +
labs(y = NULL, fill = "More from...")