Get the Data

Read in with tidytuesdayR package

Install from CRAN via: install.packages(“tidytuesdayR”)

This loads the readme and all the datasets for the week of interest

Either ISO-8601 date or year/week works!

tuesdata <- tidytuesdayR::tt_load(‘2021-03-02’) tuesdata <- tidytuesdayR::tt_load(2021, week = 10)

youtube <- tuesdata$youtube

Or read in the data manually

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
  # Remove missing values
  select(-thumbnail, -ends_with("url"), -etag, -kind, -favorite_count, -published_at, -view_count, -comment_count, -description, -dislike_count) %>%
  na.omit() %>%
  
  # log transform the target variable
  mutate(like_count = log(like_count)) %>%
  
  # category_id is a factor
  mutate(category_id = as.factor(category_id)) %>%
  
  # Convert logical to factor
  mutate(across(where(is.logical), factor)) %>%
  
  # Convert character to factor
  mutate(across(c(id, channel_title, brand), factor))
# Step 1: Prepare Data

library(lubridate)
data_binarized_tbl <-data %>%
  select(-id, -title) %>%
  binarize()

data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 54
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.94443897916644`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.94443897916644_4.86753445045558 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.86753445045558_6.26720054854136 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26720054854136_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__BudBowlXLII                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__Coca-Cola`                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__Funny_Commercials              <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ channel_title__John_Keehler                   <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ channel_title__NFL                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__omon007                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__reggiep08v2                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__The_Hall_of_Advertising        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__USA_TODAY                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__World_Hyundai_Matteson         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__-OTHER`                       <dbl> 0, 1, 0, 1, 1, 0, 0, 1, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2 : Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate( like_count__6.26720054854136_Inf )

data_corr_tbl
## # A tibble: 54 × 3
##    feature       bin                               correlation
##    <fct>         <chr>                                   <dbl>
##  1 like_count    6.26720054854136_Inf                    1    
##  2 like_count    -Inf_2.94443897916644                  -0.339
##  3 like_count    4.86753445045558_6.26720054854136      -0.331
##  4 like_count    2.94443897916644_4.86753445045558      -0.327
##  5 brand         Doritos                                 0.281
##  6 channel_title NFL                                     0.262
##  7 brand         NFL                                     0.250
##  8 brand         Bud_Light                              -0.212
##  9 year          2015_Inf                                0.202
## 10 channel_title Coca-Cola                               0.202
## # ℹ 44 more rows
#Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps