Import Data

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')

skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
data <- youtube %>% 
    
    # Treat missing values
    select(-superbowl_ads_dot_com_url, -youtube_url, -id, -kind, -etag, -channel_title, -category_id, -comment_count, -thumbnail, -published_at) %>% 
    na.omit()

Explore Data

Identify good predictors.

view_count

data %>% 
    ggplot(aes(like_count, view_count)) + 
    scale_y_log10() +
    geom_point()

celebrity

data %>% 
    ggplot(aes(like_count, funny)) + 
    geom_point()

Brand

data %>% 
    
    # tokenize title
    unnest_tokens(output = word, input = brand) %>%
    
    # calculate avg rent per word
    group_by(word) %>%
    summarise(like_count = mean(like_count),
              n          = n()) %>%
    ungroup() %>%
    
    # Plot 
    ggplot(aes(like_count, fct_reorder(word, like_count))) + 
    geom_point() + 
    
    labs(y = "Brands")

EDA Shortcut

# Step 1: Prepare Data
data_binarized_tbl <- data %>% 
    select(-year) %>% 
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 194
## Columns: 40
## $ brand__Bud_Light                                     <dbl> 0, 1, 1, 0, 1, 0,…
## $ brand__Budweiser                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola`                                   <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Doritos                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade`                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai                                       <dbl> 0, 0, 0, 1, 0, 0,…
## $ brand__Kia                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota                                        <dbl> 1, 0, 0, 0, 0, 1,…
## $ funny__0                                             <dbl> 1, 0, 0, 1, 0, 0,…
## $ funny__1                                             <dbl> 0, 1, 1, 0, 1, 1,…
## $ show_product_quickly__0                              <dbl> 1, 0, 1, 0, 0, 0,…
## $ show_product_quickly__1                              <dbl> 0, 1, 0, 1, 1, 1,…
## $ patriotic__0                                         <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__1                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__0                                         <dbl> 1, 0, 1, 1, 1, 0,…
## $ celebrity__1                                         <dbl> 0, 1, 0, 0, 0, 1,…
## $ danger__0                                            <dbl> 1, 0, 0, 1, 0, 0,…
## $ danger__1                                            <dbl> 0, 1, 1, 0, 1, 1,…
## $ animals__0                                           <dbl> 1, 1, 0, 1, 0, 0,…
## $ animals__1                                           <dbl> 0, 0, 1, 0, 1, 1,…
## $ use_sex__0                                           <dbl> 1, 1, 1, 1, 0, 1,…
## $ use_sex__1                                           <dbl> 0, 0, 0, 0, 1, 0,…
## $ `view_count__-Inf_10484.75`                          <dbl> 0, 0, 0, 1, 0, 0,…
## $ view_count__10484.75_58515.5                         <dbl> 0, 1, 0, 0, 1, 1,…
## $ view_count__58515.5_219180.25                        <dbl> 1, 0, 1, 0, 0, 0,…
## $ view_count__219180.25_Inf                            <dbl> 0, 0, 0, 0, 0, 0,…
## $ `like_count__-Inf_29.75`                             <dbl> 0, 0, 0, 1, 1, 0,…
## $ like_count__29.75_165                                <dbl> 0, 0, 1, 0, 0, 1,…
## $ like_count__165_592.75                               <dbl> 0, 1, 0, 0, 0, 0,…
## $ like_count__592.75_Inf                               <dbl> 1, 0, 0, 0, 0, 0,…
## $ `dislike_count__-Inf_2`                              <dbl> 0, 0, 0, 1, 0, 0,…
## $ dislike_count__2_8.5                                 <dbl> 0, 0, 0, 0, 1, 0,…
## $ dislike_count__8.5_37.75                             <dbl> 0, 1, 1, 0, 0, 1,…
## $ dislike_count__37.75_Inf                             <dbl> 1, 0, 0, 0, 0, 0,…
## $ title__Bud_Lighta_Cedric_a_Island_Fantasy_2005       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `title__-OTHER`                                      <dbl> 1, 1, 1, 1, 1, 1,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1,…
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>% 
    correlate(like_count__592.75_Inf)

data_corr_tbl
## # A tibble: 40 × 3
##    feature       bin              correlation
##    <fct>         <chr>                  <dbl>
##  1 like_count    592.75_Inf             1    
##  2 view_count    219180.25_Inf          0.782
##  3 dislike_count 37.75_Inf              0.672
##  4 dislike_count -Inf_2                -0.366
##  5 view_count    -Inf_10484.75         -0.338
##  6 like_count    -Inf_29.75            -0.338
##  7 like_count    165_592.75            -0.333
##  8 view_count    10484.75_58515.5      -0.333
##  9 like_count    29.75_165             -0.333
## 10 dislike_count 2_8.5                 -0.306
## # ℹ 30 more rows
# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps