Import Data

# Load the 2021-03-02 tidytuesday YouTube data set straight from GitHub.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv"
)
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a per-column summary of the raw import (missingness, distribution).
skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
# Build the analysis table from the raw import.
data <- youtube %>%
  # Treat missing values: first drop the text/metadata columns that are not
  # used in the exploration below ...
  select(
    -c(thumbnail, channel_title, description, etag, kind, youtube_url, id)
  ) %>%
  # ... then keep only the rows that are complete in the remaining columns.
  na.omit() %>%
  # like_count is strongly right-skewed, so it is put on the natural-log scale.
  # NOTE(review): the summary above reports a minimum like_count of 0, and
  # log(0) is -Inf -- confirm that -Inf values are acceptable downstream.
  mutate(like_count = log(like_count))

Explore Data

Identify Good Predictors

View Count

# Scatter of (log-transformed) likes against views; views on a log10 axis.
ggplot(data, aes(x = like_count, y = view_count)) +
  geom_point() +
  scale_y_log10()

Dislike Count

# Scatter of (log-transformed) likes against dislikes.
# dislike_count is a numeric count, so it stays on a continuous axis instead of
# being converted to a factor (which creates one axis level per distinct value
# and hides the relationship). The +1 offset keeps zero-dislike videos finite
# on the log10 axis, mirroring the log-scaled view-count plot.
data %>%
  ggplot(aes(like_count, dislike_count + 1)) +
  scale_y_log10() +
  geom_point() +
  labs(y = "dislike_count + 1")

Comment Count

# Scatter of (log-transformed) likes against comments.
# A geom layer is required: without geom_point() only an empty panel is drawn.
# comment_count is a numeric count, so it stays on a continuous axis instead of
# being converted to a factor; the +1 offset keeps zero-comment videos finite
# on the log10 axis.
data %>%
  ggplot(aes(like_count, comment_count + 1)) +
  scale_y_log10() +
  geom_point() +
  labs(y = "comment_count + 1")

EDA Shortcut

# Prepare Data
# Drop the text, date, flag and identifier columns, leaving only the
# engagement counts and category_id, then let binarize() turn each remaining
# column into 0/1 bin indicator columns.
data_binarized_tbl <- data %>%
  select(
    -c(
      title, published_at,
      animals, funny, danger, use_sex, patriotic, celebrity,
      show_product_quickly,
      year, brand, superbowl_ads_dot_com_url
    )
  ) %>%
  binarize()

# Inspect the generated bin columns.
glimpse(data_binarized_tbl)
## Rows: 219
## Columns: 20
## $ `view_count__-Inf_6577`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ view_count__6577_41828                        <dbl> 0, 0, 0, 1, 1, 0, 1, 1, …
## $ view_count__41828_176014.5                    <dbl> 1, 1, 0, 0, 0, 0, 0, 0, …
## $ view_count__176014.5_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `like_count__-Inf_2.9174053685313`            <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__2.9174053685313_4.86753445045558  <dbl> 0, 1, 0, 1, 1, 0, 1, 0, …
## $ like_count__4.86753445045558_6.17478337258445 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ like_count__6.17478337258445_Inf              <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ `dislike_count__-Inf_1`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ dislike_count__1_7                            <dbl> 0, 0, 0, 1, 0, 0, 1, 1, …
## $ dislike_count__7_24                           <dbl> 1, 1, 0, 0, 1, 0, 0, 0, …
## $ dislike_count__24_Inf                         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `comment_count__-Inf_1`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ comment_count__1_11                           <dbl> 0, 1, 0, 1, 0, 0, 1, 0, …
## $ comment_count__11_51.5                        <dbl> 1, 0, 0, 0, 1, 0, 0, 1, …
## $ comment_count__51.5_Inf                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `category_id__-Inf_17`                        <dbl> 0, 1, 0, 0, 1, 0, 1, 0, …
## $ category_id__17_23                            <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__23_24                            <dbl> 0, 0, 0, 1, 0, 1, 0, 1, …
## $ category_id__24_Inf                           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
# Correlate
# Correlate every bin indicator with the highest like_count bin (the target).
# NOTE(review): the target column name embeds a bin boundary taken from the
# binarize() output, so it presumably has to be updated whenever the data or
# the binning changes -- confirm before reusing this chunk.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  like_count__6.17478337258445_Inf
)

data_corr_tbl
## # A tibble: 20 × 3
##    feature       bin                               correlation
##    <fct>         <chr>                                   <dbl>
##  1 like_count    6.17478337258445_Inf                   1     
##  2 comment_count 51.5_Inf                               0.806 
##  3 view_count    176014.5_Inf                           0.733 
##  4 dislike_count 24_Inf                                 0.695 
##  5 comment_count -Inf_1                                -0.372 
##  6 dislike_count -Inf_1                                -0.352 
##  7 dislike_count 1_7                                   -0.348 
##  8 view_count    -Inf_6577                             -0.335 
##  9 like_count    -Inf_2.9174053685313                  -0.335 
## 10 like_count    2.9174053685313_4.86753445045558      -0.335 
## 11 like_count    4.86753445045558_6.17478337258445     -0.331 
## 12 view_count    6577_41828                            -0.311 
## 13 comment_count 1_11                                  -0.303 
## 14 comment_count 11_51.5                               -0.131 
## 15 category_id   -Inf_17                                0.106 
## 16 view_count    41828_176014.5                        -0.0870
## 17 category_id   23_24                                 -0.0736
## 18 category_id   17_23                                 -0.0298
## 19 dislike_count 7_24                                   0.0241
## 20 category_id   24_Inf                                 0.0114
# Plot
# Draw the correlation funnel for the table computed in the previous step.
plot_correlation_funnel(data_corr_tbl)