Goal: The goal is to predict the Youtube like count. Click here for the data.

Import Data

youtube <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(youtube)

Data summary
Name	youtube
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

data <- youtube %>%
    
    # Treat missing values
    select(-superbowl_ads_dot_com_url, -youtube_url, -id, -kind, -etag, -favorite_count, -thumbnail, -channel_title) %>%
    na.omit()

Explore Data

Identify good predictors

likes per year

data %>%
    ggplot(aes(year, like_count)) +
    geom_col() +
    labs(title = "Likes by year",
         x = "Year",
         y = "Likes")

views and likes

data %>%
    ggplot(aes(like_count, view_count)) +
    geom_point() +
    labs(title = "Views and Likes",
         x = "Likes",
         y = "Views")

comments on a video

data %>%
    ggplot(aes(like_count, comment_count)) + 
    geom_point() +
    labs(title = "Comments and Likes",
         x = "Likes",
         y = "Comments")

Likes for categories

# List categories
characteristics <- c("funny", "show_product_quickly", "patriotic", "celebrity", "danger", "animals", "use_sex") 

# Select TRUE values and count likes
likes_by_cat <- data %>%
    select(all_of(characteristics), like_count) %>%
    group_by(dplyr::across(all_of(characteristics))) %>%
    summarise(total_likes = sum(like_count, na.rm = TRUE)) %>%
    ungroup()

## `summarise()` has grouped output by 'funny', 'show_product_quickly',
## 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the
## `.groups` argument.

# Reshape Data
likes_long <- likes_by_cat %>%
    pivot_longer(cols = all_of(characteristics), names_to = "characteristic", values_to = "is_true") %>%
    filter(is_true)

# Plot Data
likes_long %>%
    ggplot(aes(x = characteristic, y = total_likes)) +
    geom_col() +
    labs(title = "Likes by category",
         x = "Category",
         y = "Likes")

EDA shortcut

# Step 1: Prepare data

data_binarized_tbl <- data %>%
    select(-published_at) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 190
## Columns: 52
## $ `year__-Inf_2006`                                    <dbl> 0, 1, 0, 1, 0, 0,…
## $ year__2006_2010                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2010_2014.75                                   <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2014.75_Inf                                    <dbl> 1, 0, 1, 0, 1, 1,…
## $ brand__Bud_Light                                     <dbl> 1, 1, 0, 1, 0, 0,…
## $ brand__Budweiser                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola`                                   <dbl> 0, 0, 0, 0, 0, 1,…
## $ brand__Doritos                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade`                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai                                       <dbl> 0, 0, 1, 0, 0, 0,…
## $ brand__Kia                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota                                        <dbl> 0, 0, 0, 0, 1, 0,…
## $ funny__0                                             <dbl> 0, 0, 1, 0, 0, 0,…
## $ funny__1                                             <dbl> 1, 1, 0, 1, 1, 1,…
## $ show_product_quickly__0                              <dbl> 0, 1, 0, 0, 0, 1,…
## $ show_product_quickly__1                              <dbl> 1, 0, 1, 1, 1, 0,…
## $ patriotic__0                                         <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__1                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__0                                         <dbl> 0, 1, 1, 1, 0, 0,…
## $ celebrity__1                                         <dbl> 1, 0, 0, 0, 1, 1,…
## $ danger__0                                            <dbl> 0, 0, 1, 0, 0, 1,…
## $ danger__1                                            <dbl> 1, 1, 0, 1, 1, 0,…
## $ animals__0                                           <dbl> 1, 0, 1, 0, 0, 0,…
## $ animals__1                                           <dbl> 0, 1, 0, 1, 1, 1,…
## $ use_sex__0                                           <dbl> 1, 1, 1, 0, 1, 1,…
## $ use_sex__1                                           <dbl> 0, 0, 0, 1, 0, 0,…
## $ `view_count__-Inf_10484.75`                          <dbl> 0, 0, 1, 0, 0, 0,…
## $ view_count__10484.75_58515.5                         <dbl> 1, 0, 0, 1, 1, 0,…
## $ view_count__58515.5_219180.25                        <dbl> 0, 1, 0, 0, 0, 0,…
## $ view_count__219180.25_Inf                            <dbl> 0, 0, 0, 0, 0, 1,…
## $ `like_count__-Inf_32`                                <dbl> 0, 0, 1, 1, 0, 0,…
## $ like_count__32_165                                   <dbl> 0, 1, 0, 0, 1, 0,…
## $ like_count__165_588.75                               <dbl> 1, 0, 0, 0, 0, 0,…
## $ like_count__588.75_Inf                               <dbl> 0, 0, 0, 0, 0, 1,…
## $ `dislike_count__-Inf_2`                              <dbl> 0, 0, 1, 0, 0, 0,…
## $ dislike_count__2_8.5                                 <dbl> 0, 0, 0, 1, 0, 0,…
## $ dislike_count__8.5_37                                <dbl> 1, 1, 0, 0, 1, 0,…
## $ dislike_count__37_Inf                                <dbl> 0, 0, 0, 0, 0, 1,…
## $ `comment_count__-Inf_2`                              <dbl> 0, 0, 1, 1, 0, 0,…
## $ comment_count__2_15                                  <dbl> 1, 1, 0, 0, 1, 0,…
## $ comment_count__15_65                                 <dbl> 0, 0, 0, 0, 0, 0,…
## $ comment_count__65_Inf                                <dbl> 0, 0, 0, 0, 0, 1,…
## $ title__Bud_Lighta_Cedric_a_Island_Fantasy_2005       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `title__-OTHER`                                      <dbl> 1, 1, 1, 1, 1, 1,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1,…
## $ `category_id__-Inf_17`                               <dbl> 0, 1, 0, 0, 1, 0,…
## $ category_id__17_23                                   <dbl> 0, 0, 1, 0, 0, 0,…
## $ category_id__23_24                                   <dbl> 0, 0, 0, 1, 0, 1,…
## $ category_id__24_Inf                                  <dbl> 1, 0, 0, 0, 0, 0,…

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(like_count__588.75_Inf)

data_corr_tbl

## # A tibble: 52 × 3
##    feature       bin              correlation
##    <fct>         <chr>                  <dbl>
##  1 like_count    588.75_Inf             1    
##  2 view_count    219180.25_Inf          0.777
##  3 comment_count 65_Inf                 0.761
##  4 dislike_count 37_Inf                 0.649
##  5 comment_count -Inf_2                -0.362
##  6 dislike_count -Inf_2                -0.362
##  7 like_count    -Inf_32               -0.343
##  8 view_count    -Inf_10484.75         -0.338
##  9 view_count    10484.75_58515.5      -0.333
## 10 like_count    165_588.75            -0.333
## # ℹ 42 more rows

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Apply to your Data 1

Max Haussmann

2024-09-09

Import Data

Explore Data

Preprocess Data

Build Models

Evaluate Models

Make Predictions