Goal: The goal is to predict the Youtube like count. Click here for the data.
youtube <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
Name | youtube |
Number of rows | 247 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 7 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
data <- youtube %>%
# Treat missing values
select(-superbowl_ads_dot_com_url, -youtube_url, -id, -kind, -etag, -favorite_count, -thumbnail, -channel_title) %>%
na.omit()
Identify good predictors
likes per year
data %>%
ggplot(aes(year, like_count)) +
geom_col() +
labs(title = "Likes by year",
x = "Year",
y = "Likes")
views and likes
data %>%
ggplot(aes(like_count, view_count)) +
geom_point() +
labs(title = "Views and Likes",
x = "Likes",
y = "Views")
comments on a video
data %>%
ggplot(aes(like_count, comment_count)) +
geom_point() +
labs(title = "Comments and Likes",
x = "Likes",
y = "Comments")
Likes for categories
# List categories
characteristics <- c("funny", "show_product_quickly", "patriotic", "celebrity", "danger", "animals", "use_sex")
# Select TRUE values and count likes
likes_by_cat <- data %>%
select(all_of(characteristics), like_count) %>%
group_by(dplyr::across(all_of(characteristics))) %>%
summarise(total_likes = sum(like_count, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'funny', 'show_product_quickly',
## 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the
## `.groups` argument.
# Reshape Data
likes_long <- likes_by_cat %>%
pivot_longer(cols = all_of(characteristics), names_to = "characteristic", values_to = "is_true") %>%
filter(is_true)
# Plot Data
likes_long %>%
ggplot(aes(x = characteristic, y = total_likes)) +
geom_col() +
labs(title = "Likes by category",
x = "Category",
y = "Likes")
EDA shortcut
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-published_at) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 190
## Columns: 52
## $ `year__-Inf_2006` <dbl> 0, 1, 0, 1, 0, 0,…
## $ year__2006_2010 <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2010_2014.75 <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2014.75_Inf <dbl> 1, 0, 1, 0, 1, 1,…
## $ brand__Bud_Light <dbl> 1, 1, 0, 1, 0, 0,…
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 1,…
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai <dbl> 0, 0, 1, 0, 0, 0,…
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota <dbl> 0, 0, 0, 0, 1, 0,…
## $ funny__0 <dbl> 0, 0, 1, 0, 0, 0,…
## $ funny__1 <dbl> 1, 1, 0, 1, 1, 1,…
## $ show_product_quickly__0 <dbl> 0, 1, 0, 0, 0, 1,…
## $ show_product_quickly__1 <dbl> 1, 0, 1, 1, 1, 0,…
## $ patriotic__0 <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__1 <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__0 <dbl> 0, 1, 1, 1, 0, 0,…
## $ celebrity__1 <dbl> 1, 0, 0, 0, 1, 1,…
## $ danger__0 <dbl> 0, 0, 1, 0, 0, 1,…
## $ danger__1 <dbl> 1, 1, 0, 1, 1, 0,…
## $ animals__0 <dbl> 1, 0, 1, 0, 0, 0,…
## $ animals__1 <dbl> 0, 1, 0, 1, 1, 1,…
## $ use_sex__0 <dbl> 1, 1, 1, 0, 1, 1,…
## $ use_sex__1 <dbl> 0, 0, 0, 1, 0, 0,…
## $ `view_count__-Inf_10484.75` <dbl> 0, 0, 1, 0, 0, 0,…
## $ view_count__10484.75_58515.5 <dbl> 1, 0, 0, 1, 1, 0,…
## $ view_count__58515.5_219180.25 <dbl> 0, 1, 0, 0, 0, 0,…
## $ view_count__219180.25_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `like_count__-Inf_32` <dbl> 0, 0, 1, 1, 0, 0,…
## $ like_count__32_165 <dbl> 0, 1, 0, 0, 1, 0,…
## $ like_count__165_588.75 <dbl> 1, 0, 0, 0, 0, 0,…
## $ like_count__588.75_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `dislike_count__-Inf_2` <dbl> 0, 0, 1, 0, 0, 0,…
## $ dislike_count__2_8.5 <dbl> 0, 0, 0, 1, 0, 0,…
## $ dislike_count__8.5_37 <dbl> 1, 1, 0, 0, 1, 0,…
## $ dislike_count__37_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `comment_count__-Inf_2` <dbl> 0, 0, 1, 1, 0, 0,…
## $ comment_count__2_15 <dbl> 1, 1, 0, 0, 1, 0,…
## $ comment_count__15_65 <dbl> 0, 0, 0, 0, 0, 0,…
## $ comment_count__65_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ title__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `title__-OTHER` <dbl> 1, 1, 1, 1, 1, 1,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER` <dbl> 1, 1, 1, 1, 1, 1,…
## $ `category_id__-Inf_17` <dbl> 0, 1, 0, 0, 1, 0,…
## $ category_id__17_23 <dbl> 0, 0, 1, 0, 0, 0,…
## $ category_id__23_24 <dbl> 0, 0, 0, 1, 0, 1,…
## $ category_id__24_Inf <dbl> 1, 0, 0, 0, 0, 0,…
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(like_count__588.75_Inf)
data_corr_tbl
## # A tibble: 52 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 588.75_Inf 1
## 2 view_count 219180.25_Inf 0.777
## 3 comment_count 65_Inf 0.761
## 4 dislike_count 37_Inf 0.649
## 5 comment_count -Inf_2 -0.362
## 6 dislike_count -Inf_2 -0.362
## 7 like_count -Inf_32 -0.343
## 8 view_count -Inf_10484.75 -0.338
## 9 view_count 10484.75_58515.5 -0.333
## 10 like_count 165_588.75 -0.333
## # ℹ 42 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps