Goal: predict the YouTube like count. Click here for the data.

Import Data

 # Load the TidyTuesday 2021-03-02 YouTube data set directly from GitHub.
 youtube <- readr::read_csv(
   file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
 )

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Profile every column (type, missingness, distribution) before any cleaning.
skimr::skim(youtube)

Data summary
Name	youtube
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

# Build the analysis table: drop columns that are not used as predictors,
# remove incomplete rows, and put the target (like_count) on a log scale.
data <- youtube %>%
  # Treat missing values: remove sparse columns (thumbnail, description),
  # the all-zero favorite_count, and the remaining unused fields, so that
  # na.omit() below discards as few rows as possible.
  select(
    -thumbnail, -description, -favorite_count, -comment_count,
    -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url,
    -id, -etag, -channel_title
  ) %>%
  na.omit() %>%
  # Log-transform the positively skewed like_count. Counts are floored at 1
  # first so videos with zero likes map to 0 instead of log(0) = -Inf, which
  # ggplot silently drops and which turns any mean() over them into -Inf.
  # Note: 0 and 1 likes are therefore treated as the same value.
  mutate(like_count = log(pmax(like_count, 1)))

Explore Data

Identify good predictors

like_count

# Scatter of log like_count against view_count (y axis on a log10 scale).
ggplot(data, aes(x = like_count, y = view_count)) +
  geom_point() +
  scale_y_log10()

# Distribution of log like_count for each brand, one box per brand.
ggplot(data, aes(x = like_count, y = as.factor(brand))) +
  geom_boxplot()

## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

title

# Which words in an ad's title go with a high (log) like count?
data %>%
  # Guard: skip rows whose log like_count is not finite (e.g. log(0) = -Inf),
  # otherwise a single such row makes the whole word mean -Inf.
  filter(is.finite(like_count)) %>%
  # Tokenise the title into one row per word (previously tokenised `brand`,
  # which contradicted the section heading and the axis label).
  unnest_tokens(output = word, input = title) %>%
  # Average log like_count per word, plus how often the word occurs
  group_by(word) %>%
  summarise(
    like_count = mean(like_count),
    n = n()
  ) %>%
  ungroup() %>%
  # Keep words occurring more than 10 times and drop tokens containing digits
  filter(n > 10, !str_detect(word, "\\d")) %>%
  # Top 20 words by average log like_count
  slice_max(order_by = like_count, n = 20) %>%
  # Plot, ordering words by their average
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +
  labs(y = "word in Title")

# Step 1: prepare data
# Drop dislike_count and the free-text title, then binarize what is left
# into 0/1 indicator columns (binned numerics, one column per category level).
data_binarized_tbl <- data %>%
  select(-c(dislike_count, title)) %>%
  binarize()

glimpse(data_binarized_tbl)

## Rows: 225
## Columns: 36
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__0                                      <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__1                                      <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__0                       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__1                       <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__0                                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__1                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__0                                  <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__1                                  <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__0                                     <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__1                                     <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__0                                    <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__1                                    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__0                                    <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__1                                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `view_count__-Inf_6641`                       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ view_count__6641_43983                        <dbl> 0, 0, 0, 0, 1, 1, 0, 1, …
## $ view_count__43983_175482                      <dbl> 1, 1, 1, 0, 0, 0, 0, 0, …
## $ view_count__175482_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `like_count__-Inf_2.94443897916644`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.94443897916644_4.86753445045558 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.86753445045558_6.26720054854136 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26720054854136_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …

# Step 2: correlate
# Correlate every indicator column with the top like_count bin. The target
# name embeds the bin boundary produced by binarize() in step 1.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = like_count__6.26720054854136_Inf
)

data_corr_tbl

## # A tibble: 36 × 3
##    feature    bin                               correlation
##    <fct>      <chr>                                   <dbl>
##  1 like_count 6.26720054854136_Inf                    1    
##  2 view_count 175482_Inf                              0.715
##  3 like_count -Inf_2.94443897916644                  -0.339
##  4 view_count -Inf_6641                              -0.335
##  5 like_count 4.86753445045558_6.26720054854136      -0.331
##  6 like_count 2.94443897916644_4.86753445045558      -0.327
##  7 view_count 6641_43983                             -0.308
##  8 brand      Doritos                                 0.281
##  9 brand      NFL                                     0.250
## 10 brand      Bud_Light                              -0.212
## # ℹ 26 more rows

# Step 3: plot the correlation funnel
plot_correlation_funnel(data_corr_tbl)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Apply your Data 1

Javony Deleon

2026-02-07

Import Data

Explore Data

Preprocess Data

Build Models

Evaluate Models

Make Predictions