download.file(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv",
  destfile = "youtube.csv",
  mode = "wb"
)

youtube <- read_csv("youtube.csv")

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data <- youtube %>%
  mutate(
    title = replace_na(title, "missing"),
    description = replace_na(description, "missing"),
    brand = replace_na(brand, "missing"),
    channel_title = replace_na(channel_title, "missing"),
    funny = replace_na(funny, FALSE),
    show_product_quickly = replace_na(show_product_quickly, FALSE),
    patriotic = replace_na(patriotic, FALSE),
    celebrity = replace_na(celebrity, FALSE),
    danger = replace_na(danger, FALSE),
    animals = replace_na(animals, FALSE),
    use_sex = replace_na(use_sex, FALSE),
    view_count = replace_na(view_count, median(view_count, na.rm = TRUE)),
    like_count = replace_na(like_count, median(like_count, na.rm = TRUE))
  ) %>%
  filter(like_count > 0) %>%
  mutate(
    high_likes = if_else(
      like_count >= median(like_count, na.rm = TRUE),
      "Yes",
      "No"
    ),
    high_likes = factor(high_likes, levels = c("No", "Yes")),
    year = as.factor(year),
    brand = fct_lump_n(as.factor(brand), n = 10),
    channel_title = fct_lump_n(as.factor(channel_title), n = 10),
    funny = as.factor(funny),
    show_product_quickly = as.factor(show_product_quickly),
    patriotic = as.factor(patriotic),
    celebrity = as.factor(celebrity),
    danger = as.factor(danger),
    animals = as.factor(animals),
    use_sex = as.factor(use_sex)
  )

Explore Data

skimr::skim(data)

Data summary
Name	data
Number of rows	238
Number of columns	26
_______________________
Column type frequency:
character	8
factor	11
numeric	6
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
superbowl_ads_dot_com_url	0	1.00	34	120	235
youtube_url	11	0.95	43	43	224
id	11	0.95	11	11	224
kind	16	0.93	13	13	1
etag	16	0.93	27	27	219
title	0	1.00	6	99	220
description	0	1.00	3	3527	189
thumbnail	126	0.47	48	48	112

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
year	1	FALSE	21	200: 15, 201: 15, 201: 15, 201: 14
brand	1	FALSE	10	Bud: 62, Bud: 39, Dor: 25, Pep: 24
funny	1	FALSE	2	TRU: 167, FAL: 71
show_product_quickly	1	FALSE	2	TRU: 164, FAL: 74
patriotic	1	FALSE	2	FAL: 199, TRU: 39
celebrity	1	FALSE	2	FAL: 169, TRU: 69
danger	1	FALSE	2	FAL: 163, TRU: 75
animals	1	FALSE	2	FAL: 149, TRU: 89
use_sex	1	FALSE	2	FAL: 172, TRU: 66
channel_title	1	FALSE	13	Oth: 183, mis: 16, NFL: 5, omo: 5
high_likes	1	FALSE	2	Yes: 135, No: 103

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
view_count	0	1.00	1368934.98	11795086.74	42	10484.75	41379	154328.5	176373378	▇▁▁▁▁
like_count	0	1.00	3931.58	23272.26	1	32.25	130	451.5	275362	▇▁▁▁▁
dislike_count	22	0.91	868.27	7090.33	0	2.00	7	28.5	92990	▇▁▁▁▁
favorite_count	16	0.93	0.00	0.00	0	0.00	0	0.0	0	▁▁▇▁▁
comment_count	25	0.89	196.61	1006.40	0	1.00	13	52.0	9190	▇▁▁▁▁
category_id	16	0.93	19.21	8.13	1	17.00	23	24.0	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.93	2006-02-06 10:02:36	2021-01-27 13:11:29	2012-12-24 11:50:26	218

data %>% count(high_likes)

## # A tibble: 2 × 2
##   high_likes     n
##   <fct>      <int>
## 1 No           103
## 2 Yes          135

data %>%
  ggplot(aes(high_likes)) +
  geom_bar()

High Likes vs View Count

data %>%
  ggplot(aes(high_likes, view_count)) +
  geom_boxplot() +
  scale_y_log10()

Correlation Plot

data_clean <- data %>%
  select(
    high_likes,
    year,
    brand,
    channel_title,
    funny,
    show_product_quickly,
    patriotic,
    celebrity,
    danger,
    animals,
    use_sex,
    view_count
  )

data_binarized <- data_clean %>%
  binarize()

data_binarized %>% glimpse()

## Rows: 238
## Columns: 64
## $ high_likes__No                         <dbl> 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1…
## $ high_likes__Yes                        <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0…
## $ year__2000                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2001                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2002                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2003                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ year__2004                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2005                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2006                             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2007                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2008                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2009                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2010                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ year__2011                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2012                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2013                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2014                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2015                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2016                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2017                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2018                             <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ year__2019                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2020                             <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ brand__Bud_Light                       <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0…
## $ brand__Budweiser                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ `brand__Coca-Cola`                     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ brand__Doritos                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `brand__E-Trade`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Hynudai                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ brand__Kia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ brand__NFL                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Pepsi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Toyota                          <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__BudBowlXLII             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `channel_title__Coca-Cola`             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ channel_title__Funny_Commercials       <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__John_Keehler            <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__missing                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__Mister_Alcohol          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__NFL                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__omon007                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__reggiep08v2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__USA_TODAY               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__World_Hyundai_Matteson  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ channel_title__Other                   <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0…
## $ funny__FALSE                           <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ funny__TRUE                            <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ show_product_quickly__FALSE            <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ show_product_quickly__TRUE             <dbl> 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1…
## $ patriotic__FALSE                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0…
## $ patriotic__TRUE                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ celebrity__FALSE                       <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1…
## $ celebrity__TRUE                        <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ danger__FALSE                          <dbl> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1…
## $ danger__TRUE                           <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0…
## $ animals__FALSE                         <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ animals__TRUE                          <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ use_sex__FALSE                         <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
## $ use_sex__TRUE                          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ `view_count__-Inf_10484.75`            <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ view_count__10484.75_41379             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0…
## $ view_count__41379_154328.5             <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ view_count__154328.5_Inf               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…

data_correlation <- data_binarized %>%
  correlate(target = high_likes__Yes)

data_correlation

## # A tibble: 64 × 3
##    feature       bin           correlation
##    <fct>         <chr>               <dbl>
##  1 high_likes    No                 -1    
##  2 high_likes    Yes                 1    
##  3 view_count    -Inf_10484.75      -0.587
##  4 view_count    154328.5_Inf        0.507
##  5 channel_title missing             0.234
##  6 channel_title Other              -0.177
##  7 channel_title omon007            -0.168
##  8 brand         Doritos             0.161
##  9 brand         Bud_Light          -0.158
## 10 patriotic     FALSE              -0.158
## # ℹ 54 more rows

data_correlation %>%
  plot_correlation_funnel()

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Summary

For this assignment, I built on my Apply 4 work by using the same YouTube dataset, but changing the analysis to follow the approach from Code Along 5. Instead of predicting like_count as a numeric outcome, I created a new binary variable called high_likes that groups videos into higher-like and lower-like categories based on the median like count. I then used correlation funnel analysis to explore which variables were most strongly related to videos with higher likes.

Apply 5

Matthew Plutzner

2026-04-12

Explore Data

High Likes vs View Count

Correlation Plot

Summary