download.file(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv",
  destfile = "youtube.csv",
  mode = "wb"
)

youtube <- read_csv("youtube.csv")
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
  mutate(
    title = replace_na(title, "missing"),
    description = replace_na(description, "missing"),
    brand = replace_na(brand, "missing"),
    channel_title = replace_na(channel_title, "missing"),
    funny = replace_na(funny, FALSE),
    show_product_quickly = replace_na(show_product_quickly, FALSE),
    patriotic = replace_na(patriotic, FALSE),
    celebrity = replace_na(celebrity, FALSE),
    danger = replace_na(danger, FALSE),
    animals = replace_na(animals, FALSE),
    use_sex = replace_na(use_sex, FALSE),
    view_count = replace_na(view_count, median(view_count, na.rm = TRUE)),
    like_count = replace_na(like_count, median(like_count, na.rm = TRUE))
  ) %>%
  filter(like_count > 0) %>%
  mutate(
    high_likes = if_else(
      like_count >= median(like_count, na.rm = TRUE),
      "Yes",
      "No"
    ),
    high_likes = factor(high_likes, levels = c("No", "Yes")),
    year = as.factor(year),
    brand = fct_lump_n(as.factor(brand), n = 10),
    channel_title = fct_lump_n(as.factor(channel_title), n = 10),
    funny = as.factor(funny),
    show_product_quickly = as.factor(show_product_quickly),
    patriotic = as.factor(patriotic),
    celebrity = as.factor(celebrity),
    danger = as.factor(danger),
    animals = as.factor(animals),
    use_sex = as.factor(use_sex)
  )

Explore Data

skimr::skim(data)
Data summary
Name data
Number of rows 238
Number of columns 26
_______________________
Column type frequency:
character 8
factor 11
numeric 6
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
superbowl_ads_dot_com_url 0 1.00 34 120 0 235 0
youtube_url 11 0.95 43 43 0 224 0
id 11 0.95 11 11 0 224 0
kind 16 0.93 13 13 0 1 0
etag 16 0.93 27 27 0 219 0
title 0 1.00 6 99 0 220 0
description 0 1.00 3 3527 0 189 0
thumbnail 126 0.47 48 48 0 112 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
year 0 1 FALSE 21 200: 15, 201: 15, 201: 15, 201: 14
brand 0 1 FALSE 10 Bud: 62, Bud: 39, Dor: 25, Pep: 24
funny 0 1 FALSE 2 TRU: 167, FAL: 71
show_product_quickly 0 1 FALSE 2 TRU: 164, FAL: 74
patriotic 0 1 FALSE 2 FAL: 199, TRU: 39
celebrity 0 1 FALSE 2 FAL: 169, TRU: 69
danger 0 1 FALSE 2 FAL: 163, TRU: 75
animals 0 1 FALSE 2 FAL: 149, TRU: 89
use_sex 0 1 FALSE 2 FAL: 172, TRU: 66
channel_title 0 1 FALSE 13 Oth: 183, mis: 16, NFL: 5, omo: 5
high_likes 0 1 FALSE 2 Yes: 135, No: 103

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
view_count 0 1.00 1368934.98 11795086.74 42 10484.75 41379 154328.5 176373378 ▇▁▁▁▁
like_count 0 1.00 3931.58 23272.26 1 32.25 130 451.5 275362 ▇▁▁▁▁
dislike_count 22 0.91 868.27 7090.33 0 2.00 7 28.5 92990 ▇▁▁▁▁
favorite_count 16 0.93 0.00 0.00 0 0.00 0 0.0 0 ▁▁▇▁▁
comment_count 25 0.89 196.61 1006.40 0 1.00 13 52.0 9190 ▇▁▁▁▁
category_id 16 0.93 19.21 8.13 1 17.00 23 24.0 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.93 2006-02-06 10:02:36 2021-01-27 13:11:29 2012-12-24 11:50:26 218
data %>% count(high_likes)
## # A tibble: 2 × 2
##   high_likes     n
##   <fct>      <int>
## 1 No           103
## 2 Yes          135
data %>%
  ggplot(aes(high_likes)) +
  geom_bar()

High Likes vs View Count

data %>%
  ggplot(aes(high_likes, view_count)) +
  geom_boxplot() +
  scale_y_log10()

Correlation Plot

data_clean <- data %>%
  select(
    high_likes,
    year,
    brand,
    channel_title,
    funny,
    show_product_quickly,
    patriotic,
    celebrity,
    danger,
    animals,
    use_sex,
    view_count
  )
data_binarized <- data_clean %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 238
## Columns: 64
## $ high_likes__No                         <dbl> 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1…
## $ high_likes__Yes                        <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0…
## $ year__2000                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2001                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2002                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2003                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ year__2004                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2005                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2006                             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2007                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2008                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2009                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2010                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ year__2011                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2012                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2013                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2014                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2015                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2016                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2017                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2018                             <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ year__2019                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2020                             <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ brand__Bud_Light                       <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0…
## $ brand__Budweiser                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ `brand__Coca-Cola`                     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ brand__Doritos                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `brand__E-Trade`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Hynudai                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ brand__Kia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ brand__NFL                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Pepsi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Toyota                          <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__BudBowlXLII             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `channel_title__Coca-Cola`             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ channel_title__Funny_Commercials       <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__John_Keehler            <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__missing                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__Mister_Alcohol          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__NFL                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__omon007                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__reggiep08v2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__USA_TODAY               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__World_Hyundai_Matteson  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ channel_title__Other                   <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0…
## $ funny__FALSE                           <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ funny__TRUE                            <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ show_product_quickly__FALSE            <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ show_product_quickly__TRUE             <dbl> 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1…
## $ patriotic__FALSE                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0…
## $ patriotic__TRUE                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ celebrity__FALSE                       <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1…
## $ celebrity__TRUE                        <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ danger__FALSE                          <dbl> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1…
## $ danger__TRUE                           <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0…
## $ animals__FALSE                         <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ animals__TRUE                          <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ use_sex__FALSE                         <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
## $ use_sex__TRUE                          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ `view_count__-Inf_10484.75`            <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ view_count__10484.75_41379             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0…
## $ view_count__41379_154328.5             <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ view_count__154328.5_Inf               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
data_correlation <- data_binarized %>%
  correlate(target = high_likes__Yes)

data_correlation
## # A tibble: 64 × 3
##    feature       bin           correlation
##    <fct>         <chr>               <dbl>
##  1 high_likes    No                 -1    
##  2 high_likes    Yes                 1    
##  3 view_count    -Inf_10484.75      -0.587
##  4 view_count    154328.5_Inf        0.507
##  5 channel_title missing             0.234
##  6 channel_title Other              -0.177
##  7 channel_title omon007            -0.168
##  8 brand         Doritos             0.161
##  9 brand         Bud_Light          -0.158
## 10 patriotic     FALSE              -0.158
## # ℹ 54 more rows
data_correlation %>%
  plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Summary

For this assignment, I built on my Apply 4 work by using the same YouTube dataset, but changing the analysis to follow the approach from Code Along 5. Instead of predicting like_count as a numeric outcome, I created a new binary variable called high_likes that groups videos into higher-like and lower-like categories based on the median like count. I then used correlation funnel analysis to explore which variables were most strongly related to videos with higher likes.