download.file(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv",
destfile = "youtube.csv",
mode = "wb"
)
youtube <- read_csv("youtube.csv")
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
mutate(
title = replace_na(title, "missing"),
description = replace_na(description, "missing"),
brand = replace_na(brand, "missing"),
channel_title = replace_na(channel_title, "missing"),
funny = replace_na(funny, FALSE),
show_product_quickly = replace_na(show_product_quickly, FALSE),
patriotic = replace_na(patriotic, FALSE),
celebrity = replace_na(celebrity, FALSE),
danger = replace_na(danger, FALSE),
animals = replace_na(animals, FALSE),
use_sex = replace_na(use_sex, FALSE),
view_count = replace_na(view_count, median(view_count, na.rm = TRUE)),
like_count = replace_na(like_count, median(like_count, na.rm = TRUE))
) %>%
filter(like_count > 0) %>%
mutate(
high_likes = if_else(
like_count >= median(like_count, na.rm = TRUE),
"Yes",
"No"
),
high_likes = factor(high_likes, levels = c("No", "Yes")),
year = as.factor(year),
brand = fct_lump_n(as.factor(brand), n = 10),
channel_title = fct_lump_n(as.factor(channel_title), n = 10),
funny = as.factor(funny),
show_product_quickly = as.factor(show_product_quickly),
patriotic = as.factor(patriotic),
celebrity = as.factor(celebrity),
danger = as.factor(danger),
animals = as.factor(animals),
use_sex = as.factor(use_sex)
)
skimr::skim(data)
| Name | data |
| Number of rows | 238 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| factor | 11 |
| numeric | 6 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 235 | 0 |
| youtube_url | 11 | 0.95 | 43 | 43 | 0 | 224 | 0 |
| id | 11 | 0.95 | 11 | 11 | 0 | 224 | 0 |
| kind | 16 | 0.93 | 13 | 13 | 0 | 1 | 0 |
| etag | 16 | 0.93 | 27 | 27 | 0 | 219 | 0 |
| title | 0 | 1.00 | 6 | 99 | 0 | 220 | 0 |
| description | 0 | 1.00 | 3 | 3527 | 0 | 189 | 0 |
| thumbnail | 126 | 0.47 | 48 | 48 | 0 | 112 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| year | 0 | 1 | FALSE | 21 | 200: 15, 201: 15, 201: 15, 201: 14 |
| brand | 0 | 1 | FALSE | 10 | Bud: 62, Bud: 39, Dor: 25, Pep: 24 |
| funny | 0 | 1 | FALSE | 2 | TRU: 167, FAL: 71 |
| show_product_quickly | 0 | 1 | FALSE | 2 | TRU: 164, FAL: 74 |
| patriotic | 0 | 1 | FALSE | 2 | FAL: 199, TRU: 39 |
| celebrity | 0 | 1 | FALSE | 2 | FAL: 169, TRU: 69 |
| danger | 0 | 1 | FALSE | 2 | FAL: 163, TRU: 75 |
| animals | 0 | 1 | FALSE | 2 | FAL: 149, TRU: 89 |
| use_sex | 0 | 1 | FALSE | 2 | FAL: 172, TRU: 66 |
| channel_title | 0 | 1 | FALSE | 13 | Oth: 183, mis: 16, NFL: 5, omo: 5 |
| high_likes | 0 | 1 | FALSE | 2 | Yes: 135, No: 103 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| view_count | 0 | 1.00 | 1368934.98 | 11795086.74 | 42 | 10484.75 | 41379 | 154328.5 | 176373378 | ▇▁▁▁▁ |
| like_count | 0 | 1.00 | 3931.58 | 23272.26 | 1 | 32.25 | 130 | 451.5 | 275362 | ▇▁▁▁▁ |
| dislike_count | 22 | 0.91 | 868.27 | 7090.33 | 0 | 2.00 | 7 | 28.5 | 92990 | ▇▁▁▁▁ |
| favorite_count | 16 | 0.93 | 0.00 | 0.00 | 0 | 0.00 | 0 | 0.0 | 0 | ▁▁▇▁▁ |
| comment_count | 25 | 0.89 | 196.61 | 1006.40 | 0 | 1.00 | 13 | 52.0 | 9190 | ▇▁▁▁▁ |
| category_id | 16 | 0.93 | 19.21 | 8.13 | 1 | 17.00 | 23 | 24.0 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| published_at | 16 | 0.93 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2012-12-24 11:50:26 | 218 |
data %>% count(high_likes)
## # A tibble: 2 × 2
## high_likes n
## <fct> <int>
## 1 No 103
## 2 Yes 135
data %>%
ggplot(aes(high_likes)) +
geom_bar()
data %>%
ggplot(aes(high_likes, view_count)) +
geom_boxplot() +
scale_y_log10()
data_clean <- data %>%
select(
high_likes,
year,
brand,
channel_title,
funny,
show_product_quickly,
patriotic,
celebrity,
danger,
animals,
use_sex,
view_count
)
data_binarized <- data_clean %>%
binarize()
data_binarized %>% glimpse()
## Rows: 238
## Columns: 64
## $ high_likes__No <dbl> 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1…
## $ high_likes__Yes <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0…
## $ year__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2002 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2003 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ year__2004 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2006 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ year__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2018 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ year__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2020 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0…
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__BudBowlXLII <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `channel_title__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ channel_title__Funny_Commercials <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ channel_title__John_Keehler <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__missing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__Mister_Alcohol <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__reggiep08v2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__USA_TODAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ channel_title__World_Hyundai_Matteson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ channel_title__Other <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0…
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1…
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0…
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1…
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0…
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1…
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0…
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0…
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ `view_count__-Inf_10484.75` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ view_count__10484.75_41379 <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0…
## $ view_count__41379_154328.5 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ view_count__154328.5_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
data_correlation <- data_binarized %>%
correlate(target = high_likes__Yes)
data_correlation
## # A tibble: 64 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 high_likes No -1
## 2 high_likes Yes 1
## 3 view_count -Inf_10484.75 -0.587
## 4 view_count 154328.5_Inf 0.507
## 5 channel_title missing 0.234
## 6 channel_title Other -0.177
## 7 channel_title omon007 -0.168
## 8 brand Doritos 0.161
## 9 brand Bud_Light -0.158
## 10 patriotic FALSE -0.158
## # ℹ 54 more rows
data_correlation %>%
plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
For this assignment, I built on my Apply 4 work by using the same YouTube dataset, but changing the analysis to follow the approach from Code Along 5. Instead of predicting like_count as a numeric outcome, I created a new binary variable called high_likes that groups videos into higher-like and lower-like categories based on the median like count. I then used correlation funnel analysis to explore which variables were most strongly related to videos with higher likes.