tuesdata <- tidytuesdayR::tt_load(‘2021-03-02’) tuesdata <- tidytuesdayR::tt_load(2021, week = 10)
youtube <- tuesdata$youtube
youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
# Remove missing values
select(-thumbnail, -ends_with("url"), -etag, -kind, -favorite_count, -published_at, -view_count, -comment_count, -description, -dislike_count) %>%
na.omit() %>%
# log transform the target variable
mutate(like_count = log(like_count)) %>%
# category_id is a factor
mutate(category_id = as.factor(category_id)) %>%
# Convert logical to factor
mutate(across(where(is.logical), factor)) %>%
# Convert character to factor
mutate(across(c(id, channel_title, brand), factor))
# Step 1: Prepare Data
library(lubridate)
data_binarized_tbl <-data %>%
select(-id, -title) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 54
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.94443897916644` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.94443897916644_4.86753445045558 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.86753445045558_6.26720054854136 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26720054854136_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__BudBowlXLII <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__Funny_Commercials <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ channel_title__John_Keehler <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ channel_title__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__reggiep08v2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__USA_TODAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__World_Hyundai_Matteson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__-OTHER` <dbl> 0, 1, 0, 1, 1, 0, 0, 1, …
## $ category_id__1 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2 : Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate( like_count__6.26720054854136_Inf )
data_corr_tbl
## # A tibble: 54 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.26720054854136_Inf 1
## 2 like_count -Inf_2.94443897916644 -0.339
## 3 like_count 4.86753445045558_6.26720054854136 -0.331
## 4 like_count 2.94443897916644_4.86753445045558 -0.327
## 5 brand Doritos 0.281
## 6 channel_title NFL 0.262
## 7 brand NFL 0.250
## 8 brand Bud_Light -0.212
## 9 year 2015_Inf 0.202
## 10 channel_title Coca-Cola 0.202
## # ℹ 44 more rows
#Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps