# Download the TidyTuesday 2021-03-02 `youtube.csv` file into a tibble.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv"
)
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of `youtube` by type: missingness, completeness rate,
# and type-specific statistics (string lengths, logical counts, quantiles, ...).
skimr::skim(youtube)
| Name | youtube |
| Number of rows | 247 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 7 |
| numeric | 7 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
| superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
| youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
| id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
| kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
| etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
| title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
| description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
| thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
| channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
| show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
| patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
| celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
| danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
| animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
| use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
| view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
| like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
| dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
| favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
| comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
| category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
# Build the analysis table from the raw `youtube` data.
data <- youtube %>%
  # Drop the identifier / metadata columns, then keep only complete rows.
  select(
    -c(
      thumbnail,
      channel_title,
      description,
      etag,
      kind,
      youtube_url,
      id
    )
  ) %>%
  na.omit() %>%
  # Put the skewed like_count on the natural-log scale.
  # NOTE(review): the skim summary reports a minimum like_count of 0, and
  # log(0) is -Inf -- confirm that is intended for the plots and binning below.
  mutate(
    like_count = log(like_count)
  )
View Count
# Scatter plot of (log-transformed) like_count against view_count,
# with view_count drawn on a log10 axis.
ggplot(data, aes(x = like_count, y = view_count)) +
  geom_point() +
  scale_y_log10()
Dislike Count
# Scatter plot of (log-transformed) like_count against dislike_count.
# dislike_count is mapped as a factor, so each distinct count gets its own
# position on a discrete y-axis.
ggplot(data, aes(x = like_count, y = as.factor(dislike_count))) +
  geom_point()
Comment Count
# Scatter plot of (log-transformed) like_count against comment_count.
# comment_count is mapped as a factor, matching the dislike_count plot, so each
# distinct count gets its own position on a discrete y-axis.
data %>%
  ggplot(aes(like_count, as.factor(comment_count))) +
  # A geom layer is required; without it ggplot draws only an empty panel.
  geom_point()
# Prepare Data
# Drop the text, date, brand and logical flag columns, then convert the
# remaining numeric columns into 0/1 bin-indicator columns.
data_binarized_tbl <- data %>%
  select(
    -c(
      title,
      published_at,
      animals,
      funny,
      danger,
      use_sex,
      patriotic,
      celebrity,
      show_product_quickly,
      year,
      brand,
      superbowl_ads_dot_com_url
    )
  ) %>%
  binarize()

# Inspect the resulting indicator columns.
glimpse(data_binarized_tbl)
## Rows: 219
## Columns: 20
## $ `view_count__-Inf_6577` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ view_count__6577_41828 <dbl> 0, 0, 0, 1, 1, 0, 1, 1, …
## $ view_count__41828_176014.5 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, …
## $ view_count__176014.5_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `like_count__-Inf_2.9174053685313` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__2.9174053685313_4.86753445045558 <dbl> 0, 1, 0, 1, 1, 0, 1, 0, …
## $ like_count__4.86753445045558_6.17478337258445 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ like_count__6.17478337258445_Inf <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ `dislike_count__-Inf_1` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ dislike_count__1_7 <dbl> 0, 0, 0, 1, 0, 0, 1, 1, …
## $ dislike_count__7_24 <dbl> 1, 1, 0, 0, 1, 0, 0, 0, …
## $ dislike_count__24_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `comment_count__-Inf_1` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ comment_count__1_11 <dbl> 0, 1, 0, 1, 0, 0, 1, 0, …
## $ comment_count__11_51.5 <dbl> 1, 0, 0, 0, 1, 0, 0, 1, …
## $ comment_count__51.5_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `category_id__-Inf_17` <dbl> 0, 1, 0, 0, 1, 0, 1, 0, …
## $ category_id__17_23 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__23_24 <dbl> 0, 0, 0, 1, 0, 1, 0, 1, …
## $ category_id__24_Inf <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
# Correlate
# Correlate every binarized feature with the highest like_count bin.
# The target column name embeds a bin edge computed by binarize(), so it must
# match a column of data_binarized_tbl exactly.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  like_count__6.17478337258445_Inf
)

data_corr_tbl
## # A tibble: 20 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.17478337258445_Inf 1
## 2 comment_count 51.5_Inf 0.806
## 3 view_count 176014.5_Inf 0.733
## 4 dislike_count 24_Inf 0.695
## 5 comment_count -Inf_1 -0.372
## 6 dislike_count -Inf_1 -0.352
## 7 dislike_count 1_7 -0.348
## 8 view_count -Inf_6577 -0.335
## 9 like_count -Inf_2.9174053685313 -0.335
## 10 like_count 2.9174053685313_4.86753445045558 -0.335
## 11 like_count 4.86753445045558_6.17478337258445 -0.331
## 12 view_count 6577_41828 -0.311
## 13 comment_count 1_11 -0.303
## 14 comment_count 11_51.5 -0.131
## 15 category_id -Inf_17 0.106
## 16 view_count 41828_176014.5 -0.0870
## 17 category_id 23_24 -0.0736
## 18 category_id 17_23 -0.0298
## 19 dislike_count 7_24 0.0241
## 20 category_id 24_Inf 0.0114
# Plot
# Draw the correlation funnel from the feature/bin correlation table.
plot_correlation_funnel(data_corr_tbl)