Library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(glue)
library(plotly) 
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggpubr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

##Load Data

vids <- read.csv("data_input/youtubetrends_2023.csv", stringsAsFactors = TRUE )
head(vids)
##   trending_date
## 1    2023-01-01
## 2    2023-01-01
## 3    2023-01-01
## 4    2023-01-01
## 5    2023-01-01
## 6    2023-01-01
##                                                                              title
## 1      Peach Bowl: Ohio State Buckeyes vs. Georgia Bulldogs | Full Game Highlights
## 2                                                If You Build It, I'll Pay For It!
## 3 Fiesta Bowl: TCU Horned Frogs vs. Michigan Wolverines | College Football Playoff
## 4                                            Dude Perfect vs. Luka Doncic (1-on-1)
## 5                         Times Square 2023 Ball Drop in New York City: full video
## 6          New Year's 2023: Dubai puts on thrilling fireworks show at Burj Khalifa
##           channel_title       category_id        publish_time   views  likes
## 1 ESPN College Football            Sports 2023-01-01 05:12:55  618494   7884
## 2        MrBeast Gaming            Gaming 2022-12-31 20:00:04 3621512 238494
## 3 ESPN College Football            Sports 2023-01-01 01:11:58  500700   4915
## 4          Dude Perfect            Sports 2022-12-31 14:59:56 2811640 148481
## 5          News 19 WLTX News and Politics 2023-01-01 05:21:44  358177   2963
## 6           Global News News and Politics 2022-12-31 20:51:09 2088464  18234
##   dislikes comment_count comments_disabled ratings_disabled
## 1        0          2413             FALSE            FALSE
## 2        0         12935             FALSE            FALSE
## 3        0          2355             FALSE            FALSE
## 4        0          4543             FALSE            FALSE
## 5        0           654             FALSE            FALSE
## 6        0          1508             FALSE            FALSE
##   video_error_or_removed publish_hour publish_when publish_wday timetotrend
## 1                  FALSE            5  12am to 7am       Sunday           1
## 2                  FALSE           20  4pm to 12am     Saturday           2
## 3                  FALSE            1  12am to 7am       Sunday           1
## 4                  FALSE           14   8am to 3pm     Saturday           2
## 5                  FALSE            5  12am to 7am       Sunday           1
## 6                  FALSE           20  4pm to 12am     Saturday           2

Deskripsi kolom: - trending_date : tanggal trending (format: YY.MM.DD) - title : judul video - channel_title : nama channel Youtube - category_id : kategori video - publish_time : tanggal upload video (format: YYYY-MM-DD-HH-MM-SS) - views : jumlah views dalam video tersebut - likes : jumlah likes dalam video tersebut - dislikes : jumlah dislikes dalam video tersebut - comment_count : jumlah komentar - comment_disabled: apakah kolom komentar tidak diaktifkan - rating_disabled : apakah rating video tidak diaktifkan - video_error_or_removed: apakah video dihapus - publish_hour : jam video tersebut dipublish - publish_when : range video tersebut dipublish - publish_wday : hari video tersebut dipublish - timetotrend : rentang waktu video tersebut dari dipublish sampai menjadi trending (harian)

str(vids)
## 'data.frame':    72389 obs. of  16 variables:
##  $ trending_date         : Factor w/ 362 levels "2023-01-01","2023-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ title                 : Factor w/ 12595 levels "'1위/4K' 지민 (Jimin) - Like Crazy #엠카운트다운 EP.790 | Mnet 230330 방송",..: 8255 5659 3251 2843 11085 7742 4050 2291 9601 6102 ...
##  $ channel_title         : Factor w/ 3548 levels " 3 Southern Cats and Momma™ ",..: 957 2116 957 871 2215 1199 255 1082 3291 3459 ...
##  $ category_id           : Factor w/ 15 levels "Autos and Vehicles",..: 14 6 14 14 9 9 4 9 11 14 ...
##  $ publish_time          : Factor w/ 12059 levels "2022-12-18 19:20:24",..: 218 204 213 197 219 206 212 194 183 191 ...
##  $ views                 : int  618494 3621512 500700 2811640 358177 2088464 1915097 663532 1454615 2706246 ...
##  $ likes                 : int  7884 238494 4915 148481 2963 18234 24181 5130 32079 69895 ...
##  $ dislikes              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ comment_count         : int  2413 12935 2355 4543 654 1508 2711 776 3410 4231 ...
##  $ comments_disabled     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ratings_disabled      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ video_error_or_removed: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ publish_hour          : int  5 20 1 14 5 20 0 13 20 4 ...
##  $ publish_when          : Factor w/ 3 levels "12am to 7am",..: 1 2 1 3 1 2 1 3 2 1 ...
##  $ publish_wday          : Factor w/ 7 levels "Friday","Monday",..: 4 3 4 3 4 3 4 3 1 3 ...
##  $ timetotrend           : int  1 2 1 2 1 2 1 2 3 2 ...
unique(vids$category_id)
##  [1] Sports                 Gaming                 News and Politics     
##  [4] Entertainment          People and Blogs       Comedy                
##  [7] Music                  Education              Film and Animation    
## [10] Science and Technology Howto and Style        Travel and Events     
## [13] Autos and Vehicles     Pets and Animals       Nonprofit and Activism
## 15 Levels: Autos and Vehicles Comedy Education ... Travel and Events

##Membuat analisa terkait karakteristik video dari kategori ‘Travel and Events’ yang memiliki video trending tidak lebih dari 1 minggu setelah rilis (df vids_Travel)

vids_Travel <-
vids %>% 
  filter(category_id == "Travel and Events", timetotrend <= 7)
head(vids_Travel)
##   trending_date                                                   title
## 1    2023-01-01 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 2    2023-01-02 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 3    2023-01-03 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 4    2023-01-08             Day in the Life of a Japanese Game Designer
## 5    2023-01-09                    They Said It Would Be Worth the Risk
## 6    2023-01-09             Day in the Life of a Japanese Game Designer
##       channel_title       category_id        publish_time  views likes dislikes
## 1 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 561164 20361        0
## 2 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 577256 20816        0
## 3 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 590802 21162        0
## 4   Paolo fromTOKYO Travel and Events 2023-01-06 23:59:08 389259 23481        0
## 5       Eva zu Beck Travel and Events 2023-01-07 19:54:37 246397 14160        0
## 6   Paolo fromTOKYO Travel and Events 2023-01-06 23:59:08 506513 28309        0
##   comment_count comments_disabled ratings_disabled video_error_or_removed
## 1          1570             FALSE            FALSE                  FALSE
## 2          1603             FALSE            FALSE                  FALSE
## 3          1621             FALSE            FALSE                  FALSE
## 4          1402             FALSE            FALSE                  FALSE
## 5           993             FALSE            FALSE                  FALSE
## 6          1590             FALSE            FALSE                  FALSE
##   publish_hour publish_when publish_wday timetotrend
## 1           15   8am to 3pm    Wednesday           5
## 2           15   8am to 3pm    Wednesday           6
## 3           15   8am to 3pm    Wednesday           7
## 4           23  4pm to 12am       Friday           3
## 5           19  4pm to 12am     Saturday           3
## 6           23  4pm to 12am       Friday           4

###Mencari Hubungan Comment Ratio dan Like Ratio dari df vids_Travel Hasilnya hubungan Comment Ratio dan Like Ratio dari df vids_Travel berbanding lurus

vids_Travel_clean <- vids_Travel %>%  
  # deselect kolom yang tidak dibutuhkan
  select(-comments_disabled, -ratings_disabled, -video_error_or_removed) %>% 
  
  # buat kolom baru
  mutate(likes_ratio = likes / views,
         comment_ratio = comment_count / views)
  
head(vids_Travel_clean)
##   trending_date                                                   title
## 1    2023-01-01 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 2    2023-01-02 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 3    2023-01-03 We Raced To Visit The Most US States In 100 Hrs - Day 3
## 4    2023-01-08             Day in the Life of a Japanese Game Designer
## 5    2023-01-09                    They Said It Would Be Worth the Risk
## 6    2023-01-09             Day in the Life of a Japanese Game Designer
##       channel_title       category_id        publish_time  views likes dislikes
## 1 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 561164 20361        0
## 2 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 577256 20816        0
## 3 Jet Lag: The Game Travel and Events 2022-12-28 15:31:33 590802 21162        0
## 4   Paolo fromTOKYO Travel and Events 2023-01-06 23:59:08 389259 23481        0
## 5       Eva zu Beck Travel and Events 2023-01-07 19:54:37 246397 14160        0
## 6   Paolo fromTOKYO Travel and Events 2023-01-06 23:59:08 506513 28309        0
##   comment_count publish_hour publish_when publish_wday timetotrend likes_ratio
## 1          1570           15   8am to 3pm    Wednesday           5  0.03628351
## 2          1603           15   8am to 3pm    Wednesday           6  0.03606026
## 3          1621           15   8am to 3pm    Wednesday           7  0.03581911
## 4          1402           23  4pm to 12am       Friday           3  0.06032230
## 5           993           19  4pm to 12am     Saturday           3  0.05746823
## 6          1590           23  4pm to 12am       Friday           4  0.05588998
##   comment_ratio
## 1   0.002797756
## 2   0.002776931
## 3   0.002743728
## 4   0.003601715
## 5   0.004030082
## 6   0.003139110
ggplot(data = vids_Travel_clean, aes(x = likes_ratio, y = comment_ratio, color = factor(category_id))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Relationship Between Like Ratio & Comment Ratio by Category",
       x = "Like Ratio",
       y = "Comment Ratio",
       color = "Category ID") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###Mencari top 10 dari Video Travel dan Event (df vids_Travel) yang memiliki rata-rata likes terbanyak

vids_Travel_Like <- 
  vids_Travel_clean %>%
  group_by(title) %>%
  summarise(mean_likes = mean(likes, na.rm = TRUE)) %>%
  arrange(desc(mean_likes))
head(vids_Travel_Like)
## # A tibble: 6 × 2
##   title                                                 mean_likes
##   <fct>                                                      <dbl>
## 1 I Was Trafficked Through The World's Deadliest Jungle    103214.
## 2 I Crossed The World's Deadliest Jungle: Darien Gap        85467 
## 3 We're Having a Baby!                                      68800 
## 4 Our Unexpected Move to Toronto                            45774 
## 5 $10,000 Basement Renovation (+ Bec's health update)       42671.
## 6 Not The Video We Wanted to Make                           41895.
#Mencari top 10 video Travel & Event (berdasarkan rata-rata like)

Vids_Travel_Like_top10 <- 
  vids_Travel_Like %>%
  arrange(desc(mean_likes)) %>%
  head(10)
Vids_Travel_Like_top10
## # A tibble: 10 × 2
##    title                                                 mean_likes
##    <fct>                                                      <dbl>
##  1 I Was Trafficked Through The World's Deadliest Jungle    103214.
##  2 I Crossed The World's Deadliest Jungle: Darien Gap        85467 
##  3 We're Having a Baby!                                      68800 
##  4 Our Unexpected Move to Toronto                            45774 
##  5 $10,000 Basement Renovation (+ Bec's health update)       42671.
##  6 Not The Video We Wanted to Make                           41895.
##  7 Raw, unfiltered BOAT LIFE in Thailand! (Extended Cut)     39122.
##  8 Day in the Life of a Japanese Bread Baker                 38268.
##  9 Why Sudan is on the Verge of Civil War                    37192.
## 10 Gender Reveal Fail!                                       36946.
library(ggplot2)
library(dplyr)

ggplot(Vids_Travel_Like_top10, aes(x = reorder(title, mean_likes), y = mean_likes)) +
  geom_bar(stat = "identity", fill = "pink", color = "black") +
  labs(title = "Average Likes per Video",
       x = "Video Title",
       y = "Average Likes") +
  coord_flip()

  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

###Mencari Top 10 Channel yang memiliki jumlah video dengan trending

vids_Travel_count <- vids_Travel %>% 
  group_by(channel_title) %>% 
  summarise(jumlah_video = n() ) %>% 
  arrange(-jumlah_video) %>% 
  slice_head(n = 10) %>%
  ungroup()

vids_Travel_count
## # A tibble: 10 × 2
##    channel_title        jumlah_video
##    <fct>                       <int>
##  1 Eamon & Bec                   138
##  2 Sailing La Vagabonde           76
##  3 Trek Trendy                    55
##  4 Kara and Nate                  51
##  5 Jet Lag: The Game              45
##  6 The Ninja Fam!                 18
##  7 Gone with the Wynns            17
##  8 Paolo fromTOKYO                15
##  9 Dunkin'                        14
## 10 Eva zu Beck                    11
ggplot(vids_Travel_count, aes(x = jumlah_video ,
                                y = reorder(channel_title, jumlah_video) )) + 
  # bar plot
  geom_col(aes(fill = jumlah_video)) +
  
  # additional
  scale_fill_gradient2(low="gray", mid= "pink", high="magenta", midpoint = median(vids_Travel_count$jumlah_video)) +
  labs(title = "Top 10 Trending Travel and Event Channel of YouTube US 2023",
       x = "Video Count",
       y = NULL) +
  scale_x_continuous(labels = comma) +
  theme_minimal() +
  theme(legend.position = "none")

###Mencari hari published video dengan rata-rata viewers terbanyak dari Top Trending Channel Travel & Event

# data
  vids_Travel %>% 
  
  # Mengambil channel_title dengan Trending tertinggi (Eamon&Bec)
  filter(channel_title == "Eamon & Bec") %>% 
  
  # Menghitung rata-rata views tiap jam publish
  group_by(publish_wday) %>% 
  summarise(mean_viewers = mean(views)) %>% 
  ungroup() %>% 
  
  # Membuat tooltip
  mutate(label = glue("Publish Day: {publish_wday}
                      Average Views: {comma(mean_viewers)}"))
## # A tibble: 2 × 3
##   publish_wday mean_viewers label                                      
##   <fct>               <dbl> <glue>                                     
## 1 Saturday          396141. Publish Day: Saturday
## Average Views: 396,141
## 2 Sunday            537834. Publish Day: Sunday
## Average Views: 537,834
# data
  vids_Travel %>% 
  
  # Mengambil channel_title dengan Trending tertinggi (Eamon&Bec)
  filter(channel_title == "Eamon & Bec") %>% 
  
  # Menghitung rata-rata views tiap jam publish
  group_by(publish_wday) %>% 
  slice_max(order_by = views, n = 1) %>% 
  ungroup() %>% 
  
  # Membuat tooltip
  mutate(label = glue("Publish Day: {publish_wday}
                       Video: {title}
                       Views: {comma(views)}"))
## # A tibble: 2 × 17
##   trending_date title        channel_title category_id publish_time  views likes
##   <fct>         <fct>        <fct>         <fct>       <fct>         <int> <int>
## 1 2023-02-10    Tiny House … Eamon & Bec   Travel and… 2023-02-04 … 4.75e5 18495
## 2 2023-05-06    Not The Vid… Eamon & Bec   Travel and… 2023-04-30 … 1.11e6 47541
## # ℹ 10 more variables: dislikes <int>, comment_count <int>,
## #   comments_disabled <lgl>, ratings_disabled <lgl>,
## #   video_error_or_removed <lgl>, publish_hour <int>, publish_when <fct>,
## #   publish_wday <fct>, timetotrend <int>, label <glue>

###Mencari Title dengan viewers tertinggi yang dipublished Eamon & Bec di hari Minggu

channel_name_Travel <- "Eamon & Bec"

vids_top_per_wday <- vids_Travel %>% 
  filter(channel_title == channel_name_Travel, publish_wday == "Sunday") %>%  
  group_by(title) %>%
  slice_max(order_by = views, n = 1) %>%  
  ungroup() %>% 
  arrange(desc(views)) %>%  
  slice_head(n = 5) %>%  
  mutate(label = glue("{title}\n{comma(views)} views"))

ggplot(vids_top_per_wday, aes(x = views, y = reorder(title, views))) +  
  geom_segment(aes(x = 0, xend = views, y = title, yend = title), color = "gray", size = 1) +  
  geom_point(color = "pink", size = 6) +  
  geom_text(aes(label = comma(views)), hjust = -0.5, size = 4) +
  scale_x_continuous(limits = c(0,1500000), labels = comma) +
  labs(title = glue("Top 5 Most Viewed Videos Published on Sunday ({channel_name_Travel})"),  
       x = "Views",
       y = "Video Title") +  
  theme_minimal() +
  theme(plot.title = element_text(size = 10.5, face = "bold"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.