# 1. Mencari jam tertinggi penonton melihat suatu video di youtube
# 2. Melihat rata2 views dalam suatu kategori video
# 3. Melihat perkembangan video dari waktu ke waktu berdasarkan views
# Read the US videos CSV into `vids` and preview its first six rows.
vids <- read.csv(file = "data_input/USvideos.csv")
head(vids, n = 6L)
## trending_date title
## 1 17.14.11 WE WANT TO TALK ABOUT OUR MARRIAGE
## 2 17.14.11 The Trump Presidency: Last Week Tonight with John Oliver (HBO)
## 3 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Lele Pons
## 4 17.14.11 Nickelback Lyrics: Real or Fake?
## 5 17.14.11 I Dare You: GOING BALD!?
## 6 17.14.11 2 Weeks with iPhone X
## channel_title category_id publish_time views likes
## 1 CaseyNeistat 22 2017-11-13T17:13:01.000Z 748374 57527
## 2 LastWeekTonight 24 2017-11-13T07:30:00.000Z 2418783 97185
## 3 Rudy Mancuso 23 2017-11-12T19:05:24.000Z 3191434 146033
## 4 Good Mythical Morning 24 2017-11-13T11:00:04.000Z 343168 10172
## 5 nigahiga 24 2017-11-12T18:01:41.000Z 2095731 132235
## 6 iJustine 28 2017-11-13T19:07:23.000Z 119180 9763
## dislikes comment_count comments_disabled ratings_disabled
## 1 2966 15954 FALSE FALSE
## 2 6146 12703 FALSE FALSE
## 3 5339 8181 FALSE FALSE
## 4 666 2146 FALSE FALSE
## 5 1989 17518 FALSE FALSE
## 6 511 1434 FALSE FALSE
## video_error_or_removed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Show every column with its type and its leading values.
vids %>% glimpse()
## Rows: 13,400
## Columns: 12
## $ trending_date <chr> "17.14.11", "17.14.11", "17.14.11", "17.14.11",…
## $ title <chr> "WE WANT TO TALK ABOUT OUR MARRIAGE", "The Trum…
## $ channel_title <chr> "CaseyNeistat", "LastWeekTonight", "Rudy Mancus…
## $ category_id <int> 22, 24, 23, 24, 24, 28, 24, 28, 1, 25, 17, 24, …
## $ publish_time <chr> "2017-11-13T17:13:01.000Z", "2017-11-13T07:30:0…
## $ views <int> 748374, 2418783, 3191434, 343168, 2095731, 1191…
## $ likes <int> 57527, 97185, 146033, 10172, 132235, 9763, 1599…
## $ dislikes <int> 2966, 6146, 5339, 666, 1989, 511, 2445, 778, 11…
## $ comment_count <int> 15954, 12703, 8181, 2146, 17518, 1434, 1970, 34…
## $ comments_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ ratings_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ video_error_or_removed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
# List the distinct category codes present in the data.
unique(vids[["category_id"]])
## [1] 22 24 23 28 1 25 17 10 15 27 26 2 19 20 29 43
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Convert the raw columns to proper types:
#   - trending_date is stored as "yy.dd.mm" (e.g. "17.14.11"), hence "%y.%d.%m";
#   - publish_time is parsed as a date-time and expressed in US Eastern time;
#   - category_id becomes a factor.
vids$trending_date <- as.Date(vids$trending_date, format = "%y.%d.%m")
vids$publish_time <- ymd_hms(vids$publish_time, tz = "America/New_York")
vids$category_id <- as.factor(vids$category_id)
## Date in ISO8601 format; converting timezone from UTC to "America/New_York".
# Derive the weekday (full, unabbreviated label) and the hour of day at which
# each video was published, both taken from `publish_time`.
# `TRUE`/`FALSE` are spelled out because `T`/`F` are ordinary variables that
# can be reassigned.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)
vids$publish_hour <- hour(vids$publish_time)
# Translate the numeric category codes into readable category names.
# A named character vector is used as a vectorized lookup table: indexing it
# with the code (as a string) returns the matching name, and a code that is not
# listed becomes NA instead of silently changing the result type.
category_names <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)
# `unname()` drops the code names carried over from the lookup so the column
# holds plain labels; the result is stored as a factor.
vids$category_id <- as.factor(
  unname(category_names[as.character(vids$category_id)])
)
# 1. Find the hours at which videos attract the most views:
#    mean views for every combination of category and publish hour.
#    `.groups = "drop"` returns an ungrouped result, so later verbs are not
#    silently applied per category and no grouping message is emitted.
vids_hours <- vids %>%
  group_by(category_id, publish_hour) %>%
  summarise(mean_views = mean(views), .groups = "drop")
# 2. Average number of views within each video category.
vids_views <- summarise(
  group_by(vids, category_id),
  mean_views = mean(views)
)
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Horizontal bar chart of mean views per category, with the categories ordered
# by their mean views and the value axis labelled with thousands separators.
# The chart is then rendered as an interactive plotly widget.
bar_mean_views <- ggplot(
  data = vids_views,
  mapping = aes(y = reorder(category_id, mean_views), x = mean_views)
) +
  geom_col() +
  scale_x_continuous(labels = scales::comma)
ggplotly(bar_mean_views)
# 1. Find the peak viewing hours.
# Based on the mean views in the previous chart, only the three top
# categories are kept for the hour-of-day visualisation.
vids_top3 <- subset(
  vids,
  category_id %in% c("Music", "Entertainment", "Film and Animation")
)
# 3. How views develop over time (extra business case added after looking at
#    the chart above): mean views of the top-3 categories per publish weekday.
vids_trend <- aggregate(views ~ publish_day, vids_top3, mean)
# Build the aggregation: mean views per category and publish hour.
# The formula is passed positionally because the name of that argument differs
# between R versions (`formula` before 4.2.0, `x` from 4.2.0 on); naming it
# makes the call fail on the other side of that change.
vids_trending <- aggregate(
  views ~ category_id + publish_hour,
  data = vids_top3,
  FUN = mean
)
# Build the plot: mean views by publish hour, one coloured line (with points)
# per category, then render it as an interactive plotly widget.
plot_trending <- ggplot(
  data = vids_trending,
  mapping = aes(x = publish_hour, y = views)
) +
  geom_line(aes(color = category_id)) +
  geom_point(aes(color = category_id)) +
  # Thousands separators go on the y axis (views). Declaring them on the x
  # axis would be discarded by the x scale below, since a plot keeps only one
  # scale per aesthetic.
  scale_y_continuous(labels = scales::comma) +
  # Tick every three hours across the day.
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  # The subtitle lists the same three categories the data was filtered to.
  labs(
    title = "Trend dari rata-rata viewers",
    subtitle = "based on Entertainment, Music, Film and Animation",
    x = "publish time",
    y = "rata-rata viewers",
    color = "category"
  ) +
  theme_minimal()
ggplotly(plot_trending)
# Line chart of mean views per publish weekday.
# `group = 1` puts all weekdays in one group so the points on the discrete
# x axis are joined into a single line.
ggplot(data = vids_trend, mapping = aes(x = publish_day, y = views)) +
  geom_line(group = 1)
# Kesimpulan: orang-orang paling banyak menonton video YouTube, hingga video tersebut menjadi trending, pada hari Rabu pukul 15:00 dan pukul 21:00-24:00; kategori dengan rata-rata views terbanyak adalah Music, Entertainment, dan Film & Animation.