Business Case

# 1. Mencari jam tertinggi penonton melihat suatu video di youtube

# 2. Melihat rata2 views dalam suatu kategori video

# 3. Melihat perkembangan video dari waktu ke waktu berdasarkan views

Import Data

# Load the raw trending-videos dataset and preview its first rows.
vids <- read.csv(file = "data_input/USvideos.csv")
head(x = vids)
##   trending_date                                                          title
## 1      17.14.11                             WE WANT TO TALK ABOUT OUR MARRIAGE
## 2      17.14.11 The Trump Presidency: Last Week Tonight with John Oliver (HBO)
## 3      17.14.11          Racist Superman | Rudy Mancuso, King Bach & Lele Pons
## 4      17.14.11                               Nickelback Lyrics: Real or Fake?
## 5      17.14.11                                       I Dare You: GOING BALD!?
## 6      17.14.11                                          2 Weeks with iPhone X
##           channel_title category_id             publish_time   views  likes
## 1          CaseyNeistat          22 2017-11-13T17:13:01.000Z  748374  57527
## 2       LastWeekTonight          24 2017-11-13T07:30:00.000Z 2418783  97185
## 3          Rudy Mancuso          23 2017-11-12T19:05:24.000Z 3191434 146033
## 4 Good Mythical Morning          24 2017-11-13T11:00:04.000Z  343168  10172
## 5              nigahiga          24 2017-11-12T18:01:41.000Z 2095731 132235
## 6              iJustine          28 2017-11-13T19:07:23.000Z  119180   9763
##   dislikes comment_count comments_disabled ratings_disabled
## 1     2966         15954             FALSE            FALSE
## 2     6146         12703             FALSE            FALSE
## 3     5339          8181             FALSE            FALSE
## 4      666          2146             FALSE            FALSE
## 5     1989         17518             FALSE            FALSE
## 6      511          1434             FALSE            FALSE
##   video_error_or_removed
## 1                  FALSE
## 2                  FALSE
## 3                  FALSE
## 4                  FALSE
## 5                  FALSE
## 6                  FALSE

Cek struktur data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Compact column-by-column overview (name, type, first values) of the data.
vids %>% glimpse()
## Rows: 13,400
## Columns: 12
## $ trending_date          <chr> "17.14.11", "17.14.11", "17.14.11", "17.14.11",…
## $ title                  <chr> "WE WANT TO TALK ABOUT OUR MARRIAGE", "The Trum…
## $ channel_title          <chr> "CaseyNeistat", "LastWeekTonight", "Rudy Mancus…
## $ category_id            <int> 22, 24, 23, 24, 24, 28, 24, 28, 1, 25, 17, 24, …
## $ publish_time           <chr> "2017-11-13T17:13:01.000Z", "2017-11-13T07:30:0…
## $ views                  <int> 748374, 2418783, 3191434, 343168, 2095731, 1191…
## $ likes                  <int> 57527, 97185, 146033, 10172, 132235, 9763, 1599…
## $ dislikes               <int> 2966, 6146, 5339, 666, 1989, 511, 2445, 778, 11…
## $ comment_count          <int> 15954, 12703, 8181, 2146, 17518, 1434, 1970, 34…
## $ comments_disabled      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ ratings_disabled       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ video_error_or_removed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…

Cek nilai unik dalam kolom category_id

# List the distinct category ids present in the data.
unique(vids[["category_id"]])
##  [1] 22 24 23 28  1 25 17 10 15 27 26  2 19 20 29 43

Mengubah tipe data yang sesuai

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Convert the raw columns to proper types: category ids become a factor,
# publish timestamps are parsed and expressed in New York time, and trending
# dates are parsed from their "yy.dd.mm" text form.
vids <- mutate(
  vids,
  category_id = as.factor(category_id),
  publish_time = ymd_hms(publish_time, tz = "America/New_York"),
  trending_date = as.Date(trending_date, format = "%y.%d.%m")
)
## Date in ISO8601 format; converting timezone from UTC to "America/New_York".
# Derive the weekday (unabbreviated label) and the hour of day at which each
# video was published, both taken from the converted publish_time column.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)

vids$publish_hour <- hour(vids$publish_time)

Mengganti kode kategori pada kolom category_id dengan nama kategorinya

# Map each numeric YouTube category id to its readable label through a named
# lookup vector. Indexing by the id's character form is vectorised and always
# yields a plain character vector; an id without an entry becomes NA, whereas
# sapply() + switch() would return NULL for it and silently turn the column
# into a list.
category_labels <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)
# unname() drops the id names carried over from the lookup vector.
vids$category_id <- unname(category_labels[as.character(vids$category_id)])

Mengubah kembali kolom category_id yang sudah diganti labelnya menjadi factor

# Store the relabelled categories as a factor again.
vids[["category_id"]] <- as.factor(vids[["category_id"]])

Data Preparation untuk plotting business case

# 1. Average views per category and publish hour, used to look for the hours
#    with the highest viewership.
# `.groups = "drop"` returns an ungrouped result, so later verbs are not
# affected by leftover grouping and summarise() no longer prints its
# regrouping message.
vids_hours <- vids %>%
  group_by(category_id, publish_hour) %>%
  summarise(mean_views = mean(views), .groups = "drop")
# 2. Average views for each video category.
vids_views <- summarise(
  group_by(vids, category_id),
  mean_views = mean(views)
)

Membuat visualisasi business case

2. Melihat rata2 views dalam suatu kategori video

library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Horizontal bar chart of mean views per category, with categories ordered by
# their mean views and the x axis labelled with comma-separated numbers.
# (The accidental duplicated assignment has been removed.)
bar_mean_views <- ggplot(
  data = vids_views,
  mapping = aes(x = mean_views, y = reorder(category_id, mean_views))
) +
  geom_col() +
  scale_x_continuous(labels = scales::comma)

# Render the static chart as an interactive plotly widget.
ggplotly(bar_mean_views)

#1. Mencari jam tertinggi penonton melihat suatu video di youtube

# Based on the mean-views chart above, the peak-hour visualisation is limited
# to the three categories with the highest average views.
top3_categories <- c("Music", "Entertainment", "Film and Animation")
vids_top3 <- vids[vids$category_id %in% top3_categories, ]

# 3. Average views per publish weekday for the top-3 categories (an extra
#    business case added after looking at the chart above).
vids_trend <- aggregate(views ~ publish_day, vids_top3, mean)


# Aggregate: mean views per category and publish hour for the top-3 categories.
# The formula is passed positionally because the formula method's first
# argument was only renamed from `formula` to `x` in R 4.2.0; naming it `x =`
# fails on older R versions.
vids_trending <- aggregate(views ~ category_id + publish_hour,
                           data = vids_top3,
                           FUN = mean)

# Build the plot: one line (plus points) per category showing mean views by
# publish hour.
# Only one x scale is declared. A second scale_x_continuous() replaces the
# first, so an extra comma-label scale would be dead code and would only
# trigger ggplot2's "Scale for x is already present" message.
# The subtitle lists the categories actually kept in vids_top3.
plot_trending <- ggplot(data = vids_trending,
                        mapping = aes(x = publish_hour, y = views)) +
  geom_line(aes(color = category_id)) +
  geom_point(aes(color = category_id)) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  labs(title = "Trend dari rata-rata viewers",
       subtitle = "based on Music, Entertainment, Film and Animation",
       x = "publish time",
       y = "rata-rata viewers",
       color = "category") +
  theme_minimal()

# Render the static chart as an interactive plotly widget.
ggplotly(plot_trending)

3. Melihat perkembangan video dari waktu ke waktu berdasarkan views

# Line chart of mean views by publish weekday; the constant group joins the
# discrete weekday values into a single line.
ggplot(data = vids_trend, mapping = aes(x = publish_day, y = views)) +
  geom_line(group = 1)

Kesimpulan

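Video trending dengan rata-rata views tertinggi umumnya dipublikasikan pada hari Rabu, sekitar pukul 15:00 dan pukul 21:00–24:00, sehingga pada waktu-waktu itulah penonton paling banyak melihat video YouTube. Kategori dengan rata-rata views terbanyak adalah Music, Entertainment, dan Film and Animation. Catatan: hari dan jam di sini berasal dari waktu publikasi video (publish_time), bukan dari waktu menonton yang sebenarnya.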