1 Intro

Setup Library

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(tidyr)

2 Data Exploration

2.1 Data Input & Structure

# Load the trending-videos CSV from the working directory into `vids`.
vids <- read.csv(file = "USvideos.csv")

Struktur Data

# Print a compact overview of `vids`: dimensions, column types, first values.
str(object = vids)
## 'data.frame':    13400 obs. of  12 variables:
##  $ trending_date         : chr  "17.14.11" "17.14.11" "17.14.11" "17.14.11" ...
##  $ title                 : chr  "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
##  $ channel_title         : chr  "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
##  $ category_id           : int  22 24 23 24 24 28 24 28 1 25 ...
##  $ publish_time          : chr  "2017-11-13T17:13:01.000Z" "2017-11-13T07:30:00.000Z" "2017-11-12T19:05:24.000Z" "2017-11-13T11:00:04.000Z" ...
##  $ views                 : int  748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
##  $ likes                 : int  57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
##  $ dislikes              : int  2966 6146 5339 666 1989 511 2445 778 119 1363 ...
##  $ comment_count         : int  15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
##  $ comments_disabled     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ratings_disabled      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ video_error_or_removed: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

2.3 Mengubah timezone (zona waktu)

# Parse the publish timestamps into date-times expressed in the
# "Asia/Jakarta" time zone, replacing the original character column.
vids[["publish_time"]] <- ymd_hms(vids[["publish_time"]], tz = "Asia/Jakarta")
## Date in ISO8601 format; converting timezone from UTC to "Asia/Jakarta".
# Confirm the class of the converted publish_time column.
class(vids[["publish_time"]])
## [1] "POSIXct" "POSIXt"

2.4 Cek Missing Value

# TRUE if at least one cell anywhere in `vids` is missing.
any(is.na(vids))
## [1] FALSE

2.5 Membuat kolom baru berisi hari publish video dan jam publish video

# Derive two helper columns from the publish timestamp:
#   publish_day  - weekday as a full (unabbreviated) label
#   publish_hour - hour component of the timestamp
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change these arguments.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)
vids$publish_hour <- hour(vids$publish_time)

2.7 Mengubah kolom category_id dari angka yang tidak bermakna menjadi nama kategori

# Replace the numeric category ids with readable category names.
#
# A named lookup vector is used instead of sapply(..., FUN = switch):
# indexing is vectorised and always returns a character vector. An id that
# is missing from the table becomes NA, whereas switch() would return NULL
# for it and make sapply() fall back to a list, breaking the factor
# conversion below.
category_names <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)

# Look each id up by its character form; unname() drops the id labels that
# the lookup would otherwise attach to every element.
vids$category_id <- unname(category_names[as.character(vids$category_id)])

# Store the category as a factor (levels are the sorted distinct names).
vids$category_id <- as.factor(vids$category_id)

3 Visualisasi Data

Kategori Video yang sering Trending Youtube

# Horizontal bar chart of the first ten rows of `trending_category`
# (created outside this excerpt; presumably a frequency table already sorted
# in descending order - TODO confirm). Bars are ordered by Freq, filled on a
# black-to-red gradient by Freq, and labelled with the Freq value.
ggplot(
  head(trending_category, 10),
  aes(x = Freq, y = reorder(Var1, Freq))
) +
  geom_col(aes(fill = Freq)) +
  geom_text(aes(label = Freq), color = "black", nudge_x = 25) +
  scale_fill_gradient(low = "#000000", high = "#ff0000") +
  labs(
    title = 'Top 10 Category Trending In Youtube',
    x = 'Frequency',
    y = 'Category'
  )

Grafik di atas menggambarkan jumlah video trending berdasarkan kategorinya. Semakin sering video dalam suatu kategori muncul, semakin merah warna batangnya; semakin jarang, semakin hitam. Ternyata video berkategori “Entertainment” adalah yang paling sering trending di YouTube, dengan jumlah kemunculan mencapai 736.

Rasio like dan dislike per Kategori

# Mean like ratio and mean dislike ratio per category, computed from
# `vids.unik` (built outside this excerpt). The result has the columns
# category, like_r and dislike_r.
vids.agg <- aggregate(
  x = data.frame(
    like_r = vids.unik$like_ratio,
    dislike_r = vids.unik$dislike_ratio
  ),
  by = list(category = vids.unik$category_id),
  FUN = mean
)

# Reshape to long format: one row per (category, ratio type), with the ratio
# type in `variable` and its value in `proportion`.
vids.agg.long <- pivot_longer(
  vids.agg,
  cols = c(like_r, dislike_r),
  names_to = "variable",
  values_to = "proportion"
)

# Dodged horizontal bars comparing the like and dislike ratio of every
# category. Each bar is labelled with its value rounded to three decimals,
# and the x axis is fixed to the range 0 to 0.08.
ggplot(
  vids.agg.long,
  aes(x = proportion, y = reorder(category, proportion))
) +
  geom_col(aes(fill = variable), position = "dodge") +
  geom_text(
    aes(label = round(proportion, digits = 3), group = variable),
    position = position_dodge(width = 0.9),
    hjust = 0.01
  ) +
  xlim(0, 0.08) +
  labs(
    title = 'Like & Dislike Ratio for Each Category',
    x = 'Like & Dislike Ratio',
    y = 'Category'
  )

Dari grafik yang dihasilkan dapat diasumsikan bahwa Music lebih disukai daripada Entertainment, walaupun Entertainment lebih sering trending daripada Music. Sedangkan News and Politics merupakan kategori video yang paling tidak disukai.

Top Channel Youtube

# Horizontal bar chart of the first ten rows of `trending_ch_sort`
# (created outside this excerpt; presumably channel frequencies sorted in
# descending order - TODO confirm). Bars are ordered by Freq, filled on a
# black-to-blue gradient, and labelled in green with the Freq value.
ggplot(
  head(trending_ch_sort, 10),
  aes(x = Freq, y = reorder(Var1, Freq))
) +
  geom_col(aes(fill = Freq)) +
  geom_text(aes(label = Freq), color = "green", nudge_x = -4) +
  scale_fill_gradient(low = "#000000", high = "#0066ff") +
  labs(
    title = 'Top 10 Channel In Youtube',
    x = 'Frequency',
    y = 'Channel Title'
  )

Channel Refinery29 menjadi channel yang paling sering trending di youtube dengan jumlah trending sebanyak 31 kali.