Setup Library
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(tidyr)

# Read the raw trending-video records; the path is relative to the working
# directory.
vids <- read.csv("USvideos.csv")

# Data structure ----
str(vids)
## 'data.frame': 13400 obs. of 12 variables:
## $ trending_date : chr "17.14.11" "17.14.11" "17.14.11" "17.14.11" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : chr "2017-11-13T17:13:01.000Z" "2017-11-13T07:30:00.000Z" "2017-11-12T19:05:24.000Z" "2017-11-13T11:00:04.000Z" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# Parse the date columns ----

# trending_date arrives as text in year.day.month order (e.g. "17.14.11"),
# which is why ydm() is the matching parser.
vids[["trending_date"]] <- ydm(vids[["trending_date"]])
str(vids[["trending_date"]])
## Date[1:13400], format: "2017-11-14" "2017-11-14" "2017-11-14" "2017-11-14" "2017-11-14" ...

# publish_time is an ISO 8601 timestamp; parse it and express it in the
# "Asia/Jakarta" time zone.
vids[["publish_time"]] <- ymd_hms(vids[["publish_time"]], tz = "Asia/Jakarta")
## Date in ISO8601 format; converting timezone from UTC to "Asia/Jakarta".
class(vids[["publish_time"]])
## [1] "POSIXct" "POSIXt"

# Check the whole data frame for missing values after parsing.
anyNA(vids)
## [1] FALSE
# Derived publish-time features ----

# Full weekday name, hour of day and calendar date of publication, all taken
# from publish_time.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)
vids$publish_hour <- hour(vids$publish_time)
vids$publish_date <- date(vids$publish_time)

# Days between the publication date and the trending date.
vids$time_to_trend <- as.numeric(vids$trending_date - vids$publish_date)

# Replace the meaningless numeric category_id with the category name ----

# Lookup table keyed by the id as text. Indexing a named vector is vectorised
# and yields NA for an id that is not listed, unlike sapply() over switch(),
# which returns NULL for an unmatched id and can therefore hand back a list
# that breaks the factor conversion.
category_names <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)
category_label <- unname(category_names[as.character(vids$category_id)])
if (anyNA(category_label)) {
  # Surface unmapped ids instead of silently dropping them from later tables.
  warning(
    "Some category_id values have no known category name and were set to NA.",
    call. = FALSE
  )
}
vids$category_id <- factor(category_label)

# One row per video title ----

# Titles repeat across rows (presumably one row per trending day -- confirm);
# keep only the first row of each title.
vids.unik <- vids[!duplicated(vids$title), ]
# Like and dislike counts expressed as a share of views, per unique video.
vids.unik$dislike_ratio <- vids.unik$dislikes / vids.unik$views
vids.unik$like_ratio <- vids.unik$likes / vids.unik$views

# Number of unique trending videos in each category. The rows follow the
# factor-level order of category_id, not the frequency.
trending_category <- data.frame(table(vids.unik$category_id))
head(trending_category, 2)
## Var1 Freq
## 1 Autos and Vehicles 41
## 2 Comedy 273
# Number of unique trending videos per channel.
trending_channel <- data.frame(table(vids.unik$channel_title))
head(trending_channel, 2)
## Var1
## 1 \x8b\xc4\xf8\x8b\xc4_\x8b\xc4_\x8b\xc4_ \x8b\xc4Ћĩ\x8b⦋\xc4_\x8b\xe2_ \x8c\xc9\u008c_\x8f\x8bā\x8bģ\x8b\xc4_\x8bč\x8b\xc4\xc7
## 2 \x93\xf7\x81\x90\xb5_\x91⬓_\x90 Korean Englishman
## Freq
## 1 1
## 2 1

# Keep channels with at least 10 trending videos, most frequent first.
trending_channel <- trending_channel[trending_channel$Freq >= 10, ]
trending_ch_sort <- trending_channel[
  order(trending_channel$Freq, decreasing = TRUE),
]

# Video categories that trend most often on YouTube ----
# trending_category comes straight from table(), so its rows are in
# factor-level order rather than ordered by frequency. Sort by Freq before
# taking ten rows; otherwise the chart shows the first ten category names
# instead of the ten most frequent categories.
top_category <- head(
  trending_category[order(trending_category$Freq, decreasing = TRUE), ],
  10
)

# Horizontal bars, most frequent category on top; the fill runs from black
# (low count) to red (high count) and each bar is labelled with its count.
ggplot(
  data = top_category,
  mapping = aes(x = Freq, y = reorder(Var1, Freq))
) +
  geom_col(mapping = aes(fill = Freq)) +
  scale_fill_gradient(low = "#000000", high = "#ff0000") +
  geom_text(mapping = aes(label = Freq), color = "black", nudge_x = 25) +
  labs(
    x = "Frequency",
    y = "Category",
    title = "Top 10 Category Trending In Youtube"
  )

# The chart shows the number of trending videos per category. The more videos
# a category has, the redder its bar; the fewer, the blacker. According to the
# original write-up, "Entertainment" is the category that trends most often on
# YouTube, with a count of 736.
Rasio like dan dislike per Kategori
# Mean like ratio and mean dislike ratio of the unique videos, per category.
vids.agg <- aggregate(
  x = data.frame(
    like_r = vids.unik$like_ratio,
    dislike_r = vids.unik$dislike_ratio
  ),
  by = list(category = vids.unik$category_id),
  FUN = mean
)

# Long format: one row per (category, ratio type) so both ratios can be drawn
# as dodged bars of the same plot.
vids.agg.long <- pivot_longer(
  data = vids.agg,
  cols = c(like_r, dislike_r),
  names_to = "variable",
  values_to = "proportion"
)

# Dodged horizontal bars per category, labelled with the ratio rounded to
# three decimals; the x axis is limited to the range 0 to 0.08.
ggplot(
  data = vids.agg.long,
  mapping = aes(x = proportion, y = reorder(category, proportion))
) +
  geom_col(mapping = aes(fill = variable), position = "dodge") +
  geom_text(
    mapping = aes(label = round(proportion, 3), group = variable),
    position = position_dodge(width = .9),
    hjust = 0.01
  ) +
  xlim(0, 0.08) +
  labs(
    x = "Like & Dislike Ratio",
    y = "Category",
    title = "Like & Dislike Ratio for Each Category"
  )
Dari grafik yang dihasilkan dapat diasumsikan bahwa Music lebih disukai
daripada Entertainment, walaupun Entertainment lebih sering trending
daripada Music. Sedangkan News and Politics merupakan kategori video
yang paling tidak disukai.
Top Channel Youtube
# Ten channels with the most trending videos. trending_ch_sort is already
# ordered by Freq (descending), so head() yields the top ten. The fill runs
# from black to blue and the count is printed inside each bar.
ggplot(
  data = head(trending_ch_sort, 10),
  mapping = aes(x = Freq, y = reorder(Var1, Freq))
) +
  geom_col(mapping = aes(fill = Freq)) +
  scale_fill_gradient(low = "#000000", high = "#0066ff") +
  geom_text(mapping = aes(label = Freq), color = "green", nudge_x = -4) +
  labs(
    x = "Frequency",
    y = "Channel Title",
    title = "Top 10 Channel In Youtube"
  )

# According to the original write-up, the channel Refinery29 trends most
# often on YouTube, with 31 trending videos.