Data USvideos adalah data yang berisi tentang video
trending di Youtube pada tahun 2017. Kolom-kolom yang ada di dalam data
ini adalah:
- trending_date: tanggal trending
- title: judul video
- channel_title: nama channel Youtube
- category_id: kategori video
- publish_time: tanggal upload video
- views: jumlah views
- likes: jumlah likes
- dislikes: jumlah dislikes
- comment_count: jumlah komentar
- comments_disabled: apakah kolom komentar tidak diaktifkan
- ratings_disabled: apakah rating video tidak diaktifkan
- video_error_or_removed: apakah video dihapus

Install Package
# Session setup: a large scipen penalty makes R print big numbers (such as
# view counts) in fixed notation instead of scientific notation.
options(scipen = 999)

# Packages: lubridate for date-time parsing, ggplot2 for plotting, and tidyr
# (which also re-exports the `%>%` pipe).
library(lubridate)
library(ggplot2)
library(tidyr)

# Load the trending-videos dataset; the path is relative to the working
# directory.
vids <- read.csv("USvideos.csv")
head(vids)## trending_date title
## 1 17.14.11 WE WANT TO TALK ABOUT OUR MARRIAGE
## 2 17.14.11 The Trump Presidency: Last Week Tonight with John Oliver (HBO)
## 3 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Lele Pons
## 4 17.14.11 Nickelback Lyrics: Real or Fake?
## 5 17.14.11 I Dare You: GOING BALD!?
## 6 17.14.11 2 Weeks with iPhone X
## channel_title category_id publish_time views likes
## 1 CaseyNeistat 22 2017-11-13T17:13:01.000Z 748374 57527
## 2 LastWeekTonight 24 2017-11-13T07:30:00.000Z 2418783 97185
## 3 Rudy Mancuso 23 2017-11-12T19:05:24.000Z 3191434 146033
## 4 Good Mythical Morning 24 2017-11-13T11:00:04.000Z 343168 10172
## 5 nigahiga 24 2017-11-12T18:01:41.000Z 2095731 132235
## 6 iJustine 28 2017-11-13T19:07:23.000Z 119180 9763
## dislikes comment_count comments_disabled ratings_disabled
## 1 2966 15954 FALSE FALSE
## 2 6146 12703 FALSE FALSE
## 3 5339 8181 FALSE FALSE
## 4 666 2146 FALSE FALSE
## 5 1989 17518 FALSE FALSE
## 6 511 1434 FALSE FALSE
## video_error_or_removed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
str(vids)## 'data.frame': 13400 obs. of 12 variables:
## $ trending_date : chr "17.14.11" "17.14.11" "17.14.11" "17.14.11" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : chr "2017-11-13T17:13:01.000Z" "2017-11-13T07:30:00.000Z" "2017-11-12T19:05:24.000Z" "2017-11-13T11:00:04.000Z" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
Mengubah kolom trending_date dan
publish_time menjadi data datetime.
vids$trending_date <- ydm(vids$trending_date)
vids$publish_time <-ymd_hms(vids$publish_time, tz = "Asia/Jakarta")

Mengubah kolom category_id untuk tiap row dengan
switch() dengan bantuan sapply() dan mengubah
hasilnya menjadi data factor.
# Recode the numeric category ids into readable category names.
# vapply() is the type-stable counterpart of sapply(): the result is always a
# character vector, even for empty input. Each id is passed to switch() as a
# string; the final unnamed NA_character_ is the switch() default, so an id
# that is missing from the table becomes NA instead of NULL (which would make
# the result a list).
vids$category_id <- vapply(
  X = as.character(vids$category_id),
  FUN = switch,
  FUN.VALUE = character(1),
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows",
  NA_character_
)
vids$category_id <- as.factor(vids$category_id)

Membuat kolom publish_day, publish_date dan
publish_hour masing-masing berisi hari, tanggal, dan jam
publish video.
# Derive the weekday name (unabbreviated) and the calendar date of each
# video's publish timestamp. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)
vids$publish_date <- date(vids$publish_time)
vids$publish_hour <- hour(vids$publish_time)

Membuat kolom ‘time_to_trend’ berisi waktu (dalam hari) yang dibutuhkan untuk trending
vids$time_to_trend <- as.numeric(vids$trending_date - date(vids$publish_time))

Membuat data baru yang hanya berisi judul-judul video yang trending untuk mempermudah proses analisis data
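vids_title <- vids[match(x = unique(vids$title), table = vids$title),]

Ada 3 hal yang akan kita analisis dari data USvideos, yaitu:
- channel dengan jumlah video trending terbanyak
- kategori video yang paling sering ditonton
- rasio like per view dan comment per view pada kategori yang paling sering ditonton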
# Count rows per channel: after aggregation the `title` column holds the
# number of titles belonging to each channel_title.
vids_trend <- aggregate(title ~ channel_title, data = vids_title, FUN = length)

# Keep the ten channels with the highest counts, largest first.
vids_trend10 <- head(
  vids_trend[order(-vids_trend$title), ],
  n = 10
)
vids_trend10## channel_title title
## 1012 Refinery29 31
## 1218 The Tonight Show Starring Jimmy Fallon 30
## 1346 Vox 29
## 1233 TheEllenShow 28
## 876 Netflix 27
## 884 NFL 25
## 390 ESPN 24
## 606 Jimmy Kimmel Live 24
## 1192 The Late Show with Stephen Colbert 22
## 681 Late Night with Seth Meyers 21
# Horizontal bar chart of the channels in vids_trend10. Bars are ordered by
# the per-channel count stored in the `title` column.
ggplot(
  data = vids_trend10,
  mapping = aes(
    x = reorder(channel_title, title),
    y = title,
    fill = channel_title
  )
) +
  # geom_col() is the stat = "identity" form of geom_bar().
  geom_col(show.legend = FALSE) +
  # Flip the axes so the channel names run along the vertical axis.
  coord_flip() +
  labs(
    title = "US Trending Videos",
    subtitle = "10 Channels dengan Video Trending Terbanyak",
    x = "Nama Channel",
    y = "Jumlah Video Trending"
  ) +
  theme(
    plot.title = element_text(color = "#696969", size = 16),
    plot.subtitle = element_text(face = "italic")
  )
Dari hasil di atas, channel Refinery29 memiliki jumlah video trending
terbanyak yaitu 31 video.
# Total views per category, summed over the rows of vids_title.
vids_popular <- aggregate(views ~ category_id, data = vids_title, FUN = sum)

# Five categories with the largest view totals, in descending order.
category_popular <- head(
  vids_popular[order(-vids_popular$views), ],
  n = 5
)
category_popular## category_id views
## 4 Entertainment 445350694
## 8 Music 386197869
## 2 Comedy 179900099
## 7 Howto and Style 110259469
## 5 Film and Animation 77476128
# Column chart of the categories in category_popular, tallest bar first, with
# the exact view total printed above each bar.
category_popular %>%
  ggplot(aes(x = reorder(category_id, -views), y = views, fill = category_id)) +
  # geom_col() always uses the identity stat; passing `stat` to it again is
  # not a recognised parameter and only produces an "Ignoring unknown
  # parameters" warning, so it is omitted here.
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = format(views, big.mark = ",", scientific = FALSE)),
    vjust = -0.5, # negative vjust lifts the label above the top of the bar
    size = 3,
    colour = "Red"
  ) +
  # Show the y-axis ticks with thousands separators.
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Kategori",
    y = "Frekuensi ditonton",
    title = "US Trending Videos",
    subtitle = "5 Kategori Video Paling Sering Ditonton"
  ) +
  theme(
    plot.title = element_text(color = "#696969", size = 16),
    plot.subtitle = element_text(face = "italic")
  )
Dari hasil di atas, kategori Entertainment
menjadi kategori yang paling sering ditonton.
# Keep only the videos that belong to the five listed categories.
vids_subset <- vids_title[
  vids_title$category_id %in% c(
    "Entertainment", "Music", "Comedy", "Howto and Style", "Film and Animation"
  ),
]

# Per-video engagement ratios, rounded to three decimals.
vids_subset$like_per_view <- round(vids_subset$likes / vids_subset$views, 3)
vids_subset$comment_per_view <- round(
  vids_subset$comment_count / vids_subset$views,
  3
)

# Scatter plot of like-per-view against comment-per-view, coloured by
# category.
ggplot(
  vids_subset,
  aes(x = comment_per_view, y = like_per_view, color = category_id)
) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    x = "comment per view",
    y = "like per view",
    title = "US Trending Videos",
    subtitle = "Comment per View and Like per View from Most Viewed Video Categories"
  ) +
  theme(
    plot.title = element_text(color = "#696969", size = 16),
    plot.subtitle = element_text(face = "italic", size = 10)
  )

Dari hasil di atas, kategori Music cenderung memiliki
rasio like/view paling tinggi, sementara Howto and Style
cenderung memiliki rasio comment/view paling tinggi.