library(tidyverse)
library(lubridate)
library(viridis)
library(scales)
library(ggrepel)
vids <- read.csv("USvideos.csv")
vids.ori <- vids
str(vids)
## 'data.frame': 13400 obs. of 12 variables:
## $ trending_date : Factor w/ 67 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ title : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
## $ channel_title : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : Factor w/ 2903 levels "2008-04-05T18:22:40.000Z",..: 302 271 255 275 253 307 240 258 281 279 ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
Preprocessing
vids$trending_date <- ydm(vids$trending_date) # melakukan transformasi date
vids$title <- as.character(vids$title)
vids$category_id <- sapply(as.character(vids$category_id), switch,
"1" = "Film and Animation",
"2" = "Autos and Vehicles",
"10" = "Music",
"15" = "Pets and Animals",
"17" = "Sports",
"19" = "Travel and Events",
"20" = "Gaming",
"22" = "People and Blogs",
"23" = "Comedy",
"24" = "Entertainment",
"25" = "News and Politics",
"26" = "Howto and Style",
"27" = "Education",
"28" = "Science and Technology",
"29" = "Nonprofit and Activism",
"43" = "Shows")
vids$category_id <- as.factor(vids$category_id)
vids$publish_time <- ymd_hms(vids$publish_time,tz="America/New_York")
## Date in ISO8601 format; converting timezone from UTC to "America/New_York".
vids$publish_hour <- hour(vids$publish_time)
pw <- function(x){
if(x < 8){
x <- "12am to 8am"
}else if(x >= 8 & x < 16){
x <- "8am to 3pm"
}else{
x <- "3pm to 12am"
}
}
vids$publish_when <- as.factor(sapply(vids$publish_hour, pw))
vids$publish_wday <- as.factor(weekdays(vids$publish_time))
vids$publish_wday <- ordered(vids$publish_wday,
levels=c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
vids[,c("views",
"likes",
"dislikes",
"comment_count")] <- lapply(vids[,c("views",
"likes",
"dislikes",
"comment_count")],
as.numeric)
vids.u <- vids[match(unique(vids$title), vids$title),]
vids.u$timetotrend <- vids.u$trending_date - as.Date(vids.u$publish_time)
vids.u$timetotrend <- as.factor(ifelse(vids.u$timetotrend <= 7, vids.u$timetotrend, "8+"))
rownames(vids.u) <- 1:nrow(vids.u)
Preprocessing Tidy Format
library(dplyr)
detail <- read.csv("category.csv")
tidydat <- vids.ori %>%
mutate(trending_date = ydm(trending_date),
title = as.character(title)) %>%
left_join(detail, by = "category_id") %>%
mutate(category_id = as.factor(category),
publish_time = ymd_hms(publish_time,tz="America/New_York"),
publish_hour = hour(publish_time),
publish_when = as.factor(case_when(
publish_hour < 8 ~ "12am to 8am",
publish_hour >=8 & publish_hour < 16 ~ "8am to 3pm",
TRUE ~ "3pm to 12am"
)),
publish_wday = as.factor(wday(publish_time,
label = TRUE,
abbr = FALSE,
week_start = 1))) %>%
mutate_if(is.integer, as.numeric) %>%
mutate(publish_hour = as.integer(publish_hour)) %>%
distinct(title, .keep_all = TRUE) %>%
mutate(timetotrend = as.numeric(as.duration(interval(as.Date(publish_time),
trending_date)),
"days")) %>%
mutate(timetotrend = as.factor(case_when(
timetotrend <= 7 ~ as.character(.$timetotrend),
TRUE ~ "8+"
))) %>%
select(-category)
## Date in ISO8601 format; converting timezone from UTC to "America/New_York".
The Results are identical
checking the similarity two results.
identical(vids.u, tidydat)
## [1] TRUE
Plotting
temaku <- theme(panel.background = element_rect(fill = "#2b2d31"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_line(color = "grey2", linetype = 2),
panel.grid.major.y = element_line(color = "grey2", linetype = 2),
panel.grid = element_line(colour = "grey2", linetype = 2),
legend.key = element_rect(fill = "#2b2d31"),
legend.background = element_rect(fill = "#2b2d31"),
legend.title = element_text(color = "white"),
legend.text = element_text(color = "white"),
legend.position = "top",
plot.subtitle = element_text(color = "white"),
plot.title = element_text(colour = "white"),
plot.caption = element_text(colour = "white"),
plot.background = element_rect(fill = "#2b2d31"),
axis.title = element_text(color = "white"),
axis.ticks.x = element_line(color = "grey"),
axis.ticks.y = element_line(color = "grey"),
axis.text.x = element_text(color = "white"),
axis.text.y = element_text(color = "white"))
Static Plot
tidydat %>%
filter(category_id == "Science and Technology") %>%
mutate(sentiment = (likes/dislikes)*100) %>%
filter(views > 10000 & sentiment > 100) %>%
ggplot(mapping = aes(x = sentiment,
y = views,
size = comment_count,
col = timetotrend,
text = paste("Title: ", .$title))) +
geom_point(alpha = 0.5) +
scale_size(range = c(2,10), guide = "none") +
scale_color_brewer(palette = "Oranges") +
scale_x_continuous(limits = c(0,25000),
labels = comma,
breaks = seq(0,25000, 5000)) +
scale_y_continuous(limits = c(0,4e6),
labels = comma) +
labs(title = "Favorite Videos Science & Technology",
subtitle = "Data youtube trending",
caption = "Source: Kaggle Dataset") +
temaku +
theme(legend.position = "none") -> plot1
plot1
Interactive Plot
library(plotly)
ggplotly(plot1)
Bonus examples
from ggplotly’s packages examples:
highlight_key(iris) %>%
GGally::ggpairs(aes(colour = Species), columns = 1:4) %>%
ggplotly(tooltip = c("x", "y", "colour")) %>%
highlight("plotly_selected")