library(tidyverse)
library(lubridate)
library(viridis)
library(scales)
library(ggrepel)

vids <- read.csv("USvideos.csv")
vids.ori <- vids

str(vids)

## 'data.frame':    13400 obs. of  12 variables:
##  $ trending_date         : Factor w/ 67 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ title                 : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
##  $ channel_title         : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘â¬“_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
##  $ category_id           : int  22 24 23 24 24 28 24 28 1 25 ...
##  $ publish_time          : Factor w/ 2903 levels "2008-04-05T18:22:40.000Z",..: 302 271 255 275 253 307 240 258 281 279 ...
##  $ views                 : int  748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
##  $ likes                 : int  57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
##  $ dislikes              : int  2966 6146 5339 666 1989 511 2445 778 119 1363 ...
##  $ comment_count         : int  15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
##  $ comments_disabled     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ratings_disabled      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ video_error_or_removed: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Preprocessing

vids$trending_date <- ydm(vids$trending_date) # melakukan transformasi date

vids$title <- as.character(vids$title)

vids$category_id <- sapply(as.character(vids$category_id), switch, 
                           "1" = "Film and Animation",
                           "2" = "Autos and Vehicles", 
                           "10" = "Music", 
                           "15" = "Pets and Animals", 
                           "17" = "Sports",
                           "19" = "Travel and Events", 
                           "20" = "Gaming", 
                           "22" = "People and Blogs", 
                           "23" = "Comedy",
                           "24" = "Entertainment", 
                           "25" = "News and Politics",
                           "26" = "Howto and Style", 
                           "27" = "Education",
                           "28" = "Science and Technology", 
                           "29" = "Nonprofit and Activism",
                           "43" = "Shows")

vids$category_id <- as.factor(vids$category_id)
vids$publish_time <- ymd_hms(vids$publish_time,tz="America/New_York")

## Date in ISO8601 format; converting timezone from UTC to "America/New_York".

vids$publish_hour <- hour(vids$publish_time)

pw <- function(x){
    if(x < 8){
      x <- "12am to 8am"
    }else if(x >= 8 & x < 16){
      x <- "8am to 3pm"
    }else{
      x <- "3pm to 12am"
    }  
}

vids$publish_when <- as.factor(sapply(vids$publish_hour, pw))
vids$publish_wday <- as.factor(weekdays(vids$publish_time))


vids$publish_wday <- ordered(vids$publish_wday, 
                             levels=c("Monday", 
                                      "Tuesday", 
                                      "Wednesday", 
                                      "Thursday", 
                                      "Friday", 
                                      "Saturday", 
                                      "Sunday"))
vids[,c("views", 
        "likes", 
        "dislikes", 
        "comment_count")] <- lapply(vids[,c("views", 
                                            "likes", 
                                            "dislikes", 
                                            "comment_count")], 
                                    as.numeric)

vids.u <- vids[match(unique(vids$title), vids$title),]

vids.u$timetotrend <- vids.u$trending_date - as.Date(vids.u$publish_time)
vids.u$timetotrend <- as.factor(ifelse(vids.u$timetotrend <= 7, vids.u$timetotrend, "8+"))

rownames(vids.u) <- 1:nrow(vids.u)

Preprocessing Tidy Format

library(dplyr)

detail <- read.csv("category.csv")

tidydat <- vids.ori %>%
  mutate(trending_date = ydm(trending_date),
         title = as.character(title)) %>% 
  left_join(detail, by = "category_id") %>% 
  mutate(category_id = as.factor(category),
         publish_time = ymd_hms(publish_time,tz="America/New_York"),
         publish_hour = hour(publish_time),
         publish_when = as.factor(case_when(
           publish_hour < 8 ~ "12am to 8am",
           publish_hour >=8 & publish_hour < 16 ~ "8am to 3pm",
           TRUE ~ "3pm to 12am"
         )),
         publish_wday = as.factor(wday(publish_time, 
                                       label = TRUE, 
                                       abbr = FALSE, 
                                       week_start = 1))) %>% 
  mutate_if(is.integer, as.numeric) %>% 
  mutate(publish_hour = as.integer(publish_hour)) %>% 
  distinct(title, .keep_all = TRUE) %>% 
  mutate(timetotrend = as.numeric(as.duration(interval(as.Date(publish_time), 
                                                       trending_date)),
                                  "days")) %>%
  mutate(timetotrend = as.factor(case_when(
    timetotrend <= 7 ~ as.character(.$timetotrend),
    TRUE ~ "8+"
  ))) %>% 
  select(-category)

## Date in ISO8601 format; converting timezone from UTC to "America/New_York".

The Results are identical

checking the similarity two results.

identical(vids.u, tidydat)

## [1] TRUE

Plotting

temaku <- theme(panel.background = element_rect(fill = "#2b2d31"),
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_line(color = "grey2", linetype = 2),
        panel.grid.major.y = element_line(color = "grey2", linetype = 2),
        panel.grid = element_line(colour = "grey2", linetype = 2),
        legend.key = element_rect(fill = "#2b2d31"),
        legend.background = element_rect(fill = "#2b2d31"),
        legend.title = element_text(color = "white"),
        legend.text = element_text(color = "white"),
        legend.position = "top",
        plot.subtitle = element_text(color = "white"),
        plot.title = element_text(colour = "white"),
        plot.caption = element_text(colour = "white"),
        plot.background = element_rect(fill = "#2b2d31"),
        axis.title = element_text(color = "white"),
        axis.ticks.x = element_line(color = "grey"),
        axis.ticks.y = element_line(color = "grey"),
        axis.text.x = element_text(color = "white"),
        axis.text.y = element_text(color = "white"))

Static Plot

tidydat %>% 
  filter(category_id == "Science and Technology") %>% 
  mutate(sentiment = (likes/dislikes)*100) %>% 
  filter(views > 10000 & sentiment > 100) %>% 
  ggplot(mapping = aes(x = sentiment,
                       y = views,
                       size = comment_count,
                       col = timetotrend,
                       text = paste("Title: ", .$title))) +
  geom_point(alpha = 0.5) +
  scale_size(range = c(2,10), guide = "none") +
  scale_color_brewer(palette = "Oranges") +
  scale_x_continuous(limits = c(0,25000), 
                     labels = comma, 
                     breaks = seq(0,25000, 5000)) +
  scale_y_continuous(limits = c(0,4e6), 
                     labels = comma) +
  labs(title = "Favorite Videos Science & Technology", 
       subtitle = "Data youtube trending",
       caption = "Source: Kaggle Dataset") +
  temaku +
  theme(legend.position = "none") -> plot1

plot1

Interactive Plot

library(plotly)
ggplotly(plot1)

Bonus examples

from ggplotly’s packages examples:

highlight_key(iris) %>%
  GGally::ggpairs(aes(colour = Species), columns = 1:4) %>%
  ggplotly(tooltip = c("x", "y", "colour")) %>%
  highlight("plotly_selected")