1 Explanation

1.1 Brief

Hi !! Welcome to my LBB :)
In this LBB, I will use data USvideos.csv
I hope you enjoy it !

1.2 Data’s Point of View

This data is contains of video trending YouTube records sales record included the title, channel title, publish time, etc.
The first thing i need to do is load all package tht might be needed for this dataset.\

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(leaflet)
library(scales)
library(tidyr)
library(colorspace) 
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Note : Detail explanatory will be given at the end of content

2 Data

2.1 Input Data

Make sure our data placed within folder of our R project data.

data <- read.csv("data_input/USvideos.csv")
head(data)

Input data is DONE ! then let’s get started

2.2 Data Inspection

head(data)
dim(data)
## [1] 13400    12
names(data)
##  [1] "trending_date"          "title"                  "channel_title"         
##  [4] "category_id"            "publish_time"           "views"                 
##  [7] "likes"                  "dislikes"               "comment_count"         
## [10] "comments_disabled"      "ratings_disabled"       "video_error_or_removed"

From our inspection we can conclude :
* Data contains 13400 of rows and 12 of coloumns
* Column Description

  • trending_date : trending date
  • title : video title
  • channel_title : channel title
  • category_id : video category
  • publish_time: publish time
  • views : views count
  • likes : likes count
  • dislikes : dislikes count
  • comment_count : comments count
  • comment_disabled : is comments disabled
  • rating_disabled : is rating disabled
  • video_error_or_removed: is video error or removed

2.3 Data Cleansing & Coertions

From our observation, these are the columns that we want to removed

  1. comment_disabled
  2. rating_disabled
  3. video_error_or_removed

From our observation, these are the columns that we want to fix

  1. trending_date
  2. publish_time
  3. category_id

2.3.1 Remove unwanted columns

data <- data[ , 1:9]
head(data)

2.3.2 Fixing category_id as factor

switch.category <- function(x){
  x <- as.character(x)
  y <- switch(x,
              "1" = "Film and Animation",
              "2" = "Autos and Vehicles", 
             "10" = "Music", 
             "15" = "Pets and Animals", 
             "17" = "Sports",
             "19" = "Travel and Events", 
             "20" = "Gaming", 
             "22" = "People and Blogs", 
             "23" = "Comedy",
             "24" = "Entertainment", 
             "25" = "News and Politics",
             "26" = "Howto and Style", 
             "27" = "Education",
             "28" = "Science and Technology", 
             "29" = "Nonprofit and Activism",
             "43" = "Shows")
  return(y)
}
data$category_id <- as.factor(sapply(data$category_id, FUN = switch.category))
head(data$category_id)
## [1] People and Blogs       Entertainment          Comedy                
## [4] Entertainment          Entertainment          Science and Technology
## 16 Levels: Autos and Vehicles Comedy Education ... Travel and Events

2.3.3 Formatting data with format date using lubridate

data$trending_date <- ydm(data$trending_date)
data$publish_time <- ymd_hms(data$publish_time)

head(data)

2.3.4 Remove repeating data

match.row <- match(unique(data$title), data$title)
vids <- data[match.row, ]
head(vids,100)

3 Study Case

1. What is the top 3 YouTube Channel with most trending videos?

agg <- aggregate(title ~ category_id, data = vids, FUN = length)

table_channel <- sort(table(vids$channel_title), decreasing = T)
vids.tab <- as.data.frame(table_channel)

names(vids.tab) <- c("title", "total_videos")
vids.tab.10 <- vids.tab[1:10,]

ggplot(data = vids.tab.10, mapping = aes(y = reorder(title,
                                                     total_videos), 
                                         x = total_videos)) + 
  geom_col() +
  labs(title = "Top Trending YouTube Channel",
       x = "Total Videos",
       y = NULL)


Based on visualization above, the top 3 trending YouTube Channel is Refinery29, The Tonight Show Starring Jimmy Fallon and Vox

2. When is the video most often uploaded? ?

vids$publish_hour <- hour(vids$publish_time)
agg <- aggregate(title ~ publish_hour, data = vids, FUN = length)

table_hour <- sort(table(vids$publish_hour), decreasing = T)
vids.tab <- as.data.frame(table_hour)

names(vids.tab) <- c("publish_hour", "total_videos")

ggplot(data = vids.tab, mapping = aes(x = reorder(publish_hour,
                                                     -total_videos), 
                                         y = total_videos)) + 
  geom_col() +
  labs(title = "",
       x = "Total Videos",
       y = NULL)


Based on visualization above, the trending video most often uploaded at 4 pm


3. Create visualization about relationship between likes ratio for top 3 trending category !

agg <- aggregate(title ~ category_id, data = vids, FUN = length)
top.category <- agg[order(agg$title, decreasing = T), ]$category_id[1:3]

vids.top <- vids[vids$category_id %in% top.category, ]

vids.top$category_id <- droplevels(vids.top$category_id)
vids.top$likesp <- vids.top$likes / vids.top$views 

ggplot(data = vids.top, aes(x = category_id, y = likesp)) +
  geom_boxplot(aes(fill = category_id), 
               color = "gray", outlier.shape = NA) +
  geom_jitter(aes(color = category_id), alpha = 0.3, show.legend = F) +
  labs(title = "Likes Ratio", subtitle = "Top 3 Categories", 
       x = "Likes Ratio", y = "Category", caption = "Source : YouTube", fill = "Category")+
  
   scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, 0.2)) +
  scale_fill_manual(values = c("#1C315E", "#227C70", "#88A47C"))+
  theme_dark()
## Warning: Removed 8 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).


4. Create visualization about trending average viewers on “Entertainment”, “Music” and “Howto and Style” !

vids.cat.custom <- vids[vids$category_id %in% 
                          c("Entertainment","Music", "Howto and Style"),]

vids.emh <- aggregate(views ~ category_id + publish_hour , 
          data = vids.cat.custom, 
          FUN = "mean")
ggplot(data = vids.emh , mapping = aes(x = publish_hour , 
                                       y = views)) +
  geom_line(mapping = aes(color = category_id), size = 1.5) + 
  scale_y_continuous(label = comma) + 
  labs(title = "Average Views per Hours",
       x = "Publish Hour",
       y = "Mean Views",
       color = "Category",
       caption = "Source : Youtube") + 
  theme(legend.position = "top",
        plot.title = element_text(hjust = 0.5, face = "bold"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.


4 Business Recomendation

As a beginner YouTuber, these are the recommendations for you:
1. You are recommended to choose either Entertainment, MusicorHowto and Style as your video category
2. You are recommended to upload your video between 2pm until 6pm as hot time to publish