Hi !! Welcome to my LBB :)
In this LBB, I will use data USvideos.csv
I hope you enjoy it !
This data is contains of video trending YouTube records sales record
included the title, channel title, publish time, etc.
The first thing i need to do is load all package tht might be needed for
this dataset.\
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(leaflet)
library(scales)
library(tidyr)
library(colorspace)
library(ggthemes)
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Note : Detail explanatory will be given at the end of content
Make sure our data placed within folder of our R project data.
data <- read.csv("data_input/USvideos.csv")
head(data)Input data is DONE ! then let’s get started
head(data)dim(data)## [1] 13400 12
names(data)## [1] "trending_date" "title" "channel_title"
## [4] "category_id" "publish_time" "views"
## [7] "likes" "dislikes" "comment_count"
## [10] "comments_disabled" "ratings_disabled" "video_error_or_removed"
From our inspection we can conclude :
* Data contains 13400 of rows and 12 of coloumns
* Column Description
trending_date : trending datetitle : video titlechannel_title : channel titlecategory_id : video categorypublish_time: publish timeviews : views countlikes : likes countdislikes : dislikes countcomment_count : comments countcomment_disabled : is comments disabledrating_disabled : is rating disabledvideo_error_or_removed: is video error or removedFrom our observation, these are the columns that we want to removed
comment_disabledrating_disabledvideo_error_or_removedFrom our observation, these are the columns that we want to fix
trending_datepublish_timecategory_iddata <- data[ , 1:9]
head(data)switch.category <- function(x){
x <- as.character(x)
y <- switch(x,
"1" = "Film and Animation",
"2" = "Autos and Vehicles",
"10" = "Music",
"15" = "Pets and Animals",
"17" = "Sports",
"19" = "Travel and Events",
"20" = "Gaming",
"22" = "People and Blogs",
"23" = "Comedy",
"24" = "Entertainment",
"25" = "News and Politics",
"26" = "Howto and Style",
"27" = "Education",
"28" = "Science and Technology",
"29" = "Nonprofit and Activism",
"43" = "Shows")
return(y)
}data$category_id <- as.factor(sapply(data$category_id, FUN = switch.category))
head(data$category_id)## [1] People and Blogs Entertainment Comedy
## [4] Entertainment Entertainment Science and Technology
## 16 Levels: Autos and Vehicles Comedy Education ... Travel and Events
lubridatedata$trending_date <- ydm(data$trending_date)
data$publish_time <- ymd_hms(data$publish_time)
head(data)match.row <- match(unique(data$title), data$title)
vids <- data[match.row, ]
head(vids,100)1. What is the top 3 YouTube Channel with most trending
videos?
agg <- aggregate(title ~ category_id, data = vids, FUN = length)
table_channel <- sort(table(vids$channel_title), decreasing = T)
vids.tab <- as.data.frame(table_channel)
names(vids.tab) <- c("title", "total_videos")
vids.tab.10 <- vids.tab[1:10,]
ggplot(data = vids.tab.10, mapping = aes(y = reorder(title,
total_videos),
x = total_videos)) +
geom_col() +
labs(title = "Top Trending YouTube Channel",
x = "Total Videos",
y = NULL)
Based on visualization above, the top 3 trending YouTube Channel is
Refinery29, The Tonight Show Starring Jimmy Fallon and Vox
2. When is the video most often uploaded? ?
vids$publish_hour <- hour(vids$publish_time)
agg <- aggregate(title ~ publish_hour, data = vids, FUN = length)
table_hour <- sort(table(vids$publish_hour), decreasing = T)
vids.tab <- as.data.frame(table_hour)
names(vids.tab) <- c("publish_hour", "total_videos")
ggplot(data = vids.tab, mapping = aes(x = reorder(publish_hour,
-total_videos),
y = total_videos)) +
geom_col() +
labs(title = "",
x = "Total Videos",
y = NULL)
Based on visualization above, the trending video most often uploaded
at 4 pm
3. Create visualization about relationship between likes ratio
for top 3 trending category !
agg <- aggregate(title ~ category_id, data = vids, FUN = length)
top.category <- agg[order(agg$title, decreasing = T), ]$category_id[1:3]
vids.top <- vids[vids$category_id %in% top.category, ]
vids.top$category_id <- droplevels(vids.top$category_id)
vids.top$likesp <- vids.top$likes / vids.top$views
ggplot(data = vids.top, aes(x = category_id, y = likesp)) +
geom_boxplot(aes(fill = category_id),
color = "gray", outlier.shape = NA) +
geom_jitter(aes(color = category_id), alpha = 0.3, show.legend = F) +
labs(title = "Likes Ratio", subtitle = "Top 3 Categories",
x = "Likes Ratio", y = "Category", caption = "Source : YouTube", fill = "Category")+
scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, 0.2)) +
scale_fill_manual(values = c("#1C315E", "#227C70", "#88A47C"))+
theme_dark()## Warning: Removed 8 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).
4. Create visualization about trending average viewers on
“Entertainment”, “Music” and “Howto and Style” !
vids.cat.custom <- vids[vids$category_id %in%
c("Entertainment","Music", "Howto and Style"),]
vids.emh <- aggregate(views ~ category_id + publish_hour ,
data = vids.cat.custom,
FUN = "mean")
ggplot(data = vids.emh , mapping = aes(x = publish_hour ,
y = views)) +
geom_line(mapping = aes(color = category_id), size = 1.5) +
scale_y_continuous(label = comma) +
labs(title = "Average Views per Hours",
x = "Publish Hour",
y = "Mean Views",
color = "Category",
caption = "Source : Youtube") +
theme(legend.position = "top",
plot.title = element_text(hjust = 0.5, face = "bold"))## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
As a beginner YouTuber, these are the recommendations for you:
1. You are recommended to choose either Entertainment,
Music‘or’Howto and Style as your
video category
2. You are recommended to upload your video between 2pm until 6pm as hot
time to publish