These are the libraries I used in this analysis.
library(lubridate)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(scales)
First, let's import the data. YouTube (the world-famous video-sharing website) maintains a list of the top trending videos on the platform. This dataset is a daily record of the top trending YouTube videos in the US.
vids <- read.csv("USvideos.csv")
These are the first six rows of the dataset.
head(vids)
Let's check whether there are any NA values.
colSums(is.na(vids))
## trending_date title channel_title
## 0 0 0
## category_id publish_time views
## 0 0 0
## likes dislikes comment_count
## 0 0 0
## comments_disabled ratings_disabled video_error_or_removed
## 0 0 0
Okay, the data doesn't contain any NAs, so we can continue without problems.
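If any column had reported missing values, one minimal fix (a sketch using base R) would have been to drop the affected rows before continuing:
vids <- na.omit(vids)  # drop rows containing any NA (not needed here)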
Before starting anything, let's explore the data. We need to make sure the data types are correct.
str(vids)
## 'data.frame': 13400 obs. of 12 variables:
## $ trending_date : chr "17.14.11" "17.14.11" "17.14.11" "17.14.11" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : chr "2017-11-13T17:13:01.000Z" "2017-11-13T07:30:00.000Z" "2017-11-12T19:05:24.000Z" "2017-11-13T11:00:04.000Z" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
Based on this output, I need to convert some of the columns into factor and date types. Besides that, we can see that the data is a data frame with 13400 rows and 12 columns.
Because category_id is still a number that represents a category, I need to replace it with the category name. After that, I convert the columns into suitable data types.
# map each numeric category_id to its category name
vids$category_id <- sapply(as.character(vids$category_id), switch,
"1" = "Film and Animation",
"2" = "Autos and Vehicles",
"10" = "Music",
"15" = "Pets and Animals",
"17" = "Sports",
"19" = "Travel and Events",
"20" = "Gaming",
"22" = "People and Blogs",
"23" = "Comedy",
"24" = "Entertainment",
"25" = "News and Politics",
"26" = "Howto and Style",
"27" = "Education",
"28" = "Science and Technology",
"29" = "Nonprofit and Activism",
"43" = "Shows")
vids$category_id <- as.factor(vids$category_id)
vids$trending_date <- ydm(vids$trending_date)
vids$publish_time <- ymd_hms(vids$publish_time, tz = "America/New_York")
Because the raw timestamps end in "Z" (UTC), supplying tz = "America/New_York" converts them to US Eastern time.
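An equivalent two-step version (a sketch) parses the timestamps in UTC first and then shifts the display time zone with lubridate's with_tz():
# parse (the "Z" suffix marks UTC), then change the display time zone
vids$publish_time <- with_tz(ymd_hms(vids$publish_time), tzone = "America/New_York")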
Let's check the data again.
str(vids)
## 'data.frame': 13400 obs. of 12 variables:
## $ trending_date : Date, format: "2017-11-14" "2017-11-14" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : Factor w/ 16 levels "Autos and Vehicles",..: 11 4 2 4 4 13 4 13 5 9 ...
## $ publish_time : POSIXct, format: "2017-11-13 12:13:01" "2017-11-13 02:30:00" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
With this, we have successfully converted the data types and recoded category_id.
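As an aside, the same recoding can be written more compactly with a named lookup vector instead of switch() — a sketch equivalent to the step above:
# a named vector indexed by the (character) ids does the mapping in one vectorized step
category_names <- c("1" = "Film and Animation", "2" = "Autos and Vehicles",
                    "10" = "Music", "15" = "Pets and Animals", "17" = "Sports",
                    "19" = "Travel and Events", "20" = "Gaming", "22" = "People and Blogs",
                    "23" = "Comedy", "24" = "Entertainment", "25" = "News and Politics",
                    "26" = "Howto and Style", "27" = "Education", "28" = "Science and Technology",
                    "29" = "Nonprofit and Activism", "43" = "Shows")
vids$category_id <- as.factor(unname(category_names[as.character(vids$category_id)]))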
Let's make a new column, publish_hour, by extracting the hour from the publish_time column.
vids$publish_hour <- hour(vids$publish_time)
I want to bin publish_hour into 3 categories and store the result in a new column called publish_when. First, define the function.
# return a time-of-day label for a given hour (0-23)
pw <- function(x){
  if(x < 8){
    "12am to 8am"
  }else if(x >= 8 & x < 16){
    "8am to 4pm"
  }else{
    "4pm to 12am"
  }
}
Then apply it to the data.
vids$publish_when <- as.factor(sapply(vids$publish_hour, pw))
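A vectorized alternative (a sketch) is cut(), which bins the hours in one call and returns a factor directly; note that its levels come out in chronological rather than alphabetical order:
vids$publish_when <- cut(vids$publish_hour,
                         breaks = c(-1, 7, 15, 23),  # bins: 0-7, 8-15, 16-23
                         labels = c("12am to 8am", "8am to 4pm", "4pm to 12am"))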
I also want to add a new column with the name of the day each video was published.
vids$publish_wday <- wday(vids$publish_time,
                          label = TRUE,
                          abbr = FALSE,
                          week_start = 1
                          )
Then I want to know how long a video takes to trend, so I make a new column called publish_date by extracting the date from publish_time.
vids$publish_date <- date(vids$publish_time)
After that, I can make a new column called timetotrend and convert it into a factor.
# difference in days between trending date and publish date
vids$timetotrend <- vids$trending_date - vids$publish_date
# keep 0-7 days as-is, lump anything longer into an "8+" bucket
vids$timetotrend <- as.factor(ifelse(vids$timetotrend <= 7, vids$timetotrend, "8+"))
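A quick sanity check (a sketch) tabulates how many rows fall in each time-to-trend bin:
table(vids$timetotrend)  # counts per bin, with "8+" collecting the long tail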
Last, I want to make columns for the like, dislike, and comment ratios, each relative to views.
vids$likesratio <- vids$likes/vids$views
vids$dislikesratio <- vids$dislikes/vids$views
vids$commentratio <- vids$comment_count/vids$views
This is the resulting data structure.
str(vids)
## 'data.frame': 13400 obs. of 20 variables:
## $ trending_date : Date, format: "2017-11-14" "2017-11-14" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : Factor w/ 16 levels "Autos and Vehicles",..: 11 4 2 4 4 13 4 13 5 9 ...
## $ publish_time : POSIXct, format: "2017-11-13 12:13:01" "2017-11-13 02:30:00" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ publish_hour : int 12 2 14 6 13 14 0 16 9 8 ...
## $ publish_when : Factor w/ 3 levels "12am to 8am",..: 3 1 3 1 3 3 1 2 3 3 ...
## $ publish_wday : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 1 1 7 1 7 1 7 7 1 1 ...
## $ publish_date : Date, format: "2017-11-13" "2017-11-13" ...
## $ timetotrend : Factor w/ 9 levels "0","1","2","3",..: 2 2 3 2 3 2 3 3 2 2 ...
## $ likesratio : num 0.0769 0.0402 0.0458 0.0296 0.0631 ...
## $ dislikesratio : num 0.003963 0.002541 0.001673 0.001941 0.000949 ...
## $ commentratio : num 0.02132 0.00525 0.00256 0.00625 0.00836 ...
But there is one problem: the data contains duplicates, because a video can trend for more than one day and therefore appears in multiple rows. Let's keep only the first occurrence of each video and store it in vids.unique.
vids.unique <- vids[match(unique(vids$title),vids$title),]
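An equivalent, perhaps more familiar idiom (a sketch) uses duplicated(), which also keeps the first row per title:
vids.unique <- vids[!duplicated(vids$title), ]  # first trending day of each video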
str(vids.unique)
## 'data.frame': 2986 obs. of 20 variables:
## $ trending_date : Date, format: "2017-11-14" "2017-11-14" ...
## $ title : chr "WE WANT TO TALK ABOUT OUR MARRIAGE" "The Trump Presidency: Last Week Tonight with John Oliver (HBO)" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "Nickelback Lyrics: Real or Fake?" ...
## $ channel_title : chr "CaseyNeistat" "LastWeekTonight" "Rudy Mancuso" "Good Mythical Morning" ...
## $ category_id : Factor w/ 16 levels "Autos and Vehicles",..: 11 4 2 4 4 13 4 13 5 9 ...
## $ publish_time : POSIXct, format: "2017-11-13 12:13:01" "2017-11-13 02:30:00" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ publish_hour : int 12 2 14 6 13 14 0 16 9 8 ...
## $ publish_when : Factor w/ 3 levels "12am to 8am",..: 3 1 3 1 3 3 1 2 3 3 ...
## $ publish_wday : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 1 1 7 1 7 1 7 7 1 1 ...
## $ publish_date : Date, format: "2017-11-13" "2017-11-13" ...
## $ timetotrend : Factor w/ 9 levels "0","1","2","3",..: 2 2 3 2 3 2 3 3 2 2 ...
## $ likesratio : num 0.0769 0.0402 0.0458 0.0296 0.0631 ...
## $ dislikesratio : num 0.003963 0.002541 0.001673 0.001941 0.000949 ...
## $ commentratio : num 0.02132 0.00525 0.00256 0.00625 0.00836 ...
In the end, we have a de-duplicated data frame containing all the information we need to answer the business questions I want to explore.
I want to filter the categories down to Comedy, Gaming, and Science and Technology.
vids.category <- vids.unique[vids.unique$category_id %in% c("Comedy","Gaming","Science and Technology"),]
vids.category$category_id <- droplevels(vids.category$category_id)
levels(vids.category$category_id)
## [1] "Comedy" "Gaming" "Science and Technology"
Let's make a new table with the mean like, dislike, and comment ratios for each category.
vids.ratios <- aggregate(formula = cbind(likesratio, dislikesratio, commentratio) ~ category_id,
data = vids.category,
FUN = mean)
vids.ratios <- pivot_longer(vids.ratios,
cols = c("likesratio", "dislikesratio", "commentratio"))
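pivot_longer() defaults to calling the new columns name and value, which the plot below relies on. To use explicit names instead, one could write (a sketch; the plot would then need facet_wrap(~metric) and y = mean_ratio):
vids.ratios <- pivot_longer(vids.ratios,
                            cols = c("likesratio", "dislikesratio", "commentratio"),
                            names_to = "metric",    # which ratio the row describes
                            values_to = "mean_ratio")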
vids.ratios
Let's make a faceted visualization.
ggplot(data = vids.ratios, mapping = aes(x = category_id, y = value)) +
geom_col(width = 0.8, mapping = aes(fill = category_id)) +
facet_wrap(~name,
scales = "free_y",
labeller = as_labeller(c("commentratio"="Comment", "dislikesratio"="Dislikes", "likesratio"="Likes"))) +
scale_fill_brewer(palette = "Set2") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
        strip.background = element_rect(fill = "light blue"))
Based on the visualization, Comedy has the highest likes ratio, while Gaming has the highest comment ratio, a sign of stronger viewer engagement.
After that, I want to know at what time of day videos in each category are published.
vids.publishwhen <- as.data.frame(table(category = vids.category$category_id,
publish_time = vids.category$publish_when))
vids.publishwhen
After getting the table, let's visualize it.
ggplot(data = vids.publishwhen , mapping = aes(x = reorder(category, Freq), y = Freq)) +
geom_col(mapping = aes(fill = publish_time), position = "fill") +
coord_flip() +
labs(x = "",
y = "Video Count",
fill = "",
title = "Proportion of YouTube Trending Videos",
subtitle = "Categories vs. Publish Hour") +
theme_minimal() +
scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "top")
Based on the visualization, people tend to upload videos between 8 am and 4 pm.
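The proportions drawn by the position = "fill" bars can also be computed directly (a sketch):
# row-wise proportions: each category's share of uploads per publish window
prop.table(table(vids.category$category_id, vids.category$publish_when), margin = 1)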
Next, I want to know which category has the most trending videos.
ggplot(data = vids.publishwhen, mapping = aes(x = reorder(category, Freq), y = Freq)) +
geom_col(mapping = aes(fill = publish_time), position = "stack") +
coord_flip() +
labs(x = "",
y = "Video Count",
fill = "",
title = "Categories with Highest Trending Videos",
subtitle = "Colored per Publish Hour") +
theme_minimal() +
scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "top")
Based on the visualization, Comedy has the most trending videos, and most of them were uploaded between 8 am and 4 pm.
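To read the totals off directly instead of from the stacked bars (a sketch):
sort(table(vids.category$category_id), decreasing = TRUE)  # trending videos per category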
After that, I want to know the peak time to upload videos, so let's compute the mean views per publish hour for each category.
vids.publishhour <- aggregate(views ~ publish_hour + category_id, data = vids.category, FUN = mean)
vids.publishhour
ggplot(data = vids.publishhour,
       mapping = aes(x = publish_hour,
                     y = views)) +
  geom_line(aes(col = category_id), lwd = 0.9) + # one line per category
  facet_wrap(facets = ~category_id, nrow = 3) +
  scale_x_continuous(breaks = seq(from = 0, to = 23, by = 2)) +
  scale_y_continuous(labels = comma) +
  theme_minimal()
Based on the visualization, the best time to upload is around 3 am for Comedy, 3 pm (hour 15) for Gaming, and 4 pm (hour 16) for Science and Technology.
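As a cross-check on reading the peaks off the chart, here is a sketch that picks, for each category, the publish hour with the highest mean views from the vids.publishhour table built above:
# split by category, keep the row with the maximum mean views, then recombine
peak_hours <- do.call(rbind, lapply(split(vids.publishhour, vids.publishhour$category_id),
                                    function(d) d[which.max(d$views), ]))
peak_hours[, c("category_id", "publish_hour", "views")]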