Welcome to my R Markdown document. In this Learning By Building task, I visualize the data in USvideos.csv.
# Session options: default scientific-notation penalty, warnings silenced.
options(scipen = 0, warn = -1)

# Load the raw data, keeping text columns as character.
vids <- read.csv("USvideos.csv", stringsAsFactors = FALSE)

# Keep only the first row seen for each distinct title.
vids <- vids[!duplicated(vids$title), ]
vids
USvideos.csv contains the following columns:
There are no missing values in USvideos.csv:
## trending_date title channel_title
## 0 0 0
## category_id publish_time views
## 0 0 0
## likes dislikes comment_count
## 0 0 0
## comments_disabled ratings_disabled video_error_or_removed
## 0 0 0
# Store the two text identifier columns as factors.
vids$title <- as.factor(vids$title)
vids$channel_title <- as.factor(vids$channel_title)

# Inspect the resulting structure.
str(vids)
## 'data.frame': 2986 obs. of 12 variables:
## $ trending_date : chr "17.14.11" "17.14.11" "17.14.11" "17.14.11" ...
## $ title : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
## $ channel_title : Factor w/ 1404 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 193 682 1042 469 898 556 1059 280 6 1354 ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : chr "2017-11-13T17:13:01.000Z" "2017-11-13T07:30:00.000Z" "2017-11-12T19:05:24.000Z" "2017-11-13T11:00:04.000Z" ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
library(lubridate)
# trending_date is year.day.month text (e.g. "17.14.11"), hence ydm().
vids[["trending_date"]] <- ydm(vids[["trending_date"]])
# Parse the publish timestamp; the result is expressed in the given time zone.
vids[["publish_time"]] <- ymd_hms(vids[["publish_time"]], tz = "America/New_York")
head(vids)

# Human-readable label for each numeric YouTube category id.
category_labels <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)

# Vectorised lookup by name. An id that is not in the table becomes NA;
# sapply() + switch() would instead return a list and break as.factor().
vids$category_id <- as.factor(
  unname(category_labels[as.character(vids$category_id)])
)
head(vids, 5)

# Top 5 titles by total views, plus a "title : views" tooltip label.
most_view_title <- vids %>%
  group_by(title) %>%
  summarise(Views = sum(views)) %>%
  arrange(desc(Views)) %>%
  head(5) %>%
  mutate(remark = paste(title, ":", Views))

# Horizontal bars ordered by views; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_view_title_plot <- most_view_title %>%
  ggplot(aes(x = Views, y = reorder(title, Views), text = remark)) +
  geom_col(aes(fill = title), show.legend = FALSE) +
  labs(title = "Most View Title", x = "Views", y = "Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_view_title_plot, tooltip = "text")

# Top 5 channels by total views, plus a "channel : views" tooltip label.
most_view_channel_title <- vids %>%
  group_by(channel_title) %>%
  summarise(Views = sum(views)) %>%
  arrange(desc(Views)) %>%
  head(5) %>%
  mutate(remark = paste(channel_title, ":", Views))

# Horizontal bars ordered by views; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_view_channel_title_plot <- most_view_channel_title %>%
  ggplot(aes(x = Views, y = reorder(channel_title, Views), text = remark)) +
  geom_col(aes(fill = channel_title), show.legend = FALSE) +
  labs(title = "Most View Channel Title", x = "Views", y = "Channel Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_view_channel_title_plot, tooltip = "text")

# Top 5 categories by total views, plus a "category : views" tooltip label.
most_view_categoryid <- vids %>%
  group_by(category_id) %>%
  summarise(Views = sum(views)) %>%
  arrange(desc(Views)) %>%
  head(5) %>%
  mutate(remark = paste(category_id, ":", Views))

# Horizontal bars ordered by views; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_view_categoryid_plot <- most_view_categoryid %>%
  ggplot(aes(x = Views, y = reorder(category_id, Views), text = remark)) +
  geom_col(aes(fill = category_id), show.legend = FALSE) +
  labs(title = "Most View Category", x = "Views", y = "Category") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_view_categoryid_plot, tooltip = "text")

# Top 5 titles by total likes, plus a "title : likes" tooltip label.
most_likes_title <- vids %>%
  group_by(title) %>%
  summarise(Likes = sum(likes)) %>%
  arrange(desc(Likes)) %>%
  head(5) %>%
  mutate(remark = paste(title, ":", Likes))

# Horizontal bars ordered by likes; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_likes_title_plot <- most_likes_title %>%
  ggplot(aes(x = Likes, y = reorder(title, Likes), text = remark)) +
  geom_col(aes(fill = title), show.legend = FALSE) +
  labs(title = "Most Likes Title", x = "Likes", y = "Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_likes_title_plot, tooltip = "text")

# Top 5 channels by total likes, plus a "channel : likes" tooltip label.
most_likes_channel_title <- vids %>%
  group_by(channel_title) %>%
  summarise(Likes = sum(likes)) %>%
  arrange(desc(Likes)) %>%
  head(5) %>%
  mutate(remark = paste(channel_title, ":", Likes))

# Horizontal bars ordered by likes; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_likes_channel_title_plot <- most_likes_channel_title %>%
  ggplot(aes(x = Likes, y = reorder(channel_title, Likes), text = remark)) +
  geom_col(aes(fill = channel_title), show.legend = FALSE) +
  labs(title = "Most Likes Channel Title", x = "Likes", y = "Channel Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_likes_channel_title_plot, tooltip = "text")

# Top 5 categories by total likes, plus a "category : likes" tooltip label.
most_likes_categoryid <- vids %>%
  group_by(category_id) %>%
  summarise(Likes = sum(likes)) %>%
  arrange(desc(Likes)) %>%
  head(5) %>%
  mutate(remark = paste(category_id, ":", Likes))

# Horizontal bars ordered by likes; the `text` aesthetic feeds the
# ggplotly() tooltip. The x-axis label typo ("LIkes") is corrected.
most_likes_categoryid_plot <- most_likes_categoryid %>%
  ggplot(aes(x = Likes, y = reorder(category_id, Likes), text = remark)) +
  geom_col(aes(fill = category_id), show.legend = FALSE) +
  labs(title = "Most Likes Category", x = "Likes", y = "Category") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_likes_categoryid_plot, tooltip = "text")

# Top 5 titles by total dislikes, plus a "title : dislikes" tooltip label.
most_dislikes_title <- vids %>%
  group_by(title) %>%
  summarise(Dislikes = sum(dislikes)) %>%
  arrange(desc(Dislikes)) %>%
  head(5) %>%
  mutate(remark = paste(title, ":", Dislikes))

# Horizontal bars ordered by dislikes; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_dislikes_title_plot <- most_dislikes_title %>%
  ggplot(aes(x = Dislikes, y = reorder(title, Dislikes), text = remark)) +
  geom_col(aes(fill = title), show.legend = FALSE) +
  labs(title = "Most Dislikes Title", x = "Dislikes", y = "Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_dislikes_title_plot, tooltip = "text")

# Top 5 channels by total dislikes, plus a "channel : dislikes" tooltip label.
most_dislikes_channel_title <- vids %>%
  group_by(channel_title) %>%
  summarise(Dislikes = sum(dislikes)) %>%
  arrange(desc(Dislikes)) %>%
  head(5) %>%
  mutate(remark = paste(channel_title, ":", Dislikes))

# Horizontal bars ordered by dislikes; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_dislikes_channel_title_plot <- most_dislikes_channel_title %>%
  ggplot(aes(x = Dislikes, y = reorder(channel_title, Dislikes), text = remark)) +
  geom_col(aes(fill = channel_title), show.legend = FALSE) +
  labs(title = "Most Dislikes Channel Title", x = "Dislikes", y = "Channel Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_dislikes_channel_title_plot, tooltip = "text")

# Top 5 categories by total dislikes, plus a "category : dislikes" tooltip label.
most_dislikes_categoryid <- vids %>%
  group_by(category_id) %>%
  summarise(Dislikes = sum(dislikes)) %>%
  arrange(desc(Dislikes)) %>%
  head(5) %>%
  mutate(remark = paste(category_id, ":", Dislikes))

# Horizontal bars ordered by dislikes; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_dislikes_categoryid_plot <- most_dislikes_categoryid %>%
  ggplot(aes(x = Dislikes, y = reorder(category_id, Dislikes), text = remark)) +
  geom_col(aes(fill = category_id), show.legend = FALSE) +
  labs(title = "Most Dislikes Category", x = "Dislikes", y = "Category") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_dislikes_categoryid_plot, tooltip = "text")

# Top 5 titles by total comment count, plus a "title : comments" tooltip label.
most_commented_title <- vids %>%
  group_by(title) %>%
  summarise(Commented = sum(comment_count)) %>%
  arrange(desc(Commented)) %>%
  head(5) %>%
  mutate(remark = paste(title, ":", Commented))

# Horizontal bars ordered by comment count; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_commented_title_plot <- most_commented_title %>%
  ggplot(aes(x = Commented, y = reorder(title, Commented), text = remark)) +
  geom_col(aes(fill = title), show.legend = FALSE) +
  labs(title = "Most Commented Title", x = "Total Comment", y = "Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_commented_title_plot, tooltip = "text")

# Top 5 channels by total comment count, plus a "channel : comments" tooltip
# label.
most_commented_channel_title <- vids %>%
  group_by(channel_title) %>%
  summarise(Commented = sum(comment_count)) %>%
  arrange(desc(Commented)) %>%
  head(5) %>%
  mutate(remark = paste(channel_title, ":", Commented))

# Horizontal bars ordered by comment count; the `text` aesthetic feeds the
# ggplotly() tooltip.
most_commented_channel_title_plot <- most_commented_channel_title %>%
  ggplot(aes(x = Commented, y = reorder(channel_title, Commented), text = remark)) +
  geom_col(aes(fill = channel_title), show.legend = FALSE) +
  labs(title = "Most Commented Channel Title", x = "Total Comment", y = "Channel Title") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_commented_channel_title_plot, tooltip = "text")

# Top 5 categories by total comment count, plus a "category : comments"
# tooltip label.
most_commented_categoryid <- vids %>%
  group_by(category_id) %>%
  summarise(Commented = sum(comment_count)) %>%
  arrange(desc(Commented)) %>%
  head(5) %>%
  mutate(remark = paste(category_id, ":", Commented))

# Horizontal bars ordered by comment count. Fill is mapped to the numeric
# total here (a gradient), unlike the sibling category plots, which fill by
# category_id.
most_commented_categoryid_plot <- most_commented_categoryid %>%
  ggplot(aes(x = Commented, y = reorder(category_id, Commented), text = remark)) +
  geom_col(aes(fill = Commented), show.legend = FALSE) +
  labs(title = "Most Commented Category", x = "Total Comment", y = "Category") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(most_commented_categoryid_plot, tooltip = "text")

# Calendar date of publication and the gap between publishing and trending.
vids$publish_date <- date(vids$publish_time)
vids$timetotrend <- vids$trending_date - vids$publish_date
# Bucket the gap: values up to 7 keep their own level, anything longer is "8+".
vids$timetotrend2 <- as.factor(ifelse(vids$timetotrend <= 7, vids$timetotrend, "8+"))
# Mean publish-to-trending gap per category, plus a tooltip label.
avg_duration <- vids %>%
  group_by(category_id) %>%
  summarise(Duration = mean(timetotrend)) %>%
  mutate(remark = paste(category_id, ":", Duration))

# Duration is on the x axis and category on the y axis, so the axis labels
# are set accordingly (they were swapped before).
avg_duration_plot <- avg_duration %>%
  ggplot(aes(x = Duration, y = reorder(category_id, -Duration), text = remark)) +
  geom_col(aes(fill = Duration), show.legend = FALSE) +
  labs(title = "Average Duration Publish to Trending Per Category", x = "Duration", y = "Category") +
  theme_algoritma +
  theme(legend.position = "none")
ggplotly(avg_duration_plot, tooltip = "text")

# Build the aggregated data frame: number of rows per channel.
vids.chan <- data.frame(table(vids$channel_title))
# Keep channels with at least 10 videos, most frequent first.
vids.chan <- vids.chan[vids.chan$Freq >= 10, ]
vids.chan <- vids.chan[order(vids.chan$Freq, decreasing = TRUE), ]
# Drop the factor levels of channels that were filtered out.
vids.chan$Var1 <- droplevels(vids.chan$Var1)
# Tooltip label: "channel : count".
vids.chan$remark <- paste(vids.chan$Var1, ":", vids.chan$Freq)
# Next: add labels and polish the plot.
# Bar chart of the 10 channels with the most trending videos.
# head() is used instead of [1:10, ] / [1:3, ] so that a shorter table does
# not get padded with all-NA rows.
p3 <- ggplot(
  data = head(vids.chan, 10),
  mapping = aes(x = reorder(Var1, Freq), y = Freq, text = remark)
) +
  geom_col(mapping = aes(fill = Freq)) +
  coord_flip() +
  # Label only the top three bars with their counts.
  geom_label(
    mapping = aes(label = Freq),
    data = head(vids.chan, 3),
    nudge_y = 1,
    size = 4
  ) +
  scale_fill_gradient(low = "skyblue", high = "navy") +
  labs(
    title = "Top 10 Trending Channel in US",
    subtitle = "Based on Video Count",
    x = "",
    y = "Video Count"
  ) +
  theme_minimal() + # start from a built-in theme
  theme(
    legend.position = "none", # hide the legend
    panel.grid = element_blank(), # remove grid lines
    plot.title = element_text(face = "bold.italic") # title font face
  ) +
  geom_hline(yintercept = 20) # reference line at a count of 20
ggplotly(p3,tooltip = "text")

13. Relation between Likes and Views in the Music Category
library(ggthemes)
# Per-video engagement ratios.
vids$commentratio <- vids$comment_count / vids$views
vids$likeratio <- vids$likes / vids$views
# %in% never returns NA, so a missing category cannot add all-NA rows
# (which `== "Music"` would do).
vids.music <- vids[vids$category_id %in% "Music", ]

# Likes per view against comments per view; colour shows the time-to-trend
# bucket and point size the dislike count (size legend hidden).
ggplot(vids.music, aes(x = likeratio, y = commentratio)) +
  geom_point(aes(col = timetotrend2, size = dislikes)) +
  labs(
    title = "",
    subtitle = "",
    x = "likes per view",
    y = "comment per view"
  ) +
  guides(size = "none") +
  scale_color_viridis_d() +
  theme_classic() +
  theme(legend.position = "top")

14. We want to visualize proportion total video that was published in certain time (publish_when) in each category.
# Data wrangling: extract the hour component of each publish timestamp.
vids[["publish_hour"]] <- hour(vids[["publish_time"]])
# Map an hour of the day to a publishing-window label.
#
# x: numeric hour(s); may be a vector.
# Returns a character vector the same length as x:
#   x < 8        -> "12am to 8am"
#   8 <= x < 16  -> "8am to 4pm"
#   x >= 16      -> "4pm to 12am"
# NA input gives NA output instead of an error.
pw <- function(x) {
  windows <- c("12am to 8am", "8am to 4pm", "4pm to 12am")
  # findInterval() counts how many break points are <= x (0, 1 or 2).
  windows[findInterval(x, c(8, 16)) + 1L]
}
# Label every video with its publishing window. vapply() pins the result to
# one string per hour.
vids$publish_when <- as.factor(vapply(vids$publish_hour, pw, character(1)))
# Cross-tabulate category against publishing window; naming the table
# dimensions names the resulting columns.
vids.mul2 <- as.data.frame(table(
  category = vids$category_id,
  publish_time = vids$publish_when
))

# Stacked bars scaled to 100% per category (position = "fill").
ggplot(data = vids.mul2, mapping = aes(x = reorder(category, Freq), y = Freq)) +
  geom_col(mapping = aes(fill = publish_time), position = "fill") +
  coord_flip() +
  labs(
    x = "",
    y = "Video Count",
    fill = "",
    title = "Proportion of YouTube Trending Videos",
    subtitle = "Categories vs. Publish Hour"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(legend.position = "top")

Insight:
# Restrict to the three categories being compared.
vids.trend <- subset(vids, category_id %in% c("Entertainment", "Music", "Comedy"))
# Mean views for every publish hour within each category.
vids.tagg <- aggregate(views ~ publish_hour + category_id, data = vids.trend, FUN = mean)

# Line chart: one panel and one coloured line per category.
ggplot(vids.tagg, aes(x = publish_hour, y = views)) +
  geom_line(aes(col = category_id), lwd = 0.9) +
  facet_wrap(~category_id, nrow = 3) +
  scale_x_continuous(breaks = seq(0, 23, by = 3)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()