# Start from an empty global environment.
# NOTE(review): rm(list = ls()) also wipes the workspace of anyone who runs
# this interactively - consider dropping it.
rm(list = ls())

# Default knitr chunk options: no messages or warnings in the output, centred
# figures, and "#>" as the prefix for printed results.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  comment = "#>"
)

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 123)

# This is my LBB for data visualization with Team Algoritma. I hope you can
# gain insight from my LBB.
Here we load all the libraries that we will use.
library(lubridate)
library(ggplot2)
library(scales)
library(ggthemes)

# Read the data set. stringsAsFactors = TRUE turns the text columns into
# factors (visible in the str() output below).
vids <- read.csv("USvideos.csv", stringsAsFactors = TRUE)

# And now we can inspect the data.
dim(vids)
#> [1] 13400 12
# We found that the data has 12 columns and 13400 rows.
head(vids)
# We need to see the data structure.
str(vids)
#> 'data.frame': 13400 obs. of 12 variables:
#> $ trending_date : Factor w/ 67 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
#> $ title : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
#> $ channel_title : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
#> $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
#> $ publish_time : Factor w/ 2903 levels "2008-04-05T18:22:40.000Z",..: 302 271 255 275 253 307 240 258 281 279 ...
#> $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
#> $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
#> $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
#> $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
#> $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
We can see that a few data types are incorrect, so we need to change them.
# Parse the date columns: trending_date is stored as year.day.month text and
# publish_time as an ISO 8601 timestamp (see the str() output above).
vids[["trending_date"]] <- ydm(vids[["trending_date"]])
vids[["publish_time"]] <- ymd_hms(vids[["publish_time"]])

# Re-check the structure after the conversion.
str(vids)
#> 'data.frame': 13400 obs. of 12 variables:
#> $ trending_date : Date, format: "2017-11-14" "2017-11-14" ...
#> $ title : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
#> $ channel_title : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
#> $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
#> $ publish_time : POSIXct, format: "2017-11-13 17:13:01" "2017-11-13 07:30:00" ...
#> $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
#> $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
#> $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
#> $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
#> $ comments_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> $ ratings_disabled : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> $ video_error_or_removed: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
We need to check whether there are any NA values in the data.
# Check whether any value anywhere in vids is missing.
anyNA(vids)
#> [1] FALSE
This confirms that there are no missing values in the data.
# Per-column summary of vids; the printed result follows.
summary(vids)
#> trending_date
#> Min. :2017-11-14
#> 1st Qu.:2017-11-30
#> Median :2017-12-17
#> Mean :2017-12-17
#> 3rd Qu.:2018-01-03
#> Max. :2018-01-21
#>
#> title
#> Selena Gomez, Marshmello - Wolves : 18
#> Cardi B - Bartier Cardi (feat. 21 Savage) [Official Audio] : 14
#> 2016 vs 2017 : 13
#> Cut for Time: Hallmark Channel Christmas Promo (James Franco) - SNL : 13
#> G-Eazy - No Limit REMIX ft. A$AP Rocky, Cardi B, French Montana, Juicy J, Belly: 13
#> GoPro: Gorilla Tickling at the GRACE Center : 13
#> (Other) :13316
#> channel_title category_id publish_time
#> NFL : 67 Min. : 1.00 Min. :2008-04-05 18:22:40
#> Refinery29 : 67 1st Qu.:17.00 1st Qu.:2017-11-25 20:52:30
#> Vox : 67 Median :24.00 Median :2017-12-13 02:23:04
#> First We Feast: 66 Mean :20.01 Mean :2017-11-18 18:46:37
#> ESPN : 64 3rd Qu.:25.00 3rd Qu.:2017-12-27 14:00:03
#> NBA : 63 Max. :43.00 Max. :2018-01-21 05:44:30
#> (Other) :13006
#> views likes dislikes comment_count
#> Min. : 687 Min. : 0 Min. : 0 Min. : 0.0
#> 1st Qu.: 91096 1st Qu.: 1987 1st Qu.: 85 1st Qu.: 283.8
#> Median : 309796 Median : 8907 Median : 322 Median : 1017.5
#> Mean : 1234583 Mean : 45662 Mean : 3352 Mean : 5982.4
#> 3rd Qu.: 996668 3rd Qu.: 29376 3rd Qu.: 1113 3rd Qu.: 3362.2
#> Max. :149376127 Max. :3093544 Max. :1674420 Max. :1361580.0
#>
#> comments_disabled ratings_disabled video_error_or_removed
#> Mode :logical Mode :logical Mode :logical
#> FALSE:13172 FALSE:13341 FALSE:13399
#> TRUE :228 TRUE :59 TRUE :1
#>
#>
#>
#>
From the summary above, we may conclude several things:
1. The earliest publish time was 2008-04-05 and the latest was 2018-01-21.
2. Some videos have 0 comments, and the maximum comment count was 1361580.
3. The minimum view count was 687 and the maximum was 149376127.
This is the theme that we will use in our DV LBB.
# Custom ggplot2 theme used by the plots in this report: dark plot and legend
# backgrounds, white text, a light grey panel, and dashed horizontal major
# grid lines only.
theme_algoritma <- theme(
  legend.key = element_rect(fill = "black"),
  legend.background = element_rect(color = "white", fill = "#263238"),
  plot.subtitle = element_text(size = 6, color = "white"),
  panel.background = element_rect(fill = "#dddddd"),
  panel.border = element_rect(fill = NA),
  panel.grid.minor.x = element_blank(),
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_line(color = "darkgrey", linetype = 2),
  panel.grid.minor.y = element_blank(),
  plot.background = element_rect(fill = "#263238"),
  text = element_text(color = "white"),
  axis.text = element_text(color = "white")
)

# Weekday of the publish time as a full (unabbreviated) label.
# TRUE/FALSE are spelled out because T and F can be reassigned.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)

# We derive this first to gain insight into the best time to upload a video.
# Hour of day at which each video was published.
vids[["publish_hour"]] <- hour(vids[["publish_time"]])

# Likes divided by views; this ratio is the y-axis of the box plots.
vids[["likes_per_views"]] <- with(vids, likes / views)
head(vids)

# Distribution of likes per views for each publish day.
ggplot(data = vids, mapping = aes(x = publish_day, y = likes_per_views)) +
  geom_boxplot(mapping = aes(fill = publish_day)) +
  labs(
    title = "Best Day To Publish Video",
    x = NULL,
    y = "Likes per Views",
    fill = "Publish Day"
  ) +
  theme_algoritma +
  theme(plot.title = element_text(hjust = 0.5))

# Interpretations:
We conclude that the best days to upload a video are Friday and Saturday,
because the medians on Friday and Saturday are higher than on the other days.
We already know the best day; now we need to know the best time to upload a video.
#' Map an hour of the day to a publish-time bucket.
#'
#' Vectorised: accepts a single hour or a whole vector of hours.
#'
#' @param x Numeric vector of hours.
#' @return Character vector the same length as `x`: "12am to 8am" for
#'   0 <= x <= 8, "8am to 3pm" for 8 < x <= 15, and "3pm to 12am" for any
#'   other value. `NA` hours give `NA` instead of raising an error.
convert_hour <- function(x) {
  # NOTE(review): hour 8 (08:00-08:59) is labelled "12am to 8am" and hour 15
  # (15:00-15:59) "8am to 3pm"; the cut points are kept exactly as written -
  # confirm they are intended.
  ifelse(
    x >= 0 & x <= 8,
    "12am to 8am",
    ifelse(x > 8 & x <= 15, "8am to 3pm", "3pm to 12am")
  )
}

# convert_hour() is vectorised, so it is applied to the whole column at once.
vids$publish_when <- convert_hour(vids$publish_hour)

# Here we assign each video to one of the time-of-day categories.
# Distribution of likes per views for each publish-time bucket; the x-axis
# order is fixed explicitly through the scale limits.
ggplot(data = vids, mapping = aes(x = publish_when, y = likes_per_views)) +
  geom_boxplot(mapping = aes(fill = publish_when)) +
  labs(
    title = "Best Hour To Publish Video",
    x = NULL,
    y = "Likes per Views",
    fill = "Publish Hour"
  ) +
  scale_x_discrete(limits = c("8am to 3pm", "3pm to 12am", "12am to 8am")) +
  theme_algoritma +
  theme(plot.title = element_text(hjust = 0.5))

# We conclude that the best time to upload a video is between 3pm and 12am.
Why? Looking at the median (the middle line inside each box), we can see that the median for 3pm to 12am is higher than for the other periods, so we conclude that this is the best time to upload a video.
We already know the best day and the best time to upload a video; now we need to know the best category.
# Glossary (dictionary) mapping each numeric category id to its name.
category_names <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)

# Replace every id with its name through a vectorised lookup. An id missing
# from the glossary becomes NA; with sapply(..., FUN = switch) an unmatched id
# yields NULL and silently turns the whole column into a list. unname() drops
# the id names so the column is a plain character vector.
vids$category_id <- unname(category_names[as.character(vids$category_id)])

# Here we switch every category number to its name, so that it is easy to see
# which category it is. We also need to make a new column for total view counts.
# Horizontal bars of likes per views by category.
# NOTE(review): geom_col() stacks one segment per row, so each bar's length is
# the sum of likes_per_views over all videos in the category rather than an
# average - confirm that this is the intended measure of "best category".
ggplot(data = vids, mapping = aes(x = likes_per_views, y = category_id)) +
  geom_col(mapping = aes(fill = category_id)) +
  labs(
    title = "Best Category",
    x = "Likes per Views",
    y = NULL,
    fill = "Category"
  ) +
  theme_algoritma +
  theme(plot.title = element_text(hjust = 0.5))

# Here we gain the insight that the Music category has the highest likes per
# views, followed by the Entertainment category.
This is an example of what I can do with basic data visualization. I hope you can gain useful information from my LBB. Thank you.