# Chunk options: hide messages and warnings, center figures, and prefix
# printed output with "#>".
# NOTE: the former `rm(list = ls())` was dropped. It deletes every object in
# the calling environment when the document is rendered from an interactive
# session, and it does not unload packages or reset options, so it gives no
# real isolation.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  comment = "#>"
)

# A large positive scipen biases number printing towards fixed notation
# instead of scientific notation.
options(scipen = 123)

1 Introduction

This is my LBB for data visualization with Team Algoritma. I hope you can gain some insight from it.

2 Library setup

Here we load all the libraries that we will use.

library(lubridate)
library(ggplot2)
library(scales)
library(ggthemes)

3 Data input

3.1 Data Input & Structure

# Load the trending-videos data; text columns are read in as factors.
vids <- read.csv("USvideos.csv", stringsAsFactors = TRUE)

Now we can inspect the data.

# Number of rows and columns in the data.
dim(vids)
#> [1] 13400    12

The data has 12 columns and 13400 rows.

# Preview the first rows of the data.
head(vids)

Next, we need to look at the data structure.

# Show each column's type together with a sample of its values.
str(vids)
#> 'data.frame':    13400 obs. of  12 variables:
#>  $ trending_date         : Factor w/ 67 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
#>  $ title                 : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
#>  $ channel_title         : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
#>  $ category_id           : int  22 24 23 24 24 28 24 28 1 25 ...
#>  $ publish_time          : Factor w/ 2903 levels "2008-04-05T18:22:40.000Z",..: 302 271 255 275 253 307 240 258 281 279 ...
#>  $ views                 : int  748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
#>  $ likes                 : int  57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
#>  $ dislikes              : int  2966 6146 5339 666 1989 511 2445 778 119 1363 ...
#>  $ comment_count         : int  15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
#>  $ comments_disabled     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  $ ratings_disabled      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  $ video_error_or_removed: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

We can see that a few data types are incorrect, so we need to change them.

# Turn the two date-like factor columns into proper date types.
# trending_date is written as year.day.month (e.g. "17.01.12"), hence ydm();
# publish_time is an ISO-8601 timestamp, parsed by ymd_hms() into POSIXct
# (lubridate's default time zone for this parser is UTC).
vids[["trending_date"]] <- ydm(vids[["trending_date"]])
vids[["publish_time"]] <- ymd_hms(vids[["publish_time"]])

# Re-check the structure to confirm the new column types.
str(vids)
#> 'data.frame':    13400 obs. of  12 variables:
#>  $ trending_date         : Date, format: "2017-11-14" "2017-11-14" ...
#>  $ title                 : Factor w/ 2986 levels "'I have dad moves': Barack Obama discusses dancing on David Letterman's new Netflix show",..: 2802 2574 2081 1903 1231 89 2164 143 2482 2920 ...
#>  $ channel_title         : Factor w/ 1408 levels "_¢_Á_\235","“÷\201\220µ_‘⬓_\220 Korean Englishman",..: 195 686 1046 472 902 559 1063 283 6 1358 ...
#>  $ category_id           : int  22 24 23 24 24 28 24 28 1 25 ...
#>  $ publish_time          : POSIXct, format: "2017-11-13 17:13:01" "2017-11-13 07:30:00" ...
#>  $ views                 : int  748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
#>  $ likes                 : int  57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
#>  $ dislikes              : int  2966 6146 5339 666 1989 511 2445 778 119 1363 ...
#>  $ comment_count         : int  15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
#>  $ comments_disabled     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  $ ratings_disabled      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  $ video_error_or_removed: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

3.1.1 Missing Value

We need to check whether there are any NA values in the data.

# TRUE if any value in the data frame is missing (NA).
anyNA(vids)
#> [1] FALSE

This confirms that there are no missing values in the data.

# Per-column summary statistics (ranges, quartiles, and level counts).
summary(vids)
#>  trending_date       
#>  Min.   :2017-11-14  
#>  1st Qu.:2017-11-30  
#>  Median :2017-12-17  
#>  Mean   :2017-12-17  
#>  3rd Qu.:2018-01-03  
#>  Max.   :2018-01-21  
#>                      
#>                                                                              title      
#>  Selena Gomez, Marshmello - Wolves                                              :   18  
#>  Cardi B - Bartier Cardi (feat. 21 Savage) [Official Audio]                     :   14  
#>  2016 vs 2017                                                                   :   13  
#>  Cut for Time: Hallmark Channel Christmas Promo (James Franco) - SNL            :   13  
#>  G-Eazy - No Limit REMIX ft. A$AP Rocky, Cardi B, French Montana, Juicy J, Belly:   13  
#>  GoPro: Gorilla Tickling at the GRACE Center                                    :   13  
#>  (Other)                                                                        :13316  
#>         channel_title    category_id     publish_time                
#>  NFL           :   67   Min.   : 1.00   Min.   :2008-04-05 18:22:40  
#>  Refinery29    :   67   1st Qu.:17.00   1st Qu.:2017-11-25 20:52:30  
#>  Vox           :   67   Median :24.00   Median :2017-12-13 02:23:04  
#>  First We Feast:   66   Mean   :20.01   Mean   :2017-11-18 18:46:37  
#>  ESPN          :   64   3rd Qu.:25.00   3rd Qu.:2017-12-27 14:00:03  
#>  NBA           :   63   Max.   :43.00   Max.   :2018-01-21 05:44:30  
#>  (Other)       :13006                                                
#>      views               likes            dislikes       comment_count      
#>  Min.   :      687   Min.   :      0   Min.   :      0   Min.   :      0.0  
#>  1st Qu.:    91096   1st Qu.:   1987   1st Qu.:     85   1st Qu.:    283.8  
#>  Median :   309796   Median :   8907   Median :    322   Median :   1017.5  
#>  Mean   :  1234583   Mean   :  45662   Mean   :   3352   Mean   :   5982.4  
#>  3rd Qu.:   996668   3rd Qu.:  29376   3rd Qu.:   1113   3rd Qu.:   3362.2  
#>  Max.   :149376127   Max.   :3093544   Max.   :1674420   Max.   :1361580.0  
#>                                                                             
#>  comments_disabled ratings_disabled video_error_or_removed
#>  Mode :logical     Mode :logical    Mode :logical         
#>  FALSE:13172       FALSE:13341      FALSE:13399           
#>  TRUE :228         TRUE :59         TRUE :1               
#>                                                           
#>                                                           
#>                                                           
#> 

From the summary above, we can conclude the following: 1. The earliest publish time was 2008-04-05 and the latest was 2018-01-21. 2. There were videos with 0 comments, and the maximum comment count was 1361580. 3. The minimum view count was 687 and the maximum was 149376127.

4 Study Case

This is the theme that we will use in our DV LBB.

# Shared dark theme for every plot in this document: dark slate plot
# background, light grey panel, white text, and dashed horizontal grid lines
# only.
theme_algoritma <- theme(
  # Text
  text = element_text(color = "white"),
  axis.text = element_text(color = "white"),
  plot.subtitle = element_text(size = 6, color = "white"),
  # Backgrounds and borders
  plot.background = element_rect(fill = "#263238"),
  panel.background = element_rect(fill = "#dddddd"),
  panel.border = element_rect(fill = NA),
  # Legend
  legend.background = element_rect(color = "white", fill = "#263238"),
  legend.key = element_rect(fill = "black"),
  # Grid: keep only the major horizontal lines
  panel.grid.major.y = element_line(color = "darkgrey", linetype = 2),
  panel.grid.minor.y = element_blank(),
  panel.grid.major.x = element_blank(),
  panel.grid.minor.x = element_blank()
)
# Weekday of publication as an ordered factor with full (unabbreviated)
# day names. TRUE/FALSE are spelled out because T and F can be reassigned.
vids$publish_day <- wday(vids$publish_time, label = TRUE, abbr = FALSE)

We extract the publish day and hour first, to gain insight into the best time to upload a video.

# Hour of the day at which each video was published.
vids[["publish_hour"]] <- hour(vids[["publish_time"]])

# Likes relative to views, used below as the popularity measure.
vids[["likes_per_views"]] <- with(vids, likes / views)

# Preview the data with the new columns.
head(vids)
  1. We want to know the best day to upload a video to gain popularity.
# Boxplot of the likes-per-views ratio for each weekday of publication,
# one filled box per day.
ggplot(data = vids, mapping = aes(x = publish_day, y = likes_per_views)) +
  geom_boxplot(mapping = aes(fill = publish_day)) +
  labs(
    title = "Best Day To Publish Video",
    x = NULL,
    y = "Likes per Views",
    fill = "Publish Day"
  ) +
  theme_algoritma +
  # Center the title.
  theme(plot.title = element_text(hjust = 0.5))

Interpretations :

We conclude that the best days to upload a video are Friday and Saturday.

The reason is that the medians for Friday and Saturday are higher than those of the other days.

  1. best time to upload video

We already know the best day; now we need to know the best time to upload a video.

# Map an hour of the day to one of three publishing-window labels.
#
# Args:
#   x: numeric vector of hours (a single hour works as well).
#
# Returns:
#   A character vector the same length as `x`: "12am to 8am" for hours in
#   [0, 8], "8am to 3pm" for hours in (8, 15], and "3pm to 12am" for every
#   other value. Missing hours yield NA_character_.
convert_hour <- function(x) {
  # Start from the fallback label, then overwrite the two bounded windows.
  label <- rep("3pm to 12am", length(x))
  label[which(x >= 0 & x <= 8)] <- "12am to 8am"
  label[which(x > 8 & x <= 15)] <- "8am to 3pm"

  # An unknown hour cannot be classified.
  label[is.na(x)] <- NA_character_

  label
}
# Label every row with its publishing window. vapply() guarantees a character
# vector with one string per hour, whereas sapply() picks its return type
# from the data.
vids$publish_when <- vapply(
  X = vids$publish_hour,
  FUN = convert_hour,
  FUN.VALUE = character(1)
)

Here, we assign each publish hour to a time-of-day category.

# Boxplot of the likes-per-views ratio for each publishing window.
ggplot(data = vids, mapping = aes(x = publish_when, y = likes_per_views)) +
  geom_boxplot(mapping = aes(fill = publish_when)) +
  # Order the windows explicitly rather than alphabetically.
  scale_x_discrete(limits = c("8am to 3pm", "3pm to 12am", "12am to 8am")) +
  labs(
    title = "Best Hour To Publish Video",
    x = NULL,
    y = "Likes per Views",
    fill = "Publish Hour"
  ) +
  theme_algoritma +
  # Center the title.
  theme(plot.title = element_text(hjust = 0.5))

We conclude that the best time to upload a video is between 3pm and 12am.

Why? Looking at the median (the middle line inside each box), we can see that the median for 3pm to 12am is higher than the others, so we conclude that this is the best time to upload a video.

  1. best category video to upload

We already know the best day and the best time to upload a video. Now we need to know the best category.

# Glossary (dictionary) from category id to category name.
category_names <- c(
  "1" = "Film and Animation",
  "2" = "Autos and Vehicles",
  "10" = "Music",
  "15" = "Pets and Animals",
  "17" = "Sports",
  "19" = "Travel and Events",
  "20" = "Gaming",
  "22" = "People and Blogs",
  "23" = "Comedy",
  "24" = "Entertainment",
  "25" = "News and Politics",
  "26" = "Howto and Style",
  "27" = "Education",
  "28" = "Science and Technology",
  "29" = "Nonprofit and Activism",
  "43" = "Shows"
)

# Replace each numeric id with its name by indexing the glossary with the id
# as a string. The column stays a plain character vector: ids missing from
# the glossary become NA, whereas sapply() over switch() returns NULL for
# them and silently turns the whole column into a list.
vids$category_id <- unname(category_names[as.character(vids$category_id)])

Here we replace every category number with its name, so that it is easy to see which category is which. We also need to make a new column for total view counts.

# Horizontal bar chart of likes per views by category.
# NOTE(review): geom_col() draws one stacked segment per row, so each bar's
# length looks like the sum of the per-video ratios in that category and
# grows with the number of videos. Confirm that a total, not a mean or
# median, is the intended measure.
ggplot(data = vids, mapping = aes(x = likes_per_views, y = category_id)) +
  geom_col(mapping = aes(fill = category_id)) +
  labs(
    title = "Best Category",
    x = "Likes per Views",
    y = NULL,
    fill = "Category"
  ) +
  theme_algoritma +
  # Center the title.
  theme(plot.title = element_text(hjust = 0.5))

Here, we see that the Music category has the highest likes per views, followed by the Entertainment category.

5 Closing

This is an example of what I can do with basic data visualization. I hope you can gain useful information from my LBB. Thank you.