1 Explanation

1.1 Brief

Hi !! Welcome to my LBB :)
In this LBB, I will use data USvideos.csv
I hope you enjoy it !

1.2 Data’s Point of View

This data is contains of video trending YouTube records sales record included the title, channel title, publish time, etc.
The first thing i need to do is load all package tht might be needed for this dataset.\

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)
library(leaflet)
library(scales)
library(tidyr)
library(colorspace) 
library(ggthemes)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Note : Detail explanatory will be given at the end of content

2 Data

2.1 Input Data

Make sure our data placed within folder of our R project data.

data <- read.csv("data_input/USvideos.csv")
head(data)

Input data is DONE ! then let’s get started

2.2 Data Inspection

head(data)

dim(data)

## [1] 13400    12

names(data)

##  [1] "trending_date"          "title"                  "channel_title"         
##  [4] "category_id"            "publish_time"           "views"                 
##  [7] "likes"                  "dislikes"               "comment_count"         
## [10] "comments_disabled"      "ratings_disabled"       "video_error_or_removed"

From our inspection we can conclude :
* Data contains 13400 of rows and 12 of coloumns
* Column Description

trending_date : trending date
title : video title
channel_title : channel title
category_id : video category
publish_time: publish time
views : views count
likes : likes count
dislikes : dislikes count
comment_count : comments count
comment_disabled : is comments disabled
rating_disabled : is rating disabled
video_error_or_removed: is video error or removed

2.3 Data Cleansing & Coertions

From our observation, these are the columns that we want to removed

comment_disabled
rating_disabled
video_error_or_removed

From our observation, these are the columns that we want to fix

trending_date
publish_time
category_id

2.3.1 Remove unwanted columns

data <- data[ , 1:9]
head(data)

2.3.2 Fixing category_id as factor

switch.category <- function(x){
  x <- as.character(x)
  y <- switch(x,
              "1" = "Film and Animation",
              "2" = "Autos and Vehicles", 
             "10" = "Music", 
             "15" = "Pets and Animals", 
             "17" = "Sports",
             "19" = "Travel and Events", 
             "20" = "Gaming", 
             "22" = "People and Blogs", 
             "23" = "Comedy",
             "24" = "Entertainment", 
             "25" = "News and Politics",
             "26" = "Howto and Style", 
             "27" = "Education",
             "28" = "Science and Technology", 
             "29" = "Nonprofit and Activism",
             "43" = "Shows")
  return(y)
}

data$category_id <- as.factor(sapply(data$category_id, FUN = switch.category))
head(data$category_id)

## [1] People and Blogs       Entertainment          Comedy                
## [4] Entertainment          Entertainment          Science and Technology
## 16 Levels: Autos and Vehicles Comedy Education ... Travel and Events

2.3.3 Formatting data with format date using `lubridate`

data$trending_date <- ydm(data$trending_date)
data$publish_time <- ymd_hms(data$publish_time)

head(data)

2.3.4 Remove repeating data

match.row <- match(unique(data$title), data$title)
vids <- data[match.row, ]
head(vids,100)

3 Study Case

1. What is the top 3 YouTube Channel with most trending videos?

agg <- aggregate(title ~ category_id, data = vids, FUN = length)

table_channel <- sort(table(vids$channel_title), decreasing = T)
vids.tab <- as.data.frame(table_channel)

names(vids.tab) <- c("title", "total_videos")
vids.tab.10 <- vids.tab[1:10,]

ggplot(data = vids.tab.10, mapping = aes(y = reorder(title,
                                                     total_videos), 
                                         x = total_videos)) + 
  geom_col() +
  labs(title = "Top Trending YouTube Channel",
       x = "Total Videos",
       y = NULL)

Based on visualization above, the top 3 trending YouTube Channel is Refinery29, The Tonight Show Starring Jimmy Fallon and Vox

2. When is the video most often uploaded? ?

vids$publish_hour <- hour(vids$publish_time)
agg <- aggregate(title ~ publish_hour, data = vids, FUN = length)

table_hour <- sort(table(vids$publish_hour), decreasing = T)
vids.tab <- as.data.frame(table_hour)

names(vids.tab) <- c("publish_hour", "total_videos")

ggplot(data = vids.tab, mapping = aes(x = reorder(publish_hour,
                                                     -total_videos), 
                                         y = total_videos)) + 
  geom_col() +
  labs(title = "",
       x = "Total Videos",
       y = NULL)

Based on visualization above, the trending video most often uploaded at 4 pm

3. Create visualization about relationship between likes ratio for top 3 trending category !

agg <- aggregate(title ~ category_id, data = vids, FUN = length)
top.category <- agg[order(agg$title, decreasing = T), ]$category_id[1:3]

vids.top <- vids[vids$category_id %in% top.category, ]

vids.top$category_id <- droplevels(vids.top$category_id)
vids.top$likesp <- vids.top$likes / vids.top$views 

ggplot(data = vids.top, aes(x = category_id, y = likesp)) +
  geom_boxplot(aes(fill = category_id), 
               color = "gray", outlier.shape = NA) +
  geom_jitter(aes(color = category_id), alpha = 0.3, show.legend = F) +
  labs(title = "Likes Ratio", subtitle = "Top 3 Categories", 
       x = "Likes Ratio", y = "Category", caption = "Source : YouTube", fill = "Category")+
  
   scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, 0.2)) +
  scale_fill_manual(values = c("#1C315E", "#227C70", "#88A47C"))+
  theme_dark()

## Warning: Removed 8 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 10 rows containing missing values (`geom_point()`).

4. Create visualization about trending average viewers on “Entertainment”, “Music” and “Howto and Style” !

vids.cat.custom <- vids[vids$category_id %in% 
                          c("Entertainment","Music", "Howto and Style"),]

vids.emh <- aggregate(views ~ category_id + publish_hour , 
          data = vids.cat.custom, 
          FUN = "mean")
ggplot(data = vids.emh , mapping = aes(x = publish_hour , 
                                       y = views)) +
  geom_line(mapping = aes(color = category_id), size = 1.5) + 
  scale_y_continuous(label = comma) + 
  labs(title = "Average Views per Hours",
       x = "Publish Hour",
       y = "Mean Views",
       color = "Category",
       caption = "Source : Youtube") + 
  theme(legend.position = "top",
        plot.title = element_text(hjust = 0.5, face = "bold"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

4 Business Recomendation

As a beginner YouTuber, these are the recommendations for you:
1. You are recommended to choose either Entertainment, Music‘or’Howto and Style as your video category
2. You are recommended to upload your video between 2pm until 6pm as hot time to publish

LBB Data Visualitation

Heinz Metrosan Donradt S

2023-02-26

1 Explanation

1.1 Brief

1.2 Data’s Point of View

2 Data

2.1 Input Data

2.2 Data Inspection

2.3 Data Cleansing & Coertions

2.3.1 Remove unwanted columns

2.3.2 Fixing category_id as factor

2.3.3 Formatting data with format date using `lubridate`

2.3.4 Remove repeating data

3 Study Case

4 Business Recomendation

LBB Data Visualitation

Heinz Metrosan Donradt S

2023-02-26

1 Explanation

1.1 Brief

1.2 Data’s Point of View

2 Data

2.1 Input Data

2.2 Data Inspection

2.3 Data Cleansing & Coertions

2.3.1 Remove unwanted columns

2.3.2 Fixing category_id as factor

2.3.3 Formatting data with format date using lubridate

2.3.4 Remove repeating data

3 Study Case

4 Business Recomendation

2.3.3 Formatting data with format date using `lubridate`