#Welcome selamat datang pada LBB yang akan diproses oleh partisipan Algortima bernama Michael

#Import Data dan Inspection

sebelum memulai proses input, kita akan menginstall library packages yang akan dipakai

library(tidyverse)
library(plotly)
library(glue)
library(scales)
library(lubridate)
library(plotly)

selanjutnya kita akan mengimport data yang akan kita gunakan

youtube <- read_csv("youtubetrends.csv")

import data selesai, selanjutnya kita akan memeriksa data. apakah ada bagian yang tidak cocok

head(youtube)
## # A tibble: 6 x 16
##   trending_date title      channel_title  category_id publish_time         views
##   <date>        <chr>      <chr>          <chr>       <dttm>               <dbl>
## 1 2017-11-14    WE WANT T~ CaseyNeistat   People and~ 2017-11-13 12:13:01 7.48e5
## 2 2017-11-14    The Trump~ LastWeekTonig~ Entertainm~ 2017-11-13 02:30:00 2.42e6
## 3 2017-11-14    Racist Su~ Rudy Mancuso   Comedy      2017-11-12 14:05:24 3.19e6
## 4 2017-11-14    Nickelbac~ Good Mythical~ Entertainm~ 2017-11-13 06:00:04 3.43e5
## 5 2017-11-14    I Dare Yo~ nigahiga       Entertainm~ 2017-11-12 13:01:41 2.10e6
## 6 2017-11-14    2 Weeks w~ iJustine       Science an~ 2017-11-13 14:07:23 1.19e5
## # ... with 10 more variables: likes <dbl>, dislikes <dbl>, comment_count <dbl>,
## #   comments_disabled <lgl>, ratings_disabled <lgl>,
## #   video_error_or_removed <lgl>, publish_hour <dbl>, publish_when <chr>,
## #   publish_wday <chr>, timetotrend <chr>
tail(youtube)
## # A tibble: 6 x 16
##   trending_date title  channel_title category_id publish_time        views likes
##   <date>        <chr>  <chr>         <chr>       <dttm>              <dbl> <dbl>
## 1 2018-01-21    Short~ Washington P~ News and P~ 2018-01-19 23:43:42 13044    68
## 2 2018-01-21    Jessi~ Saturday Nig~ Entertainm~ 2018-01-21 00:44:30 23484   858
## 3 2018-01-21    Legen~ CBS Los Ange~ News and P~ 2018-01-19 21:10:07  3976    30
## 4 2018-01-21    Mulan~ Aymeric Fava~ Sports      2014-03-15 14:21:22 16231   103
## 5 2018-01-21    Urban~ Marcos Horac~ Sports      2018-01-19 12:26:11 13531   136
## 6 2018-01-21    GET R~ LaToya Forev~ Entertainm~ 2018-01-18 18:03:24 44754  3246
## # ... with 9 more variables: dislikes <dbl>, comment_count <dbl>,
## #   comments_disabled <lgl>, ratings_disabled <lgl>,
## #   video_error_or_removed <lgl>, publish_hour <dbl>, publish_when <chr>,
## #   publish_wday <chr>, timetotrend <chr>
dim(youtube)
## [1] 2986   16

disini kita bisa melihat kalau jumlah data memiliki baris = 2986 dan kolom = 16

names(youtube)
##  [1] "trending_date"          "title"                  "channel_title"         
##  [4] "category_id"            "publish_time"           "views"                 
##  [7] "likes"                  "dislikes"               "comment_count"         
## [10] "comments_disabled"      "ratings_disabled"       "video_error_or_removed"
## [13] "publish_hour"           "publish_when"           "publish_wday"          
## [16] "timetotrend"
glimpse(youtube)
## Rows: 2,986
## Columns: 16
## $ trending_date          <date> 2017-11-14, 2017-11-14, 2017-11-14, 2017-11-14~
## $ title                  <chr> "WE WANT TO TALK ABOUT OUR MARRIAGE", "The Trum~
## $ channel_title          <chr> "CaseyNeistat", "LastWeekTonight", "Rudy Mancus~
## $ category_id            <chr> "People and Blogs", "Entertainment", "Comedy", ~
## $ publish_time           <dttm> 2017-11-13 12:13:01, 2017-11-13 02:30:00, 2017~
## $ views                  <dbl> 748374, 2418783, 3191434, 343168, 2095731, 1191~
## $ likes                  <dbl> 57527, 97185, 146033, 10172, 132235, 9763, 1599~
## $ dislikes               <dbl> 2966, 6146, 5339, 666, 1989, 511, 2445, 778, 11~
## $ comment_count          <dbl> 15954, 12703, 8181, 2146, 17518, 1434, 1970, 34~
## $ comments_disabled      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ ratings_disabled       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ video_error_or_removed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ publish_hour           <dbl> 12, 2, 14, 6, 13, 14, 0, 16, 9, 8, 21, 22, 12, ~
## $ publish_when           <chr> "8am to 3pm", "12am to 8am", "8am to 3pm", "12a~
## $ publish_wday           <chr> "Monday", "Monday", "Sunday", "Monday", "Sunday~
## $ timetotrend            <chr> "1", "1", "2", "1", "2", "1", "2", "2", "1", "1~

akan merubah semua kategori data yang bersifat string menjadi kategori

youtube[,c("category_id","publish_when","publish_wday")] <- lapply(youtube[,c("category_id","publish_when","publish_wday")], FUN = as.factor)

youtube$timetotrend <- as.numeric(youtube$timetotrend)
summary(youtube)
##  trending_date           title           channel_title     
##  Min.   :2017-11-14   Length:2986        Length:2986       
##  1st Qu.:2017-11-27   Class :character   Class :character  
##  Median :2017-12-14   Mode  :character   Mode  :character  
##  Mean   :2017-12-15                                        
##  3rd Qu.:2018-01-04                                        
##  Max.   :2018-01-21                                        
##                                                            
##             category_id   publish_time                     views         
##  Entertainment    :736   Min.   :2008-04-05 14:22:40   Min.   :     687  
##  Music            :391   1st Qu.:2017-11-22 10:51:03   1st Qu.:   43014  
##  Howto and Style  :285   Median :2017-12-11 12:41:02   Median :  149196  
##  Comedy           :273   Mean   :2017-11-07 14:45:42   Mean   :  512602  
##  News and Politics:271   3rd Qu.:2018-01-01 10:22:33   3rd Qu.:  483848  
##  People and Blogs :228   Max.   :2018-01-21 00:44:30   Max.   :37736281  
##  (Other)          :802                                                   
##      likes              dislikes        comment_count    comments_disabled
##  Min.   :      0.0   Min.   :     0.0   Min.   :     0   Mode :logical    
##  1st Qu.:    931.2   1st Qu.:    39.0   1st Qu.:   152   FALSE:2942       
##  Median :   4785.0   Median :   148.5   Median :   589   TRUE :44         
##  Mean   :  25412.1   Mean   :  1257.7   Mean   :  3545                    
##  3rd Qu.:  17517.0   3rd Qu.:   541.0   3rd Qu.:  1964                    
##  Max.   :2055137.0   Max.   :629120.0   Max.   :733373                    
##                                                                           
##  ratings_disabled video_error_or_removed  publish_hour        publish_when 
##  Mode :logical    Mode :logical          Min.   : 0.00   12am to 8am: 552  
##  FALSE:2976       FALSE:2985             1st Qu.: 9.00   3pm to 12am: 795  
##  TRUE :10         TRUE :1                Median :12.00   8am to 3pm :1639  
##                                          Mean   :11.95                     
##                                          3rd Qu.:16.00                     
##                                          Max.   :23.00                     
##                                                                            
##     publish_wday  timetotrend  
##  Friday   :478   Min.   :0.00  
##  Monday   :415   1st Qu.:1.00  
##  Saturday :269   Median :1.00  
##  Sunday   :247   Mean   :1.99  
##  Thursday :490   3rd Qu.:2.00  
##  Tuesday  :561   Max.   :7.00  
##  Wednesday:526   NA's   :129

dari data diatas, kita bisa mengambil beberapa kesimpulan: 1. hari publikasi paling banyak adalah hari kamis 2. jam publikasi yang paling sering adalah jam 8 pagi hingga jam 3 siang(15.00 wib) 3. kategori video yang banyak diminati adalah kategori entertainment

#Missing data

setelah melihat dan memeriksa tipe data, kita akan memeriksa apakah ada missing value dalam data

youtube %>% is.na() %>% colSums()
##          trending_date                  title          channel_title 
##                      0                      0                      0 
##            category_id           publish_time                  views 
##                      0                      0                      0 
##                  likes               dislikes          comment_count 
##                      0                      0                      0 
##      comments_disabled       ratings_disabled video_error_or_removed 
##                      0                      0                      0 
##           publish_hour           publish_when           publish_wday 
##                      0                      0                      0 
##            timetotrend 
##                    129
anyNA(youtube)
## [1] TRUE

disini kita mendapatkan kalau time to trend memiliki missing value sebanyak 129

karena jumlah data yang missing value tidaklah banyak. kita akan memanipulasi 129 data menjadi nilai rata-rata

youtube$timetotrend <- youtube %>% 
  mutate(timetotrend = replace_na(timetotrend, 
                                          replace = mean(timetotrend, 
                                                         na.rm = T)))
youtube %>% is.na() %>% colSums()
##          trending_date                  title          channel_title 
##                      0                      0                      0 
##            category_id           publish_time                  views 
##                      0                      0                      0 
##                  likes               dislikes          comment_count 
##                      0                      0                      0 
##      comments_disabled       ratings_disabled video_error_or_removed 
##                      0                      0                      0 
##           publish_hour           publish_when           publish_wday 
##                      0                      0                      0 
##          trending_date                  title          channel_title 
##                      0                      0                      0 
##            category_id           publish_time                  views 
##                      0                      0                      0 
##                  likes               dislikes          comment_count 
##                      0                      0                      0 
##      comments_disabled       ratings_disabled video_error_or_removed 
##                      0                      0                      0 
##           publish_hour           publish_when           publish_wday 
##                      0                      0                      0 
##            timetotrend 
##                      0
anyNA(youtube)
## [1] FALSE

dengan disini kita sudah selesai dalam memanipulasi data yang ada.

#Study Case dan plotting

sebelum memulai, kita akan memakai theme custom algoritma

theme_algoritma <- theme(legend.key = element_rect(fill="black"),
           legend.background = element_rect(color="white", fill="#263238"),
           plot.subtitle = element_text(size=6, color="white"),
           panel.background = element_rect(fill="#dddddd"),
           panel.border = element_rect(fill=NA),
           panel.grid.minor.x = element_blank(),
           panel.grid.major.x = element_blank(),
           panel.grid.major.y = element_line(color="darkgrey", linetype=2),
           panel.grid.minor.y = element_blank(),
           plot.background = element_rect(fill="#263238"),
           text = element_text(color="white"),
           axis.text = element_text(color="white")
           
           )

kita ingin melihat video title dengan views terbanyak dan membuat plotnya.

view_ranking <- youtube %>% group_by(category_id) %>%
  summarise(most_view = sum(views)) %>%
  arrange(desc(most_view))

view_ranking
## # A tibble: 16 x 2
##    category_id            most_view
##    <fct>                      <dbl>
##  1 Entertainment          445350694
##  2 Music                  386197869
##  3 Comedy                 179900099
##  4 Howto and Style        110259469
##  5 Film and Animation      77476128
##  6 Sports                  77284746
##  7 People and Blogs        73512421
##  8 Science and Technology  59723097
##  9 News and Politics       44459409
## 10 Education               28597023
## 11 Gaming                  17830867
## 12 Pets and Animals        15830348
## 13 Autos and Vehicles       8528563
## 14 Travel and Events        5445076
## 15 Nonprofit and Activism    122659
## 16 Shows                     110112
plot_ranking <- view_ranking %>%
  ggplot(aes(x= most_view,
             y= reorder(category_id, most_view),
             text = glue("{category_id}
                         Total views :{most_view}")))+
  geom_col(fill= "steelblue")+
  geom_col(data = view_ranking %>% filter(category_id == "Entertainment"), fill = "yellow") +
  labs(title = "Kategori Video Dengan views Terbanyak",
       x= NULL,
       y= NULL) +
  scale_x_continuous(labels = comma)+
  theme_algoritma
ggplotly(plot_ranking, tooltip = "text")

selanjutnya kita ingin melihat hubungan jumlah rata-rata likes yang diperoleh dengan hari dipublish

view_enter <- youtube %>%
  group_by(publish_hour, publish_wday) %>% 
  summarise(Published.Video = n(),
            mean_likes = mean(likes)) %>% 
  arrange(desc(mean_likes))

view_enter
## # A tibble: 168 x 4
## # Groups:   publish_hour [24]
##    publish_hour publish_wday Published.Video mean_likes
##           <dbl> <fct>                  <int>      <dbl>
##  1            4 Friday                    10    291460.
##  2            6 Thursday                  14    126691.
##  3            8 Wednesday                 24    113533.
##  4            3 Monday                     8     97748.
##  5            4 Tuesday                   10     84358.
##  6           22 Thursday                   8     83808.
##  7           16 Sunday                    11     82311.
##  8            4 Thursday                   7     78444 
##  9           21 Thursday                  18     62912.
## 10           22 Friday                     6     62877.
## # ... with 158 more rows
plot_day <- view_enter %>% ggplot(aes(x= publish_wday, y= mean_likes,text = glue("mean likes :{mean_likes}")))+
  geom_boxplot(fill="pink")+
  labs(title = "Hari Publish Dengan Likes Tertinggi",
      x= NULL,
      y= NULL)+
  scale_y_continuous(labels = comma)+
  scale_x_discrete(limits=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))+
  theme_algoritma
ggplotly(plot_day, tooltip = "text")

disini kita bisa mendapatkan kesimpulan kalau hari kamis dan Jumat karena median yang didapat pada hari tersebut lebih tinggi daripada hari yang lain

selanjutnya kita akan melihat hubungan likes per view dengan dislikes per view

correlation <- youtube %>% 
  mutate(ratio_dislikes = dislikes / views)%>%
  mutate(ratio_likes = likes / views)%>%
  mutate(ratio_comments = comment_count/views)
plot_cor <- correlation %>%
  ggplot(aes(x = ratio_likes,y = ratio_comments))+ 
  geom_jitter(aes(col= ratio_dislikes,
                  text = glue("dislikes ratio{ratio_dislikes}"))) + 
  geom_smooth() +
  labs (x = "Likes per view", 
        y = "Comment per view", 
        title = "Hubungan likes per view dengan Comment per view", 
        size = "dislike per view")+
  theme_algoritma
ggplotly(plot_cor, tooltip = "text")

kita ingin melihat pada jam berapa publish yang bisa mendapatkan likes terbanyak

view_when <-youtube %>%
  group_by(publish_hour, publish_when) %>% 
  summarise(Published.Video = n(),
            mean_likes = mean(likes)) %>% 
  arrange(desc(mean_likes))

view_when
## # A tibble: 24 x 4
## # Groups:   publish_hour [24]
##    publish_hour publish_when Published.Video mean_likes
##           <dbl> <fct>                  <int>      <dbl>
##  1            4 12am to 8am               38    118354.
##  2            0 12am to 8am              128     42123.
##  3            8 8am to 3pm               129     38718.
##  4           16 3pm to 12am              115     38401.
##  5            6 12am to 8am               71     31990.
##  6           14 8am to 3pm               159     29343.
##  7           22 3pm to 12am               74     28421.
##  8            1 12am to 8am               49     27383.
##  9           19 3pm to 12am              102     27006.
## 10            3 12am to 8am               87     25463.
## # ... with 14 more rows
plot_when <- view_when %>% ggplot(aes(x= publish_when, y= mean_likes,text = glue("mean likes :{mean_likes}")))+
  geom_boxplot(fill="pink")+
  labs(title = "Waktu Publish Dengan Likes Tertinggi",
      x= NULL,
      y= NULL)+
  scale_y_continuous(labels = comma)+
  scale_x_discrete(limits=c("8am to 3pm", "3pm to 12am", "12am to 8am"))+
  theme_algoritma

plot_when

ggplotly(plot_when, tooltip = "text")