#Welcome to this LBB, worked on by Algoritma participant Michael
#Import Data and Inspection
Before starting, we load the library packages that will be used throughout this analysis.
library(tidyverse) # data wrangling (dplyr, tidyr, readr) and ggplot2
library(plotly)    # interactive plots via ggplotly()
library(glue)      # string interpolation for plot tooltips
library(scales)    # axis label formatting (comma, percent)
library(lubridate) # date-time helpers
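Note that library() only loads packages that are already installed; any missing package needs a one-time install first. A minimal sketch:
# One-time install for any required packages that are not yet present (sketch).
pkgs <- c("tidyverse", "plotly", "glue", "scales", "lubridate")
install.packages(setdiff(pkgs, rownames(installed.packages())))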
Next, we import the data that we will use.
youtube <- read_csv("youtubetrends.csv")
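As an aside, read_csv() guesses column types from the first rows of the file; if those guesses ever drift, the key types can be pinned explicitly. A sketch mirroring the types reported by glimpse() below:
# Optional: pin the important column types instead of relying on type guessing.
youtube <- read_csv("youtubetrends.csv",
                    col_types = cols(trending_date = col_date(),
                                     publish_time = col_datetime(),
                                     .default = col_guess()))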
The import is complete; next we inspect the data to check whether anything looks inconsistent.
head(youtube)
## # A tibble: 6 x 16
## trending_date title channel_title category_id publish_time views
## <date> <chr> <chr> <chr> <dttm> <dbl>
## 1 2017-11-14 WE WANT T~ CaseyNeistat People and~ 2017-11-13 12:13:01 7.48e5
## 2 2017-11-14 The Trump~ LastWeekTonig~ Entertainm~ 2017-11-13 02:30:00 2.42e6
## 3 2017-11-14 Racist Su~ Rudy Mancuso Comedy 2017-11-12 14:05:24 3.19e6
## 4 2017-11-14 Nickelbac~ Good Mythical~ Entertainm~ 2017-11-13 06:00:04 3.43e5
## 5 2017-11-14 I Dare Yo~ nigahiga Entertainm~ 2017-11-12 13:01:41 2.10e6
## 6 2017-11-14 2 Weeks w~ iJustine Science an~ 2017-11-13 14:07:23 1.19e5
## # ... with 10 more variables: likes <dbl>, dislikes <dbl>, comment_count <dbl>,
## # comments_disabled <lgl>, ratings_disabled <lgl>,
## # video_error_or_removed <lgl>, publish_hour <dbl>, publish_when <chr>,
## # publish_wday <chr>, timetotrend <chr>
tail(youtube)
## # A tibble: 6 x 16
## trending_date title channel_title category_id publish_time views likes
## <date> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 2018-01-21 Short~ Washington P~ News and P~ 2018-01-19 23:43:42 13044 68
## 2 2018-01-21 Jessi~ Saturday Nig~ Entertainm~ 2018-01-21 00:44:30 23484 858
## 3 2018-01-21 Legen~ CBS Los Ange~ News and P~ 2018-01-19 21:10:07 3976 30
## 4 2018-01-21 Mulan~ Aymeric Fava~ Sports 2014-03-15 14:21:22 16231 103
## 5 2018-01-21 Urban~ Marcos Horac~ Sports 2018-01-19 12:26:11 13531 136
## 6 2018-01-21 GET R~ LaToya Forev~ Entertainm~ 2018-01-18 18:03:24 44754 3246
## # ... with 9 more variables: dislikes <dbl>, comment_count <dbl>,
## # comments_disabled <lgl>, ratings_disabled <lgl>,
## # video_error_or_removed <lgl>, publish_hour <dbl>, publish_when <chr>,
## # publish_wday <chr>, timetotrend <chr>
dim(youtube)
## [1] 2986 16
Here we can see that the data has 2,986 rows and 16 columns.
names(youtube)
## [1] "trending_date" "title" "channel_title"
## [4] "category_id" "publish_time" "views"
## [7] "likes" "dislikes" "comment_count"
## [10] "comments_disabled" "ratings_disabled" "video_error_or_removed"
## [13] "publish_hour" "publish_when" "publish_wday"
## [16] "timetotrend"
glimpse(youtube)
## Rows: 2,986
## Columns: 16
## $ trending_date <date> 2017-11-14, 2017-11-14, 2017-11-14, 2017-11-14~
## $ title <chr> "WE WANT TO TALK ABOUT OUR MARRIAGE", "The Trum~
## $ channel_title <chr> "CaseyNeistat", "LastWeekTonight", "Rudy Mancus~
## $ category_id <chr> "People and Blogs", "Entertainment", "Comedy", ~
## $ publish_time <dttm> 2017-11-13 12:13:01, 2017-11-13 02:30:00, 2017~
## $ views <dbl> 748374, 2418783, 3191434, 343168, 2095731, 1191~
## $ likes <dbl> 57527, 97185, 146033, 10172, 132235, 9763, 1599~
## $ dislikes <dbl> 2966, 6146, 5339, 666, 1989, 511, 2445, 778, 11~
## $ comment_count <dbl> 15954, 12703, 8181, 2146, 17518, 1434, 1970, 34~
## $ comments_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ ratings_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ video_error_or_removed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE~
## $ publish_hour <dbl> 12, 2, 14, 6, 13, 14, 0, 16, 9, 8, 21, 22, 12, ~
## $ publish_when <chr> "8am to 3pm", "12am to 8am", "8am to 3pm", "12a~
## $ publish_wday <chr> "Monday", "Monday", "Sunday", "Monday", "Sunday~
## $ timetotrend <chr> "1", "1", "2", "1", "2", "1", "2", "2", "1", "1~
Next we convert the string columns that actually represent categories (category_id, publish_when, publish_wday) into factors, and timetotrend into a numeric column.
youtube[,c("category_id","publish_when","publish_wday")] <- lapply(youtube[,c("category_id","publish_when","publish_wday")], FUN = as.factor)
youtube$timetotrend <- as.numeric(youtube$timetotrend)
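Equivalently, assuming dplyr >= 1.0 is available, the same conversion can be written as a single mutate() with across():
# Tidyverse form of the same conversion: three factor columns plus numeric timetotrend.
youtube <- youtube %>%
  mutate(across(c(category_id, publish_when, publish_wday), as.factor),
         timetotrend = as.numeric(timetotrend))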
summary(youtube)
## trending_date title channel_title
## Min. :2017-11-14 Length:2986 Length:2986
## 1st Qu.:2017-11-27 Class :character Class :character
## Median :2017-12-14 Mode :character Mode :character
## Mean :2017-12-15
## 3rd Qu.:2018-01-04
## Max. :2018-01-21
##
## category_id publish_time views
## Entertainment :736 Min. :2008-04-05 14:22:40 Min. : 687
## Music :391 1st Qu.:2017-11-22 10:51:03 1st Qu.: 43014
## Howto and Style :285 Median :2017-12-11 12:41:02 Median : 149196
## Comedy :273 Mean :2017-11-07 14:45:42 Mean : 512602
## News and Politics:271 3rd Qu.:2018-01-01 10:22:33 3rd Qu.: 483848
## People and Blogs :228 Max. :2018-01-21 00:44:30 Max. :37736281
## (Other) :802
## likes dislikes comment_count comments_disabled
## Min. : 0.0 Min. : 0.0 Min. : 0 Mode :logical
## 1st Qu.: 931.2 1st Qu.: 39.0 1st Qu.: 152 FALSE:2942
## Median : 4785.0 Median : 148.5 Median : 589 TRUE :44
## Mean : 25412.1 Mean : 1257.7 Mean : 3545
## 3rd Qu.: 17517.0 3rd Qu.: 541.0 3rd Qu.: 1964
## Max. :2055137.0 Max. :629120.0 Max. :733373
##
## ratings_disabled video_error_or_removed publish_hour publish_when
## Mode :logical Mode :logical Min. : 0.00 12am to 8am: 552
## FALSE:2976 FALSE:2985 1st Qu.: 9.00 3pm to 12am: 795
## TRUE :10 TRUE :1 Median :12.00 8am to 3pm :1639
## Mean :11.95
## 3rd Qu.:16.00
## Max. :23.00
##
## publish_wday timetotrend
## Friday :478 Min. :0.00
## Monday :415 1st Qu.:1.00
## Saturday :269 Median :1.00
## Sunday :247 Mean :1.99
## Thursday :490 3rd Qu.:2.00
## Tuesday :561 Max. :7.00
## Wednesday:526 NA's :129
From the summary above we can draw a few conclusions:
1. the most frequent publish day is Tuesday (561 videos), followed by Wednesday (526) and Thursday (490);
2. the most common publish window is 8am to 3pm (1,639 videos);
3. the category that appears most often among trending videos is Entertainment (736 videos).
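As a sanity check on the first point, we can tabulate publish days directly (output not shown):
# Count trending videos per publish day, most frequent first.
youtube %>% count(publish_wday, sort = TRUE)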
#Missing data
Having inspected the data types, we now check whether the data contains any missing values.
youtube %>% is.na() %>% colSums()
## trending_date title channel_title
## 0 0 0
## category_id publish_time views
## 0 0 0
## likes dislikes comment_count
## 0 0 0
## comments_disabled ratings_disabled video_error_or_removed
## 0 0 0
## publish_hour publish_when publish_wday
## 0 0 0
## timetotrend
## 129
anyNA(youtube)
## [1] TRUE
Here we find that timetotrend has 129 missing values. Since that is only a small share of the 2,986 rows, we will impute those 129 values with the column mean.
# Assign the mutated tibble back to youtube (not to a single column).
youtube <- youtube %>%
  mutate(timetotrend = replace_na(timetotrend,
                                  replace = mean(timetotrend, na.rm = TRUE)))
youtube %>% is.na() %>% colSums()
## trending_date title channel_title
## 0 0 0
## category_id publish_time views
## 0 0 0
## likes dislikes comment_count
## 0 0 0
## comments_disabled ratings_disabled video_error_or_removed
## 0 0 0
## publish_hour publish_when publish_wday
## 0 0 0
## timetotrend
## 0
anyNA(youtube)
## [1] FALSE
With that, the data cleaning step is complete.
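As a side note, the summary above shows timetotrend is right-skewed (median 1, max 7), so the median would be a slightly more robust imputation value than the mean. A minimal sketch of that alternative (youtube_alt is an illustrative name; it only differs if run before the mean imputation above):
# Alternative: impute with the median, which is less sensitive to outliers.
youtube_alt <- youtube %>%
  mutate(timetotrend = replace_na(timetotrend, median(timetotrend, na.rm = TRUE)))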
#Study Case and Plotting
Before plotting, we define the custom Algoritma theme that will be applied to every chart.
# Custom dark theme: light grey panels on a dark (#263238) background,
# white text, and horizontal grid lines only.
theme_algoritma <- theme(legend.key = element_rect(fill = "black"),
                         legend.background = element_rect(color = "white", fill = "#263238"),
                         plot.subtitle = element_text(size = 6, color = "white"),
                         panel.background = element_rect(fill = "#dddddd"),
                         panel.border = element_rect(fill = NA),
                         panel.grid.minor.x = element_blank(),
                         panel.grid.major.x = element_blank(),
                         panel.grid.major.y = element_line(color = "darkgrey", linetype = 2),
                         panel.grid.minor.y = element_blank(),
                         plot.background = element_rect(fill = "#263238"),
                         text = element_text(color = "white"),
                         axis.text = element_text(color = "white"))
We want to see which video categories collect the most total views and plot the ranking.
view_ranking <- youtube %>%
  group_by(category_id) %>%
  summarise(most_view = sum(views)) %>%
  arrange(desc(most_view))
view_ranking
## # A tibble: 16 x 2
## category_id most_view
## <fct> <dbl>
## 1 Entertainment 445350694
## 2 Music 386197869
## 3 Comedy 179900099
## 4 Howto and Style 110259469
## 5 Film and Animation 77476128
## 6 Sports 77284746
## 7 People and Blogs 73512421
## 8 Science and Technology 59723097
## 9 News and Politics 44459409
## 10 Education 28597023
## 11 Gaming 17830867
## 12 Pets and Animals 15830348
## 13 Autos and Vehicles 8528563
## 14 Travel and Events 5445076
## 15 Nonprofit and Activism 122659
## 16 Shows 110112
plot_ranking <- view_ranking %>%
  ggplot(aes(x = most_view,
             y = reorder(category_id, most_view),
             text = glue("{category_id}
Total views: {most_view}"))) +
  geom_col(fill = "steelblue") +
  # Re-draw the top category (Entertainment) in yellow to highlight it.
  geom_col(data = view_ranking %>% filter(category_id == "Entertainment"),
           fill = "yellow") +
  labs(title = "Video Categories with the Most Views",
       x = NULL,
       y = NULL) +
  scale_x_continuous(labels = comma) +
  theme_algoritma
ggplotly(plot_ranking, tooltip = "text")
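For additional context, each category's share of all trending views can be computed directly from view_ranking; a small sketch (the share column name is illustrative) using scales::percent:
# Express each category's views as a share of the total (sketch; output not shown).
view_ranking %>%
  mutate(share = percent(most_view / sum(most_view), accuracy = 0.1))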
Next, we want to look at the relationship between the average number of likes received and the day of publication.
view_enter <- youtube %>%
  group_by(publish_hour, publish_wday) %>%
  summarise(Published.Video = n(),
            mean_likes = mean(likes)) %>%
  arrange(desc(mean_likes))
view_enter
## # A tibble: 168 x 4
## # Groups: publish_hour [24]
## publish_hour publish_wday Published.Video mean_likes
## <dbl> <fct> <int> <dbl>
## 1 4 Friday 10 291460.
## 2 6 Thursday 14 126691.
## 3 8 Wednesday 24 113533.
## 4 3 Monday 8 97748.
## 5 4 Tuesday 10 84358.
## 6 22 Thursday 8 83808.
## 7 16 Sunday 11 82311.
## 8 4 Thursday 7 78444
## 9 21 Thursday 18 62912.
## 10 22 Friday 6 62877.
## # ... with 158 more rows
plot_day <- view_enter %>%
  ggplot(aes(x = publish_wday, y = mean_likes,
             text = glue("mean likes: {mean_likes}"))) +
  geom_boxplot(fill = "pink") +
  labs(title = "Publish Day with the Highest Likes",
       x = NULL,
       y = NULL) +
  scale_y_continuous(labels = comma) +
  scale_x_discrete(limits = c("Monday", "Tuesday", "Wednesday", "Thursday",
                              "Friday", "Saturday", "Sunday")) +
  theme_algoritma
ggplotly(plot_day, tooltip = "text")
From the boxplot we can conclude that Thursday and Friday are the strongest publish days, since their medians are higher than those of the other days.
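To read those medians off directly rather than estimating them from the boxplot, a quick check (output not shown):
# Median of the hourly mean likes for each weekday, highest first.
view_enter %>%
  group_by(publish_wday) %>%
  summarise(median_likes = median(mean_likes)) %>%
  arrange(desc(median_likes))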
Next we will look at the relationship between likes per view and comments per view, with dislikes per view mapped to point color.
correlation <- youtube %>%
  mutate(ratio_likes = likes / views,
         ratio_dislikes = dislikes / views,
         ratio_comments = comment_count / views)
plot_cor <- correlation %>%
  ggplot(aes(x = ratio_likes, y = ratio_comments)) +
  geom_jitter(aes(col = ratio_dislikes,
                  text = glue("dislikes ratio: {ratio_dislikes}"))) +
  geom_smooth() +
  labs(x = "Likes per view",
       y = "Comments per view",
       title = "Likes per View versus Comments per View",
       col = "dislikes per view") +
  theme_algoritma
ggplotly(plot_cor, tooltip = "text")
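The visual trend from geom_smooth() can be complemented with a single number; a quick check (output not shown):
# Pearson correlation between likes per view and comments per view.
cor(correlation$ratio_likes, correlation$ratio_comments)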
Finally, we want to see at which publish time slot videos collect the most likes.
view_when <- youtube %>%
  group_by(publish_hour, publish_when) %>%
  summarise(Published.Video = n(),
            mean_likes = mean(likes)) %>%
  arrange(desc(mean_likes))
view_when
## # A tibble: 24 x 4
## # Groups: publish_hour [24]
## publish_hour publish_when Published.Video mean_likes
## <dbl> <fct> <int> <dbl>
## 1 4 12am to 8am 38 118354.
## 2 0 12am to 8am 128 42123.
## 3 8 8am to 3pm 129 38718.
## 4 16 3pm to 12am 115 38401.
## 5 6 12am to 8am 71 31990.
## 6 14 8am to 3pm 159 29343.
## 7 22 3pm to 12am 74 28421.
## 8 1 12am to 8am 49 27383.
## 9 19 3pm to 12am 102 27006.
## 10 3 12am to 8am 87 25463.
## # ... with 14 more rows
plot_when <- view_when %>%
  ggplot(aes(x = publish_when, y = mean_likes,
             text = glue("mean likes: {mean_likes}"))) +
  geom_boxplot(fill = "pink") +
  labs(title = "Publish Time with the Highest Likes",
       x = NULL,
       y = NULL) +
  scale_y_continuous(labels = comma) +
  scale_x_discrete(limits = c("8am to 3pm", "3pm to 12am", "12am to 8am")) +
  theme_algoritma
ggplotly(plot_when, tooltip = "text")