Hello, welcome to my Rmd!
I’ll be using Netflixglobal.csv data in this LBB
The Netflixglobal.csv data describes Netflix’s trending videos.
Let’s dig deeper into the data and name the object Netflix!
First, we can input the data
# Load the packages used by the rest of the analysis before anything else runs.
library(tidyverse)
library(lubridate)

# Read the raw data into the `Netflix` data frame.
Netflix <- read.csv("data_input/Netflixglobal.csv")

# Then, using the functions head() and tail(), we will see the first rows and
# the last rows of the data.
head(Netflix)
tail(Netflix)

# Check the dimensions of the data:
dim(Netflix)
#> [1] 1160 7
Check the name of each column:
# List the column names of the data frame; the recorded console output follows.
names(Netflix)
#> [1] "week" "category"
#> [3] "weekly_rank" "show_title"
#> [5] "season_title" "weekly_hours_viewed"
#> [7] "cumulative_weeks_in_top_10"
From the inspection of the data, we can conclude:
- The Netflix data contains 1160 rows and 7 columns
- The column names are: “week”, “category”, “weekly_rank”, “show_title”, “season_title”, “weekly_hours_viewed”, “cumulative_weeks_in_top_10”
Check data types:
# Show the structure (type and first values) of every column; the recorded
# console output follows.
str(Netflix)
#> 'data.frame': 1160 obs. of 7 variables:
#> $ week : chr "2022-01-16" "2022-01-16" "2022-01-16" "2022-01-16" ...
#> $ category : chr "Films (English)" "Films (English)" "Films (English)" "Films (English)" ...
#> $ weekly_rank : int 1 2 3 4 5 6 7 8 9 10 ...
#> $ show_title : chr "Brazen" "Don't Look Up" "Mother/Android" "The Secret Life of Pets 2" ...
#> $ season_title : chr "" "" "" "" ...
#> $ weekly_hours_viewed : int 45340000 28390000 23170000 9390000 8790000 8710000 8710000 8470000 7860000 7000000 ...
#> $ cumulative_weeks_in_top_10: int 1 4 2 1 3 10 6 1 1 1 ...
We can see that some columns have data types that do not match their contents.
So let’s change those data types first!
# Fix the column types with dplyr: `category` becomes a factor and `week`
# (read in as "YYYY-MM-DD" text) becomes a Date.
Netflix <- Netflix %>%
  mutate(category = as.factor(category)) %>%
  mutate(week = as.Date(week))

# Confirm the new column types; the recorded console output follows.
glimpse(Netflix)
#> Rows: 1,160
#> Columns: 7
#> $ week <date> 2022-01-16, 2022-01-16, 2022-01-16, 2022-0…
#> $ category <fct> Films (English), Films (English), Films (En…
#> $ weekly_rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, …
#> $ show_title <chr> "Brazen", "Don't Look Up", "Mother/Android"…
#> $ season_title <chr> "", "", "", "", "", "", "", "", "", "", "",…
#> $ weekly_hours_viewed <int> 45340000, 28390000, 23170000, 9390000, 8790…
#> $ cumulative_weeks_in_top_10 <int> 1, 4, 2, 1, 3, 10, 6, 1, 1, 1, 2, 2, 2, 2, …
The data type change was successful! We can continue by looking for missing values.
Check for missing values in the whole data set:
# TRUE if the data frame contains any NA; the recorded output is FALSE.
anyNA(Netflix)
#> [1] FALSE
Check for missing values in each column:
# Count the NA values per column; the recorded console output follows.
colSums(is.na(Netflix))
#> week category
#> 0 0
#> weekly_rank show_title
#> 0 0
#> season_title weekly_hours_viewed
#> 0 0
#> cumulative_weeks_in_top_10
#> 0
Good! There are no missing values.
Now the Netflix dataset is ready to be processed and analyzed.
# Keep only the two columns used by the "weeks in top 10" boxplots.
topten <- Netflix %>%
  select(category, cumulative_weeks_in_top_10)
topten

# Add the calendar year of each `week` (lubridate::year); the `case1` summary
# filters on this column.
Netflix$year <- year(Netflix$week)
head(Netflix$year)
#> [1] 2022 2022 2022 2022 2022 2022
# Mean weekly hours viewed per category, restricted to rows from 2021 and
# sorted from the highest mean to the lowest.
case1 <- filter(Netflix, year == 2021) %>%
  group_by(category) %>%
  summarise(mean_viewed = mean(weekly_hours_viewed)) %>%
  ungroup() %>%
  arrange(desc(mean_viewed))
# Print the per-category means for 2021.
case1

# Films (English or Non-English) that ranked in the weekly top 3 with at least
# 50 million hours viewed that week; for each title keep its best single week,
# sorted from the highest value to the lowest.
case2 <- Netflix %>%
  filter(
    category %in% c("Films (English)", "Films (Non-English)"),
    weekly_rank <= 3,
    weekly_hours_viewed >= 50000000
  ) %>%
  group_by(show_title) %>%
  summarise(max_viewed = max(weekly_hours_viewed)) %>%
  ungroup() %>%
  arrange(desc(max_viewed))
# Print the film ranking.
case2

# TV titles (English or Non-English) that ranked in the weekly top 3 with at
# least 100 million hours viewed that week; for each title keep its best single
# week, sorted from the highest value to the lowest.
case3 <- Netflix %>%
  filter(
    category %in% c("TV (English)", "TV (Non-English)"),
    weekly_rank <= 3,
    weekly_hours_viewed >= 100000000
  ) %>%
  group_by(show_title) %>%
  summarise(max_viewed = max(weekly_hours_viewed)) %>%
  ungroup() %>%
  arrange(desc(max_viewed))
# Print the TV ranking.
case3

# Count the rows per show title, keep the titles that appear at least 10 times,
# and sort them from the most frequent to the least frequent.
trending_channel <- Netflix %>%
  group_by(show_title) %>%
  summarise(n_position = n()) %>%
  ungroup() %>%
  filter(n_position >= 10) %>%
  arrange(desc(n_position))
# Print the titles that appear at least 10 times.
trending_channel

# Packages used by the visualisations below.
library(ggplot2)
library(plotly)
library(glue)

# Static boxplot of the cumulative weeks in the top 10, one box per category.
ggplot(
  data = topten,
  mapping = aes(
    x = category,
    y = cumulative_weeks_in_top_10,
    color = category
  )
) +
  geom_boxplot()

# Data Visualization Interactive :
# Interactive version of the boxplot: build the ggplot object, then convert it
# with plotly so the `text` aesthetic can be used as the tooltip.
# NOTE(review): `text` is a per-row character aesthetic, and ggplot2's default
# grouping uses every discrete aesthetic, so it may split each category into
# several boxes. Check the rendered plot and add `group = category` if needed.
plot_topten <- ggplot(
  data = topten,
  mapping = aes(
    x = category,
    y = cumulative_weeks_in_top_10,
    text = glue("Frekuensi: {round(cumulative_weeks_in_top_10)}"),
    color = category
  )
) +
  geom_boxplot() +
  labs(
    title = "Frequency of Cumulative Weeks in Top 10",
    x = "Category",
    y = NULL,
    # Only `color` is mapped in this plot, so the legend title and the palette
    # are set on the colour scale; the fill equivalents were silently ignored.
    color = NULL
  ) +
  scale_color_brewer(palette = "Set2")
ggplotly(plot_topten, tooltip = "text")

# Answer : Based on data, it appears that TV (Non - English) has the highest
# frequency on Cumulative Weeks in Top 10.
# Static bar chart of the 2021 mean weekly hours viewed, one bar per category.
ggplot(case1, aes(x = category, y = mean_viewed)) +
  geom_col(mapping = aes(fill = category), position = "dodge") +
  labs(
    title = "Proportion of Mean Viewed on 2021",
    subtitle = "Mean Viewed per Category",
    x = "Category",
    y = NULL,
    fill = NULL
  ) +
  scale_fill_brewer(palette = "Set3")

# Data Visualization Interactive :
# Strongly prefer fixed over scientific notation when R formats numbers.
options(scipen = 99)

# Interactive version of the bar chart: the `text` aesthetic carries the
# rounded mean and is the only field shown in the plotly tooltip.
plot_case1 <- ggplot(
  case1,
  aes(
    x = category,
    y = mean_viewed,
    text = glue("Mean Viewed: {round(mean_viewed,2)}")
  )
) +
  geom_col(mapping = aes(fill = category), position = "dodge") +
  labs(
    title = "Proportion of Mean Viewed on 2021",
    subtitle = "Mean Viewed per Category",
    x = "Category",
    y = NULL,
    fill = NULL
  ) +
  scale_fill_brewer(palette = "Set3")
ggplotly(plot_case1, tooltip = "text")

# Answer: It can be seen which category has the highest average viewers in
# 2021, namely TV (English).
# Static horizontal bar chart: one bar per film in `case2`, ordered by its
# maximum weekly hours viewed; bar colour follows the same value.
ggplot(
  data = case2,
  aes(x = max_viewed, y = reorder(show_title, max_viewed))
) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of Film on Netflix"
  ) +
  scale_fill_gradient(low = "rosybrown1", high = "brown4") +
  theme_minimal()

# Data Visualization Interactive :
# Interactive version of the film bar chart: the `text` aesthetic carries the
# rounded maximum and is the only field shown in the plotly tooltip.
plot_case2 <- ggplot(
  data = case2,
  aes(
    x = max_viewed,
    y = reorder(show_title, max_viewed),
    text = glue("Maximal Viewed: {round(max_viewed,2)}")
  )
) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of Film on Netflix"
  ) +
  scale_fill_gradient(low = "rosybrown1", high = "brown4") +
  theme_minimal()
ggplotly(plot_case2, tooltip = "text")

# Answer : The film with the most viewers in the film category is
# Don’t Look Up.
# Static horizontal bar chart: one bar per TV title in `case3`, ordered by its
# maximum weekly hours viewed; bar colour follows the same value.
ggplot(
  data = case3,
  aes(x = max_viewed, y = reorder(show_title, max_viewed))
) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of TV series on Netflix"
  ) +
  scale_fill_gradient(low = "mistyrose", high = "indianred4") +
  theme_minimal()

# Data Visualization Interactive :
# Interactive version of the TV bar chart: the `text` aesthetic carries the
# rounded maximum and is the only field shown in the plotly tooltip.
plot_case3 <- ggplot(
  data = case3,
  aes(
    x = max_viewed,
    y = reorder(show_title, max_viewed),
    text = glue("Maximal Viewed: {round(max_viewed,2)}")
  )
) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of TV series on Netflix"
  ) +
  scale_fill_gradient(low = "mistyrose", high = "indianred4") +
  theme_minimal()
ggplotly(plot_case3, tooltip = "text")

# Answer: Squid Game has the most viewers of any TV series.
# Static lollipop chart: a grey segment from 0 to each title's count, capped by
# a point, with titles ordered by how often they appear.
ggplot(
  data = trending_channel,
  mapping = aes(x = n_position, y = reorder(show_title, n_position))
) +
  geom_segment(
    aes(x = 0, xend = n_position, yend = reorder(show_title, n_position)),
    color = "grey"
  ) +
  geom_point(
    color = "indianred3",
    size = 3,
    fill = alpha("orange", 0.3),
    alpha = 0.7,
    shape = 21,
    stroke = 2
  ) +
  labs(x = "Frequency", y = NULL, title = "Trending Channel Netflix") +
  theme_minimal()

# Data Visualization Interactive :
# Interactive version of the lollipop chart: the `text` aesthetic carries the
# count and is the only field shown in the plotly tooltip.
plot_trending <- ggplot(
  data = trending_channel,
  mapping = aes(
    x = n_position,
    y = reorder(show_title, n_position),
    text = glue("Frequency: {n_position}")
  )
) +
  geom_segment(
    aes(x = 0, xend = n_position, yend = reorder(show_title, n_position)),
    color = "grey"
  ) +
  geom_point(
    color = "indianred3",
    size = 3,
    fill = alpha("orange", 0.3),
    alpha = 0.7,
    shape = 21,
    stroke = 2
  ) +
  labs(x = "Frequency", y = NULL, title = "Trending Channel Netflix") +
  theme_minimal()
ggplotly(plot_trending, tooltip = "text")

# Answer: Money Heist is the trending channel based on the frequency of show
# titles that appear frequently.