Background

Hello, welcome to my Rmd!
I’ll be using the Netflixglobal.csv data in this LBB.
The Netflixglobal.csv file contains data on Netflix’s trending (weekly Top 10) titles.
Let’s dig deeper into the data and store it in an object named Netflix!

Import Data

First, we can input the data

# Load the raw CSV from the project's data_input folder into a data frame.
Netflix <- read.csv(file = file.path("data_input", "Netflixglobal.csv"))

Data Inspection

library(tidyverse)
library(lubridate)

Then, using the functions head() and tail(), we can look at the first and last rows of the data

# Preview the first and last six rows of the dataset.
head(Netflix, n = 6)
tail(Netflix, n = 6)

Check the dimensions of the data:

# Number of rows followed by number of columns.
c(nrow(Netflix), ncol(Netflix))
#> [1] 1160    7

Check the name of each column:

# List the column names of the data frame.
colnames(Netflix)
#> [1] "week"                       "category"                  
#> [3] "weekly_rank"                "show_title"                
#> [5] "season_title"               "weekly_hours_viewed"       
#> [7] "cumulative_weeks_in_top_10"

From the inspection of the data, we can conclude:
- The Netflix data contains 1160 rows and 7 columns
- The column names of the data are: “week”, “category”, “weekly_rank”, “show_title”, “season_title”, “weekly_hours_viewed”, “cumulative_weeks_in_top_10”

Data Cleansing

Check data types:

# Show the structure (type and sample values) of every column.
str(object = Netflix)
#> 'data.frame':    1160 obs. of  7 variables:
#>  $ week                      : chr  "2022-01-16" "2022-01-16" "2022-01-16" "2022-01-16" ...
#>  $ category                  : chr  "Films (English)" "Films (English)" "Films (English)" "Films (English)" ...
#>  $ weekly_rank               : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ show_title                : chr  "Brazen" "Don't Look Up" "Mother/Android" "The Secret Life of Pets 2" ...
#>  $ season_title              : chr  "" "" "" "" ...
#>  $ weekly_hours_viewed       : int  45340000 28390000 23170000 9390000 8790000 8710000 8710000 8470000 7860000 7000000 ...
#>  $ cumulative_weeks_in_top_10: int  1 4 2 1 3 10 6 1 1 1 ...

We can see that some columns have unsuitable data types: week is stored as character but should be a Date, and category should be a factor.
Let’s convert those data types first!

# Convert `week` to Date and `category` to factor with dplyr.
Netflix <- Netflix %>%
  mutate(
    week = as.Date(week),
    category = as.factor(category)
  )

# Inspect the column types after the conversion.
glimpse(Netflix)
#> Rows: 1,160
#> Columns: 7
#> $ week                       <date> 2022-01-16, 2022-01-16, 2022-01-16, 2022-0…
#> $ category                   <fct> Films (English), Films (English), Films (En…
#> $ weekly_rank                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, …
#> $ show_title                 <chr> "Brazen", "Don't Look Up", "Mother/Android"…
#> $ season_title               <chr> "", "", "", "", "", "", "", "", "", "", "",…
#> $ weekly_hours_viewed        <int> 45340000, 28390000, 23170000, 9390000, 8790…
#> $ cumulative_weeks_in_top_10 <int> 1, 4, 2, 1, 3, 10, 6, 1, 1, 1, 2, 2, 2, 2, …

The data type conversion was successful! We can continue by looking for missing values.

Check for missing values across the whole dataset:

# TRUE if at least one cell in the data frame is NA.
any(is.na(Netflix))
#> [1] FALSE

Check for missing values in each column:

# Count the NA cells in each column.
Netflix %>%
  is.na() %>%
  colSums()
#>                       week                   category 
#>                          0                          0 
#>                weekly_rank                 show_title 
#>                          0                          0 
#>               season_title        weekly_hours_viewed 
#>                          0                          0 
#> cumulative_weeks_in_top_10 
#>                          0

Good! There are no missing values.
Now the Netflix dataset is ready to be processed and analyzed.

Data Pre-Processing

  1. Which category has the highest proportion of cumulative weeks in top 10?
# Keep only the two columns needed for the cumulative-weeks comparison.
topten <- select(Netflix, category, cumulative_weeks_in_top_10)

topten
  2. What is the average weekly hours viewed (mean viewed) for each category in 2021?
# Derive the calendar year of each `week` (lubridate::year) as a new column.
Netflix <- Netflix %>%
  mutate(year = year(week))
head(Netflix[["year"]])
#> [1] 2022 2022 2022 2022 2022 2022
# Mean weekly hours viewed per category for rows dated in 2021,
# ordered from the highest mean to the lowest.
case1 <- Netflix %>%
  filter(year == 2021) %>%
  group_by(category) %>%
  summarise(
    mean_viewed = mean(weekly_hours_viewed)
  ) %>%
  ungroup() %>%
  arrange(desc(mean_viewed))

case1
  3. Which film in the top three weekly rankings has the most weekly views (at least 50,000,000 hours viewed)?
# Peak weekly hours viewed per film title, restricted to entries ranked in the
# weekly top three with at least 50,000,000 hours viewed, highest first.
case2 <- Netflix %>%
  filter(
    category %in% c("Films (English)", "Films (Non-English)"),
    weekly_rank <= 3,
    weekly_hours_viewed >= 50e6
  ) %>%
  group_by(show_title) %>%
  summarise(
    max_viewed = max(weekly_hours_viewed)
  ) %>%
  ungroup() %>%
  arrange(desc(max_viewed))

case2
  4. Which TV series in the top three weekly rankings has the most weekly views (at least 100,000,000 hours viewed)?
# Peak weekly hours viewed per TV title, restricted to entries ranked in the
# weekly top three with at least 100,000,000 hours viewed, highest first.
case3 <- Netflix %>%
  filter(
    category %in% c("TV (English)", "TV (Non-English)"),
    weekly_rank <= 3,
    weekly_hours_viewed >= 100e6
  ) %>%
  group_by(show_title) %>%
  summarise(
    max_viewed = max(weekly_hours_viewed)
  ) %>%
  ungroup() %>%
  arrange(desc(max_viewed))

case3
  5. Which titles are trending on Netflix (appearing at least 10 times in the data)?
# Number of rows per show title, keeping titles that appear at least 10 times,
# sorted from most to fewest appearances.
trending_channel <- Netflix %>%
  count(show_title, name = "n_position") %>%
  filter(n_position >= 10) %>%
  arrange(desc(n_position))

trending_channel

Visualization

library(ggplot2)
library(plotly) 
library(glue) 

1. Frequency of Cumulative Weeks in Top 10

# Boxplot of cumulative weeks in the top 10 for each category,
# with the outline coloured by category.
topten %>%
  ggplot(
    aes(
      x = category,
      y = cumulative_weeks_in_top_10,
      color = category
    )
  ) +
  geom_boxplot()

Data Visualization Interactive :

# Interactive boxplot of cumulative weeks in the top 10 per category.
# The boxes are mapped to the `color` aesthetic, so the palette and the
# legend title must be set on the colour scale; a fill scale would have no
# effect because nothing is mapped to `fill`.
plot_topten <- ggplot(
  data = topten,
  mapping = aes(
    x = category,
    y = cumulative_weeks_in_top_10,
    text = glue("Frekuensi: {round(cumulative_weeks_in_top_10)}"),
    color = category
  )
) +
  geom_boxplot() +
  labs(
    title = "Frequency of Cumulative Weeks in Top 10",
    x = "Category",
    y = NULL,
    color = NULL
  ) +
  scale_color_brewer(palette = "Set2")

# Convert to a plotly widget; only the `text` aesthetic is requested as tooltip.
ggplotly(plot_topten, tooltip = "text")

Answer: Based on the data, it appears that TV (Non-English) has the highest frequency of Cumulative Weeks in Top 10.

2. Mean Viewed per Category

# Bar chart of the 2021 mean weekly hours viewed, one bar per category,
# filled by category with the Set3 palette.
case1 %>%
  ggplot(aes(x = category, y = mean_viewed)) +
  geom_col(
    mapping = aes(fill = category),
    position = "dodge"
  ) +
  labs(
    title = "Proportion of Mean Viewed on 2021",
    subtitle = "Mean Viewed per Category",
    x = "Category",
    y = NULL,
    fill = NULL
  ) +
  scale_fill_brewer(palette = "Set3")

Data Visualization Interactive :

# Raise the scientific-notation penalty so large numbers print in fixed notation.
options(scipen = 99)

# Interactive version of the mean-viewed bar chart; the `text` aesthetic
# carries the tooltip label requested from ggplotly().
plot_case1 <- case1 %>%
  ggplot(
    aes(
      x = category,
      y = mean_viewed,
      text = glue("Mean Viewed: {round(mean_viewed,2)}")
    )
  ) +
  geom_col(
    mapping = aes(fill = category),
    position = "dodge"
  ) +
  labs(
    title = "Proportion of Mean Viewed on 2021",
    subtitle = "Mean Viewed per Category",
    x = "Category",
    y = NULL,
    fill = NULL
  ) +
  scale_fill_brewer(palette = "Set3")

ggplotly(plot_case1, tooltip = "text")

Answer: It can be seen which category has the highest average viewers in 2021, namely TV (English).

3. Maximal Viewed of Film on Netflix

# Horizontal bar chart of each film's peak weekly hours viewed.
# Titles are ordered by `max_viewed`, bars are shaded on a gradient by the
# same value, and the legend is hidden.
ggplot(
  data = case2,
  mapping = aes(x = max_viewed, y = reorder(show_title, max_viewed))
) +
  # Use FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of Film on Netflix"
  ) +
  scale_fill_gradient(low = "rosybrown1", high = "brown4") +
  theme_minimal()

Data Visualization Interactive :

# Interactive version of the film chart: peak weekly hours viewed per title,
# ordered by `max_viewed`. The `text` aesthetic carries the tooltip label
# requested from ggplotly().
plot_case2 <- ggplot(
  data = case2,
  mapping = aes(
    x = max_viewed,
    y = reorder(show_title, max_viewed),
    text = glue("Maximal Viewed: {round(max_viewed,2)}")
  )
) +
  # Use FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of Film on Netflix"
  ) +
  scale_fill_gradient(low = "rosybrown1", high = "brown4") +
  theme_minimal()

ggplotly(plot_case2, tooltip = "text")

Answer : The film with the most viewers in the film category is Don’t Look Up.

4. Maximal Viewed of TV series on Netflix

# Horizontal bar chart of each TV title's peak weekly hours viewed.
# Titles are ordered by `max_viewed`, bars are shaded on a gradient by the
# same value, and the legend is hidden.
ggplot(
  data = case3,
  mapping = aes(x = max_viewed, y = reorder(show_title, max_viewed))
) +
  # Use FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of TV series on Netflix"
  ) +
  scale_fill_gradient(low = "mistyrose", high = "indianred4") +
  theme_minimal()

Data Visualization Interactive :

# Interactive version of the TV chart: peak weekly hours viewed per title,
# ordered by `max_viewed`. The `text` aesthetic carries the tooltip label
# requested from ggplotly().
plot_case3 <- ggplot(
  data = case3,
  mapping = aes(
    x = max_viewed,
    y = reorder(show_title, max_viewed),
    text = glue("Maximal Viewed: {round(max_viewed,2)}")
  )
) +
  # Use FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(aes(fill = max_viewed), show.legend = FALSE) +
  labs(
    y = "show title",
    x = NULL,
    title = "Maximal Viewed of TV series on Netflix"
  ) +
  scale_fill_gradient(low = "mistyrose", high = "indianred4") +
  theme_minimal()

ggplotly(plot_case3, tooltip = "text")

Answer: Squid Game has the most viewers of any TV series.