Import Library
library(tidyverse)
library(ggpubr)
library(ggplot2)
library(ggrepel)
library(lubridate)
Read Data
# Load the Netflix catalogue from the CSV file in the working directory,
# then print the first rows as a quick look at the columns.
netflix <- read.csv(file = "netflix_titles.csv")
head(netflix)
Data Wrangling
# Convert date_added with lubridate's month-day-year parser; values that
# cannot be parsed become NA.
netflix <- netflix %>%
  mutate(date_added = mdy(date_added))

# Number of missing values in each column.
colSums(is.na(netflix))
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 10 0 0 0 0 0

# Discard every row that contains at least one NA, then recount to confirm
# that nothing is missing any more.
netflix <- drop_na(netflix)
colSums(is.na(netflix))
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 0 0 0 0 0 0
Comparison Between Film And TV Show
# Number of titles per type, used as the slice sizes of the pie chart.
e <- netflix %>%
  group_by(type) %>%
  summarise(Total = n())

# A pie chart in ggplot2 is a single stacked bar drawn in polar coordinates.
ggplot(e, aes(x = "", y = Total, fill = type)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  # ColorBrewer palette names are case-sensitive: "Dark2", not "dark2"
  # (the lower-case spelling triggered an "Unknown palette" warning).
  scale_fill_brewer(palette = "Dark2") +
  # Stack the labels the same way as the bars and centre them, so each label
  # sits in the middle of its own slice instead of at y = Total.
  geom_text(
    aes(label = type),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 5
  )
# Note: the former trailing `+ theme_update()` was dropped. theme_update()
# modifies the session-wide default theme and is not a plot layer.

Berdasarkan pie chart di atas, jenis tayangan yang ada di Netflix masih didominasi oleh film dengan perbandingan yang lumayan jauh.
Netflix Growth
# Number of titles per release year, restricted to 1962-2020.
a <- netflix %>%
  filter(
    release_year >= 1962,
    release_year <= 2020
  ) %>%
  group_by(release_year) %>%
  summarise(Total = n())

# Line chart of the yearly totals with a marker on every year.
ggplot(a, aes(x = release_year, y = Total)) +
  geom_line(color = "blue", size = 1, alpha = 0.9, linetype = 1) +
  # Removed the stray trailing comma that passed an empty argument.
  geom_point(shape = 21) +
  theme_linedraw() +
  # Argument names are case-sensitive: the axis title must be given as `y`
  # (an upper-case `Y` is not the y aesthetic, so the title was never applied).
  labs(
    x = "Year",
    y = "Total Film",
    title = "Netflix Growth 1962-2020"
  )

Berdasarkan grafik di atas, jumlah tayangan tahun 2018 adalah yang terbanyak dibandingkan tahun-tahun sebelumnya, namun anehnya jumlah film tahun 2019 dan 2020 berkurang cukup drastis dari 2018.
library(varhandle)
# Cross-tabulate type against release year. Converting the table to a data
# frame yields the columns Var1 (type), Var2 (release year, as a factor) and
# Freq (number of titles). Var2 is turned back into numbers so it can be
# filtered and plotted on a continuous axis.
netflix_growth <- as.data.frame(table(netflix$type, netflix$release_year)) %>%
  mutate(Var2 = unfactor(Var2)) %>%
  filter(Var2 >= 1962 & Var2 <= 2020)

# One line per type, showing the yearly number of titles.
ggplot(
  data = netflix_growth,
  mapping = aes(x = Var2, y = Freq, color = Var1, group = Var1)
) +
  geom_line(size = 2) +
  geom_point() +
  theme_linedraw() +
  labs(
    title = "Comparison Between Movie and TV Show",
    x = NULL
  )

Grafik ini membuka mata kita bahwa yang membuat jumlah tayangan menurun adalah menurunnya jumlah film tahun 2019 dan 2020, karena ternyata TV show selalu naik dari tahun ke tahun.
Top 10 Country
# Ten most frequent values of the country column, ignoring rows where the
# country is an empty string.
b <- netflix %>%
  filter(country != "") %>%
  group_by(country) %>%
  summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  head(n = 10)

# Horizontal bar chart, bars ordered by their total.
ggplot(data = b, mapping = aes(x = Total, y = reorder(country, Total))) +
  geom_col(width = 0.5, fill = "green", color = "black", alpha = 0.8) +
  theme_cleveland() +
  labs(
    title = "Rank of Country on Netflix",
    x = "Total Film"
  )

Berdasarkan chart di atas, Amerika Serikat, India, dan Inggris masih mendominasi jumlah tayangan yang ada di Netflix.
Distribution of Duration
# Keep only titles whose duration is given in minutes and convert that
# duration to a number.
#
# Matching "<digits> min" directly replaces the previous hard-coded list of
# "<n> Seasons" labels. That list did not cover every non-minute value, so
# as.numeric() produced NAs with a coercion warning and na.omit() was needed
# to clean up afterwards (presumably because of a singular "1 Season" label;
# confirm against the data).
minutes_pattern <- "^\\s*[0-9]+\\s*min\\s*$"
netflix_clean <- netflix %>%
  filter(str_detect(duration, minutes_pattern)) %>%
  mutate(duration = as.numeric(str_extract(duration, "[0-9]+")))

# Density of the duration in minutes. The x-axis limits of 0-200 make ggplot2
# drop values outside that range before the density is estimated, hence the
# warning shown below.
ggplot(netflix_clean, aes(x = duration)) +
  geom_density(fill = "gold", bw = 2.0, alpha = 0.8) +
  scale_x_continuous(limits = c(0, 200), expand = c(0, 0)) +
  geom_rug(alpha = 0.05)
## Warning: Removed 15 rows containing non-finite values (stat_density).
Berdasarkan data, durasi yang paling banyak muncul di dalam tayangan Netflix berkisar antara 80 menit sampai 120 menit. Jarang sekali ada tayangan Netflix yang berdurasi lebih dari 200 menit atau kurang dari 30 menit.
Top 15 Genre
# listed_in holds a comma-separated list of genres per title. Split it into
# one row per genre, count the titles of each genre and keep the 15 largest.
# The genre is then stored as a factor whose levels follow the ranking, so
# the x axis keeps the descending order instead of sorting alphabetically.
most_genre <- netflix %>%
  mutate(listed_in = strsplit(as.character(listed_in), ", ")) %>%
  unnest(listed_in) %>%
  group_by(listed_in) %>%
  summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  head(n = 15) %>%
  mutate(listed_in = factor(listed_in, levels = listed_in))

# Lollipop chart: a stem from 0 up to the total, with a point on top.
ggplot(data = most_genre, mapping = aes(x = listed_in, y = Total)) +
  geom_segment(aes(x = listed_in, xend = listed_in, y = 0, yend = Total)) +
  geom_point(
    shape = 21,
    size = 4,
    stroke = 4,
    color = "blue",
    fill = alpha("red", 0.2),
    alpha = 0.7
  ) +
  labs(
    title = "Top 15 Genre on Netflix",
    x = NULL
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.8))

Berdasarkan data, genre International Movies, Dramas, dan Comedies masih menjadi genre yang paling disukai oleh penikmat Netflix, dibuktikan dengan jumlah tayangan bergenre ini yang lebih banyak dari genre lainnya dengan selisih yang cukup jauh.
15 Actor with Most Show
# cast holds a comma-separated list of names per title. Split it into one row
# per name, count the titles per name and keep the 15 most frequent names.
# Titles with an empty cast string produce no rows after the split.
c <- netflix %>%
  select(cast, country) %>%
  mutate(cast = strsplit(as.character(cast), ", ")) %>%
  unnest(cast) %>%
  group_by(cast) %>%
  summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  head(n = 15)

# Bar chart ordered by the number of titles, with slanted axis labels so the
# names stay readable.
ggplot(data = c, mapping = aes(x = reorder(cast, Total), y = Total)) +
  geom_col(color = "red", fill = "blue") +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 65, vjust = 0.8)
  ) +
  labs(
    title = "Actor with Most Movie on Netflix",
    x = "Actor"
  )

Aktor-aktor India seperti Anupam Kher, Shah Rukh Khan, dan Akshay Kumar merupakan aktor yang paling sering muncul dalam tayangan Netflix.

Rating on Netflix
# Attach to every title the number of titles sharing its rating; that count
# only serves to order the ratings on the x axis. Bars show the number of
# titles per rating, side by side for each type.
netflix %>%
  group_by(rating) %>%
  mutate(count = n()) %>%
  ggplot(mapping = aes(x = reorder(rating, count), fill = type)) +
  geom_bar(alpha = 0.6, position = position_dodge(width = 0.6)) +
  labs(
    title = "Comparison Movie and TV Show by Rating",
    x = "Ratings"
  )

Berdasarkan data di atas, ternyata banyak rating yang ada di film namun tidak ada di TV show.
Netflix Publishing
# Month number in which each title was added.
netflix <- netflix %>%
  mutate(month_added = month(date_added))

# Number of titles per type and month.
f <- netflix %>%
  group_by(type, month_added) %>%
  summarise(Total = n())

# Treat the month as a category so that every month gets its own bar. This is
# done outside the pipeline because `f` is still grouped by type.
f[["month_added"]] <- as.factor(f[["month_added"]])

# position = "fill" rescales every bar to proportions, so the dashed line at
# 0.5 marks an even split between the two types.
ggplot(data = f, mapping = aes(x = Total, y = month_added)) +
  geom_col(aes(fill = type), position = "fill") +
  geom_vline(xintercept = 0.5, colour = "black", linetype = 2, lwd = 1.5) +
  scale_fill_manual(values = c("red", "blue")) +
  labs(
    title = "Comparison Month Added Between Movie and TV Show",
    x = NULL,
    y = "Month Added"
  ) +
  theme_cleveland()

Grafik ini menjelaskan kepada kita bahwa setiap bulannya film masih mengungguli TV show dalam hal penerbitannya.