Importing libraries
Load data
# Read the raw titles file into a data frame. Text fields with no value are
# loaded as empty strings (""), not as NA.
netflix <- read.csv(file = "data/netflix_titles.csv", header = TRUE)
Check data
# Preview the first six rows.
head(netflix, n = 6L)
## show_id type title director
## 1 s1 Movie Dick Johnson Is Dead Kirsten Johnson
## 2 s2 TV Show Blood & Water
## 3 s3 TV Show Ganglands Julien Leclercq
## 4 s4 TV Show Jailbirds New Orleans
## 5 s5 TV Show Kota Factory
## 6 s6 TV Show Midnight Mass Mike Flanagan
## cast
## 1
## 2 Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng
## 3 Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera
## 4
## 5 Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar
## 6 Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver
## country date_added release_year rating duration
## 1 United States September 25, 2021 2020 PG-13 90 min
## 2 South Africa September 24, 2021 2021 TV-MA 2 Seasons
## 3 September 24, 2021 2021 TV-MA 1 Season
## 4 September 24, 2021 2021 TV-MA 1 Season
## 5 India September 24, 2021 2021 TV-MA 2 Seasons
## 6 September 24, 2021 2021 TV-MA 1 Season
## listed_in
## 1 Documentaries
## 2 International TV Shows, TV Dramas, TV Mysteries
## 3 Crime TV Shows, International TV Shows, TV Action & Adventure
## 4 Docuseries, Reality TV
## 5 International TV Shows, Romantic TV Shows, TV Comedies
## 6 TV Dramas, TV Horror, TV Mysteries
## description
## 1 As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
## 2 After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
## 3 To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
## 4 Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.
## 5 In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.
## 6 The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.
# Compact overview of every column: its type and first few values.
str(object = netflix)
## 'data.frame': 8807 obs. of 12 variables:
## $ show_id : chr "s1" "s2" "s3" "s4" ...
## $ type : chr "Movie" "TV Show" "TV Show" "TV Show" ...
## $ title : chr "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
## $ director : chr "Kirsten Johnson" "" "Julien Leclercq" "" ...
## $ cast : chr "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
## $ country : chr "United States" "South Africa" "" "" ...
## $ date_added : chr "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
## $ release_year: int 2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
## $ rating : chr "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
## $ duration : chr "90 min" "2 Seasons" "1 Season" "1 Season" ...
## $ listed_in : chr "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
## $ description : chr "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
# Per-column summary; for text columns this includes how many are blank.
summary(object = netflix)
## show_id type title director
## Length :8807 Length :8807 Length :8807 Length :8807
## N.unique :8807 N.unique : 2 N.unique :8807 N.unique :4529
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank :2634
## Min.nchar: 2 Min.nchar: 5 Min.nchar: 1 Min.nchar: 0
## Max.nchar: 5 Max.nchar: 7 Max.nchar: 104 Max.nchar: 208
##
## cast country date_added release_year
## Length :8807 Length :8807 Length :8807 Min. :1925
## N.unique :7693 N.unique : 749 N.unique :1768 1st Qu.:2013
## N.blank : 825 N.blank : 831 N.blank : 10 Median :2017
## Min.nchar: 0 Min.nchar: 0 Min.nchar: 0 Mean :2014
## Max.nchar: 771 Max.nchar: 123 Max.nchar: 19 3rd Qu.:2019
## Max. :2021
## rating duration listed_in description
## Length :8807 Length :8807 Length :8807 Length :8807
## N.unique : 18 N.unique : 221 N.unique : 514 N.unique :8775
## N.blank : 4 N.blank : 3 N.blank : 0 N.blank : 0
## Min.nchar: 0 Min.nchar: 0 Min.nchar: 6 Min.nchar: 61
## Max.nchar: 8 Max.nchar: 10 Max.nchar: 79 Max.nchar: 248
##
Data Cleaning
# Missing text values in this data set are empty strings, not NA, so a bare
# drop_na() has nothing to remove. Normalise the text columns first: trim
# surrounding whitespace, then turn "" into NA.
# NOTE(review): date_added seems to contain padded values (its longest entry
# is one character longer than a clean "September 25, 2021"); untrimmed, those
# would fail to parse below and silently become NA.
netflix <- netflix %>%
  mutate(across(where(is.character), \(x) na_if(trimws(x), ""))) %>%
  # Require only the fields the analysis relies on and that are missing for
  # a handful of titles. director, cast and country are blank for hundreds
  # to thousands of titles, so requiring them would discard much of the data.
  drop_na(release_year, date_added, rating, duration)

# Parse dates written like "September 25, 2021". %B matches month names in
# the current locale, so this assumes an English locale.
netflix$date_added <- as.Date(netflix$date_added, format = "%B %d, %Y")

# Calendar year in which each title was added.
netflix$year_added <- year(netflix$date_added)
Visualizations
# Shared text styling for the charts. It is stored in a variable because a
# bare theme() call is only printed and never applied to any plot.
# To use it, add `+ chart_text_theme` to a plot AFTER any complete theme
# (theme_minimal(), theme_void()); a complete theme added later would
# overwrite these settings.
chart_text_theme <- theme(
  plot.title = element_text(size = 16, face = "bold"),
  axis.text = element_text(size = 11),
  axis.title = element_text(size = 13)
)
## <theme> List of 3
## $ axis.title: <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : num 13
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi FALSE
## $ axis.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : num 11
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi FALSE
## $ plot.title: <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : chr "bold"
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : num 16
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi FALSE
## @ complete: logi FALSE
## @ validate: logi TRUE
# Number of titles per content type.
type_data <- count(netflix, type)

# Pie chart: a single full-width stacked bar wrapped onto polar coordinates,
# so each type's count becomes the angle of its slice.
ggplot(data = type_data, mapping = aes(x = "", y = n, fill = type)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_viridis_d() +
  theme_void() +
  labs(
    title = "Distribution of Movies vs TV Shows",
    fill = "Content Type"
  )
Netflix contains significantly more Movies than TV Shows, indicating a
stronger focus on movie-based content distribution.
# Ten most frequent values of `country`. Titles with no country recorded
# (blank or NA) are excluded first; otherwise "missing" is ranked as if it
# were a country and shows up as an unlabelled bar.
# NOTE(review): `country` appears to hold comma-separated lists for some
# titles, and each distinct list is counted as its own entry here. Confirm
# whether co-productions should be split per country.
top_countries <- netflix %>%
  filter(!is.na(country), country != "") %>%
  count(country, sort = TRUE) %>%
  head(10)

# Horizontal bar chart, largest producer at the top.
ggplot(
  top_countries,
  aes(x = reorder(country, n), y = n, fill = n)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries Producing Netflix Content",
    x = "Country",
    y = "Number of Titles"
  ) +
  theme_minimal() +
  scale_fill_viridis_c()
The United States dominates Netflix content production, followed by
India and the United Kingdom.
# Number of titles per original release year.
yearly_release <- count(netflix, release_year)

# Line chart of releases per year with a marker on every year.
ggplot(data = yearly_release, mapping = aes(x = release_year, y = n)) +
  geom_line(colour = "blue", linewidth = 1.2) +
  geom_point(colour = "red") +
  theme_minimal() +
  labs(
    title = "Netflix Content Released by Year",
    x = "Release Year",
    y = "Number of Releases"
  )
Netflix content releases increased rapidly after 2015, reflecting the
platform’s global expansion strategy.
Clean movie duration:
# Keep movies only; TV shows express duration in seasons, not minutes.
movies <- netflix %>%
  filter(type == "Movie")

# Durations look like "90 min": strip the unit and convert to a number.
# A blank or missing duration becomes NA.
movies$duration_num <- as.numeric(
  gsub(" min", "", movies$duration)
)

# Plot only movies with a known duration. The exclusion is done explicitly
# here instead of leaving ggplot to drop the NA rows with a warning.
movies %>%
  filter(!is.na(duration_num)) %>%
  ggplot(aes(x = duration_num)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  labs(
    title = "Distribution of Movie Durations",
    x = "Duration (Minutes)",
    y = "Frequency"
  ) +
  theme_minimal()
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
Most Netflix movies fall between 80 and 120 minutes in duration.
# Number of titles per rating. Titles with no rating recorded (blank or NA)
# are excluded so they do not appear as an unlabelled bar.
rating_data <- netflix %>%
  filter(!is.na(rating), rating != "") %>%
  count(rating, sort = TRUE)

# Horizontal bar chart, most common rating at the top. The fill colour only
# distinguishes bars, so the legend is hidden.
ggplot(
  rating_data,
  aes(x = reorder(rating, n), y = n, fill = rating)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Distribution of Netflix Ratings",
    x = "Rating",
    y = "Number of Titles"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
TV-MA and TV-14 are the most common ratings, indicating strong adult and
teen audience targeting.
# Number of titles added per year. Titles whose added date is missing or
# could not be parsed have an NA year; drop that group explicitly instead of
# passing it to the plot, where it is discarded with a warning.
content_added <- netflix %>%
  filter(!is.na(year_added)) %>%
  count(year_added)

ggplot(content_added, aes(x = year_added, y = n)) +
  geom_area(fill = "skyblue", alpha = 0.7) +
  labs(
    title = "Netflix Content Added Over Time",
    x = "Year Added",
    y = "Number of Titles Added"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_align()`).
Netflix experienced substantial growth in content additions between 2016
and 2020.
Create numeric variables:
# release_year is already numeric; this copy only gives it a parallel name
# next to duration_num.
movies$release_year_num <- movies$release_year

numeric_data <- movies %>%
  select(duration_num, release_year_num)

# duration_num is NA for movies without a recorded duration. With cor()'s
# default handling a single NA turns the duration/year coefficient into NA,
# so compute the correlation on complete rows only.
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Upper-triangle heatmap with the coefficients printed in the cells.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  addCoef.col = "black"
)
The heatmap shows a weak correlation between release year and movie
duration.
# Static release-trend chart, built from the per-year counts in
# yearly_release, that is then handed to plotly.
interactive_plot <- ggplot(
  data = yearly_release,
  mapping = aes(x = release_year, y = n)
) +
  geom_line(colour = "red", linewidth = 1.2) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Interactive Netflix Release Trend",
    x = "Release Year",
    y = "Number of Releases"
  )

# Convert the ggplot object into an interactive plotly widget.
ggplotly(p = interactive_plot)