##Load library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
##Data Import
# Read the raw catalogue from the working directory (first line holds the
# column names) and preview the first six rows.
netflix_data <- read.csv(file = "netflix_titles.csv", header = TRUE)
head(netflix_data, n = 6L)
## show_id type title director
## 1 s1 Movie Dick Johnson Is Dead Kirsten Johnson
## 2 s2 TV Show Blood & Water
## 3 s3 TV Show Ganglands Julien Leclercq
## 4 s4 TV Show Jailbirds New Orleans
## 5 s5 TV Show Kota Factory
## 6 s6 TV Show Midnight Mass Mike Flanagan
## cast
## 1
## 2 Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng
## 3 Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera
## 4
## 5 Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar
## 6 Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver
## country date_added release_year rating duration
## 1 United States September 25, 2021 2020 PG-13 90 min
## 2 South Africa September 24, 2021 2021 TV-MA 2 Seasons
## 3 September 24, 2021 2021 TV-MA 1 Season
## 4 September 24, 2021 2021 TV-MA 1 Season
## 5 India September 24, 2021 2021 TV-MA 2 Seasons
## 6 September 24, 2021 2021 TV-MA 1 Season
## listed_in
## 1 Documentaries
## 2 International TV Shows, TV Dramas, TV Mysteries
## 3 Crime TV Shows, International TV Shows, TV Action & Adventure
## 4 Docuseries, Reality TV
## 5 International TV Shows, Romantic TV Shows, TV Comedies
## 6 TV Dramas, TV Horror, TV Mysteries
## description
## 1 As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
## 2 After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
## 3 To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
## 4 Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.
## 5 In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.
## 6 The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.
## X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Keep only the twelve real catalogue fields. The import also produces empty
# placeholder columns (X, X.1, ..., X.13); selecting the wanted range by name
# instead of dropping positions 13:26 keeps this step correct if the number of
# placeholder columns ever changes, and fails loudly if a real field is missing.
netflix_data <- netflix_data %>%
  select(show_id:description)
str(netflix_data)
## 'data.frame': 8809 obs. of 12 variables:
## $ show_id : chr "s1" "s2" "s3" "s4" ...
## $ type : chr "Movie" "TV Show" "TV Show" "TV Show" ...
## $ title : chr "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
## $ director : chr "Kirsten Johnson" "" "Julien Leclercq" "" ...
## $ cast : chr "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
## $ country : chr "United States" "South Africa" "" "" ...
## $ date_added : chr "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
## $ release_year: int 2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
## $ rating : chr "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
## $ duration : chr "90 min" "2 Seasons" "1 Season" "1 Season" ...
## $ listed_in : chr "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
## $ description : chr "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
##Data Cleaning
# Count NA values per column. Blank text fields are held as empty strings ("")
# rather than NA, so they are not counted by this check.
netflix_data %>%
  is.na() %>%
  colSums()
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 0 0 0 0 0 0
# Drop every row that contains an NA, then repeat the audit.
netflix_data <- stats::na.omit(netflix_data)
netflix_data %>%
  is.na() %>%
  colSums()
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 0 0 0 0 0 0
# Rows and columns remaining after the clean-up.
c(nrow(netflix_data), ncol(netflix_data))
## [1] 8809 12
##Netflix Content Strategy Analysis
Q1: Market Dominance by Country (Bar Plot) Scenario: The strategy team wants to identify which 10 countries produce the most content to prioritize regional marketing budgets.
Code:
# Q1: the ten most frequent values of `country`, ignoring blank entries.
# NOTE(review): each distinct `country` string is counted as-is; if one value
# can list several countries, such rows form their own category -- confirm
# whether they should be split before counting.
top_10_countries <- netflix_data %>%
  filter(country != "") %>%
  count(country, sort = TRUE) %>%
  head(10)
# Horizontal bars with the largest count at the top. Explicit labels replace
# the default axis titles, which would otherwise read "reorder(country, n)"
# and "n".
ggplot(top_10_countries, aes(x = reorder(country, n), y = n)) +
  geom_col(fill = "midnightblue") +
  coord_flip() +
  labs(x = "Country", y = "Number of titles")
Interpretation: This Bar Plot reveals that the United States is the primary content provider, followed significantly by India, suggesting these are the core markets for Netflix.
Q2: Content Evolution Over Time (Histogram) Scenario: Analysts need to see if the library is dominated by “Classic” films or “Modern” releases to adjust their acquisition strategy.
Code:
# Q2: histogram of release years in two-year bins.
netflix_data %>%
  ggplot(aes(x = release_year)) +
  geom_histogram(binwidth = 2, fill = "darkred", colour = "white")
Interpretation: The Histogram shows a massive spike in content released after 2015, indicating that Netflix prioritizes newer, contemporary media over historical archives.
Q3: Movie vs. TV Show Ratio Scenario: A stakeholder asks for the current split between Movies and TV Shows to decide if they should invest more in episodic content.
Code:
# Q3: number of titles per content type, returned as a tibble.
netflix_data %>%
  as_tibble() %>%
  count(type, name = "Total")
## # A tibble: 2 × 2
## type Total
## <chr> <int>
## 1 Movie 6132
## 2 TV Show 2677
Interpretation: The output shows that Movies still outnumber TV Shows on the platform, suggesting Netflix started as a movie-centric service.
Q4: Film Length Trends (Histogram) Scenario: To optimize streaming bandwidth, the tech team wants to know the most common duration for movies.
Code:
# Q4: distribution of movie run times.
# Strip the " min" suffix so the duration can be used as a number; a duration
# that cannot be parsed becomes NA.
movie_duration <- netflix_data %>%
  filter(type == "Movie") %>%
  mutate(duration_min = as.numeric(str_remove(duration, " min")))
# Exclude the movies without a usable duration (3 rows in this data) explicitly,
# rather than letting stat_bin() discard them with a warning. `movie_duration`
# itself still contains every movie.
movie_duration %>%
  filter(!is.na(duration_min)) %>%
  ggplot(aes(x = duration_min)) +
  geom_histogram(binwidth = 10, fill = "seagreen")
Interpretation: This histogram shows a roughly bell-shaped distribution centered around 90–110 minutes, which is the typical length of a feature film.
Q5: Audience Suitability (Bar Plot) Scenario: The legal team needs to know the distribution of maturity ratings to ensure compliance with global safety standards.
Code:
# Q5: the eight most common maturity ratings, ignoring blank entries.
rating_counts <- netflix_data %>%
  filter(rating != "") %>%
  count(rating, sort = TRUE) %>%
  head(8)
# Draw the bars from most to least frequent instead of alphabetically, so the
# dominant rating is the first bar; label the axes explicitly.
ggplot(rating_counts, aes(x = reorder(rating, -n), y = n)) +
  geom_col(fill = "orange") +
  labs(x = "Rating", y = "Number of titles")
Interpretation: The Bar Plot indicates that ‘TV-MA’ (Mature Audiences) is the most frequent rating, showing that a large portion of Netflix content is intended for adults.
Q6: Content Growth in a Specific Year Scenario: How many titles were added to the platform in the year 2021? Code:
# Q6: number of rows whose `date_added` text contains "2021" (rows where the
# match is NA are not counted).
sum(str_detect(netflix_data$date_added, "2021"), na.rm = TRUE)
## [1] 1498
Interpretation: This count shows how many titles were added to the library during 2021. Note that 2021 is not the last year in the data: the final rows of the dataset include titles added in April 2024.
Q7: Indian Content Deep Dive Scenario: List the first 5 titles produced exclusively in India. Code:
# Q7: first five rows whose `country` is exactly "India", showing title and
# release year.
netflix_data %>%
  filter(country == "India") %>%
  slice_head(n = 5) %>%
  select(title, release_year)
## title release_year
## 1 Kota Factory 2021
## 2 Jeans 1998
## 3 Chhota Bheem 2021
## 4 Dharmakshetra 2014
## 5 Raja Rasoi Aur Anya Kahaniyan 2014
Interpretation: These results give a snapshot of the Indian titles available on the platform; the list mixes TV shows (e.g., Kota Factory) with films, with release years ranging from 1998 to 2021.
Q8: The “Oldest” Asset Scenario: Find the oldest movie available in the dataset for a “Vintage Cinema” campaign. Code:
# Q8: the movie with the earliest release year (first row after sorting).
netflix_data %>%
  filter(type == "Movie") %>%
  select(title, release_year) %>%
  arrange(release_year) %>%
  slice_head(n = 1)
## title release_year
## 1 Prelude to War 1942
Interpretation: This identifies the earliest piece of cinematic history currently hosted on the service.
Q9: Longest Running Series Scenario: Identify the TV Shows with the highest number of seasons. Code:
# Q9: TV shows with the most seasons.
# Keep the TV shows and read the season count from the leading number in
# `duration`.
tv_data <- filter(netflix_data, type == "TV Show")
tv_data <- mutate(
  tv_data,
  seasons = as.numeric(str_extract(duration, "\\d+"))
)
# Five shows with the highest season count.
tv_data %>%
  select(title, seasons) %>%
  arrange(desc(seasons)) %>%
  slice_head(n = 5)
## title seasons
## 1 Grey's Anatomy 17
## 2 Supernatural 15
## 3 NCIS 15
## 4 Heartland 13
## 5 COMEDIANS of the world 13
Interpretation: Shows with the most seasons represent successful franchises with high user retention.
Q10: Most Prolific Director Scenario: Which director has the most titles listed on Netflix? Code:
# Q10: the most frequent non-blank value of `director`.
netflix_data %>%
  filter(director != "") %>%
  count(director, sort = TRUE) %>%
  slice_head(n = 1)
## director n
## 1 Rajiv Chilaka 19
Interpretation: This highlights the director with the strongest professional relationship with the platform.
Q11: Identifying Missing Metadata Scenario: How many entries are missing “Director” information? Code:
# Q11: rows whose director is either NA or an empty string.
with(netflix_data, sum(is.na(director) | !nzchar(director)))
## [1] 2634
Interpretation: This informs the data cleaning team about the volume of incomplete records.
Q12: Horror Genre Popularity in the US Scenario: Count how many “Horror” titles are available for the United States market. Code:
# Q12: rows whose `country` is exactly "United States" and whose genre list
# mentions "Horror".
netflix_data %>%
  filter(country == "United States") %>%
  filter(str_detect(listed_in, "Horror")) %>%
  nrow()
## [1] 172
Interpretation: This metric helps in understanding the availability of niche genres in specific regions.
Q13: Average Year of TV Show Production Scenario: What is the average release year for TV Shows? Code:
# Q13: mean release year across the rows whose type is "TV Show".
netflix_data %>%
  summarise(avg_year = mean(release_year[type %in% "TV Show"]))
## avg_year
## 1 2016.609
Interpretation: A high average year suggests that TV shows on Netflix are generally very recent productions.
Q14: Search for Holiday Content Scenario: Find titles that include the word “Christmas” for a seasonal recommendation list. Code:
# Q14: first five titles containing the word "Christmas" (case-sensitive).
netflix_data %>%
  filter(str_detect(title, "Christmas")) %>%
  slice_head(n = 5) %>%
  select(title)
## title
## 1 Home for Christmas
## 2 An Unremarkable Christmas
## 3 How To Ruin Christmas
## 4 A California Christmas
## 5 A Trash Truck Christmas
Interpretation: This provides a quick way to curate themed content for users.
Q15: Genre Comparison: Action vs. Comedy Scenario: Compare the total count of “Action & Adventure” vs “Comedies”. Code:
# Q15: number of titles whose genre list mentions "Action" versus "Comedies",
# shown side by side in a one-row data frame.
action <- with(netflix_data, sum(str_detect(listed_in, "Action")))
comedy <- with(netflix_data, sum(str_detect(listed_in, "Comedies")))
data.frame(Action = action, Comedy = comedy)
## Action Comedy
## 1 1028 2255
Interpretation: This comparison helps determine which genre is more heavily represented in the library.
Q16: Stand-Up Comedy Availability Scenario: How many “Stand-Up Comedy” specials are in the dataset? Code:
# Q16: number of titles whose genre list mentions "Stand-Up Comedy" (rows where
# the match is NA are not counted).
netflix_data %>%
  pull(listed_in) %>%
  str_detect("Stand-Up Comedy") %>%
  sum(na.rm = TRUE)
## [1] 399
Interpretation: This shows the scale of Netflix’s investment in original comedy specials.
Q17: Data Freshness Check Scenario: Display the last 5 rows of the dataset to see the most recent entries. Code:
# Q17: title, type and date added for the final five rows, in file order.
tail(netflix_data[c("title", "type", "date_added")], n = 5)
## title type date_added
## 8805 Zombieland Movie November 1, 2019
## 8806 Zoom Movie January 11, 2020
## 8807 Zubaan Movie March 2, 2019
## 8808 Parasyte: The Grey TV Show April 5, 2024
## 8809 Serena Movie April 5, 2024
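Interpretation: The last rows follow the file's order rather than the date added: three of them were added in 2019–2020 and two on April 5, 2024. The output therefore confirms that the file was read through to its final row (8809), not which titles are the most recent additions.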