##Load library

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

##Data Import

# Load the raw Netflix catalogue from the working directory and preview
# its first six rows.
netflix_data <- read.csv(file = "netflix_titles.csv")
head(netflix_data, n = 6L)
##   show_id    type                 title        director
## 1      s1   Movie  Dick Johnson Is Dead Kirsten Johnson
## 2      s2 TV Show         Blood & Water                
## 3      s3 TV Show             Ganglands Julien Leclercq
## 4      s4 TV Show Jailbirds New Orleans                
## 5      s5 TV Show          Kota Factory                
## 6      s6 TV Show         Midnight Mass   Mike Flanagan
##                                                                                                                                                                                                                                                                                                              cast
## 1                                                                                                                                                                                                                                                                                                                
## 2 Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng
## 3                                                                                                                                                             Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera
## 4                                                                                                                                                                                                                                                                                                                
## 5                                                                                                                                                                                                        Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar
## 6                                                                        Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver
##         country         date_added release_year rating  duration
## 1 United States September 25, 2021         2020  PG-13    90 min
## 2  South Africa September 24, 2021         2021  TV-MA 2 Seasons
## 3               September 24, 2021         2021  TV-MA  1 Season
## 4               September 24, 2021         2021  TV-MA  1 Season
## 5         India September 24, 2021         2021  TV-MA 2 Seasons
## 6               September 24, 2021         2021  TV-MA  1 Season
##                                                       listed_in
## 1                                                 Documentaries
## 2               International TV Shows, TV Dramas, TV Mysteries
## 3 Crime TV Shows, International TV Shows, TV Action & Adventure
## 4                                        Docuseries, Reality TV
## 5        International TV Shows, Romantic TV Shows, TV Comedies
## 6                            TV Dramas, TV Horror, TV Mysteries
##                                                                                                                                                description
## 1 As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
## 2      After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
## 3       To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
## 4      Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.
## 5 In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.
## 6 The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.
##    X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13
## 1 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
## 2 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
## 3 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
## 4 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
## 5 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
## 6 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA
# The import produces 14 filler columns after `description`, auto-named
# X, X.1, ..., X.13 (NA in the preview above). Drop them by name instead
# of by position (13:26) so this step keeps working if the number or
# position of filler columns in the CSV changes.
is_filler_col <- grepl("^X(\\.[0-9]+)?$", names(netflix_data))
netflix_data <- netflix_data[, !is_filler_col, drop = FALSE]
str(netflix_data)
## 'data.frame':    8809 obs. of  12 variables:
##  $ show_id     : chr  "s1" "s2" "s3" "s4" ...
##  $ type        : chr  "Movie" "TV Show" "TV Show" "TV Show" ...
##  $ title       : chr  "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
##  $ director    : chr  "Kirsten Johnson" "" "Julien Leclercq" "" ...
##  $ cast        : chr  "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
##  $ country     : chr  "United States" "South Africa" "" "" ...
##  $ date_added  : chr  "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
##  $ release_year: int  2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
##  $ rating      : chr  "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
##  $ duration    : chr  "90 min" "2 Seasons" "1 Season" "1 Season" ...
##  $ listed_in   : chr  "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
##  $ description : chr  "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...

##Data Cleaning

# Count NA cells per column. Blank text fields were read in as empty
# strings (""), not NA, so they are not counted here.
netflix_data %>%
  is.na() %>%
  colSums()
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in  description 
##            0            0            0            0            0            0
# Drop any row that contains an NA, then re-count NA cells per column to
# confirm none remain.
netflix_data <- netflix_data %>%
  na.omit()
netflix_data %>%
  is.na() %>%
  colSums()
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in  description 
##            0            0            0            0            0            0
# Report the cleaned table's size as (rows, columns).
c(nrow(netflix_data), ncol(netflix_data))
## [1] 8809   12

##Netflix Content Strategy Analysis

Q1: Market Dominance by Country (Bar Plot) Scenario: The strategy team wants to identify which 10 countries produce the most content to prioritize regional marketing budgets.

Code:

# Rank countries by the number of titles they are credited on.
# The `country` field can list several countries separated by commas
# (co-productions). Counting the raw string would treat every distinct
# combination as its own "country", so split the field and credit each
# listed country once per title.
country_credits <- netflix_data$country %>%
  str_split(",") %>%
  unlist() %>%
  str_trim()

top_10_countries <- data.frame(country = country_credits) %>%
  filter(country != "") %>%          # blank / missing country entries
  count(country, sort = TRUE) %>%
  head(10)

# Horizontal bars, largest producer at the top.
ggplot(top_10_countries, aes(x = reorder(country, n), y = n)) +
  geom_col(fill = "midnightblue") +
  coord_flip() +
  labs(x = "Country", y = "Number of titles")

Interpretation: This Bar Plot reveals that the United States is the primary content provider, followed significantly by India, suggesting these are the core markets for Netflix.

Q2: Content Evolution Over Time (Histogram) Scenario: Analysts need to see if the library is dominated by “Classic” films or “Modern” releases to adjust their acquisition strategy.

Code:

# Distribution of release years in two-year bins.
ggplot(data = netflix_data, mapping = aes(x = release_year)) +
  geom_histogram(
    binwidth = 2,
    fill = "darkred",
    colour = "white"
  )

Interpretation: The Histogram shows a massive spike in content released after 2015, indicating that Netflix prioritizes newer, contemporary media over historical archives.

Q3: Movie vs. TV Show Ratio Scenario: A stakeholder asks for the current split between Movies and TV Shows to decide if they should invest more in episodic content.

Code:

# Number of titles per content type (Movie vs. TV Show).
summarise(
  group_by(netflix_data, type),
  Total = n()
)
## # A tibble: 2 × 2
##   type    Total
##   <chr>   <int>
## 1 Movie    6132
## 2 TV Show  2677

Interpretation: The output shows that Movies (6,132) outnumber TV Shows (2,677) by more than two to one, which is consistent with a catalogue that is still predominantly movie-centric.

Q4: Film Length Trends (Histogram) Scenario: To optimize streaming bandwidth, the tech team wants to know the most common duration for movies.

Code:

# Keep movies only and turn the "<n> min" duration text into a number.
# Values that do not parse as a number become NA; the histogram layer
# drops those rows and reports them in a warning.
movie_duration <- netflix_data %>%
  filter(type == "Movie")
movie_duration$duration_min <- movie_duration$duration %>%
  str_remove(" min") %>%
  as.numeric()

# Movie running time in 10-minute bins.
ggplot(data = movie_duration, mapping = aes(x = duration_min)) +
  geom_histogram(binwidth = 10, fill = "seagreen")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

Interpretation: This Histogram shows a roughly bell-shaped (approximately normal) distribution centered around 90-110 minutes, which is the typical length of a feature film.

Q5: Audience Suitability (Bar Plot) Scenario: The legal team needs to know the distribution of maturity ratings to ensure compliance with global safety standards.

Code:

# The eight most frequent maturity ratings, ignoring blank entries.
rating_counts <- netflix_data %>%
  filter(rating != "") %>%
  count(rating, sort = TRUE) %>%
  head(8)

# ggplot orders a character x-axis alphabetically, which discards the
# sorted order computed above. Reorder by count (largest first), the same
# way the country chart does, so the most frequent rating is easy to read.
ggplot(rating_counts, aes(x = reorder(rating, -n), y = n)) +
  geom_col(fill = "orange") +
  labs(x = "Rating", y = "Number of titles")

Interpretation: The Bar Plot indicates that ‘TV-MA’ (Mature Audiences) is the most frequent rating, showing that a large portion of Netflix content is intended for adults.

Q6: Content Growth in a Specific Year Scenario: How many titles were added to the platform in the year 2021? Code:

# Number of titles whose date_added text mentions 2021.
netflix_data$date_added %>%
  str_detect("2021") %>%
  sum(na.rm = TRUE)
## [1] 1498

Interpretation: This count is the number of titles added to the library during 2021. Note that 2021 is not strictly the final year in the data: the last rows of the dataset include titles added in 2024.

Q7: Indian Content Deep Dive Scenario: List the first 5 titles produced exclusively in India. Code:

# First five titles whose country field is exactly "India".
netflix_data %>%
  filter(country %in% "India") %>%
  head(5) %>%
  select(title, release_year)
##                           title release_year
## 1                  Kota Factory         2021
## 2                         Jeans         1998
## 3                  Chhota Bheem         2021
## 4                 Dharmakshetra         2014
## 5 Raja Rasoi Aur Anya Kahaniyan         2014

Interpretation: These results show a snapshot of the diversity of Indian content (both films and series) available on the platform.

Q8: The “Oldest” Asset Scenario: Find the oldest movie available in the dataset for a “Vintage Cinema” campaign. Code:

# Movie with the earliest release year. If several movies share that
# year, only the first one in file order is shown.
netflix_data %>%
  filter(type == "Movie") %>%
  select(title, release_year) %>%
  arrange(release_year) %>%
  head(1)
##            title release_year
## 1 Prelude to War         1942

Interpretation: This identifies the earliest piece of cinematic history currently hosted on the service.

Q9: Longest Running Series Scenario: Identify the TV Shows with the highest number of seasons. Code:

# TV shows only, with the leading number of the duration text
# (e.g. "2 Seasons") extracted as a numeric season count.
tv_data <- netflix_data %>%
  filter(type == "TV Show")
tv_data$seasons <- tv_data$duration %>%
  str_extract("\\d+") %>%
  as.numeric()

# Five shows with the most seasons.
tv_data %>%
  select(title, seasons) %>%
  arrange(desc(seasons)) %>%
  head(5)
##                    title seasons
## 1         Grey's Anatomy      17
## 2           Supernatural      15
## 3                   NCIS      15
## 4              Heartland      13
## 5 COMEDIANS of the world      13

Interpretation: Shows with the most seasons represent successful franchises with high user retention.

Q10: Most Prolific Director Scenario: Which director has the most titles listed on Netflix? Code:

# Director credit that appears on the most titles. Each distinct
# `director` string is counted as-is; blank credits are excluded.
netflix_data %>%
  count(director, sort = TRUE) %>%
  filter(director != "") %>%
  head(1)
##        director  n
## 1 Rajiv Chilaka 19

Interpretation: This highlights the director with the strongest professional relationship with the platform.

Q11: Identifying Missing Metadata Scenario: How many entries are missing “Director” information? Code:

# Titles with no director recorded (either NA or an empty string).
sum(netflix_data$director %in% c(NA, ""))
## [1] 2634

Interpretation: This informs the data cleaning team about the volume of incomplete records.

Q12: Horror Genre Popularity in the US Scenario: Count how many “Horror” titles are available for the United States market. Code:

# Titles whose country field is exactly "United States" and whose genre
# list mentions "Horror".
with(
  netflix_data,
  sum(
    country == "United States" & str_detect(listed_in, "Horror"),
    na.rm = TRUE
  )
)
## [1] 172

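Interpretation: This metric helps in understanding the availability of niche genres in specific regions. Note that the filter requires the country field to be exactly "United States", so titles that list several countries are not counted.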

Q13: Average Year of TV Show Production Scenario: What is the average release year for TV Shows? Code:

# Mean release year across all TV shows.
summarise(
  filter(netflix_data, type == "TV Show"),
  avg_year = mean(release_year)
)
##   avg_year
## 1 2016.609

Interpretation: A high average year suggests that TV shows on Netflix are generally very recent productions.

Q14: Search for Holiday Content Scenario: Find titles that include the word “Christmas” for a seasonal recommendation list. Code:

# First five titles containing the word "Christmas" (case-sensitive).
netflix_data %>%
  select(title) %>%
  filter(str_detect(title, "Christmas")) %>%
  head(5)
##                       title
## 1        Home for Christmas
## 2 An Unremarkable Christmas
## 3     How To Ruin Christmas
## 4    A California Christmas
## 5   A Trash Truck Christmas

Interpretation: This provides a quick way to curate themed content for users.

Q15: Genre Comparison: Action vs. Comedy Scenario: Compare the total count of “Action & Adventure” vs “Comedies”. Code:

# Count titles whose genre list contains "Action" and, separately,
# "Comedies", then show both totals side by side.
action <- netflix_data$listed_in %>%
  str_detect("Action") %>%
  sum()
comedy <- netflix_data$listed_in %>%
  str_detect("Comedies") %>%
  sum()

data.frame(
  Action = action,
  Comedy = comedy
)
##   Action Comedy
## 1   1028   2255

Interpretation: This comparison helps determine which genre is more heavily represented in the library.

Q16: Stand-Up Comedy Availability Scenario: How many “Stand-Up Comedy” specials are in the dataset? Code:

# Number of titles tagged "Stand-Up Comedy" in their genre list.
sum(
  str_detect(netflix_data$listed_in, "Stand-Up Comedy"),
  na.rm = TRUE
)
## [1] 399

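Interpretation: This shows the scale of Netflix’s stand-up comedy catalogue. The dataset has no field that separates Netflix originals from licensed specials, so the count covers both.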

Q17: Data Freshness Check Scenario: Display the last 5 rows of the dataset to see the most recent entries. Code:

# Last five rows of the table, limited to the identifying columns.
tail(netflix_data[c("title", "type", "date_added")], n = 5)
##                   title    type       date_added
## 8805         Zombieland   Movie November 1, 2019
## 8806               Zoom   Movie January 11, 2020
## 8807             Zubaan   Movie    March 2, 2019
## 8808 Parasyte: The Grey TV Show    April 5, 2024
## 8809             Serena   Movie    April 5, 2024

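Interpretation: This confirms that the data has been loaded correctly through to the last row. The rows are not sorted by date_added (the last five range from 2019 to 2024), so the tail of the file is not necessarily the set of most recent additions.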