##Load library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
##Data Import
# Load the raw catalogue from the working directory and preview the first rows.
netflix_data <- read.csv(file = "netflix_titles.csv")
head(netflix_data, n = 6L)
## show_id type title director
## 1 s1 Movie Dick Johnson Is Dead Kirsten Johnson
## 2 s2 TV Show Blood & Water
## 3 s3 TV Show Ganglands Julien Leclercq
## 4 s4 TV Show Jailbirds New Orleans
## 5 s5 TV Show Kota Factory
## 6 s6 TV Show Midnight Mass Mike Flanagan
## cast
## 1
## 2 Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng
## 3 Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera
## 4
## 5 Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar
## 6 Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver
## country date_added release_year rating duration
## 1 United States September 25, 2021 2020 PG-13 90 min
## 2 South Africa September 24, 2021 2021 TV-MA 2 Seasons
## 3 September 24, 2021 2021 TV-MA 1 Season
## 4 September 24, 2021 2021 TV-MA 1 Season
## 5 India September 24, 2021 2021 TV-MA 2 Seasons
## 6 September 24, 2021 2021 TV-MA 1 Season
## listed_in
## 1 Documentaries
## 2 International TV Shows, TV Dramas, TV Mysteries
## 3 Crime TV Shows, International TV Shows, TV Action & Adventure
## 4 Docuseries, Reality TV
## 5 International TV Shows, Romantic TV Shows, TV Comedies
## 6 TV Dramas, TV Horror, TV Mysteries
## description
## 1 As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
## 2 After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
## 3 To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
## 4 Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.
## 5 In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.
## 6 The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.
## X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Drop the empty, auto-named columns (X, X.1, ..., X.13) shown in the preview
# above. They are selected by name rather than by position (13:26) so the step
# keeps working if the number or order of columns in the CSV changes.
is_padding_col <- grepl("^X(\\.[0-9]+)?$", names(netflix_data))
netflix_data <- netflix_data[, !is_padding_col, drop = FALSE]
str(netflix_data)
## 'data.frame': 8809 obs. of 12 variables:
## $ show_id : chr "s1" "s2" "s3" "s4" ...
## $ type : chr "Movie" "TV Show" "TV Show" "TV Show" ...
## $ title : chr "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
## $ director : chr "Kirsten Johnson" "" "Julien Leclercq" "" ...
## $ cast : chr "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
## $ country : chr "United States" "South Africa" "" "" ...
## $ date_added : chr "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
## $ release_year: int 2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
## $ rating : chr "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
## $ duration : chr "90 min" "2 Seasons" "1 Season" "1 Season" ...
## $ listed_in : chr "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
## $ description : chr "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
##Data Cleaning
# Count missing values per column. read.csv() stored blank cells as "" rather
# than NA (see `director`, `cast` and `country` in str() above), so an
# is.na()-only check reports 0 for every column. Blank strings are counted as
# missing here as well.
# The all-zero output previously shown for this step came from the is.na()-only
# check and is superseded; re-knit to print the updated counts.
colSums(is.na(netflix_data) | netflix_data == "")
# Remove rows that contain NA, then confirm that no NA values remain.
# NOTE(review): blank cells are stored as "" in this data, so na.omit() removes
# nothing here (dim() below still reports 8809 rows).
netflix_data <- stats::na.omit(netflix_data)
na_per_column <- is.na(netflix_data)
colSums(na_per_column)
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 0 0 0 0 0 0
# Rows and columns remaining after cleaning.
c(nrow(netflix_data), ncol(netflix_data))
## [1] 8809 12
##Netflix Content Strategy Analysis
Q1: Market Dominance by Country (Bar Plot) Scenario: The strategy team wants to identify which 10 countries produce the most content to prioritize regional marketing budgets.
Code:
# Q1: the ten `country` values with the most titles.
# Rows with a blank country are excluded. `country` is tallied verbatim, so any
# value naming several countries counts as its own category.
top_10_countries <- netflix_data %>%
  filter(country != "") %>%
  count(country, sort = TRUE) %>%
  slice_head(n = 10)

# Horizontal bars ordered by count (largest at the top after coord_flip()).
ggplot(top_10_countries, aes(x = reorder(country, n), y = n)) +
  geom_col(fill = "midnightblue") +
  coord_flip() +
  # Without explicit labels the axis title would read "reorder(country, n)".
  labs(x = "Country", y = "Number of titles")
Interpretation: This Bar Plot reveals that the United States is the primary content provider, followed by India in second place, suggesting these are the core markets for Netflix.
Q2: Content Evolution Over Time (Histogram) Scenario: Analysts need to see if the library is dominated by “Classic” films or “Modern” releases to adjust their acquisition strategy.
Code:
# Q2: distribution of release years in two-year bins.
ggplot(data = netflix_data, mapping = aes(x = release_year)) +
  geom_histogram(binwidth = 2, colour = "white", fill = "darkred")
Interpretation: The Histogram shows a massive spike in content released after 2015, indicating that Netflix prioritizes newer, contemporary media over historical archives.
Q3: Movie vs. TV Show Ratio Scenario: A stakeholder asks for the current split between Movies and TV Shows to decide if they should invest more in episodic content.
Code:
# Q3: number of titles per content type.
summarise(group_by(netflix_data, type), Total = n())
## # A tibble: 2 × 2
## type Total
## <chr> <int>
## 1 Movie 6132
## 2 TV Show 2677
Interpretation: The output shows that Movies still outnumber TV Shows on the platform, suggesting Netflix started as a movie-centric service.
Q4: Film Length Trends (Histogram) Scenario: To optimize streaming bandwidth, the tech team wants to know the most common duration for movies.
Code:
# Q4: movie runtimes in minutes, parsed from strings such as "90 min".
# Durations that do not parse as a number become NA and are dropped by the
# histogram with a warning.
movie_duration <- mutate(
  filter(netflix_data, type == "Movie"),
  duration_min = as.numeric(str_remove(duration, " min"))
)
ggplot(data = movie_duration, mapping = aes(x = duration_min)) +
  geom_histogram(fill = "seagreen", binwidth = 10)
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
Interpretation: This Histogram shows a roughly bell-shaped distribution centered around 90-110 minutes, which is the industry standard for feature films. (Three movies with no parseable duration are excluded from the plot.)
Q5: Audience Suitability (Bar Plot) Scenario: The legal team needs to know the distribution of maturity ratings to ensure compliance with global safety standards.
Code:
# Q5: the eight most frequent maturity ratings (blank ratings excluded).
rating_counts <- netflix_data %>%
  filter(rating != "") %>%
  count(rating, sort = TRUE) %>%
  slice_head(n = 8)

# ggplot orders a character x-axis alphabetically, which discards the sort
# above. Reorder the bars by descending count, consistent with the Q1 chart.
ggplot(rating_counts, aes(x = reorder(rating, -n), y = n)) +
  geom_col(fill = "orange") +
  labs(x = "rating")
Interpretation: The Bar Plot indicates that ‘TV-MA’ (Mature Audiences) is the most frequent rating, showing that a large portion of Netflix content is intended for adults.
Q6: Content Growth in a Specific Year Scenario: How many titles were added to the platform in the year 2021? Code:
# Q6: number of rows whose date_added text contains "2021".
sum(str_detect(netflix_data$date_added, "2021"), na.rm = TRUE)
## [1] 1498
Interpretation: This count represents the library expansion during 2021. Note that 2021 is not the last year in the data: the final rows of the dataset include titles added in April 2024.
Q7: Indian Content Deep Dive Scenario: List the first 5 titles produced exclusively in India. Code:
# Q7: first five titles whose country field is exactly "India".
netflix_data %>%
  filter(country == "India") %>%
  slice_head(n = 5) %>%
  select(title, release_year)
## title release_year
## 1 Kota Factory 2021
## 2 Jeans 1998
## 3 Chhota Bheem 2021
## 4 Dharmakshetra 2014
## 5 Raja Rasoi Aur Anya Kahaniyan 2014
Interpretation: These results show a snapshot of the diversity in Indian cinema available on the platform.
Q8: The “Oldest” Asset Scenario: Find the oldest movie available in the dataset for a “Vintage Cinema” campaign. Code:
# Q8: the movie with the earliest release year (first row after sorting).
netflix_data %>%
  filter(type == "Movie") %>%
  select(title, release_year) %>%
  arrange(release_year) %>%
  slice_head(n = 1)
## title release_year
## 1 Prelude to War 1942
Interpretation: This identifies the earliest piece of cinematic history currently hosted on the service.
Q9: Longest Running Series Scenario: Identify the TV Shows with the highest number of seasons. Code:
# Q9: parse the leading number out of duration (e.g. "2 Seasons" -> 2) for TV
# shows, then list the five shows with the most seasons.
tv_data <- mutate(
  filter(netflix_data, type == "TV Show"),
  seasons = as.numeric(str_extract(duration, "\\d+"))
)
tv_data %>%
  select(title, seasons) %>%
  arrange(desc(seasons)) %>%
  slice_head(n = 5)
## title seasons
## 1 Grey's Anatomy 17
## 2 Supernatural 15
## 3 NCIS 15
## 4 Heartland 13
## 5 COMEDIANS of the world 13
Interpretation: Shows with the most seasons represent successful franchises with high user retention.
Q10: Most Prolific Director Scenario: Which director has the most titles listed on Netflix? Code:
# Q10: the director value that appears on the most rows (blank values ignored).
director_counts <- count(filter(netflix_data, director != ""), director, sort = TRUE)
slice_head(director_counts, n = 1)
## director n
## 1 Rajiv Chilaka 19
Interpretation: This highlights the director with the strongest professional relationship with the platform.
Q11: Identifying Missing Metadata Scenario: How many entries are missing “Director” information? Code:
# Q11: rows where director is NA or an empty string.
with(netflix_data, sum(is.na(director) | director == ""))
## [1] 2634
Interpretation: This informs the data cleaning team about the volume of incomplete records.
Q12: Horror Genre Popularity in the US Scenario: Count how many “Horror” titles are available for the United States market. Code:
# Q12: rows whose country is exactly "United States" and whose genre list
# mentions "Horror".
netflix_data %>%
  filter(country == "United States") %>%
  filter(str_detect(listed_in, "Horror")) %>%
  nrow()
## [1] 172
Interpretation: This metric helps in understanding the availability of niche genres in specific regions.
Q13: Average Year of TV Show Production Scenario: What is the average release year for TV Shows? Code:
# Q13: mean release year across TV shows.
summarise(filter(netflix_data, type == "TV Show"), avg_year = mean(release_year))
## avg_year
## 1 2016.609
Interpretation: A high average year suggests that TV shows on Netflix are generally very recent productions.
Q14: Search for Holiday Content Scenario: Find titles that include the word “Christmas” for a seasonal recommendation list. Code:
# Q14: first five titles containing "Christmas" (case-sensitive match).
netflix_data %>%
  filter(str_detect(title, "Christmas")) %>%
  slice_head(n = 5) %>%
  select(title)
## title
## 1 Home for Christmas
## 2 An Unremarkable Christmas
## 3 How To Ruin Christmas
## 4 A California Christmas
## 5 A Trash Truck Christmas
Interpretation: This provides a quick way to curate themed content for users.
Q15: Genre Comparison: Action vs. Comedy Scenario: Compare the total count of “Action & Adventure” vs “Comedies”. Code:
# Q15: count rows whose genre list mentions "Action" or "Comedies".
genre_lists <- netflix_data$listed_in
action <- sum(str_detect(genre_lists, "Action"))
comedy <- sum(str_detect(genre_lists, "Comedies"))
data.frame(Action = action, Comedy = comedy)
## Action Comedy
## 1 1028 2255
Interpretation: This comparison helps determine which genre is more heavily represented in the library.
Q16: Stand-Up Comedy Availability Scenario: How many “Stand-Up Comedy” specials are in the dataset? Code:
# Q16: number of rows tagged "Stand-Up Comedy".
sum(str_detect(netflix_data$listed_in, "Stand-Up Comedy"), na.rm = TRUE)
## [1] 399
Interpretation: Shows the scale of Netflix’s investment in original comedy specials.
Q17: Data Freshness Check Scenario: Display the last 5 rows of the dataset to see the most recent entries. Code:
# Q17: the last five rows, limited to the identifying columns.
preview_cols <- c("title", "type", "date_added")
tail(netflix_data[preview_cols], n = 5)
## title type date_added
## 8805 Zombieland Movie November 1, 2019
## 8806 Zoom Movie January 11, 2020
## 8807 Zubaan Movie March 2, 2019
## 8808 Parasyte: The Grey TV Show April 5, 2024
## 8809 Serena Movie April 5, 2024
Interpretation: This confirms the latest additions and ensures the data has been loaded correctly.
CA3
# Install ggplot2 only when it is not already available. An unconditional
# install.packages() call re-downloads on every run and, when the package is
# already loaded, only emits an "in use and will not be installed" warning.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
Scenario- The company wants to check whether newer content tends to be longer.
Question 18 - Is there a relationship between release year and movie duration?
Code-
# Q18: correlation between release year and movie runtime (minutes).
movie_data <- netflix_data %>%
  filter(type == "Movie") %>%
  mutate(duration_min = as.numeric(str_remove(duration, " min")))

# Three movies have no parseable duration, so duration_min contains NA and
# cor() with its default use = "everything" returns NA. Restrict the
# calculation to complete pairs to get a usable coefficient.
cor(movie_data$release_year, movie_data$duration_min, use = "complete.obs")
# Scatter plot of runtime against release year.
ggplot(data = movie_data, mapping = aes(x = release_year, y = duration_min)) +
  geom_point() +
  labs(title = "Release Year vs Duration")
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
Interpretation
Shows whether modern movies are longer or shorter.
Scenario: The content strategy team wants to understand whether newer TV shows tend to have more seasons.
Question 19: Is there a relationship between release year and number of seasons?
Code:
# Q19: correlation between release year and season count for TV shows.
tv_data <- mutate(
  filter(netflix_data, type == "TV Show"),
  seasons = as.numeric(str_extract(duration, "\\d+"))
)
with(tv_data, cor(release_year, seasons))
## [1] -0.09039468
# Scatter plot of season count against release year.
ggplot(data = tv_data, mapping = aes(x = release_year, y = seasons)) +
  geom_point() +
  labs(title = "Release Year vs Seasons")
Output:
Correlation value and scatter plot.
Interpretation:
The correlation of about -0.09 is very weak and slightly negative, so newer TV shows do not tend to have more seasons; if anything, they have marginally fewer.
Scenario: The company wants to compare whether Movies and TV Shows differ in their release years.
Question 20: Is there a significant difference in release year between Movies and TV Shows?
Code:
# Q20: one-way ANOVA of release year by content type.
year_by_type_aov <- aov(release_year ~ type, data = netflix_data)
summary(year_by_type_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## type 1 22657 22657 301.2 <2e-16 ***
## Residuals 8807 662373 75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Release-year spread for each content type.
ggplot(data = netflix_data, mapping = aes(x = type, y = release_year)) +
  geom_boxplot()
Output:
ANOVA table and boxplot.
Interpretation:
If the p-value is below 0.05, Movies and TV Shows differ significantly in release year. Here p < 2e-16 (F = 301.2), so the difference is statistically significant.
Scenario: The company wants to check whether content rating affects release trends.
Question 21: Is there a difference in release year across ratings?
Code:
# Q21: one-way ANOVA of release year by rating (all rating values as they
# appear in the data, including blanks).
year_by_rating_aov <- aov(release_year ~ rating, data = netflix_data)
summary(year_by_rating_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## rating 18 69231 3846 54.9 <2e-16 ***
## Residuals 8790 615798 70
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Release-year spread for each rating value.
ggplot(data = netflix_data, mapping = aes(x = rating, y = release_year)) +
  geom_boxplot()
Output:
ANOVA summary and visualization.
Interpretation:
Shows whether certain ratings are more common in recent years.
Scenario: Analysts want to study how content has evolved over time.
Question 22: Is there a trend in release year and movie duration?
Code:
# Q22: linear trend of movie runtime over release year.
movie_data <- mutate(
  filter(netflix_data, type == "Movie"),
  duration_min = as.numeric(str_remove(duration, " min"))
)
# The lm() call is kept verbatim because summary() echoes it in the output.
model1 <- lm(duration_min ~ release_year, data = movie_data)
summary(model1)
##
## Call:
## lm(formula = duration_min ~ release_year, data = movie_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.454 -13.063 -0.638 14.702 215.362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1313.18631 73.54731 17.86 <2e-16 ***
## release_year -0.60285 0.03653 -16.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.68 on 6127 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.04255, Adjusted R-squared: 0.04239
## F-statistic: 272.3 on 1 and 6127 DF, p-value: < 2.2e-16
# Scatter plot with the fitted straight line overlaid.
ggplot(data = movie_data, mapping = aes(x = release_year, y = duration_min)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Release Year vs Duration")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
Output:
Regression summary and trend line.
Interpretation:
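Shows whether movie duration is increasing or decreasing over time. The slope is about -0.60 minutes per release year (p < 2e-16), so newer movies tend to be slightly shorter; however, R-squared is only about 0.04, so release year explains very little of the variation in duration.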
Scenario: The company wants to analyze how movie length changes over time.
Question 23: Does movie duration depend on release year?
Code:
# Q23: regress movie runtime (minutes) on release year.
movies_only <- filter(netflix_data, type == "Movie")
movie_data <- mutate(
  movies_only,
  duration_min = as.numeric(str_remove(duration, " min"))
)
# The lm() call is kept verbatim because summary() echoes it in the output.
model2 <- lm(duration_min ~ release_year, data = movie_data)
summary(model2)
##
## Call:
## lm(formula = duration_min ~ release_year, data = movie_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.454 -13.063 -0.638 14.702 215.362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1313.18631 73.54731 17.86 <2e-16 ***
## release_year -0.60285 0.03653 -16.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.68 on 6127 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.04255, Adjusted R-squared: 0.04239
## F-statistic: 272.3 on 1 and 6127 DF, p-value: < 2.2e-16
# Scatter plot of runtime against release year with a linear fit.
ggplot(data = movie_data, mapping = aes(x = release_year, y = duration_min)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
Output:
Regression model and plot.
Interpretation:
Shows whether movies are becoming longer or shorter over time.
Scenario: The company wants to understand the proportion of Movies and TV Shows available on the platform to plan content strategy.
Question 24: What is the percentage distribution of Movies and TV Shows on Netflix?
Code:
# Q24: share of Movies vs TV Shows.
type_counts <- netflix_data %>%
  count(type)

# The question asks for the percentage distribution, so each slice label
# carries its share of all titles (rounded to one decimal place).
type_share <- round(100 * type_counts$n / sum(type_counts$n), 1)
pie(type_counts$n,
    labels = paste0(type_counts$type, " (", type_share, "%)"),
    main = "Distribution of Movies vs TV Shows")
Output:
Pie chart showing proportion of Movies and TV Shows.
Interpretation:
The pie chart shows the relative share of Movies and TV Shows. If Movies occupy a larger portion, it indicates that Netflix focuses more on movie content compared to TV shows.
Scenario: The analyst wants to check whether content type (Movie/TV Show) affects the rating category.
Question 25: Is there a significant difference in ratings between Movies and TV Shows?
Code:
# Q25: ANOVA of a numeric rating code by content type.
# NOTE(review): factor() assigns codes in sorted level order, so the numeric
# code has no inherent magnitude. For a categorical rating, consider a test of
# independence on table(type, rating) instead.
rated <- transform(netflix_data, rating_code = as.numeric(factor(rating)))
anova_model <- aov(rating_code ~ type, data = rated)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## type 1 2832 2831.7 789 <2e-16 ***
## Residuals 8807 31610 3.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Scenario: The company suspects a non-linear relationship between year and duration.
Question 26: Is the relationship non-linear?
Code:
# Q26: quadratic fit of movie runtime on release year.
# poly(release_year, 2) supplies a degree-2 orthogonal polynomial basis, so the
# two coefficients reported are for the basis terms, not for raw year and year^2.
# Relies on movie_data (with duration_min) built in an earlier step.
model_poly <- lm(duration_min ~ poly(release_year, 2), data = movie_data)
# Print coefficients, R-squared and the overall F-test for the quadratic model.
summary(model_poly)
##
## Call:
## lm(formula = duration_min ~ poly(release_year, 2), data = movie_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97.452 -13.855 -0.239 14.842 216.910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.5792 0.3494 285.01 <2e-16 ***
## poly(release_year, 2)1 -456.8234 27.3533 -16.70 <2e-16 ***
## poly(release_year, 2)2 -334.6307 27.3544 -12.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.35 on 6126 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.06538, Adjusted R-squared: 0.06508
## F-statistic: 214.3 on 2 and 6126 DF, p-value: < 2.2e-16
# Scatter plot with the fitted quadratic curve overlaid.
ggplot(data = movie_data, mapping = aes(x = release_year, y = duration_min)) +
  geom_point() +
  geom_smooth(formula = y ~ poly(x,2), method = "lm")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
Output:
Polynomial curve.
Interpretation:
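Shows a curved relationship if one is present. Here the quadratic term is significant (t = -12.23, p < 2e-16) and R-squared is about 0.065, which indicates some curvature, although most of the variation in duration remains unexplained.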
Scenario: The company wants to understand whether longer TV shows (more seasons) are associated with newer releases.
Question 27: Is there a relationship between number of seasons and release year?
Code:
# Q27: correlation between season count and release year for TV shows.
tv_shows <- filter(netflix_data, type == "TV Show")
tv_data <- mutate(tv_shows, seasons = as.numeric(str_extract(duration, "\\d+")))
with(tv_data, cor(seasons, release_year))
## [1] -0.09039468
Output:
Correlation coefficient value.
Interpretation:
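The coefficient of about -0.09 indicates a very weak negative relationship: the number of seasons is almost unrelated to release year, with newer shows having marginally fewer seasons.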
Scenario: The company wants to determine whether a linear or polynomial model better explains movie duration.
Scenario:
The company wants to estimate the number of seasons for a TV show released in a given year.
Question 28:
What is the predicted number of seasons for a TV show released in 2020?
Code:
# Q28: linear model of season count on release year, evaluated at 2020.
# Relies on tv_data (with seasons) built in an earlier step.
model_tv <- lm(seasons ~ release_year, data = tv_data)
release_2020 <- data.frame(release_year = 2020)
predict(model_tv, newdata = release_2020)
## 1
## 1.680152
Output:
Predicted number of seasons.
Interpretation:
Provides an estimate of how long a TV show might run based on its release year.
Scenario:
The company wants to predict movie duration based on its release year and rating.
Question 29:
What is the predicted duration of a movie released in 2018 with rating “PG-13”?
Code:
# Q29: runtime modelled on release year plus rating, evaluated for a 2018
# "PG-13" movie. Relies on movie_data built in an earlier step.
model_multi <- lm(duration_min ~ release_year + rating, data = movie_data)
pg13_2018 <- data.frame(release_year = 2018, rating = "PG-13")
predict(model_multi, newdata = pg13_2018)
## 1
## 104.2375
Output:
Predicted movie duration.
Interpretation:
Provides an estimate of movie length based on year and rating, helping in planning and recommendations.
Scenario: The analyst wants to understand the relationships between numerical variables in the dataset to identify patterns and dependencies.
Question 30: What is the correlation between release year and movie duration?
Code:
# Q30: correlation matrix for release year and movie runtime.
# duration_min contains NA for the three movies without a parseable duration,
# which made the off-diagonal entries NA under the default use = "everything".
# Use complete rows only. The NA matrix previously printed here is superseded;
# re-knit to print the updated values.
cor(movie_data[, c("release_year", "duration_min")], use = "complete.obs")
Output:
Correlation matrix showing relationship between variables.
Interpretation:
The correlation value indicates the strength and direction of the relationship between release year and movie duration.
A value close to 1 indicates a strong positive relationship. A value close to -1 indicates a strong negative relationship. A value near 0 indicates little or no relationship.