Netflix Movie Duration Analysis

Author

Yohannes Gebretsadik

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw Netflix catalogue from the working directory.
df <- read.csv("netflix_titles.csv")

# Build the analysis table:
#   1. keep rows whose `type` is "Movie";
#   2. strip every non-digit character from `duration` and convert the
#      remainder to a number, stored as `duration_num`;
#   3. drop rows where that conversion gave NA, and rows with a
#      duration of 30 or less.
movies <- df %>%
  filter(type == "Movie") %>%
  mutate(
    duration_num = as.numeric(gsub("[^0-9]", "", duration))
  ) %>%
  filter(!is.na(duration_num) & duration_num > 30)
# Peek at the first few parsed durations as a sanity check on the cleaning.
head(movies[["duration_num"]])
[1]  90  91 125 104 127  91
# Average duration across the filtered movies.
mean(movies[["duration_num"]])
[1] 101.2454
# Shortest duration that survived the `> 30` filter.
min(movies[["duration_num"]])
[1] 31
# Longest duration in the filtered movies.
max(movies[["duration_num"]])
[1] 312
# Simple linear regression of movie duration on release year, fitted on
# the filtered `movies` table.
model <- lm(duration_num ~ release_year, data = movies)

# Print the residual summary, coefficient table, and overall fit statistics.
summary(model) 

Call:
lm(formula = duration_num ~ release_year, data = movies)

Residuals:
     Min       1Q   Median       3Q      Max 
-107.557  -13.465   -1.783   13.595  213.535 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1234.41935   68.73329   17.96   <2e-16 ***
release_year   -0.56291    0.03414  -16.49   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 25.62 on 5996 degrees of freedom
Multiple R-squared:  0.04337,   Adjusted R-squared:  0.04321 
F-statistic: 271.8 on 1 and 5996 DF,  p-value: < 2.2e-16

Histogram

# Histogram of movie durations using bins 10 units wide.
ggplot(data = movies, mapping = aes(duration_num)) +
  geom_histogram(
    binwidth = 10,
    colour = "black",
    fill = "steelblue",
    alpha = 0.8
  ) +
  ggtitle("Distribution of Movies Durations") +
  xlab("Duration (minutes)") +
  ylab("Count")

Boxplot

# Horizontal boxplot of movie durations.
# Duration is mapped to x, so the axis label belongs on x; the original
# put it on y, which mislabelled the empty axis and left x showing the
# raw column name. With a single box the y axis carries no information,
# so its title and tick labels are removed.
ggplot(movies, aes(x = duration_num)) +
  geom_boxplot(fill = "purple", color = "black", alpha = 0.7) +
  labs(
    title = "Boxplot of Movies Durations",
    x = "Duration (minutes)",
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.text.y = element_blank())

Density Plot

# Kernel density estimate of movie durations, drawn as a filled curve.
ggplot(data = movies, mapping = aes(duration_num)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Density of Movies Durations") +
  xlab("Duration (minutes)") +
  ylab("Density")

Linear Regression Plot

# Collapse the content rating into broad audience groups for colouring.
# The original mapping only recognised MPAA film ratings, so every title
# carrying a TV Parental Guidelines rating fell into "Other". The TV
# ratings are added to the corresponding groups here; the MPAA mappings
# are unchanged.
# NOTE(review): the exact set of values in `rating` is not visible here;
# confirm with count(movies, rating) that these labels match the data.
model_plot <- movies %>%
  mutate(rating_group = case_when(
    rating %in% c("G", "PG", "TV-Y", "TV-Y7", "TV-Y7-FV", "TV-G", "TV-PG") ~
      "Kids",
    rating %in% c("PG-13", "TV-14") ~ "Teens",
    rating %in% c("R", "NC-17", "TV-MA") ~ "Adults",
    TRUE ~ "Other"
  ))

# Scatter of duration against release year, with points coloured by
# audience group and a linear fit (with confidence band) drawn in black.
ggplot(
  model_plot,
  aes(x = release_year, y = duration_num, color = rating_group)
) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "black", se = TRUE) +
  labs(
    title = "Linear Regression: Movies Duration vs Release Year by Audience",
    x = "Release Year",
    y = "Duration (minutes)",
    color = "Audience"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

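Overall, most movies are around 100 minutes long (mean ≈ 101 minutes), with the majority falling between 90 and 120 minutes. The regression shows a statistically significant but small decrease in duration over time (about 0.56 minutes per release year), and release year explains only about 4% of the variation (R² ≈ 0.04), so movie length has stayed fairly consistent overall.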