library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(tidyr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(tsibble)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

Introduction

This project analyzes the trends, patterns, and relationships in TV show ratings over time. The primary objective is to identify the factors influencing viewer ratings and assess how these insights can improve content quality and audience engagement.

Dataset Overview

The dataset includes information on TV shows, such as ratings, number of episodes, vote counts, and release years. Below is a brief summary of the dataset:

# Read dataset
file_path <- "/Users/saransh/Downloads/TMDB_tv_dataset_v3.csv"
tv_data <- read.csv(file_path, stringsAsFactors = FALSE)

# Summary statistics of the dataset
glimpse(tv_data)

## Rows: 168,639
## Columns: 29
## $ id                   <int> 1399, 71446, 66732, 1402, 63174, 69050, 93405, 13…
## $ name                 <chr> "Game of Thrones", "Money Heist", "Stranger Thing…
## $ number_of_seasons    <int> 8, 3, 4, 11, 6, 7, 2, 5, 6, 1, 9, 12, 2, 19, 3, 3…
## $ number_of_episodes   <int> 73, 41, 34, 177, 93, 137, 9, 62, 116, 9, 184, 279…
## $ original_language    <chr> "en", "es", "en", "en", "en", "en", "ko", "en", "…
## $ vote_count           <int> 21857, 17836, 16161, 15432, 13870, 13180, 13053, …
## $ vote_average         <dbl> 8.442, 8.257, 8.624, 8.121, 8.486, 8.479, 7.831, …
## $ overview             <chr> "Seven noble families fight for control of the my…
## $ adult                <chr> "False", "False", "False", "False", "False", "Fal…
## $ backdrop_path        <chr> "/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg", "/gFZriCkpJYsA…
## $ first_air_date       <chr> "2011-04-17", "2017-05-02", "2016-07-15", "2010-1…
## $ last_air_date        <chr> "2019-05-19", "2021-12-03", "2022-07-01", "2022-1…
## $ homepage             <chr> "http://www.hbo.com/game-of-thrones", "https://ww…
## $ in_production        <chr> "False", "False", "True", "False", "False", "Fals…
## $ original_name        <chr> "Game of Thrones", "La Casa de Papel", "Stranger …
## $ popularity           <dbl> 1083.917, 96.354, 185.711, 489.746, 416.668, 143.…
## $ poster_path          <chr> "/1XS1oqL89opfnbLl8WnZY1O1uJx.jpg", "/reEMJA1uzsc…
## $ type                 <chr> "Scripted", "Scripted", "Scripted", "Scripted", "…
## $ status               <chr> "Ended", "Ended", "Returning Series", "Ended", "E…
## $ tagline              <chr> "Winter Is Coming", "The perfect robbery.", "Ever…
## $ genres               <chr> "Sci-Fi & Fantasy, Drama, Action & Adventure", "C…
## $ created_by           <chr> "David Benioff, D.B. Weiss", "Álex Pina", "Matt D…
## $ languages            <chr> "en", "es", "en", "en", "en", "en", "en, ko, ur",…
## $ networks             <chr> "HBO", "Netflix, Antena 3", "Netflix", "AMC", "FO…
## $ origin_country       <chr> "US", "ES", "US", "US", "US", "US", "KR", "US", "…
## $ spoken_languages     <chr> "English", "Español", "English", "English", "Engl…
## $ production_companies <chr> "Revolution Sun Studios, Television 360, Generato…
## $ production_countries <chr> "United Kingdom, United States of America", "Spai…
## $ episode_run_time     <int> 0, 70, 0, 42, 45, 45, 0, 0, 43, 0, 44, 22, 52, 43…

summary(tv_data)

##        id             name           number_of_seasons number_of_episodes
##  Min.   :     1   Length:168639      Min.   :  0.000   Min.   :    0.00  
##  1st Qu.: 45936   Class :character   1st Qu.:  1.000   1st Qu.:    1.00  
##  Median : 97734   Mode  :character   Median :  1.000   Median :    6.00  
##  Mean   :111307                      Mean   :  1.548   Mean   :   24.46  
##  3rd Qu.:196924                      3rd Qu.:  1.000   3rd Qu.:   20.00  
##  Max.   :251213                      Max.   :240.000   Max.   :20839.00  
##  original_language    vote_count       vote_average      overview        
##  Length:168639      Min.   :    0.0   Min.   : 0.000   Length:168639     
##  Class :character   1st Qu.:    0.0   1st Qu.: 0.000   Class :character  
##  Mode  :character   Median :    0.0   Median : 0.000   Mode  :character  
##                     Mean   :   13.3   Mean   : 2.334                     
##                     3rd Qu.:    1.0   3rd Qu.: 6.000                     
##                     Max.   :21857.0   Max.   :10.000                     
##     adult           backdrop_path      first_air_date     last_air_date     
##  Length:168639      Length:168639      Length:168639      Length:168639     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    homepage         in_production      original_name        popularity      
##  Length:168639      Length:168639      Length:168639      Min.   :   0.000  
##  Class :character   Class :character   Class :character   1st Qu.:   0.600  
##  Mode  :character   Mode  :character   Mode  :character   Median :   0.857  
##                                                           Mean   :   5.883  
##                                                           3rd Qu.:   2.432  
##                                                           Max.   :3707.008  
##  poster_path            type              status            tagline         
##  Length:168639      Length:168639      Length:168639      Length:168639     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     genres           created_by         languages           networks        
##  Length:168639      Length:168639      Length:168639      Length:168639     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  origin_country     spoken_languages   production_companies
##  Length:168639      Length:168639      Length:168639       
##  Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character    
##                                                            
##                                                            
##                                                            
##  production_countries episode_run_time
##  Length:168639        Min.   :   0.0  
##  Class :character     1st Qu.:   0.0  
##  Mode  :character     Median :   0.0  
##                       Mean   :  22.6  
##                       3rd Qu.:  42.0  
##                       Max.   :6032.0

Assumptions Made

The following assumptions were made during the analysis:

Missing language data is considered non-critical and does not affect the analysis.
Data between 1950 and 2024 is used to ensure relevance.
Viewer engagement (votes) correlates positively with show quality.

Exploratory Data Analysis (EDA)

Average Ratings Over Time

# Filter data between 1950 and 2024
tv_data <- tv_data |> 
  mutate(year = as.numeric(substr(first_air_date, 1, 4))) |> 
  filter(year >= 1950 & year <= 2024)

# Plot average ratings over time
ggplot(tv_data, aes(x = year, y = vote_average)) +
  geom_line(color = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Trend in Average Ratings Over Time",
    x = "Year",
    y = "Average Rating"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

The trend line illustrates a gradual decline in average TV show ratings over the analyzed period, from 1950 to 2024. This decline, while not steep, indicates a consistent downward movement in viewer ratings over time.
With changing cultural norms, technological advancements, and varying storytelling styles, audiences may have become more selective or critical of content.
The proliferation of alternative media platforms like streaming services, social media, and gaming has diversified entertainment choices, making it harder for traditional TV shows to maintain consistently high ratings.
Changes in production quality, story complexity, or innovation in TV content might not align with audience expectations, contributing to lower average ratings.
Studios can use this trend to investigate specific factors contributing to declining ratings.

Smoothed Ratings Trend

tv_data <- tv_data |>
  mutate(first_air_date = as.Date(first_air_date, format = "%Y-%m-%d"))

# Extract the year and aggregate data
yearly_data <- tv_data |>
  filter(!is.na(first_air_date)) |>
  mutate(year = year(first_air_date)) |>
  group_by(year) |>
  summarise(
    avg_vote = mean(vote_average, na.rm = TRUE),
    total_votes = sum(vote_count, na.rm = TRUE),
    show_count = n()
  )

# Create a tsibble object
yearly_tsibble <- yearly_data |> as_tsibble(index = year)

# Filter data for the desired range
filtered_tsibble <- yearly_tsibble |> 
  filter(year >= 1950 & year <= 2024)

# Apply smoothing
filtered_tsibble |>
  ggplot(aes(x = year, y = avg_vote)) +
  geom_line(color = "blue", alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE, color = "green") +
  labs(
    title = "Smoothed Average Ratings Over Time",
    x = "Year",
    y = "Average Rating"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Observations

The smoothed line (green) indicates an overall decline in average ratings over time. After a period of stability during the mid-20th century, ratings appear to decrease significantly in more recent years.
Early years (pre-1950) show greater fluctuations in average ratings, likely due to limited data points or variability in content quality during the formative years of television production.

Significance

The declining trend in ratings in recent years may reflect evolving audience preferences, changing production standards, or increased competition from alternative media platforms.
Identifying when the decline began can help producers and studios adapt strategies to maintain viewer satisfaction, such as experimenting with new genres or improving production quality.

Distribution of Ratings by Genre

# Group by genres and summarize show_count and avg_rating
genre_group <- tv_data |>
  group_by(genres) |>
  summarize(show_count = n(), avg_rating = mean(vote_average, na.rm = TRUE)) |>
  arrange(desc(show_count))

# Limit visualization to top 40 genres
top_40_genres <- genre_group |> slice_max(order_by = show_count, n = 40)

# Create bar plot for top 40 genres
ggplot(top_40_genres, aes(x = reorder(genres, -avg_rating), y = avg_rating)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Average Vote Ratings by Top 40 Genres",
    x = "Genres",
    y = "Average Rating"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))

The analysis indicates that genres like Drama and Documentary consistently achieve higher average ratings compared to other genres. This suggests that audiences find content in these categories particularly compelling and engaging.
These genres often feature deep storytelling, emotional resonance, and thought-provoking narratives, which likely contribute to their sustained appeal. Documentaries, in particular, cater to a growing audience seeking informative, real-world content.
Leverage popular sub-genres, such as crime dramas or nature documentaries, to tap into established audience bases.
Explore cross-genre opportunities, such as blending Drama with Sci-Fi or merging Documentary formats with Reality TV elements, to introduce novelty while maintaining audience interest.

Hypothesis Testing

To determine if there is a significant relationship between ratings and the number of episodes:

# Group the number_of_episodes into two categories: "low" and "high"
tv_data <- tv_data |> 
  mutate(episode_group = ifelse(number_of_episodes <= median(number_of_episodes, na.rm = TRUE), 
                                "low", 
                                "high"))

# Perform t-test
t_test_result <- t.test(vote_average ~ episode_group, data = tv_data)
t_test_result

## 
##  Welch Two Sample t-test
## 
## data:  vote_average by episode_group
## t = 80.214, df = 126590, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group high and group low is not equal to 0
## 95 percent confidence interval:
##  1.507149 1.582646
## sample estimates:
## mean in group high  mean in group low 
##           3.609808           2.064910

The t-test was conducted to determine whether there is a statistically significant difference in the mean ratings between shows with fewer episodes and those with more episodes. By dividing the dataset into two groups, shows with a low episode count and shows with a high episode count (based on the median number of episodes), we can analyze whether the difference in ratings between these groups is meaningful or merely due to random variation.
A significant result (p-value < 0.05) would suggest that the mean ratings between the two groups are indeed different, indicating that episode count could play a role in influencing viewer satisfaction. The t-test indicates a statistically significant relationship between ratings and episode count.

Modeling

Linear Regression: Number of Episodes vs. Vote Average

# Create new column for average episodes per season
tv_data <- tv_data |> 
  mutate(avg_episodes_per_season = number_of_episodes / number_of_seasons)

# Visualization: Number of Episodes vs Vote Average
ggplot(tv_data, aes(x = avg_episodes_per_season, y = vote_average)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Number of Episodes per Season vs. Vote Average", 
       x = "Average Episodes per Season", y = "Vote Average") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 48 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 45 rows containing missing values or values outside the scale range
## (`geom_point()`).

This scatter plot shows the relationship between the average number of episodes per season and the vote average. The red regression line suggests a very weak positive correlation, meaning that shows with more episodes per season might have slightly higher ratings. However, the relationship is not strong, implying that the number of episodes per season is not a major factor in determining the audience ratings.

The presence of potential outliers, such as shows with an unusually high number of episodes but low ratings, warrants further exploration to understand if these deviations are due to specific genres or networks.

# Fit a linear regression model
linear_model <- lm(vote_average ~ number_of_episodes, data = tv_data)
summary(linear_model)

## 
## Call:
## lm(formula = vote_average ~ number_of_episodes, data = tv_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.119  -2.728  -2.713   3.787   7.289 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.711e+00  9.886e-03  274.22   <2e-16 ***
## number_of_episodes 1.867e-03  6.538e-05   28.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.586 on 136751 degrees of freedom
## Multiple R-squared:  0.005928,   Adjusted R-squared:  0.00592 
## F-statistic: 815.5 on 1 and 136751 DF,  p-value: < 2.2e-16

The linear regression model highlights a weak positive correlation between the number of episodes and average ratings. This means that as the number of episodes in a show increases, there is a slight tendency for the average ratings to improve. However, the effect is not substantial or dominant, indicating that the number of episodes alone is not a strong predictor of a show’s success.
The model’s statistical significance suggests that the relationship between the two variables is unlikely to have occurred by chance. However, the low adjusted R-squared value indicates that the number of episodes accounts for only a small fraction of the variability in average ratings.

Residual Diagnostics

# Diagnostic plots for the linear model
par(mfrow = c(2, 2))
plot(linear_model)

The residual diagnostics indicate deviations from linearity and normality in the model, as seen in the residual plots and Q-Q plot.
Residuals vs. Fitted Plot: This plot shows how the residuals (differences between observed and predicted values) are distributed relative to the predicted values. Deviations from a random scatter or patterns (e.g., curvature or clustering) suggest that a linear model may not fully capture the underlying relationship between the variables.
Q-Q Plot: This plot compares the residuals’ distribution to a normal distribution. Deviations from the straight diagonal line indicate that the residuals may not follow a normal distribution, violating an assumption of linear regression

Model 2: Ratings Per Season vs. Vote Count

Ratings Per Season vs. Vote Count

# Create a new column for the total number of ratings per season
tv_data <- tv_data |> 
  mutate(ratings_per_season = vote_count / number_of_seasons)

# Scatter plot for ratings_per_season vs vote_count
ggplot(tv_data, aes(x = ratings_per_season, y = vote_count)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Ratings per Season vs. Vote Count", 
       x = "Ratings per Season", 
       y = "Vote Count") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 48 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

This plot highlights a positive correlation between ratings per season and vote count, emphasizing the impact of high-quality content on audience engagement.

# Fit a linear regression model for ratings per season vs. vote count
model_2 <- lm(vote_count ~ vote_average, data = tv_data)
summary(model_2)

## 
## Call:
## lm(formula = vote_count ~ vote_average, data = tv_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##   -62.4   -30.6     1.2     1.2 21804.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.2445     0.7182  -1.733   0.0831 .  
## vote_average   6.3684     0.1583  40.235   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 210.5 on 136751 degrees of freedom
## Multiple R-squared:  0.0117, Adjusted R-squared:  0.01169 
## F-statistic:  1619 on 1 and 136751 DF,  p-value: < 2.2e-16

The second model reveals a strong positive relationship between ratings per season and vote count, indicating that higher ratings correspond to higher audience engagement.

Residual Diagnostics for Model 2

# Diagnostic plots for the second linear model
par(mfrow = c(2, 2))
plot(model_2)

These diagnostic plots confirm the linearity and highlight potential areas for improvement in model fit.

Key Findings

Average ratings have declined over time, reflecting shifts in audience preferences or media landscape.
Genres like Drama and Documentary consistently perform better in ratings.
A marginal positive correlation exists between the number of episodes and average ratings, but this is not a major factor.

Conclusion

This analysis highlights the importance of delivering high-quality content tailored to audience preferences. Recommendations include:

Focusing on high-performing genres to maximize engagement.
Investigating periods of decline to understand shifts in viewer behavior.
Exploring additional variables like production budgets and demographic data to enhance insights.

Final Project Analysis