library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
This project analyzes the trends, patterns, and relationships in TV show ratings over time. The primary objective is to identify the factors influencing viewer ratings and assess how these insights can improve content quality and audience engagement.
The dataset includes information on TV shows, such as ratings, number of episodes, vote counts, and release years. Below is a brief summary of the dataset:
# Read dataset
file_path <- "/Users/saransh/Downloads/TMDB_tv_dataset_v3.csv"
tv_data <- read.csv(file_path, stringsAsFactors = FALSE)
# Summary statistics of the dataset
glimpse(tv_data)
## Rows: 168,639
## Columns: 29
## $ id <int> 1399, 71446, 66732, 1402, 63174, 69050, 93405, 13…
## $ name <chr> "Game of Thrones", "Money Heist", "Stranger Thing…
## $ number_of_seasons <int> 8, 3, 4, 11, 6, 7, 2, 5, 6, 1, 9, 12, 2, 19, 3, 3…
## $ number_of_episodes <int> 73, 41, 34, 177, 93, 137, 9, 62, 116, 9, 184, 279…
## $ original_language <chr> "en", "es", "en", "en", "en", "en", "ko", "en", "…
## $ vote_count <int> 21857, 17836, 16161, 15432, 13870, 13180, 13053, …
## $ vote_average <dbl> 8.442, 8.257, 8.624, 8.121, 8.486, 8.479, 7.831, …
## $ overview <chr> "Seven noble families fight for control of the my…
## $ adult <chr> "False", "False", "False", "False", "False", "Fal…
## $ backdrop_path <chr> "/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg", "/gFZriCkpJYsA…
## $ first_air_date <chr> "2011-04-17", "2017-05-02", "2016-07-15", "2010-1…
## $ last_air_date <chr> "2019-05-19", "2021-12-03", "2022-07-01", "2022-1…
## $ homepage <chr> "http://www.hbo.com/game-of-thrones", "https://ww…
## $ in_production <chr> "False", "False", "True", "False", "False", "Fals…
## $ original_name <chr> "Game of Thrones", "La Casa de Papel", "Stranger …
## $ popularity <dbl> 1083.917, 96.354, 185.711, 489.746, 416.668, 143.…
## $ poster_path <chr> "/1XS1oqL89opfnbLl8WnZY1O1uJx.jpg", "/reEMJA1uzsc…
## $ type <chr> "Scripted", "Scripted", "Scripted", "Scripted", "…
## $ status <chr> "Ended", "Ended", "Returning Series", "Ended", "E…
## $ tagline <chr> "Winter Is Coming", "The perfect robbery.", "Ever…
## $ genres <chr> "Sci-Fi & Fantasy, Drama, Action & Adventure", "C…
## $ created_by <chr> "David Benioff, D.B. Weiss", "Álex Pina", "Matt D…
## $ languages <chr> "en", "es", "en", "en", "en", "en", "en, ko, ur",…
## $ networks <chr> "HBO", "Netflix, Antena 3", "Netflix", "AMC", "FO…
## $ origin_country <chr> "US", "ES", "US", "US", "US", "US", "KR", "US", "…
## $ spoken_languages <chr> "English", "Español", "English", "English", "Engl…
## $ production_companies <chr> "Revolution Sun Studios, Television 360, Generato…
## $ production_countries <chr> "United Kingdom, United States of America", "Spai…
## $ episode_run_time <int> 0, 70, 0, 42, 45, 45, 0, 0, 43, 0, 44, 22, 52, 43…
summary(tv_data)
## id name number_of_seasons number_of_episodes
## Min. : 1 Length:168639 Min. : 0.000 Min. : 0.00
## 1st Qu.: 45936 Class :character 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 97734 Mode :character Median : 1.000 Median : 6.00
## Mean :111307 Mean : 1.548 Mean : 24.46
## 3rd Qu.:196924 3rd Qu.: 1.000 3rd Qu.: 20.00
## Max. :251213 Max. :240.000 Max. :20839.00
## original_language vote_count vote_average overview
## Length:168639 Min. : 0.0 Min. : 0.000 Length:168639
## Class :character 1st Qu.: 0.0 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.0 Median : 0.000 Mode :character
## Mean : 13.3 Mean : 2.334
## 3rd Qu.: 1.0 3rd Qu.: 6.000
## Max. :21857.0 Max. :10.000
## adult backdrop_path first_air_date last_air_date
## Length:168639 Length:168639 Length:168639 Length:168639
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## homepage in_production original_name popularity
## Length:168639 Length:168639 Length:168639 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 0.600
## Mode :character Mode :character Mode :character Median : 0.857
## Mean : 5.883
## 3rd Qu.: 2.432
## Max. :3707.008
## poster_path type status tagline
## Length:168639 Length:168639 Length:168639 Length:168639
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## genres created_by languages networks
## Length:168639 Length:168639 Length:168639 Length:168639
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## origin_country spoken_languages production_companies
## Length:168639 Length:168639 Length:168639
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## production_countries episode_run_time
## Length:168639 Min. : 0.0
## Class :character 1st Qu.: 0.0
## Mode :character Median : 0.0
## Mean : 22.6
## 3rd Qu.: 42.0
## Max. :6032.0
The following assumptions were made during the analysis:
# Filter data between 1950 and 2024
tv_data <- tv_data |>
mutate(year = as.numeric(substr(first_air_date, 1, 4))) |>
filter(year >= 1950 & year <= 2024)
# Plot average ratings over time
ggplot(tv_data, aes(x = year, y = vote_average)) +
geom_line(color = "blue", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Trend in Average Ratings Over Time",
x = "Year",
y = "Average Rating"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
tv_data <- tv_data |>
mutate(first_air_date = as.Date(first_air_date, format = "%Y-%m-%d"))
# Extract the year and aggregate data
yearly_data <- tv_data |>
filter(!is.na(first_air_date)) |>
mutate(year = year(first_air_date)) |>
group_by(year) |>
summarise(
avg_vote = mean(vote_average, na.rm = TRUE),
total_votes = sum(vote_count, na.rm = TRUE),
show_count = n()
)
# Create a tsibble object
yearly_tsibble <- yearly_data |> as_tsibble(index = year)
# Filter data for the desired range
filtered_tsibble <- yearly_tsibble |>
filter(year >= 1950 & year <= 2024)
# Apply smoothing
filtered_tsibble |>
ggplot(aes(x = year, y = avg_vote)) +
geom_line(color = "blue", alpha = 0.5) +
geom_smooth(method = "loess", se = FALSE, color = "green") +
labs(
title = "Smoothed Average Ratings Over Time",
x = "Year",
y = "Average Rating"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Group by genres and summarize show_count and avg_rating
genre_group <- tv_data |>
group_by(genres) |>
summarize(show_count = n(), avg_rating = mean(vote_average, na.rm = TRUE)) |>
arrange(desc(show_count))
# Limit visualization to top 40 genres
top_40_genres <- genre_group |> slice_max(order_by = show_count, n = 40)
# Create bar plot for top 40 genres
ggplot(top_40_genres, aes(x = reorder(genres, -avg_rating), y = avg_rating)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Average Vote Ratings by Top 40 Genres",
x = "Genres",
y = "Average Rating"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
To determine if there is a significant relationship between ratings and the number of episodes:
# Group the number_of_episodes into two categories: "low" and "high"
tv_data <- tv_data |>
mutate(episode_group = ifelse(number_of_episodes <= median(number_of_episodes, na.rm = TRUE),
"low",
"high"))
# Perform t-test
t_test_result <- t.test(vote_average ~ episode_group, data = tv_data)
t_test_result
##
## Welch Two Sample t-test
##
## data: vote_average by episode_group
## t = 80.214, df = 126590, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group high and group low is not equal to 0
## 95 percent confidence interval:
## 1.507149 1.582646
## sample estimates:
## mean in group high mean in group low
## 3.609808 2.064910
# Create new column for average episodes per season
tv_data <- tv_data |>
mutate(avg_episodes_per_season = number_of_episodes / number_of_seasons)
# Visualization: Number of Episodes vs Vote Average
ggplot(tv_data, aes(x = avg_episodes_per_season, y = vote_average)) +
geom_point(color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Number of Episodes per Season vs. Vote Average",
x = "Average Episodes per Season", y = "Vote Average") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 48 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 45 rows containing missing values or values outside the scale range
## (`geom_point()`).
This scatter plot shows the relationship between the average number of episodes per season and the vote average. The red regression line suggests a very weak positive correlation, meaning that shows with more episodes per season might have slightly higher ratings. However, the relationship is not strong, implying that the number of episodes per season is not a major factor in determining the audience ratings.
The presence of potential outliers, such as shows with an unusually high number of episodes but low ratings, warrants further exploration to understand if these deviations are due to specific genres or networks.
# Fit a linear regression model
linear_model <- lm(vote_average ~ number_of_episodes, data = tv_data)
summary(linear_model)
##
## Call:
## lm(formula = vote_average ~ number_of_episodes, data = tv_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.119 -2.728 -2.713 3.787 7.289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.711e+00 9.886e-03 274.22 <2e-16 ***
## number_of_episodes 1.867e-03 6.538e-05 28.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.586 on 136751 degrees of freedom
## Multiple R-squared: 0.005928, Adjusted R-squared: 0.00592
## F-statistic: 815.5 on 1 and 136751 DF, p-value: < 2.2e-16
# Diagnostic plots for the linear model
par(mfrow = c(2, 2))
plot(linear_model)
# Create a new column for the total number of ratings per season
tv_data <- tv_data |>
mutate(ratings_per_season = vote_count / number_of_seasons)
# Scatter plot for ratings_per_season vs vote_count
ggplot(tv_data, aes(x = ratings_per_season, y = vote_count)) +
geom_point(color = "green") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Ratings per Season vs. Vote Count",
x = "Ratings per Season",
y = "Vote Count") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 48 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).
This plot highlights a positive correlation between ratings per season and vote count, emphasizing the impact of high-quality content on audience engagement.
# Fit a linear regression model for ratings per season vs. vote count
model_2 <- lm(vote_count ~ vote_average, data = tv_data)
summary(model_2)
##
## Call:
## lm(formula = vote_count ~ vote_average, data = tv_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.4 -30.6 1.2 1.2 21804.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.2445 0.7182 -1.733 0.0831 .
## vote_average 6.3684 0.1583 40.235 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 210.5 on 136751 degrees of freedom
## Multiple R-squared: 0.0117, Adjusted R-squared: 0.01169
## F-statistic: 1619 on 1 and 136751 DF, p-value: < 2.2e-16
The second model reveals a strong positive relationship between ratings per season and vote count, indicating that higher ratings correspond to higher audience engagement.
# Diagnostic plots for the second linear model
par(mfrow = c(2, 2))
plot(model_2)
These diagnostic plots confirm the linearity and highlight potential areas for improvement in model fit.
This analysis highlights the importance of delivering high-quality content tailored to audience preferences. Recommendations include: