Init
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
lme4,
ggeffects,
DBI, duckdb
)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
library(future)
plan(multisession(workers = 2))
#duckdb connection (in-memory, loaded from parquet)
con = dbConnect(duckdb())
dbExecute(con, "CREATE TABLE ratings AS SELECT * FROM read_parquet('data/ratings.parquet')")
## [1] 1.34e+08
dbExecute(con, "CREATE TABLE titles AS SELECT * FROM read_parquet('data/titles.parquet')")
## [1] 12340957
dbExecute(con, "CREATE TABLE movie_home_best AS SELECT * FROM read_parquet('data/movie_home_best.parquet')")
## [1] 3501896
#claude
if (F) {
library(btw)
btw::btw_mcp_session()
}
Functions
as_decimal_year = function(x) {
tibble(
date = x
) %>%
mutate(
year = year(date),
day_of_year = yday(date),
year_length = if_else(leap_year(date), 366, 365),
decimal_year = year + (day_of_year - 1) / year_length
) %>% pull(decimal_year)
}
decimal_year_to_date <- function(decimal_year) {
tibble(decimal_year = decimal_year) %>%
mutate(
year = floor(decimal_year),
fractional_year = decimal_year - year,
year_length = if_else(leap_year(as.Date(paste0(year, "-01-01"))), 366, 365),
day_of_year = round(fractional_year * year_length) + 1,
date = as.Date(paste0(year, "-01-01")) + (day_of_year - 1)
) %>% pull(date)
}
#today test
today() %>%
as_decimal_year() %>%
decimal_year_to_date()
## [1] "2026-03-08"
#dates back and forth
tibble(
date_start = seq(from = as.Date("2020-01-01"), to = today(), by = "1 day"),
decimal_year = date_start %>% as_decimal_year(),
date_end = decimal_year %>% decimal_year_to_date(),
identical = date_start == date_end
) %>%
pull(identical) %>%
all()
## [1] TRUE
Data
if (F) {
#sync new rating files from server (2024-03 onwards)
system("rsync -av --ignore-existing root@164.92.95.101:/mnt/scraping/imdb/*.tsv.gz data/daily_ratings/")
#historical snapshots (2018-2024) downloaded from Wayback Machine
#see: https://web.archive.org/cdx/search/cdx?url=datasets.imdbws.com/title.ratings.tsv.gz&output=json&collapse=digest
#download latest title.basics
download.file(
"https://datasets.imdbws.com/title.basics.tsv.gz",
"data/title.basics.tsv.gz",
mode = "wb"
)
#create ratings table if not exists
dbExecute(con, "
CREATE TABLE IF NOT EXISTS ratings (
tconst TEXT,
averageRating DOUBLE,
numVotes INTEGER,
date DATE
)
")
#no indexes needed — DuckDB columnar scans are fast enough
#bulk ingest all new dates in one SQL statement
#uses DuckDB's glob reader + filename column to extract date from filename
dbExecute(con, "
INSERT INTO ratings
SELECT
tconst, averageRating, numVotes,
strptime(regexp_extract(filename, '(\\d{8})', 1), '%Y%m%d')::DATE AS date
FROM read_csv('data/daily_ratings/*-title.ratings.tsv.gz',
delim='\\t', header=true, nullstr='\\N', filename=true)
WHERE strptime(regexp_extract(filename, '(\\d{8})', 1), '%Y%m%d')::DATE
NOT IN (SELECT DISTINCT date FROM ratings)
")
#refresh titles table
dbExecute(con, "DROP TABLE IF EXISTS titles")
dbExecute(con, "
CREATE TABLE titles AS
SELECT * FROM read_csv('data/title.basics.tsv.gz',
delim='\\t', header=true, nullstr='\\N',
types={'tconst': 'TEXT', 'titleType': 'TEXT', 'primaryTitle': 'TEXT',
'originalTitle': 'TEXT', 'isAdult': 'INTEGER', 'startYear': 'INTEGER',
'endYear': 'INTEGER', 'runtimeMinutes': 'INTEGER', 'genres': 'TEXT'})
")
message(sprintf("Done. ratings: %s rows, titles: %s rows",
format(dbGetQuery(con, "SELECT count(*) FROM ratings")[[1]], big.mark = ","),
format(dbGetQuery(con, "SELECT count(*) FROM titles")[[1]], big.mark = ",")))
}
titles = tbl(con, "titles") %>% collect()
Cross-sectional analysis
#latest snapshot joined with titles, filtered to feature films
cross = dbGetQuery(con, "
SELECT r.tconst, r.averageRating, r.numVotes, r.date,
t.primaryTitle, t.startYear, t.genres, t.runtimeMinutes
FROM ratings r
JOIN titles t ON r.tconst = t.tconst
WHERE r.date = (SELECT max(date) FROM ratings)
AND t.titleType = 'movie'
AND t.runtimeMinutes > 60
AND t.isAdult = 0
AND t.genres IS NOT NULL
AND t.startYear IS NOT NULL
AND t.startYear >= 1950
AND r.numVotes >= 1000
") %>% as_tibble()
latest_date = cross$date[1]
latest_year = year(latest_date)
#recency dummies: 0, 1, 2, 3+ years ago
cross %<>% mutate(
years_ago = latest_year - startYear,
recency = case_when(
years_ago <= 10 ~ as.character(years_ago),
TRUE ~ "10+"
) %>% fct_relevel("10+")
)
#nonlinear smooth to show the decline + recent uptick
cs_slope = coef(lm(averageRating ~ startYear, data = cross))[2]
cross %>%
ggplot(aes(startYear, averageRating)) +
geom_point(alpha = 0.02, size = 0.5) +
geom_smooth() +
geom_smooth(method = "lm", color = "red", linetype = "dashed") +
annotate("text", x = 1955, y = 9.5, hjust = 0, vjust = 1, size = 3.5,
label = sprintf("Linear slope: %.4f\nn = %s movies", cs_slope, format(nrow(cross), big.mark = ","))) +
labs(
title = "Cross-sectional: IMDB ratings by release year",
subtitle = sprintf("Snapshot from %s, feature films with ≥1000 votes", latest_date),
x = "Release year",
y = "Average rating"
)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/cross_sectional_nonlinear.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
#OLS with per-year recency dummies (0-10, reference = 10+)
fit_recency = lm(averageRating ~ startYear + recency, data = cross)
summary(fit_recency)
##
## Call:
## lm(formula = averageRating ~ startYear + recency, data = cross)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.278 -0.648 0.156 0.821 3.568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.072092 0.766683 34.01 < 2e-16 ***
## startYear -0.009927 0.000384 -25.84 < 2e-16 ***
## recency0 0.372604 0.116033 3.21 0.0013 **
## recency1 0.308891 0.038029 8.12 4.7e-16 ***
## recency10 -0.000956 0.031495 -0.03 0.9758
## recency2 0.254883 0.032745 7.78 7.2e-15 ***
## recency3 0.139599 0.031515 4.43 9.5e-06 ***
## recency4 0.058173 0.030732 1.89 0.0584 .
## recency5 0.008115 0.033020 0.25 0.8059
## recency6 -0.140333 0.034586 -4.06 5.0e-05 ***
## recency7 -0.005600 0.030521 -0.18 0.8544
## recency8 0.016702 0.030558 0.55 0.5847
## recency9 -0.006341 0.030799 -0.21 0.8369
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.17 on 45267 degrees of freedom
## Multiple R-squared: 0.0225, Adjusted R-squared: 0.0223
## F-statistic: 87 on 12 and 45267 DF, p-value: <2e-16
#genre indicators
genre_cols = cross %>%
select(tconst, genres) %>%
separate_longer_delim(genres, ",") %>%
mutate(val = 1L) %>%
pivot_wider(names_from = genres, values_from = val, values_fill = 0L)
cross_genres = inner_join(cross, genre_cols, by = "tconst")
genre_names = setdiff(names(genre_cols), "tconst")
#OLS with recency dummies + genre controls
formula_full = as.formula(paste(
"averageRating ~ startYear + recency +",
paste(paste0("`", genre_names, "`"), collapse = " + ")
))
fit_full = lm(formula_full, data = cross_genres)
summary(fit_full)
##
## Call:
## lm(formula = formula_full, data = cross_genres)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.000 -0.538 0.098 0.654 3.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.629684 0.709422 43.18 < 2e-16 ***
## startYear -0.012285 0.000356 -34.55 < 2e-16 ***
## recency0 0.532089 0.101431 5.25 1.6e-07 ***
## recency1 0.399068 0.033379 11.96 < 2e-16 ***
## recency10 0.014344 0.027554 0.52 0.60267
## recency2 0.330094 0.028747 11.48 < 2e-16 ***
## recency3 0.186271 0.027642 6.74 1.6e-11 ***
## recency4 0.108064 0.026931 4.01 6.0e-05 ***
## recency5 0.026081 0.028935 0.90 0.36740
## recency6 -0.113810 0.030297 -3.76 0.00017 ***
## recency7 0.002624 0.026738 0.10 0.92181
## recency8 0.029337 0.026768 1.10 0.27310
## recency9 0.004089 0.026969 0.15 0.87950
## Action -0.232987 0.014104 -16.52 < 2e-16 ***
## Crime 0.046398 0.014262 3.25 0.00114 **
## Thriller -0.098977 0.015130 -6.54 6.1e-11 ***
## Horror -0.855755 0.016708 -51.22 < 2e-16 ***
## Mystery 0.050591 0.018421 2.75 0.00603 **
## Comedy -0.065190 0.012833 -5.08 3.8e-07 ***
## Drama 0.503427 0.012553 40.10 < 2e-16 ***
## Romance -0.013715 0.014459 -0.95 0.34284
## Musical 0.160688 0.040102 4.01 6.2e-05 ***
## Biography 0.256906 0.022748 11.29 < 2e-16 ***
## Documentary 1.128755 0.026049 43.33 < 2e-16 ***
## Fantasy -0.087169 0.021861 -3.99 6.7e-05 ***
## Western -0.298268 0.045251 -6.59 4.4e-11 ***
## Adventure -0.108979 0.017906 -6.09 1.2e-09 ***
## Animation 0.747592 0.029047 25.74 < 2e-16 ***
## Family -0.116363 0.025298 -4.60 4.2e-06 ***
## History 0.188281 0.026028 7.23 4.8e-13 ***
## Sport 0.008059 0.035636 0.23 0.82108
## `Sci-Fi` -0.382174 0.022976 -16.63 < 2e-16 ***
## War 0.128844 0.032644 3.95 7.9e-05 ***
## News 0.392503 0.201650 1.95 0.05161 .
## Music 0.067581 0.028816 2.35 0.01902 *
## `Talk-Show` -0.414209 1.022305 -0.41 0.68535
## `Film-Noir` -0.331414 0.075198 -4.41 1.0e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.02 on 45243 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.254
## F-statistic: 429 on 36 and 45243 DF, p-value: <2e-16
#compare recency bias estimates across models
bind_rows(
broom::tidy(fit_recency) %>% mutate(model = "Year + recency"),
broom::tidy(fit_full) %>% mutate(model = "Year + recency + genres")
) %>%
filter(str_detect(term, "recency")) %>%
select(model, term, estimate, std.error) %>%
arrange(term, model)
Within-cohort IQ scores
#within-cohort standardized ratings (IQ scale: mean=100, SD=15 per release year)
cross_iq = cross %>%
group_by(startYear) %>%
mutate(
cohort_mean = mean(averageRating),
cohort_sd = sd(averageRating),
rating_iq = 100 + 15 * (averageRating - cohort_mean) / cohort_sd
) %>%
ungroup()
cross_iq %>%
ggplot(aes(startYear, rating_iq)) +
geom_point(alpha = 0.02, size = 0.5) +
geom_smooth() +
geom_smooth(method = "lm", color = "red", linetype = "dashed") +
geom_hline(yintercept = 100, linetype = "dotted", alpha = 0.5) +
labs(
title = "Within-cohort standardized IMDB ratings (IQ scale: mean=100, SD=15)",
subtitle = sprintf("Feature films with >= 1000 votes, n = %s", format(nrow(cross_iq), big.mark = ",")),
x = "Release year",
y = "Rating IQ"
)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/rating_iq_by_year.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
#example well-known movies
cross_iq %>%
filter(numVotes >= 50000) %>%
arrange(desc(rating_iq)) %>%
select(primaryTitle, startYear, averageRating, numVotes, rating_iq) %>%
head(20)
#conversion chart: what raw rating = which IQ level, by release year
norms = cross_iq %>%
group_by(startYear) %>%
summarise(cohort_mean = mean(averageRating), cohort_sd = sd(averageRating), .groups = "drop")
conversion = norms %>%
crossing(iq = c(85, 100, 115, 130)) %>%
mutate(raw_rating = cohort_mean + (iq - 100) / 15 * cohort_sd)
conversion %>%
ggplot(aes(startYear, raw_rating, color = factor(iq))) +
geom_line(linewidth = 1) +
scale_color_manual(values = c("85" = "#E69F00", "100" = "black", "115" = "#0072B2", "130" = "#D55E00")) +
labs(
title = "IMDB rating to IQ conversion by release year",
subtitle = "Within-cohort norms: what raw rating corresponds to each IQ level?",
x = "Release year", y = "Raw IMDB rating", color = NULL
) +
scale_y_continuous(breaks = seq(4, 9, 0.5))

GG_save("figs/rating_iq_conversion.png")
#heatmap conversion table
rating_grid = seq(4, 9, by = 0.5)
year_grid = seq(1950, 2025, by = 5)
heatmap_data = norms %>%
filter(startYear %in% year_grid) %>%
crossing(raw_rating = rating_grid) %>%
mutate(iq = round(100 + 15 * (raw_rating - cohort_mean) / cohort_sd))
heatmap_data %>%
ggplot(aes(factor(startYear), factor(raw_rating), fill = iq)) +
geom_tile() +
geom_text(aes(label = iq), size = 3) +
scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#0072B2", midpoint = 100) +
labs(
title = "IMDB rating to IQ conversion table",
subtitle = "Within-cohort standardization (mean=100, SD=15 per release year)",
x = "Release year", y = "Raw IMDB rating", fill = "IQ"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

GG_save("figs/rating_iq_heatmap.png")
Region analysis
#movie_home_best loaded from parquet in init
#join with cross-sectional data and assign origin groups
cross_region = dbGetQuery(con, "
SELECT r.tconst, r.averageRating, r.numVotes, t.startYear,
CASE
WHEN h.home_region = 'US' THEN 'US'
WHEN h.home_region = 'GB' THEN 'UK'
WHEN h.home_region IN ('CA', 'AU', 'NZ', 'ZA', 'IE') THEN 'Other Anglo'
WHEN h.home_region IN ('DE', 'AT', 'CH') THEN 'Germany'
WHEN h.home_region = 'FR' THEN 'France'
WHEN h.home_region = 'IT' THEN 'Italy'
WHEN h.home_region IN ('SE', 'NO', 'DK', 'FI', 'IS') THEN 'Nordic'
WHEN h.home_region = 'IN' THEN 'India'
WHEN h.home_region IN ('CN', 'HK', 'TW') THEN 'China'
WHEN h.home_region = 'JP' THEN 'Japan'
WHEN h.home_region = 'KR' THEN 'South Korea'
ELSE 'Other'
END AS origin
FROM ratings r
JOIN titles t ON r.tconst = t.tconst
LEFT JOIN movie_home_best h ON r.tconst = h.tconst
WHERE r.date = (SELECT max(date) FROM ratings)
AND t.titleType = 'movie'
AND t.runtimeMinutes > 60
AND t.isAdult = 0
AND t.genres IS NOT NULL
AND t.startYear IS NOT NULL
AND t.startYear >= 1950
AND r.numVotes >= 1000
") %>% as_tibble()
#year slopes by region
region_slopes = cross_region %>%
group_by(origin) %>%
summarise(n = n(), year_coef = coef(lm(averageRating ~ startYear))[2], .groups = "drop")
region_labels = region_slopes %>%
mutate(
label = sprintf("b = %.4f\nn = %s", year_coef, format(n, big.mark = ",")),
startYear = 1955,
averageRating = 9.5
)
cross_region %>%
ggplot(aes(startYear, averageRating)) +
geom_point(alpha = 0.03, size = 0.5) +
geom_smooth(method = "lm") +
geom_smooth(color = "red", linetype = "dashed") +
geom_text(data = region_labels, aes(label = label), hjust = 0, vjust = 1, size = 3) +
facet_wrap(~origin, ncol = 4) +
labs(
title = "Rating decline by country of origin",
subtitle = "Blue = linear fit, red dashed = nonlinear smooth",
x = "Release year",
y = "Average rating"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/decline_by_region.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#compare regional slopes to overall
overall_coef = coef(lm(averageRating ~ startYear, data = cross_region))[2]
region_slopes_plot = region_slopes %>%
mutate(
origin = fct_reorder(origin, year_coef),
label = sprintf("%.4f (n=%s)", year_coef, format(n, big.mark = ","))
)
p = ggplot(region_slopes_plot, aes(year_coef, origin)) +
geom_point(size = 3) +
geom_vline(xintercept = overall_coef, linetype = "dashed", color = "red", linewidth = 0.8) +
geom_text(aes(label = label), hjust = -0.1, size = 3) +
annotate("text", x = overall_coef, y = 12.5, label = sprintf("Overall: %.4f", overall_coef),
color = "red", hjust = 1.1, size = 3.5) +
labs(
title = "Rating decline slope by country of origin vs. overall",
subtitle = "Red dashed = overall slope across all movies",
x = "Linear slope (rating per year)",
y = NULL
) +
xlim(-0.02, 0.012)
GG_save("figs/slope_by_region_vs_overall.png")
#composition effect: US-only vs overall
us_slope = region_slopes %>% filter(origin == "US") %>% pull(year_coef)
cat(sprintf("Overall slope: %.5f\nUS-only slope: %.5f\nComposition effect: %.5f (%.1f%% of US slope)\n",
overall_coef, us_slope, overall_coef - us_slope, (overall_coef - us_slope) / us_slope * 100))
## Overall slope: -0.00871
## US-only slope: -0.00708
## Composition effect: -0.00162 (22.9% of US slope)
Longitudinal analysis
#example with Return of the King
rotk = dbGetQuery(con, "
SELECT * FROM ratings WHERE tconst = 'tt0167260'
") %>% as_tibble()
rotk %>%
ggplot(aes(date, averageRating)) +
geom_line()

#longitudinal data: recent movies, full sample
titles_long = titles %>%
filter(
startYear %in% 2015:2025,
is.na(endYear),
titleType == "movie",
runtimeMinutes > 60,
isAdult == 0,
!is.na(genres),
)
#pull ratings via DuckDB
dbWriteTable(con, "titles_long_tmp", titles_long %>% select(tconst, startYear, genres), overwrite = TRUE)
ratings_long = dbGetQuery(con, "
SELECT r.tconst, r.averageRating, r.numVotes, r.date,
t.startYear, t.genres
FROM ratings r
JOIN titles_long_tmp t ON r.tconst = t.tconst
") %>% as_tibble()
dbExecute(con, "DROP TABLE IF EXISTS titles_long_tmp")
## [1] 0
#compute age with midpoint correction for release date
#for completed years, midpoint is +0.5 (uniform across year)
#for current year, midpoint is half the elapsed fraction
latest_decimal = as_decimal_year(max(ratings_long$date))
latest_year_int = floor(latest_decimal)
current_year_offset = (latest_decimal - latest_year_int) / 2
ratings_long %<>% mutate(
decimal_year = as_decimal_year(date),
release_midpoint = if_else(startYear < latest_year_int, startYear + 0.5, startYear + current_year_offset),
age = decimal_year - release_midpoint,
age_year = floor(age),
year_f = as.factor(startYear)
)
#for model fitting, take 10% sample of movies (keep all timepoints per movie)
set.seed(42)
tconst_sample = ratings_long %>% distinct(tconst) %>% sample_frac(0.10)
ratings_sub = semi_join(ratings_long, tconst_sample, by = "tconst")
ratings_sub %<>% mutate(age_year_f = factor(age_year))
#filter to valid age range
ratings_sub %<>% filter(age_year >= 0, age_year <= 10)
#model 1: age as factor only (no year interaction)
fit1 = lmer(averageRating ~ age_year_f + (1 | tconst), data = ratings_sub)
#model 2: age as factor × release year interaction
fit2 = lmer(averageRating ~ year_f * age_year_f + (1 | tconst), data = ratings_sub)
## fixed-effect model matrix is rank deficient so dropping 62 columns / coefficients
#model 3: continuous age (linear) × release year
fit3 = lmer(averageRating ~ year_f * age + (1 | tconst), data = ratings_sub)
#compare age effects across models
bind_rows(
broom.mixed::tidy(fit1, effects = "fixed") %>% mutate(model = "1: age factor only"),
broom.mixed::tidy(fit3, effects = "fixed") %>% mutate(model = "3: linear age × year")
) %>%
filter(str_detect(term, "age")) %>%
select(model, term, estimate, std.error) %>%
print(n = 30)
## # A tibble: 21 × 4
## model term estimate std.error
## <chr> <chr> <dbl> <dbl>
## 1 1: age factor only age_year_f1 -0.299 0.00196
## 2 1: age factor only age_year_f2 -0.412 0.00215
## 3 1: age factor only age_year_f3 -0.467 0.00235
## 4 1: age factor only age_year_f4 -0.497 0.00253
## 5 1: age factor only age_year_f5 -0.519 0.00265
## 6 1: age factor only age_year_f6 -0.535 0.00277
## 7 1: age factor only age_year_f7 -0.544 0.00292
## 8 1: age factor only age_year_f8 -0.563 0.00311
## 9 1: age factor only age_year_f9 -0.578 0.00345
## 10 1: age factor only age_year_f10 -0.603 0.00443
## 11 3: linear age × year age -0.0232 0.000857
## 12 3: linear age × year year_f2016:age 0.00948 0.00120
## 13 3: linear age × year year_f2017:age -0.00824 0.00119
## 14 3: linear age × year year_f2018:age 0.00102 0.00125
## 15 3: linear age × year year_f2019:age -0.00894 0.00124
## 16 3: linear age × year year_f2020:age -0.0311 0.00143
## 17 3: linear age × year year_f2021:age -0.0643 0.00145
## 18 3: linear age × year year_f2022:age -0.159 0.00156
## 19 3: linear age × year year_f2023:age -0.258 0.00233
## 20 3: linear age × year year_f2024:age -0.393 0.00487
## 21 3: linear age × year year_f2025:age -0.780 0.0217
#plot model 2 predictions (factor age × release year — no extrapolation)
pred2_tbl = ggpredict(fit2, terms = c("age_year_f", "year_f")) %>% as.data.frame() %>% as_tibble()
pred2_tbl %>%
mutate(age = as.numeric(as.character(x))) %>%
ggplot(aes(age, predicted, group = group)) +
geom_line(aes(color = group)) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
labs(
title = "IMDB rating trajectories by movie age",
subtitle = "Mixed model: age (factor) x release year + (1 | movie), 10% sample, 96 dates (2018-2026)",
x = "Age (years since release)",
y = "Predicted average rating",
color = "Release year",
fill = "Release year"
)

GG_save("figs/longitudinal_age.png")
#compare cross-sectional vs longitudinal recency bias
#longitudinal: predicted rating at age 0 minus stable rating (age 5+)
long_bias = pred2_tbl %>%
mutate(age = as.numeric(as.character(x))) %>%
group_by(group) %>%
summarise(
rating_age0 = predicted[age == 0],
rating_stable = mean(predicted[age >= 5]),
bias = rating_age0 - rating_stable,
.groups = "drop"
) %>%
rename(release_year = group)
#cross-sectional recency bias from OLS dummies
cross_bias = broom::tidy(fit_recency) %>%
filter(str_detect(term, "recency")) %>%
select(term, estimate, std.error)
long_bias
cross_bias
#recency bias by age: both models combined
#longitudinal: compute bias per release year, then average with CI across years
long_by_age = pred2_tbl %>%
mutate(age = as.numeric(as.character(x))) %>%
group_by(group) %>%
mutate(stable = mean(predicted[age >= 5])) %>%
ungroup() %>%
mutate(bias = predicted - stable) %>%
group_by(age) %>%
summarise(
mean_bias = mean(bias),
se_bias = sd(bias) / sqrt(n()),
ci_low = mean_bias - 1.96 * se_bias,
ci_high = mean_bias + 1.96 * se_bias,
.groups = "drop"
) %>%
transmute(age, bias = mean_bias, ci_low, ci_high, model = "Longitudinal")
cross_by_age = broom::tidy(fit_recency, conf.int = TRUE) %>%
filter(str_detect(term, "recency")) %>%
mutate(age = as.numeric(str_extract(term, "\\d+"))) %>%
select(age, bias = estimate, ci_low = conf.low, ci_high = conf.high) %>%
arrange(age) %>%
mutate(model = "Cross-sectional")
bind_rows(long_by_age, cross_by_age) %>%
ggplot(aes(age, bias, color = model, fill = model)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_ribbon(aes(ymin = ci_low, ymax = ci_high), alpha = 0.15, color = NA) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
scale_x_continuous(breaks = 0:10) +
labs(
title = "Expected recency bias in IMDB ratings by movie age",
subtitle = "Longitudinal: 96 snapshots (2018-2026), Cross-sectional: latest snapshot",
x = "Movie age (years since release)",
y = "Rating bias",
color = "Model",
fill = "Model"
)

GG_save("figs/recency_bias_by_age.png")
Votes and ratings
#votes vs rating by decade
vote_rating = dbGetQuery(con, "
SELECT r.tconst, r.averageRating, r.numVotes,
t.startYear, t.genres
FROM ratings r
JOIN titles t ON r.tconst = t.tconst
WHERE r.date = (SELECT max(date) FROM ratings)
AND t.titleType = 'movie'
AND t.runtimeMinutes > 60
AND t.isAdult = 0
AND t.genres IS NOT NULL
AND t.startYear IS NOT NULL
AND t.startYear >= 1950
AND r.numVotes >= 1000
") %>% as_tibble()
vote_rating %<>% mutate(
decade = case_when(
startYear < 1980 ~ "1950-1979",
startYear < 1990 ~ "1980-1989",
startYear < 2000 ~ "1990-1999",
startYear < 2010 ~ "2000-2009",
startYear < 2020 ~ "2010-2019",
TRUE ~ "2020+"
) %>% fct_relevel("1950-1979"),
log_votes = log10(numVotes)
)
vote_rating %>%
ggplot(aes(log_votes, averageRating)) +
geom_point(alpha = 0.05, size = 0.5) +
geom_smooth() +
facet_wrap(~decade) +
labs(
title = "IMDB rating vs. number of votes by decade",
subtitle = sprintf("Feature films with >= 1000 votes, n = %s", format(nrow(vote_rating), big.mark = ",")),
x = "Number of votes (log10)",
y = "Average rating"
)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/votes_vs_rating_decades.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#vote growth vs rating change: audience broadening effect
trajectories = ratings_long %>%
group_by(tconst, startYear) %>%
summarise(
first_rating = averageRating[which.min(date)],
last_rating = averageRating[which.max(date)],
first_votes = numVotes[which.min(date)],
last_votes = numVotes[which.max(date)],
rating_change = last_rating - first_rating,
vote_growth = last_votes - first_votes,
.groups = "drop"
) %>%
filter(first_votes >= 100, vote_growth > 0) %>%
mutate(
year_group = case_when(
startYear <= 2018 ~ "2015-2018",
startYear <= 2021 ~ "2019-2021",
startYear <= 2023 ~ "2022-2023",
TRUE ~ "2024-2025"
) %>% fct_relevel("2015-2018")
)
trajectories %>%
ggplot(aes(log10(vote_growth), rating_change)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_smooth() +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
facet_wrap(~year_group) +
labs(
title = "Vote growth vs. rating change over observation period",
x = "Vote growth (log10 absolute increase)",
y = "Rating change (last - first observation)"
)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/vote_growth_vs_rating_change.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'