Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  lme4,
  ggeffects,
  DBI, duckdb
)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
library(future)
plan(multisession(workers = 2))

#duckdb connection (in-memory, loaded from parquet)
con = dbConnect(duckdb())
dbExecute(con, "CREATE TABLE ratings AS SELECT * FROM read_parquet('data/ratings.parquet')")
## [1] 1.34e+08
dbExecute(con, "CREATE TABLE titles AS SELECT * FROM read_parquet('data/titles.parquet')")
## [1] 12340957
dbExecute(con, "CREATE TABLE movie_home_best AS SELECT * FROM read_parquet('data/movie_home_best.parquet')")
## [1] 3501896
#claude
if (F) {
  library(btw)
  btw::btw_mcp_session()
}

Functions

as_decimal_year = function(x) {
  tibble(
    date = x
  ) %>% 
  mutate(
    year = year(date),
    day_of_year = yday(date),
    year_length = if_else(leap_year(date), 366, 365),
    decimal_year = year + (day_of_year - 1) / year_length
  ) %>% pull(decimal_year)
}

decimal_year_to_date <- function(decimal_year) {
  tibble(decimal_year = decimal_year) %>% 
    mutate(
      year = floor(decimal_year),
      fractional_year = decimal_year - year,
      year_length = if_else(leap_year(as.Date(paste0(year, "-01-01"))), 366, 365),
      day_of_year = round(fractional_year * year_length) + 1,
      date = as.Date(paste0(year, "-01-01")) + (day_of_year - 1)
    ) %>% pull(date)
}

#today test
today() %>% 
  as_decimal_year() %>% 
  decimal_year_to_date() 
## [1] "2026-03-08"
#dates back and forth
tibble(
  date_start = seq(from = as.Date("2020-01-01"), to = today(), by = "1 day"),
  decimal_year = date_start %>% as_decimal_year(),
  date_end = decimal_year %>% decimal_year_to_date(),
  identical = date_start == date_end
) %>% 
  pull(identical) %>%
  all()
## [1] TRUE

Data

if (F) {
  #sync new rating files from server (2024-03 onwards)
  system("rsync -av --ignore-existing root@164.92.95.101:/mnt/scraping/imdb/*.tsv.gz data/daily_ratings/")

  #historical snapshots (2018-2024) downloaded from Wayback Machine
  #see: https://web.archive.org/cdx/search/cdx?url=datasets.imdbws.com/title.ratings.tsv.gz&output=json&collapse=digest

  #download latest title.basics
  download.file(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "data/title.basics.tsv.gz",
    mode = "wb"
  )

  #create ratings table if not exists
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS ratings (
      tconst TEXT,
      averageRating DOUBLE,
      numVotes INTEGER,
      date DATE
    )
  ")
  #no indexes needed — DuckDB columnar scans are fast enough

  #bulk ingest all new dates in one SQL statement
  #uses DuckDB's glob reader + filename column to extract date from filename
  dbExecute(con, "
    INSERT INTO ratings
    SELECT
      tconst, averageRating, numVotes,
      strptime(regexp_extract(filename, '(\\d{8})', 1), '%Y%m%d')::DATE AS date
    FROM read_csv('data/daily_ratings/*-title.ratings.tsv.gz',
      delim='\\t', header=true, nullstr='\\N', filename=true)
    WHERE strptime(regexp_extract(filename, '(\\d{8})', 1), '%Y%m%d')::DATE
      NOT IN (SELECT DISTINCT date FROM ratings)
  ")

  #refresh titles table
  dbExecute(con, "DROP TABLE IF EXISTS titles")
  dbExecute(con, "
    CREATE TABLE titles AS
    SELECT * FROM read_csv('data/title.basics.tsv.gz',
      delim='\\t', header=true, nullstr='\\N',
      types={'tconst': 'TEXT', 'titleType': 'TEXT', 'primaryTitle': 'TEXT',
             'originalTitle': 'TEXT', 'isAdult': 'INTEGER', 'startYear': 'INTEGER',
             'endYear': 'INTEGER', 'runtimeMinutes': 'INTEGER', 'genres': 'TEXT'})
  ")

  message(sprintf("Done. ratings: %s rows, titles: %s rows",
    format(dbGetQuery(con, "SELECT count(*) FROM ratings")[[1]], big.mark = ","),
    format(dbGetQuery(con, "SELECT count(*) FROM titles")[[1]], big.mark = ",")))
}
titles = tbl(con, "titles") %>% collect()

Cross-sectional analysis

#latest snapshot joined with titles, filtered to feature films
cross = dbGetQuery(con, "
  SELECT r.tconst, r.averageRating, r.numVotes, r.date,
         t.primaryTitle, t.startYear, t.genres, t.runtimeMinutes
  FROM ratings r
  JOIN titles t ON r.tconst = t.tconst
  WHERE r.date = (SELECT max(date) FROM ratings)
    AND t.titleType = 'movie'
    AND t.runtimeMinutes > 60
    AND t.isAdult = 0
    AND t.genres IS NOT NULL
    AND t.startYear IS NOT NULL
    AND t.startYear >= 1950
    AND r.numVotes >= 1000
") %>% as_tibble()

latest_date = cross$date[1]
latest_year = year(latest_date)

#recency dummies: 0, 1, 2, 3+ years ago
cross %<>% mutate(
  years_ago = latest_year - startYear,
  recency = case_when(
    years_ago <= 10 ~ as.character(years_ago),
    TRUE ~ "10+"
  ) %>% fct_relevel("10+")
)
#nonlinear smooth to show the decline + recent uptick
cs_slope = coef(lm(averageRating ~ startYear, data = cross))[2]

cross %>%
  ggplot(aes(startYear, averageRating)) +
  geom_point(alpha = 0.02, size = 0.5) +
  geom_smooth() +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  annotate("text", x = 1955, y = 9.5, hjust = 0, vjust = 1, size = 3.5,
    label = sprintf("Linear slope: %.4f\nn = %s movies", cs_slope, format(nrow(cross), big.mark = ","))) +
  labs(
    title = "Cross-sectional: IMDB ratings by release year",
    subtitle = sprintf("Snapshot from %s, feature films with ≥1000 votes", latest_date),
    x = "Release year",
    y = "Average rating"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/cross_sectional_nonlinear.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
#OLS with per-year recency dummies (0-10, reference = 10+)
fit_recency = lm(averageRating ~ startYear + recency, data = cross)
summary(fit_recency)
## 
## Call:
## lm(formula = averageRating ~ startYear + recency, data = cross)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.278 -0.648  0.156  0.821  3.568 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26.072092   0.766683   34.01  < 2e-16 ***
## startYear   -0.009927   0.000384  -25.84  < 2e-16 ***
## recency0     0.372604   0.116033    3.21   0.0013 ** 
## recency1     0.308891   0.038029    8.12  4.7e-16 ***
## recency10   -0.000956   0.031495   -0.03   0.9758    
## recency2     0.254883   0.032745    7.78  7.2e-15 ***
## recency3     0.139599   0.031515    4.43  9.5e-06 ***
## recency4     0.058173   0.030732    1.89   0.0584 .  
## recency5     0.008115   0.033020    0.25   0.8059    
## recency6    -0.140333   0.034586   -4.06  5.0e-05 ***
## recency7    -0.005600   0.030521   -0.18   0.8544    
## recency8     0.016702   0.030558    0.55   0.5847    
## recency9    -0.006341   0.030799   -0.21   0.8369    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.17 on 45267 degrees of freedom
## Multiple R-squared:  0.0225, Adjusted R-squared:  0.0223 
## F-statistic:   87 on 12 and 45267 DF,  p-value: <2e-16
#genre indicators
genre_cols = cross %>%
  select(tconst, genres) %>%
  separate_longer_delim(genres, ",") %>%
  mutate(val = 1L) %>%
  pivot_wider(names_from = genres, values_from = val, values_fill = 0L)

cross_genres = inner_join(cross, genre_cols, by = "tconst")
genre_names = setdiff(names(genre_cols), "tconst")

#OLS with recency dummies + genre controls
formula_full = as.formula(paste(
  "averageRating ~ startYear + recency +",
  paste(paste0("`", genre_names, "`"), collapse = " + ")
))
fit_full = lm(formula_full, data = cross_genres)
summary(fit_full)
## 
## Call:
## lm(formula = formula_full, data = cross_genres)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.000 -0.538  0.098  0.654  3.862 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.629684   0.709422   43.18  < 2e-16 ***
## startYear   -0.012285   0.000356  -34.55  < 2e-16 ***
## recency0     0.532089   0.101431    5.25  1.6e-07 ***
## recency1     0.399068   0.033379   11.96  < 2e-16 ***
## recency10    0.014344   0.027554    0.52  0.60267    
## recency2     0.330094   0.028747   11.48  < 2e-16 ***
## recency3     0.186271   0.027642    6.74  1.6e-11 ***
## recency4     0.108064   0.026931    4.01  6.0e-05 ***
## recency5     0.026081   0.028935    0.90  0.36740    
## recency6    -0.113810   0.030297   -3.76  0.00017 ***
## recency7     0.002624   0.026738    0.10  0.92181    
## recency8     0.029337   0.026768    1.10  0.27310    
## recency9     0.004089   0.026969    0.15  0.87950    
## Action      -0.232987   0.014104  -16.52  < 2e-16 ***
## Crime        0.046398   0.014262    3.25  0.00114 ** 
## Thriller    -0.098977   0.015130   -6.54  6.1e-11 ***
## Horror      -0.855755   0.016708  -51.22  < 2e-16 ***
## Mystery      0.050591   0.018421    2.75  0.00603 ** 
## Comedy      -0.065190   0.012833   -5.08  3.8e-07 ***
## Drama        0.503427   0.012553   40.10  < 2e-16 ***
## Romance     -0.013715   0.014459   -0.95  0.34284    
## Musical      0.160688   0.040102    4.01  6.2e-05 ***
## Biography    0.256906   0.022748   11.29  < 2e-16 ***
## Documentary  1.128755   0.026049   43.33  < 2e-16 ***
## Fantasy     -0.087169   0.021861   -3.99  6.7e-05 ***
## Western     -0.298268   0.045251   -6.59  4.4e-11 ***
## Adventure   -0.108979   0.017906   -6.09  1.2e-09 ***
## Animation    0.747592   0.029047   25.74  < 2e-16 ***
## Family      -0.116363   0.025298   -4.60  4.2e-06 ***
## History      0.188281   0.026028    7.23  4.8e-13 ***
## Sport        0.008059   0.035636    0.23  0.82108    
## `Sci-Fi`    -0.382174   0.022976  -16.63  < 2e-16 ***
## War          0.128844   0.032644    3.95  7.9e-05 ***
## News         0.392503   0.201650    1.95  0.05161 .  
## Music        0.067581   0.028816    2.35  0.01902 *  
## `Talk-Show` -0.414209   1.022305   -0.41  0.68535    
## `Film-Noir` -0.331414   0.075198   -4.41  1.0e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.02 on 45243 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.254 
## F-statistic:  429 on 36 and 45243 DF,  p-value: <2e-16
#compare recency bias estimates across models
bind_rows(
  broom::tidy(fit_recency) %>% mutate(model = "Year + recency"),
  broom::tidy(fit_full) %>% mutate(model = "Year + recency + genres")
) %>%
  filter(str_detect(term, "recency")) %>%
  select(model, term, estimate, std.error) %>%
  arrange(term, model)

Within-cohort IQ scores

#within-cohort standardized ratings (IQ scale: mean=100, SD=15 per release year)
cross_iq = cross %>%
  group_by(startYear) %>%
  mutate(
    cohort_mean = mean(averageRating),
    cohort_sd = sd(averageRating),
    rating_iq = 100 + 15 * (averageRating - cohort_mean) / cohort_sd
  ) %>%
  ungroup()

cross_iq %>%
  ggplot(aes(startYear, rating_iq)) +
  geom_point(alpha = 0.02, size = 0.5) +
  geom_smooth() +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  geom_hline(yintercept = 100, linetype = "dotted", alpha = 0.5) +
  labs(
    title = "Within-cohort standardized IMDB ratings (IQ scale: mean=100, SD=15)",
    subtitle = sprintf("Feature films with >= 1000 votes, n = %s", format(nrow(cross_iq), big.mark = ",")),
    x = "Release year",
    y = "Rating IQ"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/rating_iq_by_year.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
#example well-known movies
cross_iq %>%
  filter(numVotes >= 50000) %>%
  arrange(desc(rating_iq)) %>%
  select(primaryTitle, startYear, averageRating, numVotes, rating_iq) %>%
  head(20)
#conversion chart: what raw rating = which IQ level, by release year
norms = cross_iq %>%
  group_by(startYear) %>%
  summarise(cohort_mean = mean(averageRating), cohort_sd = sd(averageRating), .groups = "drop")

conversion = norms %>%
  crossing(iq = c(85, 100, 115, 130)) %>%
  mutate(raw_rating = cohort_mean + (iq - 100) / 15 * cohort_sd)

conversion %>%
  ggplot(aes(startYear, raw_rating, color = factor(iq))) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("85" = "#E69F00", "100" = "black", "115" = "#0072B2", "130" = "#D55E00")) +
  labs(
    title = "IMDB rating to IQ conversion by release year",
    subtitle = "Within-cohort norms: what raw rating corresponds to each IQ level?",
    x = "Release year", y = "Raw IMDB rating", color = NULL
  ) +
  scale_y_continuous(breaks = seq(4, 9, 0.5))

GG_save("figs/rating_iq_conversion.png")
#heatmap conversion table
rating_grid = seq(4, 9, by = 0.5)
year_grid = seq(1950, 2025, by = 5)

heatmap_data = norms %>%
  filter(startYear %in% year_grid) %>%
  crossing(raw_rating = rating_grid) %>%
  mutate(iq = round(100 + 15 * (raw_rating - cohort_mean) / cohort_sd))

heatmap_data %>%
  ggplot(aes(factor(startYear), factor(raw_rating), fill = iq)) +
  geom_tile() +
  geom_text(aes(label = iq), size = 3) +
  scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#0072B2", midpoint = 100) +
  labs(
    title = "IMDB rating to IQ conversion table",
    subtitle = "Within-cohort standardization (mean=100, SD=15 per release year)",
    x = "Release year", y = "Raw IMDB rating", fill = "IQ"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

GG_save("figs/rating_iq_heatmap.png")

Region analysis

#movie_home_best loaded from parquet in init
#join with cross-sectional data and assign origin groups
cross_region = dbGetQuery(con, "
  SELECT r.tconst, r.averageRating, r.numVotes, t.startYear,
    CASE
      WHEN h.home_region = 'US' THEN 'US'
      WHEN h.home_region = 'GB' THEN 'UK'
      WHEN h.home_region IN ('CA', 'AU', 'NZ', 'ZA', 'IE') THEN 'Other Anglo'
      WHEN h.home_region IN ('DE', 'AT', 'CH') THEN 'Germany'
      WHEN h.home_region = 'FR' THEN 'France'
      WHEN h.home_region = 'IT' THEN 'Italy'
      WHEN h.home_region IN ('SE', 'NO', 'DK', 'FI', 'IS') THEN 'Nordic'
      WHEN h.home_region = 'IN' THEN 'India'
      WHEN h.home_region IN ('CN', 'HK', 'TW') THEN 'China'
      WHEN h.home_region = 'JP' THEN 'Japan'
      WHEN h.home_region = 'KR' THEN 'South Korea'
      ELSE 'Other'
    END AS origin
  FROM ratings r
  JOIN titles t ON r.tconst = t.tconst
  LEFT JOIN movie_home_best h ON r.tconst = h.tconst
  WHERE r.date = (SELECT max(date) FROM ratings)
    AND t.titleType = 'movie'
    AND t.runtimeMinutes > 60
    AND t.isAdult = 0
    AND t.genres IS NOT NULL
    AND t.startYear IS NOT NULL
    AND t.startYear >= 1950
    AND r.numVotes >= 1000
") %>% as_tibble()

#year slopes by region
region_slopes = cross_region %>%
  group_by(origin) %>%
  summarise(n = n(), year_coef = coef(lm(averageRating ~ startYear))[2], .groups = "drop")

region_labels = region_slopes %>%
  mutate(
    label = sprintf("b = %.4f\nn = %s", year_coef, format(n, big.mark = ",")),
    startYear = 1955,
    averageRating = 9.5
  )

cross_region %>%
  ggplot(aes(startYear, averageRating)) +
  geom_point(alpha = 0.03, size = 0.5) +
  geom_smooth(method = "lm") +
  geom_smooth(color = "red", linetype = "dashed") +
  geom_text(data = region_labels, aes(label = label), hjust = 0, vjust = 1, size = 3) +
  facet_wrap(~origin, ncol = 4) +
  labs(
    title = "Rating decline by country of origin",
    subtitle = "Blue = linear fit, red dashed = nonlinear smooth",
    x = "Release year",
    y = "Average rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/decline_by_region.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#compare regional slopes to overall
overall_coef = coef(lm(averageRating ~ startYear, data = cross_region))[2]

region_slopes_plot = region_slopes %>%
  mutate(
    origin = fct_reorder(origin, year_coef),
    label = sprintf("%.4f (n=%s)", year_coef, format(n, big.mark = ","))
  )

p = ggplot(region_slopes_plot, aes(year_coef, origin)) +
  geom_point(size = 3) +
  geom_vline(xintercept = overall_coef, linetype = "dashed", color = "red", linewidth = 0.8) +
  geom_text(aes(label = label), hjust = -0.1, size = 3) +
  annotate("text", x = overall_coef, y = 12.5, label = sprintf("Overall: %.4f", overall_coef),
           color = "red", hjust = 1.1, size = 3.5) +
  labs(
    title = "Rating decline slope by country of origin vs. overall",
    subtitle = "Red dashed = overall slope across all movies",
    x = "Linear slope (rating per year)",
    y = NULL
  ) +
  xlim(-0.02, 0.012)

GG_save("figs/slope_by_region_vs_overall.png")

#composition effect: US-only vs overall
us_slope = region_slopes %>% filter(origin == "US") %>% pull(year_coef)
cat(sprintf("Overall slope: %.5f\nUS-only slope: %.5f\nComposition effect: %.5f (%.1f%% of US slope)\n",
  overall_coef, us_slope, overall_coef - us_slope, (overall_coef - us_slope) / us_slope * 100))
## Overall slope: -0.00871
## US-only slope: -0.00708
## Composition effect: -0.00162 (22.9% of US slope)

Longitudinal analysis

#example with Return of the King
rotk = dbGetQuery(con, "
  SELECT * FROM ratings WHERE tconst = 'tt0167260'
") %>% as_tibble()

rotk %>%
  ggplot(aes(date, averageRating)) +
  geom_line()

#longitudinal data: recent movies, full sample
titles_long = titles %>%
  filter(
    startYear %in% 2015:2025,
    is.na(endYear),
    titleType == "movie",
    runtimeMinutes > 60,
    isAdult == 0,
    !is.na(genres),
  )

#pull ratings via DuckDB
dbWriteTable(con, "titles_long_tmp", titles_long %>% select(tconst, startYear, genres), overwrite = TRUE)

ratings_long = dbGetQuery(con, "
  SELECT r.tconst, r.averageRating, r.numVotes, r.date,
         t.startYear, t.genres
  FROM ratings r
  JOIN titles_long_tmp t ON r.tconst = t.tconst
") %>% as_tibble()

dbExecute(con, "DROP TABLE IF EXISTS titles_long_tmp")
## [1] 0
#compute age with midpoint correction for release date
#for completed years, midpoint is +0.5 (uniform across year)
#for current year, midpoint is half the elapsed fraction
latest_decimal = as_decimal_year(max(ratings_long$date))
latest_year_int = floor(latest_decimal)
current_year_offset = (latest_decimal - latest_year_int) / 2

ratings_long %<>% mutate(
  decimal_year = as_decimal_year(date),
  release_midpoint = if_else(startYear < latest_year_int, startYear + 0.5, startYear + current_year_offset),
  age = decimal_year - release_midpoint,
  age_year = floor(age),
  year_f = as.factor(startYear)
)
#for model fitting, take 10% sample of movies (keep all timepoints per movie)
set.seed(42)
tconst_sample = ratings_long %>% distinct(tconst) %>% sample_frac(0.10)
ratings_sub = semi_join(ratings_long, tconst_sample, by = "tconst")
ratings_sub %<>% mutate(age_year_f = factor(age_year))
#filter to valid age range
ratings_sub %<>% filter(age_year >= 0, age_year <= 10)

#model 1: age as factor only (no year interaction)
fit1 = lmer(averageRating ~ age_year_f + (1 | tconst), data = ratings_sub)

#model 2: age as factor × release year interaction
fit2 = lmer(averageRating ~ year_f * age_year_f + (1 | tconst), data = ratings_sub)
## fixed-effect model matrix is rank deficient so dropping 62 columns / coefficients
#model 3: continuous age (linear) × release year
fit3 = lmer(averageRating ~ year_f * age + (1 | tconst), data = ratings_sub)

#compare age effects across models
bind_rows(
  broom.mixed::tidy(fit1, effects = "fixed") %>% mutate(model = "1: age factor only"),
  broom.mixed::tidy(fit3, effects = "fixed") %>% mutate(model = "3: linear age × year")
) %>%
  filter(str_detect(term, "age")) %>%
  select(model, term, estimate, std.error) %>%
  print(n = 30)
## # A tibble: 21 × 4
##    model                term           estimate std.error
##    <chr>                <chr>             <dbl>     <dbl>
##  1 1: age factor only   age_year_f1    -0.299    0.00196 
##  2 1: age factor only   age_year_f2    -0.412    0.00215 
##  3 1: age factor only   age_year_f3    -0.467    0.00235 
##  4 1: age factor only   age_year_f4    -0.497    0.00253 
##  5 1: age factor only   age_year_f5    -0.519    0.00265 
##  6 1: age factor only   age_year_f6    -0.535    0.00277 
##  7 1: age factor only   age_year_f7    -0.544    0.00292 
##  8 1: age factor only   age_year_f8    -0.563    0.00311 
##  9 1: age factor only   age_year_f9    -0.578    0.00345 
## 10 1: age factor only   age_year_f10   -0.603    0.00443 
## 11 3: linear age × year age            -0.0232   0.000857
## 12 3: linear age × year year_f2016:age  0.00948  0.00120 
## 13 3: linear age × year year_f2017:age -0.00824  0.00119 
## 14 3: linear age × year year_f2018:age  0.00102  0.00125 
## 15 3: linear age × year year_f2019:age -0.00894  0.00124 
## 16 3: linear age × year year_f2020:age -0.0311   0.00143 
## 17 3: linear age × year year_f2021:age -0.0643   0.00145 
## 18 3: linear age × year year_f2022:age -0.159    0.00156 
## 19 3: linear age × year year_f2023:age -0.258    0.00233 
## 20 3: linear age × year year_f2024:age -0.393    0.00487 
## 21 3: linear age × year year_f2025:age -0.780    0.0217
#plot model 2 predictions (factor age × release year — no extrapolation)
pred2_tbl = ggpredict(fit2, terms = c("age_year_f", "year_f")) %>% as.data.frame() %>% as_tibble()

pred2_tbl %>%
  mutate(age = as.numeric(as.character(x))) %>%
  ggplot(aes(age, predicted, group = group)) +
  geom_line(aes(color = group)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
  labs(
    title = "IMDB rating trajectories by movie age",
    subtitle = "Mixed model: age (factor) x release year + (1 | movie), 10% sample, 96 dates (2018-2026)",
    x = "Age (years since release)",
    y = "Predicted average rating",
    color = "Release year",
    fill = "Release year"
  )

GG_save("figs/longitudinal_age.png")
#compare cross-sectional vs longitudinal recency bias
#longitudinal: predicted rating at age 0 minus stable rating (age 5+)
long_bias = pred2_tbl %>%
  mutate(age = as.numeric(as.character(x))) %>%
  group_by(group) %>%
  summarise(
    rating_age0 = predicted[age == 0],
    rating_stable = mean(predicted[age >= 5]),
    bias = rating_age0 - rating_stable,
    .groups = "drop"
  ) %>%
  rename(release_year = group)

#cross-sectional recency bias from OLS dummies
cross_bias = broom::tidy(fit_recency) %>%
  filter(str_detect(term, "recency")) %>%
  select(term, estimate, std.error)

long_bias
cross_bias
#recency bias by age: both models combined
#longitudinal: compute bias per release year, then average with CI across years
long_by_age = pred2_tbl %>%
  mutate(age = as.numeric(as.character(x))) %>%
  group_by(group) %>%
  mutate(stable = mean(predicted[age >= 5])) %>%
  ungroup() %>%
  mutate(bias = predicted - stable) %>%
  group_by(age) %>%
  summarise(
    mean_bias = mean(bias),
    se_bias = sd(bias) / sqrt(n()),
    ci_low = mean_bias - 1.96 * se_bias,
    ci_high = mean_bias + 1.96 * se_bias,
    .groups = "drop"
  ) %>%
  transmute(age, bias = mean_bias, ci_low, ci_high, model = "Longitudinal")

cross_by_age = broom::tidy(fit_recency, conf.int = TRUE) %>%
  filter(str_detect(term, "recency")) %>%
  mutate(age = as.numeric(str_extract(term, "\\d+"))) %>%
  select(age, bias = estimate, ci_low = conf.low, ci_high = conf.high) %>%
  arrange(age) %>%
  mutate(model = "Cross-sectional")

bind_rows(long_by_age, cross_by_age) %>%
  ggplot(aes(age, bias, color = model, fill = model)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = ci_low, ymax = ci_high), alpha = 0.15, color = NA) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_x_continuous(breaks = 0:10) +
  labs(
    title = "Expected recency bias in IMDB ratings by movie age",
    subtitle = "Longitudinal: 96 snapshots (2018-2026), Cross-sectional: latest snapshot",
    x = "Movie age (years since release)",
    y = "Rating bias",
    color = "Model",
    fill = "Model"
  )

GG_save("figs/recency_bias_by_age.png")

Votes and ratings

#votes vs rating by decade
vote_rating = dbGetQuery(con, "
  SELECT r.tconst, r.averageRating, r.numVotes,
         t.startYear, t.genres
  FROM ratings r
  JOIN titles t ON r.tconst = t.tconst
  WHERE r.date = (SELECT max(date) FROM ratings)
    AND t.titleType = 'movie'
    AND t.runtimeMinutes > 60
    AND t.isAdult = 0
    AND t.genres IS NOT NULL
    AND t.startYear IS NOT NULL
    AND t.startYear >= 1950
    AND r.numVotes >= 1000
") %>% as_tibble()

vote_rating %<>% mutate(
  decade = case_when(
    startYear < 1980 ~ "1950-1979",
    startYear < 1990 ~ "1980-1989",
    startYear < 2000 ~ "1990-1999",
    startYear < 2010 ~ "2000-2009",
    startYear < 2020 ~ "2010-2019",
    TRUE ~ "2020+"
  ) %>% fct_relevel("1950-1979"),
  log_votes = log10(numVotes)
)

vote_rating %>%
  ggplot(aes(log_votes, averageRating)) +
  geom_point(alpha = 0.05, size = 0.5) +
  geom_smooth() +
  facet_wrap(~decade) +
  labs(
    title = "IMDB rating vs. number of votes by decade",
    subtitle = sprintf("Feature films with >= 1000 votes, n = %s", format(nrow(vote_rating), big.mark = ",")),
    x = "Number of votes (log10)",
    y = "Average rating"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/votes_vs_rating_decades.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#vote growth vs rating change: audience broadening effect
trajectories = ratings_long %>%
  group_by(tconst, startYear) %>%
  summarise(
    first_rating = averageRating[which.min(date)],
    last_rating = averageRating[which.max(date)],
    first_votes = numVotes[which.min(date)],
    last_votes = numVotes[which.max(date)],
    rating_change = last_rating - first_rating,
    vote_growth = last_votes - first_votes,
    .groups = "drop"
  ) %>%
  filter(first_votes >= 100, vote_growth > 0) %>%
  mutate(
    year_group = case_when(
      startYear <= 2018 ~ "2015-2018",
      startYear <= 2021 ~ "2019-2021",
      startYear <= 2023 ~ "2022-2023",
      TRUE ~ "2024-2025"
    ) %>% fct_relevel("2015-2018")
  )

trajectories %>%
  ggplot(aes(log10(vote_growth), rating_change)) +
  geom_point(alpha = 0.1, size = 0.5) +
  geom_smooth() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  facet_wrap(~year_group) +
  labs(
    title = "Vote growth vs. rating change over observation period",
    x = "Vote growth (log10 absolute increase)",
    y = "Rating change (last - first observation)"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

GG_save("figs/vote_growth_vs_rating_change.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] future_1.67.0         duckdb_1.4.4          DBI_1.2.3            
##  [4] ggeffects_2.3.1       lme4_1.1-37           Matrix_1.7-4         
##  [7] kirkegaard_2025-11-19 robustbase_0.99-6     psych_2.5.6          
## [10] assertthat_0.2.1      weights_1.1.2         magrittr_2.0.4       
## [13] lubridate_1.9.4       forcats_1.0.1         stringr_1.6.0        
## [16] dplyr_1.1.4           purrr_1.2.0           readr_2.1.5          
## [19] tidyr_1.3.1           tibble_3.3.0          ggplot2_4.0.1.9000   
## [22] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.4        mnormt_2.1.1        gridExtra_2.3      
##  [4] rlang_1.1.6.9000    furrr_0.3.1         compiler_4.5.2     
##  [7] mgcv_1.9-3          gdata_3.0.1         systemfonts_1.3.1  
## [10] vctrs_0.6.5         pkgconfig_2.0.3     shape_1.4.6.1      
## [13] fastmap_1.2.0       backports_1.5.0     dbplyr_2.5.1       
## [16] labeling_0.4.3      utf8_1.2.6          rmarkdown_2.30     
## [19] tzdb_0.5.0          haven_2.5.5         nloptr_2.2.1       
## [22] ragg_1.5.0          xfun_0.53           glmnet_4.1-10      
## [25] jomo_2.7-6          cachem_1.1.0        jsonlite_2.0.0     
## [28] pan_1.9             broom_1.0.11        parallel_4.5.2     
## [31] cluster_2.1.8.2     R6_2.6.1            bslib_0.9.0        
## [34] stringi_1.8.7       RColorBrewer_1.1-3  parallelly_1.45.1  
## [37] boot_1.3-32         rpart_4.1.24        jquerylib_0.1.4    
## [40] Rcpp_1.1.0          iterators_1.0.14    knitr_1.50         
## [43] base64enc_0.1-3     splines_4.5.2       nnet_7.3-20        
## [46] timechange_0.3.0    tidyselect_1.2.1    rstudioapi_0.17.1  
## [49] yaml_2.3.10         codetools_0.2-20    listenv_0.9.1      
## [52] lattice_0.22-7      withr_3.0.2         S7_0.2.1           
## [55] evaluate_1.0.5      foreign_0.8-91      survival_3.8-6     
## [58] pillar_1.11.1       mice_3.18.0         checkmate_2.3.3    
## [61] foreach_1.5.2       reformulas_0.4.1    insight_1.4.2      
## [64] generics_0.1.4      hms_1.1.3           scales_1.4.0       
## [67] minqa_1.2.8         gtools_3.9.5        globals_0.18.0     
## [70] glue_1.8.0          Hmisc_5.2-4         tools_4.5.2        
## [73] data.table_1.17.8   grid_4.5.2          rbibutils_2.3      
## [76] datawizard_1.3.0    colorspace_2.1-2    nlme_3.1-168       
## [79] htmlTable_2.4.3     Formula_1.2-5       cli_3.6.5          
## [82] textshaping_1.0.4   gtable_0.3.6        DEoptimR_1.1-4     
## [85] broom.mixed_0.2.9.6 sass_0.4.10         digest_0.6.39      
## [88] htmlwidgets_1.6.4   farver_2.1.2        htmltools_0.5.8.1  
## [91] lifecycle_1.0.4     mitml_0.4-5         MASS_7.3-65
#write data to file for reuse
ratings_sub %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}