library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(forcats)
library(ggplot2)
library(scales)
library(ggrepel)
library(ggtext)        # rich-text titles
library(hexbin)        # density hex-binning (fixes over-plotting)
library(systemfonts)
library(knitr)
library(kableExtra)
library(broom)
# ---------------------------------------------------------------------------
#  Visual identity — a dark, modern, "cinema in a dark room" look.
#  System font (San Francisco / Helvetica Neue), deep near-black canvas,
#  a projector-amber accent, and density hex-bins instead of point clouds.
# ---------------------------------------------------------------------------
# Distinct values of the `family` column reported by systemfonts::system_fonts().
.fams <- unique(systemfonts::system_fonts()[["family"]])
# Return the first entry of `cands` that is an available font family, or `fb`
# when none of them is.
#   cands : character vector of family names, in order of preference
#   fb    : fallback value returned when no candidate is available
#   fams  : families to match against; defaults to the `.fams` vector, so
#           existing two-argument calls behave as before
pick_font <- function(cands, fb, fams = .fams) {
  hits <- cands[cands %in% fams]
  if (length(hits) > 0) hits[1] else fb
}
# Sans-serif family used throughout: the first available candidate in the
# order listed, falling back to the generic "sans".
f_sans <- pick_font(
  cands = c("SF Pro Display", "SF Pro Text", "Helvetica Neue", "Helvetica", "Arial"),
  fb = "sans"
)

# dark palette: canvas, gridlines, primary text, secondary text
bg <- "#0b0c0f"
grid <- "#23262e"
ink <- "#ececec"
dim <- "#9aa0a8"

# accent colours
amber <- "#ffb53d"
teal <- "#34d0c3"
red <- "#ff6b5e"
green <- "#7bd88f"
gold <- "#e0a83a"
blue <- "#5aa9e6"
purple <- "#b08cf0"
sand <- "#d8b48c"

# the accents collected into one vector
cat_pal <- c(amber, teal, red, blue, green, purple, sand, gold)

# luminous ramp for density / heat
heat_ramp <- c("#16181d", "#3a2d18", "#7d5420", "#d98f2c", amber, "#ffe6b8")

# Continuous fill scale over `heat_ramp`; any further arguments are passed on
# to scale_fill_gradientn().
scale_fill_heat <- function(...) {
  scale_fill_gradientn(colours = heat_ramp, ...)
}

# House theme: theme_minimal() in the `f_sans` family on the `bg` canvas, with
# markdown-enabled title and subtitle and the legend at the top left.
#   base_size : base font size handed to theme_minimal()
theme_noir <- function(base_size = 13) {
  # Small element builders so every text element names the family explicitly.
  sans <- function(...) element_text(family = f_sans, ...)
  sans_md <- function(...) element_markdown(family = f_sans, ...)
  muted <- function(scale) sans(size = rel(scale), colour = dim)
  canvas <- element_rect(fill = bg, colour = NA)

  overrides <- theme(
    # text
    text = element_text(colour = ink),
    plot.title = sans_md(
      face = "bold", size = rel(1.4), colour = ink,
      margin = margin(b = 3), lineheight = 1.1
    ),
    plot.subtitle = sans_md(
      size = rel(0.95), colour = dim,
      lineheight = 1.2, margin = margin(b = 13)
    ),
    plot.caption = sans(
      size = rel(0.68), colour = dim,
      hjust = 1, margin = margin(t = 10)
    ),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    strip.text = sans(face = "bold", size = rel(0.84), colour = ink),
    # axes
    axis.title = muted(0.82),
    axis.title.x = element_text(margin = margin(t = 6)),
    axis.title.y = element_text(margin = margin(r = 6)),
    axis.text = muted(0.78),
    # grid
    panel.grid.major = element_line(colour = grid, linewidth = 0.3),
    panel.grid.minor = element_blank(),
    # legend
    legend.position = "top",
    legend.justification = "left",
    legend.title = muted(0.74),
    legend.text = muted(0.74),
    legend.key = element_rect(fill = NA, colour = NA),
    # backgrounds and outer margin
    plot.background = canvas,
    panel.background = canvas,
    plot.margin = margin(14, 16, 10, 14)
  )

  theme_minimal(base_size = base_size, base_family = f_sans) + overrides
}

# Make the house theme the default for every plot that follows.
theme_set(theme_noir())

# Wrap `txt` in an HTML span whose CSS colour is `col`.
hl <- function(txt, col) {
  span_template <- "<span style='color:%s;'>%s</span>"
  sprintf(span_template, col, txt)
}
# Text annotation in the house font.
#   x, y     : position in data coordinates
#   label    : text to draw
#   colour   : text colour (defaults to `ink`)
#   hjust    : horizontal justification
#   size     : text size
#   fontface : font face
ann_txt <- function(x, y, label, colour = ink, hjust = 0, size = 3.1, fontface = "italic") {
  annotate(
    "text",
    x = x, y = y, label = label,
    family = f_sans, fontface = fontface, size = size,
    colour = colour, hjust = hjust, lineheight = 0.95
  )
}

# Curved arrow annotation from (x, y) to (xend, yend) with a closed head.
#   curvature : passed to the curve geom
#   colour    : line colour (defaults to `ink`)
ann_arrow <- function(x, y, xend, yend, curvature = -0.3, colour = ink) {
  head_spec <- arrow(length = unit(0.02, "npc"), type = "closed")
  annotate(
    "curve",
    x = x, y = y, xend = xend, yend = yend,
    curvature = curvature, arrow = head_spec,
    colour = colour, linewidth = 0.45
  )
}

# Caption text passed as `caption = SRC` to the charts.
SRC <- "Source: TMDB full-catalogue dump · analysis by Tsikhan Karoukin"

# Label formatters that abbreviate large values using scales' short-scale cuts;
# `dollar_short` is the dollar-formatted variant.
num_short <- scales::label_number(scale_cut = scales::cut_short_scale())
dollar_short <- scales::label_dollar(scale_cut = scales::cut_short_scale())

A spreadsheet on my laptop holds more than a million films, and I wanted to hear what they say when you read them all at once. Where do films get made? What do they cost? Who keeps turning up in the credits of the good ones, and can you predict any of it? I turned the lights down and went looking.

Why I’m doing this

I just finished university, and I work best when I start from a question I care about and let the numbers push back. The data is almost the entire TMDB catalogue: more than 1.2 million films, a century of release dates, scores from two separate crowds, budgets, box office, genres, and the names behind the camera. It is one of the biggest free film datasets anywhere, and hardly anyone reads the whole thing. I wanted to.

A pile of charts is not analysis. I started with five questions and kept the whole report pointed at them:

  • Does money buy quality? A big budget feels like it should buy a better film. Does it, once you step back from this weekend’s headlines and look at a million films at once?
  • Are old films actually better, or do we just forget the bad ones?
  • Which countries and languages score far above their weight, and why?
  • Where do critics and crowds split apart, and what hides in that gap?
  • Can a simple model guess a film’s score from its basic facts, or does taste slip through every formula?

Under most charts you’ll find a few paragraphs where I think out loud. I read the chart, I ask what might be fooling me, and I say what I take from it. A short highlighted takeaway then gives you the one line I’d offer if we were sitting in front of the screen. Charts that earned nothing got cut, and I list the cuts near the end.

A word on honesty: this is a portfolio project, not a journal paper. Where the data is thin, or where a reading is my hunch rather than a fact, I say so. I would rather show you that I know where the limits are than fake a confidence I don’t have.

The data, in one breath

Before I trust a single chart, I want to know what I’m holding. How big is it, and where are the holes?

# ===========================================================================
#  >>> THE ONE PLACE TO SWAP THE DATASET <<<
#  Point DATA_PATH at a newer CSV (same TMDB schema) and re-knit. That's it.
# ===========================================================================
DATA_PATH <- "TMDB_all_movies.csv"

stopifnot("Data file not found — check DATA_PATH" = file.exists(DATA_PATH))
movies <- fread(DATA_PATH, showProgress = FALSE)

# Fail loudly and clearly if a newer file is missing a column we rely on.
required_cols <- c("id","title","vote_average","vote_count","status","release_date",
                   "revenue","runtime","budget","original_language","popularity",
                   "genres","production_countries","director","music_composer",
                   "imdb_rating","imdb_votes")
missing_cols <- setdiff(required_cols, names(movies))
if (length(missing_cols) > 0) {
  stop("Dataset is missing expected columns: ", paste(missing_cols, collapse = ", "))
}

# Parse release dates. When fread has left the column as text, blank strings
# are turned into NA first: as.Date() without a format guesses the format from
# the first non-NA value, and a blank string in that position can abort the
# conversion. No blanket suppressWarnings(), so genuine problems stay visible.
if (is.character(movies[["release_date"]])) {
  movies[!nzchar(release_date), release_date := NA_character_]
}
movies[, release_date := as.Date(release_date)]

# Derived calendar columns used by the time-based charts.
movies[, year := year(release_date)]
movies[, decade := floor(year / 10) * 10]

# One-line size report: row count, column count and in-memory size in GB.
cat("Loaded", format(nrow(movies), big.mark = ","), "movies x",
    ncol(movies), "columns  (",
    round(as.numeric(object.size(movies)) / 1e9, 2), "GB in memory )\n")
## Loaded 1,212,714 movies x 32 columns  ( 1.27 GB in memory )

Each row reads like a short biography: a title, a release date, a running time, what two audiences made of it, what it cost, what it earned, and a few of the people who built it. Five of the most-rated films, so you can see the shape of a row:

# Five rows with the largest `imdb_votes`, rendered as a styled table.
# The ordering index is computed and truncated first, so only five rows are
# extracted instead of materialising a sorted copy of the whole table; head()
# also returns fewer rows (not NA padding) if the table has fewer than five.
movies[head(order(-imdb_votes), 5L),
  .(title, year, vote_average, imdb_rating, runtime,
    budget = dollar_short(budget), genres = substr(genres, 1, 34))] |>
  kable(caption = "Five of the most-rated titles in the catalogue",
        col.names = c("Title","Year","TMDB","IMDb","Runtime","Budget","Genres")) |>
  kable_styling(bootstrap_options = c("hover"), full_width = FALSE)
Five of the most-rated titles in the catalogue
Title Year TMDB IMDb Runtime Budget Genres
The Shawshank Redemption 1994 8.723 9.3 142 $25M Drama, Crime
The Dark Knight 2008 8.531 9.1 152 $185M Action, Crime, Thriller
Inception 2010 8.372 8.8 148 $160M Action, Science Fiction, Adventure
Fight Club 1999 8.438 8.8 139 $63M Drama, Thriller
Interstellar 2014 8.500 8.7 169 $165M Adventure, Drama, Science Fiction

The blanks matter more than they look. In a dataset like this they are not random. They track how much attention a film got, so the gaps carry information of their own.

# Percentage of blank cells per column. A cell counts as blank when it is NA
# or, for text columns, an empty string or the literal "NA". The string
# comparison is limited to character/factor columns: for other types it can
# never add a match beyond is.na(), and on date-time columns it could fail.
# vapply() pins the result to one number per column.
na_tab <- data.table(
  column = names(movies),
  pct = 100 * vapply(movies, function(col) {
    if (is.character(col) || is.factor(col)) {
      mean(is.na(col) | col == "" | col == "NA")
    } else {
      mean(is.na(col))
    }
  }, numeric(1), USE.NAMES = FALSE)
)
# Keep the (up to) 16 columns with the most blanks. head() returns fewer rows
# when fewer columns have blanks, where `[1:16]` would pad with NA rows.
na_tab <- head(na_tab[pct > 0][order(-pct)], 16)

ggplot(na_tab, aes(reorder(column, pct), pct)) +
  geom_col(aes(fill = pct), width = 0.72) +
  geom_text(aes(label = sprintf("%.0f%%", pct)), hjust = -0.18,
            family = f_sans, size = 3, colour = dim) +
  coord_flip(clip = "off") +
  scale_fill_heat(guide = "none", limits = c(0, 100)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.13)), labels = NULL) +
  labs(title = paste0("What's ", hl("missing", amber), ", and how badly"),
       subtitle = "Share of the 1.2M rows where each field is blank. The metadata thins out fast.",
       x = NULL, y = NULL, caption = SRC)

Read it top to bottom and the industry’s quiet hierarchy shows through. The prestige credits, the composer and the cinematographer and the tagline some marketer agonised over, sit empty for most titles. The small independent films and the foreign ones never had those details written down in the first place. The same gap explains why only a minority of films carry an IMDb score. IMDb leans toward titles that found an audience, while TMDB lists the millions that never did.

The columns I lean on hardest hold up well. Runtime, vote counts, language and release date are there for nearly everything, so the spine of the analysis is solid. One fact stays taped to my monitor for the rest of the report. Anything about crew or money describes the famous, well-documented part of cinema, not all of it. When a later chart tells me a composer keeps scoring great films, part of what it measures is which films were famous enough to be catalogued in full. I point that out wherever it bites.

How good is a “good” movie?

The simplest question to ask of a ratings dataset is what the scores look like. Two crowds rate these films: TMDB’s younger, fan-heavy audience and IMDb’s older, larger one. I drew both as smooth curves so they sit on the same scale, even though one has far more films behind it.

# Long table of scores, one row per (film, audience): TMDB averages for films
# with at least one vote, IMDb ratings where present and positive.
rd <- rbind(
  movies[vote_count > 0, .(score = vote_average, src = "TMDB audience")],
  movies[!is.na(imdb_rating) & imdb_rating > 0, .(score = imdb_rating, src = "IMDb audience")]
)

# Mean score per audience, quoted in the subtitle.
m_tmdb <- rd[src == "TMDB audience", mean(score, na.rm = TRUE)]
m_imdb <- rd[src == "IMDb audience", mean(score, na.rm = TRUE)]

# Overlaid density curves; one manual scale supplies both fill and outline.
ggplot(rd, aes(score, fill = src, colour = src)) +
  geom_density(alpha = 0.28, linewidth = 1, adjust = 1.3) +
  ann_arrow(3.0, 0.30, 5.6, 0.15, curvature = 0.3, colour = dim) +
  ann_txt(1.9, 0.34, "the genuinely\npanned tail", colour = dim) +
  scale_x_continuous(limits = c(0, 10), breaks = 0:10) +
  scale_fill_manual(
    values = c("TMDB audience" = amber, "IMDb audience" = teal),
    name = NULL,
    aesthetics = c("fill", "colour")
  ) +
  labs(
    title = paste0("Almost everything lives between ", hl("6 and 7", amber)),
    subtitle = sprintf(
      "Density of ratings · means: TMDB %.2f, IMDb %.2f · the two crowds barely disagree",
      m_tmdb, m_imdb
    ),
    x = "Rating (0–10)",
    y = "Density",
    caption = SRC
  )

Neither curve sits at 5, the middle of the scale. Both pile up around 6.5, for two reasons that both ring true. The kind one is choice: people rate films they picked to watch, and you rarely sit down to a movie you expect to hate. The less kind one is score inflation. We treat 7 as “fine”, 8 as “good”, and save anything below 5 for films we actively resent.

This shapes the whole report. When almost everything scores 6 to 7, the raw number is a weak instrument, and a 7.0 only means “good, like most films”. The signal lives in the rare highs and lows, and even more in the films where the two crowds disagree. So I rank by how many people voted and by the gap between TMDB and IMDb, which both say far more than the crowded middle of this curve.

One honest note about the picture itself: the TMDB curve looks a bit spiky, like a comb. That’s real, not a glitch. Films with only a few votes land on round numbers (one 7/10 vote gives an average of exactly 7.0), so the scores pile up on whole numbers. It’s a small reminder that an “average” from three votes is barely an average at all.

The other half of “how good is it” is “how many people bothered to have an opinion at all”, and that follows one of the most uneven patterns you will ever see.

# Histogram of TMDB vote counts on a log10 x axis.
# Only the plotted column is extracted, rather than copying every column of
# the filtered table. The two labels are pinned to the top of the panel
# (y = Inf); vjust > 1 pushes them below that edge, because with the default
# vertical centring half of each label sits outside the panel and is clipped.
ggplot(movies[vote_count > 0, .(vote_count)], aes(vote_count)) +
  geom_histogram(bins = 46, fill = blue, colour = bg, linewidth = 0.15) +
  scale_x_log10(labels = num_short) +
  scale_y_continuous(labels = num_short) +
  annotate("text", x = 1.1, y = Inf, label = "the silent\nmajority",
           colour = dim, hjust = 0, vjust = 1.5, size = 3.1,
           family = f_sans, fontface = "italic", lineheight = 0.95) +
  annotate("text", x = 9000, y = Inf, label = "the blockbusters\neveryone rates",
           colour = amber, hjust = 1, vjust = 1.5, size = 3.1,
           family = f_sans, fontface = "italic", lineheight = 0.95) +
  labs(title = "A few films collect almost all the attention",
       subtitle = "TMDB vote counts on a log axis. Most films get a handful of votes.",
       x = "Number of votes (log scale)", y = "Movies", caption = SRC)

The bottom axis is logarithmic, so each step to the right means ten times more votes. Even on that squeezed scale the slope is steep and lopsided. Most films collect a few votes; a tiny group collects tens of thousands or millions. Statisticians call this a power law, and you meet the same shape in the size of cities, in wealth, and in how often words turn up in a language.

This is the clearest chart in the dataset for me, because it changes what “the movies” means. We talk about cinema as the few hundred films that get reviews and box-office coverage. Those are rare exceptions, floating on an ocean of films almost nobody has seen or rated. For a filmmaker it carries a hard lesson: being seen at all is most of the battle, and quality without attention sits close to invisible. For me it is a warning. An average from five votes is noise, so from here on I trust a score only once enough people have cast one.

How long should a movie be?

# Films whose runtime lies in the 10–240 minute range (both ends included).
rt <- movies[runtime <= 240 & runtime >= 10]

# Runtime histogram with the 90–120 minute band shaded and outlined.
ggplot(rt, aes(runtime)) +
  geom_histogram(bins = 60, fill = teal, colour = bg, linewidth = 0.15) +
  annotate(
    "rect",
    xmin = 90, xmax = 120, ymin = 0, ymax = Inf,
    fill = amber, alpha = 0.08
  ) +
  geom_vline(xintercept = c(90, 120), linetype = "22", colour = amber, linewidth = 0.5) +
  ann_txt(105, 66000, "comfort zone", colour = amber, hjust = 0.5) +
  scale_x_continuous(breaks = seq(0, 210, 30)) +
  scale_y_continuous(labels = num_short) +
  labs(
    title = paste0("Most films sit in the ", hl("90–120 minute", amber), " window"),
    subtitle = "Runtime of every film between 10 and 240 minutes.",
    x = "Runtime (minutes)",
    y = "Movies",
    caption = SRC
  )

If films could be any length, you’d expect a wide, shapeless spread. Instead there’s a tall, narrow spike right in the 90–120 minute range, and everything else drops off around it. This is one of the strongest patterns in the whole dataset, and it’s no accident: film length was shaped by cinema schedules (how many showings fit in one evening), by our attention spans, and by decades of experience about how long one story can hold a room.

The long tail on the right is the part I find interesting, the films that run well past two hours. They are not long by accident. A studio doesn’t hand a director three hours on a whim. It gives that time to people it trusts, on stories it believes can carry the weight. So when a later chart shows longer films scoring higher, I will already suspect the runtime is a sign of trust the film earned before shooting started, rather than the thing that made it good.

The money

# Long table of positive money amounts, tagged by kind. Zero entries are
# excluded by the `> 0` filters.
mon <- rbind(
  movies[budget > 0, .(amount = budget, kind = "Budget")],
  movies[revenue > 0, .(amount = revenue, kind = "Box-office revenue")]
)

# One histogram per kind, stacked vertically, sharing a log10 dollar axis.
ggplot(mon, aes(amount, fill = kind)) +
  geom_histogram(bins = 44, colour = bg, linewidth = 0.12) +
  facet_wrap(~kind, ncol = 1, scales = "free_y") +
  scale_fill_manual(
    values = c("Budget" = gold, "Box-office revenue" = teal),
    guide = "none"
  ) +
  scale_y_continuous(labels = num_short) +
  scale_x_log10(labels = dollar_short) +
  labs(
    title = "What films cost, and what they take at the box office",
    subtitle = "Only films with a real figure on record. Note the enormous range.",
    x = "US dollars (log scale)",
    y = "Movies",
    caption = SRC
  )

Film budgets cover an almost unreal range, from shorts shot for a few thousand dollars to $300-million giants. On a log axis both budgets and box office form clean, bell-shaped curves. That regular shape is useful. It says the amounts grow by multiplying, by percentages rather than fixed sums, which is why every money chart here sits on a log axis. On a normal axis a dozen blockbusters would flatten everything else against the floor.

One data-quality point sits underneath all of this. A huge number of films record a budget or box office of exactly zero, which almost always means the figure was never reported, not that the film was free or earned nothing. So every money chart uses only films with real, non-zero numbers, and that group leans toward the bigger, better-documented productions. The film shot for $20,000, or the small release that quietly took $400,000, stays invisible here.

A century of cinema

# Number of titles per release year, 1900–2024, in chronological order.
yc <- movies[year >= 1900 & year <= 2024, .N, by = year]
yc <- yc[order(year)]

# Area + line chart; the annotation heights are fractions of the busiest year.
local({
  peak <- max(yc$N)
  ggplot(yc, aes(year, N)) +
    geom_area(fill = amber, alpha = 0.16) +
    geom_line(colour = amber, linewidth = 1) +
    ann_arrow(1997, peak * 0.55, 2014, peak * 0.9, curvature = -0.3, colour = teal) +
    ann_txt(1971, peak * 0.52, "the streaming-era\nflood of titles", colour = teal) +
    scale_x_continuous(breaks = seq(1900, 2020, 20)) +
    scale_y_continuous(labels = num_short) +
    labs(
      title = "Film production went vertical after 2000",
      subtitle = "Titles released per year. Cheap cameras and streaming opened the gates.",
      x = NULL,
      y = "Movies released",
      caption = SRC
    )
})

For most of the twentieth century the line barely moves. A few hundred films a year, drifting upward as cinemas and television spread. Around the year 2000 it bends, and keeps bending, until the most recent years carry tens of thousands of releases each. The climb is almost vertical, and it dwarfs everything before it.

The barriers fell one after another. Cameras good enough for a clean image got cheap. Editing moved onto a laptop. Then the big one: you no longer needed a film print and a cinema deal to put a movie in front of people, because streaming and a plain upload reach the world on their own. Voices the old system would have turned away now get made and seen, which is a genuine gain. It also changes what the word “movie” covers. The typical film in this data is not a studio release with a marketing budget. It is something smaller, stranger and more personal, and every average I take across all films runs straight through that flood.

Are old movies actually better?

# Mean IMDb rating per release year, 1925–2023.
# Note: the per-film reliability filter is on `vote_count` (the TMDB count),
# while the score being averaged is `imdb_rating`.
yr <- movies[
  year >= 1925 & year <= 2023 & !is.na(imdb_rating) & vote_count >= 10,
  .(imdb = mean(imdb_rating, na.rm = TRUE), n = .N),
  by = year
]
# Keep years backed by at least 25 films, in chronological order.
yr <- yr[n >= 25][order(year)]

# Thin raw yearly line underneath a loess trend.
ggplot(yr, aes(year, imdb)) +
  geom_line(colour = dim, linewidth = 0.4, alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.4, se = FALSE, colour = amber, linewidth = 1.4) +
  ann_txt(1934, 6.35, "the survivors of the past\nflatter every old decade", colour = dim) +
  scale_x_continuous(breaks = seq(1930, 2020, 15)) +
  labs(
    title = paste0("Older films score higher, and ", hl("survivorship", amber), " is most of why"),
    subtitle = "Mean IMDb score by release year (≥25 films per year, each with ≥10 votes)",
    x = NULL,
    y = "Mean IMDb rating",
    caption = SRC
  )

At first glance the chart looks like proof that films are getting worse. The 1940s through the 1960s score well above recent years, and the line drifts down across the whole century. You could write a smug paragraph here about how they don’t make them like they used to. I don’t buy it for a second. The reason is the single most important idea in this report, and it has a name: survivorship bias. We only kept the survivors.

We don’t have a fair sample of 1955. We have the films from 1955 that someone cared about enough to record and rate seventy years later. The dull B-movies and the cheap quickies that flopped and vanished mostly never reached IMDb at all. So the 1955 we average is already trimmed down to its survivors, which lean good. Every 2023 release lands in the data the moment it exists, the masterpiece and the disaster side by side. Comparing the best of the past against the whole of the present, then reading the gap as “decline”, is one of the oldest mistakes in statistics.

This is why I filter by vote count and by year before trusting any time trend. The golden age is, in large part, an effect of what we chose to keep. A dataset is never the world. It is the world after a long, invisible filter has already had its say.

# IMDb ratings for films from the 1930s–2020s, with the decade turned into an
# ordered label ("1930s", ..., "2020s").
dc <- movies[!is.na(imdb_rating) & decade >= 1930 & decade <= 2020, .(imdb_rating, decade)]
dc[, dl := factor(paste0(decade, "s"), levels = paste0(seq(1930, 2020, 10), "s"))]
dc[, decade := NULL]

# Violin per decade with a slim boxplot on top; one ramp colour per label.
ggplot(dc, aes(dl, imdb_rating, fill = dl)) +
  geom_violin(colour = NA, alpha = 0.85, scale = "width") +
  geom_boxplot(
    width = 0.12, fill = bg, colour = ink,
    outlier.shape = NA, linewidth = 0.35
  ) +
  scale_fill_manual(
    values = colorRampPalette(c(teal, amber, red))(nlevels(dc$dl)),
    guide = "none"
  ) +
  labs(
    title = "Each decade makes its classics and its disasters",
    subtitle = "Full spread of IMDb scores by decade of release",
    x = NULL,
    y = "IMDb rating",
    caption = SRC
  )

If quality were collapsing, the whole range would slide down, the very best along with the average. It doesn’t. The middle drifts down a little, the survivors again, while the range stays as wide in the 2010s as it was in the 1950s. The top is just as high. What the modern decades added is a long, heavy bottom, all the recorded flops the past used to throw away.

This is the chart that kills the nostalgia for me. Every decade holds the same span of ambition and failure, with great films at the top in much the same proportion. The recent decades are only more honest about how much weak material gets made. Cinema isn’t sliding. We finally write all of it down.

The world map of cinema

How many films a country makes and how good those films are belong on two completely different maps, and mixing them up is one of the most common mistakes people make about world cinema. Let’s draw both. First, who simply makes the most films:

# Long table of (id, country): one row per country listed on a film.
# The comma-separated field is split for all rows in a single vectorised call
# and the ids repeated to match, instead of running strsplit() once per id
# group; the resulting rows are the same.
ce <- local({
  listed <- movies[production_countries != "" & !is.na(production_countries),
                   .(id, production_countries)]
  parts <- strsplit(listed$production_countries, ",", fixed = TRUE)
  data.table(id = rep(listed$id, lengths(parts)), country = trimws(unlist(parts)))
})

# Up to 18 countries with the most titles. head() returns fewer rows when
# fewer countries exist, where `[1:18]` would pad the table with NA rows.
cc <- head(ce[, .N, by = country][order(-N)], 18)

ggplot(cc, aes(reorder(country, N), N)) +
  geom_col(fill = blue, width = 0.72) +
  geom_text(aes(label = num_short(N)), hjust = -0.14, family = f_sans, size = 2.9, colour = dim) +
  coord_flip(clip = "off") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.14)), labels = NULL) +
  labs(title = "Who makes the most movies",
       subtitle = "Top production countries by title count (one film can list several)",
       x = NULL, y = NULL, caption = SRC)

The United States leads by a wide margin, which surprises no one. The rest of the list is the interesting part. Japan, France, the UK and India all make films on an industrial scale, and a long tail of smaller countries sits close behind. “Hollywood equals cinema” is a box-office illusion. It mistakes the films with the most marketing and the most screens for the films that actually get made. Counted honestly, world cinema is enormous, and the American share, large as it is, falls well short of a monopoly.

Now the map I came here for. Not who makes the most films, but who makes the best-rated ones, by language and by country:

# Mean IMDb rating per original language, keeping languages with at least 200
# rated films and then (up to) the 18 highest means. head() is used instead of
# `[1:18]`, which pads with NA rows when fewer than 18 groups pass the filter.
lr <- movies[!is.na(imdb_rating) & original_language != "",
             .(rating = mean(imdb_rating), n = .N), by = original_language]
lr <- head(lr[n >= 200][order(-rating)], 18)

ggplot(lr, aes(reorder(original_language, rating), rating)) +
  geom_segment(aes(xend = original_language, y = 5.8, yend = rating), colour = grid, linewidth = 1) +
  geom_point(aes(size = n), colour = amber) +
  coord_flip() +
  scale_size_continuous(range = c(2.5, 9), labels = num_short, name = "films rated") +
  labs(title = paste0("The ", hl("languages", amber), " with the highest average scores"),
       subtitle = "Mean IMDb score per original language (≥200 rated films). Dot size = sample.",
       x = NULL, y = "Mean IMDb rating", caption = SRC)

# Same leaderboard per production country (`ce` holds one row per film-country
# pair). Unrated films are dropped before the join rather than after it, which
# gives the same rows from a smaller merge.
cr <- merge(ce, movies[!is.na(imdb_rating), .(id, imdb_rating)], by = "id")
cr <- cr[, .(rating = mean(imdb_rating), n = .N), by = country]
cr <- head(cr[n >= 120][order(-rating)], 18)

ggplot(cr, aes(reorder(country, rating), rating)) +
  geom_segment(aes(xend = country, y = 5.5, yend = rating), colour = grid, linewidth = 1) +
  geom_point(aes(size = n), colour = teal) +
  coord_flip() +
  scale_size_continuous(range = c(2.5, 9), labels = num_short, name = "films rated") +
  labs(title = paste0("Quality by ", hl("country", teal), " of production"),
       subtitle = "Mean IMDb rating per production country (≥120 rated films)",
       x = NULL, y = "Mean IMDb rating", caption = SRC)

The two lists barely overlap, and that gap is the whole story. Volume belongs to the United States, India and Japan. The top of the quality charts belongs to places the box office never mentions: Nepal and Sri Lanka, Albania and Macedonia, Georgia and Bangladesh, the Palestinian territories and Tunisia. By language it is the same picture, carried by India’s regional cinemas (Odia, Gujarati, Marathi, Kannada) and the small nations of the Balkans and the Baltic. Persian sits nowhere near the top. Neither does anything Scandinavian. Before reaching for a tidy reason, I have to throw out the lazy one I put in an earlier draft.

That lazy version said only a country’s best films travel far enough to get rated, so the average looks inflated. In 2010 that held. Today it doesn’t. A film gets a TMDB page the week it is announced, and a Nepali drama or a Bosnian comedy gathers its votes the same way a Marvel release does. The export gate is gone. The real reasons sit closer to home, in who makes these films and who sits down to grade them.

Start with what gets made. To run a factory of disposable genre films, the straight-to-streaming horror and the formula romance that fill the bottom of any large industry, a country needs money and a paying domestic audience. Nepal, Albania and Georgia have no such factory. Getting a film financed in Skopje or Tbilisi takes a real reason: a piece of national history, a social wound, a story the country wants told about itself. The forgettable middle never gets produced, so it never drags the average down. Bangladesh sits near the top of the country chart, and one of its films turns up later in my hidden-gems list: Aguner Poroshmoni, a 1995 drama set during the 1971 war of independence, written and directed by the novelist Humayun Ahmed. A small industry makes a film like that because the war still aches at home, not because it will sell tickets in London.

Then look at who holds the pen. A Marathi or Bengali film is rated almost entirely by Marathi and Bengali speakers, in India and across the diaspora, who watch in their own language and bring the context with them. The same film barely registers anywhere else. Outside South Asia, Indian cinema gets watched for a laugh or not at all, and those viewers never bother to score it. So the rating pool fills with the sympathetic and empties of the indifferent. A Marvel film is the reverse: tens of millions rate it, including everyone it annoyed.

Politics sharpens the effect. Palestinian, Syrian, Afghan and Tunisian films all rank high, and none of these are popcorn industries. A film shot in Gaza or in post-revolution Tunis is a statement about occupation, war or freedom, and the people who seek it out arrive already on its side. Some of the warmth is admiration for the work. Some is solidarity with the cause. A 0-to-10 score cannot tell the two apart.

None of this makes the films bad, and none of it means they are secretly worse than Hollywood. The honest answer is a stack of small reasons rather than one clean law. No cheap filler at the bottom. A home crowd that grades with pride and context. Subjects that invite sympathy. And samples small enough, a few hundred to a couple of thousand films, that one beloved national release can lift the whole line. “Iran makes better films” was never it. “These countries make fewer, more serious films, watched and graded by the people they were made for” gets much closer.

Genres

# Long table of (id, imdb_rating, genre): one row per genre tag on a film.
ge <- movies[genres != "" & !is.na(genres), .(id, genres, imdb_rating)]
ge <- ge[
  ,
  .(genre = trimws(unlist(strsplit(genres, ", ")))),
  by = .(id, imdb_rating)
]

# Tag frequency, most common first, and the ten most common tags.
g_count <- ge[, .N, by = genre][order(-N)]
top_g <- g_count$genre[1:10]

# Mean IMDb rating per genre, for genres with at least 500 rated films.
g_rate <- ge[
  !is.na(imdb_rating),
  .(rating = mean(imdb_rating), n = .N),
  by = genre
][n >= 500]

# Frequency and rating side by side; genres without a rating row drop out.
gc <- merge(g_count, g_rate, by = "genre")

# Lollipop chart: position and colour show the mean rating, dot size the tag count.
ggplot(gc, aes(reorder(genre, rating), rating)) +
  geom_segment(
    aes(xend = genre, y = 5.5, yend = rating),
    colour = grid, linewidth = 1
  ) +
  geom_point(aes(size = N, colour = rating)) +
  coord_flip() +
  scale_size_continuous(range = c(2.5, 11), labels = num_short, name = "films in genre") +
  scale_colour_gradient(low = blue, high = amber, guide = "none") +
  labs(
    title = paste0("What critics reward: ", hl("documentary and history", amber), " up top, horror down low"),
    subtitle = "Mean IMDb rating by genre · dot size = how many films carry the tag",
    x = NULL,
    y = "Mean IMDb rating",
    caption = SRC
  )

How common a genre is and how respected it is turn out to be two different things. Drama and Comedy, the workhorses that carry most of cinema, sit in the middle of the pack. The top belongs to the genres people choose on purpose: Documentary, History, War. That fits. You don’t make a documentary about a subject you’re indifferent to, and the viewer who hunts one down came ready to pay attention. Serious subjects pull in committed makers and patient audiences, and the scores follow.

Horror sits dead last, and the average treats it unfairly. No label in cinema is wider. It files Hereditary and The Shining in the same drawer as Sharknado 5 and a bottomless supply of cheap streaming creature flicks. Horror is one of the cheapest genres to shoot, which is its blessing and its curse: for every Hereditary, fifty films get made in a weekend for the price of a used car, and they drag the line down. That is the trap in any genre average. It crushes an enormous range into a single number, and the number remembers the floor, not the ceiling.

# One row per film row, with its genre tags held as a list column.
gl <- movies[genres != "" & !is.na(genres), .(gs = strsplit(genres, ", ")), by = id]

# Pairwise co-occurrence counts among the `top_g` genres.
# Instead of looping over every film in R, build a film x genre 0/1 incidence
# matrix and take its cross-product: cell [a, b] is then the number of films
# tagged with both a and b. A tag repeated within one film still sets its cell
# to 1 once, matching the de-duplication the pairwise loop got from
# intersect(). The diagonal (a genre with itself) is zeroed, as the loop never
# counted it.
co <- local({
  tags <- gl$gs
  film <- rep(seq_along(tags), lengths(tags))
  tag_col <- match(unlist(tags), top_g)
  keep <- !is.na(tag_col)            # tags outside top_g are ignored
  inc <- matrix(0, nrow = length(tags), ncol = length(top_g))
  inc[cbind(film[keep], tag_col[keep])] <- 1
  pairs <- crossprod(inc)
  diag(pairs) <- 0
  dimnames(pairs) <- list(top_g, top_g)
  pairs
})

# Long form for plotting: blank the diagonal, pre-format the cell labels, and
# pick a dark label colour for cells at 45% or more of the largest count.
cdf <- as.data.table(as.table(co)); setnames(cdf, c("g1", "g2", "n"))
cdf[as.character(g1) == as.character(g2), n := NA]
cdf[, txt := ifelse(is.na(n), "", num_short(n))]
cdf[, tcol := fifelse(!is.na(n) & n >= max(n, na.rm = TRUE) * 0.45, "#1a1208", ink)]
ggplot(cdf, aes(g1, g2, fill = n)) +
  geom_tile(colour = bg, linewidth = 1.4) +
  geom_text(aes(label = txt, colour = tcol), family = f_sans, size = 2.7) +
  scale_colour_identity() +
  scale_fill_heat(na.value = "#15171c", labels = num_short, name = "co-tagged films") +
  labs(title = "Which genres get married",
       subtitle = "How often the top-10 genres share a single film (diagonal blanked)",
       x = NULL, y = NULL, caption = SRC) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1), panel.grid = element_blank())

Films almost never wear one genre. They blend, and this grid shows which blends repeat. The brightest cell by a mile is Drama plus Romance. The love story is cinema’s default setting, so common it hardly counts as a combination. Behind it sit Action plus Thriller, the engine of the modern blockbuster, and Comedy plus Romance, the rom-com, an industry of its own.

The pairing that caught me off guard is Horror plus Comedy. Fear and laughter look like opposites, yet filmmakers keep welding them together, and the mechanics explain why. Both build tension, both live or die on timing, both pay off in a sudden release. Genre behaves less like a row of sealed boxes and more like a palette a director mixes at will. The repeat mixtures say something plain about what audiences want: a love story under almost everything, and a readiness to be frightened and amused in the same minute.

Does money buy quality?

This is the question I most wanted the data to settle, because instinct and evidence point opposite ways. I’ll build up to it. First the raw money picture: budget against box office, drawn as a density map so a million points don’t collapse into one smudge. Brighter cells hold more films, and the dashed line marks break-even.

# Budget vs. box office -------------------------------------------------------
# `fin` holds the films whose budget and revenue are both positive (needed for
# the log axes). ROI is (revenue - budget) / budget.
fin <- movies[budget > 0 & revenue > 0]
set(fin, j = "ROI", value = (fin$revenue - fin$budget) / fin$budget)

ggplot(data = fin, mapping = aes(x = budget, y = revenue)) +
  scale_x_log10(labels = dollar_short) +
  scale_y_log10(labels = dollar_short) +
  scale_fill_heat(trans = "log10", labels = num_short, name = "films") +
  # Hex-binned density instead of individual points.
  geom_hex(bins = 46) +
  # revenue == budget: the break-even diagonal.
  geom_abline(
    slope = 1, intercept = 0,
    linetype = "22", colour = ink, linewidth = 0.55
  ) +
  ann_txt(2e3, 5e8, "PROFIT, above the line", colour = green, fontface = "bold") +
  ann_txt(2.5e7, 3e3, "LOSS, below the line", colour = red, fontface = "bold") +
  labs(
    title    = "Most films with real budgets do clear the bar",
    subtitle = sprintf(
      "Budget vs revenue (log–log density) · %s films with both figures recorded",
      format(nrow(fin), big.mark = ",")
    ),
    x        = "Budget",
    y        = "Revenue",
    caption  = SRC
  )

The bright core sits above the break-even line, which is a relief: films with a real budget tend to earn at least something back. The machine mostly works. Now look at how tall that cloud stands at any single budget. A $10-million film might take a few hundred thousand dollars or a few hundred million. The link between what you spend and what you make is real and very loose at the same time.

That is the first crack in the idea that money buys success. A budget buys a floor: a basic level of polish, recognisable faces, a marketing push. It does not buy the outcome. The spread runs so wide that for any one film the result comes down to things no budget controls, like timing, word of mouth, whether the thing is any good, and luck.

# Runaway hits ----------------------------------------------------------------
# The 15 highest-ROI films among those with a budget of at least $1M. The
# ROI < 1500 cap drops the most extreme ratios before ranking.
# head() rather than [1:15]: indexing past the last row of a data.table pads
# the result with all-NA rows, which would show up as an "NA" lollipop.
hits <- head(fin[budget >= 1e6 & ROI < 1500][order(-ROI)], 15)
ggplot(hits, aes(reorder(title, ROI), ROI)) +
  # Lollipop: a stem from zero to the value, then the dot.
  geom_segment(aes(xend = title, y = 0, yend = ROI), colour = grid, linewidth = 1) +
  geom_point(colour = amber, size = 3.2) +
  geom_text(aes(label = paste0(round(ROI), "×")), hjust = -0.3, family = f_sans, size = 2.8, colour = dim) +
  # clip = "off" plus the extra upper expansion keep the value labels visible.
  coord_flip(clip = "off") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.13))) +
  labs(title = paste0("The ", hl("runaway hits", amber), ": tiny budgets, absurd returns"),
       subtitle = "Highest return-on-investment among films with a ≥ $1M budget",
       x = NULL, y = "Return on investment (× budget)", caption = SRC)

# Biggest absolute losses -----------------------------------------------------
# Films with a budget of at least $10M, sorted by revenue - budget ascending
# (most negative first); the first ten are formatted for display.
# head() rather than [1:10, ...]: indexing past the last row of a data.table
# would pad the table with all-NA rows.
flops <- head(fin[budget >= 1e7][order(revenue - budget)], 10)[,
  .(Title = title, Year = year, Budget = dollar_short(budget),
    Revenue = dollar_short(revenue), `Net loss` = dollar_short(revenue - budget))]
flops |>
  kable(caption = "The other end: biggest absolute box-office losses (≥ $10M budget)") |>
  kable_styling(bootstrap_options = c("hover"), full_width = FALSE)
The other end: biggest absolute box-office losses (≥ $10M budget)
Title Year Budget Revenue Net loss
War and Peace 1968 $700M $14.00M -$686.00M
Veerabhadrudu 2026 $800M $340.00M -$460.00M
Puppet Master: The Littlest Reich 2018 $250M $781K -$249.22M
Deewangee 2002 $220M $992K -$219.01M
Wake Up Dead Man: A Knives Out Mystery 2025 $210M $4.00M -$206.00M
The Gray Man 2022 $200M $454K -$199.55M
The Tomorrow War 2021 $200M $14.40M -$185.60M
Down 2025 $180M $1.48M -$178.52M
Who Ordered Sausage? 1997 $175M $16K -$174.98M
The Killer 2023 $175M $362K -$174.64M

Read the winners’ list for the heights, not the exact order. Rocky cost about a million dollars and made Stallone a star. Gone with the Wind and the 1937 Snow White still belong on any honest version of this chart. None of these figures are adjusted for inflation, though, and a few entries are plain noise, including a filmed stage show credited with a 499-times return.

The flop table needs even more care, because most of it is not a list of flops. The Gray Man, The Tomorrow War, The Killer and the new Knives Out were all made for, or sold to, Netflix and Amazon. A streaming film carries a huge budget and almost no box office on record, so the spreadsheet reads a $200-million disaster where no ticket was ever sold to lose. Two more entries are dated 2025 and 2026 and have barely come out. And the biggest “loss” of all, the 1968 Soviet War and Peace, was funded by a state that lent the production its own army for free and never counted receipts in dollars; the numbers are guesses dressed as facts. The real lesson of this table is about reading data, not about losing money.

Now the chart that answers the main question. Budget against score, the density behind a bright trend line:

# Budget vs. IMDb score -------------------------------------------------------
# Films from `fin` that also have an IMDb rating, drawn as a hex density with
# a loess trend on top.
br <- fin[!is.na(imdb_rating)]

ggplot(data = br, mapping = aes(x = budget, y = imdb_rating)) +
  scale_x_log10(labels = dollar_short) +
  scale_fill_heat(trans = "log10", labels = num_short, name = "films") +
  geom_hex(bins = 44) +
  # Trend line only; fill = NA hides the confidence ribbon.
  geom_smooth(
    method = "loess", span = 0.5,
    colour = red, fill = NA, linewidth = 1.4
  ) +
  # Reference line at $70M, matching the annotation text.
  geom_vline(
    xintercept = 7e7,
    linetype = "22", colour = dim, linewidth = 0.4
  ) +
  ann_txt(8e7, 3.3, "the curve flattens\naround ~$70M", colour = dim) +
  labs(
    title    = "Money helps a little, then it stops helping",
    subtitle = "Budget against IMDb score. The trend rises, then flattens out.",
    x        = "Budget (log scale)",
    y        = "IMDb rating",
    caption  = SRC
  )

The honest answer is also a hopeful one. The trend line does climb as budgets grow, so money counts for something, but the climb is gentle and it levels off near $70 million. Past that point another fifty million buys spectacle, effects and movie stars. It stops buying good reviews. The people scoring films on IMDb mostly don’t care what the thing cost.

The density seals it. The best-scored films spread across the entire budget range instead of bunching at the expensive end. Some beloved films cost almost nothing; some forgettable ones cost a fortune. Money removes a few ways to fail, but the part that lifts a good film into a great one, the script and the performances and a clear point of view, is not on the menu. As answers go, I find that a hopeful one.

Crowd-pleasers vs critics’ darlings

If popularity and quality were the same thing, this chart would be a clean diagonal. It isn’t. The action sits in the corners off that line, where buzz and quality come apart. I’ve drawn it as a density map, since a million points would only make a solid blob, and split it into four corners with a score line at 7 and a popularity line.

# Popularity vs. IMDb score: the four quadrants ------------------------------
# Films with a rating, positive popularity and at least one vote, shown as a
# hex density and cut into quadrants at rating 7 and popularity 10.
pr <- movies[!is.na(imdb_rating) & popularity > 0 & vote_count > 0]

ggplot(data = pr, mapping = aes(x = popularity, y = imdb_rating)) +
  scale_x_log10() +
  scale_fill_heat(trans = "log10", labels = num_short, name = "films") +
  geom_hex(bins = 50) +
  geom_hline(yintercept = 7, linetype = "22", colour = ink, linewidth = 0.4) +
  geom_vline(xintercept = 10, linetype = "22", colour = ink, linewidth = 0.4) +
  # One label layer per corner, in the order top-left, top-right, bottom-left,
  # bottom-right. Only position, text, text colour and fill differ.
  Map(
    function(x, y, label, colour, fill) {
      annotate("label", x = x, y = y, label = label, colour = colour,
               fontface = "bold", size = 3.3, family = f_sans, fill = fill,
               alpha = 0.85, label.size = 0)
    },
    c(0.16, 110, 0.16, 110),
    c(9.0, 9.0, 3.9, 3.9),
    c("HIDDEN GEMS", "BELOVED & POPULAR", "FORGOTTEN", "OVERHYPED"),
    c(green, amber, dim, red),
    c("#10130f", "#15120a", "#121316", "#171010")
  ) +
  labs(
    title    = "The four fates of a movie",
    subtitle = "Popularity (log) vs IMDb rating · the corners tell the story, not the bright core",
    x        = "Popularity (log scale)",
    y        = "IMDb rating",
    caption  = SRC
  )

The bright centre is the ordinary majority, films with middling buzz and middling scores, where most of any catalogue lives. The corners hold the story. Top-right is the dream: films widely seen and widely loved, the Shawshank tier that everyone seems to agree on. Bottom-left is the forgotten quarter, films that weren’t good and that nobody watched, and there’s a rough justice in that.

The other two corners are the ones I can’t leave alone. Top-left, the hidden gems, are films the few people who found them scored highly while the crowd never showed. The opposite corner, overhyped, is where the marketing beat the movie, the titles everyone discussed and few enjoyed. That gap between attention and quality is the most useful thing in the dataset for a normal viewer. The loud, popular thing the feed pushes at you is not the same as the thing you would love.

# Hidden gems -----------------------------------------------------------------
# From `pr`: rating >= 7.7, popularity below the first quartile of `pr`, and at
# least 2,000 IMDb votes. Sorted by rating, then by votes, both descending; the
# first twelve are formatted for display (genres truncated to 36 characters).
# head() rather than [1:12, ...]: indexing past the last row of a data.table
# would pad the table with all-NA rows.
hg <- head(
  pr[imdb_rating >= 7.7 & popularity < quantile(popularity, 0.25) & imdb_votes >= 2000][
    order(-imdb_rating, -imdb_votes)],
  12
)[, .(Title = title, Year = year, IMDb = imdb_rating, Votes = num_short(imdb_votes),
      Genres = substr(genres, 1, 36))]
hg |>
  kable(caption = "A sample of hidden gems: highly rated, quietly under-watched (≥2k votes, bottom-quartile popularity)") |>
  kable_styling(bootstrap_options = c("hover"), full_width = FALSE)
A sample of hidden gems: highly rated, quietly under-watched (≥2k votes, bottom-quartile popularity)
Title Year IMDb Votes Genres
Sahebs Who Never Left 2022 9.4 4.179K
Johanne Sacreblu 2025 9.1 8.658K Comedy, Romance
The Web 2013 9.1 3.591K Comedy, Science Fiction, Romance, Dr
Aguner Poroshmoni 1995 9.0 3.573K Drama, War
Iron Maiden: Rock In Rio 2002 9.0 2.277K Music, Documentary
Sarsenapati Hambirrao 2022 8.9 3.121K Action, Drama, History, War
Secrets of Sinauli 2021 8.9 2.255K
The Silence of Swastika 2021 8.8 10.680K History, Documentary
Cem Yılmaz: One Taste One Texture 1999 8.8 8.581K Comedy, TV Movie
The Moromete Family 1987 8.8 4.584K Drama
Silverfang 4 1989 8.8 3.438K Animation
Aparajito 2022 8.7 5.604K Drama, History

I used to call this a watchlist built by arithmetic. Then I read the names, and the truth is stranger and better than a row of buried masterpieces. Near the very top is a concert film, Iron Maiden: Rock in Rio. Around it sit a Turkish stand-up special, Cem Yılmaz: One Taste, One Texture, and a nine-minute short, Johanne Sacreblu, that went viral in Mexico in 2025.

What these share is a small, devoted crowd. A live show scores a 9 because the people who watch it already worship the band, and they are grading Iron Maiden, not the camera work. A stand-up special scores a 9 because Turks who grew up on Cem Yılmaz turn out to rate him. The number is real. It measures a fan’s loyalty more than a film’s craft.

Johanne Sacreblu is the clearest case of the internet moving a number on purpose. A Mexican creator made it as a jab at Emilia Pérez, the awards-season hit that many in Mexico felt botched their country and their language. Thousands of Mexican viewers replied by handing the little parody nines and tens. The 9.1 is not a verdict on the film. It is a vote in an argument. Scores swing like this in days now, up when a community rallies behind something, down when a film gets review-bombed for reasons that have nothing to do with what is on the screen.

Take those out and a real pattern is left, and it matches the world map. Sarsenapati Hambirrao is a Marathi epic about a Maratha general. Aparajito restages Satyajit Ray shooting his first film. Aguner Poroshmoni returns to the 1971 war in Bangladesh. The Moromete Family adapts a beloved Romanian novel about village life before the war. A community made each of these for itself and grades it with pride, at home and across the diaspora, while the rest of the world never looks. “Underrated” has it backwards. They are rated precisely, by the people they were made for.

The people behind the camera

Reputations are easy to assert and hard to test. A million rows let me test them. Can a director be both an artist and a commercial force, or does the job force a choice? For the chart below I kept only directors with at least six rated films, so a single lucky hit can’t crown anyone.

# Directors: acclaim vs. box office ------------------------------------------
# `dm`   : rated films with a non-empty director field.
# `ds`   : per-director mean rating, film count and summed revenue, limited to
#          directors with at least six such films.
# `comm` : the subset of `ds` whose summed revenue reaches $100M.
dm <- movies[director != "" & !is.na(director) & !is.na(imdb_rating)]
ds <- dm[
  ,
  .(rating = mean(imdb_rating), n = .N, rev = sum(revenue, na.rm = TRUE)),
  by = director
]
ds <- ds[n >= 6]
comm <- ds[rev >= 1e8]

ggplot(data = comm, mapping = aes(x = rating, y = rev / 1e9)) +
  scale_size_continuous(range = c(2, 9), name = "films") +
  # y is already in billions, so only the "B" suffix is appended.
  scale_y_continuous(labels = label_dollar(suffix = "B")) +
  geom_point(mapping = aes(size = n), alpha = 0.45, colour = teal) +
  geom_smooth(
    method = "lm", se = FALSE,
    colour = amber, linewidth = 0.9, linetype = "22"
  ) +
  # Name only the extremes: more than $4B total gross or a mean above 7.7.
  geom_text_repel(
    data = comm[rev > 4e9 | rating > 7.7],
    mapping = aes(label = director),
    family = f_sans, size = 2.9, colour = ink,
    max.overlaps = 14, min.segment.length = 0, segment.colour = grid
  ) +
  labs(
    title    = paste0("The rare double threat: ", hl("acclaim", amber), " and box office"),
    subtitle = "Directors with ≥6 rated films and ≥$100M total gross · the top-right quadrant is nearly empty",
    x        = "Mean IMDb rating",
    y        = "Total box office",
    caption  = SRC
  )

A faint upward tilt runs through the cloud: better-reviewed directors do earn a little more. The real story is the empty top-right corner. Being both consistently loved and consistently enormous is something almost nobody pulls off. Christopher Nolan is the clearest case, high on both axes at once. Peter Jackson and James Cameron get close on the back of one giant trilogy or franchise each. Past them the corner thins out fast, and most directors settle somewhere along the trade-off.

The two “can’t do both” groups are the interesting part. On the high-quality, low-money side sit Quentin Tarantino, Hayao Miyazaki, the Coen brothers and India’s Rajkumar Hirani, who make films people treasure without ever needing a multiplex full of seats. On the other side stand the commercial engines, Michael Bay and Roland Emmerich among them, whose averages hover in the middle while their grosses run into the billions. Neither group failed at anything. They took different jobs. The chart’s quiet lesson is how hard it is to hold both at once, and how few people ever do.

# Only count films that cleared 1,000 IMDb votes, so a couple of vanity uploads
# can't crown an unknown. Even then the list is full of surprises.
dm_voted <- movies[director != "" & !is.na(director) & !is.na(imdb_rating) & imdb_votes >= 1000]
# Per-director mean over those films, keeping directors with at least four of
# them, best first. head() rather than [1:15]: indexing past the last row of a
# data.table would pad the result with all-NA rows.
tr <- head(
  dm_voted[, .(rating = mean(imdb_rating), n = .N), by = director][n >= 4][order(-rating)],
  15
)
ggplot(tr, aes(reorder(director, rating), rating)) +
  # Stems start at 7 rather than 0 so the differences between means are visible.
  geom_segment(aes(xend = director, y = 7, yend = rating), colour = grid, linewidth = 1) +
  geom_point(aes(size = n), colour = amber) +
  coord_flip() +
  scale_size_continuous(range = c(2.5, 8), name = "films") +
  labs(title = "The highest average scores go to people you'd never guess",
       subtitle = "Mean IMDb score, counting only films with 1,000+ votes (min. 4 such films)",
       x = NULL, y = "Mean IMDb score", caption = SRC)

This is the chart where the metric shows its hand. Even after I throw out everyone whose films never cleared a thousand votes, almost no famous name survives. The top belongs to directors of concert films and stand-up specials, David Mallet and Wayne Isham and Rocco Urbisci, whose fans grade the show rather than the filmmaking. Right behind them comes a wall of South Indian cinema, Vetrimaaran and K. Balachander and Rishab Shetty of Kantara, rated by some of the most engaged audiences on earth, alongside Pixar’s Pete Docter and the director of the Demon Slayer films. Nolan and Spielberg are nowhere near it.

The reason is plain once you see it. A high average rewards a narrow, loyal crowd that all loves the same thing. The moment a director gets famous enough for tens of millions to watch, the audience widens, the sceptics pile in, and the average drifts back toward the middle. “Highest-rated director” measures not who is best but who has the most devoted and least diluted following, and that points at fandoms, regional cinema and animation long before it points at the Hollywood A-list.

# Most prolific composers -----------------------------------------------------
# Per-composer mean rating and film count over rated films with a non-empty
# composer field; composers with at least 12 films, ranked by film count.
# head() rather than [1:18]: indexing past the last row of a data.table would
# pad the result with all-NA rows, which would show up as an "NA" bar.
cmp <- head(
  movies[music_composer != "" & !is.na(music_composer) & !is.na(imdb_rating),
         .(rating = mean(imdb_rating), n = .N), by = music_composer][n >= 12][order(-n)],
  18
)
ggplot(cmp, aes(reorder(music_composer, n), n)) +
  # Bar length = number of films; bar colour = mean rating.
  geom_col(aes(fill = rating), width = 0.72) +
  geom_text(aes(label = sprintf("%.1f", rating)), hjust = -0.2, family = f_sans, size = 2.7, colour = dim) +
  # clip = "off" plus the extra upper expansion keep the value labels visible.
  coord_flip(clip = "off") +
  scale_fill_gradient(low = blue, high = amber, name = "mean IMDb") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.13))) +
  labs(title = "The maestros who score the most cinema",
       subtitle = "Most prolific composers · bar length = films, colour and number = mean IMDb rating",
       x = NULL, y = "Films scored", caption = SRC)

The composer is the most important person most viewers never notice, and the busiest of them are staggering. The name at the very top is not John Williams or Hans Zimmer. It is Ilaiyaraaja, who has scored well over three hundred films, almost all in Tamil, which says everything about the scale of South Indian cinema. Ennio Morricone, Max Steiner and Jerry Goldsmith fill out the rest, the men who gave the twentieth century its sound.

Look at the colour, though. These averages sit in the low-to-mid sixes, ordinary by the standards of this dataset. A composer who scores three hundred films works on masterpieces and on forgettable jobs alike, and the average lands where most films land. Prolific and acclaimed are different things. The lesson here is scale, not quality. A small guild writes the music for an enormous share of all cinema, and you have heard far more of their work than you could ever name.

How long, and does it matter?

# Runtime vs. IMDb score ------------------------------------------------------
# Rated films running between 60 and 210 minutes inclusive.
rrd <- movies[!is.na(imdb_rating) & runtime >= 60 & runtime <= 210]

ggplot(data = rrd, mapping = aes(x = runtime, y = imdb_rating)) +
  scale_fill_heat(trans = "log10", labels = num_short, name = "films") +
  geom_hex(bins = 44) +
  # Trend line only; fill = NA hides the confidence ribbon.
  # NOTE(review): loess is fitted on every row of `rrd`; if that table is very
  # large this layer may be slow and memory-hungry. Confirm it is acceptable.
  geom_smooth(
    method = "loess", span = 0.5,
    colour = red, fill = NA, linewidth = 1.4
  ) +
  labs(
    title    = "Longer films tend to score a little higher",
    subtitle = "Runtime against IMDb score. The slope is real but gentle.",
    x        = "Runtime (minutes)",
    y        = "IMDb rating",
    caption  = SRC
  )

The line climbs: longer films do score a little higher on average. I gave away my reading earlier and I’ll hold to it. The cause runs backward from how it looks. Adding thirty minutes to a film does not improve it. Studios hand out long runtimes only to directors and material they already trust, so a three-hour cut is a vote of confidence cast before anyone bought a ticket. Length marks expected quality rather than creating it. Stretch a weak script to three hours and you get a longer weak film.

# Runtime by genre ------------------------------------------------------------
# Join the runtime-filtered films to the long (id, genre) table `ge`, keeping
# only genres in `top_g`. A film with several of those tags contributes one
# row per tag.
rg <- merge(rrd[, .(id, runtime)], ge[, .(id, genre)], by = "id")[genre %in% top_g]
# Fix the x-axis order to the order of `top_g`.
rg[, genre := factor(genre, levels = top_g)]
ggplot(rg, aes(genre, runtime, fill = genre)) +
  # Outliers are hidden; coord_cartesian() below zooms without dropping data,
  # so the box statistics are computed on every row.
  geom_boxplot(colour = ink, outlier.shape = NA, linewidth = 0.3, alpha = 0.8) +
  # One colour per level: size the ramp from `top_g` rather than a literal 10,
  # so the palette can never fall short of the number of genres.
  scale_fill_manual(values = colorRampPalette(c(teal, blue, purple, red))(length(top_g)),
                    guide = "none") +
  coord_cartesian(ylim = c(70, 165)) +
  labs(title = "Which genres ask for your whole evening",
       subtitle = "Runtime by genre · drama and history stretch out; horror and animation stay lean",
       x = NULL, y = "Runtime (minutes)", caption = SRC) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

Split by genre, the runtimes land where common sense says they should, which is its own kind of reassurance that the data behaves. Drama and historical films run longest, because a whole life or a whole era needs room to breathe. Horror and animation keep it short. Horror dies the moment the tension breaks, and animation costs a fortune per minute, so every second has to earn its place. Action settles near two hours, enough for a few set-pieces and a finale, not so long that the energy drains away.

Do the two crowds agree?

# TMDB vs. IMDb agreement -----------------------------------------------------
# Films with a positive IMDb rating, at least 50 votes in `vote_count` and at
# least 100 in `imdb_votes`. `r` is the Pearson correlation between the two
# scores over complete pairs.
bs <- movies[!is.na(imdb_rating) & imdb_rating > 0 & vote_count >= 50 & imdb_votes >= 100]
r <- cor(bs$vote_average, bs$imdb_rating, use = "complete.obs")

ggplot(data = bs, mapping = aes(x = vote_average, y = imdb_rating)) +
  scale_fill_heat(trans = "log10", labels = num_short, name = "films") +
  geom_hex(bins = 50) +
  # Dashed y = x line: where both scores would agree exactly.
  geom_abline(
    slope = 1, intercept = 0,
    linetype = "22", colour = ink, linewidth = 0.5
  ) +
  # Fitted straight line through the data.
  geom_smooth(method = "lm", se = FALSE, colour = teal, linewidth = 1) +
  # Correlation badge in the top-left corner.
  annotate(
    "label",
    x = 2.3, y = 9, label = sprintf("r = %.2f", r),
    family = f_sans, fontface = "bold", size = 5, colour = amber,
    fill = "#15120a", label.size = 0, label.r = unit(0.4, "lines")
  ) +
  labs(
    title    = "Two different crowds, one verdict",
    subtitle = sprintf(
      "Well-voted films (≥50 TMDB, ≥100 IMDb votes) · n = %s · dashed = perfect agreement",
      format(nrow(bs), big.mark = ",")
    ),
    x        = "TMDB vote average",
    y        = "IMDb rating",
    caption  = SRC
  )

Given how loudly the internet fights about films, I expected these two crowds to split. They mostly don’t, though seeing it took one bit of cleanup. An average built on three votes means nothing, so I limited the chart to well-voted films, at least 50 votes on TMDB and 100 on IMDb. Without that filter the link sags to a noisy 0.46, dragged down by films whose “average” rests on a handful of clicks. On films enough people actually rated, the cloud tightens to a clean diagonal and the link reaches r ≈ 0.84, strong by any social-science standard. Two platforms, two audiences, founded decades apart, and they broadly agree on what is good. TMDB runs a touch higher, which fits its younger, keener crowd. Popular taste is far more shared than the comment sections suggest.

The part I’d chase next is the spread around that line, the films where the two crowds genuinely split. Those gaps carry signal. A film TMDB loves and IMDb shrugs at is usually recent, with a young, organised fan base behind it. The reverse, high on IMDb and quiet on TMDB, tends to be an older classic whose reputation is settled among long-time fans but hasn’t reached the newer crowd. It is also where manipulation lives. A coordinated push on a forum can move a thin score in a day, one more reason I check the vote count before I trust the number.

Can a model predict greatness?

The honest test of whether you understand something is whether you can predict it. Can a model guess a film’s IMDb score from its plain facts? I built a deliberately simple one, the kind you can read off the page. The aim is not a high leaderboard number. It is to see which levers move a score, and how much of the picture stays beyond their reach.

# Modelling table -------------------------------------------------------------
# Rated films from 1960-2024 with a runtime of 30-240 minutes, plus derived
# predictors. `mc` is the complete-case subset of the model columns.
md <- movies[!is.na(imdb_rating) & year >= 1960 & year <= 2024 & runtime >= 30 & runtime <= 240]

# Log-scaled counts and money. Budget and revenue become NA when not positive,
# so those films drop out of the complete-case table below.
md[, c("log_votes", "log_popularity", "log_budget", "log_revenue", "has_tagline") := .(
  log10(vote_count + 1),
  log10(popularity + 1),
  ifelse(budget  > 0, log10(budget),  NA),
  ifelse(revenue > 0, log10(revenue), NA),
  as.integer(tagline != "" & !is.na(tagline))
)]

# One 0/1 flag per genre: 1 when the genre name occurs in the `genres` string.
flag_genres <- c("Drama","Comedy","Thriller","Action","Horror","Romance","Documentary")
for (g in flag_genres) {
  set(md, j = paste0("is_", g), value = as.integer(grepl(g, md$genres, fixed = TRUE)))
}

# Response first, then the numeric predictors, then the genre flags.
mcols <- c("imdb_rating","runtime","log_votes","log_popularity","log_budget","log_revenue",
           "has_tagline", paste0("is_", flag_genres))
# Keep only rows with no missing value in any model column.
mc <- na.omit(md[, ..mcols])

Before the model, a look at how the numeric columns relate. Strong links between inputs are a warning sign, since they make each input’s effect hard to separate, and they tell a small story of their own.

# Correlation heatmap of the numeric model columns ---------------------------
# Pearson correlations over `mc`, upper triangle (plus diagonal) only.
nums <- c("imdb_rating","runtime","log_votes","log_popularity","log_budget","log_revenue")
# Blank the lower triangle so each pair is drawn once.
cm <- cor(mc[, ..nums]); cm[lower.tri(cm)] <- NA
cmdf <- as.data.table(as.table(cm))[!is.na(N)]; setnames(cmdf, c("v1","v2","r"))
# Display labels, in the same order as `nums`; the factor levels fix axis order.
nice <- c(imdb_rating="IMDb rating", runtime="runtime", log_votes="log votes",
          log_popularity="log popularity", log_budget="log budget", log_revenue="log revenue")
cmdf[, `:=`(v1 = factor(nice[v1], levels = nice), v2 = factor(nice[v2], levels = nice))]
# Label colour lives in the data and is mapped through aes(), the same way the
# genre co-occurrence heatmap does it. Passing `cmdf$r` as a bare vector
# outside aes() only lines up while the layer's rows stay in table order.
cmdf[, tcol := fifelse(abs(r) > 0.6, "#11131a", ink)]
ggplot(cmdf, aes(v1, v2, fill = r)) +
  geom_tile(colour = bg, linewidth = 1.4) +
  geom_text(aes(label = sprintf("%.2f", r), colour = tcol), family = f_sans, size = 2.9) +
  scale_colour_identity() +
  # Diverging scale pinned to [-1, 1] so zero always maps to the dark midpoint.
  scale_fill_gradient2(low = teal, mid = "#15171c", high = red, midpoint = 0, limits = c(-1, 1), name = "r") +
  labs(title = "What moves with what",
       subtitle = "Correlations among the numeric predictors",
       x = NULL, y = NULL, caption = SRC) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1), panel.grid = element_blank())

The strongest links are the ones you’d guess. Budget, box office and popularity rise together, because expensive films get more marketing and more eyes. The score itself barely tracks any single column, which already hints at how the model will do. Nothing is tangled enough to break a regression, so I fit a plain linear one and held back a fifth of the data to test it on films it had never seen.

# Train/test split and linear model ------------------------------------------
# 80% of `mc` (rounded down) is sampled for training under a fixed seed; the
# rest is held out. The model regresses the rating on every other column.
set.seed(42)
idx <- sample.int(nrow(mc), size = floor(0.8 * nrow(mc)))
train <- mc[idx]
test <- mc[-idx]
fit <- lm(imdb_rating ~ ., data = train)

# Predictions on the held-out rows.
set(test, j = "pred", value = predict(fit, newdata = test))

# NOTE(review): `r2` is the squared correlation between prediction and truth,
# which is not the same quantity as 1 - SSE/SST on held-out data. Confirm the
# "R² (test)" label is the intended reading.
r2   <- with(test, cor(pred, imdb_rating)^2)
rmse <- with(test, sqrt(mean((pred - imdb_rating)^2)))
mae  <- with(test, mean(abs(pred - imdb_rating)))

kable_styling(
  kable(
    data.frame(Metric = c("R² (test)","RMSE","MAE"), Value = round(c(r2, rmse, mae), 3)),
    caption = sprintf("Held-out performance · trained on %s films, tested on %s",
                      format(nrow(train), big.mark = ","), format(nrow(test), big.mark = ","))
  ),
  bootstrap_options = c("hover"), full_width = FALSE
)
Held-out performance · trained on 9,414 films, tested on 2,354
Metric Value
R² (test) 0.307
RMSE 0.915
MAE 0.660
# Coefficient chart -----------------------------------------------------------
# Tidy the fitted model, drop the intercept, and turn term names into labels:
# "log_x" -> "log x", "is_X" -> "genre: X", remaining underscores -> spaces.
cf <- as.data.table(tidy(fit))
cf <- cf[term != "(Intercept)"]
cf[, nice := gsub("log_", "log ", term)]
cf[, nice := gsub("is_", "genre: ", nice)]
cf[, nice := gsub("_", " ", nice)]

ggplot(
  data = cf,
  mapping = aes(x = reorder(nice, estimate), y = estimate, fill = estimate > 0)
) +
  # Bar colour encodes the sign of the coefficient.
  scale_fill_manual(
    values = c("TRUE" = green, "FALSE" = red),
    labels = c("TRUE" = "lifts rating", "FALSE" = "lowers rating"),
    name   = NULL
  ) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 0, colour = ink, linewidth = 0.4) +
  coord_flip() +
  labs(
    title    = "What the model thinks predicts a good rating",
    subtitle = "Linear-model coefficients · direction and size of each lever",
    x        = NULL,
    y        = "Coefficient (effect on IMDb rating)",
    caption  = SRC
  )

Two things hold at once, and the point is to keep both. First, the model explains only about 31% of the differences in scores on films it never saw. That is the finding, not a failure. Most of what makes a film work is missing from these columns. You can’t read greatness off a runtime and a budget and a few genre tags, because it lives in the writing and the acting and the timing, the thousand small choices no row here records. As a film fan I find that a relief. The data agrees that a film is a piece of culture, not a product you can spec on a sheet.

Second, inside that limit the levers it does find make sense, which tells me it isn’t inventing them. A Documentary or Drama tag and a heavy vote count push the predicted score up; a Horror tag pulls it down, all in line with the genre chart. One effect closes the loop with the money section. Once popularity and box office are held constant, a bigger budget nudges the prediction down. That is the blockbuster penalty turning up as a coefficient, the expensive spectacle scoring a notch below the smaller, well-told film. With no knowledge of any earlier chart, the model reached the same verdict on its own. Money is not what makes a film good.

Mount Olympus: the top 100

# Top 100 ---------------------------------------------------------------------
# Films with more than 10,000 IMDb votes and a rating above 7.5, sorted by
# rating and then by votes (both descending); the first 100 are formatted for
# display, with director and genres truncated to keep the table narrow.
# head() rather than [1:100, ...]: indexing past the last row of a data.table
# would pad the table with all-NA rows.
top100 <- head(
  movies[imdb_votes > 10000 & imdb_rating > 7.5][order(-imdb_rating, -imdb_votes)],
  100
)[, .(Title = title, Year = year, IMDb = imdb_rating, Votes = num_short(imdb_votes),
      TMDB = round(vote_average, 1), Runtime = paste0(runtime, "m"),
      Director = substr(director, 1, 22), Genres = substr(genres, 1, 30))]
top100 |>
  kable(caption = "The 100 highest-rated films with a real vote base (>10k IMDb votes)") |>
  kable_styling(bootstrap_options = c("hover","condensed"), font_size = 11) |>
  scroll_box(height = "560px")
The 100 highest-rated films with a real vote base (>10k IMDb votes)
Title Year IMDb Votes TMDB Runtime Director Genres
Metal Gear Solid 3: Existence 2005 9.5 12.302K 8.2 208m Hideo Kojima Thriller, Action
The Shawshank Redemption 1994 9.3 3.1990M 8.7 142m Frank Darabont Drama, Crime
Doctor Who: The Day of the Doctor 2013 9.3 21.110K 8.2 77m Nick Hurran Science Fiction, Adventure
The Godfather Trilogy: 1901-1980 1992 9.3 16.681K 8.9 583m Francis Ford Coppola Crime, Drama
The Godfather 1972 9.2 2.2320M 8.7 175m Francis Ford Coppola Drama, Crime
CMYLMZ 2008 9.2 10.938K 8.8 174m Murat Dündar Comedy
The Dark Knight 2008 9.1 3.1798M 8.5 152m Christopher Nolan Action, Crime, Thriller
The Chaos Class 1975 9.1 46.354K 8.0 85m Ertem Eğilmez Comedy, Drama
Ramayana : The Legend of Prince Rama 1993 9.1 17.800K 8.2 135m Koichi Sasaki, Yûgô Sa Animation, Action, Adventure,
The Lord of the Rings: The Return of the King 2003 9.0 2.1702M 8.5 201m Peter Jackson Adventure, Fantasy, Action
Schindler’s List 1993 9.0 1.5884M 8.6 195m Steven Spielberg Drama, History, War
The Godfather Part II 1974 9.0 1.4980M 8.6 202m Francis Ford Coppola Drama, Crime
12 Angry Men 1957 9.0 987.606K 8.6 97m Sidney Lumet Drama
CM101MMXI Fundamentals 2013 9.0 48.478K 8.2 139m Murat Dündar Comedy, TV Movie, Documentary
Attack on Titan: THE LAST ATTACK 2024 9.0 31.065K 8.7 145m Yuichiro Hayashi Animation, Action, Adventure,
The Lord of the Rings: The Fellowship of the Ring 2001 8.9 2.2124M 8.4 179m Peter Jackson Adventure, Fantasy, Action
David Attenborough: A Life on Our Planet 2020 8.9 36.208K 8.4 83m Jonathan Hughes, Keith Documentary, Drama
Mirror Game 2016 8.9 31.048K 7.9 144m Amitabh Reza Chowdhury Crime, Mystery, Thriller
Tosun Pasha 1976 8.9 26.456K 8.0 90m Kartal Tibet Comedy
O.J.: Made in America 2016 8.9 23.980K 8.4 467m Ezra Edelman Documentary, Crime, History
The Message 1976 8.9 12.003K 8.6 206m Moustapha Akkad History, Drama, Adventure, Act
Inception 2010 8.8 2.8271M 8.4 148m Christopher Nolan Action, Science Fiction, Adven
Fight Club 1999 8.8 2.6196M 8.4 139m David Fincher Drama, Thriller
Forrest Gump 1994 8.8 2.5041M 8.5 142m Robert Zemeckis Comedy, Drama, Romance
Pulp Fiction 1994 8.8 2.4419M 8.5 154m Quentin Tarantino Thriller, Crime, Comedy
The Lord of the Rings: The Two Towers 2002 8.8 1.9624M 8.4 179m Peter Jackson Adventure, Fantasy, Action
The Good, the Bad and the Ugly 1966 8.8 892.285K 8.5 161m Sergio Leone Western
The Foster Brothers 1976 8.8 22.597K 8.0 80m Ertem Eğilmez Comedy, Family
The Marathon Family 1982 8.8 18.198K 7.6 92m Slobodan Šijan Comedy, Drama
C/o Kancharapalem 2018 8.8 10.867K 7.3 152m Venkatesh Maha Drama, Romance
The Phantom of the Opera at the Royal Albert Hall 2011 8.8 10.860K 8.3 160m Laurence Connor, Nick Music, Drama, Romance
The Silence of Swastika 2021 8.8 10.680K 6.3 56m History, Documentary
Interstellar 2014 8.7 2.5466M 8.5 169m Christopher Nolan Adventure, Drama, Science Fict
The Matrix 1999 8.7 2.2548M 8.2 136m Lana Wachowski, Lilly Action, Science Fiction
The Empire Strikes Back 1980 8.7 1.5088M 8.4 124m Irvin Kershner Adventure, Action, Science Fic
GoodFellas 1990 8.7 1.3964M 8.5 145m Martin Scorsese Drama, Crime
12th Fail 2023 8.7 173.166K 7.9 146m Vidhu Vinod Chopra Drama
777 Charlie 2022 8.7 47.863K 7.7 166m Kiranraj K. Drama, Adventure, Comedy
Kill Bill: The Whole Bloody Affair 2011 8.7 44.715K 8.1 254m Quentin Tarantino Action, Crime
Stop Making Sense 1984 8.7 24.237K 8.3 88m Jonathan Demme Documentary, Music
The Chaos Class Is Waking Up 1976 8.7 22.465K 7.8 94m Ertem Eğilmez Comedy
Saban, Son of Saban 1977 8.7 20.074K 7.8 90m Ertem Eğilmez Comedy
#Home 2021 8.7 19.809K 7.7 162m Rojin Thomas Drama, Family
Peranbu 2019 8.7 18.837K 7.3 147m Ram Drama, Family
Demon Slayer: Kimetsu no Yaiba Mt. Natagumo Arc 2021 8.7 18.706K 7.8 138m Haruo Sotozaki Animation, Action, Fantasy
Who’s Singin’ Over There? 1980 8.7 17.962K 7.6 86m Slobodan Šijan Drama, Comedy, Adventure
Solo Leveling -ReAwakening- 2024 8.7 17.300K 7.1 116m Shunsuke Nakashige Action, Adventure, Fantasy, An
Manichitrathazhu 1993 8.7 14.443K 7.5 169m Fazil Horror, Comedy, Mystery
Balkan Spy 1984 8.7 13.484K 7.7 92m Dušan Kovačević, Božid Drama, Comedy
Neşeli Günler 1978 8.7 13.034K 8.2 95m Orhan Aksoy Comedy, Family
Shoah 1985 8.7 11.637K 8.2 566m Claude Lanzmann Documentary, History
Dave Chappelle: Killin’ Them Softly 2000 8.7 11.562K 8.1 57m Stan Lathan Comedy, TV Movie
Kadaisi Vivasayi 2022 8.7 10.628K 8.6 145m M. Manikandan Drama
Se7en 1995 8.6 2.0292M 8.4 127m David Fincher Crime, Mystery, Thriller
The Silence of the Lambs 1991 8.6 1.7285M 8.3 119m Jonathan Demme Crime, Thriller, Drama
Saving Private Ryan 1998 8.6 1.6449M 8.2 169m Steven Spielberg War, Drama, History
Star Wars 1977 8.6 1.5741M 8.2 121m George Lucas Adventure, Action, Science Fic
The Green Mile 1999 8.6 1.5659M 8.5 189m Frank Darabont Fantasy, Drama, Crime
Terminator 2: Judgment Day 1991 8.6 1.2870M 8.2 137m James Cameron Action, Thriller, Science Fict
One Flew Over the Cuckoo’s Nest 1975 8.6 1.1584M 8.4 135m Miloš Forman Drama
Spirited Away 2001 8.6 971.541K 8.5 125m Hayao Miyazaki Animation, Family, Fantasy
City of God 2002 8.6 873.248K 8.4 129m Fernando Meirelles Drama, Crime
Life Is Beautiful 1997 8.6 814.916K 8.4 116m Roberto Benigni Comedy, Drama
It’s a Wonderful Life 1946 8.6 554.905K 8.3 131m Frank Capra Drama, Family, Fantasy
Seven Samurai 1954 8.6 402.617K 8.4 207m Akira Kurosawa Action, Drama
Severance NA 8.6 393.375K 0.0 942m Sam Donovan, Jessica L
Jai Bhim 2021 8.6 234.825K 7.3 164m T. J. Gnanavel Crime, Drama, Mystery, History
Soorarai Pottru 2020 8.6 132.172K 7.8 149m Sudha Kongara Prasad Drama, Action
Harakiri 1962 8.6 91.929K 8.4 135m Masaki Kobayashi Action, Drama, History
Rocketry: The Nambi Effect 2022 8.6 62.256K 7.6 154m R. Madhavan Drama, History
Bo Burnham: Inside 2021 8.6 59.732K 8.0 88m Bo Burnham Comedy, Drama
20 Days in Mariupol 2023 8.6 58.857K 8.1 94m Mstyslav Chernov Documentary, War
Anbe Sivam 2003 8.6 28.353K 7.6 160m Sundar C Comedy, Drama
Nayakan 1987 8.6 27.943K 7.8 156m Mani Ratnam Drama, Crime
Anne of Green Gables 1985 8.6 24.563K 8.1 199m Kevin Sullivan Family, Drama
Night and Fog 1956 8.6 23.508K 8.3 32m Alain Resnais Documentary, History
A Dog’s Will 2000 8.6 21.704K 8.4 104m Guel Arraes Comedy, Drama, Fantasy, Romanc
Pariyerum Perumal 2018 8.6 20.565K 8.2 155m Mari Selvaraj Romance, Drama
Earthlings 2005 8.6 20.541K 8.1 95m Shaun Monson Documentary
The Chaos Class Is on Vacation 1977 8.6 20.322K 7.6 97m Ertem Eğilmez Adventure, Comedy, Drama
Pretty Village, Pretty Flame 1996 8.6 19.033K 8.0 130m Srđan Dragojević Drama, War
Feyzo, The Polite One 1978 8.6 18.695K 7.9 83m Atıf Yılmaz Comedy
The Broken Landlord 1985 8.6 17.736K 8.2 101m Nesli Çölgeçen Drama, Comedy
Making “The Matrix” 1999 8.6 14.375K 7.3 26m Josh Oreck Documentary
Toma 2021 8.6 13.368K 6.9 140m Zoran Lisinac, Dragan Drama, Music, History
Pink Floyd: Live at Pompeii 1972 8.6 11.563K 8.1 62m Adrian Maben Music, Documentary
My Dear Brother 1973 8.6 11.280K 8.3 92m Ertem Eğilmez Drama
Mission Muh Dikhayi 2025 8.6 10.767K 0.0 53m Suryansh Thakur Family, Comedy, Drama
The Untold History Of The United States 2012 8.6 10.115K 8.2 585m Oliver Stone Documentary, History
Django Unchained 2012 8.5 1.8900M 8.2 165m Quentin Tarantino Drama, Western
Gladiator 2000 8.5 1.8559M 8.2 155m Ridley Scott Action, Drama, Adventure
The Prestige 2006 8.5 1.5947M 8.2 130m Christopher Nolan Drama, Mystery, Science Fictio
The Departed 2006 8.5 1.5593M 8.2 151m Martin Scorsese Drama, Thriller, Crime
Back to the Future 1985 8.5 1.4576M 8.3 116m Robert Zemeckis Adventure, Comedy, Science Fic
Léon: The Professional 1994 8.5 1.3543M 8.3 111m Luc Besson Crime, Drama, Action
American History X 1998 8.5 1.2761M 8.3 119m Tony Kaye Drama
The Lion King 1994 8.5 1.2631M 8.3 89m Roger Allers, Rob Mink Animation, Family, Drama
The Usual Suspects 1995 8.5 1.2415M 8.2 106m Bryan Singer Drama, Crime, Thriller
Parasite 2019 8.5 1.1717M 8.5 133m Bong Joon Ho Comedy, Thriller, Drama
Whiplash 2014 8.5 1.1614M 8.4 107m Damien Chazelle Drama, Music, Thriller

Scroll to the top of the list and the number-one film is a compilation of cutscenes from Metal Gear Solid 3, a video game. That one line tells you to read this table with humour as well as respect. The canon is here, of course: The Shawshank Redemption, The Godfather, The Lord of the Rings, 12 Angry Men, the films almost anyone would defend. They sit beside filmed concerts and stage musicals, three or four Turkish comedies that Turkey adores and the rest of the world has never heard of, anime, The Message (a 1976 epic about the birth of Islam, carried by Muslim audiences worldwide), and regional Indian dramas lifted by their home crowds.

Every thread from the report meets here. Drama leads, as the genre chart promised. The diaspora and the fandoms that ran through both the world map and the hidden gems crowd the summit alongside the Hollywood greats. This is not one mountain. It is the Western canon and a dozen passionate national and fan canons stacked in the same column, each one carried by the people who love it most.

What I deliberately left out, and why

A good edit is mostly about what you leave out. The first version of this analysis had around forty charts. I cut about a third because they didn’t earn their place, and I merged several others into stronger single charts. To show my thinking, not just my output, here’s the cut list:

  • Second “vote count” chart (IMDb votes): it had the exact same shape as the TMDB one. Two near-twin charts made the same point, so I kept the cleaner one.
  • Separate popularity chart: its only message (“popularity is also very uneven”) was already covered by the vote counts, and it works far better inside the popularity-vs-score corners chart.
  • “Films by production status” chart: 98% of entries are simply “Released”. That’s a one-line fact dressed up as a chart, so it’s now a single sentence.
  • Cinematographers section: it told the exact same “a few names dominate the big films” story as the composer chart, with worse data coverage. Repetitive, so it went.
  • TMDB-votes vs IMDb-votes chart: it only showed that popular films are popular everywhere. Not much insight for the space, so it became a single sentence in the agreement section.
  • Genre “violin” chart: its “spread” story overlapped both the decade chart and the genre dot chart. I kept the genre-mix heatmap instead, which says something the others can’t.
  • Separate “top directors by box office” chart: the quality-vs-money chart shows the same thing with far more detail, so the plain bar was repetitive.
  • Raw model printouts (summary() and base-R diagnostic plots): console dumps don’t belong in a portfolio piece. I replaced them with one readable model, a clean effects chart, and honest test results.
  • Top-language count chart (two-letter codes): the language codes were hard to read; the “who makes the most” story works far better through countries, which I kept.
  • Dense dot-cloud charts: the original drew hundreds of thousands of single dots, which turned into solid blobs. I rebuilt every big scatter as a density map, which shows where the films actually pile up instead of an unreadable smudge.

Conclusions

Around twenty charts and one stubborn question later, here is where I land on whether any of this reduces to numbers.

  1. Cinema exploded; it didn’t just grow. Yearly output jumped from hundreds to tens of thousands of films the moment the tools got cheap and streaming arrived. “The movies” today is a bigger, more open and more personal thing than the famous classics suggest, and that flood shapes the recent end of every trend.
  2. Scores cluster at “pretty good.” Both crowds centre on 6.5, not 5, and on well-voted films they agree at r ≈ 0.84. The signal lives in the rare highs and lows and in the disagreements, not the crowded middle, so I leaned on vote counts and gaps over the plain score.
  3. Old films only look better. The drop in scores over time is the survivors trick. We kept the good old films and forgot the rest, while every modern flop is recorded. I had to repeat this to myself on almost every time-based chart.
  4. “Most films” and “best films” are different maps. The US, Japan and India make the most. The highest scores go to small cinemas like Nepal, Bangladesh and Albania, and to India’s regional languages. The cause is a stack of small things rather than one law: these countries make fewer, more serious films, and the people grading them are mostly the home crowd and the diaspora who care.
  5. Money buys a floor, not a ceiling. Budget links weakly to score and levels off near $70M. The biggest surprise hits are cheap, the worst “flops” are expensive (and often just streaming films with no box office on record), and the model even finds a small penalty for big budgets. For any one film, the result comes down to craft and luck.
  6. The hidden gems are not what they look like. The corner of high score and low attention is full of concert films, fan favourites and films a community made for itself. A few are genuinely overlooked. Most are rated by exactly the people they were made for.
  7. Being both an artist and a blockbuster machine is rare. The top-right corner of acclaim and box office is nearly empty. Most directors trade one for the other, and the few who do both, Nolan above all, have earned their reputation.
  8. Greatness slips through the model. A reasonable model explains only about 31% of the differences in scores and leaves the rest unexplained. That gap is the finding, not a failure. Taste mostly won’t fit a formula, and I think that is a good thing.
  9. The number is not the film. Concert recordings, video-game cutscenes, streaming “flops” and review-bombing campaigns all sit in this data wearing the same clothes as ordinary films. Half the work was telling them apart. A score records how a film was received, by whom, and under what pressure, never simply how good it was.

Honest limitations

  • Missing data has a direction, not just an amount. Crew and money fields are blank for the small, foreign and old films, so every result about people or budgets describes the famous part of cinema, not all of it.
  • The survivors trick lifts every pre-1980 quality number, as discussed above.
  • No inflation adjustment on budgets or box office. Old dollars aren’t today’s dollars, which bends every comparison across eras.
  • “Score” is not “quality.” IMDb and TMDB numbers measure how a film was received, which mixes real craft with marketing, fandom, nostalgia and timing.
  • The country and language scores mix several effects at once: what gets made, who rates it, and small samples. I read them as a source of questions, not a ranking of national talent.

If I picked this up again

I’d pull in critic scores (Rotten Tomatoes, Metacritic) to set against the crowd, adjust the money for inflation, add awards data, and swap the linear model for something stronger like gradient boosting, to learn whether that ~31% ceiling comes from the method or from cinema’s own mystery. My bet is on the mystery. After a million films, I am glad the spreadsheet can’t explain everything.

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] pl_PL/pl_PL/pl_PL/C/pl_PL/pl_PL
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.10        kableExtra_1.4.0    knitr_1.50         
##  [4] systemfonts_1.3.2   hexbin_1.28.5       ggtext_0.1.2       
##  [7] ggrepel_0.9.8       scales_1.4.0        ggplot2_3.5.2      
## [10] forcats_1.0.0       lubridate_1.9.4     stringr_1.5.1      
## [13] tidyr_1.3.1         dplyr_1.1.4         data.table_1.18.2.1
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.5.2         stringi_1.8.7     
##  [5] lattice_0.22-7     digest_0.6.37      magrittr_2.0.3     evaluate_1.0.4    
##  [9] grid_4.5.1         timechange_0.3.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] Matrix_1.7-3       jsonlite_2.0.0     backports_1.5.0    mgcv_1.9-3        
## [17] purrr_1.0.4        viridisLite_0.4.2  textshaping_1.0.5  jquerylib_0.1.4   
## [21] cli_3.6.5          rlang_1.1.6        litedown_0.9       splines_4.5.1     
## [25] commonmark_2.0.0   withr_3.0.2        cachem_1.1.0       yaml_2.3.10       
## [29] tools_4.5.1        vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4   
## [33] ragg_1.5.2         pkgconfig_2.0.3    pillar_1.10.2      bslib_0.9.0       
## [37] gtable_0.3.6       glue_1.8.0         Rcpp_1.0.14        xfun_0.59         
## [41] tibble_3.3.0       tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2      
## [45] nlme_3.1-168       htmltools_0.5.8.1  labeling_0.4.3     rmarkdown_2.29    
## [49] svglite_2.2.2      compiler_4.5.1     S7_0.2.1           markdown_2.0      
## [53] gridtext_0.1.6