Init
Data
Ground truth
Early ratings at various sample sizes
Shrinkage methods
Evaluation: MAE vs ground truth
Split by source
Signed bias
Example tracelines
Meta

Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

library(arrow)

## 
## Attaching package: 'arrow'
## 
## The following object is masked from 'package:magrittr':
## 
##     is_in
## 
## The following object is masked from 'package:lubridate':
## 
##     duration
## 
## The following object is masked from 'package:utils':
## 
##     timestamp

library(RSQLite)
library(glue)

theme_set(theme_bw())
options(digits = 3)

if (F) {
  library(btw)
  btw::btw_mcp_session()
}

Data

We use games with complete review histories only. For the original 1000 games (500 top-rated, 500 sampled), we keep only those with <=50k total reviews, since our scraper capped at 50k. The additional 1727 randomly sampled games were scraped without a cap (all had <50k reviews).

# Top-rated games from SteamDB (complete data only)
target_top <- read_csv("data/steamdb_top_rated.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "top_rated") %>%
  filter(total_reviews <= 50000)

# Original random sample (complete data only)
target_sampled_old <- read_csv("data/sampled_games.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "sampled") %>%
  filter(total_reviews <= 50000)

# New random sample (2000 games, all <50k, scraped completely)
target_sampled_new <- read_csv("data/sampled_games_2k.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "sampled") %>%
  filter(total_reviews < 50000)

target_games <- bind_rows(
  target_top %>% select(app_id, name, total_reviews, source),
  target_sampled_old %>% select(app_id, name, total_reviews, source),
  target_sampled_new %>% select(app_id, name, total_reviews, source)
)

cat("Top-rated (complete):", sum(target_games$source == "top_rated"), "\n")

## Top-rated (complete): 357

cat("Sampled (complete):", sum(target_games$source == "sampled"), "\n")

## Sampled (complete): 2210

cat("Total:", nrow(target_games), "\n")

## Total: 2567

# Load reviews from both parquet files
votes_old <- read_parquet("data/steam_review_votes.parquet") %>%
  as_tibble() %>%
  mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))

votes_new <- read_parquet("data/steam_reviews_1k.parquet") %>%
  as_tibble() %>%
  mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))

# Merge and filter to complete games
votes <- bind_rows(votes_old, votes_new) %>%
  filter(app_id %in% target_games$app_id)

cat("Total reviews:", format(nrow(votes), big.mark = ","), "\n")

## Total reviews: 34,847,499

cat("Unique games with reviews:", n_distinct(votes$app_id), "\n")

## Unique games with reviews: 2560

# Per-game summary
game_summary <- votes %>%
  group_by(app_id) %>%
  summarize(
    n_reviews = n(),
    positive = sum(voted_up),
    negative = n_reviews - positive,
    raw_score = mean(voted_up),
    .groups = "drop"
  ) %>%
  left_join(target_games %>% select(app_id, name, source), by = "app_id")

game_summary

Ground truth

Ground truth is the full-sample mean rating for each game, using all available reviews.

ground_truth <- game_summary %>%
  select(app_id, true_rating = raw_score, n_reviews, source)

cat("Games:", nrow(ground_truth), "\n")

## Games: 2560

cat("Mean true rating:", round(mean(ground_truth$true_rating), 3), "\n")

## Mean true rating: 0.851

cat("SD true rating:", round(sd(ground_truth$true_rating), 3), "\n")

## SD true rating: 0.122

Early ratings at various sample sizes

We order each game’s reviews by timestamp and compute ratings using only the first N reviews.

votes_ordered <- votes %>%
  group_by(app_id) %>%
  arrange(timestamp_created, .by_group = TRUE) %>%
  mutate(review_order = row_number()) %>%
  ungroup()

cutoffs <- c(10, 25, 50, 100, 250, 500, 1000)

early_ratings <- map_dfr(cutoffs, function(k) {
  votes_ordered %>%
    filter(review_order <= k) %>%
    group_by(app_id) %>%
    summarize(
      positive = sum(voted_up),
      n = n(),
      raw = mean(voted_up),
      .groups = "drop"
    ) %>%
    filter(n == k) %>%
    mutate(cutoff = k)
})

early_ratings %>% count(cutoff)

Shrinkage methods

We compare five approaches to estimating a game’s true positive rate from a small sample of early reviews:

Raw score: positive / total. Unbiased but noisy at small n.
SteamDB formula: score - (score - 0.5) * 2^(-log10(n + 1)). Shrinks toward 50% with an ad hoc decay function.
Wilson lower bound: Lower end of a 95% confidence interval for the proportion. Conservative — designed to answer “what’s the worst this game plausibly is?”
Laplace smoothing: (positive + 1) / (n + 2). Bayesian posterior mean with a uniform Beta(1,1) prior. Shrinks toward 50%.
Empirical Bayes: (positive + a) / (n + a + b) where the Beta(a, b) prior is estimated from the observed distribution of game ratings via method of moments. Shrinks toward the actual mean rating (~85%) with data-calibrated strength.

The key distinction: SteamDB, Wilson, and Laplace all implicitly shrink toward 50%. Empirical Bayes shrinks toward the actual population mean (~85%), which is why it outperforms.

# Estimate EB prior
mu <- mean(game_summary$raw_score)
v <- var(game_summary$raw_score)
a_prior <- mu * (mu * (1 - mu) / v - 1)
b_prior <- (1 - mu) * (mu * (1 - mu) / v - 1)
cat("Empirical Bayes prior: Beta(", round(a_prior, 2), ",", round(b_prior, 2), ")\n")

## Empirical Bayes prior: Beta( 6.38 , 1.12 )

cat("Prior mean:", round(a_prior / (a_prior + b_prior), 3), "\n")

## Prior mean: 0.851

cat("Prior weight:", round(a_prior + b_prior, 1), "pseudo-reviews\n")

## Prior weight: 7.5 pseudo-reviews

# Apply all methods
adjusted <- early_ratings %>%
  mutate(
    steamdb = raw - (raw - 0.5) * 2^(-log10(n + 1)),
    wilson = {
      z <- qnorm(0.975)
      p <- raw
      (p + z^2/(2*n) - z * sqrt((p*(1-p) + z^2/(4*n))/n)) / (1 + z^2/n)
    },
    laplace = (positive + 1) / (n + 2),
    eb = (positive + a_prior) / (n + a_prior + b_prior)
  )

Evaluation: MAE vs ground truth

eval_df <- adjusted %>%
  inner_join(ground_truth %>% select(app_id, true_rating, source), by = "app_id") %>%
  mutate(
    err_raw = abs(raw - true_rating),
    err_steamdb = abs(steamdb - true_rating),
    err_wilson = abs(wilson - true_rating),
    err_laplace = abs(laplace - true_rating),
    err_eb = abs(eb - true_rating)
  )

mae_results <- eval_df %>%
  group_by(cutoff) %>%
  summarize(
    n_games = n(),
    raw = mean(err_raw),
    steamdb = mean(err_steamdb),
    wilson = mean(err_wilson),
    laplace = mean(err_laplace),
    eb = mean(err_eb),
    .groups = "drop"
  )

mae_results

mae_long <- mae_results %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "mae") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes")))

ggplot(mae_long, aes(cutoff, mae, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    x = "Number of early reviews",
    y = "Mean absolute error vs. long-term rating",
    title = "Predicting long-term rating from early reviews",
    subtitle = sprintf("%s games with complete review histories", format(max(mae_results$n_games), big.mark = ",")),
    color = "Method"
  )

GG_save("figs/shrinkage_mae.png")

# Percentage change in MAE relative to raw score
mae_results %>%
  mutate(
    across(c(steamdb, wilson, laplace, eb), ~ round((. - raw) / raw * 100, 1), .names = "{.col}_pct")
  ) %>%
  select(cutoff, n_games, ends_with("_pct"))

Split by source

mae_by_source <- eval_df %>%
  group_by(cutoff, source) %>%
  summarize(
    n_games = n(),
    raw = mean(err_raw),
    steamdb = mean(err_steamdb),
    wilson = mean(err_wilson),
    laplace = mean(err_laplace),
    eb = mean(err_eb),
    .groups = "drop"
  )

mae_by_source %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "mae") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cutoff, mae, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~source) +
  labs(
    x = "Number of early reviews",
    y = "Mean absolute error",
    title = "Shrinkage methods by source",
    color = "Method"
  )

GG_save("figs/shrinkage_mae_by_source.png")

Signed bias

Do methods systematically over- or under-estimate the true rating?

bias_results <- eval_df %>%
  group_by(cutoff) %>%
  summarize(
    raw = mean(raw - true_rating),
    steamdb = mean(steamdb - true_rating),
    wilson = mean(wilson - true_rating),
    laplace = mean(laplace - true_rating),
    eb = mean(eb - true_rating),
    .groups = "drop"
  )

bias_results %>%
  pivot_longer(-cutoff, names_to = "method", values_to = "bias") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cutoff, bias, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    x = "Number of early reviews",
    y = "Mean signed error (positive = overestimates)",
    title = "Systematic bias by method",
    color = "Method"
  )

GG_save("figs/shrinkage_bias.png")

Example tracelines

How do the methods behave as reviews accumulate for individual games?

# Pick example games spanning the rating range
set.seed(42)
examples <- game_summary %>%
  filter(n_reviews >= 5000) %>%
  mutate(rating_bin = cut(raw_score, breaks = c(0, 0.5, 0.7, 0.85, 0.95, 1))) %>%
  group_by(rating_bin) %>%
  slice_sample(n = 1) %>%
  ungroup() %>%
  arrange(desc(raw_score))

examples %>% select(name, n_reviews, raw_score, source)

# Compute running estimates as reviews accumulate
trace_data <- votes_ordered %>%
  filter(app_id %in% examples$app_id) %>%
  group_by(app_id) %>%
  mutate(
    cum_positive = cumsum(voted_up),
    cum_n = row_number(),
    raw = cum_positive / cum_n,
    steamdb = raw - (raw - 0.5) * 2^(-log10(cum_n + 1)),
    wilson = {
      z <- qnorm(0.975)
      (raw + z^2/(2*cum_n) - z * sqrt((raw*(1-raw) + z^2/(4*cum_n))/cum_n)) / (1 + z^2/cum_n)
    },
    laplace = (cum_positive + 1) / (cum_n + 2),
    eb = (cum_positive + a_prior) / (cum_n + a_prior + b_prior)
  ) %>%
  ungroup() %>%
  left_join(examples %>% select(app_id, name, true_rating = raw_score), by = "app_id")

# Thin to ~200 log-spaced points per game
trace_thin <- trace_data %>%
  group_by(app_id) %>%
  filter(cum_n %in% unique(c(1:20, round(10^seq(log10(21), log10(max(cum_n)), length.out = 180))))) %>%
  ungroup()

trace_thin %>%
  select(app_id, name, cum_n, true_rating, raw, steamdb, wilson, laplace, eb) %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "estimate") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cum_n, estimate, color = method)) +
  geom_line(alpha = 0.8) +
  geom_hline(aes(yintercept = true_rating), linetype = "dashed", alpha = 0.5) +
  scale_x_log10(breaks = c(10, 100, 1000, 10000)) +
  facet_wrap(~name, scales = "free_y", ncol = 1) +
  labs(
    x = "Number of reviews",
    y = "Estimated rating",
    title = "Rating estimates as reviews accumulate",
    subtitle = "Dashed line = true long-term rating",
    color = "Method"
  ) +
  theme(legend.position = "bottom")

GG_save("figs/shrinkage_tracelines.png")

Steam rating shrinkage methods compared

Emil Kirkegaard & Claude (Anthropic)

17 March, 2026