Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
library(arrow)
## 
## Attaching package: 'arrow'
## 
## The following object is masked from 'package:magrittr':
## 
##     is_in
## 
## The following object is masked from 'package:lubridate':
## 
##     duration
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
library(RSQLite)
library(glue)

theme_set(theme_bw())
options(digits = 3)

if (F) {
  library(btw)
  btw::btw_mcp_session()
}

Data

We use games with complete review histories only. For the original 1000 games (500 top-rated, 500 sampled), we keep only those with <=50k total reviews, since our scraper capped at 50k. The additional 1727 randomly sampled games were scraped without a cap (all had <50k reviews).

# Top-rated games from SteamDB (complete data only)
target_top <- read_csv("data/steamdb_top_rated.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "top_rated") %>%
  filter(total_reviews <= 50000)

# Original random sample (complete data only)
target_sampled_old <- read_csv("data/sampled_games.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "sampled") %>%
  filter(total_reviews <= 50000)

# New random sample (2000 games, all <50k, scraped completely)
target_sampled_new <- read_csv("data/sampled_games_2k.csv", show_col_types = FALSE) %>%
  mutate(app_id = as.integer(app_id), source = "sampled") %>%
  filter(total_reviews < 50000)

target_games <- bind_rows(
  target_top %>% select(app_id, name, total_reviews, source),
  target_sampled_old %>% select(app_id, name, total_reviews, source),
  target_sampled_new %>% select(app_id, name, total_reviews, source)
)

cat("Top-rated (complete):", sum(target_games$source == "top_rated"), "\n")
## Top-rated (complete): 357
cat("Sampled (complete):", sum(target_games$source == "sampled"), "\n")
## Sampled (complete): 2210
cat("Total:", nrow(target_games), "\n")
## Total: 2567
# Load reviews from both parquet files
votes_old <- read_parquet("data/steam_review_votes.parquet") %>%
  as_tibble() %>%
  mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))

votes_new <- read_parquet("data/steam_reviews_1k.parquet") %>%
  as_tibble() %>%
  mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))

# Merge and filter to complete games
votes <- bind_rows(votes_old, votes_new) %>%
  filter(app_id %in% target_games$app_id)

cat("Total reviews:", format(nrow(votes), big.mark = ","), "\n")
## Total reviews: 34,847,499
cat("Unique games with reviews:", n_distinct(votes$app_id), "\n")
## Unique games with reviews: 2560
# Per-game summary
game_summary <- votes %>%
  group_by(app_id) %>%
  summarize(
    n_reviews = n(),
    positive = sum(voted_up),
    negative = n_reviews - positive,
    raw_score = mean(voted_up),
    .groups = "drop"
  ) %>%
  left_join(target_games %>% select(app_id, name, source), by = "app_id")

game_summary

Ground truth

Ground truth is the full-sample mean rating for each game, using all available reviews.

ground_truth <- game_summary %>%
  select(app_id, true_rating = raw_score, n_reviews, source)

cat("Games:", nrow(ground_truth), "\n")
## Games: 2560
cat("Mean true rating:", round(mean(ground_truth$true_rating), 3), "\n")
## Mean true rating: 0.851
cat("SD true rating:", round(sd(ground_truth$true_rating), 3), "\n")
## SD true rating: 0.122

Early ratings at various sample sizes

We order each game’s reviews by timestamp and compute ratings using only the first N reviews.

votes_ordered <- votes %>%
  group_by(app_id) %>%
  arrange(timestamp_created, .by_group = TRUE) %>%
  mutate(review_order = row_number()) %>%
  ungroup()

cutoffs <- c(10, 25, 50, 100, 250, 500, 1000)

early_ratings <- map_dfr(cutoffs, function(k) {
  votes_ordered %>%
    filter(review_order <= k) %>%
    group_by(app_id) %>%
    summarize(
      positive = sum(voted_up),
      n = n(),
      raw = mean(voted_up),
      .groups = "drop"
    ) %>%
    filter(n == k) %>%
    mutate(cutoff = k)
})

early_ratings %>% count(cutoff)

Shrinkage methods

We compare five approaches to estimating a game’s true positive rate from a small sample of early reviews:

  1. Raw score: positive / total. Unbiased but noisy at small n.
  2. SteamDB formula: score - (score - 0.5) * 2^(-log10(n + 1)). Shrinks toward 50% with an ad hoc decay function.
  3. Wilson lower bound: Lower end of a 95% confidence interval for the proportion. Conservative — designed to answer “what’s the worst this game plausibly is?”
  4. Laplace smoothing: (positive + 1) / (n + 2). Bayesian posterior mean with a uniform Beta(1,1) prior. Shrinks toward 50%.
  5. Empirical Bayes: (positive + a) / (n + a + b) where the Beta(a, b) prior is estimated from the observed distribution of game ratings via method of moments. Shrinks toward the actual mean rating (~85%) with data-calibrated strength.

The key distinction: SteamDB, Wilson, and Laplace all implicitly shrink toward 50%. Empirical Bayes shrinks toward the actual population mean (~85%), which is why it outperforms.

# Estimate EB prior
mu <- mean(game_summary$raw_score)
v <- var(game_summary$raw_score)
a_prior <- mu * (mu * (1 - mu) / v - 1)
b_prior <- (1 - mu) * (mu * (1 - mu) / v - 1)
cat("Empirical Bayes prior: Beta(", round(a_prior, 2), ",", round(b_prior, 2), ")\n")
## Empirical Bayes prior: Beta( 6.38 , 1.12 )
cat("Prior mean:", round(a_prior / (a_prior + b_prior), 3), "\n")
## Prior mean: 0.851
cat("Prior weight:", round(a_prior + b_prior, 1), "pseudo-reviews\n")
## Prior weight: 7.5 pseudo-reviews
# Apply all methods
adjusted <- early_ratings %>%
  mutate(
    steamdb = raw - (raw - 0.5) * 2^(-log10(n + 1)),
    wilson = {
      z <- qnorm(0.975)
      p <- raw
      (p + z^2/(2*n) - z * sqrt((p*(1-p) + z^2/(4*n))/n)) / (1 + z^2/n)
    },
    laplace = (positive + 1) / (n + 2),
    eb = (positive + a_prior) / (n + a_prior + b_prior)
  )

Evaluation: MAE vs ground truth

eval_df <- adjusted %>%
  inner_join(ground_truth %>% select(app_id, true_rating, source), by = "app_id") %>%
  mutate(
    err_raw = abs(raw - true_rating),
    err_steamdb = abs(steamdb - true_rating),
    err_wilson = abs(wilson - true_rating),
    err_laplace = abs(laplace - true_rating),
    err_eb = abs(eb - true_rating)
  )

mae_results <- eval_df %>%
  group_by(cutoff) %>%
  summarize(
    n_games = n(),
    raw = mean(err_raw),
    steamdb = mean(err_steamdb),
    wilson = mean(err_wilson),
    laplace = mean(err_laplace),
    eb = mean(err_eb),
    .groups = "drop"
  )

mae_results
mae_long <- mae_results %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "mae") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes")))

ggplot(mae_long, aes(cutoff, mae, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    x = "Number of early reviews",
    y = "Mean absolute error vs. long-term rating",
    title = "Predicting long-term rating from early reviews",
    subtitle = sprintf("%s games with complete review histories", format(max(mae_results$n_games), big.mark = ",")),
    color = "Method"
  )

GG_save("figs/shrinkage_mae.png")
# Percentage change in MAE relative to raw score
mae_results %>%
  mutate(
    across(c(steamdb, wilson, laplace, eb), ~ round((. - raw) / raw * 100, 1), .names = "{.col}_pct")
  ) %>%
  select(cutoff, n_games, ends_with("_pct"))

Split by source

mae_by_source <- eval_df %>%
  group_by(cutoff, source) %>%
  summarize(
    n_games = n(),
    raw = mean(err_raw),
    steamdb = mean(err_steamdb),
    wilson = mean(err_wilson),
    laplace = mean(err_laplace),
    eb = mean(err_eb),
    .groups = "drop"
  )

mae_by_source %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "mae") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cutoff, mae, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~source) +
  labs(
    x = "Number of early reviews",
    y = "Mean absolute error",
    title = "Shrinkage methods by source",
    color = "Method"
  )

GG_save("figs/shrinkage_mae_by_source.png")

Signed bias

Do methods systematically over- or under-estimate the true rating?

bias_results <- eval_df %>%
  group_by(cutoff) %>%
  summarize(
    raw = mean(raw - true_rating),
    steamdb = mean(steamdb - true_rating),
    wilson = mean(wilson - true_rating),
    laplace = mean(laplace - true_rating),
    eb = mean(eb - true_rating),
    .groups = "drop"
  )

bias_results %>%
  pivot_longer(-cutoff, names_to = "method", values_to = "bias") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cutoff, bias, color = method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_x_log10(breaks = cutoffs) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    x = "Number of early reviews",
    y = "Mean signed error (positive = overestimates)",
    title = "Systematic bias by method",
    color = "Method"
  )

GG_save("figs/shrinkage_bias.png")

Example tracelines

How do the methods behave as reviews accumulate for individual games?

# Pick example games spanning the rating range
set.seed(42)
examples <- game_summary %>%
  filter(n_reviews >= 5000) %>%
  mutate(rating_bin = cut(raw_score, breaks = c(0, 0.5, 0.7, 0.85, 0.95, 1))) %>%
  group_by(rating_bin) %>%
  slice_sample(n = 1) %>%
  ungroup() %>%
  arrange(desc(raw_score))

examples %>% select(name, n_reviews, raw_score, source)
# Compute running estimates as reviews accumulate
trace_data <- votes_ordered %>%
  filter(app_id %in% examples$app_id) %>%
  group_by(app_id) %>%
  mutate(
    cum_positive = cumsum(voted_up),
    cum_n = row_number(),
    raw = cum_positive / cum_n,
    steamdb = raw - (raw - 0.5) * 2^(-log10(cum_n + 1)),
    wilson = {
      z <- qnorm(0.975)
      (raw + z^2/(2*cum_n) - z * sqrt((raw*(1-raw) + z^2/(4*cum_n))/cum_n)) / (1 + z^2/cum_n)
    },
    laplace = (cum_positive + 1) / (cum_n + 2),
    eb = (cum_positive + a_prior) / (cum_n + a_prior + b_prior)
  ) %>%
  ungroup() %>%
  left_join(examples %>% select(app_id, name, true_rating = raw_score), by = "app_id")

# Thin to ~200 log-spaced points per game
trace_thin <- trace_data %>%
  group_by(app_id) %>%
  filter(cum_n %in% unique(c(1:20, round(10^seq(log10(21), log10(max(cum_n)), length.out = 180))))) %>%
  ungroup()
trace_thin %>%
  select(app_id, name, cum_n, true_rating, raw, steamdb, wilson, laplace, eb) %>%
  pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
               names_to = "method", values_to = "estimate") %>%
  mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
                         labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
  ggplot(aes(cum_n, estimate, color = method)) +
  geom_line(alpha = 0.8) +
  geom_hline(aes(yintercept = true_rating), linetype = "dashed", alpha = 0.5) +
  scale_x_log10(breaks = c(10, 100, 1000, 10000)) +
  facet_wrap(~name, scales = "free_y", ncol = 1) +
  labs(
    x = "Number of reviews",
    y = "Estimated rating",
    title = "Rating estimates as reviews accumulate",
    subtitle = "Dashed line = true long-term rating",
    color = "Method"
  ) +
  theme(legend.position = "bottom")

GG_save("figs/shrinkage_tracelines.png")

Meta

write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] glue_1.8.0            RSQLite_2.4.3         arrow_23.0.1.1       
##  [4] kirkegaard_2025-11-19 robustbase_0.99-6     psych_2.5.6          
##  [7] assertthat_0.2.1      weights_1.1.2         magrittr_2.0.4       
## [10] lubridate_1.9.4       forcats_1.0.1         stringr_1.6.0        
## [13] dplyr_1.1.4           purrr_1.2.0           readr_2.1.5          
## [16] tidyr_1.3.1           tibble_3.3.0          ggplot2_4.0.1.9000   
## [19] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.4       DBI_1.2.3          mnormt_2.1.1       gridExtra_2.3     
##  [5] rlang_1.1.6.9000   compiler_4.5.2     gdata_3.0.1        systemfonts_1.3.1 
##  [9] vctrs_0.6.5        pkgconfig_2.0.3    shape_1.4.6.1      crayon_1.5.3      
## [13] fastmap_1.2.0      backports_1.5.0    labeling_0.4.3     rmarkdown_2.30    
## [17] tzdb_0.5.0         nloptr_2.2.1       ragg_1.5.0         bit_4.6.0         
## [21] xfun_0.53          glmnet_4.1-10      jomo_2.7-6         cachem_1.1.0      
## [25] jsonlite_2.0.0     blob_1.2.4         pan_1.9            broom_1.0.11      
## [29] parallel_4.5.2     cluster_2.1.8.2    R6_2.6.1           bslib_0.9.0       
## [33] stringi_1.8.7      RColorBrewer_1.1-3 boot_1.3-32        rpart_4.1.24      
## [37] jquerylib_0.1.4    Rcpp_1.1.0         iterators_1.0.14   knitr_1.50        
## [41] base64enc_0.1-3    Matrix_1.7-4       splines_4.5.2      nnet_7.3-20       
## [45] timechange_0.3.0   tidyselect_1.2.1   rstudioapi_0.17.1  yaml_2.3.10       
## [49] codetools_0.2-20   lattice_0.22-7     withr_3.0.2        S7_0.2.1          
## [53] evaluate_1.0.5     foreign_0.8-91     archive_1.1.12     survival_3.8-6    
## [57] pillar_1.11.1      mice_3.18.0        checkmate_2.3.3    foreach_1.5.2     
## [61] reformulas_0.4.1   generics_0.1.4     vroom_1.6.6        hms_1.1.3         
## [65] scales_1.4.0       minqa_1.2.8        gtools_3.9.5       Hmisc_5.2-4       
## [69] tools_4.5.2        data.table_1.17.8  lme4_1.1-37        grid_4.5.2        
## [73] rbibutils_2.3      colorspace_2.1-2   nlme_3.1-168       htmlTable_2.4.3   
## [77] Formula_1.2-5      cli_3.6.5          textshaping_1.0.4  gtable_0.3.6      
## [81] DEoptimR_1.1-4     sass_0.4.10        digest_0.6.39      htmlwidgets_1.6.4 
## [85] farver_2.1.2       memoise_2.0.1      htmltools_0.5.8.1  lifecycle_1.0.4   
## [89] mitml_0.4-5        bit64_4.6.0-1      MASS_7.3-65