library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
library(arrow)
##
## Attaching package: 'arrow'
##
## The following object is masked from 'package:magrittr':
##
## is_in
##
## The following object is masked from 'package:lubridate':
##
## duration
##
## The following object is masked from 'package:utils':
##
## timestamp
library(RSQLite)
library(glue)
theme_set(theme_bw())
options(digits = 3)
if (F) {
library(btw)
btw::btw_mcp_session()
}
We use games with complete review histories only. For the original 1000 games (500 top-rated, 500 sampled), we keep only those with <=50k total reviews, since our scraper capped at 50k. The additional 1727 randomly sampled games were scraped without a cap (all had <50k reviews).
# Top-rated games from SteamDB (complete data only)
target_top <- read_csv("data/steamdb_top_rated.csv", show_col_types = FALSE) %>%
mutate(app_id = as.integer(app_id), source = "top_rated") %>%
filter(total_reviews <= 50000)
# Original random sample (complete data only)
target_sampled_old <- read_csv("data/sampled_games.csv", show_col_types = FALSE) %>%
mutate(app_id = as.integer(app_id), source = "sampled") %>%
filter(total_reviews <= 50000)
# New random sample (2000 games, all <50k, scraped completely)
target_sampled_new <- read_csv("data/sampled_games_2k.csv", show_col_types = FALSE) %>%
mutate(app_id = as.integer(app_id), source = "sampled") %>%
filter(total_reviews < 50000)
target_games <- bind_rows(
target_top %>% select(app_id, name, total_reviews, source),
target_sampled_old %>% select(app_id, name, total_reviews, source),
target_sampled_new %>% select(app_id, name, total_reviews, source)
)
cat("Top-rated (complete):", sum(target_games$source == "top_rated"), "\n")
## Top-rated (complete): 357
cat("Sampled (complete):", sum(target_games$source == "sampled"), "\n")
## Sampled (complete): 2210
cat("Total:", nrow(target_games), "\n")
## Total: 2567
# Load reviews from both parquet files
votes_old <- read_parquet("data/steam_review_votes.parquet") %>%
as_tibble() %>%
mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))
votes_new <- read_parquet("data/steam_reviews_1k.parquet") %>%
as_tibble() %>%
mutate(app_id = as.integer(app_id), voted_up = as.logical(voted_up))
# Merge and filter to complete games
votes <- bind_rows(votes_old, votes_new) %>%
filter(app_id %in% target_games$app_id)
cat("Total reviews:", format(nrow(votes), big.mark = ","), "\n")
## Total reviews: 34,847,499
cat("Unique games with reviews:", n_distinct(votes$app_id), "\n")
## Unique games with reviews: 2560
# Per-game summary
game_summary <- votes %>%
group_by(app_id) %>%
summarize(
n_reviews = n(),
positive = sum(voted_up),
negative = n_reviews - positive,
raw_score = mean(voted_up),
.groups = "drop"
) %>%
left_join(target_games %>% select(app_id, name, source), by = "app_id")
game_summary
Ground truth is the full-sample mean rating for each game, using all available reviews.
ground_truth <- game_summary %>%
select(app_id, true_rating = raw_score, n_reviews, source)
cat("Games:", nrow(ground_truth), "\n")
## Games: 2560
cat("Mean true rating:", round(mean(ground_truth$true_rating), 3), "\n")
## Mean true rating: 0.851
cat("SD true rating:", round(sd(ground_truth$true_rating), 3), "\n")
## SD true rating: 0.122
We order each game’s reviews by timestamp and compute ratings using only the first N reviews.
votes_ordered <- votes %>%
group_by(app_id) %>%
arrange(timestamp_created, .by_group = TRUE) %>%
mutate(review_order = row_number()) %>%
ungroup()
cutoffs <- c(10, 25, 50, 100, 250, 500, 1000)
early_ratings <- map_dfr(cutoffs, function(k) {
votes_ordered %>%
filter(review_order <= k) %>%
group_by(app_id) %>%
summarize(
positive = sum(voted_up),
n = n(),
raw = mean(voted_up),
.groups = "drop"
) %>%
filter(n == k) %>%
mutate(cutoff = k)
})
early_ratings %>% count(cutoff)
We compare five approaches to estimating a game’s true positive rate from a small sample of early reviews:
score - (score - 0.5) * 2^(-log10(n + 1)). Shrinks toward 50% with an ad hoc decay function.(positive + 1) / (n + 2). Bayesian posterior mean with a uniform Beta(1,1) prior. Shrinks toward 50%.(positive + a) / (n + a + b) where the Beta(a, b) prior is estimated from the observed distribution of game ratings via method of moments. Shrinks toward the actual mean rating (~85%) with data-calibrated strength.The key distinction: SteamDB, Wilson, and Laplace all implicitly shrink toward 50%. Empirical Bayes shrinks toward the actual population mean (~85%), which is why it outperforms.
# Estimate EB prior
mu <- mean(game_summary$raw_score)
v <- var(game_summary$raw_score)
a_prior <- mu * (mu * (1 - mu) / v - 1)
b_prior <- (1 - mu) * (mu * (1 - mu) / v - 1)
cat("Empirical Bayes prior: Beta(", round(a_prior, 2), ",", round(b_prior, 2), ")\n")
## Empirical Bayes prior: Beta( 6.38 , 1.12 )
cat("Prior mean:", round(a_prior / (a_prior + b_prior), 3), "\n")
## Prior mean: 0.851
cat("Prior weight:", round(a_prior + b_prior, 1), "pseudo-reviews\n")
## Prior weight: 7.5 pseudo-reviews
# Apply all methods
adjusted <- early_ratings %>%
mutate(
steamdb = raw - (raw - 0.5) * 2^(-log10(n + 1)),
wilson = {
z <- qnorm(0.975)
p <- raw
(p + z^2/(2*n) - z * sqrt((p*(1-p) + z^2/(4*n))/n)) / (1 + z^2/n)
},
laplace = (positive + 1) / (n + 2),
eb = (positive + a_prior) / (n + a_prior + b_prior)
)
eval_df <- adjusted %>%
inner_join(ground_truth %>% select(app_id, true_rating, source), by = "app_id") %>%
mutate(
err_raw = abs(raw - true_rating),
err_steamdb = abs(steamdb - true_rating),
err_wilson = abs(wilson - true_rating),
err_laplace = abs(laplace - true_rating),
err_eb = abs(eb - true_rating)
)
mae_results <- eval_df %>%
group_by(cutoff) %>%
summarize(
n_games = n(),
raw = mean(err_raw),
steamdb = mean(err_steamdb),
wilson = mean(err_wilson),
laplace = mean(err_laplace),
eb = mean(err_eb),
.groups = "drop"
)
mae_results
mae_long <- mae_results %>%
pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
names_to = "method", values_to = "mae") %>%
mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes")))
ggplot(mae_long, aes(cutoff, mae, color = method)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_x_log10(breaks = cutoffs) +
scale_y_continuous(labels = scales::percent_format()) +
labs(
x = "Number of early reviews",
y = "Mean absolute error vs. long-term rating",
title = "Predicting long-term rating from early reviews",
subtitle = sprintf("%s games with complete review histories", format(max(mae_results$n_games), big.mark = ",")),
color = "Method"
)
GG_save("figs/shrinkage_mae.png")
# Percentage change in MAE relative to raw score
mae_results %>%
mutate(
across(c(steamdb, wilson, laplace, eb), ~ round((. - raw) / raw * 100, 1), .names = "{.col}_pct")
) %>%
select(cutoff, n_games, ends_with("_pct"))
mae_by_source <- eval_df %>%
group_by(cutoff, source) %>%
summarize(
n_games = n(),
raw = mean(err_raw),
steamdb = mean(err_steamdb),
wilson = mean(err_wilson),
laplace = mean(err_laplace),
eb = mean(err_eb),
.groups = "drop"
)
mae_by_source %>%
pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
names_to = "method", values_to = "mae") %>%
mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
ggplot(aes(cutoff, mae, color = method)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_x_log10(breaks = cutoffs) +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~source) +
labs(
x = "Number of early reviews",
y = "Mean absolute error",
title = "Shrinkage methods by source",
color = "Method"
)
GG_save("figs/shrinkage_mae_by_source.png")
Do methods systematically over- or under-estimate the true rating?
bias_results <- eval_df %>%
group_by(cutoff) %>%
summarize(
raw = mean(raw - true_rating),
steamdb = mean(steamdb - true_rating),
wilson = mean(wilson - true_rating),
laplace = mean(laplace - true_rating),
eb = mean(eb - true_rating),
.groups = "drop"
)
bias_results %>%
pivot_longer(-cutoff, names_to = "method", values_to = "bias") %>%
mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
ggplot(aes(cutoff, bias, color = method)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
scale_x_log10(breaks = cutoffs) +
scale_y_continuous(labels = scales::percent_format()) +
labs(
x = "Number of early reviews",
y = "Mean signed error (positive = overestimates)",
title = "Systematic bias by method",
color = "Method"
)
GG_save("figs/shrinkage_bias.png")
How do the methods behave as reviews accumulate for individual games?
# Pick example games spanning the rating range
set.seed(42)
examples <- game_summary %>%
filter(n_reviews >= 5000) %>%
mutate(rating_bin = cut(raw_score, breaks = c(0, 0.5, 0.7, 0.85, 0.95, 1))) %>%
group_by(rating_bin) %>%
slice_sample(n = 1) %>%
ungroup() %>%
arrange(desc(raw_score))
examples %>% select(name, n_reviews, raw_score, source)
# Compute running estimates as reviews accumulate
trace_data <- votes_ordered %>%
filter(app_id %in% examples$app_id) %>%
group_by(app_id) %>%
mutate(
cum_positive = cumsum(voted_up),
cum_n = row_number(),
raw = cum_positive / cum_n,
steamdb = raw - (raw - 0.5) * 2^(-log10(cum_n + 1)),
wilson = {
z <- qnorm(0.975)
(raw + z^2/(2*cum_n) - z * sqrt((raw*(1-raw) + z^2/(4*cum_n))/cum_n)) / (1 + z^2/cum_n)
},
laplace = (cum_positive + 1) / (cum_n + 2),
eb = (cum_positive + a_prior) / (cum_n + a_prior + b_prior)
) %>%
ungroup() %>%
left_join(examples %>% select(app_id, name, true_rating = raw_score), by = "app_id")
# Thin to ~200 log-spaced points per game
trace_thin <- trace_data %>%
group_by(app_id) %>%
filter(cum_n %in% unique(c(1:20, round(10^seq(log10(21), log10(max(cum_n)), length.out = 180))))) %>%
ungroup()
trace_thin %>%
select(app_id, name, cum_n, true_rating, raw, steamdb, wilson, laplace, eb) %>%
pivot_longer(cols = c(raw, steamdb, wilson, laplace, eb),
names_to = "method", values_to = "estimate") %>%
mutate(method = factor(method, levels = c("raw", "steamdb", "wilson", "laplace", "eb"),
labels = c("Raw", "SteamDB", "Wilson LB", "Laplace", "Empirical Bayes"))) %>%
ggplot(aes(cum_n, estimate, color = method)) +
geom_line(alpha = 0.8) +
geom_hline(aes(yintercept = true_rating), linetype = "dashed", alpha = 0.5) +
scale_x_log10(breaks = c(10, 100, 1000, 10000)) +
facet_wrap(~name, scales = "free_y", ncol = 1) +
labs(
x = "Number of reviews",
y = "Estimated rating",
title = "Rating estimates as reviews accumulate",
subtitle = "Dashed line = true long-term rating",
color = "Method"
) +
theme(legend.position = "bottom")
GG_save("figs/shrinkage_tracelines.png")
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] glue_1.8.0 RSQLite_2.4.3 arrow_23.0.1.1
## [4] kirkegaard_2025-11-19 robustbase_0.99-6 psych_2.5.6
## [7] assertthat_0.2.1 weights_1.1.2 magrittr_2.0.4
## [10] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [13] dplyr_1.1.4 purrr_1.2.0 readr_2.1.5
## [16] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.1.9000
## [19] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] Rdpack_2.6.4 DBI_1.2.3 mnormt_2.1.1 gridExtra_2.3
## [5] rlang_1.1.6.9000 compiler_4.5.2 gdata_3.0.1 systemfonts_1.3.1
## [9] vctrs_0.6.5 pkgconfig_2.0.3 shape_1.4.6.1 crayon_1.5.3
## [13] fastmap_1.2.0 backports_1.5.0 labeling_0.4.3 rmarkdown_2.30
## [17] tzdb_0.5.0 nloptr_2.2.1 ragg_1.5.0 bit_4.6.0
## [21] xfun_0.53 glmnet_4.1-10 jomo_2.7-6 cachem_1.1.0
## [25] jsonlite_2.0.0 blob_1.2.4 pan_1.9 broom_1.0.11
## [29] parallel_4.5.2 cluster_2.1.8.2 R6_2.6.1 bslib_0.9.0
## [33] stringi_1.8.7 RColorBrewer_1.1-3 boot_1.3-32 rpart_4.1.24
## [37] jquerylib_0.1.4 Rcpp_1.1.0 iterators_1.0.14 knitr_1.50
## [41] base64enc_0.1-3 Matrix_1.7-4 splines_4.5.2 nnet_7.3-20
## [45] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.17.1 yaml_2.3.10
## [49] codetools_0.2-20 lattice_0.22-7 withr_3.0.2 S7_0.2.1
## [53] evaluate_1.0.5 foreign_0.8-91 archive_1.1.12 survival_3.8-6
## [57] pillar_1.11.1 mice_3.18.0 checkmate_2.3.3 foreach_1.5.2
## [61] reformulas_0.4.1 generics_0.1.4 vroom_1.6.6 hms_1.1.3
## [65] scales_1.4.0 minqa_1.2.8 gtools_3.9.5 Hmisc_5.2-4
## [69] tools_4.5.2 data.table_1.17.8 lme4_1.1-37 grid_4.5.2
## [73] rbibutils_2.3 colorspace_2.1-2 nlme_3.1-168 htmlTable_2.4.3
## [77] Formula_1.2-5 cli_3.6.5 textshaping_1.0.4 gtable_0.3.6
## [81] DEoptimR_1.1-4 sass_0.4.10 digest_0.6.39 htmlwidgets_1.6.4
## [85] farver_2.1.2 memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
## [89] mitml_0.4-5 bit64_4.6.0-1 MASS_7.3-65