library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: Hmisc
##
##
## Attaching package: 'Hmisc'
##
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
ebbr,
#devtools::install_github("dgrtwo/ebbr")
tidymodels
)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.8 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.1 ✔ yardstick 1.3.2
## ✔ recipes 1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ scales::alpha() masks psych::alpha(), ggplot2::alpha()
## ✖ recipes::averages() masks kirkegaard::averages()
## ✖ scales::discard() masks purrr::discard()
## ✖ recipes::discretize() masks kirkegaard::discretize()
## ✖ magrittr::extract() masks tidyr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ assertthat::has_name() masks tibble::has_name()
## ✖ kirkegaard::is_logical() masks purrr::is_logical()
## ✖ dplyr::lag() masks stats::lag()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec() masks readr::spec()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ recipes::step() masks stats::step()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ✖ parsnip::translate() masks Hmisc::translate()
# Presentation defaults: black-and-white ggplot theme, 3 significant digits when printing
theme_set(theme_bw())
options(digits = 3)

# Parallel backend: reset to sequential first, then switch to 3 background R sessions
library(future)
library(furrr)
plan(sequential)
plan(multisession, workers = 3)
# Steam data files.
# Games: only three columns are read, and the literal string \N is treated as missing.
games = read_csv(
  "data/games.zip",
  na = "\\N",
  col_select = c(app_id, name, release_date)
) %>%
  mutate(app_id = as.integer(app_id))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 140082 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name
## dbl (1): app_id
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reviews: column types are guessed by read_csv(); app_id is then forced to integer,
# which turns any non-numeric id into NA (with a coercion warning).
reviews = read_csv("data/reviews.zip") %>%
  mutate(app_id = as.integer(app_id))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 140116 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): app_id, review_score_description, metacritic_score, reviews, recomm...
## dbl (7): review_score, positive, negative, total, steamspy_user_score, steam...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `app_id = as.integer(app_id)`.
## Caused by warning:
## ! NAs introduced by coercion
# Genres in long format: one row per app_id/genre pair; app_id stored as integer
genres = read_csv("data/genres.zip") %>%
  mutate(app_id = as.integer(app_id))
## Rows: 353339 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): genre
## dbl (1): app_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tags in long format: one row per app_id/tag pair; app_id stored as integer
tags = read_csv("data/tags.zip") %>%
  mutate(app_id = as.integer(app_id))
## Rows: 1744632 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): tag
## dbl (1): app_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Analysis sample: games with at least 10 reviews; miss_filter() is then applied
# to remove rows with missing data
reviews_sub = reviews %>%
  filter(total >= 10) %>%
  select(app_id, positive, total) %>%
  miss_filter()

# Empirical Bayes: fit a prior to the positive/total review counts
eb_fit = reviews_sub %>%
  ebb_fit_prior(positive, total)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the ebbr package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Store the augment() output next to the app ids.
# bind_cols() pairs rows by position, so this relies on augment() returning
# one row per row of reviews_sub, in the same order.
reviews_sub = reviews_sub %>%
  select(app_id) %>%
  bind_cols(augment(eb_fit))
# Full sample, without a minimum review count.
# The raw proportion is computed before miss_filter() is applied, so it is part of
# the columns that miss_filter() checks.
reviews_all = reviews %>%
  select(app_id, positive, total) %>%
  mutate(mean = positive / total) %>%
  miss_filter()

# Empirical Bayes prior fitted on the full sample
eb_fit_all = reviews_all %>%
  ebb_fit_prior(positive, total)

# Store the augment() output next to the app ids (rows paired by position)
reviews_all = reviews_all %>%
  select(app_id) %>%
  bind_cols(augment(eb_fit_all))
# Raw share of positive reviews against the number of reviews (log10 x axis),
# with a GAM trend line
ggplot(reviews_all, aes(x = total, y = .raw)) +
  geom_point(alpha = 0.1) +
  geom_smooth(
    method = "gam",
    formula = y ~ s(x, k = 20), # k raised to allow a more flexible curve
    method.args = list(family = gaussian)
  ) +
  scale_x_log10() +
  scale_y_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(
    title = "Steam game ratings",
    x = "Number of reviews",
    y = "Positive rating %"
  )
GG_save("figs/mean_rating~ratings.png")
# Raw proportion (.raw) against the empirical Bayes estimate (.fitted);
# the dashed red line is the identity line y = x
reviews_all %>%
  GG_scatter(".raw", ".fitted", alpha = 0.1) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  scale_x_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(
    title = "Steam game ratings",
    x = "Raw rating",
    y = "Adjusted rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/raw mean vs. bayes mean.png")
## `geom_smooth()` using formula = 'y ~ x'
# Review counts and adjusted ratings for games whose raw rating is exactly 100%
reviews_all %>%
  filter(.raw == 1) %>%
  select(total, .fitted) %>%
  describe2()

# The same summary for games with a single review
reviews_all %>%
  filter(total == 1) %>%
  select(total, .fitted) %>%
  describe2()
# Spread genres from long format (one row per app_id/genre pair) to wide format
# with one 0/1 indicator column per genre.
# - distinct() guards against repeated app_id/genre pairs, which would otherwise make
#   pivot_wider() return list-columns.
# - A missing genre contributes 0 rather than 1.
# - values_fill writes the 0s for absent combinations directly.
genres2 = genres %>%
  distinct(app_id, genre) %>%
  mutate(has_genre = as.numeric(!is.na(genre))) %>%
  pivot_wider(
    names_from = genre,
    values_from = has_genre,
    values_fill = 0
  )

# Number of games per genre, sorted ascending
genre_counts = genres2 %>%
  select(-app_id) %>%
  colSums() %>%
  sort()

# Keep genres assigned to more than 100 games.
# df_add_affix() prefixes every column, including app_id, so that one is renamed back.
genres_final = genres2 %>%
  select(app_id, all_of(names(genre_counts[genre_counts > 100]))) %>%
  df_add_affix(prefix = "genre_") %>%
  rename(app_id = genre_app_id)
# Tags: `tags_final` (joined below) and `tag_counts_legal` (used by the tag plots and
# models further down) are not defined anywhere earlier in this script, so they are
# built here. The steps mirror the genre dummy coding.
# NOTE(review): the > 100 cutoff is copied from the genre step — confirm it is the
# intended cutoff for tags.
tags2 = tags %>%
  distinct(app_id, tag) %>%
  mutate(has_tag = as.integer(!is.na(tag))) %>%
  pivot_wider(
    names_from = tag,
    values_from = has_tag,
    values_fill = 0L
  )

# Number of games per tag, sorted ascending
tag_counts = tags2 %>%
  select(-app_id) %>%
  colSums() %>%
  sort()

# df_add_affix() prefixes every column, including app_id, so that one is renamed back
tags_final = tags2 %>%
  select(app_id, all_of(names(tag_counts[tag_counts > 100]))) %>%
  df_add_affix(prefix = "tag_") %>%
  rename(app_id = tag_app_id)

# Join games, reviews, genres and tags.
# Inner joins only: a game must be present in all four tables.
d = games %>%
  inner_join(reviews_sub, by = "app_id") %>%
  inner_join(genres_final, by = "app_id") %>%
  inner_join(tags_final, by = "app_id") %>%
  df_legalize_names()

# Tag frequencies within the joined data, sorted ascending and named by the legalized
# column names of d, so the names can be used directly in select() and model formulas.
# NOTE(review): counting within d (rather than within all games) is an assumption —
# confirm against the original definition of tag_counts_legal.
tag_counts_legal = d %>%
  select(starts_with("tag_")) %>%
  colSums() %>%
  sort()
# Adjusted rating against release date; point size shows the number of reviews
ggplot(d, aes(x = release_date, y = fitted)) +
  geom_point(aes(size = total), alpha = 0.1) +
  geom_smooth() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y-%m-%d") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Steam game ratings by release date",
    x = "Release date",
    y = "Positive rating %"
  ) +
  # rotate the date labels so yearly breaks stay readable
  theme(axis.text.x = element_text(angle = 90))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 695 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 695 rows containing missing values or values outside the scale range
## (`geom_point()`).
GG_save("figs/rating~release_date.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 695 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 695 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Adjusted rating against the number of reviews (log10 x axis)
ggplot(d, aes(x = total, y = fitted)) +
  geom_point(alpha = 0.1) +
  geom_smooth() +
  scale_x_log10() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Steam game ratings by number of reviews",
    x = "Number of reviews",
    y = "Positive rating %"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
GG_save("figs/rating~reviews.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Mean adjusted rating per tag, for the last 50 entries of tag_counts_legal
# (assumes tag_counts_legal is sorted ascending by count, so these are the 50 most
# common tags — TODO confirm).
d %>%
  # fitted is selected together with the tag columns, so no self-join on app_id is
  # needed afterwards (a self-join would multiply rows if app_id were ever duplicated)
  select(app_id, fitted, all_of(tail(names(tag_counts_legal), 50))) %>%
  pivot_longer(cols = starts_with("tag_")) %>%
  # keep only the game/tag pairs where the game has the tag
  filter(value == 1) %>%
  mutate(
    # strip the column prefix only at the start of the name
    name = str_remove(name, "^tag_"),
    # order tags by their mean adjusted rating
    name = fct_reorder(name, fitted, .fun = mean)
  ) %>%
  GG_group_means(
    "fitted",
    "name",
    type = "point",
    split_group_labels = FALSE
  ) +
  coord_flip() +
  theme(
    axis.text.x = element_text(hjust = 1, size = 8) # smaller axis text
  ) +
  labs(
    x = "Tag",
    y = "Positive rating %",
    title = "Steam game ratings by tag"
  )
# Multiplayer vs. singleplayer: classify each game by its two player tags, regress the
# adjusted rating on that class plus release date, and plot the predicted rating per class.
d %>%
  mutate(
    # case_when() takes the first matching branch, so "both" must come first.
    # The tag columns are compared against 1; anything unmatched falls through to "none".
    player_tags = case_when(
      tag_Multiplayer == 1 & tag_Singleplayer == 1 ~ "multiplayer and singleplayer",
      tag_Multiplayer == 1 ~ "multiplayer",
      tag_Singleplayer == 1 ~ "singleplayer",
      .default = "none"
    ) %>%
      # "none" is listed first and is therefore the reference level in the model
      factor(levels = c("none", "singleplayer", "multiplayer", "multiplayer and singleplayer"))
  ) %>%
  {
    # braces let the piped data frame be passed as `data` instead of as the first
    # argument; the stray trailing comma (an empty argument) has been removed
    lm(
      fitted ~ player_tags + as.numeric(release_date),
      data = .
    )
  } %>%
  ggeffects::ggpredict(terms = "player_tags") %>%
  plot() +
  labs(
    x = "Player tags",
    y = "Positive rating %",
    title = "Steam game ratings by player tags",
    subtitle = "Adjusted for release date, and using empirical bayes rating"
  )
GG_save("figs/multiplayer_singleplayer.png")
# Model formula as a string: adjusted rating on release date plus every tag whose
# entry in tag_counts_legal is at least 100
form_1 = str_glue(
  "fitted ~ release_date + {tag_terms}",
  tag_terms = str_c(
    names(tag_counts_legal[tag_counts_legal >= 100]),
    collapse = " + "
  )
)

# Fit the linear model and print its summary
fit_1 = lm(form_1, data = d)
summary(fit_1)
##
## Call:
## lm(formula = form_1, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7573 -0.0754 0.0227 0.0962 0.3846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.36e-01 1.16e-02 28.89 < 2e-16 ***
## release_date 2.09e-05 6.55e-07 31.88 < 2e-16 ***
## tag_Mod 5.38e-02 1.62e-02 3.33 0.00087 ***
## tag_Tile_Matching 8.75e-02 1.82e-02 4.79 1.6e-06 ***
## tag_Baseball 3.65e-02 1.85e-02 1.97 0.04885 *
## tag_Intentionally_Awkward_Controls 2.23e-02 1.71e-02 1.31 0.19067
## tag_Crowdfunded 1.75e-02 1.48e-02 1.18 0.23604
## tag_Cycling 3.02e-02 1.83e-02 1.65 0.09916 .
## tag_Dungeons_and_Dragons 3.39e-02 1.57e-02 2.16 0.03042 *
## tag_Vikings 1.59e-02 1.57e-02 1.02 0.30877
## tag_Gaming -4.44e-02 1.82e-02 -2.44 0.01472 *
## tag_Pool -2.69e-02 1.71e-02 -1.57 0.11544
## tag_Skating 1.20e-02 1.94e-02 0.62 0.53781
## tag_Boss_Rush 3.96e-02 1.81e-02 2.19 0.02856 *
## tag_Tennis -3.90e-03 1.73e-02 -0.23 0.82173
## tag_Movie -2.91e-02 1.44e-02 -2.01 0.04406 *
## tag_Epic 3.87e-02 1.31e-02 2.95 0.00316 **
## tag_Jump_Scare 2.41e-02 1.62e-02 1.48 0.13808
## tag_Ambient 2.15e-02 1.67e-02 1.29 0.19840
## tag_Web_Publishing 4.87e-03 1.56e-02 0.31 0.75519
## tag_Skateboarding 3.40e-02 1.80e-02 1.89 0.05867 .
## tag_Wrestling 2.71e-02 1.57e-02 1.72 0.08457 .
## tag_Silent_Protagonist -1.03e-02 1.33e-02 -0.77 0.43969
## tag_Football_American -2.33e-02 1.39e-02 -1.68 0.09350 .
## tag_Mini_Golf 3.15e-02 1.92e-02 1.64 0.10166
## tag_360_Video -4.25e-02 1.70e-02 -2.50 0.01243 *
## tag_Job_Simulator 1.94e-02 1.64e-02 1.19 0.23511
## tag_Dwarf 4.05e-02 1.26e-02 3.21 0.00133 **
## tag_Electronic_Music 3.04e-02 1.70e-02 1.79 0.07300 .
## tag_Social_Deduction 2.37e-02 1.41e-02 1.68 0.09299 .
## tag_Pinball -1.74e-02 1.30e-02 -1.34 0.18050
## tag_LEGO 2.08e-02 1.41e-02 1.48 0.13895
## tag_Basketball -2.14e-02 1.44e-02 -1.49 0.13584
## tag_Asymmetric_VR 1.87e-02 1.43e-02 1.31 0.19120
## tag_GameMaker -1.17e-04 1.16e-02 -0.01 0.99196
## tag_Motorbike 2.53e-02 1.44e-02 1.76 0.07787 .
## tag_Kickstarter 1.28e-03 1.12e-02 0.11 0.90895
## tag_Medical_Sim -3.79e-02 1.25e-02 -3.02 0.00254 **
## tag_Bikes -1.22e-02 1.51e-02 -0.81 0.41731
## tag_World_War_I 2.08e-02 1.24e-02 1.68 0.09359 .
## tag_Spaceships 4.15e-02 1.42e-02 2.92 0.00354 **
## tag_Rome 1.20e-02 1.27e-02 0.95 0.34334
## tag_Unforgiving 1.77e-02 1.24e-02 1.43 0.15241
## tag_Submarine 4.34e-02 1.46e-02 2.98 0.00289 **
## tag_Photo_Editing 1.86e-02 1.33e-02 1.39 0.16338
## tag_Experience 2.86e-03 1.07e-02 0.27 0.78984
## tag_Golf -8.08e-03 1.51e-02 -0.53 0.59274
## tag_Software_Training -4.82e-02 1.28e-02 -3.75 0.00018 ***
## tag_Sequel -7.75e-03 9.95e-03 -0.78 0.43640
## tag_Roguevania -2.00e-02 1.39e-02 -1.43 0.15188
## tag_Episodic 3.16e-02 1.05e-02 3.02 0.00253 **
## tag_Outbreak_Sim -1.30e-02 1.25e-02 -1.04 0.30030
## tag_Jet -2.49e-02 1.32e-02 -1.88 0.05944 .
## tag_Chess -7.99e-03 1.15e-02 -0.69 0.48728
## tag_Farming -2.12e-04 1.24e-02 -0.02 0.98642
## tag_Horses 5.63e-03 1.08e-02 0.52 0.60365
## tag_Touch_Friendly 4.74e-02 9.11e-03 5.20 2.0e-07 ***
## tag_Boxing -5.89e-03 1.24e-02 -0.48 0.63471
## tag_Werewolves 9.50e-03 1.18e-02 0.81 0.42060
## tag_Mars -2.17e-02 1.13e-02 -1.93 0.05401 .
## tag_Spelling 8.49e-03 1.32e-02 0.64 0.52024
## tag_Sniper 6.67e-03 1.07e-02 0.62 0.53407
## tag_Cult_Classic 1.54e-02 8.67e-03 1.77 0.07612 .
## tag_Nostalgia 4.61e-02 9.89e-03 4.66 3.1e-06 ***
## tag_Transhumanism 7.44e-03 1.21e-02 0.61 0.54004
## tag_Audio_Production -3.06e-03 1.11e-02 -0.27 0.78350
## tag_Villain_Protagonist 2.74e-03 9.43e-03 0.29 0.77099
## tag_Offroad -1.19e-02 1.08e-02 -1.10 0.27284
## tag_Illuminati -2.15e-02 9.07e-03 -2.37 0.01775 *
## tag_Foreign 1.13e-03 1.13e-02 0.10 0.91994
## tag_Music_Based_Procedural_Generation 1.93e-02 1.11e-02 1.74 0.08214 .
## tag_Real_Time_with_Pause 3.29e-02 8.76e-03 3.76 0.00017 ***
## tag_Football_Soccer 4.11e-03 1.03e-02 0.40 0.68904
## tag_Naval_Combat 1.38e-02 1.23e-02 1.12 0.26349
## tag_Time_Attack 2.11e-02 9.15e-03 2.30 0.02120 *
## tag_Sailing -1.27e-03 1.12e-02 -0.11 0.90998
## tag_Video_Production -1.57e-02 1.05e-02 -1.50 0.13463
## tag_Heist 3.16e-03 1.01e-02 0.31 0.75492
## tag_Trains 2.39e-02 9.08e-03 2.63 0.00847 **
## tag_On_Rails_Shooter -1.33e-02 9.83e-03 -1.35 0.17565
## tag_Addictive 4.89e-02 7.55e-03 6.48 9.3e-11 ***
## tag_Cold_War 2.80e-02 9.25e-03 3.03 0.00242 **
## tag_Gambling -4.94e-03 9.87e-03 -0.50 0.61642
## tag_Trivia -5.52e-02 9.97e-03 -5.53 3.2e-08 ***
## tag_Archery 6.10e-03 1.01e-02 0.60 0.54529
## tag_Diplomacy -1.58e-03 9.87e-03 -0.16 0.87244
## tag_Snow 1.31e-02 9.51e-03 1.38 0.16721
## tag_Remake -1.06e-02 7.23e-03 -1.47 0.14274
## tag_Real_Time -1.20e-02 7.69e-03 -1.56 0.11893
## tag_Naval -6.34e-03 1.06e-02 -0.60 0.55072
## tag_Minigames 6.86e-03 8.51e-03 0.81 0.42071
## tag_Traditional_Roguelike 1.74e-02 1.00e-02 1.74 0.08158 .
## tag_Party 5.06e-03 9.93e-03 0.51 0.61040
## tag_Superhero 6.02e-03 8.79e-03 0.69 0.49308
## tag_Asynchronous_Multiplayer 6.93e-03 8.58e-03 0.81 0.41918
## tag_Boomer_Shooter 5.29e-02 9.35e-03 5.66 1.6e-08 ***
## tag_Politics 7.04e-03 9.20e-03 0.77 0.44409
## tag_Transportation 1.21e-02 8.98e-03 1.35 0.17730
## tag_MOBA -1.92e-02 8.90e-03 -2.16 0.03083 *
## tag_Typing 1.60e-02 9.49e-03 1.69 0.09136 .
## tag_Escape_Room 7.22e-03 7.85e-03 0.92 0.35784
## tag_Western 1.06e-02 8.46e-03 1.25 0.21155
## tag_Assassin -3.23e-03 8.87e-03 -0.36 0.71595
## tag_Dinosaurs 3.19e-03 8.06e-03 0.40 0.69212
## tag_Political_Sim -1.68e-02 8.97e-03 -1.88 0.06057 .
## tag_Programming 3.31e-02 8.33e-03 3.98 7.0e-05 ***
## tag_Immersive 2.08e-02 8.08e-03 2.58 0.01002 *
## tag_Party_Game 2.78e-02 9.32e-03 2.98 0.00289 **
## tag_Faith 3.82e-02 7.83e-03 4.88 1.1e-06 ***
## tag_4X -1.94e-02 7.71e-03 -2.52 0.01169 *
## tag_Hacking 1.87e-02 8.40e-03 2.23 0.02576 *
## tag_Tanks 6.09e-03 8.07e-03 0.75 0.45071
## tag_Fishing 1.94e-02 8.25e-03 2.35 0.01860 *
## tag_Narrative 2.11e-02 7.68e-03 2.75 0.00600 **
## tag_Hunting -2.45e-02 8.36e-03 -2.93 0.00334 **
## tag_Animation_and_Modeling -6.14e-03 8.08e-03 -0.76 0.44717
## tag_Blood 7.81e-03 6.69e-03 1.17 0.24331
## tag_Vampire -6.49e-03 7.93e-03 -0.82 0.41331
## tag_Trading_Card_Game -2.82e-02 7.75e-03 -3.64 0.00027 ***
## tag_FMV 1.81e-03 6.55e-03 0.28 0.78230
## tag_Mouse_only 1.23e-02 6.38e-03 1.94 0.05299 .
## tag_Game_Development -4.23e-03 8.16e-03 -0.52 0.60401
## tag_Otome 1.70e-02 7.37e-03 2.31 0.02114 *
## tag_Roguelike_Deckbuilder 2.16e-02 9.50e-03 2.27 0.02331 *
## tag_Political 1.27e-03 7.86e-03 0.16 0.87166
## tag_Ninja 9.07e-03 7.38e-03 1.23 0.21931
## tag_Underwater 1.12e-02 8.25e-03 1.36 0.17469
## tag_Beautiful 9.25e-03 5.87e-03 1.57 0.11538
## tag_Mining 1.19e-02 7.92e-03 1.50 0.13310
## tag_Dynamic_Narration -6.70e-03 7.93e-03 -0.84 0.39839
## tag_Hex_Grid 1.52e-02 7.33e-03 2.07 0.03816 *
## tag_Pirates 1.12e-02 7.61e-03 1.47 0.14029
## tag_Time_Travel 1.01e-02 7.49e-03 1.35 0.17593
## tag_Wholesome 9.64e-05 8.32e-03 0.01 0.99075
## tag_Co_op_Campaign -3.48e-04 7.31e-03 -0.05 0.96204
## tag_Dog 1.22e-02 7.06e-03 1.73 0.08446 .
## tag_Steampunk -1.09e-03 6.70e-03 -0.16 0.87089
## tag_Character_Action_Game -1.92e-02 6.34e-03 -3.03 0.00243 **
## tag_Sokoban 4.07e-02 7.44e-03 5.46 4.7e-08 ***
## tag_Satire 6.22e-03 7.04e-03 0.88 0.37719
## tag_Quick_Time_Events -1.14e-03 7.48e-03 -0.15 0.87880
## tag_Underground -8.30e-03 7.53e-03 -1.10 0.27030
## tag_God_Game -3.04e-03 6.83e-03 -0.44 0.65690
## tag_Action_RTS 1.26e-02 8.42e-03 1.49 0.13555
## tag_Open_World_Survival_Craft 7.47e-03 7.26e-03 1.03 0.30394
## tag_Mechs -2.55e-03 6.74e-03 -0.38 0.70508
## tag_Martial_Arts -8.95e-03 7.22e-03 -1.24 0.21479
## tag_World_War_II 1.85e-02 6.09e-03 3.03 0.00243 **
## tag_Software 2.23e-02 7.77e-03 2.87 0.00417 **
## tag_Voxel 8.70e-03 6.54e-03 1.33 0.18317
## tag_Time_Manipulation 2.03e-02 7.12e-03 2.85 0.00439 **
## tag_Solitaire 1.30e-02 6.95e-03 1.87 0.06104 .
## tag_Gothic 2.18e-02 7.03e-03 3.11 0.00190 **
## tag_MMORPG -2.72e-02 6.62e-03 -4.11 4.0e-05 ***
## tag_Agriculture 1.37e-02 7.65e-03 1.80 0.07258 .
## tag_Looter_Shooter -1.29e-02 7.70e-03 -1.67 0.09433 .
## tag_Conspiracy -6.72e-04 7.02e-03 -0.10 0.92378
## tag_Combat_Racing 4.75e-03 7.75e-03 0.61 0.54022
## tag_Trading -1.08e-02 6.80e-03 -1.59 0.11221
## tag_Spectacle_fighter 6.84e-03 6.61e-03 1.03 0.30090
## tag_Bullet_Time 1.02e-02 6.71e-03 1.52 0.12736
## tag_Split_Screen 1.97e-02 6.34e-03 3.10 0.00192 **
## tag_Parody 4.43e-04 6.12e-03 0.07 0.94224
## tag_Hero_Shooter 3.27e-03 6.98e-03 0.47 0.63936
## tag_America -7.84e-03 6.56e-03 -1.20 0.23185
## tag_Cozy 5.32e-02 7.74e-03 6.87 6.4e-12 ***
## tag_3D_Vision -1.51e-02 6.68e-03 -2.27 0.02339 *
## tag_6DOF -8.88e-03 6.61e-03 -1.34 0.17899
## tag_Battle_Royale -1.53e-02 6.54e-03 -2.34 0.01910 *
## tag_Dragons 4.93e-03 6.38e-03 0.77 0.43937
## tag_Auto_Battler -1.40e-02 6.55e-03 -2.14 0.03261 *
## tag_Gun_Customization -4.26e-03 6.94e-03 -0.61 0.53927
## tag_eSports 4.47e-03 6.12e-03 0.73 0.46513
## tag_Cooking 1.88e-02 6.70e-03 2.80 0.00512 **
## tag_Soundtrack -2.39e-03 5.22e-03 -0.46 0.64708
## tag_Mystery_Dungeon -1.29e-02 6.27e-03 -2.07 0.03892 *
## tag_Class_Based 1.69e-02 6.64e-03 2.54 0.01103 *
## tag_Vehicular_Combat 8.12e-03 7.17e-03 1.13 0.25732
## tag_Grand_Strategy -1.37e-02 6.39e-03 -2.15 0.03175 *
## tag_Word_Game -4.15e-03 6.35e-03 -0.65 0.51329
## tag_Science 1.57e-02 6.03e-03 2.61 0.00898 **
## tag_Colony_Sim 1.75e-02 6.52e-03 2.68 0.00742 **
## tag_Design_and_Illustration 1.46e-02 6.22e-03 2.35 0.01901 *
## tag_Swordplay 2.25e-03 6.27e-03 0.36 0.71949
## tag_Moddable 5.76e-02 5.25e-03 10.97 < 2e-16 ***
## tag_Utilities 1.10e-02 6.89e-03 1.59 0.11170
## tag_Noir -7.92e-04 5.78e-03 -0.14 0.89104
## tag_Lovecraftian 1.36e-02 5.62e-03 2.42 0.01566 *
## tag_Automation 1.49e-02 6.02e-03 2.48 0.01304 *
## tag_Space_Sim -5.69e-03 6.20e-03 -0.92 0.35894
## tag_Farming_Sim -9.39e-03 6.93e-03 -1.35 0.17543
## tag_Dark_Comedy 1.04e-02 5.63e-03 1.85 0.06466 .
## tag_Capitalism 1.39e-02 6.18e-03 2.25 0.02463 *
## tag_Competitive 1.30e-02 4.99e-03 2.61 0.00906 **
## tag_Match_3 -2.29e-02 5.24e-03 -4.36 1.3e-05 ***
## tag_3D_Fighter -1.70e-02 6.29e-03 -2.71 0.00682 **
## tag_Fighting 3.52e-03 5.37e-03 0.66 0.51164
## tag_Philosophical 1.25e-02 5.55e-03 2.26 0.02378 *
## tag_Experimental -8.64e-03 4.67e-03 -1.85 0.06431 .
## tag_Classic 5.76e-02 4.34e-03 13.29 < 2e-16 ***
## tag_Automobile_Sim -5.19e-03 6.13e-03 -0.85 0.39726
## tag_Music -1.22e-02 5.02e-03 -2.42 0.01553 *
## tag_Rhythm 2.40e-02 5.66e-03 4.23 2.3e-05 ***
## tag_Crime -2.61e-03 5.34e-03 -0.49 0.62546
## tag_Creature_Collector -1.19e-03 5.54e-03 -0.21 0.83019
## tag_Twin_Stick_Shooter 1.08e-02 5.43e-03 1.99 0.04611 *
## tag_Destruction 1.39e-02 5.13e-03 2.71 0.00677 **
## tag_CRPG 3.44e-03 5.16e-03 0.67 0.50528
## tag_Level_Editor 2.10e-02 4.84e-03 4.33 1.5e-05 ***
## tag_Mythology 6.82e-03 5.16e-03 1.32 0.18556
## tag_2D_Fighter -9.81e-03 5.76e-03 -1.70 0.08848 .
## tag_Cats 2.87e-02 4.76e-03 6.03 1.7e-09 ***
## tag_Alternate_History 6.42e-04 4.97e-03 0.13 0.89713
## tag_Grid_Based_Movement 1.91e-02 5.31e-03 3.60 0.00032 ***
## tag_Parkour 1.97e-02 5.20e-03 3.79 0.00015 ***
## tag_Card_Battler -1.46e-02 6.75e-03 -2.17 0.03019 *
## tag_RPGMaker 3.15e-02 4.21e-03 7.50 6.7e-14 ***
## tag_Loot 5.06e-03 4.84e-03 1.05 0.29580
## tag_Party_Based_RPG -1.68e-03 5.10e-03 -0.33 0.74177
## tag_4_Player_Local 9.55e-03 5.25e-03 1.82 0.06864 .
## tag_Team_Based -6.85e-03 4.83e-03 -1.42 0.15644
## tag_Inventory_Management 2.44e-03 5.12e-03 0.48 0.63434
## tag_Flight 9.97e-03 4.72e-03 2.11 0.03450 *
## tag_Wargame -6.18e-03 5.27e-03 -1.17 0.24107
## tag_Modern -6.36e-03 4.45e-03 -1.43 0.15342
## tag_Souls_like 1.21e-02 4.93e-03 2.46 0.01383 *
## tag_Artificial_Intelligence -6.05e-03 4.80e-03 -1.26 0.20823
## tag_Runner -2.91e-03 4.86e-03 -0.60 0.54974
## tag_Beat_em_up 2.46e-03 4.72e-03 0.52 0.60260
## tag_Memes 9.89e-03 3.73e-03 2.65 0.00802 **
## tag_Idler -2.46e-02 4.43e-03 -5.55 2.9e-08 ***
## tag_Dystopian 7.26e-03 4.80e-03 1.51 0.13040
## tag_Comic_Book 6.28e-03 4.53e-03 1.39 0.16595
## tag_Short 6.50e-03 3.59e-03 1.81 0.07006 .
## tag_Tutorial 2.49e-03 4.61e-03 0.54 0.58969
## tag_Driving -1.52e-03 5.34e-03 -0.28 0.77628
## tag_Fast_Paced 1.13e-02 3.91e-03 2.89 0.00389 **
## tag_Metroidvania 6.02e-03 4.78e-03 1.26 0.20767
## tag_Psychedelic 1.84e-02 4.42e-03 4.16 3.1e-05 ***
## tag_Deckbuilding 2.04e-02 6.50e-03 3.14 0.00167 **
## tag_Tactical_RPG 1.83e-03 5.03e-03 0.36 0.71664
## tag_City_Builder -2.17e-05 5.00e-03 0.00 0.99654
## tag_Turn_Based 1.20e-02 3.87e-03 3.10 0.00192 **
## tag_Tabletop 5.86e-03 4.60e-03 1.27 0.20275
## tag_Supernatural 6.49e-03 4.39e-03 1.48 0.13918
## tag_Perma_Death 2.33e-05 4.28e-03 0.01 0.99567
## tag_Board_Game 1.59e-03 4.60e-03 0.35 0.72956
## tag_Collectathon 1.68e-02 4.25e-03 3.96 7.6e-05 ***
## tag_Massively_Multiplayer -5.07e-02 4.06e-03 -12.51 < 2e-16 ***
## tag_Real_Time_Tactics -3.80e-03 4.66e-03 -0.82 0.41491
## tag_Thriller -4.21e-03 4.24e-03 -0.99 0.32086
## tag_Arena_Shooter 1.08e-02 4.39e-03 2.45 0.01421 *
## tag_Psychological 1.24e-03 4.09e-03 0.30 0.76226
## tag_Nonlinear 9.64e-03 4.33e-03 2.23 0.02596 *
## tag_Economy -8.70e-03 4.61e-03 -1.88 0.05947 .
## tag_Tower_Defense 1.16e-02 4.19e-03 2.77 0.00567 **
## tag_Demons -7.53e-03 4.12e-03 -1.83 0.06750 .
## tag_Lore_Rich 9.95e-03 4.31e-03 2.31 0.02102 *
## tag_RTS 7.58e-03 4.37e-03 1.73 0.08296 .
## tag_Dark_Humor 1.65e-02 3.98e-03 4.15 3.3e-05 ***
## tag_Aliens 1.02e-03 4.32e-03 0.24 0.81288
## tag_Military 5.56e-03 4.56e-03 1.22 0.22332
## tag_Time_Management 1.89e-02 4.02e-03 4.71 2.5e-06 ***
## tag_Strategy_RPG -8.19e-03 4.44e-03 -1.84 0.06538 .
## tag_Robots 1.73e-02 4.15e-03 4.17 3.0e-05 ***
## tag_Detective -3.67e-03 4.27e-03 -0.86 0.39038
## tag_Abstract 8.81e-03 3.95e-03 2.23 0.02571 *
## tag_Cyberpunk -1.69e-03 4.13e-03 -0.41 0.68259
## tag_Conversation 4.77e-03 3.98e-03 1.20 0.23163
## tag_1980s 2.04e-03 4.11e-03 0.50 0.61929
## tag_Third_Person_Shooter -8.36e-03 4.37e-03 -1.91 0.05607 .
## tag_Replay_Value -1.34e-02 3.35e-03 -4.00 6.4e-05 ***
## tag_Precision_Platformer 1.81e-02 4.37e-03 4.15 3.4e-05 ***
## tag_LGBTQplus 1.52e-02 3.50e-03 4.36 1.3e-05 ***
## tag_Local_Co_Op 4.99e-03 4.16e-03 1.20 0.22989
## tag_Investigation 1.72e-03 4.17e-03 0.41 0.67946
## tag_Narration -5.72e-04 3.64e-03 -0.16 0.87517
## tag_Card_Game -2.81e-03 4.98e-03 -0.56 0.57300
## tag_Clicker -4.55e-02 3.60e-03 -12.63 < 2e-16 ***
## tag_Life_Sim 2.47e-03 3.89e-03 0.64 0.52527
## tag_Stealth 4.22e-03 3.68e-03 1.15 0.25134
## tag_Isometric 5.88e-03 3.48e-03 1.69 0.09051 .
## tag_Cinematic -7.46e-03 3.77e-03 -1.98 0.04767 *
## tag_2_5D 5.95e-03 3.71e-03 1.60 0.10881
## tag_Education 2.28e-02 3.69e-03 6.19 6.1e-10 ***
## tag_War 2.14e-03 4.20e-03 0.51 0.60932
## tag_Historical 1.26e-02 3.54e-03 3.56 0.00037 ***
## tag_Text_Based 4.88e-03 3.58e-03 1.36 0.17258
## tag_Top_Down_Shooter 4.68e-04 4.32e-03 0.11 0.91370
## tag_NSFW -1.09e-02 4.31e-03 -2.54 0.01124 *
## tag_Zombies -1.27e-02 3.52e-03 -3.62 0.00030 ***
## tag_Hentai 1.77e-02 3.98e-03 4.45 8.7e-06 ***
## tag_Score_Attack 8.03e-03 3.57e-03 2.25 0.02445 *
## tag_JRPG -1.60e-03 3.56e-03 -0.45 0.65221
## tag_Post_apocalyptic 7.47e-03 3.57e-03 2.09 0.03663 *
## tag_Base_Building 1.22e-03 4.07e-03 0.30 0.76434
## tag_Surreal 1.87e-02 3.29e-03 5.67 1.4e-08 ***
## tag_Racing -3.02e-03 3.79e-03 -0.80 0.42547
## tag_Dating_Sim -8.43e-03 3.50e-03 -2.41 0.01603 *
## tag_1990_s 1.31e-02 3.47e-03 3.78 0.00016 ***
## tag_Great_Soundtrack 3.74e-02 2.55e-03 14.63 < 2e-16 ***
## tag_Dungeon_Crawler -5.26e-04 3.54e-03 -0.15 0.88193
## tag_Walking_Simulator -1.05e-02 3.29e-03 -3.18 0.00148 **
## tag_Local_Multiplayer 7.92e-03 3.89e-03 2.03 0.04200 *
## tag_Mature -1.11e-02 3.87e-03 -2.88 0.00395 **
## tag_Nature 1.62e-02 3.51e-03 4.62 3.8e-06 ***
## tag_Shoot_Em_Up 1.06e-02 3.60e-03 2.95 0.00319 **
## tag_Resource_Management 7.14e-03 3.59e-03 1.99 0.04663 *
## tag_Hack_and_Slash -2.27e-03 3.50e-03 -0.65 0.51696
## tag_Turn_Based_Tactics 1.06e-04 3.94e-03 0.03 0.97863
## tag_Emotional 1.87e-02 3.19e-03 5.84 5.3e-09 ***
## tag_Online_Co_Op -4.94e-03 3.54e-03 -1.39 0.16366
## tag_Sports -3.07e-03 3.25e-03 -0.95 0.34417
## tag_Bullet_Hell 1.75e-02 3.68e-03 4.75 2.0e-06 ***
## tag_Survival_Horror -8.91e-03 3.46e-03 -2.58 0.00995 **
## tag_Turn_Based_Combat 6.56e-03 3.70e-03 1.77 0.07609 .
## tag_Romance 2.22e-03 3.34e-03 0.67 0.50591
## tag_Dark_Fantasy -8.20e-04 3.31e-03 -0.25 0.80445
## tag_Turn_Based_Strategy 8.65e-03 3.91e-03 2.21 0.02691 *
## tag_Interactive_Fiction 6.67e-03 3.18e-03 2.10 0.03587 *
## tag_Crafting -1.15e-02 3.42e-03 -3.36 0.00079 ***
## tag_Immersive_Sim 6.49e-03 3.18e-03 2.04 0.04115 *
## tag_Gore 9.81e-03 3.21e-03 3.06 0.00223 **
## tag_Hidden_Object -4.40e-03 2.96e-03 -1.49 0.13685
## tag_Choose_Your_Own_Adventure 3.38e-03 3.15e-03 1.07 0.28335
## tag_Cartoon -8.73e-04 3.09e-03 -0.28 0.77762
## tag_Drama 3.52e-03 3.03e-03 1.16 0.24561
## tag_Procedural_Generation 2.05e-02 3.13e-03 6.56 5.4e-11 ***
## tag_Action_Roguelike 1.01e-02 3.95e-03 2.56 0.01038 *
## tag_Puzzle_Platformer 2.03e-03 3.09e-03 0.66 0.51164
## tag_Medieval 2.98e-03 3.14e-03 0.95 0.34315
## tag_Logic -4.15e-03 3.09e-03 -1.34 0.17925
## tag_Futuristic 7.94e-06 3.29e-03 0.00 0.99808
## tag_Old_School 6.41e-03 3.05e-03 2.10 0.03567 *
## tag_Tactical -6.14e-05 3.09e-03 -0.02 0.98416
## tag_3D_Platformer -3.32e-03 3.31e-03 -1.00 0.31587
## tag_Management -1.14e-02 3.29e-03 -3.48 0.00050 ***
## tag_Building -3.40e-03 3.41e-03 -1.00 0.31784
## tag_Side_Scroller -9.31e-03 2.88e-03 -3.23 0.00124 **
## tag_Space 2.81e-04 3.20e-03 0.09 0.93004
## tag_Violent -1.90e-02 2.95e-03 -6.45 1.2e-10 ***
## tag_Magic 3.22e-03 2.93e-03 1.10 0.27196
## tag_Roguelite 8.14e-03 3.67e-03 2.22 0.02652 *
## tag_Sandbox 1.39e-02 2.84e-03 4.89 1.0e-06 ***
## tag_Action_RPG -1.02e-02 3.14e-03 -3.24 0.00118 **
## tag_Hand_drawn 9.16e-03 2.64e-03 3.47 0.00052 ***
## tag_VR 2.54e-02 2.49e-03 10.19 < 2e-16 ***
## tag_Co_op 4.73e-03 3.14e-03 1.51 0.13133
## tag_Character_Customization 4.45e-03 2.64e-03 1.68 0.09213 .
## tag_Point_and_Click 5.18e-03 2.56e-03 2.02 0.04338 *
## tag_Minimalist 1.61e-02 2.65e-03 6.08 1.2e-09 ***
## tag_Difficult 4.89e-03 2.33e-03 2.10 0.03553 *
## tag_Roguelike 7.83e-03 3.62e-03 2.16 0.03044 *
## tag_FPS -1.29e-02 2.92e-03 -4.43 9.4e-06 ***
## tag_Nudity -2.54e-02 3.66e-03 -6.93 4.2e-12 ***
## tag_Physics 1.03e-02 2.61e-03 3.95 7.7e-05 ***
## tag_Psychological_Horror 1.17e-02 2.67e-03 4.38 1.2e-05 ***
## tag_Sexual_Content -1.10e-02 3.61e-03 -3.06 0.00219 **
## tag_PvP -1.44e-02 3.00e-03 -4.78 1.7e-06 ***
## tag_PvE -4.37e-03 2.78e-03 -1.57 0.11602
## tag_Multiple_Endings 1.69e-02 2.63e-03 6.42 1.4e-10 ***
## tag_Comedy 2.09e-02 2.38e-03 8.81 < 2e-16 ***
## tag_Dark 9.89e-04 2.62e-03 0.38 0.70529
## tag_Mystery 8.09e-03 2.65e-03 3.06 0.00223 **
## tag_Open_World -6.46e-03 2.52e-03 -2.56 0.01053 *
## tag_Free_to_Play 1.30e-02 1.85e-03 7.04 1.9e-12 ***
## tag_Cartoony 7.39e-03 2.50e-03 2.96 0.00311 **
## tag_Survival -1.56e-02 2.49e-03 -6.27 3.5e-10 ***
## tag_Linear 5.90e-03 2.40e-03 2.45 0.01412 *
## tag_Realistic -3.14e-02 2.52e-03 -12.49 < 2e-16 ***
## tag_2D_Platformer 1.36e-04 2.87e-03 0.05 0.96212
## tag_Choices_Matter 6.41e-03 2.55e-03 2.51 0.01198 *
## tag_Visual_Novel 3.44e-02 2.65e-03 12.99 < 2e-16 ***
## tag_Top_Down 1.07e-02 2.48e-03 4.33 1.5e-05 ***
## tag_Controller 1.76e-02 2.24e-03 7.84 4.4e-15 ***
## tag_Female_Protagonist 2.00e-03 1.98e-03 1.01 0.31156
## tag_Third_Person 1.03e-03 2.45e-03 0.42 0.67357
## tag_Family_Friendly -1.89e-03 2.19e-03 -0.86 0.38730
## tag_Stylized 1.01e-02 2.21e-03 4.56 5.1e-06 ***
## tag_Retro 8.85e-03 2.23e-03 3.97 7.1e-05 ***
## tag_Platformer 1.23e-02 2.64e-03 4.68 2.9e-06 ***
## tag_Sci_fi 7.00e-03 2.39e-03 2.93 0.00339 **
## tag_Horror -3.99e-03 2.44e-03 -1.64 0.10165
## tag_Anime 2.15e-02 2.14e-03 10.05 < 2e-16 ***
## tag_Shooter 4.19e-03 2.68e-03 1.56 0.11845
## tag_Combat 5.37e-03 2.41e-03 2.23 0.02566 *
## tag_Relaxing 5.44e-03 2.16e-03 2.51 0.01202 *
## tag_Funny 1.72e-02 2.00e-03 8.60 < 2e-16 ***
## tag_Early_Access -1.16e-02 1.86e-03 -6.25 4.2e-10 ***
## tag_Multiplayer 1.49e-02 2.30e-03 6.45 1.1e-10 ***
## tag_Arcade 1.25e-03 2.06e-03 0.61 0.54209
## tag_Action_Adventure -4.02e-03 2.17e-03 -1.85 0.06410 .
## tag_First_Person -5.39e-03 2.19e-03 -2.46 0.01395 *
## tag_Fantasy 2.80e-03 1.95e-03 1.44 0.15122
## tag_Cute 1.51e-02 1.85e-03 8.14 4.1e-16 ***
## tag_Pixel_Graphics 2.95e-02 1.92e-03 15.39 < 2e-16 ***
## tag_Story_Rich 1.23e-02 1.85e-03 6.65 3.0e-11 ***
## tag_Exploration -9.59e-04 1.87e-03 -0.51 0.60829
## tag_Colorful 1.32e-03 1.79e-03 0.74 0.45999
## tag_Puzzle 2.43e-02 1.80e-03 13.53 < 2e-16 ***
## tag_Atmospheric 3.37e-03 1.65e-03 2.04 0.04165 *
## tag_RPG -5.97e-03 1.82e-03 -3.29 0.00102 **
## tag_Strategy -7.74e-03 1.80e-03 -4.30 1.7e-05 ***
## tag_Simulation -2.45e-02 1.62e-03 -15.11 < 2e-16 ***
## tag_3D -7.87e-04 1.84e-03 -0.43 0.66927
## tag_2D 1.12e-02 1.67e-03 6.69 2.3e-11 ***
## tag_Casual -2.00e-03 1.33e-03 -1.50 0.13306
## tag_Adventure -7.78e-03 1.35e-03 -5.75 9.2e-09 ***
## tag_Action -6.73e-03 1.54e-03 -4.38 1.2e-05 ***
## tag_Indie 4.80e-03 1.30e-03 3.68 0.00023 ***
## tag_Singleplayer 1.27e-02 1.38e-03 9.23 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.136 on 59095 degrees of freedom
## (695 observations deleted due to missingness)
## Multiple R-squared: 0.217, Adjusted R-squared: 0.212
## F-statistic: 39.9 on 410 and 59095 DF, p-value: <2e-16
# Coefficient table with confidence intervals; the outer parentheses also print it
(fit_1_tidy = tidy(fit_1, conf.int = TRUE))

# FDR-adjusted p-values across all model terms
fit_1_tidy$p.value.fdr = p.adjust(fit_1_tidy$p.value, method = "fdr")

# Plot the tag coefficients whose FDR-adjusted p-value is below 0.001
fit_1_tidy %>%
  filter(
    p.value.fdr < 0.001,
    # tag terms only; the pattern is anchored so only the column prefix matches
    str_detect(term, "^tag_")
  ) %>%
  mutate(
    term = str_remove(term, "^tag_"),
    term = str_clean(term),
    # order terms by coefficient size
    term = fct_reorder(term, estimate, .fun = mean)
  ) %>%
  ggplot(aes(x = term, y = estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.3) +
  coord_flip() +
  labs(
    x = "Tag",
    y = "Coefficient",
    title = "Steam game ratings by tag"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, size = 8) # smaller axis text
  )
# Compare models: run compare_predictors() on the last 100 entries of
# tag_counts_legal, with release_date as a control. The result is filtered by
# model ("singular" / "full") in the plots that follow.
fit_1_compare <- compare_predictors(
  d,
  outcome = "fitted",
  predictors = names(tail(tag_counts_legal, 100)),
  controls = "release_date",
  keep_controls = FALSE
)
# Plot the compared models.
# The tag ordering is taken from the full model only, so every model in the
# plot shares the same axis order.
fit_1_compare_levels <- with(
  filter(fit_1_compare, model == "full"),
  fct_reorder(str_clean(str_remove(term, "tag_")), estimate, .fun = mean)
)
# Apply the same label clean-up to all rows, then impose the full-model order
fit_1_compare %>%
  mutate(
    term = term %>%
      str_remove("tag_") %>%
      str_clean() %>%
      fct_relevel(levels(fit_1_compare_levels))
  ) %>%
  GG_plot_models() +
  labs(
    x = "Tag",
    y = "Coefficient",
    title = "Steam game ratings by tag"
  ) +
  # shrink the tag labels on the vertical axis
  theme(axis.text.y = element_text(size = 5.5))
# scale_y_discrete(guide = guide_axis(n.dodge = 2))
GG_save("figs/compare_tags_models.png")
# Scatterplot of the coefficients from the "singular" models against those from
# the "full" model, one point per term (intercepts excluded). The dashed line
# is the identity line.
inner_join(
  fit_1_compare %>%
    filter(model == "singular" & term != "(Intercept)") %>%
    select(term, estimate),
  fit_1_compare %>%
    filter(model == "full" & term != "(Intercept)") %>%
    select(term, estimate),
  by = "term",
  suffix = c("_singular", "_full")
) %>%
  GG_scatter("estimate_singular", "estimate_full", case_names = "term") +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha = 0.3) +
  labs(
    x = "Singular model coefficient",
    y = "Full model coefficient",
    title = "Steam game ratings by tag"
  )
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/compare_tags_scatter.png")
## `geom_smooth()` using formula = 'y ~ x'
# Look for 2-way interactions between tags.
# Enumerate every unordered pair of tags with a count of at least 100; the
# tag_1 < tag_2 filter drops self-pairs and mirrored duplicates.
interactions <- expand_grid(
  tag_1 = names(tag_counts_legal[tag_counts_legal >= 100]),
  tag_2 = names(tag_counts_legal[tag_counts_legal >= 100])
) %>%
  filter(tag_1 < tag_2)
# Count the rows of d carrying both tags, since many combinations never occur
interactions$n_tag_combo <- future_map2_int(
  interactions$tag_1,
  interactions$tag_2,
  function(x, y) {
    sum(d[[x]] == 1 & d[[y]] == 1, na.rm = TRUE)
  }
)
# Keep only the pairs that co-occur in more than 20 rows
interactions_sub <- filter(interactions, n_tag_combo > 20)
# Fit one linear model per retained tag pair (release_date plus both tags and
# their interaction). Each result is stored as a one-element list holding the
# tidied coefficient table.
interactions_sub$lm_fit <- future_map2(
  interactions_sub$tag_1,
  interactions_sub$tag_2,
  function(t1, t2) {
    pair_formula <- as.formula(str_glue("fitted ~ release_date + {t1} * {t2}"))
    pair_fit <- lm(pair_formula, data = d)
    list(broom::tidy(pair_fit))
  },
  .progress = TRUE
)
# Pull the interaction row (the term containing ":") out of every fit
interactions_coefs <- interactions_sub$lm_fit %>%
  map(function(fit_wrapper) {
    filter(fit_wrapper[[1]], str_detect(term, ":"))
  }) %>%
  bind_rows()
# Adjust the p values for multiple testing (FDR and Bonferroni)
interactions_coefs <- interactions_coefs %>%
  mutate(
    p.value.fdr = p.adjust(p.value, method = "fdr"),
    p.value.bon = p.adjust(p.value, method = "bonferroni")
  )
# Plot the distribution of the interaction p values.
# The histogram maps the raw p.value column (not p.value.fdr), so the x axis is
# labelled as unadjusted.
interactions_coefs %>%
  ggplot(aes(x = p.value)) +
  geom_histogram(bins = 100) +
  labs(
    x = "Unadjusted p value",
    y = "Count",
    title = "Distribution of p values for interactions",
    subtitle = str_glue("Based on {nrow(interactions_sub)} interaction tests")
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
GG_save("figs/interactions_pvalue_dist.png")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
The XGBoost model was fit in Python, to use GPU acceleration on another machine. Here we only read in its predictions and plot the results.
# Load the saved XGBoost predictions (zipped TSV)
d_ml_results <- read_tsv(file = "data/steam_xgboost_predictions.tsv.zip")
## Rows: 59506 Columns: 414
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (414): tag_Mod, tag_Tile_Matching, tag_Baseball, tag_Intentionally_Awkwa...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Plot the adjusted rating (fitted) against the model prediction; the dashed
# red line is the identity line and both axes are shown as percentages.
d_ml_results %>%
  GG_scatter("predicted", "fitted", alpha = 0.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  scale_x_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(
    title = "Steam game ratings by predicted rating",
    subtitle = str_glue("Based on {nrow(d_ml_results)} games"),
    x = "Predicted rating",
    y = "Adjusted rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Numerical check: regress the adjusted rating on the prediction
summary(lm(fitted ~ predicted, data = d_ml_results))
##
## Call:
## lm(formula = fitted ~ predicted, data = d_ml_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7889 -0.0654 0.0182 0.0819 0.4578
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02181 0.00444 -4.92 8.9e-07 ***
## predicted 1.02865 0.00572 179.73 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.123 on 59504 degrees of freedom
## Multiple R-squared: 0.352, Adjusted R-squared: 0.352
## F-statistic: 3.23e+04 on 1 and 59504 DF, p-value: <2e-16
# Write the figure to disk. No plot has been drawn since the prediction
# scatterplot above, so presumably that is the one saved.
# NOTE(review): confirm GG_save() saves the most recent plot.
GG_save("figs/xgboost_predictions.png")
## `geom_smooth()` using formula = 'y ~ x'
#internal multi-threading
# set_engine("xgboost", nthread = 4)
#--- Select predictors and outcome ----
# Use every tag indicator column in d as a predictor.
# An earlier assignment that took a handful of names from tag_counts_legal was
# overwritten by this one straight away, so it has been removed as dead code.
# For a quick test fit, subset tag_vars here instead, e.g. head(tag_vars, 10).
tag_vars <- d %>%
  select(starts_with("tag_")) %>%
  names()
# Predictors: all tags plus total and release_date; the outcome is fitted
predictors <- c(tag_vars, "total", "release_date")
d_ml <- d %>%
  select(all_of(predictors), fitted) %>%
  mutate(
    release_date = as.numeric(release_date), # numeric encoding
    total = as.numeric(total),
    fitted = as.numeric(fitted)
  ) %>%
  # keep only rows with no missing values
  drop_na()
#--- Train/Test Split ----
# Seeded 98% / 2% split of d_ml into training and test sets
set.seed(1)
data_split <- d_ml %>% initial_split(prop = 0.98)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()
#--- Model Specification ----
# Boosted-tree regression on the xgboost engine; every listed hyperparameter is
# left as tune() so it is chosen during tuning
xgb_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
#--- Recipe ----
# fitted is the outcome; all other columns of the training data are predictors
xgb_rec <- recipe(fitted ~ ., data = train_data)
#--- Workflow ----
# Bundle the recipe and the model specification
xgb_wflow <- workflow(preprocessor = xgb_rec, spec = xgb_spec)
#--- Grid Specification ----
# Space-filling grid of 20 candidate hyperparameter combinations
xgb_grid <- grid_space_filling(
  trees(),
  tree_depth(),
  learn_rate(range = c(-3, -1)), # log10 scale
  loss_reduction(),
  sample_size = sample_prop(),
  # try between 5 and all predictors at each split
  mtry(range = c(5, length(predictors))),
  size = 20
)
#--- Cross-validation for tuning on training set ----
# Seeded 10-fold cross-validation on the training data
set.seed(2)
cv_folds <- train_data %>% vfold_cv(v = 10)
#--- Tune the model ----
# Evaluate every grid row on every fold, keeping the out-of-fold predictions
tuned_results <- xgb_wflow %>%
  tune_grid(
    resamples = cv_folds,
    grid = xgb_grid,
    metrics = metric_set(rmse, rsq),
    control = control_grid(save_pred = TRUE)
  )
#--- Evaluate on train set ----
# Show the best tuning results from the training-set resamples, ranked by R^2
show_best(tuned_results, metric = "rsq")
#--- Select best hyperparameters ----
# Note: the selection itself uses RMSE, not R^2
best_params <- tuned_results %>% select_best(metric = "rmse")
#--- Finalize and fit on full training set ----
final_wflow <- xgb_wflow %>% finalize_workflow(best_params)
final_fit <- last_fit(
  final_wflow,
  split = data_split,
  metrics = metric_set(rsq, rmse)
)
#--- Evaluate on test set ----
show_best(final_fit, metric = "rsq")
# Baseline for comparison: ordinary least squares on the same training data and
# the same predictors
ols_fit <- lm(fitted ~ ., data = train_data)
summary(ols_fit)
# Record the R version and package versions used for this run (the printed
# session information follows below)
write_sessioninfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] furrr_0.3.1 future_1.40.0 yardstick_1.3.2
## [4] workflowsets_1.1.0 workflows_1.2.0 tune_1.3.0
## [7] rsample_1.3.0 recipes_1.3.0 parsnip_1.3.1
## [10] modeldata_1.4.0 infer_1.0.8 dials_1.4.0
## [13] scales_1.3.0 broom_1.0.8 tidymodels_1.3.0
## [16] ebbr_0.1 kirkegaard_2025-05-02 psych_2.5.3
## [19] assertthat_0.2.1 weights_1.0.4 Hmisc_5.2-3
## [22] magrittr_2.0.3 lubridate_1.9.4 forcats_1.0.0
## [25] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4
## [28] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [31] ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.17.1 jsonlite_2.0.0 shape_1.4.6.1
## [4] datawizard_1.0.2 jomo_2.7-6 farver_2.1.2
## [7] nloptr_2.2.1 rmarkdown_2.29 ragg_1.4.0
## [10] vctrs_0.6.5 minqa_1.2.8 base64enc_0.1-3
## [13] htmltools_0.5.8.1 haven_2.5.4 Formula_1.2-5
## [16] mitml_0.4-5 sass_0.4.10 parallelly_1.43.0
## [19] bslib_0.9.0 htmlwidgets_1.6.4 plyr_1.8.9
## [22] cachem_1.1.0 lifecycle_1.0.4 iterators_1.0.14
## [25] pkgconfig_2.0.3 Matrix_1.7-3 R6_2.6.1
## [28] fastmap_1.2.0 rbibutils_2.3 digest_0.6.37
## [31] colorspace_2.1-1 textshaping_1.0.0 labeling_0.4.3
## [34] timechange_0.3.0 gdata_3.0.1 mgcv_1.9-1
## [37] compiler_4.5.0 bit64_4.6.0-1 withr_3.0.2
## [40] htmlTable_2.4.3 backports_1.5.0 pan_1.9
## [43] MASS_7.3-65 lava_1.8.1 gtools_3.9.5
## [46] tools_4.5.0 foreign_0.8-90 future.apply_1.11.3
## [49] nnet_7.3-20 glue_1.8.0 nlme_3.1-168
## [52] grid_4.5.0 checkmate_2.3.2 cluster_2.1.8.1
## [55] generics_0.1.3 gtable_0.3.6 tzdb_0.5.0
## [58] class_7.3-23 data.table_1.17.0 hms_1.1.3
## [61] foreach_1.5.2 pillar_1.10.2 vroom_1.6.5
## [64] splines_4.5.0 lhs_1.2.0 lattice_0.22-5
## [67] survival_3.8-3 bit_4.6.0 tidyselect_1.2.1
## [70] knitr_1.50 reformulas_0.4.0 gridExtra_2.3
## [73] stats4_4.5.0 xfun_0.52 hardhat_1.4.1
## [76] timeDate_4041.110 stringi_1.8.7 VGAM_1.1-13
## [79] DiceDesign_1.10 lazyeval_0.2.2 yaml_2.3.10
## [82] boot_1.3-31 evaluate_1.0.3 codetools_0.2-19
## [85] cli_3.6.4 rpart_4.1.24 systemfonts_1.2.2
## [88] Rdpack_2.6.4 munsell_0.5.1 jquerylib_0.1.4
## [91] Rcpp_1.0.14 ggeffects_2.2.1 globals_0.17.0
## [94] parallel_4.5.0 gower_1.0.2 GPfit_1.0-9
## [97] lme4_1.1-37 listenv_0.9.1 glmnet_4.1-8
## [100] ipred_0.9-15 prodlim_2025.04.28 insight_1.2.0
## [103] crayon_1.5.3 rlang_1.1.6 mnormt_2.1.1
## [106] mice_3.17.0
# Save the data frame d, xz-compressed, so later analyses can reuse it
write_rds(d, file = "data/data_for_reuse_joined.rds", compress = "xz")
# d_ml %>% write_rds("data/data_for_reuse_ml.rds", compress = "xz")
# OSF upload. Disabled by default: change the condition to TRUE to run it
# manually.
if (FALSE) {
  library(osfr)
  # log in with the token read from a local file
  osf_auth(readr::read_lines("~/.config/osf_token"))
  # the project we will use (placeholder URL; replace XXX with the project id)
  osf_proj <- osf_retrieve_node("https://osf.io/XXX/")
  # upload all files in the project
  # overwrite existing (versioning)
  osf_upload(
    osf_proj,
    # figures in this notebook are written to "figs/" by the GG_save() calls,
    # so that is the directory to upload
    path = c("data", "figs", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
    conflicts = "overwrite"
  )
}