1) Load ratings
fname <- dplyr::case_when(
file.exists("MovieRatings.xlsx") ~ "MovieRatings.xlsx",
file.exists("MovieRatings (1).xlsx") ~ "MovieRatings (1).xlsx",
file.exists("MovieRatings (2).xlsx") ~ "MovieRatings (2).xlsx",
TRUE ~ NA_character_
)
stopifnot(!is.na(fname))
sheets <- readxl::excel_sheets(fname)
sheet_use <- if ("MovieRatings" %in% sheets) "MovieRatings" else sheets[1]
ratings_wide <- readxl::read_excel(fname, sheet = sheet_use) |> janitor::clean_names()
ratings <- ratings_wide |>
tidyr::pivot_longer(-critic, names_to = "movie_id", values_to = "rating") |>
dplyr::rename(user_id = critic) |>
dplyr::mutate(rating = as.numeric(rating)) |>
dplyr::filter(!is.na(rating))
knitr::kable(head(ratings, 8), caption = "Sample ratings (long format)")
Sample ratings (long format)
Burton |
jungle_book |
4 |
Burton |
star_wars_force |
4 |
Charley |
captain_america |
4 |
Charley |
deadpool |
5 |
Charley |
frozen |
4 |
Charley |
jungle_book |
3 |
Charley |
pitch_perfect2 |
2 |
Charley |
star_wars_force |
3 |
2) Global Baseline (mu + b_i)
mu <- mean(ratings$rating, na.rm = TRUE)
lambda <- 5 # set to 0 if your spreadsheet says no regularization
item_bias <- ratings |>
dplyr::group_by(movie_id) |>
dplyr::summarise(n_i = dplyr::n(), sum_i = sum(rating - mu), .groups = "drop") |>
dplyr::mutate(b_i = sum_i / (lambda + n_i))
global_baseline <- item_bias |>
dplyr::mutate(global_score = mu + b_i) |>
dplyr::arrange(dplyr::desc(global_score))
knitr::kable(global_baseline, digits = 3, caption = "Global Baseline (mu + b_i)")
Global Baseline (mu + b_i)
deadpool |
9 |
4.590 |
0.328 |
4.262 |
captain_america |
11 |
3.721 |
0.233 |
4.167 |
star_wars_force |
13 |
2.852 |
0.158 |
4.093 |
jungle_book |
10 |
-0.344 |
-0.023 |
3.911 |
frozen |
11 |
-2.279 |
-0.142 |
3.792 |
pitch_perfect2 |
7 |
-8.541 |
-0.712 |
3.223 |
3) Plot + save outputs
pretty_names <- c(
captain_america = "Captain America", captainamerica = "Captain America",
deadpool = "Deadpool", frozen = "Frozen",
jungle_book = "Jungle Book", junglebook = "Jungle Book",
pitch_perfect2 = "Pitch Perfect 2",
star_wars_force = "Star Wars: The Force Awakens", starwarsforce = "Star Wars: The Force Awakens"
)
global_formatted <- global_baseline |>
dplyr::mutate(movie = dplyr::coalesce(pretty_names[movie_id],
stringr::str_to_title(gsub("_"," ", movie_id)))) |>
dplyr::select(movie_id, movie, n_i, b_i, global_score) |>
dplyr::arrange(dplyr::desc(global_score))
ggplot(global_formatted, aes(x = reorder(movie, global_score), y = global_score)) +
geom_col() + coord_flip() +
labs(title = "Global Baseline (mu + b_i) — Non-Personalized Ranking",
x = "Movie", y = "Baseline Score")

readr::write_csv(global_formatted, "global_baseline_results.csv")
ggplot2::ggsave("global_baseline_bar.png", width = 7, height = 4, dpi = 300)