Week 3A: Global Baseline Estimates

Setup

# If you need a package, install in the Console (not here):
# install.packages(c("readr","dplyr","tidyr","readxl","knitr","ggplot2"))
library(readr)
library(dplyr)
library(tidyr)
library(readxl)
library(knitr)
library(ggplot2)

Load data (CSV first; fallback to Excel)

csv_path  <- file.path("data", "ratings_export.csv")
xlsx_path <- file.path("data", "MovieRatings.xlsx")

load_csv <- function(path) {
  readr::read_csv(path, show_col_types = FALSE) |>
    dplyr::select(name, movie, rating)
}

load_excel <- function(path) {
  wide <- readxl::read_excel(path, sheet = "MovieRatings")
  wide |>
    dplyr::rename(name = 1) |>
    tidyr::pivot_longer(-name, names_to = "movie", values_to = "rating") |>
    dplyr::mutate(rating = as.numeric(rating))
}

if (file.exists(csv_path)) {
  message("Loading CSV: ", csv_path)
  ratings <- load_csv(csv_path)
} else if (file.exists(xlsx_path)) {
  message("CSV not found. Loading Excel: ", xlsx_path)
  ratings <- load_excel(xlsx_path)
} else {
  stop("Neither 'data/ratings_export.csv' nor 'data/MovieRatings.xlsx' found. ",
       "Please place at least one of them in the 'data' folder.")
}

## Loading CSV: data/ratings_export.csv

stopifnot(all(c("name","movie","rating") %in% names(ratings)))
glimpse(ratings)

## Rows: 30
## Columns: 3
## $ name   <chr> "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Bishop",…
## $ movie  <chr> "Jurassic World", "The Dark Knight", "The Godfather", "Avatar",…
## $ rating <dbl> 4, 5, 3, 4, 5, 3, 3, 4, 5, 3, 4, 4, 5, 5, 4, 4, 5, 3, 4, 4, 5, …

head(ratings, 10)

## # A tibble: 10 × 3
##    name   movie             rating
##    <chr>  <chr>              <dbl>
##  1 Fedie  Jurassic World         4
##  2 Fedie  The Dark Knight        5
##  3 Fedie  The Godfather          3
##  4 Fedie  Avatar                 4
##  5 Fedie  Avengers: Endgame      5
##  6 Fedie  The Old Guard          3
##  7 Bishop Jurassic World         3
##  8 Bishop The Dark Knight        4
##  9 Bishop The Godfather          5
## 10 Bishop Avatar                 3

Clean / standardize

ratings_clean <- ratings |>
  mutate(
    name  = trimws(as.character(name)),
    movie = trimws(as.character(movie)),
    rating = as.numeric(rating)
  ) |>
  filter(!is.na(name), !is.na(movie), !is.na(rating)) |>
  distinct(name, movie, .keep_all = TRUE)

kable(head(ratings_clean, 15), caption = "Cleaned ratings (first 15 rows)")

Cleaned ratings (first 15 rows)
name	movie	rating
Fedie	Jurassic World	4
Fedie	The Dark Knight	5
Fedie	The Godfather	3
Fedie	Avatar	4
Fedie	Avengers: Endgame	5
Fedie	The Old Guard	3
Bishop	Jurassic World	3
Bishop	The Dark Knight	4
Bishop	The Godfather	5
Bishop	Avatar	3
Bishop	Avengers: Endgame	4
Bishop	The Old Guard	4
Tim	Jurassic World	5
Tim	The Dark Knight	5
Tim	The Godfather	4

Global Baseline ingredients (μ, user averages, movie averages)

mu <- mean(ratings_clean$rating, na.rm = TRUE)

user_avg <- ratings_clean |>
  group_by(name) |>
  summarize(user_avg = mean(rating), .groups = "drop")

movie_avg <- ratings_clean |>
  group_by(movie) |>
  summarize(movie_avg = mean(rating), .groups = "drop")

kable(data.frame(global_mean = mu), caption = "Global mean (μ)")

Global mean (μ)
global_mean
3.966667

kable(user_avg, caption = "User averages")

User averages
name	user_avg
Bishop	3.833333
Chris	4.166667
Cody	3.500000
Fedie	4.000000
Tim	4.333333

kable(movie_avg, caption = "Movie averages")

Movie averages
movie	movie_avg
Avatar	3.8
Avengers: Endgame	4.4
Jurassic World	3.8
The Dark Knight	4.4
The Godfather	4.2
The Old Guard	3.2

Baseline predictions for all user × movie pairs

all_pairs <- tidyr::expand_grid(
  name = sort(unique(ratings_clean$name)),
  movie = sort(unique(ratings_clean$movie))
)

pred_baseline <- all_pairs |>
  left_join(user_avg,  by = "name") |>
  left_join(movie_avg, by = "movie") |>
  mutate(pred = user_avg + movie_avg - mu) |>
  left_join(ratings_clean, by = c("name","movie"))  # brings in actual ratings if they exist

kable(head(pred_baseline |> arrange(name, desc(pred)), 12),
      caption = "Sample baseline predictions with any existing ratings")

Sample baseline predictions with any existing ratings
name	movie	user_avg	movie_avg	pred	rating
Bishop	Avengers: Endgame	3.833333	4.4	4.266667	4
Bishop	The Dark Knight	3.833333	4.4	4.266667	4
Bishop	The Godfather	3.833333	4.2	4.066667	5
Bishop	Avatar	3.833333	3.8	3.666667	3
Bishop	Jurassic World	3.833333	3.8	3.666667	3
Bishop	The Old Guard	3.833333	3.2	3.066667	4
Chris	Avengers: Endgame	4.166667	4.4	4.600000	4
Chris	The Dark Knight	4.166667	4.4	4.600000	4
Chris	The Godfather	4.166667	4.2	4.400000	5
Chris	Avatar	4.166667	3.8	4.000000	5
Chris	Jurassic World	4.166667	3.8	4.000000	4
Chris	The Old Guard	4.166667	3.2	3.400000	3

Recommendations: top-2 unseen per user

recs_unseen <- pred_baseline |>
  filter(is.na(rating)) |>
  group_by(name) |>
  slice_max(pred, n = 2, with_ties = FALSE) |>
  ungroup()

if (nrow(recs_unseen) == 0) {
  recs_unseen <- pred_baseline |>
    group_by(name) |>
    slice_max(pred, n = 2, with_ties = FALSE) |>
    ungroup()
}

kable(recs_unseen |> arrange(name, desc(pred)),
      caption = "Top-2 recommendations per user (Global Baseline)")

Top-2 recommendations per user (Global Baseline)
name	movie	user_avg	movie_avg	pred	rating
Bishop	Avengers: Endgame	3.833333	4.4	4.266667	4
Bishop	The Dark Knight	3.833333	4.4	4.266667	4
Chris	Avengers: Endgame	4.166667	4.4	4.600000	4
Chris	The Dark Knight	4.166667	4.4	4.600000	4
Cody	Avengers: Endgame	3.500000	4.4	3.933333	4
Cody	The Dark Knight	3.500000	4.4	3.933333	4
Fedie	Avengers: Endgame	4.000000	4.4	4.433333	5
Fedie	The Dark Knight	4.000000	4.4	4.433333	5
Tim	Avengers: Endgame	4.333333	4.4	4.766667	5
Tim	The Dark Knight	4.333333	4.4	4.766667	5

(Optional) Quick visuals

ggplot(movie_avg, aes(x = reorder(movie, movie_avg), y = movie_avg)) +
  geom_col() + coord_flip() +
  labs(title = "Average rating by movie", x = "Movie", y = "Avg rating") +
  theme_minimal()

ggplot(user_avg, aes(x = reorder(name, user_avg), y = user_avg)) +
  geom_col() + coord_flip() +
  labs(title = "Average rating by user", x = "User", y = "Avg rating") +
  theme_minimal()

Week 3A: Global Baseline Estimates

Kevin Martin

Setup

Load data (CSV first; fallback to Excel)

Clean / standardize

Global Baseline ingredients (μ, user averages, movie averages)

Baseline predictions for all user × movie pairs

Recommendations: top-2 unseen per user

(Optional) Quick visuals