Setup

# If you need a package, install in the Console (not here):
# install.packages(c("readr","dplyr","tidyr","readxl","knitr","ggplot2"))
library(readr)
library(dplyr)
library(tidyr)
library(readxl)
library(knitr)
library(ggplot2)

Load data (CSV first; fallback to Excel)

csv_path  <- file.path("data", "ratings_export.csv")
xlsx_path <- file.path("data", "MovieRatings.xlsx")

load_csv <- function(path) {
  readr::read_csv(path, show_col_types = FALSE) |>
    dplyr::select(name, movie, rating)
}

load_excel <- function(path) {
  wide <- readxl::read_excel(path, sheet = "MovieRatings")
  wide |>
    dplyr::rename(name = 1) |>
    tidyr::pivot_longer(-name, names_to = "movie", values_to = "rating") |>
    dplyr::mutate(rating = as.numeric(rating))
}

if (file.exists(csv_path)) {
  message("Loading CSV: ", csv_path)
  ratings <- load_csv(csv_path)
} else if (file.exists(xlsx_path)) {
  message("CSV not found. Loading Excel: ", xlsx_path)
  ratings <- load_excel(xlsx_path)
} else {
  stop("Neither 'data/ratings_export.csv' nor 'data/MovieRatings.xlsx' found. ",
       "Please place at least one of them in the 'data' folder.")
}
## Loading CSV: data/ratings_export.csv
stopifnot(all(c("name","movie","rating") %in% names(ratings)))
glimpse(ratings)
## Rows: 30
## Columns: 3
## $ name   <chr> "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Bishop",…
## $ movie  <chr> "Jurassic World", "The Dark Knight", "The Godfather", "Avatar",…
## $ rating <dbl> 4, 5, 3, 4, 5, 3, 3, 4, 5, 3, 4, 4, 5, 5, 4, 4, 5, 3, 4, 4, 5, …
head(ratings, 10)
## # A tibble: 10 × 3
##    name   movie             rating
##    <chr>  <chr>              <dbl>
##  1 Fedie  Jurassic World         4
##  2 Fedie  The Dark Knight        5
##  3 Fedie  The Godfather          3
##  4 Fedie  Avatar                 4
##  5 Fedie  Avengers: Endgame      5
##  6 Fedie  The Old Guard          3
##  7 Bishop Jurassic World         3
##  8 Bishop The Dark Knight        4
##  9 Bishop The Godfather          5
## 10 Bishop Avatar                 3

Clean / standardize

ratings_clean <- ratings |>
  mutate(
    name  = trimws(as.character(name)),
    movie = trimws(as.character(movie)),
    rating = as.numeric(rating)
  ) |>
  filter(!is.na(name), !is.na(movie), !is.na(rating)) |>
  distinct(name, movie, .keep_all = TRUE)

kable(head(ratings_clean, 15), caption = "Cleaned ratings (first 15 rows)")
Cleaned ratings (first 15 rows)
name movie rating
Fedie Jurassic World 4
Fedie The Dark Knight 5
Fedie The Godfather 3
Fedie Avatar 4
Fedie Avengers: Endgame 5
Fedie The Old Guard 3
Bishop Jurassic World 3
Bishop The Dark Knight 4
Bishop The Godfather 5
Bishop Avatar 3
Bishop Avengers: Endgame 4
Bishop The Old Guard 4
Tim Jurassic World 5
Tim The Dark Knight 5
Tim The Godfather 4

Global Baseline ingredients (μ, user averages, movie averages)

mu <- mean(ratings_clean$rating, na.rm = TRUE)

user_avg <- ratings_clean |>
  group_by(name) |>
  summarize(user_avg = mean(rating), .groups = "drop")

movie_avg <- ratings_clean |>
  group_by(movie) |>
  summarize(movie_avg = mean(rating), .groups = "drop")

kable(data.frame(global_mean = mu), caption = "Global mean (μ)")
Global mean (μ)
global_mean
3.966667
kable(user_avg, caption = "User averages")
User averages
name user_avg
Bishop 3.833333
Chris 4.166667
Cody 3.500000
Fedie 4.000000
Tim 4.333333
kable(movie_avg, caption = "Movie averages")
Movie averages
movie movie_avg
Avatar 3.8
Avengers: Endgame 4.4
Jurassic World 3.8
The Dark Knight 4.4
The Godfather 4.2
The Old Guard 3.2

Baseline predictions for all user × movie pairs

all_pairs <- tidyr::expand_grid(
  name = sort(unique(ratings_clean$name)),
  movie = sort(unique(ratings_clean$movie))
)

pred_baseline <- all_pairs |>
  left_join(user_avg,  by = "name") |>
  left_join(movie_avg, by = "movie") |>
  mutate(pred = user_avg + movie_avg - mu) |>
  left_join(ratings_clean, by = c("name","movie"))  # brings in actual ratings if they exist

kable(head(pred_baseline |> arrange(name, desc(pred)), 12),
      caption = "Sample baseline predictions with any existing ratings")
Sample baseline predictions with any existing ratings
name movie user_avg movie_avg pred rating
Bishop Avengers: Endgame 3.833333 4.4 4.266667 4
Bishop The Dark Knight 3.833333 4.4 4.266667 4
Bishop The Godfather 3.833333 4.2 4.066667 5
Bishop Avatar 3.833333 3.8 3.666667 3
Bishop Jurassic World 3.833333 3.8 3.666667 3
Bishop The Old Guard 3.833333 3.2 3.066667 4
Chris Avengers: Endgame 4.166667 4.4 4.600000 4
Chris The Dark Knight 4.166667 4.4 4.600000 4
Chris The Godfather 4.166667 4.2 4.400000 5
Chris Avatar 4.166667 3.8 4.000000 5
Chris Jurassic World 4.166667 3.8 4.000000 4
Chris The Old Guard 4.166667 3.2 3.400000 3

Recommendations: top-2 unseen per user

recs_unseen <- pred_baseline |>
  filter(is.na(rating)) |>
  group_by(name) |>
  slice_max(pred, n = 2, with_ties = FALSE) |>
  ungroup()

if (nrow(recs_unseen) == 0) {
  recs_unseen <- pred_baseline |>
    group_by(name) |>
    slice_max(pred, n = 2, with_ties = FALSE) |>
    ungroup()
}

kable(recs_unseen |> arrange(name, desc(pred)),
      caption = "Top-2 recommendations per user (Global Baseline)")
Top-2 recommendations per user (Global Baseline)
name movie user_avg movie_avg pred rating
Bishop Avengers: Endgame 3.833333 4.4 4.266667 4
Bishop The Dark Knight 3.833333 4.4 4.266667 4
Chris Avengers: Endgame 4.166667 4.4 4.600000 4
Chris The Dark Knight 4.166667 4.4 4.600000 4
Cody Avengers: Endgame 3.500000 4.4 3.933333 4
Cody The Dark Knight 3.500000 4.4 3.933333 4
Fedie Avengers: Endgame 4.000000 4.4 4.433333 5
Fedie The Dark Knight 4.000000 4.4 4.433333 5
Tim Avengers: Endgame 4.333333 4.4 4.766667 5
Tim The Dark Knight 4.333333 4.4 4.766667 5

(Optional) Quick visuals

ggplot(movie_avg, aes(x = reorder(movie, movie_avg), y = movie_avg)) +
  geom_col() + coord_flip() +
  labs(title = "Average rating by movie", x = "Movie", y = "Avg rating") +
  theme_minimal()

ggplot(user_avg, aes(x = reorder(name, user_avg), y = user_avg)) +
  geom_col() + coord_flip() +
  labs(title = "Average rating by user", x = "User", y = "Avg rating") +
  theme_minimal()