Setup
# If you need a package, install in the Console (not here):
# install.packages(c("readr","dplyr","tidyr","readxl","knitr","ggplot2"))
library(readr)
library(dplyr)
library(tidyr)
library(readxl)
library(knitr)
library(ggplot2)
Load data (CSV first; fallback to Excel)
csv_path <- file.path("data", "ratings_export.csv")
xlsx_path <- file.path("data", "MovieRatings.xlsx")
load_csv <- function(path) {
readr::read_csv(path, show_col_types = FALSE) |>
dplyr::select(name, movie, rating)
}
load_excel <- function(path) {
wide <- readxl::read_excel(path, sheet = "MovieRatings")
wide |>
dplyr::rename(name = 1) |>
tidyr::pivot_longer(-name, names_to = "movie", values_to = "rating") |>
dplyr::mutate(rating = as.numeric(rating))
}
if (file.exists(csv_path)) {
message("Loading CSV: ", csv_path)
ratings <- load_csv(csv_path)
} else if (file.exists(xlsx_path)) {
message("CSV not found. Loading Excel: ", xlsx_path)
ratings <- load_excel(xlsx_path)
} else {
stop("Neither 'data/ratings_export.csv' nor 'data/MovieRatings.xlsx' found. ",
"Please place at least one of them in the 'data' folder.")
}
## Loading CSV: data/ratings_export.csv
stopifnot(all(c("name","movie","rating") %in% names(ratings)))
glimpse(ratings)
## Rows: 30
## Columns: 3
## $ name <chr> "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Fedie", "Bishop",…
## $ movie <chr> "Jurassic World", "The Dark Knight", "The Godfather", "Avatar",…
## $ rating <dbl> 4, 5, 3, 4, 5, 3, 3, 4, 5, 3, 4, 4, 5, 5, 4, 4, 5, 3, 4, 4, 5, …
head(ratings, 10)
## # A tibble: 10 × 3
## name movie rating
## <chr> <chr> <dbl>
## 1 Fedie Jurassic World 4
## 2 Fedie The Dark Knight 5
## 3 Fedie The Godfather 3
## 4 Fedie Avatar 4
## 5 Fedie Avengers: Endgame 5
## 6 Fedie The Old Guard 3
## 7 Bishop Jurassic World 3
## 8 Bishop The Dark Knight 4
## 9 Bishop The Godfather 5
## 10 Bishop Avatar 3
Clean / standardize
ratings_clean <- ratings |>
mutate(
name = trimws(as.character(name)),
movie = trimws(as.character(movie)),
rating = as.numeric(rating)
) |>
filter(!is.na(name), !is.na(movie), !is.na(rating)) |>
distinct(name, movie, .keep_all = TRUE)
kable(head(ratings_clean, 15), caption = "Cleaned ratings (first 15 rows)")
Cleaned ratings (first 15 rows)
Fedie |
Jurassic World |
4 |
Fedie |
The Dark Knight |
5 |
Fedie |
The Godfather |
3 |
Fedie |
Avatar |
4 |
Fedie |
Avengers: Endgame |
5 |
Fedie |
The Old Guard |
3 |
Bishop |
Jurassic World |
3 |
Bishop |
The Dark Knight |
4 |
Bishop |
The Godfather |
5 |
Bishop |
Avatar |
3 |
Bishop |
Avengers: Endgame |
4 |
Bishop |
The Old Guard |
4 |
Tim |
Jurassic World |
5 |
Tim |
The Dark Knight |
5 |
Tim |
The Godfather |
4 |
Global Baseline ingredients (μ, user averages, movie averages)
mu <- mean(ratings_clean$rating, na.rm = TRUE)
user_avg <- ratings_clean |>
group_by(name) |>
summarize(user_avg = mean(rating), .groups = "drop")
movie_avg <- ratings_clean |>
group_by(movie) |>
summarize(movie_avg = mean(rating), .groups = "drop")
kable(data.frame(global_mean = mu), caption = "Global mean (μ)")
Global mean (μ)
3.966667 |
kable(user_avg, caption = "User averages")
User averages
Bishop |
3.833333 |
Chris |
4.166667 |
Cody |
3.500000 |
Fedie |
4.000000 |
Tim |
4.333333 |
kable(movie_avg, caption = "Movie averages")
Movie averages
Avatar |
3.8 |
Avengers: Endgame |
4.4 |
Jurassic World |
3.8 |
The Dark Knight |
4.4 |
The Godfather |
4.2 |
The Old Guard |
3.2 |
Baseline predictions for all user × movie pairs
all_pairs <- tidyr::expand_grid(
name = sort(unique(ratings_clean$name)),
movie = sort(unique(ratings_clean$movie))
)
pred_baseline <- all_pairs |>
left_join(user_avg, by = "name") |>
left_join(movie_avg, by = "movie") |>
mutate(pred = user_avg + movie_avg - mu) |>
left_join(ratings_clean, by = c("name","movie")) # brings in actual ratings if they exist
kable(head(pred_baseline |> arrange(name, desc(pred)), 12),
caption = "Sample baseline predictions with any existing ratings")
Sample baseline predictions with any existing ratings
Bishop |
Avengers: Endgame |
3.833333 |
4.4 |
4.266667 |
4 |
Bishop |
The Dark Knight |
3.833333 |
4.4 |
4.266667 |
4 |
Bishop |
The Godfather |
3.833333 |
4.2 |
4.066667 |
5 |
Bishop |
Avatar |
3.833333 |
3.8 |
3.666667 |
3 |
Bishop |
Jurassic World |
3.833333 |
3.8 |
3.666667 |
3 |
Bishop |
The Old Guard |
3.833333 |
3.2 |
3.066667 |
4 |
Chris |
Avengers: Endgame |
4.166667 |
4.4 |
4.600000 |
4 |
Chris |
The Dark Knight |
4.166667 |
4.4 |
4.600000 |
4 |
Chris |
The Godfather |
4.166667 |
4.2 |
4.400000 |
5 |
Chris |
Avatar |
4.166667 |
3.8 |
4.000000 |
5 |
Chris |
Jurassic World |
4.166667 |
3.8 |
4.000000 |
4 |
Chris |
The Old Guard |
4.166667 |
3.2 |
3.400000 |
3 |
Recommendations: top-2 unseen per user
recs_unseen <- pred_baseline |>
filter(is.na(rating)) |>
group_by(name) |>
slice_max(pred, n = 2, with_ties = FALSE) |>
ungroup()
if (nrow(recs_unseen) == 0) {
recs_unseen <- pred_baseline |>
group_by(name) |>
slice_max(pred, n = 2, with_ties = FALSE) |>
ungroup()
}
kable(recs_unseen |> arrange(name, desc(pred)),
caption = "Top-2 recommendations per user (Global Baseline)")
Top-2 recommendations per user (Global Baseline)
Bishop |
Avengers: Endgame |
3.833333 |
4.4 |
4.266667 |
4 |
Bishop |
The Dark Knight |
3.833333 |
4.4 |
4.266667 |
4 |
Chris |
Avengers: Endgame |
4.166667 |
4.4 |
4.600000 |
4 |
Chris |
The Dark Knight |
4.166667 |
4.4 |
4.600000 |
4 |
Cody |
Avengers: Endgame |
3.500000 |
4.4 |
3.933333 |
4 |
Cody |
The Dark Knight |
3.500000 |
4.4 |
3.933333 |
4 |
Fedie |
Avengers: Endgame |
4.000000 |
4.4 |
4.433333 |
5 |
Fedie |
The Dark Knight |
4.000000 |
4.4 |
4.433333 |
5 |
Tim |
Avengers: Endgame |
4.333333 |
4.4 |
4.766667 |
5 |
Tim |
The Dark Knight |
4.333333 |
4.4 |
4.766667 |
5 |
(Optional) Quick visuals
ggplot(movie_avg, aes(x = reorder(movie, movie_avg), y = movie_avg)) +
geom_col() + coord_flip() +
labs(title = "Average rating by movie", x = "Movie", y = "Avg rating") +
theme_minimal()

ggplot(user_avg, aes(x = reorder(name, user_avg), y = user_avg)) +
geom_col() + coord_flip() +
labs(title = "Average rating by user", x = "User", y = "Avg rating") +
theme_minimal()
