library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)

Overview

This lab focuses on recommender systems. Specifically, this lab creates a function that performs the Global Baseline Estimate algorithm to create a prediction for a movie rating.

Get the Data

The data frame below contains the survey data I collected. I removed one rating for Wicked (critic 5's, i.e. u5's) so that it can be predicted:

# Survey ratings: one row per critic, one column per movie.
# Start from the critic ids, then attach each movie's ratings in turn.
df <- data.frame(Critic = c(1, 2, 3, 4, 5))
df$Barbie <- c(5, 4, 4, 3, 4)
df$Oppenheimer <- c(3, 2, 2, 3, 4)
df$TopGunMaverick <- c(3, 3, 3, 5, 3)
df$AvatarTheWayOfWater <- c(2, 5, 5, 4, 4)
# Critic 5's rating is left as NA: it is the value to be predicted.
df$Wicked <- c(4, 3, 4, 3, NA)
df$GetOut <- c(5, 5, 4, 4, 5)

I then turned the data frame into a matrix:

# Keep only the rating columns. A logical mask is used instead of
# `-which(...)`, because `-which()` selects zero columns when nothing matches.
movie_matrix <- as.matrix(df[, names(df) != "Critic", drop = FALSE])
# Label each row with its critic id ("u1", "u2", ...) rather than typing the
# labels by hand, so they always agree with the rows of `df`.
rownames(movie_matrix) <- paste0("u", df$Critic)
# Convert to recommenderlab's rating-matrix class; NA cells become unrated entries.
r <- as(movie_matrix, "realRatingMatrix")
r
## 5 x 6 rating matrix of class 'realRatingMatrix' with 29 ratings.
getRatingMatrix(r)
## 5 x 6 sparse Matrix of class "dgCMatrix"
##    Barbie Oppenheimer TopGunMaverick AvatarTheWayOfWater Wicked GetOut
## u1      5           3              3                   2      4      5
## u2      4           2              3                   5      3      5
## u3      4           2              3                   5      4      4
## u4      3           3              5                   4      3      4
## u5      4           4              3                   4      .      5

Creating the Recommender

I first found the overall mean movie rating for the entire matrix:

# Overall mean of every observed rating in the matrix (NA cells are skipped)
mean_movie_rating_overall <- mean(movie_matrix, na.rm = TRUE)
mean_movie_rating_overall
## [1] 3.724138

I then created new data frames related to average movie ratings for each user and movie:

# Mean rating for each movie. Movie names come from the matrix columns so the
# labels cannot drift out of sync with the data; unname() keeps the default
# 1..n row names on the data frame.
movie_mean_ratings <- data.frame(
  movie = colnames(movie_matrix),
  mean_rating = unname(colMeans(movie_matrix, na.rm = TRUE))
)
# Movie offset = movie mean - overall mean rating
movie_mean_ratings$relative_to_avg <- movie_mean_ratings$mean_rating - mean_movie_rating_overall
movie_mean_ratings
# Mean rating for each user. User ids come from the matrix row names.
user_mean_ratings <- data.frame(
  user = rownames(movie_matrix),
  mean_rating = unname(rowMeans(movie_matrix, na.rm = TRUE))
)
# User offset = user mean - overall mean rating
user_mean_ratings$relative_to_avg <- user_mean_ratings$mean_rating - mean_movie_rating_overall
user_mean_ratings

I then created a function predict_rating which uses the Global Baseline Estimate algorithm:

# Global Baseline Estimate = Mean Movie Rating + Selected Movie's rating relative to average + User's rating relative to average
#
# Arguments:
#   user1                     - a single user id, looked up in user_mean_ratings$user
#   movie1                    - a single movie name, looked up in movie_mean_ratings$movie
#   mean_movie_rating_overall - the overall mean rating across all observed ratings
#   movie_mean_ratings        - data frame with columns `movie` and `relative_to_avg`
#   user_mean_ratings         - data frame with columns `user` and `relative_to_avg`
# Returns: a single numeric predicted rating.
# Errors:  if `user1` / `movie1` is not a single value, or does not match
#          exactly one row of its lookup table.
predict_rating <- function(user1, movie1, mean_movie_rating_overall, movie_mean_ratings, user_mean_ratings) {
  stopifnot(length(user1) == 1L, length(movie1) == 1L)

  # which() ignores NA comparisons, so NA labels in the tables never match.
  movie_idx <- which(movie_mean_ratings[["movie"]] == movie1)
  user_idx <- which(user_mean_ratings[["user"]] == user1)

  # Fail loudly rather than silently returning numeric(0) (no match) or
  # several values (duplicated labels).
  if (length(movie_idx) != 1L) {
    stop("Expected exactly one row for movie '", movie1, "' in movie_mean_ratings, found ",
         length(movie_idx), call. = FALSE)
  }
  if (length(user_idx) != 1L) {
    stop("Expected exactly one row for user '", user1, "' in user_mean_ratings, found ",
         length(user_idx), call. = FALSE)
  }

  mean_movie_rating_overall +
    movie_mean_ratings[["relative_to_avg"]][movie_idx] +
    user_mean_ratings[["relative_to_avg"]][user_idx]
}

To predict the unknown movie rating for u5 and Wicked, we would call predict_rating:

# Estimate the rating `u5` would give `Wicked`
predict_rating(
  user1 = "u5",
  movie1 = "Wicked",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 3.775862

The predicted rating for Wicked is 3.775862, which is slightly above the overall mean rating of 3.724138, so Wicked would be recommended to u5.

Let’s try a similar data frame that adds the movie Conclave, which has two missing ratings, and rerun the same steps:

# Second survey table: same critics and movies as before, plus Conclave.
# Start from the critic ids, then attach each movie's ratings in turn.
df <- data.frame(Critic = c(1, 2, 3, 4, 5))
df$Barbie <- c(5, 4, 4, 3, 4)
df$Oppenheimer <- c(3, 2, 2, 3, 4)
df$TopGunMaverick <- c(3, 3, 3, 5, 3)
df$AvatarTheWayOfWater <- c(2, 5, 5, 4, 4)
df$Wicked <- c(4, 3, 4, 3, NA)
df$GetOut <- c(5, 5, 4, 4, 5)
# Critics 2 and 5 have no Conclave rating (NA).
df$Conclave <- c(2, NA, 1, 2, NA)
# Keep only the rating columns. A logical mask is used instead of
# `-which(...)`, because `-which()` selects zero columns when nothing matches.
movie_matrix <- as.matrix(df[, names(df) != "Critic", drop = FALSE])
# Label each row with its critic id ("u1", "u2", ...) rather than typing the
# labels by hand, so they always agree with the rows of `df`.
rownames(movie_matrix) <- paste0("u", df$Critic)
# Convert to recommenderlab's rating-matrix class; NA cells become unrated entries.
r <- as(movie_matrix, "realRatingMatrix")
r
## 5 x 7 rating matrix of class 'realRatingMatrix' with 32 ratings.
getRatingMatrix(r)
## 5 x 7 sparse Matrix of class "dgCMatrix"
##    Barbie Oppenheimer TopGunMaverick AvatarTheWayOfWater Wicked GetOut Conclave
## u1      5           3              3                   2      4      5        2
## u2      4           2              3                   5      3      5        .
## u3      4           2              3                   5      4      4        1
## u4      3           3              5                   4      3      4        2
## u5      4           4              3                   4      .      5        .
# Overall mean of every observed rating in the matrix (NA cells are skipped)
mean_movie_rating_overall <- mean(movie_matrix, na.rm = TRUE)

# Mean rating for each movie. Movie names come from the matrix columns so the
# labels cannot drift out of sync with the data; unname() keeps the default
# 1..n row names on the data frame.
movie_mean_ratings <- data.frame(
  movie = colnames(movie_matrix),
  mean_rating = unname(colMeans(movie_matrix, na.rm = TRUE))
)
# Movie offset = movie mean - overall mean rating
movie_mean_ratings$relative_to_avg <- movie_mean_ratings$mean_rating - mean_movie_rating_overall
movie_mean_ratings
# Mean rating for each user. User ids come from the matrix row names.
user_mean_ratings <- data.frame(
  user = rownames(movie_matrix),
  mean_rating = unname(rowMeans(movie_matrix, na.rm = TRUE))
)
# User offset = user mean - overall mean rating
user_mean_ratings$relative_to_avg <- user_mean_ratings$mean_rating - mean_movie_rating_overall
user_mean_ratings
# Estimate the ratings `u2` and `u5` would give `Conclave`
predict_rating(
  user1 = "u2",
  movie1 = "Conclave",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 1.802083
predict_rating(
  user1 = "u5",
  movie1 = "Conclave",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 2.135417

u2’s predicted rating for Conclave is 1.802083 and u5’s predicted rating for Conclave is 2.135417. Both are well below the overall mean rating of 3.53125, so Conclave would probably not be recommended to either user.

Conclusions

This lab provided a good example of creating an algorithm to show how a prediction can be made using averages. This algorithm used all of the existing data to make a prediction for a user.