library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
This lab focuses on recommender systems. Specifically, this lab creates a function that performs the Global Baseline Estimate algorithm to create a prediction for a movie rating.
The data frame below contains the survey data I collected, with one rating for `Wicked` removed so that it can be predicted:
# Survey ratings: one row per critic, one column per movie.
# u5's rating for Wicked is left as NA so that it can be predicted later.
df <- data.frame(
  Critic              = c(1, 2, 3, 4, 5),
  Barbie              = c(5, 4, 4, 3, 4),
  Oppenheimer         = c(3, 2, 2, 3, 4),
  TopGunMaverick      = c(3, 3, 3, 5, 3),
  AvatarTheWayOfWater = c(2, 5, 5, 4, 4),
  Wicked              = c(4, 3, 4, 3, NA),
  GetOut              = c(5, 5, 4, 4, 5)
)
I then turned the data frame into a matrix:
# Keep every column except the critic identifier. Selecting by a logical mask
# (rather than `-which(...)`) still keeps all columns if "Critic" is absent,
# and `drop = FALSE` keeps a data frame even when a single movie remains.
movie_matrix <- as.matrix(df[, names(df) != "Critic", drop = FALSE])
# Label rows "u1", "u2", ... by position so the labels follow the row count.
rownames(movie_matrix) <- paste0("u", seq_len(nrow(movie_matrix)))
# Convert to recommenderlab's rating-matrix class and show it.
r <- as(movie_matrix, "realRatingMatrix")
r
## 5 x 6 rating matrix of class 'realRatingMatrix' with 29 ratings.
getRatingMatrix(r)
## 5 x 6 sparse Matrix of class "dgCMatrix"
## Barbie Oppenheimer TopGunMaverick AvatarTheWayOfWater Wicked GetOut
## u1 5 3 3 2 4 5
## u2 4 2 3 5 3 5
## u3 4 2 3 5 4 4
## u4 3 3 5 4 3 4
## u5 4 4 3 4 . 5
I first found the overall mean movie rating for the entire matrix:
# Find the mean movie rating over all observed (non-NA) ratings in the matrix
mean_movie_rating_overall <- mean(movie_matrix, na.rm = TRUE)
mean_movie_rating_overall
## [1] 3.724138
I then created new data frames related to average movie ratings for each user and movie:
# Find the mean for each movie. Names and means are both taken from
# `movie_matrix`, so the table cannot drift out of sync with the data.
# `unname()` keeps the default 1..n row names on the data frame.
movie_mean_ratings <- data.frame(
  movie = colnames(movie_matrix),
  mean_rating = unname(colMeans(movie_matrix, na.rm = TRUE))
)
# Find the mean diff -- movie avg - overall mean movie rating
movie_mean_ratings$relative_to_avg <-
  movie_mean_ratings$mean_rating - mean_movie_rating_overall
movie_mean_ratings
# Find the mean for each user, labelled by the matrix row names
user_mean_ratings <- data.frame(
  user = rownames(movie_matrix),
  mean_rating = unname(rowMeans(movie_matrix, na.rm = TRUE))
)
# Find the mean diff -- user avg - overall mean movie rating
user_mean_ratings$relative_to_avg <-
  user_mean_ratings$mean_rating - mean_movie_rating_overall
user_mean_ratings
I then created a function, `predict_rating`, which uses the Global Baseline Estimate algorithm:
# Global Baseline Estimate = Mean Movie Rating + Selected Movie's rating relative to average + User's rating relative to average
#
# Arguments:
#   user1                     - label of the user to predict for; matched
#                               against user_mean_ratings$user
#   movie1                    - name of the movie to predict; matched against
#                               movie_mean_ratings$movie
#   mean_movie_rating_overall - overall mean rating across all observations
#   movie_mean_ratings        - data frame with columns `movie` and
#                               `relative_to_avg`
#   user_mean_ratings         - data frame with columns `user` and
#                               `relative_to_avg`
#
# Returns the predicted rating as a single numeric value.
# Stops with an error when the user or movie does not match exactly one row,
# instead of silently returning a zero-length result.
predict_rating <- function(user1, movie1, mean_movie_rating_overall, movie_mean_ratings, user_mean_ratings) {
  stopifnot(length(user1) == 1L, length(movie1) == 1L)

  # Look up one row's offset by key. Base indexing is used so that a column in
  # the table can never shadow the `user1` / `movie1` arguments.
  lookup_offset <- function(tbl, key_col, key, label) {
    hits <- which(tbl[[key_col]] == key)
    if (length(hits) != 1L) {
      stop(
        sprintf(
          "Expected exactly one %s matching '%s', found %d.",
          label, format(key), length(hits)
        ),
        call. = FALSE
      )
    }
    tbl[["relative_to_avg"]][hits]
  }

  movie_offset <- lookup_offset(movie_mean_ratings, "movie", movie1, "movie")
  user_offset <- lookup_offset(user_mean_ratings, "user", user1, "user")

  mean_movie_rating_overall + movie_offset + user_offset
}
To predict the unknown rating that `u5` would give `Wicked`, we call `predict_rating`:
# Estimate the rating `u5` would give `Wicked` from the baseline tables
predict_rating(
  user1 = "u5",
  movie1 = "Wicked",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 3.775862
The predicted rating for `Wicked` is 3.775862, which is high (slightly above the overall mean rating of 3.724138), so it would be recommended as a movie.
Let’s try a similar data frame that also has missing ratings for the movie `Conclave`, and run the same lines of code as above:
# Same survey as before plus `Conclave`, which u2 and u5 have not rated.
df <- data.frame(
  Critic              = c(1, 2, 3, 4, 5),
  Barbie              = c(5, 4, 4, 3, 4),
  Oppenheimer         = c(3, 2, 2, 3, 4),
  TopGunMaverick      = c(3, 3, 3, 5, 3),
  AvatarTheWayOfWater = c(2, 5, 5, 4, 4),
  Wicked              = c(4, 3, 4, 3, NA),
  GetOut              = c(5, 5, 4, 4, 5),
  Conclave            = c(2, NA, 1, 2, NA)
)
# Keep every column except the critic identifier. Selecting by a logical mask
# (rather than `-which(...)`) still keeps all columns if "Critic" is absent,
# and `drop = FALSE` keeps a data frame even when a single movie remains.
movie_matrix <- as.matrix(df[, names(df) != "Critic", drop = FALSE])
# Label rows "u1", "u2", ... by position so the labels follow the row count.
rownames(movie_matrix) <- paste0("u", seq_len(nrow(movie_matrix)))
# Convert to recommenderlab's rating-matrix class and show it.
r <- as(movie_matrix, "realRatingMatrix")
r
## 5 x 7 rating matrix of class 'realRatingMatrix' with 32 ratings.
getRatingMatrix(r)
## 5 x 7 sparse Matrix of class "dgCMatrix"
## Barbie Oppenheimer TopGunMaverick AvatarTheWayOfWater Wicked GetOut Conclave
## u1 5 3 3 2 4 5 2
## u2 4 2 3 5 3 5 .
## u3 4 2 3 5 4 4 1
## u4 3 3 5 4 3 4 2
## u5 4 4 3 4 . 5 .
# Find the mean movie rating over all observed (non-NA) ratings in the matrix
mean_movie_rating_overall <- mean(movie_matrix, na.rm = TRUE)
# Find the mean for each movie. Names and means are both taken from
# `movie_matrix`, so the table cannot drift out of sync with the data.
# `unname()` keeps the default 1..n row names on the data frame.
movie_mean_ratings <- data.frame(
  movie = colnames(movie_matrix),
  mean_rating = unname(colMeans(movie_matrix, na.rm = TRUE))
)
# Find the mean diff -- movie avg - overall mean movie rating
movie_mean_ratings$relative_to_avg <-
  movie_mean_ratings$mean_rating - mean_movie_rating_overall
movie_mean_ratings
# Find the mean for each user, labelled by the matrix row names
user_mean_ratings <- data.frame(
  user = rownames(movie_matrix),
  mean_rating = unname(rowMeans(movie_matrix, na.rm = TRUE))
)
# Find the mean diff -- user avg - overall mean movie rating
user_mean_ratings$relative_to_avg <-
  user_mean_ratings$mean_rating - mean_movie_rating_overall
user_mean_ratings
# Estimate the ratings `u2` and `u5` would give `Conclave`
predict_rating(
  user1 = "u2",
  movie1 = "Conclave",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 1.802083
predict_rating(
  user1 = "u5",
  movie1 = "Conclave",
  mean_movie_rating_overall = mean_movie_rating_overall,
  movie_mean_ratings = movie_mean_ratings,
  user_mean_ratings = user_mean_ratings
)
## [1] 2.135417
`u2`’s predicted rating for `Conclave` is 1.802083 and `u5`’s predicted rating for `Conclave` is 2.135417, so `Conclave` might not be recommended for these users.
This lab provided a good example of creating an algorithm to show how a prediction can be made using averages. This algorithm used all of the existing data to make a prediction for a user.