Data 607 - Week 11

Load the Required Packages:

Below, we load the packages required for data analysis.

library(knitr)
library(tidyverse)

Load the Movie Ratings Data:

Below, we load the movie ratings data from which we will build our recommender system and store it in a data frame.

# Location of the raw movie ratings CSV on GitHub.
my_url <- "https://raw.githubusercontent.com/geedoubledee/data607_week11/main/Week11_MovieRatings.csv"
# read_csv() (readr, attached through tidyverse) downloads and parses the file
# directly from the URL; column types are guessed from the data.
movie_ratings_df <- read_csv(my_url)
# Turn a raw column header into a syntactic-friendly name: parentheses are
# removed and every space becomes an underscore. Works on a single string or
# on a whole character vector.
fix_col_names <- function(s){
    without_parens <- gsub("[()]", "", s)
    gsub(" ", "_", without_parens)
}
# Clean every header with fix_col_names(), then move the "User" column into
# the row names so that the remaining columns are all movie ratings.
cleaned_names <- vapply(colnames(movie_ratings_df), fix_col_names,
                        character(1), USE.NAMES = FALSE)
colnames(movie_ratings_df) <- cleaned_names
movie_ratings_df <- column_to_rownames(movie_ratings_df, var = "User")

Split the Data into Two Data Frames and Calculate the Mean Ratings by User and by Movie:

We create two additional data frames from our data, one to capture the mean ratings by user:

# Baseline: the mean of every observed rating (NAs ignored), rounded to two
# decimals. It is reused below as the reference point for users and movies.
baseline <- round(mean(as.matrix(movie_ratings_df), na.rm = TRUE), 2)
# Every column of movie_ratings_df is a movie (the users live in the row
# names), so take the movie list from the data instead of hard-coding it.
movie_cols <- colnames(movie_ratings_df)
movie_ratings_df_by_user <- movie_ratings_df %>%
    # drop = FALSE keeps a data frame even with a single movie column, which
    # rowMeans() requires.
    mutate(User_Mean = round(rowMeans(.[, movie_cols, drop = FALSE],
                                      na.rm = TRUE), 2),
           # rowMeans() yields NaN for users with no ratings; store NA instead
           # so that the offset below is NA as well.
           User_Mean = replace(User_Mean, is.nan(User_Mean), NA),
           User_Relative_to_Baseline = User_Mean - baseline)
kable(movie_ratings_df_by_user, format = "simple")

	M3GAN_2023	The_Menu_2022	Barbarian_2022	Glass_Onion_2022	Tár_2022	Aftersun_2022	User_Mean	User_Relative_to_Baseline
Glen	NA	4	5	4	5	3	4.20	0.35
Sebastian	NA	4	5	3	4	2	3.60	-0.25
Alex	NA	NA	NA	NA	NA	NA	NA	NA
Victoria	NA	NA	NA	3	NA	NA	3.00	-0.85
Javin	NA	4	4	3	NA	NA	3.67	-0.18
Matt	NA	NA	NA	NA	NA	4	4.00	0.15
Anne	NA	4	NA	3	5	NA	4.00	0.15
Grifin	NA	NA	NA	NA	NA	NA	NA	NA
Claire	NA	NA	NA	4	NA	NA	4.00	0.15
Vicki	NA	4	5	3	NA	NA	4.00	0.15
Dan	NA	2	5	NA	NA	NA	3.50	-0.35
Frankie	NA	NA	NA	3	5	NA	4.00	0.15

And another to capture the mean ratings by movie:

# Transpose the ratings so that each row is a movie and each column a user.
movie_ratings_df_by_movie <- as.data.frame(t(movie_ratings_df))
# After the transpose every column is a user, so take the user list from the
# data instead of hard-coding the names.
user_cols <- colnames(movie_ratings_df_by_movie)
movie_ratings_df_by_movie <- movie_ratings_df_by_movie %>%
    # drop = FALSE keeps a data frame even with a single user column, which
    # rowMeans() requires.
    mutate(Movie_Mean = round(rowMeans(.[, user_cols, drop = FALSE],
                                       na.rm = TRUE), 2),
           # rowMeans() yields NaN for movies nobody rated; store NA instead
           # so that the offset below is NA as well.
           Movie_Mean = replace(Movie_Mean, is.nan(Movie_Mean), NA),
           Movie_Relative_to_Baseline = Movie_Mean - baseline)
kable(movie_ratings_df_by_movie, format = "simple")

	Glen	Sebastian	Alex	Victoria	Javin	Matt	Anne	Grifin	Claire	Vicki	Dan	Frankie	Movie_Mean	Movie_Relative_to_Baseline
M3GAN_2023	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
The_Menu_2022	4	4	NA	NA	4	NA	4	NA	NA	4	2	NA	3.67	-0.18
Barbarian_2022	5	5	NA	NA	4	NA	NA	NA	NA	5	5	NA	4.80	0.95
Glass_Onion_2022	4	3	NA	3	3	NA	3	NA	4	3	NA	3	3.25	-0.60
Tár_2022	5	4	NA	NA	NA	NA	5	NA	NA	NA	NA	5	4.75	0.90
Aftersun_2022	3	2	NA	NA	NA	4	NA	NA	NA	NA	NA	NA	3.00	-0.85

Now we are ready to begin building our recommender system.

Build the Recommendations Data Frame:

We set the initial predicted rating for each movie a user has not yet seen/rated as 0, and we set the initial predicted rating of movies they have already seen/rated as NA. This will prevent users from being recommended movies they’ve already watched.

# Start from the raw ratings: unrated cells (NA) become 0, meaning "needs a
# prediction", and any positive rating becomes NA, meaning "already rated, do
# not recommend".
not_yet_rated <- is.na(movie_ratings_df)
recommendations_df <- movie_ratings_df
recommendations_df[not_yet_rated] <- 0
recommendations_df[recommendations_df > 0] <- NA
kable(recommendations_df, format = "simple")

	M3GAN_2023	The_Menu_2022	Barbarian_2022	Glass_Onion_2022	Tár_2022	Aftersun_2022
Glen	0	NA	NA	NA	NA	NA
Sebastian	0	NA	NA	NA	NA	NA
Alex	0	0	0	0	0	0
Victoria	0	0	0	NA	0	0
Javin	0	NA	NA	NA	0	0
Matt	0	0	0	0	0	NA
Anne	0	NA	0	NA	NA	0
Grifin	0	0	0	0	0	0
Claire	0	0	0	NA	0	0
Vicki	0	NA	NA	NA	0	0
Dan	0	NA	NA	0	0	0
Frankie	0	0	0	NA	NA	0

Calculate Predicted Ratings:

Next, we calculate predicted ratings for all the movies a user has not yet seen/rated. The baseline rating will be the average rating across all movies, and each movie’s predicted rating for each user will be the sum of the baseline, the difference in that movie’s average rating relative to the baseline, and the difference in that user’s average rating relative to the baseline. When a movie or a user has no ratings at all, the corresponding difference is missing and is simply left out of the sum, so the prediction falls back on the remaining terms.

# Offsets from the baseline, one per movie and one per user. They are used
# positionally: element j belongs to column j of recommendations_df and
# element i to row i.
movie_relative_to_baseline <- movie_ratings_df_by_movie$Movie_Relative_to_Baseline
user_relative_to_baseline <- movie_ratings_df_by_user$User_Relative_to_Baseline

# seq_len() (rather than 1:n) makes both loops no-ops when there are no rows
# or columns instead of iterating over c(1, 0).
for (i in seq_len(nrow(recommendations_df))){
    for (j in seq_len(ncol(recommendations_df))){
        current <- recommendations_df[i, j]
        # NA marks a movie the user has already rated; only cells still at 0
        # receive a prediction.
        if (!is.na(current) && current == 0){
            # na.rm = TRUE drops a missing movie or user offset, so the
            # prediction falls back on the baseline plus whatever is known.
            recommendations_df[i, j] <- sum(baseline,
                                            movie_relative_to_baseline[j],
                                            user_relative_to_baseline[i],
                                            na.rm = TRUE)
        }
    }
}
kable(recommendations_df, format = "simple")

	M3GAN_2023	The_Menu_2022	Barbarian_2022	Glass_Onion_2022	Tár_2022	Aftersun_2022
Glen	4.20	NA	NA	NA	NA	NA
Sebastian	3.60	NA	NA	NA	NA	NA
Alex	3.85	3.67	4.80	3.25	4.75	3.00
Victoria	3.00	2.82	3.95	NA	3.90	2.15
Javin	3.67	NA	NA	NA	4.57	2.82
Matt	4.00	3.82	4.95	3.40	4.90	NA
Anne	4.00	NA	4.95	NA	NA	3.15
Grifin	3.85	3.67	4.80	3.25	4.75	3.00
Claire	4.00	3.82	4.95	NA	4.90	3.15
Vicki	4.00	NA	NA	NA	4.90	3.15
Dan	3.50	NA	NA	2.90	4.40	2.65
Frankie	4.00	3.82	4.95	NA	NA	3.15

Recommend Movies with Highest Predicted Ratings for Each User:

Finally, we can recommend the movie each user should watch next based on our expectations of them liking it:

# Score only the predicted-rating columns, so that re-running this chunk after
# the Recommendation column has been added gives the same result.
rating_cols <- setdiff(colnames(recommendations_df), "Recommendation")
predicted_ratings <- as.matrix(recommendations_df[, rating_cols, drop = FALSE])
# Column index of the highest predicted rating for each user.
best_movie_idx <- vapply(seq_len(nrow(predicted_ratings)), function(i){
    scores <- predicted_ratings[i, ]
    # which.max() returns integer(0) when every score is NA (the user has
    # already rated every movie); report NA for that user instead of failing.
    if (all(is.na(scores))) NA_integer_ else unname(which.max(scores))
}, integer(1))
recommendations_df$Recommendation <- rating_cols[best_movie_idx]
# One-column summary table, labelled with the users taken from the row names
# of recommendations_df.
Recommendation <- data.frame(
    Recommendation = recommendations_df$Recommendation,
    row.names = rownames(recommendations_df),
    stringsAsFactors = FALSE)
kable(Recommendation, format ="simple", row.names = TRUE)

	Recommendation
Glen	M3GAN_2023
Sebastian	M3GAN_2023
Alex	Barbarian_2022
Victoria	Barbarian_2022
Javin	Tár_2022
Matt	Barbarian_2022
Anne	Barbarian_2022
Grifin	Barbarian_2022
Claire	Barbarian_2022
Vicki	Tár_2022
Dan	Tár_2022
Frankie	Barbarian_2022

Conclusions:

Because Glen and Sebastian had already seen and rated every other movie on the list, M3GAN was the only movie left to recommend to them.

All the other users were recommended to watch either Barbarian or Tár. This is not surprising given that these two movies had the highest average ratings in our data set, each nearly a full point above the overall average rating.

This recommender system could be enhanced by adding a movie-to-movie table that gave similarity scores from one movie to a fixed number of other movies. Genre would most likely be a large part of that similarity score, but shared directors/actors could also factor in. That would especially help when trying to recommend new movies that few users have seen/rated yet. It could also be enhanced by attempting to show users something they didn’t know they would like and bring them out of their movie silos. Generally, you only want to recommend those kinds of movies occasionally, so as not to make users feel like the algorithm doesn’t know them at all.