DATA607 Assignment

                                Global Baseline Estimates

Problem statement: Most recommender systems use personalized algorithms like “content management” and “item-item collaborative filtering.” Sometimes non-personalized recommenders are also useful or necessary. One of the best non-personalized recommender system algorithms is the “Global Baseline Estimate.”

The task here is to use the collected survey data to write R code that makes a movie recommendation using the Global Baseline Estimate algorithm.

# Load necessary libraries
library(dplyr)
library(tidyr)

Step 1: Create the Movie Ratings Data Frame

# Survey results in wide form: one row per critic, one column per movie.
# NA marks a movie the critic did not rate.
movie_ratings <- data.frame(
  Critic             = c("Vivien", "Carrie", "Julie", "Drew"),
  Gone_with_the_Wind = c(NA_real_, 4, 2, 5),
  Star_Wars          = c(NA_real_, 5, 5, 4),
  Sound_of_Music     = c(5, 4, 2, NA_real_),
  ET                 = c(4, 3, NA_real_, NA_real_),
  stringsAsFactors   = FALSE  # keep critic names as plain character strings
)

# Show the wide table
print(movie_ratings)

##   Critic Gone_with_the_Wind Star_Wars Sound_of_Music ET
## 1 Vivien                 NA        NA              5  4
## 2 Carrie                  4         5              4  3
## 3  Julie                  2         5              2 NA
## 4   Drew                  5         4             NA NA

Step 2: Convert to Long Format

# Reshape the wide survey table into one row per (critic, movie) rating.
# The critic column is renamed up front so the pivot can emit the final
# column names (userId, movieId, rating) directly.
ratings_long <- movie_ratings %>%
  rename(userId = Critic) %>%
  pivot_longer(
    cols = -userId,
    names_to = "movieId",
    values_to = "rating"
  ) %>%
  filter(!is.na(rating))  # keep only movies the critic actually rated

# View the long format data
print(ratings_long)

## # A tibble: 11 × 3
##    userId movieId            rating
##    <chr>  <chr>               <dbl>
##  1 Vivien Sound_of_Music          5
##  2 Vivien ET                      4
##  3 Carrie Gone_with_the_Wind      4
##  4 Carrie Star_Wars               5
##  5 Carrie Sound_of_Music          4
##  6 Carrie ET                      3
##  7 Julie  Gone_with_the_Wind      2
##  8 Julie  Star_Wars               5
##  9 Julie  Sound_of_Music          2
## 10 Drew   Gone_with_the_Wind      5
## 11 Drew   Star_Wars               4

Step 3: Calculate the Global Average Rating (μ)

# Global average rating (mu): the mean of every observed rating,
# across all critics and movies.
mu <- mean(ratings_long[["rating"]], na.rm = TRUE)

# Report mu rounded to three decimals
print(paste("Global average rating (μ):", round(mu, digits = 3)))

## [1] "Global average rating (μ): 3.909"

Step 4: Calculate User Biases (bᵤ)

# Per-critic bias b_u: the average amount by which a critic's ratings
# deviate from the global average `mu`.
user_biases <- ratings_long %>%
  mutate(deviation = rating - mu) %>%
  group_by(userId) %>%
  summarise(b_u = mean(deviation, na.rm = TRUE), .groups = "drop")

# View user biases
print(user_biases)

## # A tibble: 4 × 2
##   userId     b_u
##   <chr>    <dbl>
## 1 Carrie  0.0909
## 2 Drew    0.591 
## 3 Julie  -0.909 
## 4 Vivien  0.591

Step 5: Calculate Movie Biases (bᵢ)

# Per-movie bias b_i: the average amount by which a movie's ratings
# deviate from the global average `mu`.
movie_biases <- ratings_long %>%
  mutate(deviation = rating - mu) %>%
  group_by(movieId) %>%
  summarise(b_i = mean(deviation, na.rm = TRUE), .groups = "drop")

# View movie biases
print(movie_biases)

## # A tibble: 4 × 2
##   movieId               b_i
##   <chr>               <dbl>
## 1 ET                 -0.409
## 2 Gone_with_the_Wind -0.242
## 3 Sound_of_Music     -0.242
## 4 Star_Wars           0.758

Step 6: Create All Possible User-Movie Combinations

# Every (critic, movie) pair, rated or not, so that a prediction can be
# produced for each cell of the ratings matrix.
#
# expand.grid() converts character vectors to factors by default. The bias
# tables key on character columns, so keep these keys as character too and
# avoid relying on implicit factor/character coercion in the later joins.
all_combinations <- expand.grid(
  userId = unique(ratings_long$userId),
  movieId = unique(ratings_long$movieId),
  stringsAsFactors = FALSE
)

# View all combinations
print(all_combinations)

##    userId            movieId
## 1  Vivien     Sound_of_Music
## 2  Carrie     Sound_of_Music
## 3   Julie     Sound_of_Music
## 4    Drew     Sound_of_Music
## 5  Vivien                 ET
## 6  Carrie                 ET
## 7   Julie                 ET
## 8    Drew                 ET
## 9  Vivien Gone_with_the_Wind
## 10 Carrie Gone_with_the_Wind
## 11  Julie Gone_with_the_Wind
## 12   Drew Gone_with_the_Wind
## 13 Vivien          Star_Wars
## 14 Carrie          Star_Wars
## 15  Julie          Star_Wars
## 16   Drew          Star_Wars

Step 7: Generate Predictions for All Combinations

Generate predictions for all possible critic-movie combinations using the formula: r̂ᵤᵢ = μ + bᵤ + bᵢ

# Global Baseline Estimate for every (critic, movie) pair:
#   predicted_rating = mu + b_u + b_i
predictions <- all_combinations %>%
  left_join(user_biases, by = "userId") %>%
  left_join(movie_biases, by = "movieId") %>%
  mutate(
    # A missing bias (no match in the bias tables) contributes nothing
    raw_estimate = mu + coalesce(b_u, 0) + coalesce(b_i, 0),
    # Clamp the estimate to the range [0, 5], then round to two decimals
    predicted_rating = round(pmax(0, pmin(5, raw_estimate)), 2)
  ) %>%
  select(userId, movieId, predicted_rating)

# View predictions
print(predictions)

##    userId            movieId predicted_rating
## 1  Vivien     Sound_of_Music             4.26
## 2  Carrie     Sound_of_Music             3.76
## 3   Julie     Sound_of_Music             2.76
## 4    Drew     Sound_of_Music             4.26
## 5  Vivien                 ET             4.09
## 6  Carrie                 ET             3.59
## 7   Julie                 ET             2.59
## 8    Drew                 ET             4.09
## 9  Vivien Gone_with_the_Wind             4.26
## 10 Carrie Gone_with_the_Wind             3.76
## 11  Julie Gone_with_the_Wind             2.76
## 12   Drew Gone_with_the_Wind             4.26
## 13 Vivien          Star_Wars             5.00
## 14 Carrie          Star_Wars             4.76
## 15  Julie          Star_Wars             3.76
## 16   Drew          Star_Wars             5.00

Step 8: Get Recommendations for Specific Users

#' Top-N movie recommendations for one critic
#'
#' Ranks the critic's predicted ratings from highest to lowest and returns
#' the first `n`. Movies the critic has already rated are excluded, because
#' recommending something they have already seen is not useful.
#'
#' @param user_id Name of the critic, as it appears in the `userId` column.
#' @param n Maximum number of recommendations to return (default 2).
#' @param predicted Data frame with columns `userId`, `movieId` and
#'   `predicted_rating`. Defaults to the `predictions` table built above.
#' @param observed Data frame of actual ratings with columns `userId` and
#'   `movieId`. Defaults to `ratings_long`.
#' @return A data frame with at most `n` rows (zero rows when the critic has
#'   already rated every movie), sorted by descending `predicted_rating`.
get_recommendations <- function(user_id, n = 2,
                                predicted = predictions,
                                observed = ratings_long) {
  predicted %>%
    filter(userId == user_id) %>%
    # Drop (critic, movie) pairs that already have an actual rating
    anti_join(observed, by = c("userId", "movieId")) %>%
    arrange(desc(predicted_rating)) %>%
    head(n)
}

# Get recommendations for each user
users <- unique(ratings_long$userId)
for (user in users) {
  cat("\nTop recommendations for", user, ":\n")
  recs <- get_recommendations(user)
  if (nrow(recs) == 0) {
    # The critic has rated every movie, so there is nothing new to suggest
    cat("(no unrated movies left to recommend)\n")
  } else {
    print(recs)
  }
}

## 
## Top recommendations for Vivien :
##   userId            movieId predicted_rating
## 1 Vivien          Star_Wars             5.00
## 2 Vivien Gone_with_the_Wind             4.26
## 
## Top recommendations for Carrie :
## (no unrated movies left to recommend)
## 
## Top recommendations for Julie :
##   userId movieId predicted_rating
## 1  Julie      ET             2.59
## 
## Top recommendations for Drew :
##   userId        movieId predicted_rating
## 1   Drew Sound_of_Music             4.26
## 2   Drew             ET             4.09

Step 9: Compare Predictions with Actual Ratings

# Put actual and predicted ratings side by side. The full join keeps the
# pairs that have no actual rating; their actual_rating is NA.
comparison <- ratings_long %>%
  rename(actual_rating = rating) %>%
  full_join(predictions, by = c("userId", "movieId"))

# View comparison
print(comparison)

## # A tibble: 16 × 4
##    userId movieId            actual_rating predicted_rating
##    <chr>  <chr>                      <dbl>            <dbl>
##  1 Vivien Sound_of_Music                 5             4.26
##  2 Vivien ET                             4             4.09
##  3 Carrie Gone_with_the_Wind             4             3.76
##  4 Carrie Star_Wars                      5             4.76
##  5 Carrie Sound_of_Music                 4             3.76
##  6 Carrie ET                             3             3.59
##  7 Julie  Gone_with_the_Wind             2             2.76
##  8 Julie  Star_Wars                      5             3.76
##  9 Julie  Sound_of_Music                 2             2.76
## 10 Drew   Gone_with_the_Wind             5             4.26
## 11 Drew   Star_Wars                      4             5   
## 12 Drew   Sound_of_Music                NA             4.26
## 13 Julie  ET                            NA             2.59
## 14 Drew   ET                            NA             4.09
## 15 Vivien Gone_with_the_Wind            NA             4.26
## 16 Vivien Star_Wars                     NA             5

Conclusion: This implementation demonstrated that even a simple non-personalized approach can provide meaningful recommendations by leveraging aggregate patterns in rating behavior. The global baseline model serves as an important benchmark in recommendation system evaluation - any more complex personalized algorithm should outperform this basic approach to be considered effective.

The assignment successfully illustrated how to transform raw rating data into actionable recommendations using fundamental statistical concepts, providing a solid foundation for understanding more sophisticated recommendation techniques.

DATA607 Assignment_3A

Mehreen Ali Gillani

2025-09-16