Project 1 Data 612

Project 1

This system recommends rom-com novels to readers.

Loading the data

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
romcom_books <- read.csv("https://raw.githubusercontent.com/samanthabarbaro/data612_recommender_systems/refs/heads/main/proj_1_dataset.csv", na.strings = c("", "NA", "null", "NULL"),  header = TRUE)

Splitting the data

Split the data into an 80/20 training and test set: I’ve pivoted longer so it’s easier to split up different ratings from the same users. With this format, using a random sample of the data set, some users are represented in the training data but not the test data.

pivoted_books <- romcom_books |>
  pivot_longer(
  cols = Carry.On:Attachments,
        names_to = "book",
        values_to = "rating"
    )

pivoted_books
# A tibble: 204 × 3
   X      book                           rating
   <chr>  <chr>                           <int>
 1 user 1 Carry.On                            5
 2 user 1 People.We.Meet.on.Vacation         NA
 3 user 1 Fangirl                             4
 4 user 1 Margot.s.Got.Money.Troubles        NA
 5 user 1 Rosaline.Palmer.Takes.the.Cake      5
 6 user 1 Red..White..and.Royal.Blue         NA
 7 user 1 Boyfriend.Material                 NA
 8 user 1 Eleanor.and.Park                    4
 9 user 1 The.Charm.Offensive                 4
10 user 1 Beach.Read                         NA
# ℹ 194 more rows

Create training and test data frames

set.seed(1122)

#remove NAs and rename the user column

pivoted_books_2 <- pivoted_books |>
  filter(!is.na(rating)) |>
  rename(user = X)
  

#create a training and test set

train_index <- sample(nrow(pivoted_books_2),
                      0.8 * nrow(pivoted_books_2))

train_books <- pivoted_books_2[train_index, ]

test_books <- pivoted_books_2[-train_index, ]

Calculating the raw average and RMSE for training and test data

#mean overall rating (raw average)

raw_average <- train_books |> 
  mutate(mean_rating = round(mean(rating),2))

error <- raw_average |>
  #calculate the difference (error)
  mutate(difference = (rating - mean_rating)) |>
  #square the errors
  mutate (square = (difference * difference))

error
# A tibble: 62 × 6
   user    book                        rating mean_rating difference square
   <chr>   <chr>                        <int>       <dbl>      <dbl>  <dbl>
 1 user 16 Margot.s.Got.Money.Troubles      5        3.23       1.77 3.13  
 2 user 3  Beach.Read                       3        3.23      -0.23 0.0529
 3 user 9  Margot.s.Got.Money.Troubles      3        3.23      -0.23 0.0529
 4 user 7  Beach.Read                       1        3.23      -2.23 4.97  
 5 user 5  Beach.Read                       2        3.23      -1.23 1.51  
 6 user 2  Attachments                      2        3.23      -1.23 1.51  
 7 user 15 Beach.Read                       3        3.23      -0.23 0.0529
 8 user 15 Fangirl                          4        3.23       0.77 0.593 
 9 user 10 Boyfriend.Material               2        3.23      -1.23 1.51  
10 user 4  Carry.On                         4        3.23       0.77 0.593 
# ℹ 52 more rows
#number of rows in the training set = 62
count(train_books)
# A tibble: 1 × 1
      n
  <int>
1    62
#using 62 is not dynamic (does not account for adjusted sizes of training sets)
#which is fine for this one instance, but could be substituted with count(train_books)

#calculate rmse
rmse_training <- error |> 
  #sum the squared errors
  summarise(total_sq = sum(square)) |>
  #take the mean and then the sq root
  mutate(rmse = (sqrt(total_sq/62))) 

#the rmse for the training set is 1.16
rmse_training
# A tibble: 1 × 2
  total_sq  rmse
     <dbl> <dbl>
1     82.8  1.16
#number of rows in the test set is 16
count(test_books)
# A tibble: 1 × 1
      n
  <int>
1    16
#calculate RMSE for test set
rmse_test <- test_books |> 
  mutate(mean_rating = 3.23) |>
  mutate(difference = (rating - mean_rating)) |>
  mutate(square = (difference*difference))

rmse_test <- rmse_test |>
    summarise(total_sq = sum(square)) |>
  #take the mean and then the sq root
  mutate(rmse = (sqrt(total_sq/16))) 

#RMSE for the test set is 1.295
rmse_test
# A tibble: 1 × 2
  total_sq  rmse
     <dbl> <dbl>
1     26.8  1.30

Calculate the bias for each user and each item

The bias for each user is the mean of their ratings - the mean across all books (how much higher or lower they tend to rate something).

The bias for each item is how much higher or lower the average rating is than the over all average rating.

User bias = user average - raw average

Item/book bias = movie average - raw average

#mean for each book, user, and overall
item_user_mean <- train_books |> 
  #mean for each book
  group_by(book) |> 
  mutate(book_mean = round(mean(rating),2)) |> 
  ungroup() |>
  #mean for each user
  group_by(user) |> 
  mutate(user_mean = round(mean(rating),2)) |> 
  ungroup() |>
  mutate(overall_mean = 3.23)

#calculate the biases

# user bias = user mean - overall mean (across all books)

#item bias = item mean - overall mean (across all books)

#creating a key that I will join to the original pivoted data set, which includes all
#item/user combos 
user_biases <- item_user_mean |> 
  mutate(user_bias = (user_mean - overall_mean)) |>
  distinct(user, user_bias)

item_biases <- item_user_mean |> 
  mutate(item_bias = (book_mean - overall_mean)) |>
  distinct(book, item_bias)

Calculating baseline predictors

Finding the baseline predictors for every user-item combination.

Baseline predictor = raw average + user bias + movie bias

book_key <- pivoted_books |> 
  rename(user = X) |>
  mutate(overall_mean = 3.23) |>
  left_join(user_biases, by = "user") |>
  left_join(item_biases, by = "book")

predictions <- book_key |>
  mutate(baseline_predictor = (overall_mean + user_bias + item_bias)) |>
  #if it's larger than 5, then 5
  mutate(baseline_predictor = if_else(baseline_predictor > 5, 5, baseline_predictor))

predictions
# A tibble: 204 × 7
   user   book        rating overall_mean user_bias item_bias baseline_predictor
   <chr>  <chr>        <int>        <dbl>     <dbl>     <dbl>              <dbl>
 1 user 1 Carry.On         5         3.23      1.17      0.48               4.88
 2 user 1 People.We.…     NA         3.23      1.17     -0.56               3.84
 3 user 1 Fangirl          4         3.23      1.17      0.37               4.77
 4 user 1 Margot.s.G…     NA         3.23      1.17      0.77               5   
 5 user 1 Rosaline.P…      5         3.23      1.17      0.27               4.67
 6 user 1 Red..White…     NA         3.23      1.17     -1.23               3.17
 7 user 1 Boyfriend.…     NA         3.23      1.17      0.27               4.67
 8 user 1 Eleanor.an…      4         3.23      1.17      0.77               5   
 9 user 1 The.Charm.…      4         3.23      1.17     -0.23               4.17
10 user 1 Beach.Read      NA         3.23      1.17     -0.61               3.79
# ℹ 194 more rows

Select from the pivoted longer data set (before removing NAs) just the user and the movie column, then join with the movie average rating:

train_predictions <- train_books |>
  left_join(predictions, by = c("user", "book"))

train_predictions
# A tibble: 62 × 8
   user    book               rating.x rating.y overall_mean user_bias item_bias
   <chr>   <chr>                 <int>    <int>        <dbl>     <dbl>     <dbl>
 1 user 16 Margot.s.Got.Mone…        5        5         3.23    1.1         0.77
 2 user 3  Beach.Read                3        3         3.23    0.0200     -0.61
 3 user 9  Margot.s.Got.Mone…        3        3         3.23   -1.06        0.77
 4 user 7  Beach.Read                1        1         3.23   -0.73       -0.61
 5 user 5  Beach.Read                2        2         3.23   -1.23       -0.61
 6 user 2  Attachments               2        2         3.23   -1.23       -0.56
 7 user 15 Beach.Read                3        3         3.23    0.0200     -0.61
 8 user 15 Fangirl                   4        4         3.23    0.0200      0.37
 9 user 10 Boyfriend.Material        2        2         3.23   -1.23        0.27
10 user 4  Carry.On                  4        4         3.23    1.17        0.48
# ℹ 52 more rows
# ℹ 1 more variable: baseline_predictor <dbl>
test_predictions <- test_books |> 
   left_join(predictions, by = c("user", "book"))

test_predictions
# A tibble: 16 × 8
   user    book               rating.x rating.y overall_mean user_bias item_bias
   <chr>   <chr>                 <int>    <int>        <dbl>     <dbl>     <dbl>
 1 user 1  Attachments               3        3         3.23    1.17       -0.56
 2 user 2  Boyfriend.Material        3        3         3.23   -1.23        0.27
 3 user 3  Margot.s.Got.Mone…        4        4         3.23    0.0200      0.77
 4 user 3  Boyfriend.Material        4        4         3.23    0.0200      0.27
 5 user 3  Eleanor.and.Park          4        4         3.23    0.0200      0.77
 6 user 4  Red..White..and.R…        3        3         3.23    1.17       -1.23
 7 user 5  The.Charm.Offensi…        2        2         3.23   -1.23       -0.23
 8 user 5  Attachments               3        3         3.23   -1.23       -0.56
 9 user 7  Red..White..and.R…        1        1         3.23   -0.73       -1.23
10 user 10 People.We.Meet.on…        1        1         3.23   -1.23       -0.56
11 user 10 Attachments               1        1         3.23   -1.23       -0.56
12 user 11 Red..White..and.R…        4        4         3.23    0.27       -1.23
13 user 11 Eleanor.and.Park          3        3         3.23    0.27        0.77
14 user 14 Rosaline.Palmer.T…        5        5         3.23    0.77        0.27
15 user 14 Anna.and.the.Fren…        5        5         3.23    0.77        0.44
16 user 15 Eleanor.and.Park          2        2         3.23    0.0200      0.77
# ℹ 1 more variable: baseline_predictor <dbl>

Calculating RMSE for the baseline predictors for both the training and test data:

#calculate the difference and square the difference

train_predictions <- train_predictions |>
  mutate(difference = rating.x - baseline_predictor) |>
  mutate(square = (difference*difference))

rmse_training_baseline <- train_predictions |> 
  #sum the squared errors
  summarise(total_sq = sum(square)) |>
  #take the mean and then the sq root
  mutate(rmse = (sqrt(total_sq/62))) 

#the rmse for the training set is .55
rmse_training_baseline
# A tibble: 1 × 2
  total_sq  rmse
     <dbl> <dbl>
1     18.8 0.550

RMSE for test set:

test_predictions <- test_predictions |>
  mutate(difference = rating.x - baseline_predictor) |>
  mutate(square = (difference*difference))

rmse_test_baseline <- test_predictions |> 
  #sum the squared errors
  summarise(total_sq = sum(square)) |>
  #take the mean and then the sq root
  mutate(rmse = (sqrt(total_sq/16))) 

#the rmse for the test set is much higher at .935
rmse_test_baseline
# A tibble: 1 × 2
  total_sq  rmse
     <dbl> <dbl>
1     14.0 0.935

Summary

The RMSE was highest when using the raw average, which doesn’t account for differences across users or media at 1.295 on the test data and 1.16 on the training data.

Results were slightly better when accounting for user and book bias. When I calculated the predicted rating for the test set, the RMSE was only .55. However, it was much higher for the test set at .935 (almost a full point). When checking the calculations, one large difference (a low rating of a generally highly rated book by user 15, a high rater) may have introduced a lot of error.

The percent improvement on test data, when switching from a raw average method to a baseline predictor method, was about 28% on the test data and about 52% on the training data.

test_improvment <- (1 - .935 /1.295)
training_improvement <- (1 - .555/1.16)
test_improvment
[1] 0.2779923
training_improvement
[1] 0.5215517