assignment 11

Approach

Prelim

I’m going to need to use the previous data which was created via assignment 3, so I brought over the code and the data. I modified the code to create an output called movie_ratings_fixed.csv that I will use for the recommender.

library(tidyverse)

df <- read.csv("https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_11/movie_ratings.csv") |>
  select(name, title, rating)

# Global mean
global_mean <- mean(df$rating, na.rm = TRUE)

# Rater effects
s_name <- df |>
  summarize(rater_mean = mean(rating, na.rm = TRUE), .by = name) |>
  mutate(rater_effect = rater_mean - global_mean)

# Movie/item effects
s_title <- df |>
  summarize(item_mean = mean(rating, na.rm = TRUE), .by = title) |>
  mutate(item_effect = item_mean - global_mean)

# Fill original NA ratings
df2 <- df |>
  left_join(s_name, by = "name") |>
  left_join(s_title, by = "title") |>
  mutate(
    rating = if_else(
      is.na(rating),
      round(global_mean + rater_effect + item_effect),
      rating
    )
  )

# Add Shawn, who has no ratings
df3 <- df2 |>
  distinct(title, item_effect) |>
  mutate(
    name = "Shawn",
    rater_effect = 0,
    rating = round(global_mean + item_effect)
  ) |>
  select(name, title, rating)

# Combine fixed original data + Shawn rows
movie_ratings_fixed <- df2 |>
  select(name, title, rating) |>
  bind_rows(df3)

# Write CSV
write.csv(movie_ratings_fixed, "movie_ratings_fixed.csv", row.names = FALSE)

movie_ratings_fixed

Main

I’ll utilize recommenderlab and I’ll do their UBCF method. The github has an easy to follow usage. So I’ll just use that and replace the data with the one developed in assignment 3.

df <- read.csv("https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_11/movie_ratings_fixed.csv")

train <- MovieLense100[1:300]
rec <- Recommender(train, method = "UBCF")
rec

pre <- predict(rec, MovieLense100[301:302], n = 5)
pre

scheme <- evaluationScheme(MovieLense100, method = "cross-validation", k = 10, given = -5,
    goodRating = 4)
scheme

Codebase

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(recommenderlab)
Warning: package 'recommenderlab' was built under R version 4.5.3
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: arules
Warning: package 'arules' was built under R version 4.5.3

Attaching package: 'arules'

The following object is masked from 'package:dplyr':

    recode

The following objects are masked from 'package:base':

    abbreviate, write

Loading required package: proxy
Warning: package 'proxy' was built under R version 4.5.3

Attaching package: 'proxy'

The following object is masked from 'package:Matrix':

    as.matrix

The following objects are masked from 'package:stats':

    as.dist, dist

The following object is masked from 'package:base':

    as.matrix
df <- read.csv("https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_11/movie_ratings.csv")

rating_matrix <- df |>
  select(name, title, rating) |>
  pivot_wider(names_from = title, values_from = rating, values_fill = NA) |>
  column_to_rownames("name") |>
  as.matrix() |>
  as("realRatingMatrix")

rec <- Recommender(rating_matrix, method = "UBCF")

top_rec <- predict(rec, rating_matrix, n = 1)
as(top_rec, "list")
$`0`
[1] "Sinners"

$`1`
[1] "The Materialist"

$`2`
[1] "One Battle After Another"

$`3`
[1] "Begonia"

$`4`
[1] "Wicked for Good"

$`5`
[1] "Sinners"
results <- tibble(
  user = rownames(rating_matrix),
  recommended = as(top_rec, "list")) |>
  unnest(recommended) |>
  rename(movie = recommended)

scheme <- evaluationScheme(rating_matrix, method = "cross-validation", k = 4, given = -2, goodRating = 4)

eval_results <- evaluate(scheme, method = "UBCF", type = "topNList", n = 1:5)
UBCF run fold/sample [model time/prediction time]
     1  [0sec/0.03sec] 
     2  [0sec/0sec] 
     3  [0.02sec/0sec] 
     4  [0sec/0sec] 
avg(eval_results)
            TP        FP        FN        TN N precision    recall       TPR
[1,] 0.4166667 0.3333333 1.0000000 1.2500000 3       NaN 0.3333333 0.3333333
[2,] 0.5000000 1.0000000 0.9166667 0.5833333 3       NaN 0.3750000 0.3750000
[3,] 1.0000000 1.0833333 0.4166667 0.5000000 3       NaN 0.7500000 0.7500000
[4,] 1.0000000 1.0833333 0.4166667 0.5000000 3       NaN 0.7500000 0.7500000
[5,] 1.0000000 1.0833333 0.4166667 0.5000000 3       NaN 0.7500000 0.7500000
           FPR n
[1,] 0.2083333 1
[2,] 0.6666667 2
[3,] 0.7083333 3
[4,] 0.7083333 4
[5,] 0.7083333 5
plot(eval_results)

Video

df |> arrange(name, desc(rating))
   name                    title release_year rating                   rated_at
1  Alex One Battle After Another         2025      5 2026-02-04 13:57:13.204884
2  Alex                  Begonia         2025      4 2026-02-04 13:57:13.204884
3  Alex          Wicked for Good         2025      4 2026-02-04 13:57:13.204884
4  Alex          The Materialist         2025      3 2026-02-04 13:57:13.204884
5  Alex                  Sinners         2025     NA 2026-02-04 13:57:13.204884
6   Bri          Wicked for Good         2025      5 2026-02-04 13:57:13.204884
7   Bri One Battle After Another         2025      4 2026-02-04 13:57:13.204884
8   Bri                  Sinners         2025      4 2026-02-04 13:57:13.204884
9   Bri                  Begonia         2025      3 2026-02-04 13:57:13.204884
10  Bri          The Materialist         2025     NA 2026-02-04 13:57:13.204884
11 Chen                  Begonia         2025      5 2026-02-04 13:57:13.204884
12 Chen          Wicked for Good         2025      4 2026-02-04 13:57:13.204884
13 Chen                  Sinners         2025      4 2026-02-04 13:57:13.204884
14 Chen          The Materialist         2025      3 2026-02-04 13:57:13.204884
15 Chen One Battle After Another         2025     NA 2026-02-04 13:57:13.204884
16 Devi                  Sinners         2025      5 2026-02-04 13:57:13.204884
17 Devi          The Materialist         2025      4 2026-02-04 13:57:13.204884
18 Devi One Battle After Another         2025      3 2026-02-04 13:57:13.204884
19 Devi          Wicked for Good         2025      3 2026-02-04 13:57:13.204884
20 Devi                  Begonia         2025     NA 2026-02-04 13:57:13.204884
21  Eli          The Materialist         2025      5 2026-02-04 13:57:13.204884
22  Eli One Battle After Another         2025      4 2026-02-04 13:57:13.204884
23  Eli                  Begonia         2025      3 2026-02-04 13:57:13.204884
24  Eli                  Sinners         2025      3 2026-02-04 13:57:13.204884
25  Eli          Wicked for Good         2025     NA 2026-02-04 13:57:13.204884
26 Fran One Battle After Another         2025      5 2026-02-04 13:57:13.204884
27 Fran          Wicked for Good         2025      4 2026-02-04 13:57:13.204884
28 Fran          The Materialist         2025      4 2026-02-04 13:57:13.204884
29 Fran                  Begonia         2025      3 2026-02-04 13:57:13.204884
30 Fran                  Sinners         2025     NA 2026-02-04 13:57:13.204884
results
# A tibble: 6 × 2
  user  movie                   
  <chr> <chr>                   
1 Alex  Sinners                 
2 Bri   The Materialist         
3 Chen  One Battle After Another
4 Devi  Begonia                 
5 Eli   Wicked for Good         
6 Fran  Sinners                 

The results show that the recommendations for each person are movies they didn’t watch, which is accurate to the model.

library(tidyverse)
library(recommenderlab)

df <- read.csv("https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_11/movie_ratings.csv")

Upon reviewing it makes little sense to do a recommendation model, probably should’ve done a missing value model instead.

rating_matrix <- df |>
  select(name, title, rating) |>
  pivot_wider(names_from = title, values_from = rating, values_fill = NA) |>
  column_to_rownames("name") |>
  as.matrix() |>
  as("realRatingMatrix")

rec <- Recommender(
  rating_matrix,
  method = "UBCF",
  parameter = list(method = "Cosine", nn = 3)
)

pred <- predict(
  rec,
  rating_matrix,
  type = "ratings"
)

original <- as(rating_matrix, "matrix")
predicted <- as(pred, "matrix")

missing <- is.na(original)

filled_missing_values <- tibble(
  name = rownames(original)[row(original)[missing]],
  title = colnames(original)[col(original)[missing]],
  predicted_rating = predicted[missing]
)

print(filled_missing_values)
# A tibble: 6 × 3
  name  title                    predicted_rating
  <chr> <chr>                               <dbl>
1 Chen  One Battle After Another             4.43
2 Devi  Begonia                              4.00
3 Eli   Wicked for Good                      4.28
4 Bri   The Materialist                      4.66
5 Alex  Sinners                              4   
6 Fran  Sinners                              4   
movie_means <- df |>
  group_by(title) |>
  summarise(
    mean_rating = mean(rating, na.rm = TRUE),
    n_ratings = sum(!is.na(rating)),
    .groups = "drop"
  ) |>
  arrange(desc(mean_rating))

print(movie_means)
# A tibble: 5 × 3
  title                    mean_rating n_ratings
  <chr>                          <dbl>     <int>
1 One Battle After Another         4.2         5
2 Sinners                          4           4
3 Wicked for Good                  4           5
4 The Materialist                  3.8         5
5 Begonia                          3.6         5

Interesting to see that it goes beyond just calculating the mean.