# Check if libraries are installed, if not, install them
if (!requireNamespace("tidyverse", quietly = TRUE))
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if (!requireNamespace("caret", quietly = TRUE))
install.packages("caret", repos = "http://cran.us.r-project.org")
if (!requireNamespace("data.table", quietly = TRUE))
install.packages("data.table", repos = "http://cran.us.r-project.org")
if (!requireNamespace("cowplot", quietly = TRUE))
install.packages("cowplot", repos = "http://cran.us.r-project.org")
if (!requireNamespace("lubridate", quietly = TRUE))
install.packages("lubridate", repos = "http://cran.us.r-project.org")
if (!requireNamespace("Metrics", quietly = TRUE))
install.packages("Metrics", repos = "http://cran.us.r-project.org")
if (!requireNamespace("recosystem", quietly = TRUE))
install.packages("recosystem", repos = "http://cran.us.r-project.org")
if (!requireNamespace("scales", quietly = TRUE))
install.packages("scales", repos = "http://cran.us.r-project.org")
if (!requireNamespace("stringr", quietly = TRUE))
install.packages("stringr", repos = "http://cran.us.r-project.org")
if (!requireNamespace("tibble", quietly = TRUE))
install.packages("tibble", repos = "http://cran.us.r-project.org")
if (!requireNamespace("tidyr", quietly = TRUE))
install.packages("tidyr", repos = "http://cran.us.r-project.org")
if (!require(tidyverse)) install.packages("tidyverse")
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'forcats' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Loading the required libraries
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.3.2
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.2
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(dplyr)
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.2
##
## Attaching package: 'ggthemes'
##
## The following object is masked from 'package:cowplot':
##
## theme_map
library(lubridate)
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.2
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(recosystem)
## Warning: package 'recosystem' was built under R version 4.3.2
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(stringr)
library(tibble)
library(tidyr)
library(tidyverse)
The caret library, short for “Classification And
Regression Training,” is a widely-used R package for machine learning
tasks. It provides a diverse set of tools to streamline classification
and regression tasks, making it a go-to choice for data scientists.
cowplot is a library designed to simplify the creation
of publication-quality figures. It enhances the ease of constructing
visually appealing graphics for effective data representation.
data.table is a high-performance library in R that
facilitates efficient manipulation and operation of dataframes. It is
known for its speed and optimization capabilities, making it a preferred
choice for large-scale data tasks.
Arguably the most popular R package for data manipulation,
dplyr offers a consistent set of tools for intuitive and
user-friendly data manipulation. Data analysts commonly use it to
transform datasets into suitable formats for analysis, exploration, and
visualization.
ggplot2 stands out as the most popular library for data
visualization in R. Renowned for its customizability and efficiency, it
significantly improves the quality and aesthetics of graphics.
An official extension of ggplot2, ggthemes provides additional themes and scales for ggplot2
graphics, making it easy to apply polished presets or build customized looks for visualizations.
lubridate is designed to simplify and accelerate
time-related tasks and calculations. It provides an efficient solution
for working with temporal data in R.
The Metrics library is centered around evaluation
metrics, offering functions that simplify the calculation of metrics for
assessing model performance.
Built around matrix factorization, recosystem bundles a
collection of functions that streamline the construction of recommendation
systems, simplifying the implementation of matrix factorization techniques.
The scales library provides tools for formatting and scaling plot axes
(for example, comma-formatted labels) and is included here to ensure
appropriate axis scaling in the plots below.
stringr is a library that provides a set of functions
designed to simplify working with strings. It offers convenient tools
for string manipulation tasks.
tibble re-imagines the dataframe format, providing a
cleaner and more modern solution for data representation. It tidies up
the dataframe structure for improved readability.
tidyr simplifies the process of tidying data, reshaping it into a
consistent format that most analysis functions expect. It contributes to
data cleaning and organization tasks.
The MovieLens dataset, provided by GroupLens Research via the MovieLens website, consists of rating data for movies. Users rate movies on the website, and these datasets are made available for research purposes. It is essential to review the README files accompanying the datasets for usage licenses and other relevant details.
The full dataset gathered by GroupLens includes 27 million ratings given to 58,000 movies by 280,000 users. Due to its substantial size, only a small subset of the data will be used for this exercise, specifically the MovieLens 10M subset.
The MovieLens 10M dataset is a stable benchmark dataset containing 10 million ratings and 100,000 tag applications applied to 10,000 movies by 72,000 users. It was released in January 2009 and is available for download from the GroupLens website as a 63 MB archive.
ratings.dat: a four-column dataset containing the user ID, movie ID, the user's rating for the movie, and a timestamp.
movies.dat: a three-column dataset containing the movie ID, movie title, and movie genres.
Because the full rating data is computationally expensive to process, I'll randomly sample 10 percent of the overall dataset.
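The code that downloads and parses the raw files does not appear in this report. The snippet below is a minimal sketch of one way to read them, assuming the 10M archive has already been downloaded and unzipped into a local ml-10M100K/ folder (a path I am assuming; the column layout follows the file descriptions above).
# Sketch (assumption): parse the "::"-delimited MovieLens 10M files
# ratings.dat layout: userId::movieId::rating::timestamp
ratings <- fread(text = gsub("::", "\t", readLines("ml-10M100K/ratings.dat")),
                 col.names = c("userId", "movieId", "rating", "timestamp"))
# movies.dat layout: movieId::title::genres
movies <- str_split_fixed(readLines("ml-10M100K/movies.dat"), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")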
# The movies object is converted into a dataframe
movies <- as.data.frame(movies) %>%
mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))
# Check the class of the 'movies' dataframe
class(movies)
## [1] "data.frame"
# Display the first few rows of the 'movies' dataframe
head(movies)
## movieId title genres
## 1 1 122 5::838985046
## 2 1 185 5::838983525
## 3 1 231 5::838983392
## 4 1 292 5::838983421
## 5 1 316 5::838983392
## 6 1 329 5::838983392
# Truncate the movies dataset to 10% using sample_frac
fraction_to_keep <- 0.1
movies <- movies %>% sample_frac(fraction_to_keep)
# Check the class of the truncated 'movies' dataframe
class(movies)
## [1] "data.frame"
# Display the first few rows of the truncated 'movies' dataframe
head(movies)
## movieId title genres
## 1 11311 185 3::845105785
## 2 27152 2245 3.5::1116781158
## 3 54937 151 3.5::1115335054
## 4 6924 1674 4::946677095
## 5 51033 1906 5::941479086
## 6 30958 4686 2.5::1065564685
# Join ratings and movies dataframes by "movieId"
filmography <- left_join(ratings, movies, by = "movieId")
## Warning in left_join(ratings, movies, by = "movieId"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 131949 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Check the class of the 'filmography' dataframe
class(filmography)
## [1] "data.table" "data.frame"
# Display the first few rows of the 'filmography' dataframe
head(filmography)
## userId movieId rating timestamp title genres
## 1: 23574 5378 3 1022389970 1459 3::858518019
## 2: 23574 5378 3 1022389970 62 3::858517581
## 3: 23574 5378 3 1022389970 1363 3::858517858
## 4: 23574 5378 3 1022389970 1183 3::858517766
## 5: 23574 5378 3 1022389970 1476 3::858518036
## 6: 20039 778 4 1222226073 <NA> <NA>
To assess the system's performance, a validation set is created from a small subset of the filmography dataset, and the RMSE (Root Mean Square Error) will be calculated on it. The caret function createDataPartition() is used to randomly select row indexes, splitting filmography into a working set and a validation set: 10% of the rows form the validation set and the remaining 90% form the working_set object. The validation set is temporarily named 'temp' at this stage, since it still needs further processing before it can serve as a proper validation set. Finally, tibble() is used to present the sizes of the different sets in a table-like structure.
# set.seed(1, sample.kind="Rounding") # if using R 3.5 or earlier, use `set.seed(1)`
set.seed(1)
test_index <- createDataPartition(y = filmography$rating, times = 1, p = 0.1, list = FALSE)
working_set <- filmography[-test_index,]
temp <- filmography[test_index,]
tibble(Dataset = c("filmography", "working_set", "temp"),
"Number of ratings" = c(nrow(filmography), nrow(working_set), nrow(temp)))
## # A tibble: 3 × 2
## Dataset `Number of ratings`
## <chr> <int>
## 1 filmography 13224359
## 2 working_set 11901922
## 3 temp 1322437
validation <- temp %>%
semi_join(working_set, by = "movieId") %>%
semi_join(working_set, by = "userId")
# Add rows removed from the validation set back into working_set
removed <- anti_join(temp, validation)
## Joining with `by = join_by(userId, movieId, rating, timestamp, title, genres)`
working_set <- rbind(working_set, removed)
dim(working_set)
## [1] 11901985 6
class(working_set)
## [1] "data.table" "data.frame"
str(working_set, vec.len = 2)
## Classes 'data.table' and 'data.frame': 11901985 obs. of 6 variables:
## $ userId : int 23574 23574 23574 23574 20039 ...
## $ movieId : num 5378 5378 ...
## $ rating : num 3 3 3 3 4 ...
## $ timestamp: int 1022389970 1022389970 1022389970 1022389970 1222226073 ...
## $ title : chr "62" "1363" ...
## $ genres : chr "3::858517581" "3::858517858" ...
## - attr(*, ".internal.selfref")=<externalptr>
head(working_set)
## userId movieId rating timestamp title genres
## 1: 23574 5378 3.0 1022389970 62 3::858517581
## 2: 23574 5378 3.0 1022389970 1363 3::858517858
## 3: 23574 5378 3.0 1022389970 1183 3::858517766
## 4: 23574 5378 3.0 1022389970 1476 3::858518036
## 5: 20039 778 4.0 1222226073 <NA> <NA>
## 6: 55211 3638 3.5 1228924460 2518 2::946317977
summary(working_set)
## userId movieId rating timestamp
## Min. : 1 Min. : 1 Min. :0.500 Min. :8.229e+08
## 1st Qu.:18181 1st Qu.: 742 1st Qu.:3.000 1st Qu.:9.488e+08
## Median :35785 Median : 1883 Median :3.500 Median :1.038e+09
## Mean :35922 Mean : 4205 Mean :3.506 Mean :1.034e+09
## 3rd Qu.:53664 3rd Qu.: 3624 3rd Qu.:4.000 3rd Qu.:1.128e+09
## Max. :71567 Max. :65130 Max. :5.000 Max. :1.231e+09
## title genres
## Length:11901985 Length:11901985
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Rating count
working_set %>%
group_by(rating) %>%
summarize(count = n())
## # A tibble: 10 × 2
## rating count
## <dbl> <int>
## 1 0.5 118764
## 2 1 462679
## 3 1.5 140603
## 4 2 963009
## 5 2.5 449527
## 6 3 2789770
## 7 3.5 1045667
## 8 4 3391552
## 9 4.5 698816
## 10 5 1841598
working_set %>%
group_by(rating) %>%
summarize(count = n()) %>%
ggplot(aes(x = rating, y = count)) +
geom_bar(stat = "identity", fill = "#8888ff") +
ggtitle("Rating Distribution") +
xlab("Rating") +
ylab("Occurrences Count") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(n.breaks = 10) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
# as_datetime() showcase
sample(as_datetime(working_set$timestamp, origin = "1970-01-01"), replace = TRUE, size = 20)
## [1] "2004-08-30 19:49:44 UTC" "2005-09-07 02:11:24 UTC"
## [3] "1996-05-07 06:43:26 UTC" "2004-06-01 17:43:00 UTC"
## [5] "2005-12-04 23:56:40 UTC" "2001-01-12 00:19:18 UTC"
## [7] "1999-12-04 21:39:41 UTC" "2002-05-02 23:34:44 UTC"
## [9] "2008-08-22 00:39:30 UTC" "2000-12-06 15:29:02 UTC"
## [11] "1996-06-25 21:43:04 UTC" "1996-10-23 15:27:55 UTC"
## [13] "2005-11-28 21:18:43 UTC" "2006-07-04 02:13:41 UTC"
## [15] "2000-05-10 18:24:30 UTC" "2003-05-04 22:57:00 UTC"
## [17] "2006-12-27 22:19:36 UTC" "2008-02-25 22:00:45 UTC"
## [19] "2004-10-22 06:58:49 UTC" "2000-05-04 19:39:23 UTC"
working_set %>%
mutate(year = year(as_datetime(timestamp, origin = "1970-01-01"))) %>%
group_by(year) %>%
summarize(count = n())
## # A tibble: 14 × 2
## year count
## <int> <int>
## 1 1996 1183428
## 2 1997 527717
## 3 1998 241132
## 4 1999 913519
## 5 2000 1553372
## 6 2001 913469
## 7 2002 694158
## 8 2003 826744
## 9 2004 920436
## 10 2005 1411807
## 11 2006 916249
## 12 2007 853775
## 13 2008 928136
## 14 2009 18043
working_set %>%
mutate(year = year(as_datetime(timestamp, origin = "1970-01-01"))) %>%
ggplot(aes(x = year)) +
geom_bar(fill = "#8888ff") +
ggtitle("Ratings per year") +
xlab("Year") +
ylab("Number of ratings") +
scale_y_continuous(labels = comma) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
working_set %>%
mutate(year = year(as_datetime(timestamp, origin = "1970-01-01"))) %>%
group_by(year) %>%
summarize(avg = mean(rating)) %>%
ggplot(aes(x = year, y = avg)) +
geom_bar(stat = "identity", fill = "#8888ff") +
ggtitle("Average rating per year") +
xlab("Year") +
ylab("Average rating") +
scale_y_continuous(labels = comma) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
working_set %>%
group_by(movieId) %>%
summarize(count = n()) %>%
slice_head(n = 10)
## # A tibble: 10 × 2
## movieId count
## <dbl> <int>
## 1 1 9409
## 2 2 2176
## 3 3 3515
## 4 4 306
## 5 5 7244
## 6 6 6063
## 7 7 4193
## 8 8 5473
## 9 9 228
## 10 10 21035
working_set %>%
group_by(movieId) %>%
summarize(count = n()) %>%
ggplot(aes(x = movieId, y = count)) +
geom_point(alpha = 0.2, color = "#4020dd") +
geom_smooth(color = "red") +
ggtitle("Ratings per movie") +
xlab("Movies") +
ylab("Number of ratings") +
scale_y_continuous(labels = comma) +
scale_x_continuous(n.breaks = 10) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Genres
### Genres count
# Movie genres count
working_set %>%
group_by(genres) %>%
summarize(count = n()) %>%
slice_max(order_by = count, n = 8)
## # A tibble: 8 × 2
## genres count
## <chr> <int>
## 1 <NA> 53900
## 2 3::858009499 7795
## 3 3::868278428 5226
## 4 4::945959722 4727
## 5 1::948334895 4696
## 6 2::948334749 4686
## 7 4::990524634 4155
## 8 4::960671307 4050
# Individual genres count
genres <- c("Action", "Adventure", "Animation",
"Children", "Comedy", "Crime",
"Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical",
"Mystery", "Romance", "Sci-Fi",
"Thriller", "War", "Western")
genres_df <- data.frame(
Genres = genres,
Count = sapply(genres, function(x) {
sum(sapply(str_extract_all(working_set$genres, regex(x, ignore_case = TRUE)), length))
})
)
print(genres_df)
## Genres Count
## Action Action 53900
## Adventure Adventure 53900
## Animation Animation 53900
## Children Children 53900
## Comedy Comedy 53900
## Crime Crime 53900
## Documentary Documentary 53900
## Drama Drama 53900
## Fantasy Fantasy 53900
## Film-Noir Film-Noir 53900
## Horror Horror 53900
## Musical Musical 53900
## Mystery Mystery 53900
## Romance Romance 53900
## Sci-Fi Sci-Fi 53900
## Thriller Thriller 53900
## War War 53900
## Western Western 53900
## Genre popularity plot
genres_df %>%
ggplot(aes(x = Count, y = Genres)) +
ggtitle("Genre Popularity") +
geom_bar(stat = "identity", width = 0.6, fill = "#8888ff") +
xlab("Number of ratings") +
ylab("Genres") +
scale_x_continuous(labels = scales::comma) +
theme_economist() +
theme(
plot.title = element_text(vjust = 3.5),
axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
)
Next, I'll inspect the training data.
# How many rows and columns are there in the training dataset?
# Rows
n_rows <- dim(working_set)[1]
# Columns
n_cols <- dim(working_set)[2]
n_rows
## [1] 11901985
n_cols
## [1] 6
# How many zeros were given as ratings in the training dataset?
n_zeros <- sum(working_set$rating == 0.0)
n_zeros
## [1] 0
# How many threes were given as ratings in the training dataset?
n_threes <- sum(working_set$rating == 3.0)
n_threes
## [1] 2789770
# How many different movies are in the training dataset?
n_movies <- dim(as.data.frame(table(working_set$movieId)))[1]
n_movies
## [1] 9836
# How many different users are in the training dataset?
n_users <- dim(as.data.frame(table(working_set$userId)))[1]
n_users
## [1] 68549
# set.seed(1, sample.kind="Rounding") # if using R 3.5 or earlier, use `set.seed(1)`
set.seed(1)
train_index <- createDataPartition(filmography$rating, times = 1, p = 0.9, list = FALSE)
train_set <- filmography[train_index,]
temp_test_set <- filmography[-train_index,]
tibble(Dataset = c("filmography", "train_set", "temp_test_set"),
"Number of ratings" = c(nrow(filmography), nrow(train_set), nrow(temp_test_set)))
## # A tibble: 3 × 2
## Dataset `Number of ratings`
## <chr> <int>
## 1 filmography 13224359
## 2 train_set 11901925
## 3 temp_test_set 1322434
I'll make sure that every userId and movieId in the testing set also appears in the training set.
test_set <- temp_test_set %>%
semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId")
Next, I'll add the rows removed from the testing set back into the training set.
removed <- anti_join(temp_test_set, test_set)
## Joining with `by = join_by(userId, movieId, rating, timestamp, title, genres)`
train_set <- rbind(train_set, removed)
Random guessing is a simple, naive baseline: ratings are predicted at random, without considering user preferences, historical behavior, or any other patterns in the data. In a recommendation system, this amounts to assigning ratings or making recommendations completely at random.
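The code that produces this baseline is not shown in the report. The sketch below is one plausible way to build it, assuming random predictions are drawn from the ratings observed in the training set and that the Cinematch and Netflix Prize rows are added as fixed reference RMSEs (their MAE and MSE are not available).
# Sketch (assumption): random-guessing baseline and the initial evaluation table
set.seed(1)
y_hat_random <- sample(train_set$rating, size = nrow(validation), replace = TRUE)
evaluation <- tibble(Model = c("Cinematch", "The Netflix Prize", "Random guessing"),
                     MAE = c(NA, NA, Metrics::mae(validation$rating, y_hat_random)),
                     MSE = c(NA, NA, Metrics::mse(validation$rating, y_hat_random)),
                     RMSE = c(0.952, 0.857, Metrics::rmse(validation$rating, y_hat_random)))
print(evaluation)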
## # A tibble: 3 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
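Similarly, the mean baseline shown next can be reproduced with a short sketch like the one below. It predicts the overall training-set average for every rating; the resulting mu is also reused by the bias models that follow (an assumption, since the original chunk is not shown).
# Sketch (assumption): mean-baseline model
mu <- mean(train_set$rating)
evaluation <- bind_rows(evaluation,
                        tibble(Model = "Linear model (mean baseline)",
                               MAE = Metrics::mae(validation$rating, mu),
                               MSE = Metrics::mse(validation$rating, mu),
                               RMSE = Metrics::rmse(validation$rating, mu)))
print(evaluation)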
## # A tibble: 4 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
## 4 Linear model (mean baseline) 0.861 1.14 1.07
# Group by Movie
b_i <- train_set %>%
group_by(movieId) %>%
summarize(
b_i = mean(rating - mu),
b_i_isolated = mean(rating)
)
# Display the Top 10 Movie Biases
b_i %>% slice_head(n = 10)
## # A tibble: 10 × 3
## movieId b_i b_i_isolated
## <dbl> <dbl> <dbl>
## 1 1 0.423 3.93
## 2 2 -0.281 3.23
## 3 3 -0.399 3.11
## 4 4 -0.663 2.84
## 5 5 -0.371 3.14
## 6 6 0.275 3.78
## 7 7 -0.168 3.34
## 8 8 -0.349 3.16
## 9 9 -0.493 3.01
## 10 10 -0.0970 3.41
Next, I'll create two bias plots for movies: one for the isolated bias and one for the bias adjusted for the overall mean. The final result is a side-by-side comparison of the isolated and adjusted movie bias plots, providing insight into the distribution of bias values for movies in the dataset.
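The plotting code itself does not appear in the report; the sketch below mirrors the user-bias plots shown later and assumes the b_i table built in the previous chunk.
# Sketch (assumption): isolated movie bias plot
b_i_isolated_plot <- b_i %>%
  ggplot(aes(x = b_i_isolated)) +
  geom_histogram(bins = 20, fill = "#8888ff", color = "#4020dd") +
  ggtitle("Movie Bias (isolated)") +
  xlab("Bias value") +
  ylab("Count") +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(n.breaks = 10) +
  theme_economist()
# Sketch (assumption): adjusted movie bias plot
b_i_plot <- b_i %>%
  ggplot(aes(x = b_i)) +
  geom_histogram(bins = 20, fill = "#8888ff", color = "#4020dd") +
  ggtitle("Movie Bias (adjusted)") +
  xlab("Bias value") +
  ylab("Count") +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(n.breaks = 10) +
  theme_economist()
# Combine both b_i plots with plot_grid()
plot_grid(b_i_isolated_plot, b_i_plot, labels = "AUTO", nrow = 2)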
# Linear Model Construction (Mean + Movie Bias)
y_hat_b_i <- mu + validation %>%
left_join(b_i, by = "movieId") %>%
.$b_i
# Evaluation Metrics
evaluation <- bind_rows(evaluation,
tibble(Model = "Linear model (mean + movie bias)",
MAE = Metrics::mae(validation$rating, y_hat_b_i),
MSE = Metrics::mse(validation$rating, y_hat_b_i),
RMSE = Metrics::rmse(validation$rating, y_hat_b_i)))
# Print Evaluation Results
print(evaluation)
## # A tibble: 5 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
## 4 Linear model (mean baseline) 0.861 1.14 1.07
## 5 Linear model (mean + movie bias) 0.735 0.883 0.940
Next, I'll create the same two plots for users: an isolated bias plot and an adjusted bias plot. The final result is a side-by-side comparison of the isolated and adjusted user bias plots, providing insight into the distribution of bias values for users in the dataset.
# Bias per user
b_u <- train_set %>%
left_join(b_i, by = 'movieId') %>%
group_by(userId) %>%
summarize(b_u = mean(rating - mu - b_i),
b_u_isolated = mean(rating))
# Display top 10 rows of b_u
b_u %>% slice_head(n = 10)
## # A tibble: 10 × 3
## userId b_u b_u_isolated
## <int> <dbl> <dbl>
## 1 1 1.51 5
## 2 2 -0.306 3
## 3 3 -0.355 3.03
## 4 4 0.856 4.49
## 5 5 0.265 4.15
## 6 6 0.451 4.19
## 7 7 0.0302 3.72
## 8 8 0.285 3.46
## 9 9 0.710 4.74
## 10 10 -0.0552 3.81
# Isolated user bias plot
b_u_isolated_plot <- b_u %>%
ggplot(aes(x = b_u_isolated)) +
geom_histogram(bins = 20, fill = "#8888ff", color = "#4020dd") +
ggtitle("User Bias (isolated)") +
xlab("Bias value") +
ylab("Count") +
scale_y_continuous(labels = comma) +
scale_x_continuous(n.breaks = 10) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
# Adjusted user bias plot
b_u_plot <- b_u %>%
ggplot(aes(x = b_u)) +
geom_histogram(bins = 20, fill = "#8888ff", color = "#4020dd") +
ggtitle("User Bias (adjusted)") +
xlab("Bias value") +
ylab("Count") +
scale_y_continuous(labels = comma) +
scale_x_continuous(n.breaks = 10) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
# Combine both b_u plots with plot_grid()
plot_grid(b_u_isolated_plot, b_u_plot, labels = "AUTO", nrow = 2)
The next chunk constructs the linear model predictions (y_hat_b_u) by incorporating the mean, the movie bias, and the user bias. It then evaluates this model using Mean Absolute Error (MAE), Mean Squared Error (MSE), and Root Mean Squared Error (RMSE), appends the results to the evaluation tibble, and finally prints the evaluation results.
# Linear Model Construction (mean + movie bias + user bias)
y_hat_b_u <- validation %>%
left_join(b_i, by='movieId') %>%
left_join(b_u, by='userId') %>%
mutate(y_hat = mu + b_i + b_u) %>%
.$y_hat
# Evaluation
evaluation <- bind_rows(evaluation,
tibble(Model = "Linear model (mean + movie and user bias)",
MAE = Metrics::mae(validation$rating, y_hat_b_u),
MSE = Metrics::mse(validation$rating, y_hat_b_u),
RMSE = Metrics::rmse(validation$rating, y_hat_b_u)))
# Print evaluation results
print(evaluation)
## # A tibble: 6 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
## 4 Linear model (mean baseline) 0.861 1.14 1.07
## 5 Linear model (mean + movie bias) 0.735 0.883 0.940
## 6 Linear model (mean + movie and user bias) 0.599 0.623 0.789
To obtain recommendations for our users, we will predict their ratings for movies they haven’t watched yet.
# Top 10 movie recommendation by the linear model
top10_prediction_linear <- test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(y_hat = mu + b_i + b_u) %>%
arrange(desc(y_hat)) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
top10_prediction_linear_df <- data.frame(
Title = top10_prediction_linear,
Rating = rep(NA, 10),
Count = rep(NA, 10)
)
for (i in 1:10) {
indexes <- which(test_set$title == as.character(top10_prediction_linear[i]))
top10_prediction_linear_df$Rating[i] <- mean(test_set$rating[indexes])
top10_prediction_linear_df$Count[i] <- sum(
test_set$title == as.character(top10_prediction_linear[i])
)
}
cat("## Top 10 Movie Recommendations by the Linear Model\n")
## ## Top 10 Movie Recommendations by the Linear Model
print(top10_prediction_linear_df)
## title Rating Count
## 1 2110 3.615635 NA
## 2 3072 3.798223 NA
## 3 3727 3.492898 NA
## 4 457 3.443638 NA
## 5 553 3.464971 NA
## 6 36 3.642815 NA
## 7 380 3.586940 NA
## 8 110 3.498329 NA
## 9 1358 3.562103 NA
## 10 2840 3.952658 NA
worst10_prediction_linear <- test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(y_hat = mu + b_i + b_u) %>%
arrange(b_i) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
worst10_prediction_linear_df <- data.frame(
Title = worst10_prediction_linear,
Rating = rep(NA, 10),
Count = rep(NA, 10)
)
for (i in 1:10) {
indexes <- which(test_set$title == as.character(worst10_prediction_linear[i]))
worst10_prediction_linear_df$Rating[i] <- mean(test_set$rating[indexes])
worst10_prediction_linear_df$Count[i] <- sum(
test_set$title == as.character(worst10_prediction_linear[i])
)
}
cat("## Worst 10 Movie Recommendations by the Linear Model\n")
## ## Worst 10 Movie Recommendations by the Linear Model
print(worst10_prediction_linear_df)
## title Rating Count
## 1 928 3.669732 NA
## 2 17 3.511967 NA
## 3 266 3.572042 NA
## 4 597 3.480154 NA
## 5 268 3.327044 NA
## 6 527 3.469821 NA
## 7 28 3.421429 NA
## 8 2302 3.452148 NA
## 9 1690 3.613208 NA
## 10 2558 3.641026 NA
Regularization penalizes bias estimates that are based on only a few ratings, shrinking them toward zero. This keeps the model from overreacting to movies or users with very little data and therefore reduces the error on the held-out test set.
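In formula terms (my notation, matching the code below), the regularized biases are

$$\hat{b}_i(\lambda) = \frac{1}{n_i + \lambda}\sum_{u=1}^{n_i}\left(y_{u,i} - \hat{\mu}\right), \qquad \hat{b}_u(\lambda) = \frac{1}{n_u + \lambda}\sum_{i=1}^{n_u}\left(y_{u,i} - \hat{\mu} - \hat{b}_i\right)$$

where $n_i$ and $n_u$ are the number of ratings for movie $i$ and by user $u$; a larger $\lambda$ shrinks biases estimated from few ratings more strongly toward zero.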
regularization <- function(lambda, train_set, test_set) {
# Calculate mean of ratings in the training set
mu <- mean(train_set$rating)
# Calculate movie bias (b_i)
b_i <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu) / (n() + lambda))
# Calculate user bias (b_u)
b_u <- train_set %>%
left_join(b_i, by = "movieId") %>%
filter(!is.na(b_i)) %>%
group_by(userId) %>%
summarize(b_u = sum(rating - mu - b_i) / (n() + lambda))
# Predict ratings using the trained biases
predicted_ratings <- test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
filter(!is.na(b_i), !is.na(b_u)) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
# Return the root mean square error (RMSE)
return(Metrics::rmse(predicted_ratings, test_set$rating))
}
# Test regularization function with different lambda values
lambdas <- seq(0, 10, 0.25)
lambdas_rmse <- sapply(lambdas,
regularization,
train_set = train_set,
test_set = test_set)
# Create a tibble to display lambda and corresponding RMSE values
lambdas_tibble <- tibble(Lambda = lambdas, RMSE = lambdas_rmse)
print(lambdas_tibble)
## # A tibble: 41 × 2
## Lambda RMSE
## <dbl> <dbl>
## 1 0 0.794
## 2 0.25 0.794
## 3 0.5 0.794
## 4 0.75 0.794
## 5 1 0.794
## 6 1.25 0.794
## 7 1.5 0.794
## 8 1.75 0.794
## 9 2 0.794
## 10 2.25 0.794
## # ℹ 31 more rows
lambdas_tibble %>%
ggplot(aes(x = Lambda, y = RMSE)) +
geom_point() +
ggtitle("Effect of Lambda on RMSE") +
xlab("Lambda") +
ylab("RMSE") +
scale_y_continuous(n.breaks = 6, labels = comma) +
scale_x_continuous(n.breaks = 10) +
theme_economist() +
theme(axis.title.x = element_text(vjust = -5, face = "bold"),
axis.title.y = element_text(vjust = 10, face = "bold"),
plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))
## Regularized linear model construction
# Find the optimal lambda value that minimizes RMSE
lambda <- lambdas[which.min(lambdas_rmse)]
# Calculate the mean of ratings in the training set
mu <- mean(train_set$rating)
# Calculate regularized movie bias (b_i)
b_i_regularized <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu) / (n() + lambda))
# Calculate regularized user bias (b_u)
b_u_regularized <- train_set %>%
left_join(b_i_regularized, by = "movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu) / (n() + lambda))
# Predict ratings using regularized biases
y_hat_regularized <- validation %>%
left_join(b_i_regularized, by = "movieId") %>%
left_join(b_u_regularized, by = "userId") %>%
mutate(prediction = mu + b_i + b_u) %>%
pull(prediction)
# Evaluate the model and update the evaluation table
evaluation <- bind_rows(evaluation,
tibble(Model = "Linear model with regularized bias",
MAE = Metrics::mae(validation$rating, y_hat_regularized),
MSE = Metrics::mse(validation$rating, y_hat_regularized),
RMSE = Metrics::rmse(validation$rating, y_hat_regularized)))
print(evaluation)
## # A tibble: 7 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
## 4 Linear model (mean baseline) 0.861 1.14 1.07
## 5 Linear model (mean + movie bias) 0.735 0.883 0.940
## 6 Linear model (mean + movie and user bias) 0.599 0.623 0.789
## 7 Linear model with regularized bias 0.599 0.623 0.789
Next, I'll use the regularized linear model to recommend movies.
# Top 10 movie recommendations by the regularized linear model
top10_prediction_regularized <- test_set %>%
left_join(b_i_regularized, by = "movieId") %>%
left_join(b_u_regularized, by = "userId") %>%
mutate(y_hat = mu + b_i + b_u) %>%
arrange(desc(y_hat)) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
# Create a data frame for the top 10 predictions
top10_prediction_regularized_df <- data.frame(Title = top10_prediction_regularized,
Rating = rep(NA, 10),
Count = rep(NA, 10))
# Populate the data frame with actual ratings and counts
for (i in 1:10) {
indexes <- which(test_set$title == as.character(top10_prediction_regularized[i]))
top10_prediction_regularized_df$Rating[i] <- mean(test_set$rating[indexes])
top10_prediction_regularized_df$Count[i] <- sum(
test_set$title == as.character(top10_prediction_regularized[i])
)
}
# Print the top 10 recommendations
print(top10_prediction_regularized_df)
## title Rating Count
## 1 2110 3.615635 NA
## 2 3072 3.798223 NA
## 3 3727 3.492898 NA
## 4 457 3.443638 NA
## 5 553 3.464971 NA
## 6 36 3.642815 NA
## 7 2840 3.952658 NA
## 8 1895 4.123762 NA
## 9 432 3.693925 NA
## 10 5266 4.135009 NA
# Worst 10 movie recommendations by the regularized linear model
worst10_prediction_regularized <- test_set %>%
left_join(b_i_regularized, by = "movieId") %>%
left_join(b_u_regularized, by = "userId") %>%
mutate(y_hat = mu + b_i + b_u) %>%
arrange(y_hat) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
# Create a data frame for the worst 10 predictions
worst10_prediction_regularized_df <- data.frame(Title = worst10_prediction_regularized,
Rating = rep(NA, 10),
Count = rep(NA, 10))
# Populate the data frame with actual ratings and counts
for (i in 1:10) {
indexes <- which(test_set$title == as.character(worst10_prediction_regularized[i]))
worst10_prediction_regularized_df$Rating[i] <- mean(test_set$rating[indexes])
worst10_prediction_regularized_df$Count[i] <- sum(
test_set$title == as.character(worst10_prediction_regularized[i])
)
}
# Print the worst 10 recommendations
print(worst10_prediction_regularized_df)
## title Rating Count
## 1 141 3.532245 NA
## 2 58 3.313594 NA
## 3 95 3.506606 NA
## 4 805 3.450049 NA
## 5 1244 3.549180 NA
## 6 1302 3.412010 NA
## 7 1228 3.532206 NA
## 8 5294 2.720000 NA
## 9 1288 3.532805 NA
## 10 4340 4.083333 NA
Matrix factorization is a technique used in machine learning and collaborative filtering to decompose a matrix into the product of two lower-dimensional matrices. The goal is to represent the original matrix by capturing its latent factors in a more compact form. This technique is widely used in recommendation systems and other applications where the data can be represented as a matrix.
## Prepare Training and Testing Sets
set.seed(1)
train_recosystem <- with(train_set, data_memory(user_index = userId,
item_index = movieId,
rating = rating))
test_recosystem <- with(test_set, data_memory(user_index = userId,
item_index = movieId,
rating = rating))
recommendation_system <- Reco()
tuning <- recommendation_system$tune(train_recosystem, opts = list(dim = c(10, 20, 30),
lrate = c(0.1, 0.2),
nthread = 4,
niter = 10))
recommendation_system$train(train_recosystem, opts = c(tuning$min,
nthread = 4,
niter = 20))
## iter tr_rmse obj
## 0 0.6869 1.4218e+07
## 1 0.5118 1.2463e+07
## 2 0.5100 1.2694e+07
## 3 0.4972 1.2730e+07
## 4 0.4880 1.2753e+07
## 5 0.4822 1.2767e+07
## 6 0.4782 1.2787e+07
## 7 0.4750 1.2843e+07
## 8 0.4733 1.2850e+07
## 9 0.4718 1.2866e+07
## 10 0.4696 1.2924e+07
## 11 0.4686 1.2921e+07
## 12 0.4676 1.2926e+07
## 13 0.4664 1.2952e+07
## 14 0.4664 1.3004e+07
## 15 0.4647 1.2958e+07
## 16 0.4645 1.3001e+07
## 17 0.4640 1.2996e+07
## 18 0.4631 1.3007e+07
## 19 0.4625 1.3032e+07
y_hat_MF <- recommendation_system$predict(test_recosystem, out_memory())
evaluation <- bind_rows(evaluation,
tibble(Model = "Matrix factorization",
MAE = Metrics::mae(validation$rating, y_hat_MF),
MSE = Metrics::mse(validation$rating, y_hat_MF),
RMSE = Metrics::rmse(validation$rating, y_hat_MF)))
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
print(evaluation)
## # A tibble: 8 × 4
## Model MAE MSE RMSE
## <chr> <dbl> <dbl> <dbl>
## 1 Cinematch NA NA 0.952
## 2 The Netflix Prize NA NA 0.857
## 3 Random guessing 1.17 2.27 1.51
## 4 Linear model (mean baseline) 0.861 1.14 1.07
## 5 Linear model (mean + movie bias) 0.735 0.883 0.940
## 6 Linear model (mean + movie and user bias) 0.599 0.623 0.789
## 7 Linear model with regularized bias 0.599 0.623 0.789
## 8 Matrix factorization 1.07 1.80 1.34
In machine learning, it is extremely helpful to have a single number to judge a model's performance, whether during training, cross-validation, or monitoring after deployment. Root mean square error is one of the most widely used measures for this purpose.
## Movies Recommendations
### Top 10 movie recommendation by the matrix factorization model
top10_prediction_MF <- tibble(title = test_set$title, y_hat = y_hat_MF) %>%
arrange(desc(y_hat)) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
top10_prediction_MF_df <- data.frame(Title = top10_prediction_MF,
Rating = rep(NA, 10),
Count = rep(NA, 10))
for (i in 1:10) {
indexes <- which(test_set$title == as.character(top10_prediction_MF[i,]))
top10_prediction_MF_df$Rating[i] <- mean(test_set$rating[indexes])
top10_prediction_MF_df$Count[i] <- sum(
test_set$title == as.character(top10_prediction_MF[i,])
)
}
# Top 10 movie recommendation
print(top10_prediction_MF_df)
## title Rating Count
## 1 1527 3.626032 NA
## 2 2702 3.924670 NA
## 3 7154 4.113636 NA
## 4 3896 4.109453 NA
## 5 6979 3.816261 NA
## 6 5608 4.025463 NA
## 7 1 3.604768 NA
## 8 2021 3.335581 NA
## 9 1821 3.500000 NA
## 10 2395 3.614521 NA
worst10_prediction_MF <- tibble(title = test_set$title, y_hat = y_hat_MF) %>%
arrange(y_hat) %>%
select(title) %>%
unique() %>%
slice_head(n = 10)
worst10_prediction_MF_df <- data.frame(Title = worst10_prediction_MF,
Rating = rep(NA, 10),
Count = rep(NA, 10))
for (i in 1:10) {
indexes <- which(test_set$title == as.character(worst10_prediction_MF[i,]))
worst10_prediction_MF_df$Rating[i] <- mean(test_set$rating[indexes])
worst10_prediction_MF_df$Count[i] <- sum(
test_set$title == as.character(worst10_prediction_MF[i,])
)
}
print(worst10_prediction_MF_df)
## title Rating Count
## 1 2012 3.413289 NA
## 2 3896 4.109453 NA
## 3 4367 3.213035 NA
## 4 3931 2.125000 NA
## 5 2420 3.518882 NA
## 6 4123 3.300000 NA
## 7 4954 3.871560 NA
## 8 2018 3.774485 NA
## 9 2707 3.484211 NA
## 10 3082 3.296479 NA