This assignment uses the Movielens dataset to analyse the matrix factorization techniques for content based recommendations.
Load the required libraries
##Install & load libraries ##
##If not found, install first.
unavailable <- setdiff(c("recommenderlab", "reshape2","dplyr", "useful", "data.table", "proxy", "Matrix", "irlba" ), rownames(installed.packages()))
if (length(unavailable)>0){
install.packages(unavailable)
}
##load the libraries
library(recommenderlab)
library(reshape2)
library(dplyr)
# to display the corners for large matrices
library(useful)
library(data.table)
library(proxy)
library(Matrix)
#for svd
library(irlba)
Data Acquisition & Preprocessing
#Download the movie lens datasets zip directly from the web site.
download.file("http://files.grouplens.org/datasets/movielens/ml-100k.zip", destfile = "ml-100k.zip")
#Unzip the zip file into movies directly of the current working directory.
unzip("ml-100k.zip", exdir = "movies")
dir("movies/ml-100k")
## [1] "allbut.pl" "mku.sh" "README" "u.data"
## [5] "u.genre" "u.info" "u.item" "u.occupation"
## [9] "u.user" "u1.base" "u1.test" "u2.base"
## [13] "u2.test" "u3.base" "u3.test" "u4.base"
## [17] "u4.test" "u5.base" "u5.test" "ua.base"
## [21] "ua.test" "ub.base" "ub.test"
#Ratings Dataset
ratings <- read.table("movies//ml-100k/u.data", header=FALSE, sep="\t", col.names = c("UserID", "MovieID", "Rating", "Timestamp"))
head(ratings)
## UserID MovieID Rating Timestamp
## 1 196 242 3 881250949
## 2 186 302 3 891717742
## 3 22 377 1 878887116
## 4 244 51 2 880606923
## 5 166 346 1 886397596
## 6 298 474 4 884182806
ratings$Timestamp <- as.POSIXct(ratings$Timestamp, origin="1970-01-01")
head(ratings)
## UserID MovieID Rating Timestamp
## 1 196 242 3 1997-12-04 09:55:49
## 2 186 302 3 1998-04-04 13:22:22
## 3 22 377 1 1997-11-07 01:18:36
## 4 244 51 2 1997-11-26 23:02:03
## 5 166 346 1 1998-02-01 23:33:16
## 6 298 474 4 1998-01-07 08:20:06
Prepare ‘User Ratings’ Matrix. Rows should represent the MovieID and Columns are UserID
##Prepare User Ratings Matrix. Rows should represent the MovieID and Columns are UserID
##-------------------------------------------------------------------------------------
(numberOfObservations <- nrow(ratings))
## [1] 100000
userratings <- ratings
for (i in 1:numberOfObservations){
if (userratings[i,3] > 3){
userratings[i,3] <- 1
}
else{
userratings[i,3] <- -1
}
}
#We do not need timestamp
userratings <- userratings[, -4]
head(userratings)
## UserID MovieID Rating
## 1 196 242 -1
## 2 186 302 -1
## 3 22 377 -1
## 4 244 51 -1
## 5 166 346 -1
## 6 298 474 1
#RE-SHAPE - dcast & binarize - long format to wide format with movie as row and userid as column
user.bin.ratings <- dcast(userratings, MovieID~UserID, value.var = "Rating", na.rm=FALSE)
for (i in 1:ncol(user.bin.ratings)){
user.bin.ratings[which(is.na(user.bin.ratings[,i]) == TRUE),i] <- 0
}
corner(user.bin.ratings)
## MovieID 1 2 3 4
## 1 1 1 1 0 0
## 2 2 -1 0 0 0
## 3 3 1 0 0 0
## 4 4 -1 0 0 0
## 5 5 -1 0 0 0
#remove MovieID col. Rows are MovieID, cols are UserID
user.bin.ratings = user.bin.ratings[,-1]
corner(user.bin.ratings)
## 1 2 3 4 5
## 1 1 1 0 0 1
## 2 -1 0 0 0 -1
## 3 1 0 0 0 0
## 4 -1 0 0 0 0
## 5 -1 0 0 0 0
#There are 1682 movies and 943 users ratings in the user.bin.ratings matrix.
dim(user.bin.ratings)
## [1] 1682 943
Prepare the ‘Movie Genres’ Matrix. Rows should represent the MovieID and Columns are Genres
##Prepare Movie Genres Matrix. Rows should represent the MovieID and Columns are Genres
##-------------------------------------------------------------------------------------
genres <- as.data.table(read.table("movies/ml-100k/u.genre", header=FALSE, sep = '|', quote = ""))
setnames(genres, c('Name', 'ID'))
knitr::kable(genres)
| Name | ID |
|---|---|
| unknown | 0 |
| Action | 1 |
| Adventure | 2 |
| Animation | 3 |
| Children’s | 4 |
| Comedy | 5 |
| Crime | 6 |
| Documentary | 7 |
| Drama | 8 |
| Fantasy | 9 |
| Film-Noir | 10 |
| Horror | 11 |
| Musical | 12 |
| Mystery | 13 |
| Romance | 14 |
| Sci-Fi | 15 |
| Thriller | 16 |
| War | 17 |
| Western | 18 |
data.movies <- as.data.table(read.table("movies/ml-100k/u.item", header=FALSE, sep = '|', quote = ""))
# removing the columns in which all values are NA
data.movies = data.movies[, colSums(is.na(data.movies))<nrow(data.movies), with = F]
corner(data.movies)
## V1 V2 V3
## 1: 1 Toy Story (1995) 01-Jan-1995
## 2: 2 GoldenEye (1995) 01-Jan-1995
## 3: 3 Four Rooms (1995) 01-Jan-1995
## 4: 4 Get Shorty (1995) 01-Jan-1995
## 5: 5 Copycat (1995) 01-Jan-1995
## V5 V6
## 1: http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) 0
## 2: http://us.imdb.com/M/title-exact?GoldenEye%20(1995) 0
## 3: http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) 0
## 4: http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) 0
## 5: http://us.imdb.com/M/title-exact?Copycat%20(1995) 0
setnames(data.movies, c('MovieID', 'MovieName', 'ReleaseDate', 'URL', as.character(genres$Name)))
corner(data.movies,5,8)
## MovieID MovieName ReleaseDate
## 1: 1 Toy Story (1995) 01-Jan-1995
## 2: 2 GoldenEye (1995) 01-Jan-1995
## 3: 3 Four Rooms (1995) 01-Jan-1995
## 4: 4 Get Shorty (1995) 01-Jan-1995
## 5: 5 Copycat (1995) 01-Jan-1995
## URL unknown Action
## 1: http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) 0 0
## 2: http://us.imdb.com/M/title-exact?GoldenEye%20(1995) 0 1
## 3: http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) 0 0
## 4: http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) 0 1
## 5: http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0
## Adventure Animation
## 1: 0 1
## 2: 1 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
#Final Movie Genres Matrix matrix
movie.genres <- data.movies[, -(1:5), with=F]
head(movie.genres)
## Action Adventure Animation Children's Comedy Crime Documentary Drama
## 1: 0 0 1 1 1 0 0 0
## 2: 1 1 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 1 0 0 0 1 0 0 1
## 5: 0 0 0 0 0 1 0 1
## 6: 0 0 0 0 0 0 0 1
## Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War
## 1: 0 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 1 0
## 3: 0 0 0 0 0 0 0 1 0
## 4: 0 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 1 0
## 6: 0 0 0 0 0 0 0 0 0
## Western
## 1: 0
## 2: 0
## 3: 0
## 4: 0
## 5: 0
## 6: 0
#There are 1682 movies and 18 genres in the movie.genres matrix.[ 1 indicates true, 0 indicates false]
dim(movie.genres)
## [1] 1682 18
Lets create a simple user profile matrix, by calculating the dot product of the movie genres matrix and the user binary ratings matrix.
##User Profile Matrix = Dot Product of (Movie Genres Matrix) and (User Ratings Matrix)
##User Profile Matrix - with rows as the Genres [18] and Columns as the User IDs [943]
#-------------------------------------------------------------------------------------
#
#Calculate dot product for User Profiles
genres.data <- as.data.frame(movie.genres)
(cols <- ncol(user.bin.ratings))
## [1] 943
(rows <- ncol(genres.data))
## [1] 18
user.profiles = matrix(0,rows,cols)
for (c in 1:cols){
for (r in 1:rows){
user.profiles[r,c] <- sum((genres.data[,r]) * (user.bin.ratings[,c]))
}
}
corner(user.profiles)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 3 4 -8 2 -10
## [2,] -8 3 0 0 -5
## [3,] -2 1 0 0 2
## [4,] -15 0 0 0 -19
## [5,] 7 8 -6 4 -16
#binarize
user.profiles <- as.matrix((user.profiles > 0) + 0)
corner(user.profiles)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 1 0 1 0
## [2,] 0 1 0 0 0
## [3,] 0 1 0 0 1
## [4,] 0 0 0 0 0
## [5,] 1 1 0 1 0
So, we have got the user profile matrix, which contains the users general aggreagated inclination towards the movie genres. Lets check for a sample user’s profile’s interests and recommend few movies based on the Jaccard similarity distance, which works better for binary matrices like this.
#
#Lets get the first users interests
(user.1.profile <- user.profiles[,1]) #First user's profile
## [1] 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0
#rbind it with the genres matrix, and check the similarity
similarity <- rbind.data.frame(user.1.profile, genres.data)
similarity <- data.frame(lapply(similarity,function(x){as.integer(x)})) #convert data to type integer
#Calculate Jaccard distance between our first user profile and all movie genres data.
similarity_results <- dist(similarity, method = "Jaccard")
similarity_results <- as.data.frame(as.matrix(similarity_results[1:nrow(genres.data)]))
#find the ones with minimum distance
closest.rows <- which(similarity_results == min(similarity_results))
#Recommended movies
genres.data[closest.rows,]
## Action Adventure Animation Children's Comedy Crime Documentary Drama
## 17 1 0 0 0 1 1 0 0
## 855 1 0 0 0 0 0 0 1
## Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War
## 17 0 0 1 0 0 0 0 1 0
## 855 0 0 0 0 1 1 0 1 0
## Western
## 17 0
## 855 0
corner(data.movies[closest.rows,],2,3)
## MovieID MovieName ReleaseDate
## 1: 17 From Dusk Till Dawn (1996) 05-Feb-1996
## 2: 855 Diva (1981) 01-Jan-1981
So, for content based recommendations, we do not need lot of user data, we can start giving recommendations to users based on the item data. On the flip side, if our item data is not well distributed ( say, we have got lots of action movies only), then the recommendations won’t be effective here.
Similary, the matrix factorization technique, SVD, compresses the ratings matrix by considering the only high varying values. Lets review this:
#Try to apply the SVD on the initial ratings matrix and review the object sizes.
head(ratings)
## UserID MovieID Rating Timestamp
## 1 196 242 3 1997-12-04 09:55:49
## 2 186 302 3 1998-04-04 13:22:22
## 3 22 377 1 1997-11-07 01:18:36
## 4 244 51 2 1997-11-26 23:02:03
## 5 166 346 1 1998-02-01 23:33:16
## 6 298 474 4 1998-01-07 08:20:06
m.matrix = sparseMatrix(i = ratings$UserID, j = ratings$MovieID,x=ratings$Rating)
#Lets just focus on 20 large singular values, and correponding singular vectors
m.svd = irlba(m.matrix, nu = 20, nv = 20)
(object.size(m.matrix))
## 1208152 bytes
recommenderlab::plot(m.svd$d, main="20 large singular values")
(d <- m.svd$d)
## [1] 640.63362 244.83635 217.84622 159.15360 158.21191 145.87261 126.57977
## [8] 121.90770 106.82918 99.74794 93.79886 93.25844 89.91150 84.34179
## [15] 83.81221 81.81204 79.07797 77.88653 76.38800 75.34159
corner(u <- m.svd$u)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.065804306 -0.005975064 -0.006132555 -0.08434724 -0.01418805
## [2,] 0.014021043 0.046626024 0.052578557 0.01628247 0.01546749
## [3,] 0.005657981 0.025618448 0.023361829 0.02856416 -0.04405688
## [4,] 0.005993102 0.020698495 0.012452276 0.01969657 -0.02511649
## [5,] 0.032747312 -0.009159010 -0.046130648 -0.01440656 -0.01051815
corner(v <- m.svd$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.09595094 0.087239785 -0.01697376 -0.016206075 0.14050607
## [2,] 0.03517952 0.007025058 -0.06250392 -0.003233914 -0.04075770
## [3,] 0.01992881 0.028618173 -0.01164050 -0.048800596 -0.00440180
## [4,] 0.05995224 -0.013050196 -0.02644554 -0.042999522 -0.03822088
## [5,] 0.02160711 0.015310568 -0.02646012 -0.017901490 -0.01565492
#Notice the size difference between the original ratings matrix Vs the factored matrices !
(object.size(m.matrix)- sum(object.size(m.svd$u), object.size(m.svd$d), object.size(m.svd$v) ))
## 787552 bytes
Lets use the recommenderlab package to generate the SVD model
The description and default parameters of SVD method in recommenderlab are as follows:
recommenderRegistry$get_entry('SVD', dataType='realRatingMatrix')
## Recommender method: SVD
## Description: Recommender based on SVD approximation with column-mean imputation (real data).
## Parameters:
## k maxiter normalize
## 1 10 100 center
Create the model using SVD method:
ratings.ratingMatrix <- as(ratings, "realRatingMatrix")
smp_size <- floor(0.75 * nrow(ratings.ratingMatrix))
set.seed(123)
train_ind <- sample(seq_len(nrow(ratings.ratingMatrix)), size = smp_size)
train.RatingMat <- ratings.ratingMatrix[train_ind, ]
test.RatingMat <- ratings.ratingMatrix[-train_ind, ]
svd_rec <- Recommender(
data=train.RatingMat,
method='SVD',
parameter=list(
categories=20, # number of latent factors
normalize='center', # normalizing by subtracting average rating per user;
method='Pearson' # use Pearson correlation
))
## Warning: Unknown parameters: categories, method
## Available parameter (with default values):
## k = 10
## maxiter = 100
## normalize = center
## verbose = FALSE
svd_rec
## Recommender of type 'SVD' for 'realRatingMatrix'
## learned using 707 users.
#recommend few movies to user 1
userID <- 1
topN <- 3
topN_recommendList <-predict(svd_rec,train.RatingMat[userID],n=topN)
corner(data.movies[topN_recommendList@items[[1]],],2,3)
## MovieID MovieName ReleaseDate
## 1: 100 Fargo (1996) 14-Feb-1997
## 2: 276 Leaving Las Vegas (1995) 01-Jan-1995