DATA 643 - Project 3

Matrix factorization in the context of recommender systems.

Description

This assignment uses the Movielens dataset to analyse the matrix factorization techniques for content based recommendations.

Load and prepare the data for analysis

Load the required libraries

##Install & load libraries ##

##If not found, install first.
unavailable <- setdiff(c("recommenderlab", "reshape2","dplyr", "useful", "data.table", "proxy", "Matrix", "irlba" ), rownames(installed.packages()))
if (length(unavailable)>0){
  install.packages(unavailable)
}

##load the libraries
library(recommenderlab)
library(reshape2)
library(dplyr)
# to display the corners for large matrices
library(useful) 
library(data.table)
library(proxy)
library(Matrix)
#for svd
library(irlba)

Data Acquisition & Preprocessing

#Download the movie lens datasets zip directly from the web site.
download.file("http://files.grouplens.org/datasets/movielens/ml-100k.zip", destfile = "ml-100k.zip")

#Unzip the zip file into movies directly of the current working directory.
unzip("ml-100k.zip", exdir = "movies")
dir("movies/ml-100k")

##  [1] "allbut.pl"    "mku.sh"       "README"       "u.data"      
##  [5] "u.genre"      "u.info"       "u.item"       "u.occupation"
##  [9] "u.user"       "u1.base"      "u1.test"      "u2.base"     
## [13] "u2.test"      "u3.base"      "u3.test"      "u4.base"     
## [17] "u4.test"      "u5.base"      "u5.test"      "ua.base"     
## [21] "ua.test"      "ub.base"      "ub.test"

#Ratings Dataset
ratings <- read.table("movies//ml-100k/u.data", header=FALSE, sep="\t", col.names = c("UserID", "MovieID", "Rating", "Timestamp"))
head(ratings)

##   UserID MovieID Rating Timestamp
## 1    196     242      3 881250949
## 2    186     302      3 891717742
## 3     22     377      1 878887116
## 4    244      51      2 880606923
## 5    166     346      1 886397596
## 6    298     474      4 884182806

ratings$Timestamp <- as.POSIXct(ratings$Timestamp, origin="1970-01-01")
head(ratings)

##   UserID MovieID Rating           Timestamp
## 1    196     242      3 1997-12-04 09:55:49
## 2    186     302      3 1998-04-04 13:22:22
## 3     22     377      1 1997-11-07 01:18:36
## 4    244      51      2 1997-11-26 23:02:03
## 5    166     346      1 1998-02-01 23:33:16
## 6    298     474      4 1998-01-07 08:20:06

Prepare ‘User Ratings’ Matrix. Rows should represent the MovieID and Columns are UserID

##Prepare User Ratings Matrix. Rows should represent the MovieID and Columns are UserID
##-------------------------------------------------------------------------------------
(numberOfObservations <- nrow(ratings))

## [1] 100000

userratings <- ratings
for (i in 1:numberOfObservations){
  if (userratings[i,3] > 3){
    userratings[i,3] <- 1
  }
  else{
    userratings[i,3] <- -1
  }
}
#We do not need timestamp
userratings <- userratings[, -4]
head(userratings)

##   UserID MovieID Rating
## 1    196     242     -1
## 2    186     302     -1
## 3     22     377     -1
## 4    244      51     -1
## 5    166     346     -1
## 6    298     474      1

#RE-SHAPE - dcast & binarize -  long format to wide format with movie as row and userid as column
user.bin.ratings <- dcast(userratings, MovieID~UserID, value.var = "Rating", na.rm=FALSE)
for (i in 1:ncol(user.bin.ratings)){
  user.bin.ratings[which(is.na(user.bin.ratings[,i]) == TRUE),i] <- 0
}
corner(user.bin.ratings)

##   MovieID  1 2 3 4
## 1       1  1 1 0 0
## 2       2 -1 0 0 0
## 3       3  1 0 0 0
## 4       4 -1 0 0 0
## 5       5 -1 0 0 0

#remove MovieID col. Rows are MovieID, cols are UserID
user.bin.ratings = user.bin.ratings[,-1] 
corner(user.bin.ratings)

##    1 2 3 4  5
## 1  1 1 0 0  1
## 2 -1 0 0 0 -1
## 3  1 0 0 0  0
## 4 -1 0 0 0  0
## 5 -1 0 0 0  0

#There are 1682 movies and 943 users ratings in the user.bin.ratings matrix.
dim(user.bin.ratings)

## [1] 1682  943

Prepare the ‘Movie Genres’ Matrix. Rows should represent the MovieID and Columns are Genres

##Prepare Movie Genres Matrix. Rows should represent the MovieID and Columns are Genres
##-------------------------------------------------------------------------------------
genres <- as.data.table(read.table("movies/ml-100k/u.genre", header=FALSE, sep = '|',  quote = ""))
setnames(genres, c('Name', 'ID'))
knitr::kable(genres)

Name	ID
unknown	0
Action	1
Adventure	2
Animation	3
Children’s	4
Comedy	5
Crime	6
Documentary	7
Drama	8
Fantasy	9
Film-Noir	10
Horror	11
Musical	12
Mystery	13
Romance	14
Sci-Fi	15
Thriller	16
War	17
Western	18

data.movies <- as.data.table(read.table("movies/ml-100k/u.item", header=FALSE,  sep = '|', quote = ""))
# removing the columns in which all values are NA 
data.movies = data.movies[, colSums(is.na(data.movies))<nrow(data.movies), with = F]
corner(data.movies)

##    V1                V2          V3
## 1:  1  Toy Story (1995) 01-Jan-1995
## 2:  2  GoldenEye (1995) 01-Jan-1995
## 3:  3 Four Rooms (1995) 01-Jan-1995
## 4:  4 Get Shorty (1995) 01-Jan-1995
## 5:  5    Copycat (1995) 01-Jan-1995
##                                                        V5 V6
## 1:  http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)  0
## 2:    http://us.imdb.com/M/title-exact?GoldenEye%20(1995)  0
## 3: http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)  0
## 4: http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)  0
## 5:      http://us.imdb.com/M/title-exact?Copycat%20(1995)  0

setnames(data.movies, c('MovieID', 'MovieName', 'ReleaseDate', 'URL', as.character(genres$Name)))
corner(data.movies,5,8)

##    MovieID         MovieName ReleaseDate
## 1:       1  Toy Story (1995) 01-Jan-1995
## 2:       2  GoldenEye (1995) 01-Jan-1995
## 3:       3 Four Rooms (1995) 01-Jan-1995
## 4:       4 Get Shorty (1995) 01-Jan-1995
## 5:       5    Copycat (1995) 01-Jan-1995
##                                                       URL unknown Action
## 1:  http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)       0      0
## 2:    http://us.imdb.com/M/title-exact?GoldenEye%20(1995)       0      1
## 3: http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)       0      0
## 4: http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)       0      1
## 5:      http://us.imdb.com/M/title-exact?Copycat%20(1995)       0      0
##    Adventure Animation
## 1:         0         1
## 2:         1         0
## 3:         0         0
## 4:         0         0
## 5:         0         0

#Final Movie Genres Matrix matrix
movie.genres <- data.movies[, -(1:5), with=F]
head(movie.genres)

##    Action Adventure Animation Children's Comedy Crime Documentary Drama
## 1:      0         0         1          1      1     0           0     0
## 2:      1         1         0          0      0     0           0     0
## 3:      0         0         0          0      0     0           0     0
## 4:      1         0         0          0      1     0           0     1
## 5:      0         0         0          0      0     1           0     1
## 6:      0         0         0          0      0     0           0     1
##    Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War
## 1:       0         0      0       0       0       0      0        0   0
## 2:       0         0      0       0       0       0      0        1   0
## 3:       0         0      0       0       0       0      0        1   0
## 4:       0         0      0       0       0       0      0        0   0
## 5:       0         0      0       0       0       0      0        1   0
## 6:       0         0      0       0       0       0      0        0   0
##    Western
## 1:       0
## 2:       0
## 3:       0
## 4:       0
## 5:       0
## 6:       0

#There are 1682 movies and 18 genres in the movie.genres matrix.[ 1 indicates true, 0 indicates false]
dim(movie.genres)

## [1] 1682   18

Build User Profile Matrix for recommendation. ( DOT PRODUCT of the ‘Movie Ratings’ and ‘Movie Genres’ )

Lets create a simple user profile matrix, by calculating the dot product of the movie genres matrix and the user binary ratings matrix.

##User Profile Matrix = Dot Product of (Movie Genres Matrix) and (User Ratings Matrix)
##User Profile Matrix - with rows as the Genres [18] and Columns as the User IDs [943]
#-------------------------------------------------------------------------------------
#

#Calculate dot product for User Profiles
genres.data <- as.data.frame(movie.genres)
(cols <- ncol(user.bin.ratings))

## [1] 943

(rows <- ncol(genres.data))

## [1] 18

user.profiles = matrix(0,rows,cols)

for (c in 1:cols){
  for (r in 1:rows){
    user.profiles[r,c] <- sum((genres.data[,r]) * (user.bin.ratings[,c]))
  }
}
corner(user.profiles)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    3    4   -8    2  -10
## [2,]   -8    3    0    0   -5
## [3,]   -2    1    0    0    2
## [4,]  -15    0    0    0  -19
## [5,]    7    8   -6    4  -16

#binarize
user.profiles <- as.matrix((user.profiles > 0) + 0)
corner(user.profiles)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    1    0    1    0
## [2,]    0    1    0    0    0
## [3,]    0    1    0    0    1
## [4,]    0    0    0    0    0
## [5,]    1    1    0    1    0

Recommend movies to user 1.

So, we have got the user profile matrix, which contains the users general aggreagated inclination towards the movie genres. Lets check for a sample user’s profile’s interests and recommend few movies based on the Jaccard similarity distance, which works better for binary matrices like this.

#
#Lets get the first users interests
(user.1.profile <- user.profiles[,1]) #First user's profile

##  [1] 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0

#rbind it with the genres matrix, and check the similarity
similarity <- rbind.data.frame(user.1.profile, genres.data)
similarity <- data.frame(lapply(similarity,function(x){as.integer(x)})) #convert data to type integer

#Calculate Jaccard distance between our first user profile and all movie genres data.
similarity_results <- dist(similarity, method = "Jaccard")
similarity_results <- as.data.frame(as.matrix(similarity_results[1:nrow(genres.data)]))
#find the ones with minimum distance
closest.rows <- which(similarity_results == min(similarity_results))

#Recommended movies
genres.data[closest.rows,]

##     Action Adventure Animation Children's Comedy Crime Documentary Drama
## 17       1         0         0          0      1     1           0     0
## 855      1         0         0          0      0     0           0     1
##     Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War
## 17        0         0      1       0       0       0      0        1   0
## 855       0         0      0       0       1       1      0        1   0
##     Western
## 17        0
## 855       0

corner(data.movies[closest.rows,],2,3)

##    MovieID                  MovieName ReleaseDate
## 1:      17 From Dusk Till Dawn (1996) 05-Feb-1996
## 2:     855                Diva (1981) 01-Jan-1981

Observation:

So, for content based recommendations, we do not need lot of user data, we can start giving recommendations to users based on the item data. On the flip side, if our item data is not well distributed ( say, we have got lots of action movies only), then the recommendations won’t be effective here.

Apply SVD to ratings matrix.

Similary, the matrix factorization technique, SVD, compresses the ratings matrix by considering the only high varying values. Lets review this:

#Try to apply the SVD on the initial ratings matrix and review the object sizes.
head(ratings)

##   UserID MovieID Rating           Timestamp
## 1    196     242      3 1997-12-04 09:55:49
## 2    186     302      3 1998-04-04 13:22:22
## 3     22     377      1 1997-11-07 01:18:36
## 4    244      51      2 1997-11-26 23:02:03
## 5    166     346      1 1998-02-01 23:33:16
## 6    298     474      4 1998-01-07 08:20:06

m.matrix = sparseMatrix(i = ratings$UserID, j = ratings$MovieID,x=ratings$Rating)

#Lets just focus on 20 large singular values, and correponding singular vectors
m.svd = irlba(m.matrix, nu = 20, nv = 20) 

(object.size(m.matrix))

## 1208152 bytes

recommenderlab::plot(m.svd$d, main="20 large singular values")

(d <- m.svd$d)

##  [1] 640.63362 244.83635 217.84622 159.15360 158.21191 145.87261 126.57977
##  [8] 121.90770 106.82918  99.74794  93.79886  93.25844  89.91150  84.34179
## [15]  83.81221  81.81204  79.07797  77.88653  76.38800  75.34159

corner(u <- m.svd$u)

##             [,1]         [,2]         [,3]        [,4]        [,5]
## [1,] 0.065804306 -0.005975064 -0.006132555 -0.08434724 -0.01418805
## [2,] 0.014021043  0.046626024  0.052578557  0.01628247  0.01546749
## [3,] 0.005657981  0.025618448  0.023361829  0.02856416 -0.04405688
## [4,] 0.005993102  0.020698495  0.012452276  0.01969657 -0.02511649
## [5,] 0.032747312 -0.009159010 -0.046130648 -0.01440656 -0.01051815

corner(v <- m.svd$v)

##            [,1]         [,2]        [,3]         [,4]        [,5]
## [1,] 0.09595094  0.087239785 -0.01697376 -0.016206075  0.14050607
## [2,] 0.03517952  0.007025058 -0.06250392 -0.003233914 -0.04075770
## [3,] 0.01992881  0.028618173 -0.01164050 -0.048800596 -0.00440180
## [4,] 0.05995224 -0.013050196 -0.02644554 -0.042999522 -0.03822088
## [5,] 0.02160711  0.015310568 -0.02646012 -0.017901490 -0.01565492

#Notice the size difference between the original ratings matrix Vs the factored matrices !
(object.size(m.matrix)- sum(object.size(m.svd$u), object.size(m.svd$d), object.size(m.svd$v) ))

## 787552 bytes

Lets use the recommenderlab package to generate the SVD model

The description and default parameters of SVD method in recommenderlab are as follows:

recommenderRegistry$get_entry('SVD', dataType='realRatingMatrix')

## Recommender method: SVD
## Description: Recommender based on SVD approximation with column-mean imputation (real data).
## Parameters:
##    k maxiter normalize
## 1 10     100    center

Create the model using SVD method:

ratings.ratingMatrix <- as(ratings, "realRatingMatrix")
smp_size <- floor(0.75 * nrow(ratings.ratingMatrix))
set.seed(123)
train_ind <- sample(seq_len(nrow(ratings.ratingMatrix)), size = smp_size)

train.RatingMat <- ratings.ratingMatrix[train_ind, ]
test.RatingMat <- ratings.ratingMatrix[-train_ind, ]

svd_rec <- Recommender(
   data=train.RatingMat,
   method='SVD',            
   parameter=list(
     categories=20,         # number of latent factors
     normalize='center',    # normalizing by subtracting average rating per user;
     method='Pearson'       # use Pearson correlation
   ))

## Warning: Unknown parameters: categories, method

## Available parameter (with default values):
## k     =  10
## maxiter   =  100
## normalize     =  center
## verbose   =  FALSE

svd_rec

## Recommender of type 'SVD' for 'realRatingMatrix' 
## learned using 707 users.

#recommend few movies to user 1
userID <- 1
topN <- 3
topN_recommendList <-predict(svd_rec,train.RatingMat[userID],n=topN) 
corner(data.movies[topN_recommendList@items[[1]],],2,3)

##    MovieID                MovieName ReleaseDate
## 1:     100             Fargo (1996) 14-Feb-1997
## 2:     276 Leaving Las Vegas (1995) 01-Jan-1995