We will be using the MovieLens dataset for this purpose. It was collected by the GroupLens Research Project at the University of Minnesota. The MovieLens 100K dataset can be downloaded from here. It consists of 100,000 ratings (on a 1-5 scale) from 943 users on 1,682 movies.
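If you prefer to fetch the files programmatically, the snippet below is a minimal sketch; it assumes the standard GroupLens download URL and unpacks the archive into the home directory used by the read.csv calls that follow.
#Download and unzip the MovieLens 100K dataset
download.file("http://files.grouplens.org/datasets/movielens/ml-100k.zip",
destfile = path.expand("~/ml-100k.zip"))
unzip(path.expand("~/ml-100k.zip"), exdir = path.expand("~"))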
Importing Data:
# Assigning column names to each file.
# Column names available in the readme file
#Reading users file
data_user<-read.csv("~/ml-100k/u.user",sep = '|',header = F,col.names = c('user_id', 'age', 'sex', 'occupation', 'zip_code'))
#Reading ratings file
data_rat<-read.csv("~/ml-100k/u.data",sep = '\t',header = F,col.names = c('user_id', 'movie_id', 'rating', 'unix_timestamp'))
#Reading items file
data_item<-read.csv("~/ml-100k/u.item",sep = '|',header = F,col.names = c('movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure','Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'))
Validating the imported data:
#Validating users file
dim(data_user)
## [1] 943 5
head(data_user)
## user_id age sex occupation zip_code
## 1 1 24 M technician 85711
## 2 2 53 F other 94043
## 3 3 23 M writer 32067
## 4 4 24 M technician 43537
## 5 5 33 F other 15213
## 6 6 42 M executive 98101
#Validating ratings file
dim(data_rat)
## [1] 100000 4
head(data_rat)
## user_id movie_id rating unix_timestamp
## 1 196 242 3 881250949
## 2 186 302 3 891717742
## 3 22 377 1 878887116
## 4 244 51 2 880606923
## 5 166 346 1 886397596
## 6 298 474 4 884182806
#Validating items file
dim(data_item)
## [1] 1682 24
head(data_item)
## movie.id movie.title
## 1 1 Toy Story (1995)
## 2 2 GoldenEye (1995)
## 3 3 Four Rooms (1995)
## 4 4 Get Shorty (1995)
## 5 5 Copycat (1995)
## 6 6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## release.date video.release.date
## 1 01-Jan-1995 NA
## 2 01-Jan-1995 NA
## 3 01-Jan-1995 NA
## 4 01-Jan-1995 NA
## 5 01-Jan-1995 NA
## 6 01-Jan-1995 NA
## IMDb.URL unknown
## 1 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) 0
## 2 http://us.imdb.com/M/title-exact?GoldenEye%20(1995) 0
## 3 http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) 0
## 4 http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) 0
## 5 http://us.imdb.com/M/title-exact?Copycat%20(1995) 0
## 6 http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995) 0
## Action Adventure Animation Children.s Comedy Crime Documentary Drama
## 1 0 0 1 1 1 0 0 0
## 2 1 1 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 1 0 0 0 1 0 0 1
## 5 0 0 0 0 0 1 0 1
## 6 0 0 0 0 0 0 0 1
## Fantasy Film.Noir Horror Musical Mystery Romance Sci.Fi Thriller War
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1 0
## 3 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 1 0
## 6 0 0 0 0 0 0 0 0 0
## Western
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Loading Train and Test Datasets:
#Reading the test and train data
#Giving column names to both datasets
#Reading train data
train<-read.csv("~/ml-100k/u1.base",sep = '\t',header = F,
col.names = c('user_id', 'movie_id', 'rating', 'unix_timestamp'))
#Reading test data
test<-read.csv("~/ml-100k/u1.test",sep = '\t',header = F,
col.names = c('user_id', 'movie_id', 'rating', 'unix_timestamp'))
Validating Train and Test Datasets:
#Validating train file
dim(train)
## [1] 80000 4
head(train)
## user_id movie_id rating unix_timestamp
## 1 1 1 5 874965758
## 2 1 2 3 876893171
## 3 1 3 4 878542960
## 4 1 4 3 876893119
## 5 1 5 3 889751712
## 6 1 7 4 875071561
#Validating test file
dim(test)
## [1] 20000 4
head(test)
## user_id movie_id rating unix_timestamp
## 1 1 6 5 887431973
## 2 1 10 3 875693118
## 3 1 12 5 878542960
## 4 1 14 5 874965706
## 5 1 17 3 875073198
## 6 1 20 4 887431883
A simple approach could be to recommend the items liked by the largest number of users. This is a blazingly fast, quick-and-dirty approach, and it has a major drawback: there is no personalization involved. Let's start by building a popularity-based model, i.e. one where all users receive the same recommendations, based on the most popular choices.
#Computing the mean rating for each movie_id
popularity_rec<-aggregate(train[, 3], list(train$movie_id), mean)
head(popularity_rec)
## Group.1 x
## 1 1 3.892950
## 2 2 3.180952
## 3 3 3.000000
## 4 4 3.526316
## 5 5 3.304348
## 6 6 3.400000
#Sorting the movies by mean rating in descending order
popularity_rec<- popularity_rec[order(-popularity_rec$x),]
colnames(popularity_rec)<-c("movie_id","rating")
popularity_rec$rating <- round(popularity_rec$rating)
#Recommended movies with top ratings
head(popularity_rec,20)
## movie_id rating
## 1116 1122 5
## 1182 1189 5
## 1194 1201 5
## 1285 1293 5
## 1450 1467 5
## 1480 1500 5
## 1567 1599 5
## 1621 1653 5
## 1434 1449 5
## 1353 1367 5
## 408 408 5
## 169 169 5
## 846 850 4
## 1562 1594 4
## 1610 1642 4
## 318 318 4
## 483 483 4
## 64 64 4
## 12 12 4
## 50 50 4
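One limitation is visible above: most of these "top" movies reach a mean of 5 only because a handful of users rated them. A common refinement, sketched below, is to require a minimum number of ratings before a movie can be recommended; this is not part of the original pipeline, and the threshold of 50 ratings is an arbitrary choice.
#Mean rating and number of ratings per movie
pop <- aggregate(rating ~ movie_id, data = train, FUN = mean)
cnt <- aggregate(rating ~ movie_id, data = train, FUN = length)
colnames(cnt)[2] <- "n_ratings"
pop <- merge(pop, cnt, by = "movie_id")
#Keep only movies with at least 50 ratings, then sort by mean rating
pop_filtered <- pop[pop$n_ratings >= 50, ]
head(pop_filtered[order(-pop_filtered$rating), ], 10)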
Now let's come to the special class of algorithms that are tailor-made for solving the recommendation problem. There are typically two types: content-based and collaborative filtering. Item-item collaborative filtering is quite similar to user-user collaborative filtering, but instead of finding look-alike customers, we find look-alike items. Once we have the item look-alike matrix, we can easily recommend similar items to a customer who has purchased any item from the store. This algorithm is far less resource-consuming than user-user collaborative filtering: for a new customer it takes far less time, since we don't need the similarity scores between all pairs of customers. And with a fixed number of products, the product-product look-alike matrix is fixed over time.
Let's start by understanding the basics of a collaborative filtering algorithm. The core idea works in two steps:
1. Find similar items using a similarity metric.
2. For a user, recommend the items most similar to the items he/she already likes.
To give you a high-level overview, this is done by building an item-item matrix that records which pairs of items were rated together.
In this case, an item is a movie. Once we have the matrix, we use it to determine the best recommendations for a user based on the movies he/she has already rated.
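To make this concrete, here is a minimal, self-contained sketch of item-item cosine similarity on a toy rating matrix; the data and names are invented for illustration and are not part of the MovieLens pipeline.
#Toy user-item rating matrix: 4 users (rows) x 3 movies (columns), 0 = unrated
m <- matrix(c(5, 3, 0,
4, 0, 4,
1, 1, 5,
0, 2, 4),
nrow = 4, byrow = TRUE,
dimnames = list(paste0("u", 1:4), paste0("movie", 1:3)))
#Cosine similarity between two item (column) vectors
cosine_sim <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
#Pairwise item-item similarity matrix
item_sim <- outer(1:ncol(m), 1:ncol(m),
Vectorize(function(i, j) cosine_sim(m[, i], m[, j])))
dimnames(item_sim) <- list(colnames(m), colnames(m))
item_sim #items rated similarly across users score close to 1
To recommend for a user, each unseen movie is then scored by the similarity-weighted ratings of the movies that user has already rated.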
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
#Convert the data.frame into transactions,
#then convert the transactions to a binaryRatingMatrix:
data_train <- as(train, "transactions")
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
data_train_1 <- as(data_train, "binaryRatingMatrix")
data_test<- as(test, "transactions")
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
data_test_1 <- as(data_test, "binaryRatingMatrix")
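Note that the transactions route binarizes everything, including the rating values and the timestamps (hence the discretization warnings above). If you want to preserve the actual 1-5 ratings, recommenderlab can coerce a (user, item, rating) data frame directly to a realRatingMatrix; a minimal sketch:
#Alternative: keep the actual ratings instead of binarizing
data_train_real <- as(train[, c("user_id", "movie_id", "rating")], "realRatingMatrix")
data_train_real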
# Train an item-based collaborative filtering (IBCF) model to find the top 10 recommended movies
model1 <- Recommender(data = data_train_1, method = "IBCF", parameter = list(k = 25,method ="pearson"))
model1
## Recommender of type 'IBCF' for 'binaryRatingMatrix'
## learned using 80000 users.
# Applying model to test
predicted1 <- predict(object = model1, newdata = data_test_1, n = 10)
predicted1
## Recommendations as 'topNList' with n = 10 for 20000 users.
# Flattening the predicted topNList into a data frame of (user_id, rating, movie_id) rows
# (user_id is recycled against the unlisted ratings and items)
reccom <- data.frame(user_id= 1:length(predicted1@items),
rating = unlist(predicted1@ratings), movie_id = unlist(predicted1@items))
#Displaying the first 25 recommendation rows
reccom_list<- reccom[order(reccom$user_id),]
head(reccom_list,25)
## user_id rating movie_id
## 11 1 0.5070863 10
## 25011 1 0.5070863 10
## 50011 1 0.5080044 6
## 75011 1 0.5123894 4
## 100011 1 0.5123894 4
## 125011 1 0.5043960 11
## 150011 1 0.5048248 6
## 175011 1 0.5091854 4
## 12 2 0.5019132 6
## 25012 2 0.5043330 2
## 50012 2 0.5043330 2
## 75012 2 0.5043330 11
## 100012 2 0.5069325 7
## 125012 2 0.5034303 1
## 150012 2 0.5014193 10
## 175012 2 0.5026783 10
## 13 3 0.5018705 5
## 25013 3 0.5017658 7
## 50013 3 0.5026233 10
## 75013 3 0.5032640 1
## 100013 3 0.5048487 11
## 125013 3 0.5014193 3
## 150013 3 0.5007818 2
## 175013 3 0.5013570 6
## 14 4 0.5010362 7
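An alternative to reading the S4 slots directly is to coerce the topNList to a plain list of recommended item labels per user; a minimal sketch:
#Coerce the topNList into a list keyed by user
rec_list <- as(predicted1, "list")
rec_list[1:3]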
Evaluating Recommendation Engines:
Let's compare both the models we have built so far on their precision-recall characteristics. Classification performance can also be compared using the ROC curve, which plots the true positive rate (TPR = TP / (TP + FN), the same quantity as recall) against the false positive rate (FPR = FP / (FP + TN)); precision is TP / (TP + FP).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
#Evaluate the popularity model (M0)
pred1 <- data.frame(movie_id=test$movie_id,rating=test$rating)
pred1<-merge(pred1,popularity_rec,by="movie_id")
# create the confusion matrix
cm = as.matrix(table(Actual = pred1$rating.x, Predicted = pred1$rating.y))
cm
## Predicted
## Actual 1 2 3 4 5
## 1 21 262 788 300 3
## 2 6 212 1311 660 3
## 3 4 213 2576 2380 2
## 4 1 114 2272 4373 12
## 5 2 35 850 3543 25
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # distribution of instances over the predicted classes
precision = diag / colsums
recall = diag / rowsums
df<-data.frame(precision, recall)
df
## precision recall
## 1 0.6176471 0.015283843
## 2 0.2535885 0.096715328
## 3 0.3303835 0.497777778
## 4 0.3885039 0.645747194
## 5 0.5555556 0.005611672
#Evaluate the IBCF model (M1)
#given = -1 uses the all-but-1 scheme: all but one rating per user are given to the recommender, and the held-out rating is used for evaluation
es <- evaluationScheme(data_test_1,method = "cross-validation",k=10,given=-1)
ev <- evaluate(es,"IBCF",type="topNList",n=c(1,3,5,10,15,20),parameter = list(k = 25,method ="pearson"))
## IBCF run fold/sample [model time/prediction time]
## 1 [0.13sec/0.31sec]
## 2 [0.01sec/0.27sec]
## 3 [0.01sec/0.22sec]
## 4 [0.02sec/0.19sec]
## 5 [0sec/0.22sec]
## 6 [0.01sec/0.19sec]
## 7 [0sec/0.2sec]
## 8 [0.02sec/0.22sec]
## 9 [0.01sec/0.22sec]
## 10 [0.01sec/0.19sec]
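recommenderlab can also evaluate several algorithms under the same scheme in one call, which puts a popularity baseline and IBCF on an equal footing. The sketch below assumes this; "POPULAR" and "RANDOM" are method names shipped with the package.
#Compare several algorithms under the same evaluation scheme
algos <- list(
"popular" = list(name = "POPULAR", param = NULL),
"item-based" = list(name = "IBCF", param = list(k = 25, method = "pearson")),
"random" = list(name = "RANDOM", param = NULL)
)
ev_all <- evaluate(es, algos, type = "topNList", n = c(1, 3, 5, 10, 15, 20))
plot(ev_all, annotate = TRUE) #ROC curves for all three models on one plot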
#Average Confusion Matrix
tmp <- ev %>% getConfusionMatrix() %>% as.list()
avg_cm <- as.data.frame(Reduce("+",tmp)/length(tmp)) %>% mutate(n=c(1,3,5,10,15,20)) %>% select('n','precision','recall','TPR','FPR')
avg_cm %>% ggplot(aes(FPR,TPR))+geom_line(color='red')+geom_label(aes(label = n))+labs(title = "ROC Curve")+theme_gray(base_size = 14)
avg_cm %>% ggplot(aes(recall,precision))+geom_line(color='blue')+geom_label(aes(label = n))+labs(title = "Precision-Recall Curve")+theme_gray(base_size = 14)
1. The item-similarity model is definitely better than the popularity model (by at least 10x).
2. On an absolute level, even the item-similarity model performs poorly. It is far from being a useful recommendation system.