Content-Based and Collaborative Filtering

Using data ratings from library SVDApproximation, there are 1M ratings with 6040 users and 3706 movies. The ratings are 1-5 (stars).

library(recommenderlab)

## Loading required package: Matrix

## Loading required package: arules

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: registry

library(SVDApproximation)

## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt'
## when loading 'SVDApproximation'

## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'SVDApproximation'

library(devtools)
install_github(repo = "SVDApproximation", username = "tarashnot")

## Warning: Username parameter is deprecated. Please use tarashnot/
## SVDApproximation

## Skipping install of 'SVDApproximation' from a github remote, the SHA1 (b53f26e5) has not changed since last install.
##   Use `force = TRUE` to force installation

library(SVDApproximation)

dim(ratings)

## [1] 1000209       3

head(ratings)

##    user item rating
## 1:    1    1      5
## 2:    6    1      4
## 3:    8    1      4
## 4:    9    1      5
## 5:   10    1      5
## 6:   18    1      4

summary(ratings)

##       user           item          rating     
##  Min.   :   1   Min.   :   1   Min.   :1.000  
##  1st Qu.:1506   1st Qu.: 966   1st Qu.:3.000  
##  Median :3070   Median :1658   Median :4.000  
##  Mean   :3025   Mean   :1731   Mean   :3.582  
##  3rd Qu.:4476   3rd Qu.:2566   3rd Qu.:4.000  
##  Max.   :6040   Max.   :3706   Max.   :5.000

visualize_ratings(ratings_table = ratings)

Content-Based

# computer item average as missing value
sparse_ratings <- sparseMatrix(i = ratings$user, j = ratings$item, x = ratings$rating, 
                               dims = c(length(unique(ratings$user)), length(unique(ratings$item))),  
                               dimnames = list(paste("u", 1:length(unique(ratings$user)), sep = ""), 
                                               paste("m", 1:length(unique(ratings$item)), sep = "")))
sparse_ratings[1:10, 1:10]

## 10 x 10 sparse Matrix of class "dgCMatrix"

##    [[ suppressing 10 column names 'm1', 'm2', 'm3' ... ]]

##                        
## u1  5 . . . . . . . . .
## u2  . . . . . . . . . .
## u3  . . . . . . . . . .
## u4  . . . . . . . . . .
## u5  . . . . . 2 . . . .
## u6  4 . . . . . . . . .
## u7  . . . . . 4 . . . .
## u8  4 . . 3 . . . . . .
## u9  5 . . . . . . . . .
## u10 5 5 . . . . 4 . . .

(real_ratings <- new("realRatingMatrix", data = sparse_ratings))

## 6040 x 3706 rating matrix of class 'realRatingMatrix' with 1000209 ratings.

# create a Recommender model
model <- Recommender(real_ratings, method = "POPULAR", param=list(normalize = "center"))

# prediction of missing values for first 7 users using item average
prediction <- predict(model, real_ratings[1:7], type="ratings")
(as(prediction, "matrix")[,1:7])

##          m1       m2       m3       m4       m5       m6       m7
## u1       NA 3.864537 3.748432 3.489864 3.768903 4.469532 4.073769
## u2 4.192675 3.389036 3.272931 3.014363 3.293402 3.994031 3.598268
## u3 4.381458 3.577818 3.461714 3.203145 3.482184 4.182814 3.787050
## u4 4.669973 3.866334 3.750229 3.491661 3.770700 4.471329 4.075566
## u5 3.625962 2.822322 2.706218 2.447649 2.726688       NA 3.031554
## u6       NA 3.577266 3.461161 3.202593 3.481632 4.182261 3.786498
## u7 4.802078 3.998438 3.882334 3.623765 3.902804       NA 4.207670

# model evaluation
# 5 ratings of 20% of users are excluded for testing
set.seed(1)
e <- evaluationScheme(real_ratings, method="split", train=0.8, given=-7)

model <- Recommender(getData(e, "train"), "POPULAR")
prediction <- predict(model, getData(e, "known"), type="ratings")

rmse_popular <- calcPredictionAccuracy(prediction, getData(e, "unknown"))[1]
rmse_popular

##      RMSE 
## 0.9578232

User-User Collaborative Filtering

#Building model
model <- Recommender(real_ratings, method = "UBCF", 
                     param=list(normalize = "center", method="Cosine", nn=50))

#Making predictions 
prediction <- predict(model, real_ratings[1:5], type="ratings")
as(prediction, "matrix")[,1:5]

##          m1       m2       m3       m4       m5
## u1       NA 4.211122 4.198299 4.188679 4.188679
## u2 3.921419 3.648803 3.677701 3.710468 3.753001
## u3 3.985738 3.799285 3.850174 3.891836 3.861364
## u4 4.167479 4.183525 4.190476 4.190476 4.195312
## u5 3.275539 3.071561 3.122653 3.118225 3.146465

#Estimating RMSE
set.seed(1)

model <- Recommender(getData(e, "train"), method = "UBCF", 
                     param=list(normalize = "center", method="Cosine", nn=50))

prediction <- predict(model, getData(e, "known"), type="ratings")

rmse_ubcf <- calcPredictionAccuracy(prediction, getData(e, "unknown"))[1]
rmse_ubcf

##     RMSE 
## 0.985047

Item-Item Collaborative Filtering

#Building model
model <- Recommender(real_ratings, method = "IBCF", 
                     param=list(normalize = "center", method="Cosine", k=350))

#Making predictions 
prediction <- predict(model, real_ratings[1:5], type="ratings")
as(prediction, "matrix")[,1:8]

##          m1       m2       m3       m4       m5       m6       m7       m8
## u1       NA 4.315856 4.077801 4.309387 4.299674 4.419849 4.000000 4.122938
## u2 4.229380 3.890647 3.905199 3.987753 4.061174 3.828620 4.003760 4.487145
## u3 4.004607 4.174743 4.256458 4.214143 4.098748 4.109843 4.147706 4.408627
## u4 4.254916 4.500481 4.083806 4.462319 4.118940 4.404064 4.269858 4.195575
## u5 3.518569 3.804019 3.549567 3.653532 4.137978       NA 3.791349 3.595634

#Estimating RMSE
set.seed(1)

model <- Recommender(getData(e, "train"), method = "IBCF", 
                     param=list(normalize = "center", method="Cosine", k=350))

prediction <- predict(model, getData(e, "known"), type="ratings")

rmse_ibcf <- calcPredictionAccuracy(prediction, getData(e, "unknown"))[1]
rmse_ibcf

##     RMSE 
## 1.032426

Summary of findings and recommendations

Summary of findings and recommendations Compare the RMSEs of content based model, user-user based model, and item-item based model, they are 0.9578232, 0.985047, 1.032426 respectively. The results show that the model accuracy from high to low is content based model, user-user based model, item-item based model. Generally speaking, the highest performance of these three models is item-item based model, and the content based is the lowest. It looks a conflict of the results with the general model performance. I would consider to rerun the model at least 10 times to get a more accurate result.

Reference:

Building a Recommendation System with R by Suresh K. Gorakala, Michele Usuelli

https://rpubs.com/tarashnot/recommender_comparison

Project 2 - Data 643

Ann Liu-Ferrara

June 12, 2017