Come up with a recommender system that provides solution to following
The dataset for this project was originally posted on snap website http://snap.stanford.edu/data/web-FineFoods.html and now available also on kaggle site. The dataset contains 500k reviews of 74k products from 250k users. https://www.kaggle.com/snap/amazon-fine-food-reviews
Citation for data
J. McAuley and J. Leskovec. From amateurs to connoisseurs: modeling the evolution of user expertise through online reviews. WWW, 2013. http://i.stanford.edu/~julian/pdfs/www13.pdf
library(data.table)
library(recommenderlab)
library(knitr)
#Read Data
reviews_raw <- fread("Reviews.csv")
##
Read 26.4% of 568454 rows
Read 44.0% of 568454 rows
Read 75.6% of 568454 rows
Read 568454 rows and 10 (of 10) columns from 0.280 GB file in 00:00:06
user_cnt <- as.vector(table(reviews_raw$UserId))
prd_cnt <- as.vector(table(reviews_raw$ProductId))
par(mfrow=c(1,2))
hist(prd_cnt,breaks = 913,xlim=c(1,50),col="orchid",
main="Reviews By Product", xlab = "Number of Reviews")
hist(user_cnt,breaks = 448,xlim=c(1,50),col="orchid",
main="Reviews by user", xlab = "Number of Reviews")
The distribution is as expected with more than 85% users and more than 75% products having less than 5 reviews.
The data has more than one reviews from some users for one product. But the percentage of duplicate is low and hence We shall remove the duplicates.
#Check for duplicate and Remove duplicates
reviews_cnt <- reviews_raw[,.N,.(UserId,ProductId)]
reviews_dup <- reviews_cnt[N>1,]
message("Number of Duplicate Rating:",nrow(reviews_dup)," Percent: ",
round(nrow(reviews_dup)/nrow(reviews_cnt),2)*100,"%")
## Number of Duplicate Rating:5859 Percent: 1%
dups <- paste(reviews_dup$UserId,reviews_dup$ProductId,sep="-")
userprod <- paste(reviews_raw$UserId,reviews_raw$ProductId,sep="-")
nondup <- !(userprod %in% dups)
reviews_uniq <- reviews_raw[nondup,]
#Select required columns
cols <- colnames(reviews_uniq)[c(2,3,7)]
reviews_score <- reviews_uniq[,cols,with=F]
In order to build collaborative filtering model we need select subset of data such that the user has at least n reviews.
# Function to get ratings such that at least n reviews arepreent for a user
get_nreviews <- function(reviews_score,n=15){
prd_cnt <- table(reviews_score$ProductId)
#Select top 2000 Products
prds <- rownames(sort(prd_cnt,decreasing = T)[1:2000])
reviews_score <- reviews_score[ ProductId %in% prds,]
#Select users such that at least n reviews present
usr_cnt <- table(reviews_score$UserId)
usrs <- rownames(usr_cnt[usr_cnt > n])
reviews_score <- reviews_score[ UserId %in% usrs,]
reviews_score <- dcast(reviews_score,UserId ~ ProductId,
value.var =c("Score") )
reviews_nscore <- as.matrix(reviews_score[,-1,with=F])
rownames(reviews_nscore) <- reviews_score$UserId
return(reviews_nscore)
}
reviews_nscore <- get_nreviews(reviews_score, 10)
Normalizing data didn’t help and resulted in poor performance. It could be possibly due to many users providing rating only for items they like.
rating_cnt <- table(reviews_nscore)
barplot(rating_cnt,names=1:5, main = "Rating Distribution")
The dataset is split into training and testing set using recommederlab package.
#Convert to real rating matrix
reviews_score_rrm <- as(reviews_nscore,"realRatingMatrix")
set.seed(50) #This doesn't seems to work.The following split changes if rerun
eval_sets <- evaluationScheme(data = reviews_score_rrm, method = "split",
train = .8,given = 8,goodRating=3, k=1)
unknown <- as(getData(eval_sets,"unknown"),"matrix")
known <- as(getData(eval_sets,"known"),"matrix")
test_user <- rownames(as(getData(eval_sets,"unknown"),"matrix"))
test_rating <- reviews_nscore[test_user, ]
Three recommendation methods popularity based recommendation, user based collaborative filtering and item based collaborative filtering are built.
#Function to identify top products
get_top_prods<- function(reviews_score, n=100){
good_rated <- reviews_score[Score >= 3,]
prd_cnt <- reviews_score[,.N,ProductId]
setorder(prd_cnt,-N)
top_prds <- prd_cnt[1:n,]
return(top_prds)
}
#Function that makes recommendation from popular products
get_popular <- function(known_rating,popular,n){
reco <- known_rating
reco[,] <- 0
k <- nrow(popular)
prds <- colnames(known_rating)
for (i in 1:nrow(known_rating)){
visited_prods <- prds[!is.na(known_rating[i,])]
popular_sel <- popular[!(ProductId %in% visited_prods),]
selected <- sample(popular_sel$ProductId,n)
reco[i,prds %in% selected] <- 1
}
return(reco)
}
#Function to compute true positives
get_tp <- function(pred,actual,nreco,goodRating=3){
tp <- sum(pred > 0 & actual >= goodRating,na.rm=T)
mean_tp <- tp/(nrow(actual))
return(mean_tp)
}
Top 100 products are considered as popular products and later used to make popularity based recommendations.
popular_prods <- get_top_prods(reviews_score,100)
train_ibcf <- function(eval_sets, nreco = 10){
model_ibcf=Recommender(data=getData(eval_sets,"train"),method="IBCF",
parameter=list(method="pearson",normalize=NA,k=20))
recommend_rli=predict(model_ibcf,getData(eval_sets,"known"),n=10)
eval_accuracy <- calcPredictionAccuracy(
x = recommend_rli, data = getData(eval_sets, "unknown"), byUser = F, goodRating = 3,
given=10)
message("TP Rate: ",round(eval_accuracy[1]/nreco*100,2),"%")
return(model_ibcf)
}
model_ibcf <- train_ibcf(eval_sets)
## TP Rate: 21.59%
The item based collaborative filtering gave 20% true positive rate for the first 10 recommendations.
#Function to train ubcf model
train_ubcf <- function(eval_sets, nreco = 10){
model_ubcf=Recommender(data=getData(eval_sets,"train"),method="UBCF",
parameter=list(method="cosine",nn=20, normalize=NA
))
recommend_rl=predict(model_ubcf,getData(eval_sets,"known"),n=nreco)
eval_accuracy <- calcPredictionAccuracy(
x = recommend_rl, data = getData(eval_sets, "unknown"), byUser = F, goodRating = 3,
given=8)
message("TP Rate: ",round(eval_accuracy[1]/nreco*100,2),"%")
return(model_ubcf)
}
model_ubcf <- train_ubcf(eval_sets)
## TP Rate: 40.19%
User based collaborative method performed better than item based collaborative filtering and gave about 40% true positive rate for first 10 recommendations.
For new user with no rating the recommendation are made from popular item list. The recommendation are selected randomly from top 100 items rather than suggesting only top items. This allows for exploration and hence the function defined above to get rating by popularity has a naive explore/exploitation scheme built-in.
A collaborative system that always recommends same set of products is not attractive. Also, the accuracy of predictions does not improve after initial set of recommendation.
Heuristic Methods Ratio Based - Select n% of items from top recommended list and rest randomly eGreedy - Select top recommended item with probability 1 - e and other items with probability e
#Function that makes recommendation using explore/exploit method
explore_exploit_herustic <- function(recos,n,exploit=0.3,prob=NULL){
bn <- bestN(recos,ceiling(n*exploit))
ratings = as(bn,"matrix")
ratings[!is.na(ratings)] <- 1
ratings[is.na(ratings)] <- 0
for(i in 1:nrow(ratings)){
items <- which(ratings[i,]==0)
sel <- sample(items,n*(1-exploit))
ratings[i,sel] <- 1
}
return(ratings)
}
#Explore exploit scheme based on eGreedy method
explore_exploit_eGreedy<- function(recos,e=.1,n,goodRating = 3){
bn <- recos
ratings = as(bn,"matrix")
ratings[is.na(ratings)] <- -1
for(i in 1:nrow(ratings)){
#Initialize probabilities to e
probs <- as.vector(rep(e,ncol(ratings)))
#Set probability of recommended items to 1 - e
probs[ratings[i,]>=goodRating] <- (1 - e)
sel <- sample(ncol(ratings),n,prob=probs)
ratings[i,] <- 0
ratings[i,sel] <- 1
}
return(ratings)
}
Evaluating the model in batch mode has its restriction. It does not allow one to accurately check performance when explore/exploit is used.
In order to do a holistic evaluation of the system we need to simulate user visit and check the progressive performance of the system.
#Function to set known ratings
set_known <- function(x,n){
known = x
unknown = x
for(i in 1:nrow(x)){
all_known <- which(!is.na(x[i,]))
nknown <- length(all_known)
if (n < nknown){
known[i,all_known[(n+1):nknown]] <- NA
unknown[i,all_known[1:n]] <- NA
} else {
unknown[i,] <- NA
}
}
return(list(known=known,unknown=unknown))
}
#Function to simulate user visit and use a recommendation scheme
simulate_user_visit <- function(rating,reco_scheme,nvisits=10,nsel=2,nreco=10,
...){
evals <- NULL
cum_tp <- 0
for (i in 1:nvisits){
message("User Visit: ",i)
#Number of items user buys in each visit
selected <- 1 + (i - 1) * nsel
split_rating <- set_known(rating, selected)
known_rating <- split_rating$known
unknown_rating <- split_rating$unknown
recos <- reco_scheme(known_rating,nreco,...)
tp <- get_tp(recos,unknown_rating,nreco)
cum_tp <- cum_tp + tp
message("Cumulative True Positive:",cum_tp)
evals <- rbind(evals,data.frame(visit=i,tp=tp))
}
return(evals)
}
We shall define various recommendation schemes as function and provide that to the simulation function in order to evaluate the system.
# Popularity only based scheme - Recommend item only by popularity
scheme_popularity <- function(known_rating,nreco){
get_popular(known_rating,popular_prods,nreco)
}
# Use IBCF and heuristic explore, exploit method.
# If available rating is less than 4 (cold start) then use popularity based method
scheme_ibcf_exp <- function(known_rating,nreco,exploit){
if (mean(colSums(known_rating,na.rm = T)) < 5){
message("Cold Start: Using Popularity based recommendation")
get_popular(known_rating,popular_prods,nreco)
} else {
model <- model_ibcf
model_reco <- predict(model, as(known_rating,"realRatingMatrix"),n=nreco)
#Use explore exploitation method
final_reco <- explore_exploit_herustic(model_reco,n = nreco,exploit=exploit)
return(final_reco)
}
}
# Use UBCF and heuristic explore, exploit method.
# If available rating is less than 4 (cold start) then use popularity based method
scheme_ubcf_exp <- function(known_rating,nreco,exploit){
if (mean(colSums(known_rating,na.rm = T)) < 5){
message("Cold Start: Using Popularity based recommendation")
get_popular(known_rating,popular_prods,nreco)
} else {
model <- model_ubcf
model_reco <- predict(model, as(known_rating,"realRatingMatrix"),n=nreco)
#Use explore exploitation method
final_reco <- explore_exploit_herustic(model_reco,n = nreco,exploit=exploit)
return(final_reco)
}
}
We shall simulate user visits and check the model performance for various recommendation schemes
Recommendation only Based on Popularity
pop_tp <- simulate_user_visit(test_rating,scheme_popularity)
pop_tp$cumulative_tp = cumsum(pop_tp$tp)
kable(pop_tp,caption="Popularity Based",digits=4)
visit | tp | cumulative_tp |
---|---|---|
1 | 0.1836 | 0.1836 |
2 | 0.1894 | 0.3730 |
3 | 0.1162 | 0.4892 |
4 | 0.1148 | 0.6040 |
5 | 0.1004 | 0.7044 |
6 | 0.0803 | 0.7848 |
7 | 0.0560 | 0.8407 |
8 | 0.0488 | 0.8895 |
9 | 0.0430 | 0.9326 |
10 | 0.0373 | 0.9699 |
IBCF Based Recommendation using explore/exploit technique with popularity based method for cold start
ibcf_tp <- simulate_user_visit(test_rating,scheme_ibcf_exp,10,2,10,exploit=.5)
ibcf_tp$cumulative_tp = cumsum(ibcf_tp$tp)
kable(ibcf_tp,caption="IBCF Based Scheme")
visit | tp | cumulative_tp |
---|---|---|
1 | 0.1936872 | 0.1936872 |
2 | 0.1434720 | 0.3371593 |
3 | 1.3959828 | 1.7331420 |
4 | 1.2754663 | 3.0086083 |
5 | 1.0243902 | 4.0329986 |
6 | 0.5509326 | 4.5839311 |
7 | 0.3945481 | 4.9784792 |
8 | 0.2941176 | 5.2725968 |
9 | 0.2324247 | 5.5050215 |
10 | 0.1979914 | 5.7030129 |
UBCF Based Recommendation using explore/exploit technique with popularity based method for cold start
ubcf_tp <- simulate_user_visit(test_rating,scheme_ubcf_exp,10,2,10,exploit=.5)
ubcf_tp$cumulative_tp = cumsum(ubcf_tp$tp)
kable(ubcf_tp,caption="UBCF Based Scheme")
visit | tp | cumulative_tp |
---|---|---|
1 | 0.1936872 | 0.1936872 |
2 | 0.1606887 | 0.3543759 |
3 | 2.9813486 | 3.3357245 |
4 | 2.9081779 | 6.2439024 |
5 | 2.3988522 | 8.6427547 |
6 | 1.5853659 | 10.2281205 |
7 | 1.0373027 | 11.2654232 |
8 | 0.7560976 | 12.0215208 |
9 | 0.6154950 | 12.6370158 |
10 | 0.5093257 | 13.1463415 |
Comparing the three schemes
plot(pop_tp$cumulative_tp,type="l",col="blue",ylim=c(0,15),
main="Various Recommendation Schemes",
xlab = "User Visits", ylab = "Cumulative True Positive")
lines(ibcf_tp$cumulative_tp,col="black")
lines(ubcf_tp$cumulative_tp,col="orchid")
legend(1,15,c("Popularity","IBCF","UBCF"),col=c("blue","black","orchid"),lty=c(1,1,1))
The popularity based scheme didn’t do well which is expected. UBCF based scheme provided the best results. We could see that initially (until two user visits) all the schemes perform same due to cold start (all the schemes use popularity based method for cold start). We could also see that the tp rate reduces as user visits more and selects the recommended items. Once user select most of the recommended items the true positives would come mostly from explore scheme and explore schemes tend to have very low true positive rates.
Given below list of some key points learned from building the system