DATA 643 - Final Project Part-I
Context Based Movie Recommendation System
This assignment leverages the context pre filtering and makes it 2D and applies traditional recommender algorithms.
Data set source: https://github.com/irecsys/CARSKit/tree/master/context-aware_data_sets
#Step1 - Load libraries
library(dplyr)
library(recommenderlab)
library(reshape2)
library(RCurl)
library(ggplot2)
library(base)
# Step - Binarize.
binarize <- function(df, ratingFieldName) {
#If the rating is >= 3, then we treat it as 1, else 0.
df$ratingFieldName = ifelse(as.numeric(df[[ratingFieldName]]) >= 3, 1, 0)
return (df)
}
#3. Pre-Filtering - Prepare the predicates and Pre-Filter the contexts as per the match criteria.
filterByContext <- function (df, contexts, exactMatch){
predicates <- sapply(contexts, function(context) {
name <- context[1]
value <- context[2]
paste0(name,"=='",value , "'")
})
#Now combine ALL given predicates
if (exactMatch) {
predicates <- do.call(paste, c(as.list(predicates), sep=" & "))
}
else {
predicates <- do.call(paste, c(as.list(predicates), sep=" | "))
}
#Print
print(predicates)
#Filter Now.
( df.filtered <- df %>% filter_( predicates) )
}
#Get real ratings matrix from the sparse matrix
getRealRatingsMatrix <- function(sparseMatrix) {
sparseMatrix<-as.matrix(sparseMatrix)
#Lest check how sparse this matrix is.
sparseMatrixTemp <- sparseMatrix
is.na(sparseMatrixTemp) <- sparseMatrixTemp==0
print(paste0("Sparsity of the matrix:", sum(is.na(sparseMatrixTemp))/(nrow(sparseMatrixTemp)*ncol(sparseMatrixTemp))))
#Make it as real ratings matrix.
sparseMatrix.realRatings <- as(sapply(data.frame(sparseMatrix),as.numeric), "realRatingMatrix")
return(sparseMatrix.realRatings)
}
#Split the data frame into train and test.
splitDF <- function(normRatingMat)
{
## 75% of the sample size
smp_size <- floor(0.75 * nrow(normRatingMat))
set.seed(123)
train_ind <- sample(seq_len(nrow(normRatingMat)), size = smp_size)
train.RatingMat <- normRatingMat[train_ind, ]
test.RatingMat <- normRatingMat[-train_ind, ]
return(list(trainDF=train.RatingMat, testDF=test.RatingMat))
}
#Create the Model for the rating matrix and method.
createModel <-function (ratingMat,algType)
{
model <- Recommender(ratingMat, method = algType)
names(getModel(model))
getModel(model)$method
getModel(model)$nn
return (model)
}
#Provide the recommendations/predictions.
recommendations <- function(ratingMat, model, userID, n)
{
### predict top n recommendations for given user
topN_recommendList <-predict(model,ratingMat[userID],n=n)
topN_recommendList@items[[1]]
return(topN_recommendList)
}
#Data Acquisition and preprocessing
ratingsurl <- getURL("https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/Project4/ratings.txt")
ratings <- read.csv(text = ratingsurl,header = TRUE, stringsAsFactors = FALSE)
knitr::kable(ratings[10:20,])
| userid | itemid | rating | Time | Location | Companion | |
|---|---|---|---|---|---|---|
| 10 | 1090 | tt4052394 | 3 | NA | NA | NA |
| 11 | 1090 | tt1596343 | 2 | NA | NA | NA |
| 12 | 1090 | tt2574698 | 2 | NA | NA | NA |
| 13 | 1090 | tt0454848 | 2 | NA | NA | NA |
| 14 | 1088 | tt0405422 | 4 | NA | NA | NA |
| 15 | 1088 | tt0378194 | 4 | NA | NA | NA |
| 16 | 1088 | tt0211915 | 5 | NA | NA | NA |
| 17 | 1088 | tt0169547 | 5 | NA | NA | NA |
| 18 | 1088 | tt0133093 | 5 | NA | NA | NA |
| 19 | 1110 | tt0454876 | 3 | NA | NA | NA |
| 20 | 1110 | tt0181689 | 3 | NA | NA | NA |
# Step2. #binarize.
ranking.data <- binarize(ratings, "rating")
ranking.data$rating=ranking.data$ratingFieldName
knitr::kable(ranking.data[10:20,])
| userid | itemid | rating | Time | Location | Companion | ratingFieldName | |
|---|---|---|---|---|---|---|---|
| 10 | 1090 | tt4052394 | 1 | NA | NA | NA | 1 |
| 11 | 1090 | tt1596343 | 0 | NA | NA | NA | 0 |
| 12 | 1090 | tt2574698 | 0 | NA | NA | NA | 0 |
| 13 | 1090 | tt0454848 | 0 | NA | NA | NA | 0 |
| 14 | 1088 | tt0405422 | 1 | NA | NA | NA | 1 |
| 15 | 1088 | tt0378194 | 1 | NA | NA | NA | 1 |
| 16 | 1088 | tt0211915 | 1 | NA | NA | NA | 1 |
| 17 | 1088 | tt0169547 | 1 | NA | NA | NA | 1 |
| 18 | 1088 | tt0133093 | 1 | NA | NA | NA | 1 |
| 19 | 1110 | tt0454876 | 1 | NA | NA | NA | 1 |
| 20 | 1110 | tt0181689 | 1 | NA | NA | NA | 1 |
#3. Initialize the context.
#Pre-filter contexts - sample context we are trying out is the Time as 'Weekend' and Location is 'Home'
contexts <- list(c('Time','Weekend'), c('Location','Home'))
ranking.data.context <- filterByContext(ranking.data, contexts, TRUE)
## [1] "Time=='Weekend' & Location=='Home'"
knitr::kable(ranking.data.context[10:20,])
| userid | itemid | rating | Time | Location | Companion | ratingFieldName | |
|---|---|---|---|---|---|---|---|
| 10 | 1067 | tt1099212 | 0 | Weekend | Home | Partner | 0 |
| 11 | 1018 | tt0110357 | 1 | Weekend | Home | Family | 1 |
| 12 | 1018 | tt0111161 | 1 | Weekend | Home | Family | 1 |
| 13 | 1018 | tt0169547 | 1 | Weekend | Home | Family | 1 |
| 14 | 1018 | tt0367594 | 1 | Weekend | Home | Family | 1 |
| 15 | 1018 | tt0133093 | 1 | Weekend | Home | Family | 1 |
| 16 | 1018 | tt0120338 | 0 | Weekend | Home | Alone | 0 |
| 17 | 1018 | tt1291150 | 0 | Weekend | Home | Alone | 0 |
| 18 | 1018 | tt1099212 | 0 | Weekend | Home | Alone | 0 |
| 19 | 1018 | tt0120338 | 0 | Weekend | Home | Family | 0 |
| 20 | 1018 | tt1291150 | 0 | Weekend | Home | Family | 0 |
#4. Now convert into 2D and apply the traditional recommenders
rating.df <- ranking.data.context %>% select(userid, itemid, rating)
rating.wide <-dcast(rating.df, userid ~ itemid, value.var='rating', fun.aggregate= mean, na.rm=T)
dim(rating.wide)
## [1] 67 79
#5. This is a highly sparse matrix, so, get the real ratings matrix
realrating.matrix <- getRealRatingsMatrix(rating.wide)
## [1] "Sparsity of the matrix:0.890421311165691"
#6. Visualize it!
image(realrating.matrix, main = "Heatmap of the rating matrix")
#7. Split Dataframe
splitData <-splitDF(realrating.matrix)
#8. Prepare models
IBCF.model <- createModel(splitData$trainDF,"IBCF")
UBCF.model <- createModel(splitData$trainDF,"UBCF")
#Let us get the top 4 recommendations for user 1122 for the given context.
userID <- 10
topN <- 10
predict_list <-recommendations(splitData$trainDF, UBCF.model, userID, topN)
ranking.data.context[predict_list@items[[1]],c("itemid", "Time", "Location")]
## itemid Time Location
## 24 tt0232500 Weekend Home
## 66 tt0462538 Weekend Home
## 69 tt0462538 Weekend Home
## 63 tt0169547 Weekend Home
## 17 tt1291150 Weekend Home
## 26 tt1055369 Weekend Home
## 70 ttnanana1 Weekend Home
## 51 tt0133093 Weekend Home
## 14 tt0367594 Weekend Home
## 60 tt0382625 Weekend Home
userID <- 10
topN <- 10
predict_list <-recommendations(splitData$trainDF, IBCF.model, userID, topN)
ranking.data.context[predict_list@items[[1]],c("itemid", "Time", "Location")]
## itemid Time Location
## 70 ttnanana1 Weekend Home
## 71 tt1232829 Weekend Home
## 4 tt1375666 Weekend Home
## 5 tt0110475 Weekend Home
## 14 tt0367594 Weekend Home
## 15 tt0133093 Weekend Home
## 21 tt1099212 Weekend Home
## 22 tt0319262 Weekend Home
## 23 tt0441773 Weekend Home
## 26 tt1055369 Weekend Home