This assignment leverages the context pre filtering and makes it 2D and applies traditional recommender algorithms.
##Load libraries
unavailable <- setdiff(c("recommenderlab", "xlsx", "RCurl", "reshape2","dplyr", "Matrix"), rownames(installed.packages()))
if (length(unavailable)>0){
install.packages(unavailable)
}
library(recommenderlab)
library(reshape2)
library(RCurl)
library(xlsx)
library(dplyr)
library(Matrix)
Function to binarize the given data set
#binarize.
binarize <- function(df, ratingFieldName) {
#If the rating is >= 3, then we treat it as 1, else 0.
df[[ratingFieldName]] = ifelse(as.numeric(df[[ratingFieldName]]) >= 3, 1, 0)
return (df)
}
Function to apply the context pre-filtering for a given data frame, contexts list, and exactMatch or Not
#Pre-Filtering - Prepare the predicates and Pre-Filter the contexts as per the match criteria.
filterByContext <- function (df, contexts, exactMatch){
predicates <- sapply(contexts, function(context) {
name <- context[1]
value <- context[2]
paste0(name,"=='",trimws(value) , "'")
})
#Now combine ALL given predicates
if (exactMatch) {
predicates <- do.call(paste, c(as.list(predicates), sep=" & "))
}
else {
predicates <- do.call(paste, c(as.list(predicates), sep=" | "))
}
#Print
print(predicates)
#Filter Now.
( df.filtered <- df %>% filter_( predicates) )
}
Function to prepare the real ratings matrix from the given sparse matrix
#Get real ratings matrix from the sparse matrix
getRealRatingsMatrix <- function(sparseMatrix) {
sparseMatrix<-as.matrix(sparseMatrix)
#Lest check how sparse this matrix is.
sparseMatrixTemp <- sparseMatrix
is.na(sparseMatrixTemp) <- sparseMatrixTemp==0
print(paste0("Sparsity of the matrix:", sum(is.na(sparseMatrixTemp))/(nrow(sparseMatrixTemp)*ncol(sparseMatrixTemp))))
#Make it as real ratings matrix.
sparseMatrix.realRatings <- as(sapply(data.frame(sparseMatrix),as.numeric), "realRatingMatrix")
return(sparseMatrix.realRatings)
}
Function to split the data frame into training and test
#Split the data frame int train and test.
splitDF <- function(normRatingMat)
{
## 75% of the sample size
smp_size <- floor(0.75 * nrow(normRatingMat))
set.seed(123)
train_ind <- sample(seq_len(nrow(normRatingMat)), size = smp_size)
train.RatingMat <- normRatingMat[train_ind, ]
test.RatingMat <- normRatingMat[-train_ind, ]
return(list(trainDF=train.RatingMat, testDF=test.RatingMat))
}
Function to create Model using Recommender class for a given ratings matrix and algorithm type
#Create the Model for the rating matrix and method.
createModel <-function (ratingMat,algType)
{
model <- Recommender(ratingMat, method = algType)
names(getModel(model))
getModel(model)$method
getModel(model)$nn
return (model)
}
Function to provide recommendations
#Provide the recommendations/predictions.
recommendations <- function(ratingMat, model, userID, n)
{
### predict top n recommendations for given user
topN_recommendList <-predict(model,ratingMat[userID],n=n)
topN_recommendList@items[[1]]
return(topN_recommendList)
}
Data Acquisition
#Data Acquisition
download.file("http://github.com/psumank/DATA643/blob/master/WK4/Data/Data_TripAdvisor_v1.xls?raw=true", destfile = "Data_TripAdvisor_v1.xls", mode="wb")
hotel.ranking.data.full <- read.xlsx2("Data_TripAdvisor_v1.xls", sheetName="Data")
knitr::kable(hotel.ranking.data.full[10:20,])
| ID_USER | USER_STATE | USER_TIMEZONE | ID_HOTEL | HOTEL_CITY | HOTEL_STATE | HOTEL_TIMEZONE | Trip.Type | Rating | |
|---|---|---|---|---|---|---|---|---|---|
| 10 | 100 | NY | Eastern | 91428 | Indianapolis | IN | Eastern | 2 | 4 |
| 11 | 100 | NY | Eastern | 109399 | Dallas | TX | Central | 3 | 3 |
| 12 | 100 | NY | Eastern | 98764 | Albuquerque | NM | Mountain | 4 | 3 |
| 13 | 149 | OR | Pacific | 240681 | Phoenix | AZ | Mountain | 1 | 3 |
| 14 | 149 | OR | Pacific | 73927 | Phoenix | AZ | Mountain | 1 | 3 |
| 15 | 149 | OR | Pacific | 100584 | Seattle | WA | Pacific | 2 | 1 |
| 16 | 159 | TX | Central | 98815 | ELPaso | TX | Central | 1 | 3 |
| 17 | 159 | TX | Central | 99055 | Houston | TX | Central | 2 | 4 |
| 18 | 159 | TX | Central | 92773 | Louisville | KY | Eastern | 3 | 4 |
| 19 | 198 | VA | Eastern | 74590 | Phoenix | AZ | Mountain | 3 | 4 |
| 20 | 198 | VA | Eastern | 75180 | Tucson | AZ | Mountain | 3 | 4 |
trip.type.data <- read.xlsx2("Data_TripAdvisor_v1.xls", sheetName="Trip Type")
knitr::kable(trip.type.data)
| X1 | Family |
|---|---|
| 2 | Couples |
| 3 | Business |
| 4 | Solo travel |
| 5 | Friends |
#binarize
hotel.ranking.data <- binarize(hotel.ranking.data.full, "Rating")
knitr::kable(hotel.ranking.data[10:20,])
| ID_USER | USER_STATE | USER_TIMEZONE | ID_HOTEL | HOTEL_CITY | HOTEL_STATE | HOTEL_TIMEZONE | Trip.Type | Rating | |
|---|---|---|---|---|---|---|---|---|---|
| 10 | 100 | NY | Eastern | 91428 | Indianapolis | IN | Eastern | 2 | 1 |
| 11 | 100 | NY | Eastern | 109399 | Dallas | TX | Central | 3 | 1 |
| 12 | 100 | NY | Eastern | 98764 | Albuquerque | NM | Mountain | 4 | 1 |
| 13 | 149 | OR | Pacific | 240681 | Phoenix | AZ | Mountain | 1 | 1 |
| 14 | 149 | OR | Pacific | 73927 | Phoenix | AZ | Mountain | 1 | 1 |
| 15 | 149 | OR | Pacific | 100584 | Seattle | WA | Pacific | 2 | 0 |
| 16 | 159 | TX | Central | 98815 | ELPaso | TX | Central | 1 | 1 |
| 17 | 159 | TX | Central | 99055 | Houston | TX | Central | 2 | 1 |
| 18 | 159 | TX | Central | 92773 | Louisville | KY | Eastern | 3 | 1 |
| 19 | 198 | VA | Eastern | 74590 | Phoenix | AZ | Mountain | 3 | 1 |
| 20 | 198 | VA | Eastern | 75180 | Tucson | AZ | Mountain | 3 | 1 |
Context - Trip Type as ‘Business’ (which is ‘3’). and Hotel State as ‘Texas’
#Initialize the context.
#Pre-filter contexts - sample context we are trying out is the Trip Type as 'Business' and hotel state is 'TX'
contexts <- list(c('Trip.Type','3'), c('HOTEL_STATE','TX'))
hotel.ranking.data.context <- filterByContext(hotel.ranking.data, contexts, TRUE)
## [1] "Trip.Type=='3' & HOTEL_STATE=='TX'"
knitr::kable(hotel.ranking.data.context[10:20,])
| ID_USER | USER_STATE | USER_TIMEZONE | ID_HOTEL | HOTEL_CITY | HOTEL_STATE | HOTEL_TIMEZONE | Trip.Type | Rating | |
|---|---|---|---|---|---|---|---|---|---|
| 10 | 221 | CT | Eastern | 1199983 | Arlington | TX | Central | 3 | 1 |
| 11 | 221 | CT | Eastern | 1199983 | Arlington | TX | Central | 3 | 1 |
| 12 | 349 | TX | Central | 114948 | Dallas | TX | Central | 3 | 1 |
| 13 | 349 | TX | Central | 224130 | Dallas | TX | Central | 3 | 1 |
| 14 | 1135 | CA | Pacific | 225041 | ELPaso | TX | Central | 3 | 0 |
| 15 | 1373 | GA | Eastern | 107844 | Houston | TX | Central | 3 | 1 |
| 16 | 1373 | GA | Eastern | 109499 | Arlington | TX | Central | 3 | 1 |
| 17 | 1373 | GA | Eastern | 1176216 | Arlington | TX | Central | 3 | 1 |
| 18 | 1437 | CA | Pacific | 223151 | Houston | TX | Central | 3 | 1 |
| 19 | 1437 | CA | Pacific | 99132 | Houston | TX | Central | 3 | 1 |
| 20 | 1537 | TX | Central | 1966350 | Houston | TX | Central | 3 | 1 |
#Now convert into 2D and apply the traditional recommenders
hotel.rating <- hotel.ranking.data.context %>% select(ID_USER, ID_HOTEL, Rating)
hotel.rating.wide <-dcast(hotel.rating, ID_USER ~ ID_HOTEL, value.var='Rating', fun.aggregate= mean, na.rm=T)
dim(hotel.rating.wide)
## [1] 305 309
#This is a highly sparse matrix, so, get the real ratings matrix
hotel.realrating.matrix <- getRealRatingsMatrix(hotel.rating.wide)
## [1] "Sparsity of the matrix:0.99095973261181"
#Visualize it!
image(hotel.realrating.matrix, main = "Heatmap of the rating matrix")
#Split Dataframe
splitData <-splitDF(hotel.realrating.matrix)
Prepare models and provide some recommendations
#Prepare models
IBCF.model <- createModel(splitData$trainDF,"IBCF")
UBCF.model <- createModel(splitData$trainDF,"UBCF")
#Let us get the top 10 recommendations for user 45 for the given context.
userID <- 45
topN <- 10
predict_list <-recommendations(splitData$trainDF, UBCF.model, userID, topN)
hotel.ranking.data.context[predict_list@items[[1]],c("ID_HOTEL", "HOTEL_CITY", "HOTEL_STATE")]
## ID_HOTEL HOTEL_CITY HOTEL_STATE
## 3 99518 SanAntonio TX
## 4 1751886 Austin TX
## 6 109399 Dallas TX
## 7 240001 ELPaso TX
## 8 109010 Austin TX
## 9 240439 Houston TX
## 10 1199983 Arlington TX
## 11 1199983 Arlington TX
## 12 114948 Dallas TX
## 14 225041 ELPaso TX
userID <- 45
topN <- 10
predict_list <-recommendations(splitData$trainDF, IBCF.model, userID, topN)
hotel.ranking.data.context[predict_list@items[[1]], c("ID_HOTEL", "HOTEL_CITY", "HOTEL_STATE") ]
## ID_HOTEL HOTEL_CITY HOTEL_STATE
## 2 98827 ELPaso TX
## 3 99518 SanAntonio TX
## 4 1751886 Austin TX
## 5 99120 Houston TX
## 7 240001 ELPaso TX
## 8 109010 Austin TX
## 9 240439 Houston TX
## 11 1199983 Arlington TX
## 12 114948 Dallas TX
## 13 224130 Dallas TX