DATA 643 - Project 4

Description

This assignment leverages the context pre filtering and makes it 2D and applies traditional recommender algorithms.

##Load libraries
unavailable <- setdiff(c("recommenderlab", "xlsx", "RCurl", "reshape2","dplyr", "Matrix"), rownames(installed.packages()))
if (length(unavailable)>0){
  install.packages(unavailable)
}

library(recommenderlab)
library(reshape2)
library(RCurl)
library(xlsx)
library(dplyr)
library(Matrix)

Common Functions

Function to binarize the given data set

#binarize.
binarize <- function(df, ratingFieldName) {
  #If the rating is >= 3, then we treat it as 1, else 0.
  df[[ratingFieldName]] = ifelse(as.numeric(df[[ratingFieldName]]) >= 3, 1, 0)
  return (df)
}

Function to apply the context pre-filtering for a given data frame, contexts list, and exactMatch or Not

#Pre-Filtering - Prepare the predicates and Pre-Filter the contexts as per the match criteria.
filterByContext <- function (df, contexts, exactMatch){
  predicates <- sapply(contexts, function(context) {
    name  <-  context[1]
    value <-  context[2]
    paste0(name,"=='",trimws(value) , "'")
  })
  
  #Now combine ALL given predicates
  if (exactMatch) {
    predicates <- do.call(paste, c(as.list(predicates), sep=" & "))  
  }
  else {
    predicates <- do.call(paste, c(as.list(predicates), sep=" | "))  
  }
  
  #Print
  print(predicates)
  
  #Filter Now.
  ( df.filtered <- df %>% filter_( predicates) )
}

Function to prepare the real ratings matrix from the given sparse matrix

#Get real ratings matrix from the sparse matrix
getRealRatingsMatrix <- function(sparseMatrix) {
  
  sparseMatrix<-as.matrix(sparseMatrix)
  
  #Lest check how sparse this matrix is.
  sparseMatrixTemp <- sparseMatrix
  is.na(sparseMatrixTemp) <- sparseMatrixTemp==0  
  
  print(paste0("Sparsity of the matrix:",      sum(is.na(sparseMatrixTemp))/(nrow(sparseMatrixTemp)*ncol(sparseMatrixTemp))))
  
  #Make it as real ratings matrix.
  sparseMatrix.realRatings <- as(sapply(data.frame(sparseMatrix),as.numeric), "realRatingMatrix")
  
  return(sparseMatrix.realRatings)
}

Function to split the data frame into training and test

#Split the data frame int train and test.
splitDF <- function(normRatingMat)
{
  ## 75% of the sample size
  smp_size <- floor(0.75 * nrow(normRatingMat))
  set.seed(123)
  train_ind <- sample(seq_len(nrow(normRatingMat)), size = smp_size)
  
  train.RatingMat <- normRatingMat[train_ind, ]
  test.RatingMat <- normRatingMat[-train_ind, ]
  
  
  return(list(trainDF=train.RatingMat, testDF=test.RatingMat))
}

Function to create Model using Recommender class for a given ratings matrix and algorithm type

#Create the Model for the rating matrix and method.
createModel <-function (ratingMat,algType)
{
  model <- Recommender(ratingMat, method = algType)
  names(getModel(model))
  getModel(model)$method
  
  getModel(model)$nn
  
  return (model)
}

Function to provide recommendations

#Provide the recommendations/predictions.
recommendations <- function(ratingMat, model, userID, n)
{
  ### predict top n recommendations for given user
  topN_recommendList <-predict(model,ratingMat[userID],n=n) 
  topN_recommendList@items[[1]]
  return(topN_recommendList)
}

Client code

Data Acquisition

#Data Acquisition
download.file("http://github.com/psumank/DATA643/blob/master/WK4/Data/Data_TripAdvisor_v1.xls?raw=true", destfile = "Data_TripAdvisor_v1.xls", mode="wb")
hotel.ranking.data.full <- read.xlsx2("Data_TripAdvisor_v1.xls", sheetName="Data")
knitr::kable(hotel.ranking.data.full[10:20,])

	ID_USER	USER_STATE	USER_TIMEZONE	ID_HOTEL	HOTEL_CITY	HOTEL_STATE	HOTEL_TIMEZONE	Trip.Type	Rating
10	100	NY	Eastern	91428	Indianapolis	IN	Eastern	2	4
11	100	NY	Eastern	109399	Dallas	TX	Central	3	3
12	100	NY	Eastern	98764	Albuquerque	NM	Mountain	4	3
13	149	OR	Pacific	240681	Phoenix	AZ	Mountain	1	3
14	149	OR	Pacific	73927	Phoenix	AZ	Mountain	1	3
15	149	OR	Pacific	100584	Seattle	WA	Pacific	2	1
16	159	TX	Central	98815	ELPaso	TX	Central	1	3
17	159	TX	Central	99055	Houston	TX	Central	2	4
18	159	TX	Central	92773	Louisville	KY	Eastern	3	4
19	198	VA	Eastern	74590	Phoenix	AZ	Mountain	3	4
20	198	VA	Eastern	75180	Tucson	AZ	Mountain	3	4

trip.type.data <- read.xlsx2("Data_TripAdvisor_v1.xls", sheetName="Trip Type")
knitr::kable(trip.type.data)

X1	Family
2	Couples
3	Business
4	Solo travel
5	Friends

Data Preprocessing

#binarize
hotel.ranking.data <- binarize(hotel.ranking.data.full, "Rating")
knitr::kable(hotel.ranking.data[10:20,])

	ID_USER	USER_STATE	USER_TIMEZONE	ID_HOTEL	HOTEL_CITY	HOTEL_STATE	HOTEL_TIMEZONE	Trip.Type	Rating
10	100	NY	Eastern	91428	Indianapolis	IN	Eastern	2	1
11	100	NY	Eastern	109399	Dallas	TX	Central	3	1
12	100	NY	Eastern	98764	Albuquerque	NM	Mountain	4	1
13	149	OR	Pacific	240681	Phoenix	AZ	Mountain	1	1
14	149	OR	Pacific	73927	Phoenix	AZ	Mountain	1	1
15	149	OR	Pacific	100584	Seattle	WA	Pacific	2	0
16	159	TX	Central	98815	ELPaso	TX	Central	1	1
17	159	TX	Central	99055	Houston	TX	Central	2	1
18	159	TX	Central	92773	Louisville	KY	Eastern	3	1
19	198	VA	Eastern	74590	Phoenix	AZ	Mountain	3	1
20	198	VA	Eastern	75180	Tucson	AZ	Mountain	3	1

Context - Trip Type as ‘Business’ (which is ‘3’). and Hotel State as ‘Texas’

#Initialize the context.
#Pre-filter contexts - sample context we are trying out is the Trip Type as 'Business' and hotel state is 'TX'
contexts <- list(c('Trip.Type','3'), c('HOTEL_STATE','TX'))
hotel.ranking.data.context <- filterByContext(hotel.ranking.data, contexts, TRUE)

## [1] "Trip.Type=='3' & HOTEL_STATE=='TX'"

knitr::kable(hotel.ranking.data.context[10:20,])

	ID_USER	USER_STATE	USER_TIMEZONE	ID_HOTEL	HOTEL_CITY	HOTEL_STATE	HOTEL_TIMEZONE	Trip.Type	Rating
10	221	CT	Eastern	1199983	Arlington	TX	Central	3	1
11	221	CT	Eastern	1199983	Arlington	TX	Central	3	1
12	349	TX	Central	114948	Dallas	TX	Central	3	1
13	349	TX	Central	224130	Dallas	TX	Central	3	1
14	1135	CA	Pacific	225041	ELPaso	TX	Central	3	0
15	1373	GA	Eastern	107844	Houston	TX	Central	3	1
16	1373	GA	Eastern	109499	Arlington	TX	Central	3	1
17	1373	GA	Eastern	1176216	Arlington	TX	Central	3	1
18	1437	CA	Pacific	223151	Houston	TX	Central	3	1
19	1437	CA	Pacific	99132	Houston	TX	Central	3	1
20	1537	TX	Central	1966350	Houston	TX	Central	3	1

#Now convert into 2D and apply the traditional recommenders
hotel.rating <- hotel.ranking.data.context %>% select(ID_USER, ID_HOTEL, Rating)
hotel.rating.wide <-dcast(hotel.rating, ID_USER ~ ID_HOTEL, value.var='Rating', fun.aggregate= mean, na.rm=T)
dim(hotel.rating.wide)

## [1] 305 309

#This is a highly sparse matrix, so, get the real ratings matrix
hotel.realrating.matrix <- getRealRatingsMatrix(hotel.rating.wide)

## [1] "Sparsity of the matrix:0.99095973261181"

#Visualize it!
image(hotel.realrating.matrix, main = "Heatmap of the rating matrix")

#Split Dataframe
splitData <-splitDF(hotel.realrating.matrix)

Prepare models and provide some recommendations

#Prepare models
IBCF.model <- createModel(splitData$trainDF,"IBCF")
UBCF.model <- createModel(splitData$trainDF,"UBCF")

#Let us get the top 10 recommendations for user 45 for the given context.
userID <- 45
topN <- 10
predict_list <-recommendations(splitData$trainDF, UBCF.model, userID, topN)
hotel.ranking.data.context[predict_list@items[[1]],c("ID_HOTEL", "HOTEL_CITY", "HOTEL_STATE")]

##    ID_HOTEL HOTEL_CITY HOTEL_STATE
## 3     99518 SanAntonio          TX
## 4   1751886     Austin          TX
## 6    109399     Dallas          TX
## 7    240001     ELPaso          TX
## 8    109010     Austin          TX
## 9    240439    Houston          TX
## 10  1199983  Arlington          TX
## 11  1199983  Arlington          TX
## 12   114948     Dallas          TX
## 14   225041     ELPaso          TX

userID <- 45
topN <- 10
predict_list <-recommendations(splitData$trainDF, IBCF.model, userID, topN)
hotel.ranking.data.context[predict_list@items[[1]], c("ID_HOTEL", "HOTEL_CITY", "HOTEL_STATE") ]

##    ID_HOTEL HOTEL_CITY HOTEL_STATE
## 2     98827     ELPaso          TX
## 3     99518 SanAntonio          TX
## 4   1751886     Austin          TX
## 5     99120    Houston          TX
## 7    240001     ELPaso          TX
## 8    109010     Austin          TX
## 9    240439    Houston          TX
## 11  1199983  Arlington          TX
## 12   114948     Dallas          TX
## 13   224130     Dallas          TX

DATA 643 - Project 4

Sreejaya Nair, Suman K Polavarapu

July 13, 2016

Description

Common Functions

Client code

Data Preprocessing