# dependencies
## data processing packages
library(tidyr); library(dplyr); library(RCurl); library(jsonlite); library(plyr)

## formatting packages
library(knitr); library(kableExtra); library(default)

## visualization packages
library(ggplot2)

## recommender packages
library(recommenderlab); library(Metrics); library(lsa); library(diveRsity)

# global options
## knit sizing
options(max.print = 100); opts_knit$set(width = 75) 

## augment chunk output
opts_chunk$set(echo = T, cache = F, tidy = T, comment = NA, message = T, warning = T)  # change message/warning to F upon completion

## set table style for consistency
default(kable) <- list(format="html")
default(kable_styling)  <- list(bootstrap_options = "hover",full_width=T, font_size=10)
default(scroll_box) <- list(width = "100%")

## working directory
##try(setwd("~/Github/612-group/project-4"))

Getting Started

In this assignment, we practiced working with accuracy and other recommender system metrics. We compared the performance of several algorithms and analyzed the diversity of our recommendations.

Data Transformations

We cleaned our data using transformations and regular expressions to unite our user and restaurant data.

# restaurant dataframe: concatenate cuisine tags per restaurant
cuisine <- aggregate(Rcuisine ~ ., data1, toString)
## merge & transform
restaurant <- data2 %>% select(placeID, name, city, price, alcohol, smoking_area) %>% 
    mutate(name = tolower(gsub("[�'_']", " ", name))) %>% mutate(city = tolower(city))
## set/revalue factors
restaurant$city <- revalue(restaurant$city, c(`cd victoria` = "ciudad victoria", 
    `cd. victoria` = "ciudad victoria", `victoria ` = "ciudad victoria", victoria = "ciudad victoria", 
    `san luis potosi ` = "san luis potosi", `san luis potos` = "san luis potosi", 
    s.l.p = "san luis potosi", slp = "san luis potosi", s.l.p. = "san luis potosi"))
restaurant$smoking_area <- revalue(restaurant$smoking_area, c(none = "no", `not permitted` = "no", 
    section = "yes", permitted = "yes", `only at bar` = "yes"))
restaurant$price <- factor(restaurant$price, levels = c("low", "medium", "high"))
restaurant$alcohol <- revalue(restaurant$alcohol, c(No_Alcohol_Served = "no", 
    Full_Bar = "yes", `Wine-Beer` = "yes"))
restaurant <- inner_join(restaurant, cuisine, by = "placeID")

# user dataframe: select attributes of interest from profile
user_profile <- data4 %>% select(userID, budget, activity, smoker)
user <- inner_join(data3, user_profile, by = "userID") %>% select(-service_rating, 
    -food_rating)
## set/revalue factors
user$budget <- factor(user$budget, levels = c("low", "medium", "high"))
user$smoker <- revalue(user$smoker, c(false = "no", true = "yes"))
user$smoker <- factor(user$smoker, levels = c("no", "yes"))
## change ratings from 0-2 scale to 1-3
user$rating[user$rating == 2] <- 3
user$rating[user$rating == 1] <- 2
user$rating[user$rating == 0] <- 1

# combine user / restaurant data & subset
data <- inner_join(user, restaurant, by = "placeID")
data <- data %>% filter(city == "san luis potosi", activity == "student") %>% 
    select(-city, -activity)

A preview of the combined output is shown below:

# view output
data %>% head() %>% kable(caption = "User-Item Dataframe") %>% kable_styling()
User-Item Dataframe

| userID | placeID | rating | budget | smoker | name                    | price  | alcohol | smoking_area | Rcuisine             |
|--------|---------|--------|--------|--------|-------------------------|--------|---------|--------------|----------------------|
| U1077  | 135085  | 3      | medium | no     | tortas locas hipocampo  | medium | no      | no           | Fast_Food            |
| U1077  | 132825  | 3      | medium | no     | puesto de tacos         | low    | no      | no           | Mexican              |
| U1077  | 135060  | 2      | medium | no     | restaurante marisco sam | medium | no      | no           | Seafood              |
| U1015  | 135071  | 1      | medium | yes    | restaurante la cantina  | medium | yes     | yes          | Bar, Bar_Pub_Brewery |
| U1108  | 135075  | 3      | medium | no     | mariscos el pescador    | medium | no      | no           | Seafood              |
| U1108  | 132572  | 2      | medium | no     | cafe chaires            | low    | no      | no           | Cafeteria            |

Data Exploration

We found that 80% of our raters were students and 76% of our restaurants were located in the Mexican city of San Luis Potosi. As a result, we subsetted our restaurant/patron data to limit the scope of our system to this specific population. After subsetting the raw data, we identified 78 unique users and 56 restaurants from which to build our recommender systems.
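
As a quick sanity check on those counts (a sketch, not the original chunk):

# confirm the unique user and restaurant counts cited above
dplyr::n_distinct(data$userID)   # 78 users
dplyr::n_distinct(data$placeID)  # 56 restaurants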

The following plots visualize the distribution of the overall ratings users gave, broken down by user budget and by each restaurant's categorized pricing. We also viewed the number of ratings each restaurant received; on average, each venue received 13 user ratings.

Data Visualizations
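
A minimal ggplot2 sketch of the distributions described above, assuming the combined data frame built earlier (illustrative; not necessarily the original plotting code):

# ratings broken down by user budget, faceted by restaurant price category
ggplot(data, aes(x = factor(rating), fill = budget)) + 
    geom_bar(position = "dodge") + 
    facet_wrap(~price) + 
    labs(x = "rating", y = "count", title = "Ratings by Budget and Restaurant Price")

# number of ratings each restaurant received
data %>% dplyr::count(placeID) %>% 
    ggplot(aes(x = n)) + 
    geom_histogram(binwidth = 2) + 
    labs(x = "ratings per restaurant", y = "number of restaurants")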

We found that most of our restaurants received the highest rating of 3. Most ratings came from users with low-to-medium budgets, and the majority of our restaurants fell in the middle price bracket. The number of ratings per restaurant varied widely, from a low of 3 to a high of 32. There were 32 unique cuisine tags assigned to our restaurants; Mexican cuisine was the most popular, and those restaurants received 108 total ratings.
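
Because Rcuisine holds comma-joined tags after the aggregation step, tallying cuisines requires splitting those rows first. A sketch of one way to compute the tally (dplyr::count is called explicitly, since the later plyr load masks it):

# split the comma-joined cuisine tags, then count rows per cuisine
data %>% 
    tidyr::separate_rows(Rcuisine, sep = ", ") %>% 
    dplyr::count(Rcuisine, sort = TRUE)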

Prepare Recommender

Matrix Building

We converted our raw ratings data into a user-item matrix to test and train our subsequent recommender system algorithms.

# create user item matrix
ui_matrix <- data %>% select(userID, placeID, rating) %>% spread(placeID, rating)
rownames(ui_matrix) <- ui_matrix$userID  # set row names to userid
ui_matrix <- ui_matrix %>% select(-userID) %>% as.matrix()  # remove userid from columns
umat <- as(ui_matrix, "realRatingMatrix")  # save real ratings for algo 

# preview matrices
as.data.frame.array(ui_matrix) %>% head() %>% kable(caption = "Preview of User-Item Matrix") %>% 
    kable_styling() %>% scroll_box()
Preview of User-Item Matrix

(First six rows of the sparse 78 x 56 user-item matrix, with one placeID column per restaurant, ranging 132572 through 135106. Users U1001, U1002, U1003, U1005, U1006, and U1007 each rated only a handful of restaurants on the 1-3 scale; all other entries are NA.)

Training and Test Subsets

Finally, our data was split into training and test sets to evaluate our recommender algorithms. We used the recommenderlab evaluationScheme with the split method and k = 10 repetitions; 80% of the data was retained for training and the remainder held out for testing, with 3 items given per test user.

# evaluation scheme: 80% of data for train and 20% for test, 3 given items per test user
set.seed(1000)
evalu <- evaluationScheme(umat, method = "split", train = 0.8, given = 3, goodRating = 1, 
    k = 10)

Algorithm 1: UBCF/IBCF

For the first comparison, we used the recommenderlab package to compare two collaborative filtering approaches: user-based (UBCF) and item-based (IBCF). For data normalization we chose centering, since earlier tests against other normalization techniques showed the centering approach performed best. Cosine similarity was used as the similarity method for both systems.
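
A sketch of how the two models used in the prediction step below (ub_c and ib_c) can be trained with those parameters:

# train UBCF and IBCF models on the training split with centering + cosine similarity
ub_c <- Recommender(getData(evalu, "train"), method = "UBCF", 
    parameter = list(normalize = "center", method = "cosine"))
ib_c <- Recommender(getData(evalu, "train"), method = "IBCF", 
    parameter = list(normalize = "center", method = "cosine"))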

Predictions

After training our models, we generated predictions using our test data. To keep predictions inside the 1-3 rating scale, we set ceilings/floors for any values that fell outside it. Finally, we built a table to compare the error terms for both models.

# predicted ratings on the test data set for the UBCF and IBCF recommender
ub_known <- getData(evalu, "known")
ib_known <- getData(evalu, "known")

p_ub <- predict(ub_c, ub_known, type = "ratings")
p_ib <- predict(ib_c, ib_known, type = "ratings")

# setting ceiling/floor values to ensure all are inside rating scale
p_ub@data@x[p_ub@data@x[] < 1] <- 1
p_ub@data@x[p_ub@data@x[] > 3] <- 3

p_ib@data@x[p_ib@data@x[] < 1] <- 1
p_ib@data@x[p_ib@data@x[] > 3] <- 3

# compare the predictions for user-based and item-based
ub_unknown <- getData(evalu, "unknown")
ib_unknown <- getData(evalu, "unknown")

error <- rbind(ub_c = calcPredictionAccuracy(p_ub, ub_unknown), ib_c = calcPredictionAccuracy(p_ib, 
    ib_unknown))
error %>% kable(caption = "Prediction Comparisons") %>% kable_styling()
Prediction Comparisons

|      | RMSE      | MSE       | MAE       |
|------|-----------|-----------|-----------|
| ub_c | 0.6631373 | 0.4397510 | 0.4494453 |
| ib_c | 0.8537624 | 0.7289103 | 0.6499042 |

We found that the more memory-intensive approach, user-based collaborative filtering, fared better in terms of error metrics, with a lower RMSE of .66 compared to .85 for item-based. This could be because the users are very similar (all Mexican college students), so it makes sense that user-based collaborative filtering provides more accurate results.

Algorithm 2: SVD

Our next method uses Singular Value Decomposition (SVD). We compared a center-normalized SVD approach against a Z-score-normalized SVD and Funk SVD.

Process

We used the SVD algorithm to create a model that makes user-item rating predictions. The recommender relies on an SVD approximation with column-mean imputation.
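
An illustrative sketch of that idea, mimicking (not reproducing) the package internals:

# impute missing entries with column means, then take a rank-10 approximation
m <- as(umat, "matrix")
col_means <- colMeans(m, na.rm = TRUE)
for (j in seq_len(ncol(m))) m[is.na(m[, j]), j] <- col_means[j]
s <- svd(m)
k <- 10
r_approx <- s$u[, 1:k] %*% diag(s$d[1:k]) %*% t(s$v[, 1:k])  # rank-k reconstruction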

The normalize parameter asks the system to remove per-user rating bias before decomposition. The two available methods are (see the sketch after this list):

  1. Centering reduces individual rating bias by row-centering the data, i.e., by subtracting from each available rating the mean of that user's (row's) ratings.
  2. Z-score additionally divides by the standard deviation of the row/column. Normalization can also be done on columns.
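
A minimal sketch contrasting the two options via recommenderlab's normalize():

norm_center <- normalize(umat, method = "center")  # subtract each user's mean rating
norm_z <- normalize(umat, method = "Z-score")  # additionally divide by the user's standard deviation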

We also looked at SVD using the Funk approach. Funk SVD decomposes a matrix (with missing values) into two components, U and V, with the singular values folded into these matrices. The approximation of the original matrix is obtained as R = UV'.
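
A small sketch of that reconstruction, using the fsvd object fitted in the chunk below (funkSVD returns the U and V components):

# approximate the full rating matrix from the Funk SVD components
r_hat <- fsvd$U %*% t(fsvd$V)  # R = UV'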

# Prep data
ratings_train <- getData(evalu, "train")  # training dataset
ratings_test_known <- getData(evalu, "known")  # test data from evaluationScheme of type known
ratings_test_unknown <- getData(evalu, "unknown")  # unknown dataset used for RMSE / model evaluation

# train the three SVD variants
svdn <- Recommender(data = ratings_train, method = "SVD", param = list(k = 10, 
    normalize = "center"))
svdz <- Recommender(data = ratings_train, method = "SVD", param = list(k = 10, 
    normalize = "Z-score"))
fsvd <- funkSVD(ratings_train, verbose = F)  # defaults: k = 10, gamma = 0.015, lambda = 0.001

## test/evaluation model predictions
svd_predict <- predict(svdn, ratings_test_known, type = "ratings")
svd_pred <- calcPredictionAccuracy(svd_predict, ratings_test_unknown)
svdz_predict <- predict(svdz, ratings_test_known, type = "ratings")
svdz_pred <- calcPredictionAccuracy(svdz_predict, ratings_test_unknown)
fsvd_predict <- predict(fsvd, ratings_test_known, verbose = F)
fsvd_predict <- as(fsvd_predict, "realRatingMatrix")
fsvd_pred <- calcPredictionAccuracy(fsvd_predict, ratings_test_unknown)

# bind prediction results
svd_results <- rbind(svdz_pred, svd_pred, fsvd_pred)

Predictions

Our prediction evaluations can be seen below:

svd_results %>% kable(caption = "SVD Prediction Results with 10 Latent Factors") %>% 
    kable_styling()
SVD Prediction Results with 10 Latent Factors

|           | RMSE      | MSE       | MAE       |
|-----------|-----------|-----------|-----------|
| svdz_pred | 0.6218776 | 0.3867317 | 0.4059436 |
| svd_pred  | 0.6273705 | 0.3935938 | 0.4103931 |
| fsvd_pred | 0.9347304 | 0.8737210 | 0.7266974 |

Diversity

Diversity measures how dissimilar the recommended items in a list are for a user. Similarity is often determined from item content (e.g., restaurant cuisine) but can also be determined from how similarly items are rated. One measure of diversity is Intra-List Similarity (ILS), which sums a pairwise similarity, such as cosine similarity or the Jaccard coefficient, over every pair of items (i_j, i_k) in the recommendation list.
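
One common formulation of ILS for a recommendation list $L$ (our notation; definitions vary slightly across sources) is

$$\mathrm{ILS}(L) = \frac{\sum_{i_j \in L} \sum_{i_k \in L,\; k \neq j} \mathrm{sim}(i_j, i_k)}{2}$$

where a higher ILS indicates less diverse recommendations.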

In our diversity test, we compare the predicted restaurant ratings for each user against that user's actual ratings. We use only items that were rated in the original dataset and calculate the cosine similarity between the predicted and actual rating vectors.

# function to calculate and print the per-user cosine similarity "diversity"

getDiversity <- function(ActualData, Predicted, tag) {
    print(paste("Cosine Similarity Diversity for:", tag, ">>>"))
    print("==========================================================================")
    Predicted <- as(Predicted, "matrix")
    ActualData <- as(ActualData, "matrix")
    cal_cosine <- numeric(nrow(ActualData))
    
    for (i in 1:nrow(ActualData)) {
        a <- as.vector(ActualData[i, ])
        b <- as.vector(Predicted[i, ])
        
        # replace all NAs with 0 so the cosine is taken over full-length vectors
        a[which(is.na(a))] <- 0
        b[which(is.na(b))] <- 0
        
        cal_cosine[i] <- cosine(a, b)  # equivalently: a %*% b / sqrt((a %*% a) * (b %*% b))
        print(paste(i, ": ", cal_cosine[i]))
    }
    print("==========================================================================")
    print(paste("Mean Diversity:=>", round(mean(cal_cosine, na.rm = T), 4)))
    actualDiversity_Error <- 1 - mean(cal_cosine, na.rm = T)
    print(paste("Diversity Error:=> ", round(actualDiversity_Error, 4)))
}

# return the mean diversity only
MeanDiversity <- function(ActualData, Predicted) {
    Predicted <- as(Predicted, "matrix")
    ActualData <- as(ActualData, "matrix")
    cal_cosine <- numeric(nrow(ActualData))
    for (i in 1:nrow(ActualData)) {
        a <- as.vector(ActualData[i, ])
        b <- as.vector(Predicted[i, ])
        a[which(is.na(a))] <- 0
        b[which(is.na(b))] <- 0
        cal_cosine[i] <- cosine(a, b)
    }
    return(mean(cal_cosine, na.rm = T))
}

SVD: Normal

getDiversity(ratings_test_unknown, svd_predict, "SVD k-10")
FALSE [1] "Cosine Similarity Diversity for: SVD k-10 >>>"
FALSE [1] "=========================================================================="
FALSE [1] "1 :  0.268894336626814"
FALSE [1] "2 :  0.302193240687116"
FALSE [1] "3 :  0.252318294483627"
FALSE [1] "4 :  0.137360563948689"
FALSE [1] "5 :  0.333589941018245"
FALSE [1] "6 :  0.284877204338698"
FALSE [1] "7 :  0.174913248712286"
FALSE [1] "8 :  0.134843196435992"
FALSE [1] "9 :  0.195069192724345"
FALSE [1] "10 :  0.196981363437319"
FALSE [1] "11 :  0.357137466266591"
FALSE [1] "12 :  0.137360563948689"
FALSE [1] "13 :  0.384940267625149"
FALSE [1] "14 :  0.284110454217177"
FALSE [1] "15 :  0.24421013802059"
FALSE [1] "16 :  0.412081691846067"
FALSE [1] "=========================================================================="
FALSE [1] "Mean Diversity:=> 0.2563"
FALSE [1] "Diversity Error:=>  0.7437"

SVD: Z-Score

getDiversity(ratings_test_unknown, svdz_predict, "SVD with Z-score")
FALSE [1] "Cosine Similarity Diversity for: SVD with Z-score >>>"
FALSE [1] "=========================================================================="
FALSE [1] "1 :  0.268074351843233"
FALSE [1] "2 :  0.302193240687116"
FALSE [1] "3 :  0.259554906506915"
FALSE [1] "4 :  0.137360563948689"
FALSE [1] "5 :  0.333589941018245"
FALSE [1] "6 :  0.285603478341271"
FALSE [1] "7 :  0.175782885366664"
FALSE [1] "8 :  0.135713147276616"
FALSE [1] "9 :  0.195364088257438"
FALSE [1] "10 :  0.196068582269344"
FALSE [1] "11 :  0.357137466266591"
FALSE [1] "12 :  0.137360563948689"
FALSE [1] "13 :  0.385109470478274"
FALSE [1] "14 :  0.28585612265559"
FALSE [1] "15 :  0.244124723971788"
FALSE [1] "16 :  0.412081691846067"
FALSE [1] "=========================================================================="
FALSE [1] "Mean Diversity:=> 0.2569"
FALSE [1] "Diversity Error:=>  0.7431"

Funk SVD

getDiversity(ratings_test_unknown, fsvd_predict, "Funk SVD")
FALSE [1] "Cosine Similarity Diversity for: Funk SVD >>>"
FALSE [1] "=========================================================================="
FALSE [1] "1 :  0.346068932922078"
FALSE [1] "2 :  0.25121046889339"
FALSE [1] "3 :  0.203958821984534"
FALSE [1] "4 :  0.148946718206544"
FALSE [1] "5 :  0.440503372584769"
FALSE [1] "6 :  0.376058123593362"
FALSE [1] "7 :  0.0854097534211631"
FALSE [1] "8 :  0.0888500092836332"
FALSE [1] "9 :  0.178259731481384"
FALSE [1] "10 :  0.265958190585396"
FALSE [1] "11 :  0.375059818666665"
FALSE [1] "12 :  0.121333086082837"
FALSE [1] "13 :  0.293532949920923"
FALSE [1] "14 :  0.296253518522501"
FALSE [1] "15 :  0.311850040279204"
FALSE [1] "16 :  0.482617796549666"
FALSE [1] "=========================================================================="
FALSE [1] "Mean Diversity:=> 0.2666"
FALSE [1] "Diversity Error:=>  0.7334"

User-Based

getDiversity(ratings_test_unknown, p_ub, "User-Based")
FALSE [1] "Cosine Similarity Diversity for: User-Based >>>"
FALSE [1] "=========================================================================="
FALSE [1] "1 :  0.270958745736286"
FALSE [1] "2 :  0.332820117735137"
FALSE [1] "3 :  0.625525337955507"
FALSE [1] "4 :  0.218217890235992"
FALSE [1] "5 :  0.330469991735662"
FALSE [1] "6 :  0.299100647029526"
FALSE [1] "7 :  0.103671665983777"
FALSE [1] "8 :  0.344079328288924"
FALSE [1] "9 :  0.170327636941529"
FALSE [1] "10 :  0.20004655525702"
FALSE [1] "11 :  0.3672230493342"
FALSE [1] "12 :  0"
FALSE [1] "13 :  0.345022722013868"
FALSE [1] "14 :  0.314336698141438"
FALSE [1] "15 :  0.251115219359251"
FALSE [1] "16 :  0.341159125180006"
FALSE [1] "=========================================================================="
FALSE [1] "Mean Diversity:=> 0.2821"
FALSE [1] "Diversity Error:=>  0.7179"

Item-Based

getDiversity(ratings_test_unknown, p_ib, "Item-Based")
FALSE [1] "Cosine Similarity Diversity for: Item-Based >>>"
FALSE [1] "=========================================================================="
FALSE [1] "1 :  0.242468848999763"
FALSE [1] "2 :  NaN"
FALSE [1] "3 :  0.658192410922158"
FALSE [1] "4 :  NaN"
FALSE [1] "5 :  NaN"
FALSE [1] "6 :  0.382501704453842"
FALSE [1] "7 :  0.0571117611021184"
FALSE [1] "8 :  0.301939630437212"
FALSE [1] "9 :  0.243566957874093"
FALSE [1] "10 :  0.216276603436567"
FALSE [1] "11 :  NaN"
FALSE [1] "12 :  NaN"
FALSE [1] "13 :  0.468253356466701"
FALSE [1] "14 :  0.352309631780176"
FALSE [1] "15 :  0.197885259814317"
FALSE [1] "16 :  NaN"
FALSE [1] "=========================================================================="
FALSE [1] "Mean Diversity:=> 0.3121"
FALSE [1] "Diversity Error:=>  0.6879"

Conclusion

After running several algorithm variations, our UBCF, Z-score SVD, and center-normalized SVD methods produced our strongest accuracy scores, with the Z-score SVD performing best. That algorithm would produce the most relevant restaurant recommendations for Mexican college students.

Once we calculated the diversity scores, we found that all algorithms, with the exception of item-based collaborative filtering, performed similarly. The SVD algorithms performed best, with the normalized SVD producing the lowest mean similarity and therefore the most diverse lists. We can rely on these methods to provide the most diverse restaurant recommendations to the college students in our model.

Accuracy Metrics

rbind(error, svd_results) %>% round(4) %>% kable(caption = "Compare All Accuracy Measures") %>% 
    kable_styling()
Compare All Accuracy Measures

|           | RMSE   | MSE    | MAE    |
|-----------|--------|--------|--------|
| ub_c      | 0.6631 | 0.4398 | 0.4494 |
| ib_c      | 0.8538 | 0.7289 | 0.6499 |
| svdz_pred | 0.6219 | 0.3867 | 0.4059 |
| svd_pred  | 0.6274 | 0.3936 | 0.4104 |
| fsvd_pred | 0.9347 | 0.8737 | 0.7267 |

Diversity Metrics

svdn_d <- MeanDiversity(ratings_test_unknown, svd_predict)
svdz_d <- MeanDiversity(ratings_test_unknown, svdz_predict)
fsvd_d <- MeanDiversity(ratings_test_unknown, fsvd_predict)
ubcf_d <- MeanDiversity(ratings_test_unknown, p_ub)
ibcf_d <- MeanDiversity(ratings_test_unknown, p_ib)

mean_div <- data.frame(rbind(svdn_d, svdz_d, fsvd_d, ubcf_d, ibcf_d))
colnames(mean_div) <- "Mean"
mean_div %>% kable(caption = "Compare Mean Diversity Measures") %>% kable_styling()
Compare Mean Diversity Measures

|        | Mean      |
|--------|-----------|
| svdn_d | 0.2563051 |
| svdz_d | 0.2569360 |
| fsvd_d | 0.2666170 |
| ubcf_d | 0.2821297 |
| ibcf_d | 0.3120506 |

We should note that this is a very niche market, specific to a subset of the Mexican population. In addition, the rating scale is a minimal 1-3, so the recommendation results could change drastically if the scale were expanded to 1-5.

In terms of online data, if we were to recommend restaurants based on real-time ratings, we would use an approach that lets users find restaurants based on both ratings and distance. In our personal experience, distance is a major factor for food, in addition to ratings.

Our data had very rich categorical features for us to play with, which included transportation. In the future, we would like to incorporate these types of categorical features into a hybrid system and compare our findings using gradient boosting or decision tree algorithms.

References

  1. Lab 41: Recommender Systems - It’s Not All About the Accuracy
  2. Cosine Similarity
  3. evaluationScheme from recommenderlab
  4. Building Recommenders: Overview of Recommender Algorithms
  5. recommenderlab: A Framework for Developing and Testing Recommendation Algorithms
  6. UC Business Analytics R Programming Guide: Gradient Boosting Machines