#dependencies
## data processing packages
library(tidyr); library(dplyr); library(RCurl); library(jsonlite); library(plyr)  # note: loading plyr after dplyr masks mutate/summarise; revalue below is plyr's
##formatting packages
library(knitr); library(kableExtra); library(default)
##visualization packages
library(ggplot2)
##recommender packages
library(recommenderlab); library(Metrics); library(lsa); library(diveRsity)
# global options
## knit sizing
options(max.print = 100); opts_knit$set(width = 75)
## augment chunk output
opts_chunk$set(echo=T, cache=F, tidy=T, comment=NA, message=T, warning=T) # comment=NA suppresses a stray "FALSE" prefix on printed output; change message/warning to F upon completion
## set table style for consistency
default(kable) <- list(format="html")
default(kable_styling) <- list(bootstrap_options = "hover",full_width=T, font_size=10)
default(scroll_box) <- list(width = "100%")
## working directory
##try(setwd("~/Github/612-group/project-4"))
In this assignment, we practiced working with accuracy and other recommender system metrics. We compared the performance of several algorithms and analyzed the diversity of our recommendations.
Our data was sourced from Kaggle's Restaurant Data with Consumer Ratings collection, which contains several datasets pertaining to restaurants and their patrons. The CSV files are stored in the data folder of our repository.
# cuisine tags
data1 <- read.csv("https://raw.githubusercontent.com/jemceach/612-group/master/project-4/data/chefmozcuisine.csv")
# restaurant name
data2 <- read.csv("https://raw.githubusercontent.com/jemceach/612-group/master/project-4/data/geoplaces2.csv")
# user ratings
data3 <- read.csv("https://raw.githubusercontent.com/jemceach/612-group/master/project-4/data/rating_final.csv")
# user profile
data4 <- read.csv("https://raw.githubusercontent.com/jemceach/612-group/master/project-4/data/userprofile.csv")
We cleaned our data using transformations and regular expressions to unite our user and restaurant data.
# restaurant dataframe: concatenate restaurant cuisine tags
cuisine <- aggregate(Rcuisine ~ ., data1, toString)
## merge & transform
restaurant <- data2 %>% select(placeID, name, city, price, alcohol, smoking_area) %>%
mutate(name = tolower(gsub("[?'_]", " ", name))) %>% mutate(city = tolower(city))  # the '?' stands in for a mis-encoded character in the raw names
## set/revalue factors
restaurant$city <- revalue(restaurant$city, c(`cd victoria` = "ciudad victoria",
`cd. victoria` = "ciudad victoria", `victoria ` = "ciudad victoria", victoria = "ciudad victoria",
`san luis potosi ` = "san luis potosi", `san luis potos` = "san luis potosi",
s.l.p = "san luis potosi", slp = "san luis potosi", s.l.p. = "san luis potosi"))
restaurant$smoking_area <- revalue(restaurant$smoking_area, c(none = "no", `not permitted` = "no",
section = "yes", permitted = "yes", `only at bar` = "yes"))
restaurant$price <- factor(restaurant$price, levels = c("low", "medium", "high"))
restaurant$alcohol <- revalue(restaurant$alcohol, c(No_Alcohol_Served = "no",
Full_Bar = "yes", `Wine-Beer` = "yes"))
restaurant <- inner_join(restaurant, cuisine, by = "placeID")
# user dataframe: select attributes of interest from profile
user_profile <- data4 %>% select(userID, budget, activity, smoker)
user <- inner_join(data3, user_profile, by = "userID") %>% select(-service_rating,
-food_rating)
## set/revalue factors
user$budget <- factor(user$budget, levels = c("low", "medium", "high"))
user$smoker <- revalue(user$smoker, c(false = "no", true = "yes"))
user$smoker <- factor(user$smoker, levels = c("no", "yes"))
## change ratings from 0-2 scale to 1-3
user$rating[user$rating == 2] <- 3
user$rating[user$rating == 1] <- 2
user$rating[user$rating == 0] <- 1
# combine user / restaurant data & subset
data <- inner_join(user, restaurant, by = "placeID")
data <- data %>% filter(city == "san luis potosi", activity == "student") %>%
select(-city, -activity)
The output can be previewed below:
# view output
data %>% head() %>% kable(caption = "User-Item Dataframe") %>% kable_styling()
| userID | placeID | rating | budget | smoker | name | price | alcohol | smoking_area | Rcuisine |
|---|---|---|---|---|---|---|---|---|---|
| U1077 | 135085 | 3 | medium | no | tortas locas hipocampo | medium | no | no | Fast_Food |
| U1077 | 132825 | 3 | medium | no | puesto de tacos | low | no | no | Mexican |
| U1077 | 135060 | 2 | medium | no | restaurante marisco sam | medium | no | no | Seafood |
| U1015 | 135071 | 1 | medium | yes | restaurante la cantina | medium | yes | yes | Bar, Bar_Pub_Brewery |
| U1108 | 135075 | 3 | medium | no | mariscos el pescador | medium | no | no | Seafood |
| U1108 | 132572 | 2 | medium | no | cafe chaires | low | no | no | Cafeteria |
We found that 80% of our raters were students and 76% of our restaurants were located within the Mexican city of San Luis Potosi. As a result, we subsetted our restaurant/patron data to limit the scope of our system to this specific population. After subsetting our raw data, we identified 78 unique users and 56 restaurants from which to build our recommender systems.
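As a quick sanity check (a sketch, not part of our original pipeline), these counts can be confirmed with dplyr; we namespace summarise explicitly since plyr masks it:
# verify unique user and restaurant counts in the subset
data %>% dplyr::summarise(users = n_distinct(userID), restaurants = n_distinct(placeID))
# expected: 78 users, 56 restaurants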
The following plots visualize the distribution of overall ratings given by users, broken out by user budget and by the restaurant's price category. We also viewed the rating counts each restaurant received; on average, each venue received 13 user ratings.
We found that most of our restaurants received the top rating of 3. Most ratings came from users with low-to-medium budgets, and the majority of our restaurants fell in the middle price bracket. The number of ratings per restaurant varied widely, from a minimum of 3 to a maximum of 32. There were 32 unique cuisine tags assigned to our restaurants; Mexican cuisine was the most popular, and those restaurants received 108 total ratings.
data %>% group_by(rating) %>% ggplot(aes(x = rating)) + geom_histogram(bins = 3,
color = "#63707e", fill = "#c8dad3") + labs(title = "Raw Ratings Distribution")
data %>% filter(!is.na(budget)) %>% group_by(rating) %>% ggplot(aes(x = rating)) +
geom_histogram(bins = 3, color = "#63707e", fill = "#c8dad3") + labs(title = "Ratings Distribution by Budget") +
facet_wrap(~budget, nrow = 1)
ggplot(data, aes(x = rating)) + geom_histogram(bins = 3, color = "#63707e",
fill = "#c8dad3") + labs(title = "Ratings Distribution by Restaurant Pricing") +
facet_wrap(~price, nrow = 1)
data %>% mutate(placeID = as.factor(placeID), rating = as.factor(rating)) %>%
group_by(placeID) %>% add_tally() %>% ungroup() %>% ggplot(aes(x = reorder(placeID,
-n), fill = rating)) + geom_bar(stat = "count", color = "#000000") + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(), axis.ticks.x = element_blank()) + labs(title = "Restaurant Rating Counts") +
scale_fill_manual(values = c("#c8dad3", "#93b5b3", "#63707e"))
data %>% select(Rcuisine, rating) %>% mutate(rating = as.factor(rating)) %>%
group_by(Rcuisine) %>% add_tally() %>% ungroup %>% ggplot(aes(x = reorder(Rcuisine,
-n), fill = rating)) + geom_bar(stat = "count", color = "#000000") + labs(title = "Restaurant Cuisine Tag Counts with Ratings") +
theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 30,
hjust = 1, size = 8)) + scale_fill_manual(values = c("#c8dad3", "#93b5b3",
"#63707e"))
We converted our raw ratings data into a user-item matrix to test and train our subsequent recommender system algorithms.
# create user item matrix
ui_matrix <- data %>% select(userID, placeID, rating) %>% spread(placeID, rating)
rownames(ui_matrix) <- ui_matrix$userID # set row names to userid
ui_matrix <- ui_matrix %>% select(-userID) %>% as.matrix() # remove userid from columns
umat <- as(ui_matrix, "realRatingMatrix") # save real ratings for algo
# preview matrices
as.data.frame.array(ui_matrix) %>% head() %>% kable(caption = "Preview of User-Item Matrix") %>%
kable_styling() %>% scroll_box()
| | 132572 | 132754 | 132755 | 132825 | 132834 | 132845 | 132846 | 132851 | 132854 | 132856 | 132858 | 132861 | 132862 | 132866 | 132869 | 132870 | 132872 | 132885 | 132937 | 132951 | 132954 | 132955 | 132958 | 135025 | 135026 | 135027 | 135028 | 135030 | 135032 | 135034 | 135035 | 135039 | 135041 | 135042 | 135044 | 135046 | 135048 | 135049 | 135052 | 135053 | 135054 | 135055 | 135057 | 135058 | 135059 | 135060 | 135069 | 135071 | 135072 | 135073 | 135074 | 135075 | 135079 | 135085 | 135086 | 135106 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| U1001 | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA |
| U1002 | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | 2 |
| U1003 | NA | 3 | 3 | 3 | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA | NA | 3 | 3 | NA | NA | NA |
| U1005 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3 | NA | NA | NA | 2 | 3 | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| U1006 | 2 | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | 3 | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | 2 | NA | NA | NA |
| U1007 | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | 1 | NA |
Finally, our data was split into training and test sets to evaluate our recommender algorithms. Using recommenderlab's evaluationScheme with the "split" method repeated over k = 10 runs, 80% of the data was retained for training and 20% for testing, with 3 items given per test user.
# evaluation method with 80% of data for train and 20% for test
set.seed(1000)
evalu <- evaluationScheme(umat, method = "split", train = 0.8, given = 3, goodRating = 1,
k = 10)
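Not shown in the original write-up, a quick way to sanity-check the scheme is to inspect the three partitions recommenderlab derives from it:
# sketch: inspect the evaluation scheme's partitions
evalu  # print the scheme summary
dim(getData(evalu, "train"))  # training users x restaurants
dim(getData(evalu, "known"))  # test users with their 3 'given' ratings
dim(getData(evalu, "unknown"))  # test users' held-out ratings used for scoring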
For the first comparison, we used the recommenderlab package to compare two collaborative filtering approaches: user-based (UBCF) and item-based (IBCF). For normalization we chose centering, since earlier tests against other normalization techniques showed the centering approach performed best; a sketch of such a comparison appears below. Cosine similarity was used as the similarity measure for both systems.
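We did not reproduce that normalization comparison in this report; a minimal sketch of how it could be run with recommenderlab's evaluate(), reusing the same evaluation scheme, might look like this:
# sketch: compare normalization choices for UBCF on rating-prediction error
algos <- list(
    `UBCF center` = list(name = "UBCF", param = list(normalize = "center", method = "Cosine")),
    `UBCF z-score` = list(name = "UBCF", param = list(normalize = "Z-score", method = "Cosine"))
)
norm_results <- evaluate(evalu, algos, type = "ratings")
avg(norm_results)  # average RMSE/MSE/MAE across the k = 10 runs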
Here we train our recommenders on the training data set for both the UBCF and IBCF approaches.
# User-based collaborative filtering recommender
ub_train <- getData(evalu, "train")
ub_c <- Recommender(ub_train, "UBCF", param = list(normalize = "center", method = "Cosine"))
# Item-based collaborative filtering recommender
ib_train <- getData(evalu, "train")
ib_c <- Recommender(ib_train, "IBCF", param = list(normalize = "center", method = "Cosine"))
After training our models, we generated predictions on the test data. To ensure the predictions remained inside the 1-3 rating scale, we set ceilings/floors for any values that fell outside it. Finally, we built a table comparing the error terms of both models.
# predicted ratings on the test data set for the UBCF and IBCF recommender
ub_known <- getData(evalu, "known")
ib_known <- getData(evalu, "known")
p_ub <- predict(ub_c, ub_known, type = "ratings")
p_ib <- predict(ib_c, ib_known, type = "ratings")
# setting ceiling/floor values to ensure all are inside rating scale
p_ub@data@x[p_ub@data@x < 1] <- 1
p_ub@data@x[p_ub@data@x > 3] <- 3
p_ib@data@x[p_ib@data@x < 1] <- 1
p_ib@data@x[p_ib@data@x > 3] <- 3
# compare the predictions for user-based and item-based
ub_unknown <- getData(evalu, "unknown")
ib_unknown <- getData(evalu, "unknown")
error <- rbind(ub_c = calcPredictionAccuracy(p_ub, ub_unknown), ib_c = calcPredictionAccuracy(p_ib,
ib_unknown))
error %>% kable(caption = "Prediction Comparisons") %>% kable_styling()
| | RMSE | MSE | MAE |
|---|---|---|---|
| ub_c | 0.6631373 | 0.4397510 | 0.4494453 |
| ib_c | 0.8537624 | 0.7289103 | 0.6499042 |
We found that the more memory-intensive approach, user-based collaborative filtering, fared better in terms of error metrics: UBCF had a lower RMSE at 0.66 compared to 0.85 for IBCF. This could be because our users are very similar, as they are all Mexican college students, so it makes sense that user-based collaborative filtering provides more accurate results.
Our next method uses Singular Value Decomposition (SVD). We compared a center-normalized SVD, a Z-score-normalized SVD, and Funk SVD.
We used the SVD algorithm to create a model that makes user-item rating predictions. The recommenderlab SVD recommender relies on an SVD approximation with column-mean imputation of missing values.
The normalize parameter asks the system to remove each user's rating bias before decomposition; the two available methods are center (subtract the user's average rating) and Z-score (additionally divide by the user's standard deviation).
We also looked at SVD using the Funk approach. Funk SVD decomposes a matrix (with missing values) into two components, U and V, with the singular values folded into these matrices. The approximation of the original matrix is obtained as `R = UV'`.
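As an illustrative sketch (not part of the original analysis), the low-rank approximation can be rebuilt from the U and V components that funkSVD() returns:
# sketch: reconstruct the approximate rating matrix from a funkSVD fit
train_mat <- as(getData(evalu, "train"), "matrix")
fsvd_fit <- funkSVD(train_mat, k = 10, verbose = FALSE)
r_hat <- fsvd_fit$U %*% t(fsvd_fit$V)  # R = UV' approximation
dim(r_hat)  # users x items, now fully populated with predicted ratings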
# Prep data
ratings_train <- getData(evalu, "train") # Training Dataset
ratings_test_known <- getData(evalu, "known") # Test data from evaluationScheme of type KNOWN
ratings_test_unknown <- getData(evalu, "unknown") # Unknown dataset used for RMSE / model evaluation
# train SVD models (center- and Z-score-normalized)
svdn <- Recommender(data = ratings_train, method = "SVD", param = list(k = 10,
normalize = "center"))
svdz <- Recommender(data = ratings_train, method = "SVD", param = list(k = 10,
normalize = "Z-score"))
fsvd <- funkSVD(ratings_train, verbose = F) # k = 10, gamma = 0.015, lambda = 0.001,
## Test/Evaluation Model Prediction
svd_predict <- predict(svdn, ratings_test_known, type = "ratings")
svd_pred <- calcPredictionAccuracy(svd_predict, ratings_test_unknown)
svdz_predict <- predict(svdz, ratings_test_known, type = "ratings")
svdz_pred <- calcPredictionAccuracy(svdz_predict, ratings_test_unknown)
fsvd_predict <- predict(fsvd, ratings_test_known, verbose = F)
fsvd_predict <- as(fsvd_predict, "realRatingMatrix")
fsvd_pred <- calcPredictionAccuracy(fsvd_predict, ratings_test_unknown)
# bind prediction results
svd_results <- rbind(svdz_pred, svd_pred, fsvd_pred)
Our prediction evaluations can be seen below:
svd_results %>% kable(caption = "SVD Prediction Results with 10 Latent Factors") %>%
kable_styling()
| | RMSE | MSE | MAE |
|---|---|---|---|
| svdz_pred | 0.6218776 | 0.3867317 | 0.4059436 |
| svd_pred | 0.6273705 | 0.3935938 | 0.4103931 |
| fsvd_pred | 0.9347304 | 0.8737210 | 0.7266974 |
Diversity measures how dissimilar recommended items are for a user. This similarity is often determined using the item's content (e.g. restaurant cuisine) but can also be determined using how similarly items are rated. One measure of diversity is the Intra-List Similarity (ILS). The ILS equation calculates the similarity between any two items $(i_j, i_k)$ using cosine similarity, the Jaccard similarity coefficient, or another similarity metric.
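For reference (this is the standard formulation from the literature, not a formula quoted from our original write-up), the ILS of a recommendation list $L_u$ for user $u$ can be written as:

$$ILS(L_u) = \frac{1}{2} \sum_{i_j \in L_u} \sum_{\substack{i_k \in L_u \\ i_k \neq i_j}} \operatorname{sim}(i_j, i_k)$$

where $\operatorname{sim}(i_j, i_k)$ is the chosen pairwise similarity; lower ILS values indicate a more diverse list.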
In our diversity test, we use each user's predicted restaurant ratings and their actual ratings. We only use items that were rated in the original dataset, and we calculate the cosine similarity between the predicted and actual rating vectors.
# Function to calculate Diversity (per-user cosine similarity between actual and predicted ratings)
getDiversity <- function(ActualData, Predicted, tag) {
    print(paste("Cosine Similarity Diversity for:", tag, ">>>"))
    print("==========================================================================")
    cal_cosine <- NULL
    Predicted <- as(Predicted, "matrix")
    ActualData <- as(ActualData, "matrix")
    for (i in 1:(dim(ActualData)[1])) {
        a <- as.vector(ActualData[i, ])
        b <- as.vector(Predicted[i, ])
        # replace NAs with 0 in both vectors before computing similarity
        a[which(is.na(a))] <- 0
        b[which(is.na(b))] <- 0
        cal_cosine[i] <- cosine(a, b)  # equivalently: a %*% b / sqrt((a %*% a) * (b %*% b))
        print(paste(i, ": ", cal_cosine[i]))
    }
    print("==========================================================================")
    print(paste("Mean Diversity:=>", round(mean(cal_cosine, na.rm = T), 4)))
    actualDiversity_Error <- 1 - mean(cal_cosine, na.rm = T)
    print(paste("Diversity Error:=> ", round(actualDiversity_Error, 4)))
}
# Return Mean Diversity Only
MeanDiversity <- function(ActualData, Predicted) {
    cal_cosine <- NULL
    Predicted <- as(Predicted, "matrix")
    ActualData <- as(ActualData, "matrix")
    for (i in 1:(dim(ActualData)[1])) {
        a <- as.vector(ActualData[i, ])
        b <- as.vector(Predicted[i, ])
        a[which(is.na(a))] <- 0
        b[which(is.na(b))] <- 0
        cal_cosine[i] <- cosine(a, b)
    }
    return(mean(cal_cosine, na.rm = T))
}
getDiversity(ratings_test_unknown, svd_predict, "SVD k-10")
[1] "Cosine Similarity Diversity for: SVD k-10 >>>"
[1] "=========================================================================="
[1] "1 : 0.268894336626814"
[1] "2 : 0.302193240687116"
[1] "3 : 0.252318294483627"
[1] "4 : 0.137360563948689"
[1] "5 : 0.333589941018245"
[1] "6 : 0.284877204338698"
[1] "7 : 0.174913248712286"
[1] "8 : 0.134843196435992"
[1] "9 : 0.195069192724345"
[1] "10 : 0.196981363437319"
[1] "11 : 0.357137466266591"
[1] "12 : 0.137360563948689"
[1] "13 : 0.384940267625149"
[1] "14 : 0.284110454217177"
[1] "15 : 0.24421013802059"
[1] "16 : 0.412081691846067"
[1] "=========================================================================="
[1] "Mean Diversity:=> 0.2563"
[1] "Diversity Error:=> 0.7437"
getDiversity(ratings_test_unknown, svdz_predict, "SVD with Z-score")
[1] "Cosine Similarity Diversity for: SVD with Z-score >>>"
[1] "=========================================================================="
[1] "1 : 0.268074351843233"
[1] "2 : 0.302193240687116"
[1] "3 : 0.259554906506915"
[1] "4 : 0.137360563948689"
[1] "5 : 0.333589941018245"
[1] "6 : 0.285603478341271"
[1] "7 : 0.175782885366664"
[1] "8 : 0.135713147276616"
[1] "9 : 0.195364088257438"
[1] "10 : 0.196068582269344"
[1] "11 : 0.357137466266591"
[1] "12 : 0.137360563948689"
[1] "13 : 0.385109470478274"
[1] "14 : 0.28585612265559"
[1] "15 : 0.244124723971788"
[1] "16 : 0.412081691846067"
[1] "=========================================================================="
[1] "Mean Diversity:=> 0.2569"
[1] "Diversity Error:=> 0.7431"
getDiversity(ratings_test_unknown, fsvd_predict, "Funk SVD")
[1] "Cosine Similarity Diversity for: Funk SVD >>>"
[1] "=========================================================================="
[1] "1 : 0.346068932922078"
[1] "2 : 0.25121046889339"
[1] "3 : 0.203958821984534"
[1] "4 : 0.148946718206544"
[1] "5 : 0.440503372584769"
[1] "6 : 0.376058123593362"
[1] "7 : 0.0854097534211631"
[1] "8 : 0.0888500092836332"
[1] "9 : 0.178259731481384"
[1] "10 : 0.265958190585396"
[1] "11 : 0.375059818666665"
[1] "12 : 0.121333086082837"
[1] "13 : 0.293532949920923"
[1] "14 : 0.296253518522501"
[1] "15 : 0.311850040279204"
[1] "16 : 0.482617796549666"
[1] "=========================================================================="
[1] "Mean Diversity:=> 0.2666"
[1] "Diversity Error:=> 0.7334"
getDiversity(ratings_test_unknown, p_ub, "User-Based")
[1] "Cosine Similarity Diversity for: User-Based >>>"
[1] "=========================================================================="
[1] "1 : 0.270958745736286"
[1] "2 : 0.332820117735137"
[1] "3 : 0.625525337955507"
[1] "4 : 0.218217890235992"
[1] "5 : 0.330469991735662"
[1] "6 : 0.299100647029526"
[1] "7 : 0.103671665983777"
[1] "8 : 0.344079328288924"
[1] "9 : 0.170327636941529"
[1] "10 : 0.20004655525702"
[1] "11 : 0.3672230493342"
[1] "12 : 0"
[1] "13 : 0.345022722013868"
[1] "14 : 0.314336698141438"
[1] "15 : 0.251115219359251"
[1] "16 : 0.341159125180006"
[1] "=========================================================================="
[1] "Mean Diversity:=> 0.2821"
[1] "Diversity Error:=> 0.7179"
getDiversity(ratings_test_unknown, p_ib, "Item-Based")
[1] "Cosine Similarity Diversity for: Item-Based >>>"
[1] "=========================================================================="
[1] "1 : 0.242468848999763"
[1] "2 : NaN"
[1] "3 : 0.658192410922158"
[1] "4 : NaN"
[1] "5 : NaN"
[1] "6 : 0.382501704453842"
[1] "7 : 0.0571117611021184"
[1] "8 : 0.301939630437212"
[1] "9 : 0.243566957874093"
[1] "10 : 0.216276603436567"
[1] "11 : NaN"
[1] "12 : NaN"
[1] "13 : 0.468253356466701"
[1] "14 : 0.352309631780176"
[1] "15 : 0.197885259814317"
[1] "16 : NaN"
[1] "=========================================================================="
[1] "Mean Diversity:=> 0.3121"
[1] "Diversity Error:=> 0.6879"
After running several algorithm variations, our UBCF, Z-score SVD, and center-normalized SVD methods produced our strongest accuracy scores, with the Z-score SVD performing best. This algorithm would produce the most relevant restaurant recommendations for Mexican college students.
Once we calculated the diversity scores, we found that all algorithms, with the exception of item-based collaborative filtering, performed similarly. The SVD algorithms performed best, with the center-normalized SVD producing the lowest mean similarity score. We can rely on these methods to provide the most diverse restaurant recommendations to the college students in our model.
rbind(error, svd_results) %>% round(4) %>% kable(caption = "Compare All Accuracy Measures") %>%
kable_styling()
| | RMSE | MSE | MAE |
|---|---|---|---|
| ub_c | 0.6631 | 0.4398 | 0.4494 |
| ib_c | 0.8538 | 0.7289 | 0.6499 |
| svdz_pred | 0.6219 | 0.3867 | 0.4059 |
| svd_pred | 0.6274 | 0.3936 | 0.4104 |
| fsvd_pred | 0.9347 | 0.8737 | 0.7267 |
svdn_d <- MeanDiversity(ratings_test_unknown, svd_predict)
svdz_d <- MeanDiversity(ratings_test_unknown, svdz_predict)
fsvd_d <- MeanDiversity(ratings_test_unknown, fsvd_predict)
ubcf_d <- MeanDiversity(ratings_test_unknown, p_ub)
ibcf_d <- MeanDiversity(ratings_test_unknown, p_ib)
mean_div <- data.frame(rbind(svdn_d, svdz_d, fsvd_d, ubcf_d, ibcf_d))
colnames(mean_div) <- "Mean"
mean_div %>% kable(caption = "Compare Mean Diversity Measures") %>% kable_styling()
| | Mean |
|---|---|
| svdn_d | 0.2563051 |
| svdz_d | 0.2569360 |
| fsvd_d | 0.2666170 |
| ubcf_d | 0.2821297 |
| ibcf_d | 0.3120506 |
We should note that this is a very niche market, as it is specific to a subset of the Mexican population. In addition, the rating scale is narrow (1-3), so the recommendation results could change drastically if the scale were expanded to 1-5.
In terms of online data, if we were to recommend restaurants based on real-time ratings, we would let users find restaurants based on both ratings and distance, as sketched below; in our personal experience, distance is a major factor for food in addition to ratings.
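A rough, hypothetical sketch of that idea (not part of our pipeline) could filter candidates by haversine distance before ranking them by predicted rating; it assumes the latitude and longitude columns in geoplaces2 (data2), and user_lat/user_lon are placeholder coordinates:
# hypothetical: filter candidate restaurants by distance from the user
haversine_km <- function(lat1, lon1, lat2, lon2) {
    rad <- pi/180
    dlat <- (lat2 - lat1) * rad
    dlon <- (lon2 - lon1) * rad
    a <- sin(dlat/2)^2 + cos(lat1 * rad) * cos(lat2 * rad) * sin(dlon/2)^2
    6371 * 2 * asin(sqrt(a))  # great-circle distance in km (Earth radius ~6371 km)
}
user_lat <- 22.15; user_lon <- -100.98  # placeholder: central San Luis Potosi
nearby <- data2 %>%
    mutate(dist_km = haversine_km(user_lat, user_lon, latitude, longitude)) %>%
    filter(dist_km <= 5)  # keep venues within 5 km before ranking by predicted rating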
Our data had very rich categorical features to explore, including transportation. In the future, we would like to incorporate these categorical features into a hybrid system and compare our findings using gradient boosting or decision tree algorithms.