This Recommender System recommends Arabic Perfumes Fragrances and Colognes to Men from the Middle East region. Each Parfum or Cologne is given a score on the scale from 0 to 5. This is a toy data set with 10 types of cologne and 10 customers (users). I used the package RandomNames to generate random common names in the Middle East. Names of Perfums and Colognes are picked from Arabic fragrances sites that sell these products.
library(randomNames)
library(tidyverse)
library(kableExtra)
library(knitr)
# randomly create a toy square matrix with n users and items, ranking 1 to 5
# set simulation vars
set.seed(12345)
n <- 10
percentNAs <- .18
probs <- c(1-percentNAs, percentNAs)
#use randomNames package to get names of persons randomly (which choose male from middle eastern ethnicity)
pnames <- randomNames(10, gender=0, ethnicity=6, which.names="first")
users <- c(pnames)
#could not find a package that can do random type of objects, so I created mine
items <- c("Oud For Greatness","Rasasi La Yuqawam","Halfeti Perfume","Raghba","Armani Prive Oud","ESTEE LAUDER","Club De Nuit","Dehn El Oud Malaki","Baccarat Rouge","Shaghaf Oud")
# generate data
myData <- as.data.frame(replicate(n, floor(runif(n, 1,6))))
myData <- as.data.frame(
lapply(myData, function(df) df[sample(c(TRUE, NA), prob = probs, size = length(df), replace = TRUE) ]),
row.names = users,
col.names = items)
myData %>% kable(caption = "Arabic Perfumes and Colognes Ratings") %>% kable_styling("striped", full_width = TRUE)
Arabic Perfumes and Colognes Ratings
|
Oud.For.Greatness
|
Rasasi.La.Yuqawam
|
Halfeti.Perfume
|
Raghba
|
Armani.Prive.Oud
|
ESTEE.LAUDER
|
Club.De.Nuit
|
Dehn.El.Oud.Malaki
|
Baccarat.Rouge
|
Shaghaf.Oud
|
Fayyaad
|
3
|
4
|
4
|
NA
|
4
|
NA
|
5
|
5
|
2
|
4
|
Jameel
|
2
|
1
|
3
|
5
|
2
|
3
|
3
|
NA
|
NA
|
2
|
Kaarim
|
5
|
1
|
5
|
2
|
NA
|
2
|
1
|
1
|
5
|
4
|
Naseer
|
4
|
4
|
4
|
2
|
4
|
2
|
1
|
4
|
4
|
2
|
Arafaat
|
4
|
NA
|
2
|
4
|
5
|
1
|
NA
|
NA
|
3
|
NA
|
Jumail
|
2
|
NA
|
NA
|
3
|
2
|
4
|
2
|
3
|
5
|
5
|
Awad
|
NA
|
5
|
NA
|
4
|
NA
|
5
|
5
|
4
|
4
|
5
|
Abdul Qayyoom
|
3
|
5
|
1
|
1
|
1
|
NA
|
NA
|
4
|
5
|
3
|
Rida
|
2
|
4
|
NA
|
NA
|
NA
|
3
|
5
|
1
|
2
|
1
|
Zaaid
|
3
|
1
|
4
|
2
|
5
|
NA
|
1
|
2
|
2
|
2
|
Break your ratings into separate training and test datasets
training_portion <- sample(n, size = floor(n * 0.75), replace=F)
training.data <- myData[training_portion, ]
testing.data <- myData[-training_portion, ]
training.data %>% kable(caption = "Training Dataset") %>% kable_styling("striped", full_width = TRUE)
Training Dataset
|
Oud.For.Greatness
|
Rasasi.La.Yuqawam
|
Halfeti.Perfume
|
Raghba
|
Armani.Prive.Oud
|
ESTEE.LAUDER
|
Club.De.Nuit
|
Dehn.El.Oud.Malaki
|
Baccarat.Rouge
|
Shaghaf.Oud
|
Awad
|
NA
|
5
|
NA
|
4
|
NA
|
5
|
5
|
4
|
4
|
5
|
Arafaat
|
4
|
NA
|
2
|
4
|
5
|
1
|
NA
|
NA
|
3
|
NA
|
Jumail
|
2
|
NA
|
NA
|
3
|
2
|
4
|
2
|
3
|
5
|
5
|
Kaarim
|
5
|
1
|
5
|
2
|
NA
|
2
|
1
|
1
|
5
|
4
|
Zaaid
|
3
|
1
|
4
|
2
|
5
|
NA
|
1
|
2
|
2
|
2
|
Naseer
|
4
|
4
|
4
|
2
|
4
|
2
|
1
|
4
|
4
|
2
|
Jameel
|
2
|
1
|
3
|
5
|
2
|
3
|
3
|
NA
|
NA
|
2
|
testing.data %>% kable(caption = "Testing Dataset") %>% kable_styling("striped", full_width = TRUE)
Testing Dataset
|
Oud.For.Greatness
|
Rasasi.La.Yuqawam
|
Halfeti.Perfume
|
Raghba
|
Armani.Prive.Oud
|
ESTEE.LAUDER
|
Club.De.Nuit
|
Dehn.El.Oud.Malaki
|
Baccarat.Rouge
|
Shaghaf.Oud
|
Fayyaad
|
3
|
4
|
4
|
NA
|
4
|
NA
|
5
|
5
|
2
|
4
|
Abdul Qayyoom
|
3
|
5
|
1
|
1
|
1
|
NA
|
NA
|
4
|
5
|
3
|
Rida
|
2
|
4
|
NA
|
NA
|
NA
|
3
|
5
|
1
|
2
|
1
|
Calculate the raw average (mean) rating for every user-item combination
# raw average rating (training set)
raw_avg_train <- mean(as.matrix(training.data), na.rm=TRUE)
raw_avg_train
## [1] 3.105263
Calculate the RMSE for raw average for both your training data and your test data
RMSE <- function(observed, predicted) {
# get RMSE
sqrt(mean((observed - predicted)^2, na.rm=TRUE))
}
# the RMSE for raw average for training set
RMSE_training.data <- RMSE(as.matrix(training.data), raw_avg_train)
RMSE_training.data
## [1] 1.385187
# the RMSE for raw average for test set
RMSE_testing.data <- RMSE(as.matrix(testing.data), raw_avg_train)
RMSE_testing.data
## [1] 1.453979
calculate the bias for each user and each item
user_bias_training <- rowMeans(as.matrix(training.data), na.rm = T) - raw_avg_train
user_bias_testing <- rowMeans(as.matrix(testing.data), na.rm = T) - raw_avg_train
item_bias_training <- colMeans(as.matrix(training.data), na.rm = T) - raw_avg_train
user_bias_training %>% kable(caption = "User Bias Training") %>% kable_styling("striped", full_width = TRUE)
User Bias Training
|
x
|
Awad
|
1.4661654
|
Arafaat
|
0.0614035
|
Jumail
|
0.1447368
|
Kaarim
|
-0.2163743
|
Zaaid
|
-0.6608187
|
Naseer
|
-0.0052632
|
Jameel
|
-0.4802632
|
item_bias_training %>% kable(caption = "Item Bias Training") %>% kable_styling("striped", full_width = TRUE)
Item Bias Training
|
x
|
Oud.For.Greatness
|
0.2280702
|
Rasasi.La.Yuqawam
|
-0.7052632
|
Halfeti.Perfume
|
0.4947368
|
Raghba
|
0.0375940
|
Armani.Prive.Oud
|
0.4947368
|
ESTEE.LAUDER
|
-0.2719298
|
Club.De.Nuit
|
-0.9385965
|
Dehn.El.Oud.Malaki
|
-0.3052632
|
Baccarat.Rouge
|
0.7280702
|
Shaghaf.Oud
|
0.2280702
|
user_bias_testing %>% kable(caption = "Item Bias Testing") %>% kable_styling("striped", full_width = TRUE)
Item Bias Testing
|
x
|
Fayyaad
|
0.7697368
|
Abdul Qayyoom
|
-0.2302632
|
Rida
|
-0.5338346
|
Calculate the baseline predictors for every user-item combination.
baseline_predictor <- function(rawAvg, userBias, itemBias) {
# calculates predictions based on rawaverage and user and item bias calcs
userlist <- names(userBias)
itemlist <- names(itemBias)
df <- data.frame()
for (i in userBias) {
UserPred <- rawAvg + i + itemBias
df <- rbind(df, UserPred)
}
# Like the videos lectures suggests, we should not exceed our max and min values
df[df > 5] <- 5
df[df < 1] <- 1
row.names(df) <- userlist
names(df) <- itemlist
df
}
training_Predictions <- baseline_predictor(raw_avg_train, user_bias_training, item_bias_training)
heatmap(as.matrix(training_Predictions), Rowv=NA, Colv=NA, scale = 'none', col = heat.colors(256))

training_Predictions %>% kable(caption = "Training Predictions") %>% kable_styling("striped", full_width = TRUE)
Training Predictions
|
Oud.For.Greatness
|
Rasasi.La.Yuqawam
|
Halfeti.Perfume
|
Raghba
|
Armani.Prive.Oud
|
ESTEE.LAUDER
|
Club.De.Nuit
|
Dehn.El.Oud.Malaki
|
Baccarat.Rouge
|
Shaghaf.Oud
|
Awad
|
4.799499
|
3.866165
|
5.000000
|
4.609023
|
5.000000
|
4.299499
|
3.632832
|
4.266165
|
5.000000
|
4.799499
|
Arafaat
|
3.394737
|
2.461403
|
3.661404
|
3.204261
|
3.661404
|
2.894737
|
2.228070
|
2.861404
|
3.894737
|
3.394737
|
Jumail
|
3.478070
|
2.544737
|
3.744737
|
3.287594
|
3.744737
|
2.978070
|
2.311403
|
2.944737
|
3.978070
|
3.478070
|
Kaarim
|
3.116959
|
2.183626
|
3.383626
|
2.926483
|
3.383626
|
2.616959
|
1.950292
|
2.583626
|
3.616959
|
3.116959
|
Zaaid
|
2.672515
|
1.739181
|
2.939181
|
2.482038
|
2.939181
|
2.172515
|
1.505848
|
2.139181
|
3.172515
|
2.672515
|
Naseer
|
3.328070
|
2.394737
|
3.594737
|
3.137594
|
3.594737
|
2.828070
|
2.161404
|
2.794737
|
3.828070
|
3.328070
|
Jameel
|
2.853070
|
1.919737
|
3.119737
|
2.662594
|
3.119737
|
2.353070
|
1.686403
|
2.319737
|
3.353070
|
2.853070
|
testing_Predictions <- baseline_predictor(raw_avg_train, user_bias_testing, item_bias_training)
testing_Predictions %>% kable(caption = "Testing Predictions") %>% kable_styling("striped", full_width = TRUE)
Testing Predictions
|
Oud.For.Greatness
|
Rasasi.La.Yuqawam
|
Halfeti.Perfume
|
Raghba
|
Armani.Prive.Oud
|
ESTEE.LAUDER
|
Club.De.Nuit
|
Dehn.El.Oud.Malaki
|
Baccarat.Rouge
|
Shaghaf.Oud
|
Fayyaad
|
4.103070
|
3.169737
|
4.369737
|
3.912594
|
4.369737
|
3.603070
|
2.936403
|
3.569737
|
4.603070
|
4.103070
|
Abdul Qayyoom
|
3.103070
|
2.169737
|
3.369737
|
2.912594
|
3.369737
|
2.603070
|
1.936403
|
2.569737
|
3.603070
|
3.103070
|
Rida
|
2.799499
|
1.866165
|
3.066165
|
2.609023
|
3.066165
|
2.299499
|
1.632832
|
2.266165
|
3.299499
|
2.799499
|
Calculate the RMSE for the baseline predictors for both your training data and your test data.
# the RMSE for the baseline predictors for training set
RMSE_training_Predictions <- RMSE(as.matrix(training.data), as.matrix(training_Predictions))
RMSE_training_Predictions
## [1] 1.102288
# the RMSE for the baseline predictors for test data
RMSE_testing_Predictions <- RMSE(as.matrix(testing.data), as.matrix(testing_Predictions))
RMSE_testing_Predictions
## [1] 1.69063
Summarizing results
trainVec <- c(RMSE_training.data,
RMSE_training_Predictions,
(1-RMSE_training_Predictions/RMSE_training.data)*100)
testVec <- c(RMSE_testing.data,
RMSE_testing_Predictions,
(1-RMSE_testing_Predictions/RMSE_testing.data)*100)
summary <- data.frame(trainVec, testVec)
names(summary) <- c("Training", "Testing")
row.names(summary) <- c("Raw Average RMSE",
"Simple Predictor RMSE",
"Percent Improvement")
summary %>% kable(caption = "Summary") %>% kable_styling("striped", full_width = TRUE)
Summary
|
Training
|
Testing
|
Raw Average RMSE
|
1.385187
|
1.453979
|
Simple Predictor RMSE
|
1.102288
|
1.690630
|
Percent Improvement
|
20.423168
|
-16.276060
|
RMSE is lower for the baseline predictor compared to raw average predictor for the Training data set. 20% improvement on the Training data set. For the Testing data set, we’ve got no improvement. This is due to the fact that the test data set being randomly generated and has no inherent biases and therefore the training biases for items applied to the testing set did not improve performance. On the other hand, these same bias are fit for the training set, hence will clearly improve the RMSE for the training set.