Data612 - Project 1

This Recommender System recommends Arabic Perfumes Fragrances and Colognes to Men from the Middle East region. Each Parfum or Cologne is given a score on the scale from 0 to 5. This is a toy data set with 10 types of cologne and 10 customers (users). I used the package RandomNames to generate random common names in the Middle East. Names of Perfums and Colognes are picked from Arabic fragrances sites that sell these products.

library(randomNames)
library(tidyverse)
library(kableExtra)
library(knitr)

# randomly create a toy square matrix with n users and items, ranking 1 to 5

# set simulation vars
set.seed(12345)
n <- 10
percentNAs <- .18
probs <- c(1-percentNAs, percentNAs)

#use randomNames package to get names of persons randomly (which choose male from middle eastern ethnicity)
pnames <- randomNames(10, gender=0, ethnicity=6, which.names="first")
users <- c(pnames)

#could not find a package that can do random type of objects, so I created mine

items <- c("Oud For Greatness","Rasasi La Yuqawam","Halfeti Perfume","Raghba","Armani Prive Oud","ESTEE LAUDER","Club De Nuit","Dehn El Oud Malaki","Baccarat Rouge","Shaghaf Oud")

# generate data
myData <- as.data.frame(replicate(n, floor(runif(n, 1,6))))

myData <- as.data.frame(
    lapply(myData, function(df) df[sample(c(TRUE, NA), prob = probs, size = length(df), replace = TRUE) ]),
    row.names = users,
    col.names = items)

myData %>% kable(caption = "Arabic Perfumes and Colognes Ratings") %>% kable_styling("striped", full_width = TRUE)

Arabic Perfumes and Colognes Ratings
	Oud.For.Greatness	Rasasi.La.Yuqawam	Halfeti.Perfume	Raghba	Armani.Prive.Oud	ESTEE.LAUDER	Club.De.Nuit	Dehn.El.Oud.Malaki	Baccarat.Rouge	Shaghaf.Oud
Fayyaad	3	4	4	NA	4	NA	5	5	2	4
Jameel	2	1	3	5	2	3	3	NA	NA	2
Kaarim	5	1	5	2	NA	2	1	1	5	4
Naseer	4	4	4	2	4	2	1	4	4	2
Arafaat	4	NA	2	4	5	1	NA	NA	3	NA
Jumail	2	NA	NA	3	2	4	2	3	5	5
Awad	NA	5	NA	4	NA	5	5	4	4	5
Abdul Qayyoom	3	5	1	1	1	NA	NA	4	5	3
Rida	2	4	NA	NA	NA	3	5	1	2	1
Zaaid	3	1	4	2	5	NA	1	2	2	2

Break your ratings into separate training and test datasets

training_portion <- sample(n, size = floor(n * 0.75), replace=F)

training.data <- myData[training_portion, ]
testing.data <- myData[-training_portion, ]

training.data %>% kable(caption = "Training Dataset") %>% kable_styling("striped", full_width = TRUE)

Training Dataset
	Oud.For.Greatness	Rasasi.La.Yuqawam	Halfeti.Perfume	Raghba	Armani.Prive.Oud	ESTEE.LAUDER	Club.De.Nuit	Dehn.El.Oud.Malaki	Baccarat.Rouge	Shaghaf.Oud
Awad	NA	5	NA	4	NA	5	5	4	4	5
Arafaat	4	NA	2	4	5	1	NA	NA	3	NA
Jumail	2	NA	NA	3	2	4	2	3	5	5
Kaarim	5	1	5	2	NA	2	1	1	5	4
Zaaid	3	1	4	2	5	NA	1	2	2	2
Naseer	4	4	4	2	4	2	1	4	4	2
Jameel	2	1	3	5	2	3	3	NA	NA	2

testing.data %>% kable(caption = "Testing Dataset") %>% kable_styling("striped", full_width = TRUE)

Testing Dataset
	Oud.For.Greatness	Rasasi.La.Yuqawam	Halfeti.Perfume	Raghba	Armani.Prive.Oud	ESTEE.LAUDER	Club.De.Nuit	Dehn.El.Oud.Malaki	Baccarat.Rouge	Shaghaf.Oud
Fayyaad	3	4	4	NA	4	NA	5	5	2	4
Abdul Qayyoom	3	5	1	1	1	NA	NA	4	5	3
Rida	2	4	NA	NA	NA	3	5	1	2	1

Calculate the raw average (mean) rating for every user-item combination

# raw average rating (training set)
raw_avg_train <- mean(as.matrix(training.data), na.rm=TRUE)
raw_avg_train

## [1] 3.105263

Calculate the RMSE for raw average for both your training data and your test data

RMSE <- function(observed, predicted) {
    # get RMSE
    sqrt(mean((observed - predicted)^2, na.rm=TRUE))
}

# the RMSE for raw average for training set
RMSE_training.data <- RMSE(as.matrix(training.data), raw_avg_train)
RMSE_training.data

## [1] 1.385187

# the RMSE for raw average for test set
RMSE_testing.data <- RMSE(as.matrix(testing.data), raw_avg_train)
RMSE_testing.data

## [1] 1.453979

calculate the bias for each user and each item

user_bias_training <- rowMeans(as.matrix(training.data), na.rm = T) - raw_avg_train
user_bias_testing <- rowMeans(as.matrix(testing.data), na.rm = T) - raw_avg_train
item_bias_training <- colMeans(as.matrix(training.data), na.rm = T) - raw_avg_train

user_bias_training %>% kable(caption = "User Bias Training") %>% kable_styling("striped", full_width = TRUE)

User Bias Training
	x
Awad	1.4661654
Arafaat	0.0614035
Jumail	0.1447368
Kaarim	-0.2163743
Zaaid	-0.6608187
Naseer	-0.0052632
Jameel	-0.4802632

item_bias_training %>% kable(caption = "Item Bias Training") %>% kable_styling("striped", full_width = TRUE)

Item Bias Training
	x
Oud.For.Greatness	0.2280702
Rasasi.La.Yuqawam	-0.7052632
Halfeti.Perfume	0.4947368
Raghba	0.0375940
Armani.Prive.Oud	0.4947368
ESTEE.LAUDER	-0.2719298
Club.De.Nuit	-0.9385965
Dehn.El.Oud.Malaki	-0.3052632
Baccarat.Rouge	0.7280702
Shaghaf.Oud	0.2280702

user_bias_testing %>% kable(caption = "Item Bias Testing") %>% kable_styling("striped", full_width = TRUE)

Item Bias Testing
	x
Fayyaad	0.7697368
Abdul Qayyoom	-0.2302632
Rida	-0.5338346

Calculate the baseline predictors for every user-item combination.

baseline_predictor <- function(rawAvg, userBias, itemBias) {
    # calculates predictions based on rawaverage and user and item bias calcs
    userlist <- names(userBias)
    itemlist <- names(itemBias)
    df <- data.frame()
    
    for (i in userBias) {
        UserPred <- rawAvg + i + itemBias
        df <- rbind(df, UserPred)
    }
    
    # Like the videos lectures suggests, we should not exceed our max and min values 
    df[df > 5] <- 5
    df[df < 1] <- 1
    
    row.names(df) <- userlist
    names(df) <- itemlist
    df
}

training_Predictions <- baseline_predictor(raw_avg_train, user_bias_training, item_bias_training)
heatmap(as.matrix(training_Predictions), Rowv=NA, Colv=NA, scale = 'none', col = heat.colors(256))

training_Predictions %>% kable(caption = "Training Predictions") %>% kable_styling("striped", full_width = TRUE)

Training Predictions
	Oud.For.Greatness	Rasasi.La.Yuqawam	Halfeti.Perfume	Raghba	Armani.Prive.Oud	ESTEE.LAUDER	Club.De.Nuit	Dehn.El.Oud.Malaki	Baccarat.Rouge	Shaghaf.Oud
Awad	4.799499	3.866165	5.000000	4.609023	5.000000	4.299499	3.632832	4.266165	5.000000	4.799499
Arafaat	3.394737	2.461403	3.661404	3.204261	3.661404	2.894737	2.228070	2.861404	3.894737	3.394737
Jumail	3.478070	2.544737	3.744737	3.287594	3.744737	2.978070	2.311403	2.944737	3.978070	3.478070
Kaarim	3.116959	2.183626	3.383626	2.926483	3.383626	2.616959	1.950292	2.583626	3.616959	3.116959
Zaaid	2.672515	1.739181	2.939181	2.482038	2.939181	2.172515	1.505848	2.139181	3.172515	2.672515
Naseer	3.328070	2.394737	3.594737	3.137594	3.594737	2.828070	2.161404	2.794737	3.828070	3.328070
Jameel	2.853070	1.919737	3.119737	2.662594	3.119737	2.353070	1.686403	2.319737	3.353070	2.853070

testing_Predictions <- baseline_predictor(raw_avg_train, user_bias_testing, item_bias_training)
testing_Predictions %>% kable(caption = "Testing Predictions") %>% kable_styling("striped", full_width = TRUE)

Testing Predictions
	Oud.For.Greatness	Rasasi.La.Yuqawam	Halfeti.Perfume	Raghba	Armani.Prive.Oud	ESTEE.LAUDER	Club.De.Nuit	Dehn.El.Oud.Malaki	Baccarat.Rouge	Shaghaf.Oud
Fayyaad	4.103070	3.169737	4.369737	3.912594	4.369737	3.603070	2.936403	3.569737	4.603070	4.103070
Abdul Qayyoom	3.103070	2.169737	3.369737	2.912594	3.369737	2.603070	1.936403	2.569737	3.603070	3.103070
Rida	2.799499	1.866165	3.066165	2.609023	3.066165	2.299499	1.632832	2.266165	3.299499	2.799499

Calculate the RMSE for the baseline predictors for both your training data and your test data.

# the RMSE for the baseline predictors for training set
RMSE_training_Predictions <- RMSE(as.matrix(training.data), as.matrix(training_Predictions))
RMSE_training_Predictions

## [1] 1.102288

# the RMSE for the baseline predictors for test data
RMSE_testing_Predictions <- RMSE(as.matrix(testing.data), as.matrix(testing_Predictions))
RMSE_testing_Predictions

## [1] 1.69063

Summarizing results

trainVec <- c(RMSE_training.data,
              RMSE_training_Predictions,
              (1-RMSE_training_Predictions/RMSE_training.data)*100)
testVec <- c(RMSE_testing.data, 
             RMSE_testing_Predictions, 
             (1-RMSE_testing_Predictions/RMSE_testing.data)*100)
summary <- data.frame(trainVec, testVec)
names(summary) <- c("Training", "Testing")
row.names(summary) <- c("Raw Average RMSE", 
                       "Simple Predictor RMSE", 
                       "Percent Improvement")

summary %>% kable(caption = "Summary") %>% kable_styling("striped", full_width = TRUE)

Summary
	Training	Testing
Raw Average RMSE	1.385187	1.453979
Simple Predictor RMSE	1.102288	1.690630
Percent Improvement	20.423168	-16.276060

RMSE is lower for the baseline predictor compared to raw average predictor for the Training data set. 20% improvement on the Training data set. For the Testing data set, we’ve got no improvement. This is due to the fact that the test data set being randomly generated and has no inherent biases and therefore the training biases for items applied to the testing set did not improve performance. On the other hand, these same bias are fit for the training set, hence will clearly improve the RMSE for the training set.