Project Description

This system recommends movies to viewers (users)
We will utilzed the Baseline Predictor Methodology which = Avg Mean + User Bias + Movie Bias
A small toy sample data set was created for this exercise: 6 users spanning 5 movies were given ratings arbitrarily
Toy sample data set were partion into a Training & Testing set respectively

Loading Libraries

library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library("scales")

Loading Dataframe

url1 <- "https://raw.githubusercontent.com/ssufian/DAT-612/master/Projects/movie_rating.csv"

mydata <- read.csv(file=url1, sep=",",na.strings = c("NA","",""),strip.white = T, stringsAsFactors = F, header=T)
# Make sure the 1st column is a factor
mydata$MovieID<- factor(mydata$MovieID)

head(mydata)# Original Table in Long format

##             MovieID UserID Rating
## 1 Lovecraft Country      1      5
## 2             X-Men      1     NA
## 3    Schitt's Creek      1      4
## 4             Joker      1     NA
## 5         Wolverine      1      4
## 6 Lovecraft Country      2      4

Create a user-item matrix

#Casting long format data to wide format
UIMatrix <- acast(mydata, UserID~MovieID, value.var="Rating")
UIMatrix  <- apply(UIMatrix , 2,as.numeric) 

# Original user-movie matrix 
set.seed(12)
kable(UIMatrix, format = "pandoc", digits = 3,align= "c",caption = "Fig1a: Original User-Movie Matrix",font_size=12)

Fig1a: Original User-Movie Matrix
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
NA	5	4	4	NA
NA	4	5	4	3
NA	NA	NA	3	2
1	2	3	2	NA
4	4	NA	5	NA
4	4	5	NA	2

#SplitRatio for 70%:30% splitting
UIMatrix1<-sort(sample(nrow(UIMatrix), nrow(UIMatrix)*.7))

#subsetting into Train data
train <- UIMatrix[UIMatrix1,]
# Train user-movie matrix 
kable(train, format = "pandoc", digits = 3,align= "c",caption = "Fig1b: User-Movie Train Matrix",font_size=12)

Fig1b: User-Movie Train Matrix
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
NA	4	5	4	3
NA	NA	NA	3	2
1	2	3	2	NA
4	4	5	NA	2

#subsetting into Test data
test<-UIMatrix[-UIMatrix1,]
# Test user-movie matrix 
kable(test, format = "pandoc", digits = 3,align= "c",caption = "Fig1c: User-Movie Test Matrix",font_size=12)

Fig1c: User-Movie Test Matrix
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
NA	5	4	4	NA
4	4	NA	5	NA

Obtain the raw average (mean) rating for every user-item combination from the training dataset

# calculating the mean of all numberical entries in training dataset

raw_avg<-apply(train, 2, mean, na.rm = TRUE) %>% mean()
raw_avg

## [1] 3.1

Calculate the RMSE for raw average before Bias

-For both training data and test data

#function to calculate Square Error
simpleFunc <- function(x)
{
  ((x) - raw_avg)^2
}
#calculating the RSME of train set
SE_train<- apply(train,2,simpleFunc)
M_train <- apply(SE_train, 2, sum, na.rm = TRUE) %>% mean()
RSME_train <- sqrt(M_train ) 
sprintf("RSME trainset before bias: %s",format(round(RSME_train, 2), nsmall = 3))

## [1] "RSME trainset before bias: 1.990"

#calculating the RSME of test set
SE_test<- apply(test,2,simpleFunc)
M_test <- apply(SE_test, 2, sum, na.rm = TRUE) %>% mean()
RSME_test <- sqrt(M_test ) 
sprintf("RSME testset before bias: %s",format(round(RSME_test, 2), nsmall = 3))

## [1] "RSME testset before bias: 1.450"

Using your training data, calculate the bias for each user and movie.

#function to calculate Square Error
simpleFunc1 <- function(x)
{
  ((x) - raw_avg)
}

#User Bias
User_Bias1 <- apply(train, 1, mean,na.rm = TRUE) 
User_Bias<- sapply(User_Bias1,simpleFunc1)

#Movie Bias
movie_Bias1 <- apply(train, 2, mean,na.rm = TRUE) 
movie_Bias<- sapply(movie_Bias1,simpleFunc1)

User_Bias

## [1]  0.90 -0.60 -1.10  0.65

movie_Bias

##             Joker Lovecraft Country    Schitt's Creek         Wolverine 
##        -0.6000000         0.2333333         1.2333333        -0.1000000 
##             X-Men 
##        -0.7666667

Calculate the baseline predictors for every user-item combination.

Train Dataset
Test Dataset

#Baselinie Predictor for Train dataset
#Calculating the baseline predictor (raw average + userBias + movieBias)
train1a<-train #creating a new train matrix to not mess up the original training set
for (r in 1:nrow(train1a))   
    for (c in 1:ncol(train1a))  
         train1a[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]

kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig2a: Baseline Train set w/o clippings",font_size=12)

Fig2a: Baseline Train set w/o clippings
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
3.40	4.233	5.233	3.90	3.233
1.90	2.733	3.733	2.40	1.733
1.40	2.233	3.233	1.90	1.233
3.15	3.983	4.983	3.65	2.983

#Baselinie Predictor for Test dataset

test1b<-test #creating a new testmatrix to not mess up the original testing set
for (r in 1:nrow(test1b))   
    for (c in 1:ncol(test1b))  
         test1b[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]

kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig2b: Baseline Test set w/o clippings",font_size=12)

Fig2b: Baseline Test set w/o clippings
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
3.4	4.233	5.233	3.9	3.233
1.9	2.733	3.733	2.4	1.733

Clipping baseline predictors

-our movie ratings cannot be below 1 and above 5

#Clipped baseline predictors for Train Dataset
train1a[train1a<1]<-1
train1a[train1a>5]<-5

kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig3a: Baseline Train set w/ clippings",font_size=12)

Fig3a: Baseline Train set w/ clippings
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
3.40	4.233	5.000	3.90	3.233
1.90	2.733	3.733	2.40	1.733
1.40	2.233	3.233	1.90	1.233
3.15	3.983	4.983	3.65	2.983

#Clipped baseline predictors for TestDataset
test1b[test1b<1]<-1
test1b[test1b>5]<-5

kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig3b: Baseline Test set w/ clippings",font_size=12)

Fig3b: Baseline Test set w/ clippings
Joker	Lovecraft Country	Schitt’s Creek	Wolverine	X-Men
3.4	4.233	5.000	3.9	3.233
1.9	2.733	3.733	2.4	1.733

RMSE after Bias

Training dataset
Testing dataset

#calculating the RSME of train set
SE_train_afterbias<- apply(train1a,2,simpleFunc)
M_train_afterbias <- apply(SE_train_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_train_afterbias <- sqrt(M_train_afterbias) 
sprintf("RSME trainset after bias: %s",format(round(RSME_train_afterbias, 2), nsmall = 3))

## [1] "RSME trainset after bias: 2.150"

#calculating the RSME of test set
SE_test_afterbias<- apply(test1b,2,simpleFunc)
M_test_afterbias <- apply(SE_test_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_test_afterbias <- sqrt(M_test_afterbias ) 
sprintf("RSME testset after bias: %s",format(round(RSME_test_afterbias, 2), nsmall = 3))

## [1] "RSME testset after bias: 1.410"

Comparing the Results

comparing both training and testing RSME

# Training RSME comparision
train_comp <- (1-RSME_train_afterbias/RSME_train )
sprintf("RSME Train Percent: %s",percent(train_comp))

## [1] "RSME Train Percent: -8%"

# Testing RSME comparision
test_comp <- (1-RSME_test_afterbias/RSME_test)
sprintf("RSME Test Percent: %s",percent(test_comp))

## [1] "RSME Test Percent: 2%"

Summary of Results

In training set, we saw a -8% deteriation of the RSME post-biasing. While, in the testing set, we witnessed a slight 2% improvment post-biasing.  There are a multitude of reasons for such poor overall performance of which the most probable cause is the very small toy sample size of the Utility Matrix and the lack of trendlines that derived from that.  It is really hard to draw any conclusion based on such a small sample size.  This exercise is simply to allow the student a hands-on practice in creating and understanding how the Baseline Predictor Matrix is formulated for both the training and testing sets.  Lastly, it also allows the student to see how RSME is calculated and utilized as one of the accruacy metrics in Recommender Systems.

DAT 612 - Project 1

sufian

6/1/2020

Project Description

Loading Libraries

Loading Dataframe

Create a user-item matrix

Obtain the raw average (mean) rating for every user-item combination from the training dataset

Calculate the RMSE for raw average before Bias

Using your training data, calculate the bias for each user and movie.

Calculate the baseline predictors for every user-item combination.

Clipping baseline predictors

RMSE after Bias

Comparing the Results

Summary of Results