Project Description


Loading Libraries


library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library("scales")

Loading Dataframe

url1 <- "https://raw.githubusercontent.com/ssufian/DAT-612/master/Projects/movie_rating.csv"

mydata <- read.csv(file=url1, sep=",",na.strings = c("NA","",""),strip.white = T, stringsAsFactors = F, header=T)
# Make sure the 1st column is a factor
mydata$MovieID<- factor(mydata$MovieID)

head(mydata)# Original Table in Long format
##             MovieID UserID Rating
## 1 Lovecraft Country      1      5
## 2             X-Men      1     NA
## 3    Schitt's Creek      1      4
## 4             Joker      1     NA
## 5         Wolverine      1      4
## 6 Lovecraft Country      2      4

Create a user-item matrix

#Casting long format data to wide format
UIMatrix <- acast(mydata, UserID~MovieID, value.var="Rating")
UIMatrix  <- apply(UIMatrix , 2,as.numeric) 

# Original user-movie matrix 
set.seed(12)
kable(UIMatrix, format = "pandoc", digits = 3,align= "c",caption = "Fig1a: Original User-Movie Matrix",font_size=12) 
Fig1a: Original User-Movie Matrix
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
NA 5 4 4 NA
NA 4 5 4 3
NA NA NA 3 2
1 2 3 2 NA
4 4 NA 5 NA
4 4 5 NA 2
#SplitRatio for 70%:30% splitting
UIMatrix1<-sort(sample(nrow(UIMatrix), nrow(UIMatrix)*.7))

#subsetting into Train data
train <- UIMatrix[UIMatrix1,]
# Train user-movie matrix 
kable(train, format = "pandoc", digits = 3,align= "c",caption = "Fig1b: User-Movie Train Matrix",font_size=12) 
Fig1b: User-Movie Train Matrix
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
NA 4 5 4 3
NA NA NA 3 2
1 2 3 2 NA
4 4 5 NA 2
#subsetting into Test data
test<-UIMatrix[-UIMatrix1,]
# Test user-movie matrix 
kable(test, format = "pandoc", digits = 3,align= "c",caption = "Fig1c: User-Movie Test Matrix",font_size=12) 
Fig1c: User-Movie Test Matrix
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
NA 5 4 4 NA
4 4 NA 5 NA

Obtain the raw average (mean) rating for every user-item combination from the training dataset

# calculating the mean of all numberical entries in training dataset

raw_avg<-apply(train, 2, mean, na.rm = TRUE) %>% mean()
raw_avg
## [1] 3.1

Calculate the RMSE for raw average before Bias

-For both training data and test data

#function to calculate Square Error
simpleFunc <- function(x)
{
  ((x) - raw_avg)^2
}
#calculating the RSME of train set
SE_train<- apply(train,2,simpleFunc)
M_train <- apply(SE_train, 2, sum, na.rm = TRUE) %>% mean()
RSME_train <- sqrt(M_train ) 
sprintf("RSME trainset before bias: %s",format(round(RSME_train, 2), nsmall = 3))
## [1] "RSME trainset before bias: 1.990"
#calculating the RSME of test set
SE_test<- apply(test,2,simpleFunc)
M_test <- apply(SE_test, 2, sum, na.rm = TRUE) %>% mean()
RSME_test <- sqrt(M_test ) 
sprintf("RSME testset before bias: %s",format(round(RSME_test, 2), nsmall = 3))
## [1] "RSME testset before bias: 1.450"

Using your training data, calculate the bias for each user and movie.

#function to calculate Square Error
simpleFunc1 <- function(x)
{
  ((x) - raw_avg)
}

#User Bias
User_Bias1 <- apply(train, 1, mean,na.rm = TRUE) 
User_Bias<- sapply(User_Bias1,simpleFunc1)

#Movie Bias
movie_Bias1 <- apply(train, 2, mean,na.rm = TRUE) 
movie_Bias<- sapply(movie_Bias1,simpleFunc1)

User_Bias
## [1]  0.90 -0.60 -1.10  0.65
movie_Bias
##             Joker Lovecraft Country    Schitt's Creek         Wolverine 
##        -0.6000000         0.2333333         1.2333333        -0.1000000 
##             X-Men 
##        -0.7666667

Calculate the baseline predictors for every user-item combination.

  • Train Dataset
  • Test Dataset
#Baselinie Predictor for Train dataset
#Calculating the baseline predictor (raw average + userBias + movieBias)
train1a<-train #creating a new train matrix to not mess up the original training set
for (r in 1:nrow(train1a))   
    for (c in 1:ncol(train1a))  
         train1a[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]

kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig2a: Baseline Train set w/o clippings",font_size=12) 
Fig2a: Baseline Train set w/o clippings
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
3.40 4.233 5.233 3.90 3.233
1.90 2.733 3.733 2.40 1.733
1.40 2.233 3.233 1.90 1.233
3.15 3.983 4.983 3.65 2.983
#Baselinie Predictor for Test dataset

test1b<-test #creating a new testmatrix to not mess up the original testing set
for (r in 1:nrow(test1b))   
    for (c in 1:ncol(test1b))  
         test1b[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]

kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig2b: Baseline Test set w/o clippings",font_size=12) 
Fig2b: Baseline Test set w/o clippings
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
3.4 4.233 5.233 3.9 3.233
1.9 2.733 3.733 2.4 1.733

Clipping baseline predictors

-our movie ratings cannot be below 1 and above 5

#Clipped baseline predictors for Train Dataset
train1a[train1a<1]<-1
train1a[train1a>5]<-5

kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig3a: Baseline Train set w/ clippings",font_size=12) 
Fig3a: Baseline Train set w/ clippings
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
3.40 4.233 5.000 3.90 3.233
1.90 2.733 3.733 2.40 1.733
1.40 2.233 3.233 1.90 1.233
3.15 3.983 4.983 3.65 2.983
#Clipped baseline predictors for TestDataset
test1b[test1b<1]<-1
test1b[test1b>5]<-5

kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig3b: Baseline Test set w/ clippings",font_size=12)
Fig3b: Baseline Test set w/ clippings
Joker Lovecraft Country Schitt’s Creek Wolverine X-Men
3.4 4.233 5.000 3.9 3.233
1.9 2.733 3.733 2.4 1.733

RMSE after Bias

#calculating the RSME of train set
SE_train_afterbias<- apply(train1a,2,simpleFunc)
M_train_afterbias <- apply(SE_train_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_train_afterbias <- sqrt(M_train_afterbias) 
sprintf("RSME trainset after bias: %s",format(round(RSME_train_afterbias, 2), nsmall = 3))
## [1] "RSME trainset after bias: 2.150"
#calculating the RSME of test set
SE_test_afterbias<- apply(test1b,2,simpleFunc)
M_test_afterbias <- apply(SE_test_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_test_afterbias <- sqrt(M_test_afterbias ) 
sprintf("RSME testset after bias: %s",format(round(RSME_test_afterbias, 2), nsmall = 3))
## [1] "RSME testset after bias: 1.410"

Comparing the Results

# Training RSME comparision
train_comp <- (1-RSME_train_afterbias/RSME_train )
sprintf("RSME Train Percent: %s",percent(train_comp))
## [1] "RSME Train Percent: -8%"
# Testing RSME comparision
test_comp <- (1-RSME_test_afterbias/RSME_test)
sprintf("RSME Test Percent: %s",percent(test_comp))
## [1] "RSME Test Percent: 2%"

Summary of Results

In training set, we saw a -8% deteriation of the RSME post-biasing. While, in the testing set, we witnessed a slight 2% improvment post-biasing.  There are a multitude of reasons for such poor overall performance of which the most probable cause is the very small toy sample size of the Utility Matrix and the lack of trendlines that derived from that.  It is really hard to draw any conclusion based on such a small sample size.  This exercise is simply to allow the student a hands-on practice in creating and understanding how the Baseline Predictor Matrix is formulated for both the training and testing sets.  Lastly, it also allows the student to see how RSME is calculated and utilized as one of the accruacy metrics in Recommender Systems.