Loading Libraries
library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library("scales")
Loading Dataframe
url1 <- "https://raw.githubusercontent.com/ssufian/DAT-612/master/Projects/movie_rating.csv"
mydata <- read.csv(file=url1, sep=",",na.strings = c("NA","",""),strip.white = T, stringsAsFactors = F, header=T)
# Make sure the 1st column is a factor
mydata$MovieID<- factor(mydata$MovieID)
head(mydata)# Original Table in Long format
## MovieID UserID Rating
## 1 Lovecraft Country 1 5
## 2 X-Men 1 NA
## 3 Schitt's Creek 1 4
## 4 Joker 1 NA
## 5 Wolverine 1 4
## 6 Lovecraft Country 2 4
Create a user-item matrix
#Casting long format data to wide format
UIMatrix <- acast(mydata, UserID~MovieID, value.var="Rating")
UIMatrix <- apply(UIMatrix , 2,as.numeric)
# Original user-movie matrix
set.seed(12)
kable(UIMatrix, format = "pandoc", digits = 3,align= "c",caption = "Fig1a: Original User-Movie Matrix",font_size=12)
Fig1a: Original User-Movie Matrix
NA |
5 |
4 |
4 |
NA |
NA |
4 |
5 |
4 |
3 |
NA |
NA |
NA |
3 |
2 |
1 |
2 |
3 |
2 |
NA |
4 |
4 |
NA |
5 |
NA |
4 |
4 |
5 |
NA |
2 |
#SplitRatio for 70%:30% splitting
UIMatrix1<-sort(sample(nrow(UIMatrix), nrow(UIMatrix)*.7))
#subsetting into Train data
train <- UIMatrix[UIMatrix1,]
# Train user-movie matrix
kable(train, format = "pandoc", digits = 3,align= "c",caption = "Fig1b: User-Movie Train Matrix",font_size=12)
Fig1b: User-Movie Train Matrix
NA |
4 |
5 |
4 |
3 |
NA |
NA |
NA |
3 |
2 |
1 |
2 |
3 |
2 |
NA |
4 |
4 |
5 |
NA |
2 |
#subsetting into Test data
test<-UIMatrix[-UIMatrix1,]
# Test user-movie matrix
kable(test, format = "pandoc", digits = 3,align= "c",caption = "Fig1c: User-Movie Test Matrix",font_size=12)
Fig1c: User-Movie Test Matrix
NA |
5 |
4 |
4 |
NA |
4 |
4 |
NA |
5 |
NA |
Obtain the raw average (mean) rating for every user-item combination from the training dataset
# calculating the mean of all numberical entries in training dataset
raw_avg<-apply(train, 2, mean, na.rm = TRUE) %>% mean()
raw_avg
## [1] 3.1
Calculate the RMSE for raw average before Bias
-For both training data and test data
#function to calculate Square Error
simpleFunc <- function(x)
{
((x) - raw_avg)^2
}
#calculating the RSME of train set
SE_train<- apply(train,2,simpleFunc)
M_train <- apply(SE_train, 2, sum, na.rm = TRUE) %>% mean()
RSME_train <- sqrt(M_train )
sprintf("RSME trainset before bias: %s",format(round(RSME_train, 2), nsmall = 3))
## [1] "RSME trainset before bias: 1.990"
#calculating the RSME of test set
SE_test<- apply(test,2,simpleFunc)
M_test <- apply(SE_test, 2, sum, na.rm = TRUE) %>% mean()
RSME_test <- sqrt(M_test )
sprintf("RSME testset before bias: %s",format(round(RSME_test, 2), nsmall = 3))
## [1] "RSME testset before bias: 1.450"
Using your training data, calculate the bias for each user and movie.
#function to calculate Square Error
simpleFunc1 <- function(x)
{
((x) - raw_avg)
}
#User Bias
User_Bias1 <- apply(train, 1, mean,na.rm = TRUE)
User_Bias<- sapply(User_Bias1,simpleFunc1)
#Movie Bias
movie_Bias1 <- apply(train, 2, mean,na.rm = TRUE)
movie_Bias<- sapply(movie_Bias1,simpleFunc1)
User_Bias
## [1] 0.90 -0.60 -1.10 0.65
movie_Bias
## Joker Lovecraft Country Schitt's Creek Wolverine
## -0.6000000 0.2333333 1.2333333 -0.1000000
## X-Men
## -0.7666667
Calculate the baseline predictors for every user-item combination.
- Train Dataset
- Test Dataset
#Baselinie Predictor for Train dataset
#Calculating the baseline predictor (raw average + userBias + movieBias)
train1a<-train #creating a new train matrix to not mess up the original training set
for (r in 1:nrow(train1a))
for (c in 1:ncol(train1a))
train1a[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]
kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig2a: Baseline Train set w/o clippings",font_size=12)
Fig2a: Baseline Train set w/o clippings
3.40 |
4.233 |
5.233 |
3.90 |
3.233 |
1.90 |
2.733 |
3.733 |
2.40 |
1.733 |
1.40 |
2.233 |
3.233 |
1.90 |
1.233 |
3.15 |
3.983 |
4.983 |
3.65 |
2.983 |
#Baselinie Predictor for Test dataset
test1b<-test #creating a new testmatrix to not mess up the original testing set
for (r in 1:nrow(test1b))
for (c in 1:ncol(test1b))
test1b[r,c]<-raw_avg+User_Bias[[r]]+movie_Bias[[c]]
kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig2b: Baseline Test set w/o clippings",font_size=12)
Fig2b: Baseline Test set w/o clippings
3.4 |
4.233 |
5.233 |
3.9 |
3.233 |
1.9 |
2.733 |
3.733 |
2.4 |
1.733 |
Clipping baseline predictors
-our movie ratings cannot be below 1 and above 5
#Clipped baseline predictors for Train Dataset
train1a[train1a<1]<-1
train1a[train1a>5]<-5
kable(train1a, format = "pandoc", digits = 3,align= "c",caption = "Fig3a: Baseline Train set w/ clippings",font_size=12)
Fig3a: Baseline Train set w/ clippings
3.40 |
4.233 |
5.000 |
3.90 |
3.233 |
1.90 |
2.733 |
3.733 |
2.40 |
1.733 |
1.40 |
2.233 |
3.233 |
1.90 |
1.233 |
3.15 |
3.983 |
4.983 |
3.65 |
2.983 |
#Clipped baseline predictors for TestDataset
test1b[test1b<1]<-1
test1b[test1b>5]<-5
kable(test1b, format = "pandoc", digits = 3,align= "c",caption = "Fig3b: Baseline Test set w/ clippings",font_size=12)
Fig3b: Baseline Test set w/ clippings
3.4 |
4.233 |
5.000 |
3.9 |
3.233 |
1.9 |
2.733 |
3.733 |
2.4 |
1.733 |
RMSE after Bias
- Training dataset
- Testing dataset
#calculating the RSME of train set
SE_train_afterbias<- apply(train1a,2,simpleFunc)
M_train_afterbias <- apply(SE_train_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_train_afterbias <- sqrt(M_train_afterbias)
sprintf("RSME trainset after bias: %s",format(round(RSME_train_afterbias, 2), nsmall = 3))
## [1] "RSME trainset after bias: 2.150"
#calculating the RSME of test set
SE_test_afterbias<- apply(test1b,2,simpleFunc)
M_test_afterbias <- apply(SE_test_afterbias, 2, sum, na.rm = TRUE) %>% mean()
RSME_test_afterbias <- sqrt(M_test_afterbias )
sprintf("RSME testset after bias: %s",format(round(RSME_test_afterbias, 2), nsmall = 3))
## [1] "RSME testset after bias: 1.410"
Comparing the Results
- comparing both training and testing RSME
# Training RSME comparision
train_comp <- (1-RSME_train_afterbias/RSME_train )
sprintf("RSME Train Percent: %s",percent(train_comp))
## [1] "RSME Train Percent: -8%"
# Testing RSME comparision
test_comp <- (1-RSME_test_afterbias/RSME_test)
sprintf("RSME Test Percent: %s",percent(test_comp))
## [1] "RSME Test Percent: 2%"