Introduction

# reading libraries
library(knitr)
library(kableExtra)
library(tidyverse)
library(recommenderlab)
library(dplyr)

This is my first assignment on recommender systems, so I will work with a small dataset that makes the results easy to explore. I am going to create a dataset of 6 users and their ratings of 6 movies, where each rating is a random integer from 1 to 6. The recommender system will recommend movies to these users.

Creating data

# Names of the six users (one per matrix row)
users <- c("Habib", "Bob", "Peter", "Smith", "Rebecca", "Steve")

# Titles of the six movies (one per matrix column)
movies <- c(
  "V for Vendetta", "Lord of the Rings", "The Devil's Double",
  "National Treasure", "Mission Impossible III", "Wrong Turn"
)

# 6 x 6 matrix of random integer ratings (1-6, drawn with replacement),
# labelled with the user and movie names at construction time.
# No seed is set in the visible code, so the ratings differ from run to run.
data <- matrix(
  sample(1:6, 36, replace = TRUE),
  nrow = 6,
  dimnames = list(users, movies)
)

# Render the ratings table
data %>%
  kable(caption = "**Random Ratings**") %>%
  kable_styling()
Random Ratings
V for Vendetta Lord of the Rings The Devil’s Double National Treasure Mission Impossible III Wrong Turn
Habib 5 4 1 1 6 5
Bob 6 5 1 4 5 1
Peter 2 3 6 4 2 6
Smith 4 1 2 4 6 5
Rebecca 5 2 3 6 2 1
Steve 2 5 4 4 2 6

Creating training and test data

# Pick the held-out cells: 6 distinct linear indices into the ratings matrix.
# seq_along() replaces 1:length(), which would silently yield c(1, 0) for an
# empty object.
split_data <- sample(seq_along(data), 6, replace = FALSE)

# train data: a copy of the ratings in which the 6 held-out cells are
# replaced with NA
train_data <- data
train_data[split_data] <- NA
train_data %>% kable(caption = "**Training data**") %>% kable_styling()
Training data
V for Vendetta Lord of the Rings The Devil’s Double National Treasure Mission Impossible III Wrong Turn
Habib 5 4 1 NA 6 5
Bob 6 5 1 4 5 1
Peter 2 3 6 4 NA 6
Smith NA 1 2 4 6 5
Rebecca 5 NA NA 6 NA 1
Steve 2 5 4 4 2 6
# test data: the complement of the training set -- every cell that was NOT
# held out is set to NA, leaving only the held-out ratings
test_data <- replace(data, -split_data, NA)
kable_styling(kable(test_data, caption = "**Testing data**"))
Testing data
V for Vendetta Lord of the Rings The Devil’s Double National Treasure Mission Impossible III Wrong Turn
Habib NA NA NA 1 NA NA
Bob NA NA NA NA NA NA
Peter NA NA NA NA 2 NA
Smith 4 NA NA NA NA NA
Rebecca NA 2 3 NA 2 NA
Steve NA NA NA NA NA NA

Calculating raw average

# Mean of all non-NA training ratings, rounded to 2 decimals
raw_avg <- train_data %>%
  mean(na.rm = TRUE) %>%
  round(digits = 2)
print(paste0("The raw average for every user-item combination is ", raw_avg))
## [1] "The raw average for every user-item combination is 3.9"
# Raw-average predictions: a matrix shaped and labelled like train_data in
# which every user-item cell holds raw_avg. Copying train_data and
# overwriting all of its cells keeps the dimensions and row/column names in
# sync with the data instead of hard-coding 36 cells and 6 rows.
raw_data <- train_data
raw_data[] <- raw_avg
raw_data %>% kable(caption = "Raw average") %>% kable_styling()
Raw average
V for Vendetta Lord of the Rings The Devil’s Double National Treasure Mission Impossible III Wrong Turn
Habib 3.9 3.9 3.9 3.9 3.9 3.9
Bob 3.9 3.9 3.9 3.9 3.9 3.9
Peter 3.9 3.9 3.9 3.9 3.9 3.9
Smith 3.9 3.9 3.9 3.9 3.9 3.9
Rebecca 3.9 3.9 3.9 3.9 3.9 3.9
Steve 3.9 3.9 3.9 3.9 3.9 3.9

Calculating RMSE for raw average training and testing data

# RMSE of the raw-average predictions against the training ratings,
# rounded to 2 decimals
rmse_train <- train_data %>%
  RMSE(predicted = raw_data) %>%
  round(digits = 2)
print(paste0("The RMSE for training dataset is ", rmse_train))
## [1] "The RMSE for training dataset is 1.8"
# RMSE of the raw-average predictions against the held-out testing ratings,
# rounded to 2 decimals
rmse_test <- round(RMSE(test_data, predicted = raw_data), 2)
# The value is already rounded above, so print() receives only the message
print(paste0("The RMSE for testing dataset is ", rmse_test))
## [1] "The RMSE for testing dataset is 1.83"

Calculating the bias for each user and item

# User bias: each user's mean non-NA training rating minus the raw average,
# rounded to 2 decimals
user_bias <- round(rowMeans(train_data, na.rm = TRUE) - raw_avg, digits = 2)
kable_styling(kable(user_bias, col.names = "User bias"))
User bias
Habib 0.30
Bob -0.23
Peter 0.30
Smith -0.30
Rebecca 0.10
Steve -0.07
# Item bias: each movie's mean non-NA training rating minus the raw average,
# rounded to 2 decimals
item_bias <- round(colMeans(train_data, na.rm = TRUE) - raw_avg, digits = 2)
kable_styling(kable(item_bias, col.names = "Item bias"))
Item bias
V for Vendetta 0.10
Lord of the Rings -0.30
The Devil’s Double -1.10
National Treasure 0.50
Mission Impossible III 0.85
Wrong Turn 0.10

Calculating baseline predictors for every user-item combination

# Combined bias for every user-item pair: user_bias[i] + item_bias[j].
# outer() builds the full users x items grid directly; as.vector() flattens it
# column by column (users vary fastest), so `bias` stays a plain unnamed
# vector with one entry per user-item combination.
bias <- as.vector(outer(user_bias, item_bias, FUN = "+"))

# calculating baseline predictors

# adding the bias values to the raw average (raw_avg is recycled over every
# cell) and reshaping into a users x items matrix
base_pred <- matrix(raw_avg + bias, nrow = length(user_bias))

# adding column and row names to the matrix
colnames(base_pred) <- colnames(train_data)
rownames(base_pred) <- rownames(train_data)

# printing matrix of baseline predictors for every user-item combination
base_pred %>%
  kable(caption = "**Baseline predictors for every user-item combination**") %>%
  kable_styling()
Baseline predictors for every user-item combination
V for Vendetta Lord of the Rings The Devil’s Double National Treasure Mission Impossible III Wrong Turn
Habib 4.30 3.90 3.10 4.70 5.05 4.30
Bob 3.77 3.37 2.57 4.17 4.52 3.77
Peter 4.30 3.90 3.10 4.70 5.05 4.30
Smith 3.70 3.30 2.50 4.10 4.45 3.70
Rebecca 4.10 3.70 2.90 4.50 4.85 4.10
Steve 3.93 3.53 2.73 4.33 4.68 3.93

Calculating RMSE for baseline predictors for training and testing data

# RMSE of the baseline predictors against the training ratings,
# rounded to 2 decimals
rmse_bp_train <- train_data %>%
  RMSE(predicted = base_pred) %>%
  round(digits = 2)
print(paste0("The RMSE for baseline predictors for training data is ", rmse_bp_train))
## [1] "The RMSE for baseline predictors for training data is 1.67"
# RMSE of the baseline predictors against the held-out testing ratings,
# rounded to 2 decimals
rmse_bp_test <- test_data %>%
  RMSE(predicted = base_pred) %>%
  round(digits = 2)
print(paste0("The RMSE for baseline predictors for testing data is ", rmse_bp_test))
## [1] "The RMSE for baseline predictors for testing data is 2.38"

Summary

# Row labels for the four RMSE figures, in the same order as the values below
RMSE_C <- c(
  "RMSE for Raw Avg - Training",
  "RMSE for Raw Avg - Testing",
  "RMSE for Baseline Predictors - Training",
  "RMSE for Baseline Predictors - Testing"
)
# Each value must line up with its label: the fourth entry is the baseline
# predictor RMSE on the testing data (rmse_bp_test), not the raw-average one
RMSE_Values <- c(rmse_train, rmse_test, rmse_bp_train, rmse_bp_test)

# creating dataframe
summary <- data.frame(RMSE_C, RMSE_Values)
summary %>% kable() %>% kable_styling()
RMSE_C RMSE_Values
RMSE for Raw Avg - Training 1.80
RMSE for Raw Avg - Testing 1.83
RMSE for Baseline Predictors - Training 1.67
RMSE for Baseline Predictors - Testing 2.38

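For the training data, the RMSE dropped slightly with the baseline predictors (from 1.80 to 1.67), which is good. For the testing data, however, the RMSE increased (from 1.83 to 2.38), which suggests that user and item biases estimated from so few ratings do not generalize to the held-out ratings.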