Introduction

# reading libraries
library(knitr)
library(kableExtra)
library(tidyverse)
library(recommenderlab)
library(dplyr)
This is my first assignment on recommender systems, so I will work with a small dataset that makes the results easier to explore. I am going to create a dataset of 6 users and their ratings of 6 movies, with the ratings generated at random. The recommender system built on this data will recommend movies to the users.
Creating data
# Fix the RNG seed so the random ratings (and the train/test split sampled
# later in the script) are reproducible from run to run; the seed value itself
# is arbitrary.
set.seed(123)
# Creating a 6 x 6 matrix of random ratings, each sampled from 1 to 6
data <- matrix(sample(1:6, 36, replace = TRUE), nrow = 6)
# creating users' names (one per row)
users <- c("Habib", "Bob", "Peter", "Smith", "Rebecca", "Steve")
# creating movies' names (one per column)
movies <- c(
  "V for Vendetta", "Lord of the Rings", "The Devil's Double",
  "National Treasure", "Mission Impossible III", "Wrong Turn"
)
# Labelling the rows with the users and the columns with the movies
rownames(data) <- users
colnames(data) <- movies
# Printing the data as a styled table
data %>%
  kable(caption = "**Random Ratings**") %>%
  kable_styling()
Random Ratings
|
V for Vendetta
|
Lord of the Rings
|
The Devil’s Double
|
National Treasure
|
Mission Impossible III
|
Wrong Turn
|
Habib
|
5
|
4
|
1
|
1
|
6
|
5
|
Bob
|
6
|
5
|
1
|
4
|
5
|
1
|
Peter
|
2
|
3
|
6
|
4
|
2
|
6
|
Smith
|
4
|
1
|
2
|
4
|
6
|
5
|
Rebecca
|
5
|
2
|
3
|
6
|
2
|
1
|
Steve
|
2
|
5
|
4
|
4
|
2
|
6
|
Creating training and test data
# splitting the data
# Draw 6 distinct cell positions (linear indices into the matrix) to hold out;
# seq_along() is used instead of 1:length() to build the index range safely.
split_data <- sample(seq_along(data), 6, replace = FALSE)
# Replacing the 6 sampled values with NA
# train data
train_data <- data
train_data[split_data] <- NA
train_data %>%
  kable(caption = "**Training data**") %>%
  kable_styling()
Training data
|
V for Vendetta
|
Lord of the Rings
|
The Devil’s Double
|
National Treasure
|
Mission Impossible III
|
Wrong Turn
|
Habib
|
5
|
4
|
1
|
NA
|
6
|
5
|
Bob
|
6
|
5
|
1
|
4
|
5
|
1
|
Peter
|
2
|
3
|
6
|
4
|
NA
|
6
|
Smith
|
NA
|
1
|
2
|
4
|
6
|
5
|
Rebecca
|
5
|
NA
|
NA
|
6
|
NA
|
1
|
Steve
|
2
|
5
|
4
|
4
|
2
|
6
|
# test data
# Complement of the training matrix: every cell that was NOT sampled into
# split_data is set to NA, leaving only the held-out ratings
test_data <- replace(data, -split_data, NA)
test_data %>%
  kable(caption = "**Testing data**") %>%
  kable_styling()
Testing data
|
V for Vendetta
|
Lord of the Rings
|
The Devil’s Double
|
National Treasure
|
Mission Impossible III
|
Wrong Turn
|
Habib
|
NA
|
NA
|
NA
|
1
|
NA
|
NA
|
Bob
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
Peter
|
NA
|
NA
|
NA
|
NA
|
2
|
NA
|
Smith
|
4
|
NA
|
NA
|
NA
|
NA
|
NA
|
Rebecca
|
NA
|
2
|
3
|
NA
|
2
|
NA
|
Steve
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
Calculating raw average
# Mean of all non-NA training ratings, rounded to two decimals
raw_avg <- train_data %>%
  mean(na.rm = TRUE) %>%
  round(digits = 2)
paste0("The raw average for every user-item combination is ", raw_avg) %>%
  print()
## [1] "The raw average for every user-item combination is 3.9"
# Filling a matrix shaped and labelled like the training data with the raw
# average; the dimensions come from train_data rather than hard-coded 36 / 6
raw_data <- matrix(
  raw_avg,
  nrow = nrow(train_data),
  ncol = ncol(train_data),
  dimnames = dimnames(train_data)
)
raw_data %>%
  kable(caption = "Raw average") %>%
  kable_styling()
Raw average
|
V for Vendetta
|
Lord of the Rings
|
The Devil’s Double
|
National Treasure
|
Mission Impossible III
|
Wrong Turn
|
Habib
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Bob
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Peter
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Smith
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Rebecca
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Steve
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
3.9
|
Calculating RMSE for raw average training and testing data
# RMSE for training dataset
# Training ratings compared against the constant raw-average matrix
rmse_train <- RMSE(train_data, predicted = raw_data) %>%
  round(digits = 2)
paste0("The RMSE for training dataset is ", rmse_train) %>%
  print()
## [1] "The RMSE for training dataset is 1.8"
# RMSE for testing dataset
# Held-out ratings compared against the constant raw-average matrix
rmse_test <- round(RMSE(test_data, predicted = raw_data), 2)
# The stray `2` previously passed to print() has been dropped: rounding is
# already done above, and the extra argument had no effect on a string.
print(paste0("The RMSE for testing dataset is ", rmse_test))
## [1] "The RMSE for testing dataset is 1.83"
Calculating the bias for each user and item
# calculating user bias
# Each user's mean non-NA training rating minus the raw average
user_bias <- (rowMeans(train_data, na.rm = TRUE) - raw_avg) %>%
  round(digits = 2)
user_bias %>%
  kable(col.names = "User bias") %>%
  kable_styling()
|
User bias
|
Habib
|
0.30
|
Bob
|
-0.23
|
Peter
|
0.30
|
Smith
|
-0.30
|
Rebecca
|
0.10
|
Steve
|
-0.07
|
# calculating item bias
# Each movie's mean non-NA training rating minus the raw average
item_bias <- (colMeans(train_data, na.rm = TRUE) - raw_avg) %>%
  round(digits = 2)
item_bias %>%
  kable(col.names = "Item bias") %>%
  kable_styling()
|
Item bias
|
V for Vendetta
|
0.10
|
Lord of the Rings
|
-0.30
|
The Devil’s Double
|
-1.10
|
National Treasure
|
0.50
|
Mission Impossible III
|
0.85
|
Wrong Turn
|
0.10
|
Calculating baseline predictors for every user-item combination
# Combined bias for every user-item pair: outer() adds each user bias to each
# item bias. Flattening goes column by column, so users vary fastest within a
# movie, which matches the column-wise fill of matrix() below.
bias <- as.vector(outer(user_bias, item_bias, "+"))
# calculating baseline predictors
# adding the bias values to the raw average and shaping the result like the
# training matrix (rows = users, columns = movies); dimensions and names are
# taken from train_data instead of being hard-coded
base_pred <- matrix(
  raw_avg + bias,
  nrow = nrow(train_data),
  ncol = ncol(train_data),
  dimnames = dimnames(train_data)
)
# printing matrix of baseline predictors for every user-item combination
base_pred %>%
  kable(
    caption = "**Baseline predictors for every user-item combination**"
  ) %>%
  kable_styling()
Baseline predictors for every user-item combination
|
V for Vendetta
|
Lord of the Rings
|
The Devil’s Double
|
National Treasure
|
Mission Impossible III
|
Wrong Turn
|
Habib
|
4.30
|
3.90
|
3.10
|
4.70
|
5.05
|
4.30
|
Bob
|
3.77
|
3.37
|
2.57
|
4.17
|
4.52
|
3.77
|
Peter
|
4.30
|
3.90
|
3.10
|
4.70
|
5.05
|
4.30
|
Smith
|
3.70
|
3.30
|
2.50
|
4.10
|
4.45
|
3.70
|
Rebecca
|
4.10
|
3.70
|
2.90
|
4.50
|
4.85
|
4.10
|
Steve
|
3.93
|
3.53
|
2.73
|
4.33
|
4.68
|
3.93
|
Calculating RMSE for baseline predictors for training and testing data
# RMSE for baseline predictors for training dataset
# Training ratings compared against the baseline-predictor matrix
rmse_bp_train <- RMSE(train_data, predicted = base_pred) %>%
  round(digits = 2)
paste0(
  "The RMSE for baseline predictors for training data is ",
  rmse_bp_train
) %>%
  print()
## [1] "The RMSE for baseline predictors for training data is 1.67"
# RMSE for baseline predictors for testing dataset
# Held-out ratings compared against the baseline-predictor matrix
rmse_bp_test <- RMSE(test_data, predicted = base_pred) %>%
  round(digits = 2)
paste0(
  "The RMSE for baseline predictors for testing data is ",
  rmse_bp_test
) %>%
  print()
## [1] "The RMSE for baseline predictors for testing data is 2.38"
Summary
# Labels for the four RMSE figures, in the same order as the values below
RMSE_C <- c(
  "RMSE for Raw Avg - Training",
  "RMSE for Raw Avg - Testing",
  "RMSE for Baseline Predictors - Training",
  "RMSE for Baseline Predictors - Testing"
)
# The last entry uses rmse_bp_test so that the baseline-predictor testing row
# reports its own RMSE rather than repeating the raw-average testing RMSE
RMSE_Values <- c(rmse_train, rmse_test, rmse_bp_train, rmse_bp_test)
# creating dataframe
summary <- data.frame(RMSE_C, RMSE_Values)
summary %>%
  kable() %>%
  kable_styling()
RMSE_C
|
RMSE_Values
|
RMSE for Raw Avg - Training
|
1.80
|
RMSE for Raw Avg - Testing
|
1.83
|
RMSE for Baseline Predictors - Training
|
1.67
|
RMSE for Baseline Predictors - Testing
|
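2.38
|
For the training data, the baseline predictors reduced the RMSE slightly (from 1.80 to 1.67), which is good. For the testing data, however, the RMSE increased from 1.83 to 2.38, so on this small random sample the baseline predictors did not generalize better than the raw average.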