Introduction

This system recommends movies to users. The dataset used contains 10 users and 8 movies and was built by myself through excel random number generator. The rating scale for a movie is range from 1 to 5, where 1 is the lowest rating and 5 is the highest rating.

Load R Packages

# Load required packages
library(tidyverse)

Load Data

# Load movie rating dataset
movies_rating <- read.csv("https://raw.githubusercontent.com/SieSiongWong/DATA-612/master/Movies%20Rating%20Dataset.csv")

head(movies_rating)
##   user M1 M2 M3 M4 M5 M6 M7 M8
## 1   U1  2  3  4  5  4  5  5  3
## 2   U2  5  4 NA  3  1  4  5 NA
## 3   U3  5  2  1  3  2 NA  1  4
## 4   U4  1  1  1  2  2  5  1  5
## 5   U5  3  1  4  1  3 NA  4  4
## 6   U6 NA  3 NA  2  2  3  3  2

User-Item Matrix

# Gather the movies rating dataset from wide to long format
movies_rating <- gather(movies_rating, "movie", "rating", -user)

# Convert the rating dataset into matrix format
user_item <- matrix(movies_rating$rating, nrow = length(unique(movies_rating$user)), byrow = FALSE, dimnames = list(c("U1","U2","U3","U4","U5","U6","U7","U8","U9","U10"),c("M1","M2","M3","M4","M5","M6","M7","M8")))

Training & Test Datasets

# Break the rating matrix into training and test datasets

# Set the seed for random sampling
set.seed(123)

# Copy user item matrix dataset
train <- user_item

# Random sample for 10 elements as 0 for test dataset
train[sample(1:length(train), 11, replace = FALSE)] <- 0

# Test dataset
test <- user_item - train
test[test==0] <- NA

# Training dataset
train[train==0] <- NA 

Raw Average

# Calculate the raw average (mean) rating for every user-item combination.
raw_average <- mean(train, na.rm = TRUE)

raw_average
## [1] 3.032787

RSME

# Calculate the RMSE for raw average for both training and test datasets

# Square error difference function
se <- function(x) {  
  ( x - raw_average ) ^ 2}

# Training dataset RMSE
train_RMSE <- sapply(train, se) %>% 
  mean(na.rm = TRUE) %>%
  sqrt()

train_RMSE
## [1] 1.413833
# Test dataset RMSE
test_RMSE <- sapply(test, se) %>% 
  mean(na.rm = TRUE) %>%
  sqrt()

test_RMSE
## [1] 1.766564

Bias Calculation

# Calculate the bias for each user and each movie using training dataset

# Bias value for each user
bias_users <- rowMeans(train, na.rm=TRUE) - raw_average

bias_users
##          U1          U2          U3          U4          U5          U6 
##  0.46721311  1.16721311 -0.36612022 -0.60421546 -0.36612022 -0.53278689 
##          U7          U8          U9         U10 
## -0.03278689 -0.03278689 -0.43278689  0.96721311
# Bias value for each movie
bias_movies <- colMeans(train, na.rm=TRUE) - raw_average

bias_movies
##          M1          M2          M3          M4          M5          M6 
## -0.28278689  0.07832423  0.25292740 -0.28278689 -1.03278689  0.82435597 
##          M7          M8 
## -0.03278689  0.53864169

Baseline Predictor

# Calculate the baseline predictors for every user-movie combination
baseline_pred <- expand.grid(bias_users, bias_movies) %>% 
  
  # Sum both user and movie bias and raw average for each combination
  mutate(baseline = Var1 + Var2 + raw_average) %>% 
  
  # Replace baseline predictor value greater than 5 equal to 5
  mutate(baseline = replace(baseline, baseline > 5, 5)) %>%
  
  # Select the baseline column
  select(baseline)

# Convert the baseline column into matrix format
baseline_pred <- matrix(baseline_pred$baseline, nrow = 10, ncol = 8, byrow = TRUE)

# Add dimension names to the matrix
dimnames(baseline_pred) <- list(c("U1","U2","U3","U4","U5","U6","U7","U8","U9","U10"),c("M1","M2","M3","M4","M5","M6","M7","M8"))

# Round each baseline predictor value to two decimal places
baseline_pred <- round(baseline_pred, 2)

baseline_pred
##       M1   M2   M3   M4   M5   M6   M7   M8
## U1  3.22 3.92 2.38 2.15 2.38 2.22 2.72 2.72
## U2  2.32 3.72 3.58 4.28 2.74 2.51 2.74 2.58
## U3  3.08 3.08 2.68 4.08 3.75 4.45 2.92 2.68
## U4  2.92 2.75 3.25 3.25 2.85 4.25 3.22 3.92
## U5  2.38 2.15 2.38 2.22 2.72 2.72 2.32 3.72
## U6  2.47 3.17 1.63 1.40 1.63 1.47 1.97 1.97
## U7  1.57 2.97 4.32 5.00 3.49 3.25 3.49 3.32
## U8  3.82 3.82 3.42 4.82 3.47 4.17 2.63 2.40
## U9  2.63 2.47 2.97 2.97 2.57 3.97 4.04 4.74
## U10 3.21 2.97 3.21 3.04 3.54 3.54 3.14 4.54

RSME

# Calculate the RMSE for the baseline predictors for the test dataset
pred_test_RSME <- (test - baseline_pred) ^ 2 %>%
   mean(na.rm = TRUE) %>%
   sqrt()

pred_test_RSME
## [1] 2.133279
# Calculate the RMSE for the baseline predictors for the training dataset
pred_train_RSME <- (train - baseline_pred) ^ 2 %>%
   mean(na.rm = TRUE) %>%
   sqrt()

pred_train_RSME
## [1] 1.452058

Summary Results

The results of RSME for baseline predictor show negative improvement for both training and test datasets. The training data is -2.70% and the test data is -20.76%. The percentage improvement is getting worst instead of yielding better prediction. This means that simply using just the raw average did much better job than the baseline predictor.

# Training dataset percent improvement

pred_improv_train <- scales::percent((1 - (pred_train_RSME/train_RMSE)), accuracy = 0.01)

pred_improv_train
## [1] "-2.70%"
# Test dataset percent improvement

pred_improv_test <- scales::percent((1 - (pred_test_RSME/test_RMSE)), accuracy = 0.01)

pred_improv_test
## [1] "-20.76%"