This system recommends movies to users. The dataset contains ratings from 10 users for 8 movies and was built by me with Excel's random number generator. Ratings range from 1 to 5, where 1 is the lowest rating and 5 is the highest.
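As an aside, a comparable synthetic dataset could also be generated directly in R instead of Excel. The sketch below is only illustrative; the seed, the sampling scheme, and the name sim_ratings are assumptions for this sketch and do not reproduce the actual ratings used in this project.
# Illustrative only: simulate a 10 x 8 user-item rating matrix on a 1-5 scale
set.seed(1)  # arbitrary seed for this sketch, not the one behind the real dataset
sim_ratings <- matrix(sample(1:5, 10 * 8, replace = TRUE), nrow = 10,
                      dimnames = list(paste0("U", 1:10), paste0("M", 1:8)))
# Blank out a handful of cells to mimic movies a user has not rated
sim_ratings[sample(length(sim_ratings), 8)] <- NA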
# Load required packages
library(tidyverse)
# Load movie rating dataset
movies_rating <- read.csv("https://raw.githubusercontent.com/SieSiongWong/DATA-612/master/Movies%20Rating%20Dataset.csv")
head(movies_rating)
## user M1 M2 M3 M4 M5 M6 M7 M8
## 1 U1 2 3 4 5 4 5 5 3
## 2 U2 5 4 NA 3 1 4 5 NA
## 3 U3 5 2 1 3 2 NA 1 4
## 4 U4 1 1 1 2 2 5 1 5
## 5 U5 3 1 4 1 3 NA 4 4
## 6 U6 NA 3 NA 2 2 3 3 2
# Gather the movies rating dataset from wide to long format
movies_rating <- gather(movies_rating, "movie", "rating", -user)
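# Note: gather() is superseded in recent tidyr releases; an equivalent call using
# pivot_longer() (shown here only as an alternative, not executed) would be:
#   movies_rating <- pivot_longer(movies_rating, cols = -user,
#                                 names_to = "movie", values_to = "rating")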
# Convert the rating dataset into matrix format
user_item <- matrix(movies_rating$rating,
                    nrow = length(unique(movies_rating$user)),
                    byrow = FALSE,
                    dimnames = list(c("U1","U2","U3","U4","U5","U6","U7","U8","U9","U10"),
                                    c("M1","M2","M3","M4","M5","M6","M7","M8")))
# Break the rating matrix into training and test datasets
# Set the seed for random sampling
set.seed(123)
# Copy user item matrix dataset
train <- user_item
# Randomly set 11 elements to 0; these held-out cells will form the test dataset
train[sample(1:length(train), 11, replace = FALSE)] <- 0
# Test dataset
test <- user_item - train
test[test==0] <- NA
# Training dataset
train[train==0] <- NA
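# Optional sanity check: some of the 11 sampled cells may already have been NA in the
# original matrix, so the test dataset can hold fewer than 11 actual ratings
sum(!is.na(test))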
# Calculate the raw average (mean) of all observed ratings; this single value is the
# prediction for every user-item combination
raw_average <- mean(train, na.rm = TRUE)
raw_average
## [1] 3.032787
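Here RMSE is the root mean squared error, sqrt(mean((observed - predicted)^2)), taken over the non-missing ratings. For the raw-average predictor, every prediction is the same value of about 3.03.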
# Calculate the RMSE of the raw average for both the training and test datasets
# Squared-error function
se <- function(x) {
  (x - raw_average) ^ 2
}
# Training dataset RMSE
train_RMSE <- sapply(train, se) %>%
  mean(na.rm = TRUE) %>%
  sqrt()
train_RMSE
## [1] 1.413833
# Test dataset RMSE
test_RMSE <- sapply(test, se) %>%
  mean(na.rm = TRUE) %>%
  sqrt()
test_RMSE
## [1] 1.766564
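Because se() operates element-wise on an entire matrix, the same two RMSE values can also be computed without sapply(); the short sketch below is equivalent.
# Equivalent, fully vectorized form of the two RMSE calculations above
sqrt(mean(se(train), na.rm = TRUE))  # training RMSE
sqrt(mean(se(test), na.rm = TRUE))   # test RMSE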
# Calculate the bias for each user and each movie using the training dataset
# Bias value for each user
bias_users <- rowMeans(train, na.rm=TRUE) - raw_average
bias_users
## U1 U2 U3 U4 U5 U6
## 0.46721311 1.16721311 -0.36612022 -0.60421546 -0.36612022 -0.53278689
## U7 U8 U9 U10
## -0.03278689 -0.03278689 -0.43278689 0.96721311
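A user's bias is simply that user's mean training rating minus the raw average. For example, U1's ratings in the training set average 3.50, so U1's bias is 3.50 - 3.03 ≈ 0.47.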
# Bias value for each movie
bias_movies <- colMeans(train, na.rm=TRUE) - raw_average
bias_movies
## M1 M2 M3 M4 M5 M6
## -0.28278689 0.07832423 0.25292740 -0.28278689 -1.03278689 0.82435597
## M7 M8
## -0.03278689 0.53864169
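The baseline predictor for a user-movie pair adds both biases to the raw average, i.e. baseline(u, m) = raw average + user bias + movie bias. For U1 and M1 this gives 3.03 + 0.47 + (-0.28) ≈ 3.22, which is the U1/M1 entry in the matrix below.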
# Calculate the baseline predictors for every user-movie combination
baseline_pred <- expand.grid(bias_users, bias_movies) %>%
  # Sum the user bias, the movie bias, and the raw average for each combination
  mutate(baseline = Var1 + Var2 + raw_average) %>%
  # Cap baseline predictor values greater than 5 at 5, the top of the rating scale
  mutate(baseline = replace(baseline, baseline > 5, 5)) %>%
  # Keep only the baseline column
  select(baseline)
# Convert the baseline column into matrix format
baseline_pred <- matrix(baseline_pred$baseline, nrow = 10, ncol = 8, byrow = TRUE)
# Add dimension names to the matrix
dimnames(baseline_pred) <- list(c("U1","U2","U3","U4","U5","U6","U7","U8","U9","U10"),
                                c("M1","M2","M3","M4","M5","M6","M7","M8"))
# Round each baseline predictor value to two decimal places
baseline_pred <- round(baseline_pred, 2)
baseline_pred
## M1 M2 M3 M4 M5 M6 M7 M8
## U1 3.22 3.92 2.38 2.15 2.38 2.22 2.72 2.72
## U2 2.32 3.72 3.58 4.28 2.74 2.51 2.74 2.58
## U3 3.08 3.08 2.68 4.08 3.75 4.45 2.92 2.68
## U4 2.92 2.75 3.25 3.25 2.85 4.25 3.22 3.92
## U5 2.38 2.15 2.38 2.22 2.72 2.72 2.32 3.72
## U6 2.47 3.17 1.63 1.40 1.63 1.47 1.97 1.97
## U7 1.57 2.97 4.32 5.00 3.49 3.25 3.49 3.32
## U8 3.82 3.82 3.42 4.82 3.47 4.17 2.63 2.40
## U9 2.63 2.47 2.97 2.97 2.57 3.97 4.04 4.74
## U10 3.21 2.97 3.21 3.04 3.54 3.54 3.14 4.54
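As an aside, base R's outer() offers a compact way to add each user bias to each movie bias and form a user-by-movie grid directly. The sketch below is only an alternative construction and is not used for the results that follow; baseline_alt is a name introduced here for illustration.
# Alternative sketch: combine the bias vectors and the raw average with outer()
baseline_alt <- raw_average + outer(bias_users, bias_movies, "+")
# Cap predictions at 5, the top of the rating scale
baseline_alt <- pmin(baseline_alt, 5)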
# Calculate the RMSE for the baseline predictors for the test dataset
pred_test_RMSE <- (test - baseline_pred) ^ 2 %>%
  mean(na.rm = TRUE) %>%
  sqrt()
pred_test_RMSE
## [1] 2.133279
# Calculate the RMSE for the baseline predictors for the training dataset
pred_train_RMSE <- (train - baseline_pred) ^ 2 %>%
  mean(na.rm = TRUE) %>%
  sqrt()
pred_train_RMSE
## [1] 1.452058
The RMSE results for the baseline predictor show a negative improvement over the raw average on both datasets: -2.70% on the training data and -20.76% on the test data. Instead of yielding better predictions, the baseline predictor performs worse, which means that simply using the raw average did a much better job here.
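Concretely, the improvement is computed as 1 - (baseline RMSE / raw-average RMSE); for the training data this is 1 - 1.452058 / 1.413833 ≈ -0.027, or -2.70%, and the code below repeats the calculation for both datasets.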
# Training dataset percent improvement
pred_improv_train <- scales::percent((1 - (pred_train_RMSE/train_RMSE)), accuracy = 0.01)
pred_improv_train
## [1] "-2.70%"
# Test dataset percent improvement
pred_improv_test <- scales::percent((1 - (pred_test_RMSE/test_RMSE)), accuracy = 0.01)
pred_improv_test
## [1] "-20.76%"