This recommender system is built to recommend movies to users based on user ratings. The dataset was collected from grouplens.org.
library(tidyverse)
library(Matrix)
ratings <- read.csv("https://raw.githubusercontent.com/javernw/DATA612-Recommender-Systems/master/ratings.csv", header = T, stringsAsFactors = F)
#copy ratings
tbl_ratings <- ratings %>% select(-timestamp)
tbl_ratings$userId <- as.factor(tbl_ratings$userId)
tbl_ratings$movieId <- as.factor(tbl_ratings$movieId)
UI_matrix <- sparseMatrix(
i = as.integer(tbl_ratings$userId),
j = as.integer(tbl_ratings$movieId),
x = tbl_ratings$rating,
dimnames = list(levels(tbl_ratings$userId), levels(tbl_ratings$movieId))
)
UI_matrix
610 x 9724 sparse Matrix of class "dgCMatrix"
[[ suppressing 32 column names <U+393C><U+3E31>1<U+393C><U+3E32>, <U+393C><U+3E31>2<U+393C><U+3E32>, <U+393C><U+3E31>3<U+393C><U+3E32> ... ]]
[[ suppressing 32 column names <U+393C><U+3E31>1<U+393C><U+3E32>, <U+393C><U+3E31>2<U+393C><U+3E32>, <U+393C><U+3E31>3<U+393C><U+3E32> ... ]]
1 4.0 . 4 . . 4 . . . . . . . . . . . . . . . . . . . . . . . . . . ......
2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
3 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.5 . ......
4 . . . . . . . . . . . . . . . . . . . . 3 . . . . . . . . . . 2 ......
5 4.0 . . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . ......
6 . 4 5 3 5 4 4 3 . 3 4 . 3 . 4 4 4 . 2 . 2 5 . 4 3 4 3 . . . 3.0 4 ......
7 4.5 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
8 . 4 . . . . . . . 2 4 . . . . . . . . . 4 . . . . . . . . . . 3 ......
9 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
10 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
11 . . . . . 5 . . . 3 . . . . . . . . . . . . . . . . . . . . . . ......
12 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
13 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
14 . . . 3 . . 3 . . . . . . . . . . . 1 . . . . . 4 . . . . . . 4 ......
15 2.5 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
16 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
..............................
........suppressing 9692 columns and 579 rows in show(); maybe adjust 'options(max.print= *, width = *)'
..............................
[[ suppressing 32 column names <U+393C><U+3E31>1<U+393C><U+3E32>, <U+393C><U+3E31>2<U+393C><U+3E32>, <U+393C><U+3E31>3<U+393C><U+3E32> ... ]]
596 4.0 . . . . . . . . . . . . . . . . . . . . . .
597 4.0 . . . . 3.0 1.0 . . 3.0 3.0 . . . . . 3.0 . . . 5.0 . .
598 . . . . . . . . . . . . . . . . . . . . . . .
599 3.0 2.5 1.5 . . 4.5 2.5 . 1.5 3.5 2.5 1.5 . . 1.5 3.0 3.5 3 3 1.5 4.0 . 1
600 2.5 4.0 . 1.5 2.5 . 3.5 . . . . . . . . . 3.5 . 3 . . . .
601 4.0 . . . . . . . . . . . . . . . . . . . . . .
602 . 4.0 . . . 3.0 . . . 3.0 3.0 . . 5 . 3.0 . . 2 . 4.0 3 .
603 4.0 . . . . 4.0 . . . . . . . . . 4.0 3.0 . . . 5.0 . .
604 3.0 5.0 . . 3.0 3.0 . . . . . . . 4 . . 4.0 . 1 . . 3 4
605 4.0 3.5 . . . . . . . . . . . . . . . . . . . . .
606 2.5 . . . . . 2.5 . . . 2.5 . . . 3.5 . 4.0 4 2 . . . .
607 4.0 . . . . . . . . . 3.0 . . . . . . . . . . . .
608 2.5 2.0 2.0 . . . . . . 4.0 . . . . . 4.5 . . 2 . 3.5 . .
609 3.0 . . . . . . . . 4.0 . . . . . . . . . . . . .
610 5.0 . . . . 5.0 . . . . . . . . . 4.5 . . . . . . .
596 . . . . . . . . 3.5 ......
597 . . . . . . . . . ......
598 . . . . . . . . . ......
599 2.5 . 2.5 . . 3.5 . 2 3.0 ......
600 2.0 . . . . 4.5 . . 4.5 ......
601 . . . . . . . . . ......
602 . 4 . . . 2.0 . . 3.0 ......
603 . 4 . . 5.0 2.0 4 . 3.0 ......
604 . 3 . . . . . . 4.0 ......
605 . . . . 4.0 . . . . ......
606 . . . . 3.5 4.5 . . 4.0 ......
607 . 3 . . . . . . . ......
608 2.0 . . . . . . 3 3.5 ......
609 . . . . . . . . . ......
610 . . . . . . . . 4.5 ......
# break up into training and test sets
train <- sample(x = c(T, F), size = nrow(UI_matrix), replace = T, prob = c(0.8, 0.2))
training_data <- UI_matrix[train, ]
testing_data <- UI_matrix[-train, ]
avg_train <- mean(training_data@x)
avg_train
[1] 3.49839
RMSE <- function(m, o){
sqrt(mean((m - o)^2))
}
train_rmse <- RMSE(avg_train, training_data)
train_rmse
[1] 3.470717
test_rmse <- RMSE(avg_train, testing_data)
test_rmse
[1] 3.471206
user_bias <- rowMeans(training_data) - avg_train
item_bias <- colMeans(training_data) - avg_train
baseline_predictors <- user_bias + item_bias + avg_train
#ratings cannot be lower than 1 or higher than 5
baseline_predictors[baseline_predictors < 1] <- 1
baseline_predictors[baseline_predictors > 5] <- 5
Train
base_train_rmse <- RMSE(avg_train, baseline_predictors)
Test
avg_test <- mean(testing_data@x)
test_user_bias <- rowMeans(testing_data) - avg_test
test_item_bias <- colMeans(testing_data) - avg_test
test_baseline_predictors <- avg_test + test_item_bias + test_user_bias
longer object length is not a multiple of shorter object length
#ratings cannot be lower than 1 or higher than 5
test_baseline_predictors[test_baseline_predictors < 1] <- 1
test_baseline_predictors[test_baseline_predictors > 5] <- 5
base_test_rmse <- RMSE(avg_test, test_baseline_predictors)
test_ <- (1 - (base_test_rmse / test_rmse)) * 100
test_
[1] 27.99152
train_ <- (1 - (base_train_rmse / train_rmse)) * 100
train_
[1] 28.01515
Based on the calculations above, the recommender system improved by 28%
with the test data and 28%
with the training data when making predictions.
https://stackoverflow.com/questions/26237688/rmse-root-mean-square-deviation-calculation-in-r