DATA643

The recommender system example was built with data from MovieLens linked from grouplens.org.

ratings <- read.csv("ratings.csv", header = T) #500K+ rows! 
movies <- read.csv("movies.csv", header = T)

set.seed(5)
#set.seed(100)
# pull random selection of 6 movies
# add text-title of movie, get movie IDs
little_sample <- sample_n(ratings, 10) %>% 
  left_join(movies, by = "movieId") %>% 
  mutate(title = as.character(title))

my_movies <- little_sample$movieId

Below, I’ve produced the user/item matrix that will be the basis for the remainder of the assignment:

# I found some user Ids that had a lot of ratings
my_users <- c(27248, 22966, 24688, 130710, 32806, 73026)

sm_ratings <- ratings %>% 
  filter(userId %in% my_users,
         movieId %in% my_movies) %>% 
  left_join(movies, by = "movieId") %>% 
  mutate(title = as.character(title)) %>% 
  select(userId, movieId, title, rating) 

sm_ratmatrix <- sm_ratings %>% 
  group_by(movieId, title) %>% 
  spread(userId, rating)
rm(ratings, movies) # clear up some space
sm_ratmatrix

## # A tibble: 9 x 8
## # Groups:   movieId, title [9]
##   movieId title           `22966` `24688` `27248` `32806` `73026` `130710`
##     <int> <chr>             <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
## 1       6 Heat (1995)         4.5     4         3     4         3        4
## 2     344 Ace Ventura: P~     3.5     2.5       4     2.5      NA       NA
## 3     480 Jurassic Park ~     4       3         4     3.5       3        3
## 4     707 Mulholland Fal~    NA      NA         4    NA         3       NA
## 5     912 Casablanca (19~     3.5     3         5     4.5       5        5
## 6    2871 Deliverance (1~    NA       3.5       4     3.5       4        4
## 7    3275 Boondock Saint~    NA       5        NA     4.5      NA        2
## 8   49272 Casino Royale ~    NA       3.5      NA    NA         3       NA
## 9   56152 Enchanted (200~    NA       3.5      NA    NA        NA       NA

As shown above, 9 movies are listed with 6 different user IDs at the top. There are several missing values, each indicating that the given user has not rated the given movie.

Next, I create my training and test sets (below, I’ve previewed the training set):

set.seed(4)
s_size <- floor(0.75 * nrow(sm_ratmatrix)) # 25% for test set
train_index <- sample(seq_len(nrow(sm_ratmatrix)), size = s_size)

train <- sm_ratmatrix[train_index, ] %>% ungroup() %>% select(-movieId, -title)
test <- sm_ratmatrix[-train_index, ] %>% ungroup() %>% select(-movieId, -title)

tr <- as.matrix(as.data.frame(lapply(train, as.numeric)))
dimnames(tr) <- NULL

te <- as.matrix(as.data.frame(lapply(test, as.numeric)))
dimnames(te) <- NULL

tr

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]   NA  3.5    4  3.5    4    4
## [2,]  4.5  4.0    3  4.0    3    4
## [3,]  4.0  3.0    4  3.5    3    3
## [4,]  3.5  2.5    4  2.5   NA   NA
## [5,]  3.5  3.0    5  4.5    5    5
## [6,]   NA  3.5   NA   NA   NA   NA

Raw means for the training and testing data (only the training set shown):

rm_tr <- matrix(1, nrow(tr), ncol(tr)) * round(mean(tr, na.rm = T),2)
rm_te <- matrix(1, nrow(te), ncol(te)) * round(mean(te, na.rm = T),2)
rm_tr

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 3.71 3.71 3.71 3.71 3.71 3.71
## [2,] 3.71 3.71 3.71 3.71 3.71 3.71
## [3,] 3.71 3.71 3.71 3.71 3.71 3.71
## [4,] 3.71 3.71 3.71 3.71 3.71 3.71
## [5,] 3.71 3.71 3.71 3.71 3.71 3.71
## [6,] 3.71 3.71 3.71 3.71 3.71 3.71

RSME:

sqrt(mean((tr - rm_tr)**2, na.rm = T)) # train RSME

## [1] 0.6869914

sqrt(mean((te - rm_te)**2, na.rm = T)) # test RSME

## [1] 0.9422087

Baseline predictor is the raw average plus the bias from the user, plus the bias from the movie. Below, I’ve recreated the original rating matrix with the bias for each user and movie in the training set.

movie_bias <- round(colMeans(tr, na.rm=T),2)-round(mean(tr, na.rm = T),2)
user_bias <- round(rowMeans(tr, na.rm=T),2) -round(mean(tr, na.rm = T),2)
tmp <- tr

tmp <- cbind(tmp, user_bias)
tmp <- rbind(tmp, movie_bias = c(movie_bias, NA))
tmp

##                                            user_bias
##              NA  3.50 4.00  3.50 4.00 4.00      0.09
##            4.50  4.00 3.00  4.00 3.00 4.00      0.04
##            4.00  3.00 4.00  3.50 3.00 3.00     -0.29
##            3.50  2.50 4.00  2.50   NA   NA     -0.59
##            3.50  3.00 5.00  4.50 5.00 5.00      0.62
##              NA  3.50   NA    NA   NA   NA     -0.21
## movie_bias 0.17 -0.46 0.29 -0.11 0.04 0.29        NA

Raw average matrix of training set:

rm_tr

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 3.71 3.71 3.71 3.71 3.71 3.71
## [2,] 3.71 3.71 3.71 3.71 3.71 3.71
## [3,] 3.71 3.71 3.71 3.71 3.71 3.71
## [4,] 3.71 3.71 3.71 3.71 3.71 3.71
## [5,] 3.71 3.71 3.71 3.71 3.71 3.71
## [6,] 3.71 3.71 3.71 3.71 3.71 3.71

Baseline prediction matrix:

x <- rm_tr + user_bias #+ movie_bias
result <- t(x) + movie_bias
result

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 3.97 3.92 3.59 3.29 4.50 3.67
## [2,] 3.34 3.29 2.96 2.66 3.87 3.04
## [3,] 4.09 4.04 3.71 3.41 4.62 3.79
## [4,] 3.69 3.64 3.31 3.01 4.22 3.39
## [5,] 3.84 3.79 3.46 3.16 4.37 3.54
## [6,] 4.09 4.04 3.71 3.41 4.62 3.79

RSME baseline

sqrt(mean((result - rm_tr)**2, na.rm = T)) # RSME

## [1] 0.4596859

References:

DATA643_P1

jbrnbrg

June 8, 2018