Project 1

This recommender system recommends movies to users.

The dataset used for this system is the latest small MovieLens dataset: 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users. (http://grouplens.org/datasets/movielens/latest/)

library(data.table)
library(recommenderlab)
library(reshape2)
# library(dplyr)
library(Matrix)
library(ggplot2)
library(useful)

# Read files
# Both CSVs are read relative to the working directory (data/ must exist there).
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so they are not safe stand-ins for the logical constants.
movies <- read.csv(file = "data/movies.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
ratings <- read.csv(file = "data/ratings.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Preview the first rows and the dimensions of each table
head(movies)
##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller
head(ratings)
##   userId movieId rating  timestamp
## 1      1      31    2.5 1260759144
## 2      1    1029    3.0 1260759179
## 3      1    1061    3.0 1260759182
## 4      1    1129    2.0 1260759185
## 5      1    1172    4.0 1260759205
## 6      1    1263    2.0 1260759151
dim(movies)
## [1] 9125    3
dim(ratings)
## [1] 100004      4

GENRES

Extract genres from movie dataset and create matrix.

# Split genres at '|' and transpose: each movie's pipe-delimited genre string
# becomes one row, with one column per genre slot (shorter rows are NA-filled).
genres <- movies$genres
genres <- as.data.frame(tstrsplit(genres, '[|]', type.convert=TRUE), stringsAsFactors=FALSE)
# Number the slot columns from the data rather than assuming exactly ten slots;
# a hard-coded 1:10 errors as soon as the widest movie has a different count.
colnames(genres) <- seq_len(ncol(genres))
dim(genres)
## [1] 9125   10
# Genre vocabulary used as the columns of the movie/genre indicator matrix
genre_cats <- c("Action", "Adventure", "Animation", "Children", "Comedy", "Crime","Documentary", 
                "Drama", "Fantasy","Film-Noir", "Horror", "Musical", "Mystery","Romance","Sci-Fi", 
                "Thriller", "War", "Western")

# Create matrix for genres, set column names, convert to df
# Result: one row per movie, one 0/1 integer column per entry of genre_cats.
# Dimensions come from the inputs (no hard-coded 9126 x 18), and the matrix
# stays integer throughout instead of being coerced to character by a header
# row that later has to be stripped and converted back.
genre_tokens <- as.matrix(genres)
genre_mat <- matrix(0L, nrow = nrow(genre_tokens), ncol = length(genre_cats),
                    dimnames = list(NULL, genre_cats))

# Look up every token's column in one vectorised pass. Tokens that are NA
# (padding) or not in genre_cats get NA from match() and are skipped, which
# matches the old per-cell which() lookup that found no column for them.
genre_col <- match(genre_tokens, genre_cats)
is_known <- !is.na(genre_col)
# match() walks the matrix column-major, the same order row() is subset in,
# so the (row, column) pairs line up.
genre_mat[cbind(row(genre_tokens)[is_known], genre_col[is_known])] <- 1L

genre_mat <- as.data.frame(genre_mat, stringsAsFactors = FALSE)

dim(genre_mat)
## [1] 9125   18
# Create movie/genre matrix: movie identifiers followed by the genre indicators
id_cols <- c("movieId", "title")
genre_mat2 <- cbind(movies[, id_cols], genre_mat)
corner(genre_mat2)
##   movieId                              title Action Adventure Animation
## 1       1                   Toy Story (1995)      0         1         1
## 2       2                     Jumanji (1995)      0         1         0
## 3       3            Grumpier Old Men (1995)      0         0         0
## 4       4           Waiting to Exhale (1995)      0         0         0
## 5       5 Father of the Bride Part II (1995)      0         0         0
# Keep only the genre indicator columns (drop movieId and title)
moviegenres <- genre_mat2[, setdiff(names(genre_mat2), id_cols)]

corner(moviegenres)
##   Action Adventure Animation Children Comedy
## 1      0         1         1        1      1
## 2      0         1         0        1      0
## 3      0         0         0        0      1
## 4      0         0         0        0      1
## 5      0         0         0        0      1
dim(moviegenres)
## [1] 9125   18

RATINGS

Create ratings matrix.

# Delete timestamp (the fourth column); userId, movieId and rating remain
ratings <- ratings[, names(ratings) != "timestamp"]
head(ratings)
##   userId movieId rating
## 1      1      31    2.5
## 2      1    1029    3.0
## 3      1    1061    3.0
## 4      1    1129    2.0
## 5      1    1172    4.0
## 6      1    1263    2.0
dim(ratings)
## [1] 100004      3
# Convert ratings to binary: 1 = rating > 3, -1 = rating <= 3
# (a rating of exactly 3 counts as a dislike).
# Done as a single vectorised assignment: updating a data frame one row at a
# time over ~100k rows is needlessly slow and yields the same values.
user_ratings <- ratings
user_ratings$rating <- ifelse(user_ratings$rating > 3, 1, -1)

# Reshape data (row = movieId, col = userId), convert NAs to 0, remove movieId identifier column
# After this step each cell is 1 (liked), -1 (disliked) or 0 (not rated).
user_ratings2 <- dcast(user_ratings, movieId ~ userId, value.var = "rating", na.rm = FALSE)
user_ratings2[is.na(user_ratings2)] <- 0
userratings <- user_ratings2[, -1]
corner(userratings)
##   1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 0
## 5 0 0 0 0 0
dim(userratings)
## [1] 9066  671
userratings.final <- userratings

# Sync up user ratings and movie genres: keep only movies that have at least
# one rating, so the genre rows correspond to the rows of the ratings matrix.
# A logical mask is used instead of x[-which(cond), ]: if every movie is rated,
# which() returns integer(0) and negative indexing with it drops ALL rows.
# NOTE(review): row alignment with the ratings matrix assumes `movies` is
# ordered by movieId, as the cast ratings table is -- confirm for new data.
is_rated <- movies$movieId %in% ratings$movieId
movies2 <- movies[is_rated, ]
rownames(movies2) <- NULL
moviegenres.final <- moviegenres[is_rated, ]
rownames(moviegenres.final) <- NULL
dim(moviegenres.final)
## [1] 9066   18

USER PROFILE MATRIX

Calculate the dot product of the user ratings and movie genre matrices to create the user profile matrix.

# Calculate dot product of user ratings vs. movie genres
# Entry [g, u] is the sum over movies of (genre indicator g) * (user u's
# 1 / -1 / 0 rating), i.e. t(genres) %*% ratings. crossprod() computes this in
# one call and takes its dimensions from the inputs instead of a hard-coded
# 18 x 671 preallocation filled by a double loop.
user_profiles <- crossprod(as.matrix(moviegenres.final), as.matrix(userratings.final))
# Strip the genre/userId dimnames so the result is a plain unnamed matrix
dimnames(user_profiles) <- NULL

corner(user_profiles)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]   -1   -5    0   38   12
## [2,]   -7    2   -4   42   18
## [3,]   -3    0   -2   22   10
## [4,]   -2   -2   -5   35   14
## [5,]   -5  -18    2   60   53
# Convert to binary data for processing: 1 where the net score is positive
user_profiles <- as.matrix((user_profiles > 0) + 0)
corner(user_profiles)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    0    0    0    1    1
## [2,]    0    1    0    1    1
## [3,]    0    0    0    1    1
## [4,]    0    0    0    1    1
## [5,]    0    0    1    1    1

SAMPLE USER RECOMMENDATION

Test similarity functions using data for User 100.

# View profile for selected user
user100_profile <- user_profiles[, 100]

# Add selected user profile to movie genre matrix
# (row 1 = the user's genre profile, remaining rows = one per rated movie)
similarity_mat <- rbind.data.frame(user100_profile, moviegenres.final)
similarity_mat <- data.frame(lapply(similarity_mat,function(x){as.integer(x)}))

library(proxy)
# Only the profile-vs-movie distances are used, so compute the cross-distance
# between row 1 and the movie rows rather than all pairwise distances among
# ~9000 rows (tens of millions of values) and slicing off the first few.
# The result is reshaped to the same one-column data frame as before, with
# entry k holding the distance to row k of moviegenres.final / movies2.
sim_results_e <- proxy::dist(similarity_mat[1, ], similarity_mat[-1, ], method = "Euclidean")
sim_results_e <- as.data.frame(as.matrix(as.vector(sim_results_e)))
closest_rows_e <- which(sim_results_e == min(sim_results_e))
# Look titles up in movies2 (the rated-only table the distance rows refer to).
# Indexing the unfiltered `movies` table shifts every match that comes after
# the first unrated movie and reports the wrong titles.
# NOTE(review): the rendered output below predates this fix -- re-run to regenerate.
paste("User 100 - Recommended movies, using Euclidean similarity:", movies2[closest_rows_e, 1:2])
## [1] "User 100 - Recommended movies, using Euclidean similarity: c(2876, 3054, 5530, 6077, 26324, 70728, 82093, 114254, 115231, 118900)"
## [2] "User 100 - Recommended movies, using Euclidean similarity: c(\"Thumbelina (1994)\", \"Pokémon: The First Movie (1998)\", \"Simone (S1m0ne) (2002)\", \"Evil Under the Sun (1982)\", \"Harry and Tonto (1974)\", \"Bronson (2009)\", \"London Boulevard (2010)\", \"1971 (2014)\", \"St. Vincent (2014)\", \"Wild (2014)\")"
sim_results_j <- proxy::dist(similarity_mat[1, ], similarity_mat[-1, ], method = "Jaccard")
sim_results_j <- as.data.frame(as.matrix(as.vector(sim_results_j)))
closest_rows_j <- which(sim_results_j == min(sim_results_j))
# NOTE(review): the rendered output below predates the movies2 fix -- re-run to regenerate.
paste("User 100 - Recommended movies, using Jaccard similarity:", movies2[closest_rows_j, 1:2])
## [1] "User 100 - Recommended movies, using Jaccard similarity: c(3054, 114254)"
## [2] "User 100 - Recommended movies, using Jaccard similarity: c(\"Pokémon: The First Movie (1998)\", \"1971 (2014)\")"

RECOMMENDERLAB

Use recommenderlab to predict on ratings dataset.

# Create realRatingMatrix from ratings data:
#   1. cast to a wide table with one row per userId and one column per movieId
#      (cells without a rating are NA),
#   2. drop the leading userId column and convert to a plain matrix,
#   3. coerce to recommenderlab's realRatingMatrix class.
ratings_mat_real <- dcast(ratings, userId ~ movieId, value.var = "rating", na.rm = FALSE)
ratings_mat_real <- as.matrix(ratings_mat_real[, -1])

ratings_mat_real <- as(ratings_mat_real, "realRatingMatrix")
dim(ratings_mat_real)
## [1]  671 9066