DATA 643 - Project 1

Project 1

This recommender system recommends movies to users.

The dataset used for this system is the latest small MovieLens dataset: 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users. (http://grouplens.org/datasets/movielens/latest/)

library(data.table)
library(recommenderlab)
library(reshape2)
# library(dplyr)
library(Matrix)
library(ggplot2)
library(useful)

# Load the MovieLens movie catalogue and the user ratings from CSV
movies <- read.csv("data/movies.csv", header = TRUE, sep = ",",
                   stringsAsFactors = FALSE)
ratings <- read.csv("data/ratings.csv", header = TRUE, sep = ",",
                    stringsAsFactors = FALSE)

head(movies)

##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller

head(ratings)

##   userId movieId rating  timestamp
## 1      1      31    2.5 1260759144
## 2      1    1029    3.0 1260759179
## 3      1    1061    3.0 1260759182
## 4      1    1129    2.0 1260759185
## 5      1    1172    4.0 1260759205
## 6      1    1263    2.0 1260759151

dim(movies)

## [1] 9125    3

dim(ratings)

## [1] 100004      4

GENRES

Extract genres from movie dataset and create matrix.

# Split each movie's pipe-delimited genre string into one column per position.
# tstrsplit() transposes the split result, so row i holds the genres of movie i
# and movies with fewer genres are padded with NA.
genres <- movies$genres
genres <- as.data.frame(tstrsplit(genres, "[|]", type.convert = TRUE),
                        stringsAsFactors = FALSE)
# Number the columns from the data instead of assuming there are exactly 10
colnames(genres) <- seq_len(ncol(genres))
dim(genres)

## [1] 9125   10

# Genres that get an indicator column; any other label found in the data
# (or an NA padding cell) is ignored.
genre_cats <- c("Action", "Adventure", "Animation", "Children", "Comedy",
                "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
                "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
                "Thriller", "War", "Western")

# Build a movie x genre 0/1 indicator table: cell [i, g] is 1 when genre g
# appears in any of movie i's split genre columns. The table is sized from
# `genres` and `genre_cats` rather than from hard-coded dimensions, and each
# genre is filled with one vectorised comparison over all movies.
genre_mat <- matrix(0L, nrow = nrow(genres), ncol = length(genre_cats),
                    dimnames = list(NULL, genre_cats))

for (genre in genre_cats) {
  # na.rm = TRUE: NA padding cells must not turn the whole row into NA
  genre_mat[, genre] <- as.integer(rowSums(genres == genre, na.rm = TRUE) > 0)
}

# Downstream code expects a data frame of integer columns named by genre
genre_mat <- as.data.frame(genre_mat, stringsAsFactors = FALSE)

dim(genre_mat)

## [1] 9125   18

# Prepend the first two movie columns (movieId, title) to the genre indicators
movie_id_cols <- movies[, 1:2]
genre_mat2 <- cbind(movie_id_cols, genre_mat)
corner(genre_mat2)

##   movieId                              title Action Adventure Animation
## 1       1                   Toy Story (1995)      0         1         1
## 2       2                     Jumanji (1995)      0         1         0
## 3       3            Grumpier Old Men (1995)      0         0         0
## 4       4           Waiting to Exhale (1995)      0         0         0
## 5       5 Father of the Bride Part II (1995)      0         0         0

# Keep only the genre indicator columns (everything except movieId and title,
# which occupy the first two positions)
genre_col_idx <- setdiff(seq_along(genre_mat2), 1:2)
moviegenres <- genre_mat2[, genre_col_idx]

corner(moviegenres)

##   Action Adventure Animation Children Comedy
## 1      0         1         1        1      1
## 2      0         1         0        1      0
## 3      0         0         0        0      1
## 4      0         0         0        0      1
## 5      0         0         0        0      1

dim(moviegenres)

## [1] 9125   18

RATINGS

Create ratings matrix.

# Drop the fourth column (timestamp); only userId, movieId and rating are used
ratings <- ratings[-4L]
head(ratings)

##   userId movieId rating
## 1      1      31    2.5
## 2      1    1029    3.0
## 3      1    1061    3.0
## 4      1    1129    2.0
## 5      1    1172    4.0
## 6      1    1263    2.0

dim(ratings)

## [1] 100004      3

# Convert ratings to a binary like/dislike signal:
#   1 = rating > 3, -1 = rating <= 3 (a rating of exactly 3 counts as -1).
# Done as a single vectorised replacement of the third (rating) column rather
# than a row-by-row loop over the data frame.
user_ratings <- ratings
user_ratings[, 3] <- ifelse(ratings[, 3] > 3, 1, -1)

# Pivot to wide form: one row per movieId, one column per userId.
# User/movie pairs with no rating come back as NA and are recoded to 0.
user_ratings2 <- dcast(user_ratings, movieId ~ userId,
                       value.var = "rating", na.rm = FALSE)
user_ratings2 <- replace(user_ratings2, is.na(user_ratings2), 0)
# Drop the leading movieId column so only the per-user columns remain
userratings <- user_ratings2[, -1]
corner(userratings)

##   1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 0
## 5 0 0 0 0 0

dim(userratings)

## [1] 9066  671

userratings.final <- userratings

# Sync up user ratings and movie genres: keep only movies that have at least
# one rating. A logical mask is used instead of `-which(...)`, because a
# negative index built from an empty which() selects zero rows (it would drop
# every movie when all of them have ratings).
# NOTE(review): row-for-row agreement with `userratings` additionally relies on
# `movies` being ordered by movieId, as the reshaped ratings are -- confirm.
has_rating <- movies$movieId %in% ratings$movieId
movies2 <- movies[has_rating, ]
rownames(movies2) <- NULL
moviegenres.final <- moviegenres[has_rating, ]
rownames(moviegenres.final) <- NULL
dim(moviegenres.final)

## [1] 9066   18

USER PROFILE MATRIX

Calculate the dot product of the user ratings and movie genre matrices to create the user profile matrix.

# Calculate dot product of user ratings vs. movie genres.
# crossprod(A, B) is t(A) %*% B, so cell [g, u] is the sum over movies of
# (genre indicator g) * (user u's +1/-1/0 rating): a genre x user score matrix.
# Dimensions follow the inputs instead of being hard-coded as 18 x 671.
user_profiles <- crossprod(as.matrix(moviegenres.final),
                           as.matrix(userratings.final))
# Strip genre/user labels so the result is a plain positional matrix
dimnames(user_profiles) <- NULL

corner(user_profiles)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]   -1   -5    0   38   12
## [2,]   -7    2   -4   42   18
## [3,]   -3    0   -2   22   10
## [4,]   -2   -2   -5   35   14
## [5,]   -5  -18    2   60   53

# Collapse each genre score to a 0/1 flag: 1 where the net score is positive
user_profiles <- ifelse(user_profiles > 0, 1, 0)
corner(user_profiles)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    0    0    0    1    1
## [2,]    0    1    0    1    1
## [3,]    0    0    0    1    1
## [4,]    0    0    0    1    1
## [5,]    0    0    1    1    1

SAMPLE USER RECOMMENDATION

Test similarity functions using data for User 100.

# View profile for selected user
user100_profile <- user_profiles[, 100]

# Add selected user profile to movie genre matrix
# (row 1 = the user's genre profile, remaining rows = one row per rated movie)
similarity_mat <- rbind.data.frame(user100_profile, moviegenres.final)
similarity_mat <- data.frame(lapply(similarity_mat, function(x) as.integer(x)))

library(proxy)
# Only the user-to-movie distances are needed, so compute the cross-distance
# between row 1 and the movie rows rather than all pairwise distances.
sim_input_e <- as.matrix(similarity_mat)
user_to_movies_e <- dist(sim_input_e[1, , drop = FALSE],
                         sim_input_e[-1, , drop = FALSE],
                         method = "Euclidean")
sim_results_e <- data.frame(V1 = as.vector(user_to_movies_e))
closest_rows_e <- which(sim_results_e == min(sim_results_e))

# The distances are positional over moviegenres.final, whose rows line up with
# movies2 (the catalogue filtered to rated movies), so titles must be looked up
# there and not in the unfiltered `movies`.
cat("User 100 - Recommended movies, using Euclidean similarity:\n")
print(movies2[closest_rows_e, 1:2], row.names = FALSE)

## [1] "User 100 - Recommended movies, using Euclidean similarity: c(2876, 3054, 5530, 6077, 26324, 70728, 82093, 114254, 115231, 118900)"                                                                                                                                                                                          
## [2] "User 100 - Recommended movies, using Euclidean similarity: c(\"Thumbelina (1994)\", \"Pokémon: The First Movie (1998)\", \"Simone (S1m0ne) (2002)\", \"Evil Under the Sun (1982)\", \"Harry and Tonto (1974)\", \"Bronson (2009)\", \"London Boulevard (2010)\", \"1971 (2014)\", \"St. Vincent (2014)\", \"Wild (2014)\")"

# Jaccard distance from the user's profile (row 1 of similarity_mat) to every
# movie row; computed as a cross-distance so only user-to-movie pairs are
# evaluated instead of all pairwise distances.
sim_input_j <- as.matrix(similarity_mat)
user_to_movies_j <- dist(sim_input_j[1, , drop = FALSE],
                         sim_input_j[-1, , drop = FALSE],
                         method = "Jaccard")
sim_results_j <- data.frame(V1 = as.vector(user_to_movies_j))
closest_rows_j <- which(sim_results_j == min(sim_results_j))

# Positions refer to the rated-movie rows, so look titles up in movies2
# (the filtered catalogue), not in the unfiltered `movies`.
cat("User 100 - Recommended movies, using Jaccard similarity:\n")
print(movies2[closest_rows_j, 1:2], row.names = FALSE)

## [1] "User 100 - Recommended movies, using Jaccard similarity: c(3054, 114254)"                                         
## [2] "User 100 - Recommended movies, using Jaccard similarity: c(\"Pokémon: The First Movie (1998)\", \"1971 (2014)\")"

RECOMMENDERLAB

Use recommenderlab to predict on ratings dataset.

# Build the recommenderlab rating matrix from the original (non-binary) ratings:
# pivot to one row per userId and one column per movieId, drop the leading
# userId column, then coerce the numeric matrix to a realRatingMatrix.
ratings_wide <- dcast(ratings, userId ~ movieId,
                      value.var = "rating", na.rm = FALSE)
ratings_mat_real <- as(as.matrix(ratings_wide[, -1]), "realRatingMatrix")
dim(ratings_mat_real)

## [1]  671 9066

Create model, generate top 10 recommendation results for sample user (User 100) using user-based collaborative filtering (UBCF) and “popular” algorithm.

# UBCF
model <- Recommender(ratings_mat_real, method = "UBCF",
                     param = list(method = "Cosine", nn = 20, normalize = "center")) # 20 closest users
recos <- predict(model, ratings_mat_real[100], n = 10)
recos_list <- as(recos, "list")

# Recommendations
# The recommended item labels are the rating-matrix column names, i.e. movieId
# values, not row numbers of `movies`; titles are therefore resolved by
# matching on movieId. This also copes with fewer than 10 recommendations.
recommended_ids <- as.integer(recos_list[[1]])
results <- matrix(movies$title[match(recommended_ids, movies$movieId)], ncol = 1)
results

##       [,1]                         
##  [1,] "Scarlet Letter, The (1926)" 
##  [2,] "Mary Reilly (1996)"         
##  [3,] "Only You (1994)"            
##  [4,] "Month by the Lake, A (1995)"
##  [5,] "Scream 2 (1997)"            
##  [6,] "Caught (1996)"              
##  [7,] "Eye for an Eye (1996)"      
##  [8,] "Top Hat (1935)"             
##  [9,] "Crooklyn (1994)"            
## [10,] "Substitute, The (1996)"

# POPULAR
model2 <- Recommender(ratings_mat_real, method = "POPULAR",
                      param = list(normalize = "Z-score"))
recos2 <- predict(model2, ratings_mat_real[100], n = 10)
recos_list2 <- as(recos2, "list")

# Recommendations
# Store the titles in results2 (the original allocated results2 but filled and
# printed `results`, overwriting the UBCF output). Item labels are movieId
# values, so titles are resolved by matching on movieId rather than by using
# the label as a row number of `movies`.
recommended_ids2 <- as.integer(recos_list2[[1]])
results2 <- matrix(movies$title[match(recommended_ids2, movies$movieId)], ncol = 1)
results2

##       [,1]                                         
##  [1,] "Crooklyn (1994)"                            
##  [2,] "Tales from the Hood (1995)"                 
##  [3,] "Snow White and the Seven Dwarfs (1937)"     
##  [4,] "Only You (1994)"                            
##  [5,] "William Shakespeare's Romeo + Juliet (1996)"
##  [6,] "Delta of Venus (1995)"                      
##  [7,] "Street Fighter (1994)"                      
##  [8,] "Mighty Aphrodite (1995)"                    
##  [9,] "Smilla's Sense of Snow (1997)"              
## [10,] "Papillon (1973)"

Results from hand-coded Euclidean similarity and Jaccard similarity do not agree at all with the results from recommenderlab using the built-in UBCF and popular algorithms – although the results from those two algorithms also do not agree.

DATA 643 - Project 1

Honey Berk

February 14, 2017

Project 1

This recommender system recommends movies to users.

The dataset used for this system is the latest small MovieLens dataset: 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users. (http://grouplens.org/datasets/movielens/latest/)

GENRES

Extract genres from movie dataset and create matrix.

RATINGS

Create ratings matrix.

USER PROFILE MATRIX

Calculate the dot product of the user ratings and movie genre matrices to create the user profile matrix.

SAMPLE USER RECOMMENDATION

Test similarity functions using data for User 100.

RECOMMENDERLAB

Use recommenderlab to predict on ratings dataset.

Create model, generate top 10 recommendation results for sample user (User 100) using user-based collaborative filtering (UBCF) and “popular” algorithm.

Results from hand-coded Euclidean similarity and Jaccard similarity do not agree at all with the results from recommenderlab using the built-in UBCF and popular algorithms – although the results from those two algorithms also do not agree.