library(data.table)
library(recommenderlab)
library(reshape2)
# library(dplyr)
library(Matrix)
library(ggplot2)
library(useful)
# Read files ----
# Load the movie catalogue and the user ratings from CSV. Strings are kept as
# character vectors (not factors) so the genre column can be split as text.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
movies <- read.csv(file = "data/movies.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
ratings <- read.csv(file = "data/ratings.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
head(movies)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## 6 6 Heat (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
## 6 Action|Crime|Thriller
head(ratings)
## userId movieId rating timestamp
## 1 1 31 2.5 1260759144
## 2 1 1029 3.0 1260759179
## 3 1 1061 3.0 1260759182
## 4 1 1129 2.0 1260759185
## 5 1 1172 4.0 1260759205
## 6 1 1263 2.0 1260759151
dim(movies)
## [1] 9125 3
dim(ratings)
## [1] 100004 4
# Split genres at '|' into one column per genre position; movies with fewer
# genres than the widest row get NA in the trailing columns.
genres <- movies$genres
genres <- as.data.frame(tstrsplit(genres, '[|]', type.convert=TRUE), stringsAsFactors=FALSE)
# Number the columns from the data instead of assuming exactly 10 positions
colnames(genres) <- seq_len(ncol(genres))
dim(genres)
## [1] 9125 10
genre_cats <- c("Action", "Adventure", "Animation", "Children", "Comedy", "Crime","Documentary",
                "Drama", "Fantasy","Film-Noir", "Horror", "Musical", "Mystery","Romance","Sci-Fi",
                "Thriller", "War", "Western")
# Create the movie x genre indicator matrix (1 = movie has the genre), sized
# from the data rather than hard-coded, with the genre names as column names.
genre_mat <- matrix(
  0L,
  nrow = nrow(genres), ncol = length(genre_cats),
  dimnames = list(NULL, genre_cats)
)
# For each genre position, look up which known category (if any) every movie
# has there and flag the matching (movie, category) cells in one assignment.
# Values not listed in genre_cats, and NA padding, match nothing and are skipped.
for (j in seq_len(ncol(genres))) {
  cat_idx <- match(genres[[j]], genre_cats)
  known <- which(!is.na(cat_idx))
  genre_mat[cbind(known, cat_idx[known])] <- 1L
}
# Convert to a data frame of integer indicator columns
genre_mat <- as.data.frame(genre_mat, stringsAsFactors = FALSE)
dim(genre_mat)
## [1] 9125 18
# Movie/genre table ----
# Put the first two columns of `movies` (movieId, title) in front of the genre
# indicator columns so each row can be identified.
id_cols <- movies[, c(1, 2)]
genre_mat2 <- cbind(id_cols, genre_mat)
corner(genre_mat2)
## movieId title Action Adventure Animation
## 1 1 Toy Story (1995) 0 1 1
## 2 2 Jumanji (1995) 0 1 0
## 3 3 Grumpier Old Men (1995) 0 0 0
## 4 4 Waiting to Exhale (1995) 0 0 0
## 5 5 Father of the Bride Part II (1995) 0 0 0
# Genre indicators only: drop the two leading identifier columns again
moviegenres <- genre_mat2[, -c(1, 2)]
corner(moviegenres)
## Action Adventure Animation Children Comedy
## 1 0 1 1 1 1
## 2 0 1 0 1 0
## 3 0 0 0 0 1
## 4 0 0 0 0 1
## 5 0 0 0 0 1
dim(moviegenres)
## [1] 9125 18
# Drop the 4th column (timestamp), keeping userId, movieId and rating
ratings <- ratings[-4]
head(ratings)
## userId movieId rating
## 1 1 31 2.5
## 2 1 1029 3.0
## 3 1 1061 3.0
## 4 1 1129 2.0
## 5 1 1172 4.0
## 6 1 1263 2.0
dim(ratings)
## [1] 100004 3
# Convert ratings to binary: 1 = rating > 3, -1 = rating <= 3.
# Note that a rating of exactly 3 is scored as -1.
# Done as a single vectorised comparison on the rating column (column 3)
# instead of looping over every row of the data frame.
user_ratings <- ratings
user_ratings$rating <- ifelse(user_ratings$rating > 3, 1, -1)
# Reshape to a movie x user table ----
# One row per movieId and one column per userId, holding the binary rating.
user_ratings2 <- dcast(user_ratings, movieId ~ userId, value.var = "rating", na.rm = FALSE)
# (movie, user) pairs without a rating come back as NA; score them as 0
user_ratings2[is.na(user_ratings2)] <- 0
# Remove the leading movieId identifier column, leaving only the user columns
userratings <- user_ratings2[, -1]
corner(userratings)
## 1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 0
## 5 0 0 0 0 0
dim(userratings)
## [1] 9066 671
userratings.final <- userratings
# Sync up user ratings and movie genres ----
# Keep only the movies that have at least one rating, so the genre table has
# the same movies as the rating table. A logical mask is used instead of
# `-which(...)`: when no movie needs dropping, `-which(...)` is `-integer(0)`
# and would select zero rows instead of all of them.
has_rating <- movies$movieId %in% ratings$movieId
movies2 <- movies[has_rating, , drop = FALSE]
rownames(movies2) <- NULL
# Same mask, so row k of moviegenres.final describes row k of movies2
moviegenres.final <- moviegenres[has_rating, , drop = FALSE]
rownames(moviegenres.final) <- NULL
dim(moviegenres.final)
## [1] 9066 18
# Calculate dot product of user ratings vs. movie genres ----
# Entry [g, u] is the sum over movies of (genre indicator g) * (binary rating
# of user u), i.e. t(genres) %*% ratings. crossprod() computes all entries at
# once and takes its size from the inputs (genres x users) instead of a
# hard-coded 18 x 671 loop. Dimnames are dropped to keep a plain matrix.
user_profiles <- unname(
  crossprod(as.matrix(moviegenres.final), as.matrix(userratings.final))
)
corner(user_profiles)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -1 -5 0 38 12
## [2,] -7 2 -4 42 18
## [3,] -3 0 -2 22 10
## [4,] -2 -2 -5 35 14
## [5,] -5 -18 2 60 53
# Convert to binary data for processing: 1 where the net score is positive,
# 0 otherwise (adding 0 turns the logical matrix into numeric 0/1)
user_profiles <- (user_profiles > 0) + 0
corner(user_profiles)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 1 1
## [2,] 0 1 0 1 1
## [3,] 0 0 0 1 1
## [4,] 0 0 0 1 1
## [5,] 0 0 1 1 1
# Content-based recommendations for one user ----
# Binary genre profile of the user in column 100 of user_profiles
user100_profile <- user_profiles[, 100]
# Put the user profile in row 1, followed by one row per rated movie, so the
# profile can be compared against every movie's genre vector
similarity_mat <- rbind.data.frame(user100_profile, moviegenres.final)
similarity_mat <- data.frame(lapply(similarity_mat,function(x){as.integer(x)}))
library(proxy)
# Number of movie rows below the profile row (taken from the data, not fixed)
n_movies <- nrow(moviegenres.final)

# Euclidean distance. The first n_movies entries of the dist object are the
# distances from row 1 (the user profile) to rows 2..n_movies+1 (the movies),
# so entry k belongs to row k of moviegenres.final.
sim_results_e <- dist(similarity_mat, method = "Euclidean")
sim_results_e <- as.data.frame(as.matrix(sim_results_e[seq_len(n_movies)]))
closest_rows_e <- which(sim_results_e == min(sim_results_e))
# Titles are looked up in movies2, which was filtered with the same mask as
# moviegenres.final; the unfiltered `movies` has extra rows and is misaligned.
paste("User 100 - Recommended movies, using Euclidean similarity:", movies2$title[closest_rows_e])
# (Recorded output removed: the earlier listing was produced by indexing the
# unfiltered `movies` table and showed the wrong titles. Re-run to regenerate.)

# Jaccard distance, same layout as above
sim_results_j <- dist(similarity_mat, method = "Jaccard")
sim_results_j <- as.data.frame(as.matrix(sim_results_j[seq_len(n_movies)]))
closest_rows_j <- which(sim_results_j == min(sim_results_j))
paste("User 100 - Recommended movies, using Jaccard similarity:", movies2$title[closest_rows_j])
# (Recorded output removed for the same reason. Re-run to regenerate.)
# Build the realRatingMatrix from the ratings data ----
# Pivot to one row per userId and one column per movieId (the column names are
# the movieId values).
ratings_wide <- dcast(ratings, userId ~ movieId, value.var = "rating", na.rm = FALSE)
# Drop the leading userId column, then coerce the plain matrix
ratings_mat_real <- as(as.matrix(ratings_wide[, -1]), "realRatingMatrix")
dim(ratings_mat_real)
## [1] 671 9066
# UBCF ----
# User-based collaborative filtering: cosine similarity on centred ratings,
# using the 20 closest users.
model <- Recommender(ratings_mat_real, method = "UBCF",
                     param = list(method = "Cosine", nn = 20, normalize = "center"))
# Top-10 recommendations for the user in row 100 of the rating matrix
recos <- predict(model, ratings_mat_real[100], n = 10)
recos_list <- as(recos, "list")
# Recommendations
# The recommended item labels are the rating-matrix column names, i.e. movieId
# values. They are matched against movies$movieId to find the titles; using
# them directly as row numbers of `movies` returns unrelated movies.
reco_ids <- as.integer(recos_list[[1]])
results <- matrix(movies$title[match(reco_ids, movies$movieId)], ncol = 1)
results
# (Recorded output removed: the earlier listing was produced by using movieId
# values as row positions and showed the wrong titles. Re-run to regenerate.)
# POPULAR ----
# Popularity-based recommender on Z-score normalised ratings
model2 <- Recommender(ratings_mat_real, method = "POPULAR",
                      param = list(normalize = "Z-score"))
# Top-10 recommendations for the user in row 100 of the rating matrix
recos2 <- predict(model2, ratings_mat_real[100], n = 10)
recos_list2 <- as(recos2, "list")
# Recommendations
# Stored in results2 (the original wrote into `results`, overwriting the UBCF
# output and leaving results2 unused). The item labels are movieId values, so
# titles are found by matching on movies$movieId, not by row position.
reco_ids2 <- as.integer(recos_list2[[1]])
results2 <- matrix(movies$title[match(reco_ids2, movies$movieId)], ncol = 1)
results2
# (Recorded output removed: the earlier listing was produced by using movieId
# values as row positions and showed the wrong titles. Re-run to regenerate.)