GroupLens Research has collected and made available rating data sets from the MovieLens web site (http://movielens.org). The data sets were collected over various periods of time. The selected dataset has ~100K movie ratings (1-5) from ~600 users on ~9000 movies.
Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.
The data are contained in the files links.csv
, movies.csv
, ratings.csv
and tags.csv
.
# Loading datasets, Package Installation
movies <- read.csv('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/movies.csv')
ratings <- read.csv('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/ratings.csv')
#install.packages("tidyverse")
library(dplyr)
library(tidyr)
# Data Exploration
summary(ratings)
## userId movieId rating timestamp
## Min. : 1.0 Min. : 1 Min. :0.500 Min. :8.281e+08
## 1st Qu.:177.0 1st Qu.: 1199 1st Qu.:3.000 1st Qu.:1.019e+09
## Median :325.0 Median : 2991 Median :3.500 Median :1.186e+09
## Mean :326.1 Mean : 19435 Mean :3.502 Mean :1.206e+09
## 3rd Qu.:477.0 3rd Qu.: 8122 3rd Qu.:4.000 3rd Qu.:1.436e+09
## Max. :610.0 Max. :193609 Max. :5.000 Max. :1.538e+09
head(ratings)
## userId movieId rating timestamp
## 1 1 1 4 964982703
## 2 1 3 4 964981247
## 3 1 6 4 964982224
## 4 1 47 5 964983815
## 5 1 50 5 964982931
## 6 1 70 3 964982400
summary(movies)
## movieId title
## Min. : 1 Confessions of a Dangerous Mind (2002): 2
## 1st Qu.: 3248 Emma (1996) : 2
## Median : 7300 Eros (2004) : 2
## Mean : 42200 Saturn 3 (1980) : 2
## 3rd Qu.: 76232 War of the Worlds (2005) : 2
## Max. :193609 '71 (2014) : 1
## (Other) :9731
## genres
## Drama :1053
## Comedy : 946
## Comedy|Drama : 435
## Comedy|Romance: 363
## Drama|Romance : 349
## Documentary : 339
## (Other) :6257
head(movies)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## 6 6 Heat (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
## 6 Action|Crime|Thriller
# Build a user matrix with movies as columns
rating_mat <- spread(ratings[,1:3], movieId, rating)
rating_mat <- as.matrix(rating_mat[,-1]) #remove userIds
library(recommenderlab)
#Convert into a recommenderlab sparse matrix
rating_mat <- as(rating_mat, "realRatingMatrix")
#Exploring parameters of recommendation models
recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix" "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [5] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [7] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [9] "UBCF_realRatingMatrix"
lapply(recommender_models, "[[", "description")
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
#SVD Parameters
recommender_models$SVD_realRatingMatrix$parameters
## $k
## [1] 10
##
## $maxiter
## [1] 100
##
## $normalize
## [1] "center"
#Determine similarity between users (first 4)
similarity_users <- similarity(rating_mat[1:4, ], method = "cosine", which = "users")
as.matrix(similarity_users)
## 1 2 3 4
## 1 0.0000000 1 0.7919033 0.9328096
## 2 1.0000000 0 NA 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000
## 4 0.9328096 1 1.0000000 0.0000000
image(as.matrix(similarity_users), main = "User similarity")
#Determine similarity between items (movies) (first 4)
similarity_items <- similarity(rating_mat[, 1:4], method = "cosine", which ="items")
as.matrix(similarity_items)
## 1 2 3 4
## 1 0.0000000 0.9644641 0.9715415 0.9838699
## 2 0.9644641 0.0000000 0.9389013 0.9609877
## 3 0.9715415 0.9389013 0.0000000 1.0000000
## 4 0.9838699 0.9609877 1.0000000 0.0000000
image(as.matrix(similarity_items), main = "Movies similarity")
#Explore ratings distribution
vector_ratings <- as.vector(rating_mat@data)
table(vector_ratings)
## vector_ratings
## 0 0.5 1 1.5 2 2.5 3 3.5 4
## 5830804 1370 2811 1791 7551 5550 20047 13136 26818
## 4.5 5
## 8551 13211
#Explore movie performance
views_per_movie <- colCounts(rating_mat) # count views for each movie
table_views <- data.frame(movie = names(views_per_movie),views = views_per_movie)
table_views <- table_views[order(table_views$views, decreasing = TRUE), ] # sort by number of views
table_views$title <- NA
for (i in 1:nrow(table_views)){
table_views[i,3] <- as.character(subset(movies, movies$movieId == table_views[i,1])$title)
}
head(table_views)
## movie views title
## 356 356 329 Forrest Gump (1994)
## 318 318 317 Shawshank Redemption, The (1994)
## 296 296 307 Pulp Fiction (1994)
## 593 593 279 Silence of the Lambs, The (1991)
## 2571 2571 278 Matrix, The (1999)
## 260 260 251 Star Wars: Episode IV - A New Hope (1977)
#Consider only movies with total of views higher than 50 views
average_ratings <- colMeans(rating_mat)
average_ratings_relevant <- average_ratings[views_per_movie > 50]
# Out of 9700+ movies, only 436 have more thab 50 views
length(average_ratings_relevant)
## [1] 436
#Focus on a more relevant set of ratings with the following constraints: Minimum of 50 users per rates movie and 50 views per movie.
ratings_relevant <- rating_mat[rowCounts(rating_mat) > 50, colCounts(rating_mat) > 50]
ratings_relevant
## 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.
vector_ratings_relevant <- as.vector(ratings_relevant@data)
table(vector_ratings_relevant)
## vector_ratings_relevant
## 0 0.5 1 1.5 2 2.5 3 3.5 4 4.5
## 128594 322 694 367 1833 1479 6279 4605 10552 3742
## 5
## 6341
#Normalize data
#Defining Train and Test data sets
train_filter <- sample(x = c(TRUE, FALSE), size = nrow(ratings_relevant),replace = TRUE, prob = c(0.8, 0.2))
train_ratings <- as(ratings_relevant[train_filter, ], "realRatingMatrix")
test_ratings <- as(ratings_relevant[!train_filter, ], "realRatingMatrix")
#Normalize data
train_ratings <- normalize(train_ratings)
test_ratings <- normalize(test_ratings)
#Create Recommender Model. Based on SVD approximation with column-mean imputation
recommender_model <- Recommender(train_ratings, method = "SVD", param=list(k=10,maxiter=100,normalize="center"))
# Top 10 recommendations for users (1-10)
recom <- predict(recommender_model, newdata=test_ratings, n=10, type="topNList")
recom_list <- as(recom, "list")
recom_result <- list()
for (i in c(1:10)){
recom_result[[i]] <- movies[as.integer(recom_list[[i]]),2]
}
library(knitr)
recom_result_df <- as.data.frame(recom_result)
colnames(recom_result_df) <- seq(1,10,1)
kable(recom_result_df)
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
---|---|---|---|---|---|---|---|---|---|
Foxfire (1996) | NeverEnding Story III, The (1994) | S.F.W. (1994) | GLOW: The Story of the Gorgeous Ladies of Wrestling (2012) | Great White Hype, The (1996) | Instinct (1999) | Dead Man Walking (1995) | Detour (1945) | NA | Excess Baggage (1997) |
Star Wars: Episode VI - Return of the Jedi (1983) | Rules of Engagement (2000) | Universal Soldier (1992) | Picture Perfect (1997) | NeverEnding Story III, The (1994) | Escort, The (Scorta, La) (1993) | 42 Up (1998) | Air Force One (1997) | S.F.W. (1994) | Orgazmo (1997) |
Children of the Corn (1984) | 20,000 Leagues Under the Sea (1954) | Twelve Monkeys (a.k.a. 12 Monkeys) (1995) | Escort, The (Scorta, La) (1993) | Snatch (2000) | 20,000 Leagues Under the Sea (1954) | Mulan (1998) | Teenage Mutant Ninja Turtles (1990) | NA | 20,000 Leagues Under the Sea (1954) |
He Said, She Said (1991) | Sgt. Bilko (1996) | Dead Man Walking (1995) | Mrs. Dalloway (1997) | Jupiter’s Wife (1994) | Phantoms (1998) | Star Trek: Generations (1994) | Mrs. Dalloway (1997) | NA | Liar Liar (1997) |
Joe Versus the Volcano (1990) | Full Metal Jacket (1987) | 20,000 Leagues Under the Sea (1954) | Good German, The (2006) | Wonderland (1999) | Bread and Chocolate (Pane e cioccolata) (1973) | Police Academy 4: Citizens on Patrol (1987) | Usual Suspects, The (1995) | Beverly Hillbillies, The (1993) | Sgt. Bilko (1996) |
First Strike (Police Story 4: First Strike) (Ging chaat goo si 4: Ji gaan daan yam mo) (1996) | Dead Man Walking (1995) | Rules of Engagement (2000) | NA | Tarantula (1955) | NA | Masters of the Universe (1987) | Wonderland (1999) | Surviving the Game (1994) | Marvin’s Room (1996) |
Nightmare on Elm Street, A (1984) | Orgazmo (1997) | Instinct (1999) | Defiance (2008) | Great Expectations (1998) | I Like It Like That (1994) | Monster in a Box (1992) | 20,000 Leagues Under the Sea (1954) | NA | Cop Land (1997) |
Seabiscuit (2003) | Flying Tigers (1942) | Negotiator, The (1998) | Samouraï, Le (Godson, The) (1967) | Flesh & Blood (1985) | Police Academy 4: Citizens on Patrol (1987) | NA | Fog, The (1980) | Brice Man, The (Brice de Nice) (2005) | Striptease (1996) |
Cat People (1982) | Not Without My Daughter (1991) | Full Metal Jacket (1987) | Toy Story (1995) | Starman (1984) | Dracula (Bram Stoker’s Dracula) (1992) | Aliens (1986) | Sgt. Bilko (1996) | NA | Great White Hype, The (1996) |
Around the World in 80 Days (1956) | Kid in King Arthur’s Court, A (1995) | Deconstructing Harry (1997) | Woman of Paris, A (1923) | Vertical Limit (2000) | Wallace & Gromit: The Best of Aardman Animation (1996) | In Love and War (1996) | 42 Up (1998) | Flesh & Blood (1985) | Alien: Resurrection (1997) |
#Ratings assigned to the movies
recomr <- predict(recommender_model, newdata=test_ratings, type="ratingMatrix")
recomr_mat <- as(recomr, "matrix")
recomr_mat[1:5,1:5] #First 5 users and first 5 movies
## 1 2 3 6 7
## [1,] 3.9529464 0.1630435 1.163043 0.1630435 0.1630435
## [2,] -1.0121951 3.4961473 3.465581 3.5417891 3.4409979
## [3,] 0.1527778 4.3228326 4.336484 4.3586883 4.3351892
## [4,] 4.2198792 -0.9250000 3.735146 3.8431901 3.8897291
## [5,] 3.5326733 3.6586487 3.632866 0.3666667 3.6426460
#Evaluating Model with Cross-validation
eval_sch <- evaluationScheme(ratings_relevant, method="cross-validation", k=4, given=10, goodRating=3)
recommender_model <- Recommender(getData(eval_sch,"train"), method = "SVD", param=list(k=10,maxiter=100,normalize="center"))
recom <- predict(recommender_model, newdata=getData(eval_sch,"known"), n=10, type="topNList")
#Performance index of the whole model
eval_accuracy <- calcPredictionAccuracy(x = recom, data = getData(eval_sch, "unknown"), given=10, goodRating=3, byUser = FALSE)
head(eval_accuracy)
## TP FP FN TN precision
## 3.11458333 6.88541667 71.11458333 344.88541667 0.31145833
## recall
## 0.05435035
#Evaluate recommender model depending on the number of items (movies) recommended for every user (multiples of 5 up to 20)
results <- evaluate(x = eval_sch,method = "SVD",n = seq(0,20,5))
## SVD run fold/sample [model time/prediction time]
## 1 [0.03sec/0.06sec]
## 2 [0.03sec/0.06sec]
## 3 [0.03sec/0.1sec]
## 4 [0sec/0.06sec]
head(getConfusionMatrix(results)[[1]])
## TP FP FN TN precision recall TPR
## 0 0.000000 0.000000 74.22917 351.7708 NaN 0.00000000 0.00000000
## 5 1.812500 3.187500 72.41667 348.5833 0.3625000 0.03783824 0.03783824
## 10 3.114583 6.885417 71.11458 344.8854 0.3114583 0.05435035 0.05435035
## 15 4.604167 10.395833 69.62500 341.3750 0.3069444 0.07446513 0.07446513
## 20 5.927083 14.072917 68.30208 337.6979 0.2963542 0.09441751 0.09441751
## FPR
## 0 0.000000000
## 5 0.008835174
## 10 0.018980418
## 15 0.028729052
## 20 0.038950878