Contributions:

All group members contributed equally to writing the code, discussion, updating the code and submission.

Loading the library packages.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(DT)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(Matrix)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
myurl = "https://liangfgithub.github.io/MovieData/"
set.seed(4588)

Read data

Ratings data

# use colClasses = 'NULL' to skip columns
ratings = read.csv(paste0(myurl, 'ratings.dat?raw=true'), 
                   sep = ':',
                   colClasses = c('integer', 'NULL'), 
                   header = FALSE)
colnames(ratings) = c('UserID', 'MovieID', 'Rating', 'Timestamp')
ratings$Timestamp = NULL

Movies data

movies = readLines(paste0(myurl, 'movies.dat?raw=true'))
movies = strsplit(movies, split = "::", fixed = TRUE, useBytes = TRUE)
movies = matrix(unlist(movies), ncol = 3, byrow = TRUE)
movies = data.frame(movies, stringsAsFactors = FALSE)
colnames(movies) = c('MovieID', 'Title', 'Genres')
movies$MovieID = as.integer(movies$MovieID)

# convert accented characters
movies$Title[73]
## [1] "Mis\xe9rables, Les (1995)"
movies$Title = iconv(movies$Title, "latin1", "UTF-8")
movies$Title[73]
## [1] "Misérables, Les (1995)"
# extract year
movies$Year = as.numeric(unlist(
  lapply(movies$Title, function(x) substr(x, nchar(x)-4, nchar(x)-1))))

Users data

users = read.csv(paste0(myurl, 'users.dat?raw=true'),
                 sep = ':', header = FALSE)
users = users[, -c(2,4,6,8)] # skip columns
colnames(users) = c('UserID', 'Gender', 'Age', 'Occupation', 'Zip-code')

Here, the code read the data and construct three matrices: Ratings, movies, and users which have the following features:

head(ratings)
##   UserID MovieID Rating
## 1      1    1193      5
## 2      1     661      3
## 3      1     914      3
## 4      1    3408      4
## 5      1    2355      5
## 6      1    1197      3
head(movies)
##   MovieID                              Title                       Genres Year
## 1       1                   Toy Story (1995)  Animation|Children's|Comedy 1995
## 2       2                     Jumanji (1995) Adventure|Children's|Fantasy 1995
## 3       3            Grumpier Old Men (1995)               Comedy|Romance 1995
## 4       4           Waiting to Exhale (1995)                 Comedy|Drama 1995
## 5       5 Father of the Bride Part II (1995)                       Comedy 1995
## 6       6                        Heat (1995)        Action|Crime|Thriller 1995
head(users)
##   UserID Gender Age Occupation Zip-code
## 1      1      F   1         10    48067
## 2      2      M  56         16    70072
## 3      3      M  25         15    55117
## 4      4      M  45          7    02460
## 5      5      M  25         20    55455
## 6      6      F  50          9    55117

Exploration

Unique users

dim(users)
## [1] 6040    5
length(unique(ratings$UserID))
## [1] 6040

Unique Movies

dim(movies)
## [1] 3883    4
length(unique(ratings$MovieID))
## [1] 3706
movies_not_rated = movies %>% 
  filter(!(MovieID %in% ratings$MovieID))
dim(movies_not_rated)
## [1] 177   4
r_orig = ratings

#System 1

In this section, we are going to apply two recommendation schemes based on genres. First of all, we create a bibary indicator for the 18 genres for each movie

genres = as.data.frame(movies$Genres, stringsAsFactors=FALSE)
tmp = as.data.frame(tstrsplit(genres[,1], '[|]',
                              type.convert=TRUE),
                    stringsAsFactors=FALSE)
genre_list = c("Action", "Adventure", "Animation", 
               "Children's", "Comedy", "Crime",
               "Documentary", "Drama", "Fantasy",
               "Film-Noir", "Horror", "Musical", 
               "Mystery", "Romance", "Sci-Fi", 
               "Thriller", "War", "Western")
m = length(genre_list)
genre_matrix = matrix(0, nrow(movies), length(genre_list))
for(i in 1:nrow(tmp)){
  genre_matrix[i,genre_list %in% tmp[i,]]=1
}
colnames(genre_matrix) = genre_list
remove("tmp", "genres")
colnames(genre_matrix)[4]="Childrens"
colnames(genre_matrix)[10]="FilmNoir"
colnames(genre_matrix)[15]="SciFi"

save(genre_matrix, file = "System1.rda")

since users voting might not be similar in voting, the votes of each user are averaged and subtracted from the standard deviation (zscore). However, a simple average data were also calculated and will be reported at the end.

ratings1=  ratings %>% 
  group_by(UserID) %>% 
  summarize(average = mean(Rating), sd= sd(Rating)) %>% left_join(ratings, by = 'UserID')

ratings$Rating_z = (ratings1$Rating - ratings1$average)/ ratings1$sd

Now, we are going to add number of ratings per user to our data frame

tmp = ratings %>% 
  group_by(UserID) %>% 
  summarize(ratings_per_user = n()) 
summary(tmp$ratings_per_user)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0    44.0    96.0   165.6   208.0  2314.0
stem(tmp$ratings_per_user)
## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##    0 | 22222222222222222222222222222222222222222222222222222222222222222222+2922
##    1 | 00000000000000000000000000000000000000000000000000000000000000000000+1323
##    2 | 00000000000000000000000000000000000000000000000000000000000000000000+560
##    3 | 00000000000000000000000000000000000000000000000000000000000000011111+293
##    4 | 00000000000000000000000000000000000000000111111111111111111111111111+138
##    5 | 00000000000000011111111111111111111111112222222222222233333333333333+61
##    6 | 00000001111111111112222222222333333333333334444444555556666777777777+2
##    7 | 000000001111112222222333333333334444444555566666677777788888999999
##    8 | 0000112222222333333444444455555566667777777888899999
##    9 | 001122233444566667789
##   10 | 00011122224558
##   11 | 245678
##   12 | 1222234466789
##   13 | 024
##   14 | 
##   15 | 22
##   16 | 0
##   17 | 4
##   18 | 5
##   19 | 
##   20 | 
##   21 | 
##   22 | 
##   23 | 1
sum(tmp$ratings_per_user > 500)
## [1] 396
sort(tmp$ratings_per_user[tmp$ratings_per_user>1300])
## [1] 1302 1323 1344 1518 1521 1595 1743 1850 2314
tmp = tmp %>% full_join(ratings, by = 'UserID')

Here we have the completed list of movies, users and genres:

Complete_list = tmp %>% 
  left_join(data.frame(MovieID = movies$MovieID, genre_matrix), 
            by = "MovieID")

Now, two recommendation schemes are going to be selected. The first one is the popularity which measured by the number of voting per movie in that genres. Rating_per_movie parameter is calculated for this step. The next scheme is the average rating of users for a movie in that genere (top best movie).

top_list = Complete_list %>%  
  group_by(MovieID) %>% 
  summarize(ratings_per_movie = n(), ave_ratings = mean(Rating), ave_ratings_normalized = mean(Rating_z)) %>%
  inner_join(movies, by = 'MovieID') %>%
  left_join(Complete_list, by = 'MovieID') %>%
  select(-c("Genres", "Year")) 

Now, for better recommendation, we filter the data for Rating>=3. this is important for popularity measurement since we assume that the number of votes is indicator of popularity. Thus, positive votes (votes>=3) demonstrate the popularity. This filter doesnt change the second scheme (average voting) as it calculated in previous lines and doesnt change. However, for the second scheme, we want to make sure that the average rating corresponds to great amount of people; thus, we put a condition of ratings_per_movie >= 50 to make sure that at least 50 people vote for a movie.

top_list =top_list %>% filter(Rating >= 3) %>% filter(ratings_per_movie >= 50)

For example, Imagine we are going to recommend top 5 movies in “Horror” genre. Here are our recommendation based on

  1. Popular movie: these movies received more positive votes (votes>=3) than other movies.
 unique(top_list %>% filter(Horror == 1) %>% 
    arrange(desc = -ratings_per_movie) %>%
  select(c("Title", "ratings_per_movie")) )%>% 
  slice(1:5)%>% print (n=5)
## # A tibble: 5 × 2
##   Title                           ratings_per_movie
##   <chr>                                       <int>
## 1 Ghostbusters (1984)                          2181
## 2 Alien (1979)                                 2024
## 3 Jaws (1975)                                  1697
## 4 Psycho (1960)                                1263
## 5 Blair Witch Project, The (1999)              1237
  1. Top 5 movies are the movies that have higher user ratings with the condition that at least 50 people voted for it. It should be stated that the top movies were sleected based on normalized average rating per user (zscore of each user ratings).
unique(top_list%>% filter(Horror == 1) %>%  
    arrange(desc = -ave_ratings_normalized) %>%
  select(c("Title", "ave_ratings", "ave_ratings_normalized")) ) %>% 
  slice(1:5)%>% print (n=5)
## # A tibble: 5 × 3
##   Title                     ave_ratings ave_ratings_normalized
##   <chr>                           <dbl>                  <dbl>
## 1 Young Frankenstein (1974)        4.25                  0.578
## 2 Psycho (1960)                    4.22                  0.527
## 3 Alien (1979)                     4.16                  0.466
## 4 Shining, The (1980)              4.10                  0.450
## 5 Jaws (1975)                      4.09                  0.407

It can be seen that normalized average rating and average rating for horror movies are not different but it might be different for other genres.

Top 5 movies in each genre based on popularity

Now, we can wrap up things and put the results here for all genres

bind_rows ( cbind (War,Genres="War") ,
cbind (Western,Genres="Western"),
cbind (Thriller,Genres="Thriller"),
cbind (SciFi,Genres="Sci-Fi"),
cbind (FilmNoir,Genres="Film-Noir"),
cbind (Musical,Genres="Musical"),
cbind (Fantasy,Genres="Fantasy"),
cbind (Drama,Genres="Drama"),
cbind (Documentary,Genres="Documentary"),
cbind (Childrens,Genres="Childrens"),
cbind (Crime,Genres="Crime"),
cbind (Comedy,Genres="Comedy"),
cbind (Animation,Genres="Animation"),
cbind (Adventure,Genres="Adventure"),
cbind (Action,Genres="Action"),
cbind (Horror,Genres="Horror"))
##                                                    Title ratings_per_movie
## 1  Star Wars: Episode V - The Empire Strikes Back (1980)              2990
## 2      Star Wars: Episode VI - Return of the Jedi (1983)              2883
## 3                             Saving Private Ryan (1998)              2653
## 4                                      Braveheart (1995)              2443
## 5                                Schindler's List (1993)              2304
## 6                              Dances with Wolves (1990)              1451
## 7              Butch Cassidy and the Sundance Kid (1969)              1419
## 8                     Back to the Future Part III (1990)              1148
## 9                                 Blazing Saddles (1974)              1119
## 10                                     Unforgiven (1992)               997
## 11                     Terminator 2: Judgment Day (1991)              2649
## 12                                    Matrix, The (1999)              2590
## 13                      Silence of the Lambs, The (1991)              2578
## 14                                          Fargo (1996)              2513
## 15                               Sixth Sense, The (1999)              2459
## 16             Star Wars: Episode IV - A New Hope (1977)              2991
## 17 Star Wars: Episode V - The Empire Strikes Back (1980)              2990
## 18     Star Wars: Episode VI - Return of the Jedi (1983)              2883
## 19                                  Jurassic Park (1993)              2672
## 20                     Terminator 2: Judgment Day (1991)              2649
## 21                              L.A. Confidential (1997)              2288
## 22                                   Blade Runner (1982)              1800
## 23                       Who Framed Roger Rabbit? (1988)              1799
## 24                                      Chinatown (1974)              1185
## 25                            Maltese Falcon, The (1941)              1043
## 26                              Wizard of Oz, The (1939)              1718
## 27                                        Aladdin (1992)              1351
## 28                            Blues Brothers, The (1980)              1341
## 29                 Rocky Horror Picture Show, The (1975)              1233
## 30                                 Lion King, The (1994)              1121
## 31             Star Wars: Episode IV - A New Hope (1977)              2991
## 32                     E.T. the Extra-Terrestrial (1982)              2269
## 33      Star Wars: Episode I - The Phantom Menace (1999)              2250
## 34                                    Beetlejuice (1988)              1495
## 35                                            Big (1988)              1491
## 36                                American Beauty (1999)              3428
## 37 Star Wars: Episode V - The Empire Strikes Back (1980)              2990
## 38                            Saving Private Ryan (1998)              2653
## 39                      Silence of the Lambs, The (1991)              2578
## 40                                          Fargo (1996)              2513
## 41                                     Roger & Me (1989)               798
## 42                                    Hoop Dreams (1994)               716
## 43                                          Crumb (1994)               491
## 44                                 American Movie (1999)               295
## 45                             When We Were Kings (1996)               277
## 46                     E.T. the Extra-Terrestrial (1982)              2269
## 47                                      Toy Story (1995)              2077
## 48                                           Babe (1995)              1751
## 49                              Wizard of Oz, The (1939)              1718
## 50                                  Bug's Life, A (1998)              1703
## 51                                          Fargo (1996)              2513
## 52                              L.A. Confidential (1997)              2288
## 53                                 Godfather, The (1972)              2223
## 54                                   Pulp Fiction (1994)              2171
## 55                            Usual Suspects, The (1995)              1783
## 56                                American Beauty (1999)              3428
## 57                             Back to the Future (1985)              2583
## 58                                   Men in Black (1997)              2538
## 59                            Shakespeare in Love (1998)              2369
## 60                            Princess Bride, The (1987)              2318
## 61                                      Toy Story (1995)              2077
## 62                       Who Framed Roger Rabbit? (1988)              1799
## 63                                  Bug's Life, A (1998)              1703
## 64                                    Toy Story 2 (1999)              1585
## 65                                        Aladdin (1992)              1351
## 66             Star Wars: Episode IV - A New Hope (1977)              2991
## 67 Star Wars: Episode V - The Empire Strikes Back (1980)              2990
## 68     Star Wars: Episode VI - Return of the Jedi (1983)              2883
## 69                                  Jurassic Park (1993)              2672
## 70                                   Men in Black (1997)              2538
## 71             Star Wars: Episode IV - A New Hope (1977)              2991
## 72 Star Wars: Episode V - The Empire Strikes Back (1980)              2990
## 73     Star Wars: Episode VI - Return of the Jedi (1983)              2883
## 74                                  Jurassic Park (1993)              2672
## 75                            Saving Private Ryan (1998)              2653
## 76                                   Ghostbusters (1984)              2181
## 77                                          Alien (1979)              2024
## 78                                           Jaws (1975)              1697
## 79                                         Psycho (1960)              1263
## 80                       Blair Witch Project, The (1999)              1237
##         Genres
## 1          War
## 2          War
## 3          War
## 4          War
## 5          War
## 6      Western
## 7      Western
## 8      Western
## 9      Western
## 10     Western
## 11    Thriller
## 12    Thriller
## 13    Thriller
## 14    Thriller
## 15    Thriller
## 16      Sci-Fi
## 17      Sci-Fi
## 18      Sci-Fi
## 19      Sci-Fi
## 20      Sci-Fi
## 21   Film-Noir
## 22   Film-Noir
## 23   Film-Noir
## 24   Film-Noir
## 25   Film-Noir
## 26     Musical
## 27     Musical
## 28     Musical
## 29     Musical
## 30     Musical
## 31     Fantasy
## 32     Fantasy
## 33     Fantasy
## 34     Fantasy
## 35     Fantasy
## 36       Drama
## 37       Drama
## 38       Drama
## 39       Drama
## 40       Drama
## 41 Documentary
## 42 Documentary
## 43 Documentary
## 44 Documentary
## 45 Documentary
## 46   Childrens
## 47   Childrens
## 48   Childrens
## 49   Childrens
## 50   Childrens
## 51       Crime
## 52       Crime
## 53       Crime
## 54       Crime
## 55       Crime
## 56      Comedy
## 57      Comedy
## 58      Comedy
## 59      Comedy
## 60      Comedy
## 61   Animation
## 62   Animation
## 63   Animation
## 64   Animation
## 65   Animation
## 66   Adventure
## 67   Adventure
## 68   Adventure
## 69   Adventure
## 70   Adventure
## 71      Action
## 72      Action
## 73      Action
## 74      Action
## 75      Action
## 76      Horror
## 77      Horror
## 78      Horror
## 79      Horror
## 80      Horror

Top 5 movies in each genre based on average ratings (best movies)

Now, we can wrap up things and put the results here for all genres

bind_rows ( cbind (War,Genres="War") ,
cbind (Western,Genres="Western"),
cbind (Thriller,Genres="Thriller"),
cbind (SciFi,Genres="Sci-Fi"),
cbind (FilmNoir,Genres="Film-Noir"),
cbind (Musical,Genres="Musical"),
cbind (Fantasy,Genres="Fantasy"),
cbind (Drama,Genres="Drama"),
cbind (Documentary,Genres="Documentary"),
cbind (Childrens,Genres="Childrens"),
cbind (Crime,Genres="Crime"),
cbind (Comedy,Genres="Comedy"),
cbind (Animation,Genres="Animation"),
cbind (Adventure,Genres="Adventure"),
cbind (Action,Genres="Action"),
cbind (Horror,Genres="Horror"))
##                                                                          Title
## 1                                                      Schindler's List (1993)
## 2  Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 3                                                        Paths of Glory (1957)
## 4                                                            Casablanca (1942)
## 5                                         Bridge on the River Kwai, The (1957)
## 6                                                               Yojimbo (1961)
## 7                                                             High Noon (1952)
## 8                                    Butch Cassidy and the Sundance Kid (1969)
## 9                                                        Searchers, The (1956)
## 10                                      Good, The Bad and The Ugly, The (1966)
## 11                                                  Usual Suspects, The (1995)
## 12                                                       Close Shave, A (1995)
## 13                                                          Rear Window (1954)
## 14                                                     Sixth Sense, The (1999)
## 15                                                       Third Man, The (1949)
## 16                                   Star Wars: Episode IV - A New Hope (1977)
## 17 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 18                                                          Matrix, The (1999)
## 19                       Star Wars: Episode V - The Empire Strikes Back (1980)
## 20                                                         Blade Runner (1982)
## 21                               Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
## 22                                                  Maltese Falcon, The (1941)
## 23                                                     Double Indemnity (1944)
## 24                                                            Chinatown (1974)
## 25                                                            Notorious (1946)
## 26                                                  Singin' in the Rain (1952)
## 27                                                    Wizard of Oz, The (1939)
## 28                                                   This Is Spinal Tap (1984)
## 29                                                         My Fair Lady (1964)
## 30                                                      Shall We Dance? (1937)
## 31                                   Star Wars: Episode IV - A New Hope (1977)
## 32                                           E.T. the Extra-Terrestrial (1982)
## 33                                                       Watership Down (1978)
## 34                                Willy Wonka and the Chocolate Factory (1971)
## 35                                                                  Big (1988)
## 36                                            Shawshank Redemption, The (1994)
## 37         Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 38                                                       Godfather, The (1972)
## 39                                                     Schindler's List (1993)
## 40                                                       Paths of Glory (1957)
## 41                                                   When We Were Kings (1996)
## 42                 Paradise Lost: The Child Murders at Robin Hood Hills (1996)
## 43                                                  Thin Blue Line, The (1988)
## 44                      Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)
## 45                                                                42 Up (1998)
## 46                                                    Wizard of Oz, The (1939)
## 47                                                          Toy Story 2 (1999)
## 48                                                            Toy Story (1995)
## 49                                 Winnie the Pooh and the Blustery Day (1968)
## 50                                                      Iron Giant, The (1999)
## 51                                                  Usual Suspects, The (1995)
## 52                                                       Godfather, The (1972)
## 53                                                     Double Indemnity (1944)
## 54                                              Godfather: Part II, The (1974)
## 55                                                           Sting, The (1973)
## 56                                                  Wrong Trousers, The (1993)
## 57                                                       Close Shave, A (1995)
## 58                                                         General, The (1927)
## 59                                                              Yojimbo (1961)
## 60                                                     Grand Day Out, A (1992)
## 61                                                  Wrong Trousers, The (1993)
## 62                                                       Close Shave, A (1995)
## 63                      Wallace & Gromit: The Best of Aardman Animation (1996)
## 64                                                     Grand Day Out, A (1992)
## 65                                                    Creature Comforts (1990)
## 66                                                              Sanjuro (1962)
## 67                                              Raiders of the Lost Ark (1981)
## 68                                   Star Wars: Episode IV - A New Hope (1977)
## 69                                                   Lawrence of Arabia (1962)
## 70                                                    Great Escape, The (1963)
## 71                                                              Sanjuro (1962)
## 72         Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 73                                                       Godfather, The (1972)
## 74                                              Raiders of the Lost Ark (1981)
## 75                                   Star Wars: Episode IV - A New Hope (1977)
## 76                                                   Young Frankenstein (1974)
## 77                                                               Psycho (1960)
## 78                                                                Alien (1979)
## 79                                                         Shining, The (1980)
## 80                                                                 Jaws (1975)
##    ave_ratings ave_ratings_normalized      Genres
## 1     4.510417              0.8028872         War
## 2     4.449890              0.7362226         War
## 3     4.473913              0.7186578         War
## 4     4.412822              0.6965618         War
## 5     4.386994              0.6840655         War
## 6     4.404651              0.6738382     Western
## 7     4.178660              0.5056460     Western
## 8     4.215645              0.5008573     Western
## 9     4.085714              0.4530497     Western
## 10    4.133820              0.4383992     Western
## 11    4.517106              0.8222305    Thriller
## 12    4.520548              0.7980935    Thriller
## 13    4.476190              0.7405397    Thriller
## 14    4.406263              0.7314324    Thriller
## 15    4.452083              0.7110159    Thriller
## 16    4.453694              0.7439864      Sci-Fi
## 17    4.449890              0.7362226      Sci-Fi
## 18    4.315830              0.6469376      Sci-Fi
## 19    4.292977              0.5789537      Sci-Fi
## 20    4.273333              0.5713030      Sci-Fi
## 21    4.491489              0.7499945   Film-Noir
## 22    4.395973              0.6779888   Film-Noir
## 23    4.415608              0.6729506   Film-Noir
## 24    4.339241              0.5981822   Film-Noir
## 25    4.294382              0.5849489   Film-Noir
## 26    4.283622              0.5681876     Musical
## 27    4.247963              0.5504546     Musical
## 28    4.179785              0.4972154     Musical
## 29    4.154088              0.4618933     Musical
## 30    4.165714              0.4360846     Musical
## 31    4.453694              0.7439864     Fantasy
## 32    3.965183              0.2957287     Fantasy
## 33    3.842623              0.2482696     Fantasy
## 34    3.861386              0.2355942     Fantasy
## 35    3.855801              0.2261077     Fantasy
## 36    4.554558              0.8417957       Drama
## 37    4.560510              0.8337012       Drama
## 38    4.524966              0.8043933       Drama
## 39    4.510417              0.8028872       Drama
## 40    4.473913              0.7186578       Drama
## 41    4.321300              0.5991014 Documentary
## 42    4.213836              0.5862466 Documentary
## 43    4.252788              0.5221005 Documentary
## 44    4.095745              0.5127087 Documentary
## 45    4.227273              0.5072586 Documentary
## 46    4.247963              0.5504546   Childrens
## 47    4.218927              0.5426468   Childrens
## 48    4.146846              0.4717375   Childrens
## 49    3.986425              0.4219000   Childrens
## 50    4.047478              0.4058730   Childrens
## 51    4.517106              0.8222305       Crime
## 52    4.524966              0.8043933       Crime
## 53    4.415608              0.6729506       Crime
## 54    4.357565              0.6440232       Crime
## 55    4.320305              0.6168535       Crime
## 56    4.507937              0.7989103      Comedy
## 57    4.520548              0.7980935      Comedy
## 58    4.368932              0.6874155      Comedy
## 59    4.404651              0.6738382      Comedy
## 60    4.361522              0.6490756      Comedy
## 61    4.507937              0.7989103   Animation
## 62    4.520548              0.7980935   Animation
## 63    4.426941              0.7078950   Animation
## 64    4.361522              0.6490756   Animation
## 65    4.335766              0.6416745   Animation
## 66    4.608696              0.9394756   Adventure
## 67    4.477725              0.7742946   Adventure
## 68    4.453694              0.7439864   Adventure
## 69    4.401925              0.6789148   Adventure
## 70    4.376437              0.6641672   Adventure
## 71    4.608696              0.9394756      Action
## 72    4.560510              0.8337012      Action
## 73    4.524966              0.8043933      Action
## 74    4.477725              0.7742946      Action
## 75    4.453694              0.7439864      Action
## 76    4.250629              0.5777589      Horror
## 77    4.218527              0.5271435      Horror
## 78    4.159585              0.4655716      Horror
## 79    4.104876              0.4496369      Horror
## 80    4.089570              0.4072202      Horror

#System 2

First create a utility matrix stored as a sparse matrix.

train=r_orig
i = paste0('u', train$UserID)
j = paste0('m', train$MovieID)
x = train$Rating
tmp = data.frame(i, j, x, stringsAsFactors = T)
Rmat = sparseMatrix(as.integer(tmp$i), as.integer(tmp$j), x = tmp$x)
rownames(Rmat) = levels(tmp$i)
colnames(Rmat) = levels(tmp$j)
Rmat = new('realRatingMatrix', data = Rmat)
#realmat = as(Rmat,'matrix')
Rmat_m = normalize (Rmat)
Rmat_z = normalize (Rmat,method="Z-score")
#getRatingMatrix(Rmat_m)
#denormalize for reverse
hist (getRatings(Rmat),breaks=15)

hist (getRatings(Rmat_m),breaks=15)

hist (getRatings(Rmat_z),breaks=15)

hist (rowCounts(Rmat), breaks=10)

hist (colMeans(Rmat), breaks=10)

First Method : UBCF

For user based collaborative filtering (UBCF), the similarity between users normally evalauted based on KNN. In UBCF, there is no training. Computation is needed at the prediction stage when new data is provided. The key computation cost is the similarity between (#_new_users) and (#_training_users) over M items, as well as the corresponding sorting for finding the top nearest neighbors. The computation cost could be much less with the evaluation scheme provided in recommenderlab when (#_new_users) is much less than N (#_of training_users).

Second Method : IBCF

For item based collaborative filtering (IBCF), the main computation occurs in the training, which is the computation of the item-to-item M-by-M similarity matrix as well as the related sorting. This also presents a memory challenge, since the large rating matrix may not even be able to be loaded into memory; see the comment at line 96 at the link below “this might not fit into memory! Maybe use a sample?” For IBCF, once the item-to-item similarity matrix is computed, prediction involves just efficient matrix products. IBCF seems to be a less-favored method in terms of computation. However, if we can compute the item similarity matrix offline, then IBCF becomes more efficient than UBCF.

Detail about two schemes

  • Given : is the parameter that shows the number of iterms to give for evaluation. We selected 15.
  • goodRating : Only keep the rating greater or equal to the given threshold
esSplit  <- evaluationScheme(Rmat, method = "split", train=0.8, given= 15 , k=10, goodRating = 5)
algorithms <- list("UBCF" = list(name = "UBCF",  param = list(nn = 5) ),
                   "IBCF" = list(name = "IBCF")
                   )
results <- evaluate(esSplit, algorithms, type = "ratings")
## UBCF run fold/sample [model time/prediction time]
##   1  [0.041sec/49.594sec] 
##   2  [0.029sec/49.745sec] 
##   3  [0.03sec/49.833sec] 
##   4  [0.031sec/50.01sec] 
##   5  [0.03sec/49.902sec] 
##   6  [0.03sec/49.917sec] 
##   7  [0.031sec/50.576sec] 
##   8  [0.031sec/49.504sec] 
##   9  [0.028sec/49.629sec] 
##   10  [0.032sec/49.745sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [83.14sec/0.332sec] 
##   2  [83.481sec/0.208sec] 
##   3  [83.337sec/0.213sec] 
##   4  [83.68sec/0.221sec] 
##   5  [83.622sec/0.236sec] 
##   6  [83.458sec/0.263sec] 
##   7  [83.294sec/0.349sec] 
##   8  [83.639sec/0.294sec] 
##   9  [83.719sec/0.224sec] 
##   10  [84.825sec/0.239sec]

In the following section, we will save our model as an rda object to be used by the shiny app. We used modified Prof. shiny app from Alvin Yang with the following link: https://github.com/alvin-yang68/Movie-Recommender

We changed their model for the recommended system with our model and run the app.

rr= Recommender(Rmat, method = "IBCF")
save(rr, file = "System_amir.rda")
#SVD_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$SVD)))["RMSE"]
UBCF_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$UBCF)))["RMSE"]
IBCF_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$IBCF)))["RMSE"]
#names(SVD_RMSE)="RMSE for SVD"
names(IBCF_RMSE)="RMSE for IBCF"
names(UBCF_RMSE)="RMSE for UBCF"

Now, we export the RMSE for each method for all 10 iterations below

RMSE_total = cbind (1:10,IBCF_RMSE,UBCF_RMSE)
RMSE_total
##    1:10 RMSE for IBCF RMSE for UBCF
## 1     1      1.606655      1.228546
## 2     2      1.542282      1.223350
## 3     3      1.521209      1.247617
## 4     4      1.621293      1.228569
## 5     5           Inf      1.235517
## 6     6      1.533084      1.239835
## 7     7      1.631665      1.221509
## 8     8      1.516894      1.244837
## 9     9      1.458777      1.216719
## 10   10      1.718919      1.235456

and we can show them in a graph as below

tmp <- lapply(results, function(x) slot(x, "results"))
res <- tmp %>% 
  lapply(function(x) unlist(lapply(x, function(x) unlist(x@cm[ ,"RMSE"])))) %>% 
  as.data.frame() %>% 
  gather(key = "Algorithm", value = "RMSE")

res %>% 
  ggplot(aes(Algorithm, RMSE, fill = Algorithm)) +
  geom_bar(stat = "summary") + geom_errorbar(stat = "summary", width = 0.3, size = 0.8) +
  coord_cartesian(ylim = c(0.6, 1.3)) + guides(fill = FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## No summary function supplied, defaulting to `mean_se()`

As it was expected, the results for UBCF has lowest RMSE with respect to IBCF.

Refrences:

Shiny app for movie recommender system:

https://github.com/alvin-yang68/Movie-Recommender