Contributions:
All group members contributed equally to writing the code, discussion, updating the code and submission.
Loading the library packages.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(DT)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(Matrix)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
myurl = "https://liangfgithub.github.io/MovieData/"
set.seed(4588)
# use colClasses = 'NULL' to skip columns
ratings = read.csv(paste0(myurl, 'ratings.dat?raw=true'),
sep = ':',
colClasses = c('integer', 'NULL'),
header = FALSE)
colnames(ratings) = c('UserID', 'MovieID', 'Rating', 'Timestamp')
ratings$Timestamp = NULL
movies = readLines(paste0(myurl, 'movies.dat?raw=true'))
movies = strsplit(movies, split = "::", fixed = TRUE, useBytes = TRUE)
movies = matrix(unlist(movies), ncol = 3, byrow = TRUE)
movies = data.frame(movies, stringsAsFactors = FALSE)
colnames(movies) = c('MovieID', 'Title', 'Genres')
movies$MovieID = as.integer(movies$MovieID)
# convert accented characters
movies$Title[73]
## [1] "Mis\xe9rables, Les (1995)"
movies$Title = iconv(movies$Title, "latin1", "UTF-8")
movies$Title[73]
## [1] "Misérables, Les (1995)"
# extract year
movies$Year = as.numeric(unlist(
lapply(movies$Title, function(x) substr(x, nchar(x)-4, nchar(x)-1))))
users = read.csv(paste0(myurl, 'users.dat?raw=true'),
sep = ':', header = FALSE)
users = users[, -c(2,4,6,8)] # skip columns
colnames(users) = c('UserID', 'Gender', 'Age', 'Occupation', 'Zip-code')
Here, the code read the data and construct three matrices: Ratings, movies, and users which have the following features:
head(ratings)
## UserID MovieID Rating
## 1 1 1193 5
## 2 1 661 3
## 3 1 914 3
## 4 1 3408 4
## 5 1 2355 5
## 6 1 1197 3
head(movies)
## MovieID Title Genres Year
## 1 1 Toy Story (1995) Animation|Children's|Comedy 1995
## 2 2 Jumanji (1995) Adventure|Children's|Fantasy 1995
## 3 3 Grumpier Old Men (1995) Comedy|Romance 1995
## 4 4 Waiting to Exhale (1995) Comedy|Drama 1995
## 5 5 Father of the Bride Part II (1995) Comedy 1995
## 6 6 Heat (1995) Action|Crime|Thriller 1995
head(users)
## UserID Gender Age Occupation Zip-code
## 1 1 F 1 10 48067
## 2 2 M 56 16 70072
## 3 3 M 25 15 55117
## 4 4 M 45 7 02460
## 5 5 M 25 20 55455
## 6 6 F 50 9 55117
dim(users)
## [1] 6040 5
length(unique(ratings$UserID))
## [1] 6040
dim(movies)
## [1] 3883 4
length(unique(ratings$MovieID))
## [1] 3706
movies_not_rated = movies %>%
filter(!(MovieID %in% ratings$MovieID))
dim(movies_not_rated)
## [1] 177 4
r_orig = ratings
#System 1
In this section, we are going to apply two recommendation schemes based on genres. First of all, we create a bibary indicator for the 18 genres for each movie
genres = as.data.frame(movies$Genres, stringsAsFactors=FALSE)
tmp = as.data.frame(tstrsplit(genres[,1], '[|]',
type.convert=TRUE),
stringsAsFactors=FALSE)
genre_list = c("Action", "Adventure", "Animation",
"Children's", "Comedy", "Crime",
"Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical",
"Mystery", "Romance", "Sci-Fi",
"Thriller", "War", "Western")
m = length(genre_list)
genre_matrix = matrix(0, nrow(movies), length(genre_list))
for(i in 1:nrow(tmp)){
genre_matrix[i,genre_list %in% tmp[i,]]=1
}
colnames(genre_matrix) = genre_list
remove("tmp", "genres")
colnames(genre_matrix)[4]="Childrens"
colnames(genre_matrix)[10]="FilmNoir"
colnames(genre_matrix)[15]="SciFi"
save(genre_matrix, file = "System1.rda")
since users voting might not be similar in voting, the votes of each user are averaged and subtracted from the standard deviation (zscore). However, a simple average data were also calculated and will be reported at the end.
ratings1= ratings %>%
group_by(UserID) %>%
summarize(average = mean(Rating), sd= sd(Rating)) %>% left_join(ratings, by = 'UserID')
ratings$Rating_z = (ratings1$Rating - ratings1$average)/ ratings1$sd
Now, we are going to add number of ratings per user to our data frame
tmp = ratings %>%
group_by(UserID) %>%
summarize(ratings_per_user = n())
summary(tmp$ratings_per_user)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 44.0 96.0 165.6 208.0 2314.0
stem(tmp$ratings_per_user)
##
## The decimal point is 2 digit(s) to the right of the |
##
## 0 | 22222222222222222222222222222222222222222222222222222222222222222222+2922
## 1 | 00000000000000000000000000000000000000000000000000000000000000000000+1323
## 2 | 00000000000000000000000000000000000000000000000000000000000000000000+560
## 3 | 00000000000000000000000000000000000000000000000000000000000000011111+293
## 4 | 00000000000000000000000000000000000000000111111111111111111111111111+138
## 5 | 00000000000000011111111111111111111111112222222222222233333333333333+61
## 6 | 00000001111111111112222222222333333333333334444444555556666777777777+2
## 7 | 000000001111112222222333333333334444444555566666677777788888999999
## 8 | 0000112222222333333444444455555566667777777888899999
## 9 | 001122233444566667789
## 10 | 00011122224558
## 11 | 245678
## 12 | 1222234466789
## 13 | 024
## 14 |
## 15 | 22
## 16 | 0
## 17 | 4
## 18 | 5
## 19 |
## 20 |
## 21 |
## 22 |
## 23 | 1
sum(tmp$ratings_per_user > 500)
## [1] 396
sort(tmp$ratings_per_user[tmp$ratings_per_user>1300])
## [1] 1302 1323 1344 1518 1521 1595 1743 1850 2314
tmp = tmp %>% full_join(ratings, by = 'UserID')
Here we have the completed list of movies, users and genres:
Complete_list = tmp %>%
left_join(data.frame(MovieID = movies$MovieID, genre_matrix),
by = "MovieID")
Now, two recommendation schemes are going to be selected. The first one is the popularity which measured by the number of voting per movie in that genres. Rating_per_movie parameter is calculated for this step. The next scheme is the average rating of users for a movie in that genere (top best movie).
top_list = Complete_list %>%
group_by(MovieID) %>%
summarize(ratings_per_movie = n(), ave_ratings = mean(Rating), ave_ratings_normalized = mean(Rating_z)) %>%
inner_join(movies, by = 'MovieID') %>%
left_join(Complete_list, by = 'MovieID') %>%
select(-c("Genres", "Year"))
Now, for better recommendation, we filter the data for Rating>=3. this is important for popularity measurement since we assume that the number of votes is indicator of popularity. Thus, positive votes (votes>=3) demonstrate the popularity. This filter doesnt change the second scheme (average voting) as it calculated in previous lines and doesnt change. However, for the second scheme, we want to make sure that the average rating corresponds to great amount of people; thus, we put a condition of ratings_per_movie >= 50 to make sure that at least 50 people vote for a movie.
top_list =top_list %>% filter(Rating >= 3) %>% filter(ratings_per_movie >= 50)
For example, Imagine we are going to recommend top 5 movies in “Horror” genre. Here are our recommendation based on
unique(top_list %>% filter(Horror == 1) %>%
arrange(desc = -ratings_per_movie) %>%
select(c("Title", "ratings_per_movie")) )%>%
slice(1:5)%>% print (n=5)
## # A tibble: 5 × 2
## Title ratings_per_movie
## <chr> <int>
## 1 Ghostbusters (1984) 2181
## 2 Alien (1979) 2024
## 3 Jaws (1975) 1697
## 4 Psycho (1960) 1263
## 5 Blair Witch Project, The (1999) 1237
unique(top_list%>% filter(Horror == 1) %>%
arrange(desc = -ave_ratings_normalized) %>%
select(c("Title", "ave_ratings", "ave_ratings_normalized")) ) %>%
slice(1:5)%>% print (n=5)
## # A tibble: 5 × 3
## Title ave_ratings ave_ratings_normalized
## <chr> <dbl> <dbl>
## 1 Young Frankenstein (1974) 4.25 0.578
## 2 Psycho (1960) 4.22 0.527
## 3 Alien (1979) 4.16 0.466
## 4 Shining, The (1980) 4.10 0.450
## 5 Jaws (1975) 4.09 0.407
It can be seen that normalized average rating and average rating for horror movies are not different but it might be different for other genres.
Now, we can wrap up things and put the results here for all genres
bind_rows ( cbind (War,Genres="War") ,
cbind (Western,Genres="Western"),
cbind (Thriller,Genres="Thriller"),
cbind (SciFi,Genres="Sci-Fi"),
cbind (FilmNoir,Genres="Film-Noir"),
cbind (Musical,Genres="Musical"),
cbind (Fantasy,Genres="Fantasy"),
cbind (Drama,Genres="Drama"),
cbind (Documentary,Genres="Documentary"),
cbind (Childrens,Genres="Childrens"),
cbind (Crime,Genres="Crime"),
cbind (Comedy,Genres="Comedy"),
cbind (Animation,Genres="Animation"),
cbind (Adventure,Genres="Adventure"),
cbind (Action,Genres="Action"),
cbind (Horror,Genres="Horror"))
## Title ratings_per_movie
## 1 Star Wars: Episode V - The Empire Strikes Back (1980) 2990
## 2 Star Wars: Episode VI - Return of the Jedi (1983) 2883
## 3 Saving Private Ryan (1998) 2653
## 4 Braveheart (1995) 2443
## 5 Schindler's List (1993) 2304
## 6 Dances with Wolves (1990) 1451
## 7 Butch Cassidy and the Sundance Kid (1969) 1419
## 8 Back to the Future Part III (1990) 1148
## 9 Blazing Saddles (1974) 1119
## 10 Unforgiven (1992) 997
## 11 Terminator 2: Judgment Day (1991) 2649
## 12 Matrix, The (1999) 2590
## 13 Silence of the Lambs, The (1991) 2578
## 14 Fargo (1996) 2513
## 15 Sixth Sense, The (1999) 2459
## 16 Star Wars: Episode IV - A New Hope (1977) 2991
## 17 Star Wars: Episode V - The Empire Strikes Back (1980) 2990
## 18 Star Wars: Episode VI - Return of the Jedi (1983) 2883
## 19 Jurassic Park (1993) 2672
## 20 Terminator 2: Judgment Day (1991) 2649
## 21 L.A. Confidential (1997) 2288
## 22 Blade Runner (1982) 1800
## 23 Who Framed Roger Rabbit? (1988) 1799
## 24 Chinatown (1974) 1185
## 25 Maltese Falcon, The (1941) 1043
## 26 Wizard of Oz, The (1939) 1718
## 27 Aladdin (1992) 1351
## 28 Blues Brothers, The (1980) 1341
## 29 Rocky Horror Picture Show, The (1975) 1233
## 30 Lion King, The (1994) 1121
## 31 Star Wars: Episode IV - A New Hope (1977) 2991
## 32 E.T. the Extra-Terrestrial (1982) 2269
## 33 Star Wars: Episode I - The Phantom Menace (1999) 2250
## 34 Beetlejuice (1988) 1495
## 35 Big (1988) 1491
## 36 American Beauty (1999) 3428
## 37 Star Wars: Episode V - The Empire Strikes Back (1980) 2990
## 38 Saving Private Ryan (1998) 2653
## 39 Silence of the Lambs, The (1991) 2578
## 40 Fargo (1996) 2513
## 41 Roger & Me (1989) 798
## 42 Hoop Dreams (1994) 716
## 43 Crumb (1994) 491
## 44 American Movie (1999) 295
## 45 When We Were Kings (1996) 277
## 46 E.T. the Extra-Terrestrial (1982) 2269
## 47 Toy Story (1995) 2077
## 48 Babe (1995) 1751
## 49 Wizard of Oz, The (1939) 1718
## 50 Bug's Life, A (1998) 1703
## 51 Fargo (1996) 2513
## 52 L.A. Confidential (1997) 2288
## 53 Godfather, The (1972) 2223
## 54 Pulp Fiction (1994) 2171
## 55 Usual Suspects, The (1995) 1783
## 56 American Beauty (1999) 3428
## 57 Back to the Future (1985) 2583
## 58 Men in Black (1997) 2538
## 59 Shakespeare in Love (1998) 2369
## 60 Princess Bride, The (1987) 2318
## 61 Toy Story (1995) 2077
## 62 Who Framed Roger Rabbit? (1988) 1799
## 63 Bug's Life, A (1998) 1703
## 64 Toy Story 2 (1999) 1585
## 65 Aladdin (1992) 1351
## 66 Star Wars: Episode IV - A New Hope (1977) 2991
## 67 Star Wars: Episode V - The Empire Strikes Back (1980) 2990
## 68 Star Wars: Episode VI - Return of the Jedi (1983) 2883
## 69 Jurassic Park (1993) 2672
## 70 Men in Black (1997) 2538
## 71 Star Wars: Episode IV - A New Hope (1977) 2991
## 72 Star Wars: Episode V - The Empire Strikes Back (1980) 2990
## 73 Star Wars: Episode VI - Return of the Jedi (1983) 2883
## 74 Jurassic Park (1993) 2672
## 75 Saving Private Ryan (1998) 2653
## 76 Ghostbusters (1984) 2181
## 77 Alien (1979) 2024
## 78 Jaws (1975) 1697
## 79 Psycho (1960) 1263
## 80 Blair Witch Project, The (1999) 1237
## Genres
## 1 War
## 2 War
## 3 War
## 4 War
## 5 War
## 6 Western
## 7 Western
## 8 Western
## 9 Western
## 10 Western
## 11 Thriller
## 12 Thriller
## 13 Thriller
## 14 Thriller
## 15 Thriller
## 16 Sci-Fi
## 17 Sci-Fi
## 18 Sci-Fi
## 19 Sci-Fi
## 20 Sci-Fi
## 21 Film-Noir
## 22 Film-Noir
## 23 Film-Noir
## 24 Film-Noir
## 25 Film-Noir
## 26 Musical
## 27 Musical
## 28 Musical
## 29 Musical
## 30 Musical
## 31 Fantasy
## 32 Fantasy
## 33 Fantasy
## 34 Fantasy
## 35 Fantasy
## 36 Drama
## 37 Drama
## 38 Drama
## 39 Drama
## 40 Drama
## 41 Documentary
## 42 Documentary
## 43 Documentary
## 44 Documentary
## 45 Documentary
## 46 Childrens
## 47 Childrens
## 48 Childrens
## 49 Childrens
## 50 Childrens
## 51 Crime
## 52 Crime
## 53 Crime
## 54 Crime
## 55 Crime
## 56 Comedy
## 57 Comedy
## 58 Comedy
## 59 Comedy
## 60 Comedy
## 61 Animation
## 62 Animation
## 63 Animation
## 64 Animation
## 65 Animation
## 66 Adventure
## 67 Adventure
## 68 Adventure
## 69 Adventure
## 70 Adventure
## 71 Action
## 72 Action
## 73 Action
## 74 Action
## 75 Action
## 76 Horror
## 77 Horror
## 78 Horror
## 79 Horror
## 80 Horror
Now, we can wrap up things and put the results here for all genres
bind_rows ( cbind (War,Genres="War") ,
cbind (Western,Genres="Western"),
cbind (Thriller,Genres="Thriller"),
cbind (SciFi,Genres="Sci-Fi"),
cbind (FilmNoir,Genres="Film-Noir"),
cbind (Musical,Genres="Musical"),
cbind (Fantasy,Genres="Fantasy"),
cbind (Drama,Genres="Drama"),
cbind (Documentary,Genres="Documentary"),
cbind (Childrens,Genres="Childrens"),
cbind (Crime,Genres="Crime"),
cbind (Comedy,Genres="Comedy"),
cbind (Animation,Genres="Animation"),
cbind (Adventure,Genres="Adventure"),
cbind (Action,Genres="Action"),
cbind (Horror,Genres="Horror"))
## Title
## 1 Schindler's List (1993)
## 2 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 3 Paths of Glory (1957)
## 4 Casablanca (1942)
## 5 Bridge on the River Kwai, The (1957)
## 6 Yojimbo (1961)
## 7 High Noon (1952)
## 8 Butch Cassidy and the Sundance Kid (1969)
## 9 Searchers, The (1956)
## 10 Good, The Bad and The Ugly, The (1966)
## 11 Usual Suspects, The (1995)
## 12 Close Shave, A (1995)
## 13 Rear Window (1954)
## 14 Sixth Sense, The (1999)
## 15 Third Man, The (1949)
## 16 Star Wars: Episode IV - A New Hope (1977)
## 17 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 18 Matrix, The (1999)
## 19 Star Wars: Episode V - The Empire Strikes Back (1980)
## 20 Blade Runner (1982)
## 21 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
## 22 Maltese Falcon, The (1941)
## 23 Double Indemnity (1944)
## 24 Chinatown (1974)
## 25 Notorious (1946)
## 26 Singin' in the Rain (1952)
## 27 Wizard of Oz, The (1939)
## 28 This Is Spinal Tap (1984)
## 29 My Fair Lady (1964)
## 30 Shall We Dance? (1937)
## 31 Star Wars: Episode IV - A New Hope (1977)
## 32 E.T. the Extra-Terrestrial (1982)
## 33 Watership Down (1978)
## 34 Willy Wonka and the Chocolate Factory (1971)
## 35 Big (1988)
## 36 Shawshank Redemption, The (1994)
## 37 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 38 Godfather, The (1972)
## 39 Schindler's List (1993)
## 40 Paths of Glory (1957)
## 41 When We Were Kings (1996)
## 42 Paradise Lost: The Child Murders at Robin Hood Hills (1996)
## 43 Thin Blue Line, The (1988)
## 44 Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)
## 45 42 Up (1998)
## 46 Wizard of Oz, The (1939)
## 47 Toy Story 2 (1999)
## 48 Toy Story (1995)
## 49 Winnie the Pooh and the Blustery Day (1968)
## 50 Iron Giant, The (1999)
## 51 Usual Suspects, The (1995)
## 52 Godfather, The (1972)
## 53 Double Indemnity (1944)
## 54 Godfather: Part II, The (1974)
## 55 Sting, The (1973)
## 56 Wrong Trousers, The (1993)
## 57 Close Shave, A (1995)
## 58 General, The (1927)
## 59 Yojimbo (1961)
## 60 Grand Day Out, A (1992)
## 61 Wrong Trousers, The (1993)
## 62 Close Shave, A (1995)
## 63 Wallace & Gromit: The Best of Aardman Animation (1996)
## 64 Grand Day Out, A (1992)
## 65 Creature Comforts (1990)
## 66 Sanjuro (1962)
## 67 Raiders of the Lost Ark (1981)
## 68 Star Wars: Episode IV - A New Hope (1977)
## 69 Lawrence of Arabia (1962)
## 70 Great Escape, The (1963)
## 71 Sanjuro (1962)
## 72 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 73 Godfather, The (1972)
## 74 Raiders of the Lost Ark (1981)
## 75 Star Wars: Episode IV - A New Hope (1977)
## 76 Young Frankenstein (1974)
## 77 Psycho (1960)
## 78 Alien (1979)
## 79 Shining, The (1980)
## 80 Jaws (1975)
## ave_ratings ave_ratings_normalized Genres
## 1 4.510417 0.8028872 War
## 2 4.449890 0.7362226 War
## 3 4.473913 0.7186578 War
## 4 4.412822 0.6965618 War
## 5 4.386994 0.6840655 War
## 6 4.404651 0.6738382 Western
## 7 4.178660 0.5056460 Western
## 8 4.215645 0.5008573 Western
## 9 4.085714 0.4530497 Western
## 10 4.133820 0.4383992 Western
## 11 4.517106 0.8222305 Thriller
## 12 4.520548 0.7980935 Thriller
## 13 4.476190 0.7405397 Thriller
## 14 4.406263 0.7314324 Thriller
## 15 4.452083 0.7110159 Thriller
## 16 4.453694 0.7439864 Sci-Fi
## 17 4.449890 0.7362226 Sci-Fi
## 18 4.315830 0.6469376 Sci-Fi
## 19 4.292977 0.5789537 Sci-Fi
## 20 4.273333 0.5713030 Sci-Fi
## 21 4.491489 0.7499945 Film-Noir
## 22 4.395973 0.6779888 Film-Noir
## 23 4.415608 0.6729506 Film-Noir
## 24 4.339241 0.5981822 Film-Noir
## 25 4.294382 0.5849489 Film-Noir
## 26 4.283622 0.5681876 Musical
## 27 4.247963 0.5504546 Musical
## 28 4.179785 0.4972154 Musical
## 29 4.154088 0.4618933 Musical
## 30 4.165714 0.4360846 Musical
## 31 4.453694 0.7439864 Fantasy
## 32 3.965183 0.2957287 Fantasy
## 33 3.842623 0.2482696 Fantasy
## 34 3.861386 0.2355942 Fantasy
## 35 3.855801 0.2261077 Fantasy
## 36 4.554558 0.8417957 Drama
## 37 4.560510 0.8337012 Drama
## 38 4.524966 0.8043933 Drama
## 39 4.510417 0.8028872 Drama
## 40 4.473913 0.7186578 Drama
## 41 4.321300 0.5991014 Documentary
## 42 4.213836 0.5862466 Documentary
## 43 4.252788 0.5221005 Documentary
## 44 4.095745 0.5127087 Documentary
## 45 4.227273 0.5072586 Documentary
## 46 4.247963 0.5504546 Childrens
## 47 4.218927 0.5426468 Childrens
## 48 4.146846 0.4717375 Childrens
## 49 3.986425 0.4219000 Childrens
## 50 4.047478 0.4058730 Childrens
## 51 4.517106 0.8222305 Crime
## 52 4.524966 0.8043933 Crime
## 53 4.415608 0.6729506 Crime
## 54 4.357565 0.6440232 Crime
## 55 4.320305 0.6168535 Crime
## 56 4.507937 0.7989103 Comedy
## 57 4.520548 0.7980935 Comedy
## 58 4.368932 0.6874155 Comedy
## 59 4.404651 0.6738382 Comedy
## 60 4.361522 0.6490756 Comedy
## 61 4.507937 0.7989103 Animation
## 62 4.520548 0.7980935 Animation
## 63 4.426941 0.7078950 Animation
## 64 4.361522 0.6490756 Animation
## 65 4.335766 0.6416745 Animation
## 66 4.608696 0.9394756 Adventure
## 67 4.477725 0.7742946 Adventure
## 68 4.453694 0.7439864 Adventure
## 69 4.401925 0.6789148 Adventure
## 70 4.376437 0.6641672 Adventure
## 71 4.608696 0.9394756 Action
## 72 4.560510 0.8337012 Action
## 73 4.524966 0.8043933 Action
## 74 4.477725 0.7742946 Action
## 75 4.453694 0.7439864 Action
## 76 4.250629 0.5777589 Horror
## 77 4.218527 0.5271435 Horror
## 78 4.159585 0.4655716 Horror
## 79 4.104876 0.4496369 Horror
## 80 4.089570 0.4072202 Horror
#System 2
First create a utility matrix stored as a sparse matrix.
train=r_orig
i = paste0('u', train$UserID)
j = paste0('m', train$MovieID)
x = train$Rating
tmp = data.frame(i, j, x, stringsAsFactors = T)
Rmat = sparseMatrix(as.integer(tmp$i), as.integer(tmp$j), x = tmp$x)
rownames(Rmat) = levels(tmp$i)
colnames(Rmat) = levels(tmp$j)
Rmat = new('realRatingMatrix', data = Rmat)
#realmat = as(Rmat,'matrix')
Rmat_m = normalize (Rmat)
Rmat_z = normalize (Rmat,method="Z-score")
#getRatingMatrix(Rmat_m)
#denormalize for reverse
hist (getRatings(Rmat),breaks=15)
hist (getRatings(Rmat_m),breaks=15)
hist (getRatings(Rmat_z),breaks=15)
hist (rowCounts(Rmat), breaks=10)
hist (colMeans(Rmat), breaks=10)
For user based collaborative filtering (UBCF), the similarity between users normally evalauted based on KNN. In UBCF, there is no training. Computation is needed at the prediction stage when new data is provided. The key computation cost is the similarity between (#_new_users) and (#_training_users) over M items, as well as the corresponding sorting for finding the top nearest neighbors. The computation cost could be much less with the evaluation scheme provided in recommenderlab when (#_new_users) is much less than N (#_of training_users).
For item based collaborative filtering (IBCF), the main computation occurs in the training, which is the computation of the item-to-item M-by-M similarity matrix as well as the related sorting. This also presents a memory challenge, since the large rating matrix may not even be able to be loaded into memory; see the comment at line 96 at the link below “this might not fit into memory! Maybe use a sample?” For IBCF, once the item-to-item similarity matrix is computed, prediction involves just efficient matrix products. IBCF seems to be a less-favored method in terms of computation. However, if we can compute the item similarity matrix offline, then IBCF becomes more efficient than UBCF.
esSplit <- evaluationScheme(Rmat, method = "split", train=0.8, given= 15 , k=10, goodRating = 5)
algorithms <- list("UBCF" = list(name = "UBCF", param = list(nn = 5) ),
"IBCF" = list(name = "IBCF")
)
results <- evaluate(esSplit, algorithms, type = "ratings")
## UBCF run fold/sample [model time/prediction time]
## 1 [0.041sec/49.594sec]
## 2 [0.029sec/49.745sec]
## 3 [0.03sec/49.833sec]
## 4 [0.031sec/50.01sec]
## 5 [0.03sec/49.902sec]
## 6 [0.03sec/49.917sec]
## 7 [0.031sec/50.576sec]
## 8 [0.031sec/49.504sec]
## 9 [0.028sec/49.629sec]
## 10 [0.032sec/49.745sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [83.14sec/0.332sec]
## 2 [83.481sec/0.208sec]
## 3 [83.337sec/0.213sec]
## 4 [83.68sec/0.221sec]
## 5 [83.622sec/0.236sec]
## 6 [83.458sec/0.263sec]
## 7 [83.294sec/0.349sec]
## 8 [83.639sec/0.294sec]
## 9 [83.719sec/0.224sec]
## 10 [84.825sec/0.239sec]
In the following section, we will save our model as an rda object to be used by the shiny app. We used modified Prof. shiny app from Alvin Yang with the following link: https://github.com/alvin-yang68/Movie-Recommender
We changed their model for the recommended system with our model and run the app.
rr= Recommender(Rmat, method = "IBCF")
save(rr, file = "System_amir.rda")
#SVD_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$SVD)))["RMSE"]
UBCF_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$UBCF)))["RMSE"]
IBCF_RMSE = as.data.frame(do.call(rbind,getConfusionMatrix(results$IBCF)))["RMSE"]
#names(SVD_RMSE)="RMSE for SVD"
names(IBCF_RMSE)="RMSE for IBCF"
names(UBCF_RMSE)="RMSE for UBCF"
Now, we export the RMSE for each method for all 10 iterations below
RMSE_total = cbind (1:10,IBCF_RMSE,UBCF_RMSE)
RMSE_total
## 1:10 RMSE for IBCF RMSE for UBCF
## 1 1 1.606655 1.228546
## 2 2 1.542282 1.223350
## 3 3 1.521209 1.247617
## 4 4 1.621293 1.228569
## 5 5 Inf 1.235517
## 6 6 1.533084 1.239835
## 7 7 1.631665 1.221509
## 8 8 1.516894 1.244837
## 9 9 1.458777 1.216719
## 10 10 1.718919 1.235456
and we can show them in a graph as below
tmp <- lapply(results, function(x) slot(x, "results"))
res <- tmp %>%
lapply(function(x) unlist(lapply(x, function(x) unlist(x@cm[ ,"RMSE"])))) %>%
as.data.frame() %>%
gather(key = "Algorithm", value = "RMSE")
res %>%
ggplot(aes(Algorithm, RMSE, fill = Algorithm)) +
geom_bar(stat = "summary") + geom_errorbar(stat = "summary", width = 0.3, size = 0.8) +
coord_cartesian(ylim = c(0.6, 1.3)) + guides(fill = FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## No summary function supplied, defaulting to `mean_se()`
As it was expected, the results for UBCF has lowest RMSE with respect to IBCF.
Refrences:
Shiny app for movie recommender system: