Data612 Final

Final Project

Book Recommender system

library(recommenderlab)

## Loading required package: Matrix

## Loading required package: arules

## Warning: package 'arules' was built under R version 3.6.2

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## Warning: package 'proxy' was built under R version 3.6.2

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: registry

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ readr   1.3.1
## ✔ tibble  2.1.3     ✔ purrr   0.3.2
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1     ✔ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ tidyr::expand()    masks Matrix::expand()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ tidyr::pack()      masks Matrix::pack()
## ✖ dplyr::recode()    masks arules::recode()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ tidyr::unpack()    masks Matrix::unpack()

library(ggplot2)
library(stringr)
library(DT)
library(knitr)
library(grid)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## corrplot 0.84 loaded

library(methods)
library(Matrix)

Dataset

For the final project I will be using goodbooks dataset from Kaggle.
The dataset contains ratings for ten thousand popular books. As to the source, let’s say that these ratings were found on the internet. Generally, there are 100 reviews for each book, although some have less - fewer - ratings. Ratings go from one to five. Both book IDs and user IDs are contiguous. For books, they are 1-10000, for users, 1-53424. All users have made at least two ratings. Median number of ratings per user is 8.There are also books marked to read by the users, book metadata (author, year, etc.) and tags.

books<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/books.csv")
ratings<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/ratings.csv")
btags<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/book_tags.csv")
tags<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/tags.csv")
head(books)

##    id  book_id best_book_id  work_id books_count      isbn       isbn13
## 1:  1  2767052      2767052  2792775         272 439023483 9.780439e+12
## 2:  2        3            3  4640799         491 439554934 9.780440e+12
## 3:  3    41865        41865  3212258         226 316015849 9.780316e+12
## 4:  4     2657         2657  3275794         487  61120081 9.780061e+12
## 5:  5     4671         4671   245494        1356 743273567 9.780743e+12
## 6:  6 11870085     11870085 16827462         226 525478817 9.780525e+12
##                        authors original_publication_year
## 1:             Suzanne Collins                      2008
## 2: J.K. Rowling, Mary GrandPré                      1997
## 3:             Stephenie Meyer                      2005
## 4:                  Harper Lee                      1960
## 5:         F. Scott Fitzgerald                      1925
## 6:                  John Green                      2012
##                              original_title
## 1:                         The Hunger Games
## 2: Harry Potter and the Philosopher's Stone
## 3:                                 Twilight
## 4:                    To Kill a Mockingbird
## 5:                         The Great Gatsby
## 6:                   The Fault in Our Stars
##                                                       title language_code
## 1:                  The Hunger Games (The Hunger Games, #1)           eng
## 2: Harry Potter and the Sorcerer's Stone (Harry Potter, #1)           eng
## 3:                                  Twilight (Twilight, #1)         en-US
## 4:                                    To Kill a Mockingbird           eng
## 5:                                         The Great Gatsby           eng
## 6:                                   The Fault in Our Stars           eng
##    average_rating ratings_count work_ratings_count work_text_reviews_count
## 1:           4.34       4780653            4942365                  155254
## 2:           4.44       4602479            4800065                   75867
## 3:           3.57       3866839            3916824                   95009
## 4:           4.25       3198671            3340896                   72586
## 5:           3.89       2683664            2773745                   51992
## 6:           4.26       2346404            2478609                  140739
##    ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1:     66715    127936    560092   1481305   2706317
## 2:     75504    101676    455024   1156318   3011543
## 3:    456191    436802    793319    875073   1355439
## 4:     60427    117415    446835   1001952   1714267
## 5:     86236    197621    606158    936012    947718
## 6:     47994     92723    327550    698471   1311871
##                                                      image_url
## 1:  https://images.gr-assets.com/books/1447303603m/2767052.jpg
## 2:        https://images.gr-assets.com/books/1474154022m/3.jpg
## 3:    https://images.gr-assets.com/books/1361039443m/41865.jpg
## 4:     https://images.gr-assets.com/books/1361975680m/2657.jpg
## 5:     https://images.gr-assets.com/books/1490528560m/4671.jpg
## 6: https://images.gr-assets.com/books/1360206420m/11870085.jpg
##                                                small_image_url
## 1:  https://images.gr-assets.com/books/1447303603s/2767052.jpg
## 2:        https://images.gr-assets.com/books/1474154022s/3.jpg
## 3:    https://images.gr-assets.com/books/1361039443s/41865.jpg
## 4:     https://images.gr-assets.com/books/1361975680s/2657.jpg
## 5:     https://images.gr-assets.com/books/1490528560s/4671.jpg
## 6: https://images.gr-assets.com/books/1360206420s/11870085.jpg

head(ratings)

##    book_id user_id rating
## 1:       1     314      5
## 2:       1     439      3
## 3:       1     588      5
## 4:       1    1169      4
## 5:       1    1185      4
## 6:       1    2077      4

head(btags)

##    goodreads_book_id tag_id  count
## 1:                 1  30574 167697
## 2:                 1  11305  37174
## 3:                 1  11557  34173
## 4:                 1   8717  12986
## 5:                 1  33114  12716
## 6:                 1  11743   9954

head(tags)

##    tag_id tag_name
## 1:      0        -
## 2:      1     --1-
## 3:      2    --10-
## 4:      3    --12-
## 5:      4   --122-
## 6:      5   --166-

books.csv has metadata for each book (goodreads IDs, authors, title, average rating, etc.).
ratings.csv contains book_id, user_id and ratings.
toread.csv provides IDs of the books marked “to read” by each user, as userid,book_id pairs.
book_tags.csv contains tags/shelves/genres assigned by users to books. Tags in this file are represented by their IDs.
tags.csv translates tag IDs to names.

Analyzing the data

Analyzing ratings

# removing duplicate ratings
ratings[, N := .N, .(user_id, book_id)]
ratings %>% group_by(user_id, book_id) %>% mutate(n=n())

## # A tibble: 981,756 x 5
## # Groups:   user_id, book_id [979,478]
##    book_id user_id rating     N     n
##      <int>   <int>  <int> <int> <int>
##  1       1     314      5     1     1
##  2       1     439      3     1     1
##  3       1     588      5     1     1
##  4       1    1169      4     1     1
##  5       1    1185      4     1     1
##  6       1    2077      4     1     1
##  7       1    2487      4     1     1
##  8       1    2900      5     1     1
##  9       1    3662      4     1     1
## 10       1    3922      5     1     1
## # … with 981,746 more rows

ratings %>% group_by(user_id)%>% mutate(n=n())

## # A tibble: 981,756 x 5
## # Groups:   user_id [53,424]
##    book_id user_id rating     N     n
##      <int>   <int>  <int> <int> <int>
##  1       1     314      5     1   181
##  2       1     439      3     1   177
##  3       1     588      5     1   186
##  4       1    1169      4     1   187
##  5       1    1185      4     1   190
##  6       1    2077      4     1   180
##  7       1    2487      4     1   193
##  8       1    2900      5     1   190
##  9       1    3662      4     1   185
## 10       1    3922      5     1   188
## # … with 981,746 more rows

ratings <- ratings[N == 1]

ratings %>% 
  ggplot(aes(x = rating, fill = factor(rating))) +
  geom_bar()+ guides(fill = FALSE) + scale_fill_brewer(palette="Dark2")

Acoording to chart, most of users rates books with 4 or 5.

Average of user’s ratings

ratings %>% 
  group_by(user_id) %>% 
  summarize(mean_user_rating = mean(rating)) %>% 
  ggplot(aes(mean_user_rating)) +
  geom_histogram(fill = "grey", color = "grey20")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Anaylzing average of user’s ratings has similar results. More users rates books with 3 and higher.

Analyzing top 10 books by average ratings

books %>% 
  arrange(-average_rating) %>% 
  top_n(10,wt = ratings_count) %>% 
  select(title, ratings_count, average_rating) %>% 
  datatable(class = "nowrap hover row-border", escape = FALSE, options = list(dom = 't',scrollX = TRUE, autoWidth = TRUE))

Building a model

ratings %>%
  group_by(user_id) %>%
  mutate(ind = row_number())

## # A tibble: 977,269 x 5
## # Groups:   user_id [53,380]
##    book_id user_id rating     N   ind
##      <int>   <int>  <int> <int> <int>
##  1       1     314      5     1     1
##  2       1     439      3     1     1
##  3       1     588      5     1     1
##  4       1    1169      4     1     1
##  5       1    1185      4     1     1
##  6       1    2077      4     1     1
##  7       1    2487      4     1     1
##  8       1    2900      5     1     1
##  9       1    3662      4     1     1
## 10       1    3922      5     1     1
## # … with 977,259 more rows

bookrate<-merge(ratings,books,by="book_id")
head(bookrate)

##    book_id user_id rating N id best_book_id  work_id books_count      isbn
## 1:       1     314      5 1 27            1 41335427         275 439785960
## 2:       1     439      3 1 27            1 41335427         275 439785960
## 3:       1     588      5 1 27            1 41335427         275 439785960
## 4:       1    1169      4 1 27            1 41335427         275 439785960
## 5:       1    1185      4 1 27            1 41335427         275 439785960
## 6:       1    2077      4 1 27            1 41335427         275 439785960
##         isbn13                     authors original_publication_year
## 1: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
## 2: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
## 3: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
## 4: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
## 5: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
## 6: 9.78044e+12 J.K. Rowling, Mary GrandPré                      2005
##                            original_title
## 1: Harry Potter and the Half-Blood Prince
## 2: Harry Potter and the Half-Blood Prince
## 3: Harry Potter and the Half-Blood Prince
## 4: Harry Potter and the Half-Blood Prince
## 5: Harry Potter and the Half-Blood Prince
## 6: Harry Potter and the Half-Blood Prince
##                                                        title language_code
## 1: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
## 2: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
## 3: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
## 4: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
## 5: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
## 6: Harry Potter and the Half-Blood Prince (Harry Potter, #6)           eng
##    average_rating ratings_count work_ratings_count work_text_reviews_count
## 1:           4.54       1678823            1785676                   27520
## 2:           4.54       1678823            1785676                   27520
## 3:           4.54       1678823            1785676                   27520
## 4:           4.54       1678823            1785676                   27520
## 5:           4.54       1678823            1785676                   27520
## 6:           4.54       1678823            1785676                   27520
##    ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1:      7308     21516    136333    459028   1161491
## 2:      7308     21516    136333    459028   1161491
## 3:      7308     21516    136333    459028   1161491
## 4:      7308     21516    136333    459028   1161491
## 5:      7308     21516    136333    459028   1161491
## 6:      7308     21516    136333    459028   1161491
##                                               image_url
## 1: https://images.gr-assets.com/books/1361039191m/1.jpg
## 2: https://images.gr-assets.com/books/1361039191m/1.jpg
## 3: https://images.gr-assets.com/books/1361039191m/1.jpg
## 4: https://images.gr-assets.com/books/1361039191m/1.jpg
## 5: https://images.gr-assets.com/books/1361039191m/1.jpg
## 6: https://images.gr-assets.com/books/1361039191m/1.jpg
##                                         small_image_url
## 1: https://images.gr-assets.com/books/1361039191s/1.jpg
## 2: https://images.gr-assets.com/books/1361039191s/1.jpg
## 3: https://images.gr-assets.com/books/1361039191s/1.jpg
## 4: https://images.gr-assets.com/books/1361039191s/1.jpg
## 5: https://images.gr-assets.com/books/1361039191s/1.jpg
## 6: https://images.gr-assets.com/books/1361039191s/1.jpg

bookrate1<-bookrate[,c(2,13,3)]
bookrate1$user_id<-as.character(bookrate1$user_id)
bookrate1$rating<-as.numeric(bookrate1$rating)
head(bookrate1)

##    user_id                         original_title rating
## 1:     314 Harry Potter and the Half-Blood Prince      5
## 2:     439 Harry Potter and the Half-Blood Prince      3
## 3:     588 Harry Potter and the Half-Blood Prince      5
## 4:    1169 Harry Potter and the Half-Blood Prince      4
## 5:    1185 Harry Potter and the Half-Blood Prince      4
## 6:    2077 Harry Potter and the Half-Blood Prince      4

hist(bookrate1$rating)

# convert dataframe into "realRatingMatrix"
bookrate1$user_id<-as.factor(bookrate1$user_id)
bookrate1$rating<-as.numeric(bookrate1$rating)
head(bookrate1)

##    user_id                         original_title rating
## 1:     314 Harry Potter and the Half-Blood Prince      5
## 2:     439 Harry Potter and the Half-Blood Prince      3
## 3:     588 Harry Potter and the Half-Blood Prince      5
## 4:    1169 Harry Potter and the Half-Blood Prince      4
## 5:    1185 Harry Potter and the Half-Blood Prince      4
## 6:    2077 Harry Potter and the Half-Blood Prince      4

bookrate2<-as(data.frame(bookrate1), 'realRatingMatrix')
head(bookrate2)

## 1 x 794 rating matrix of class 'realRatingMatrix' with 3 ratings.

Recommendation model

SVD model

model_SVD <- Recommender(bookrate2, method = "SVD", param = list(method = "pearson", nn = 4))

## Warning: Unknown parameters: method, nn

## Available parameter (with default values):
## k     =  10
## maxiter   =  100
## normalize     =  center
## verbose   =  FALSE

recommend 4 books for user 1004

pre<-predict(model_SVD,bookrate2[1004],n=4)
as(pre, "list")

## $`11564`
## [1] "The Power of One"                        
## [2] "Slouching Towards Bethlehem: Essays"     
## [3] "Harry Potter and the Philosopher's Stone"
## [4] "Memoria de mis putas tristes"

UBCF model

model_UBCF <- Recommender(bookrate2, method = "UBCF", param = list(method = "pearson", nn = 4))

recommend 4 books for user 1004

pre1<-predict(model_UBCF,bookrate2[1004],n=4)
as(pre1, "list")

## $`11564`
## [1] "A Midsummer Night's Dream"                                   
## [2] "Getting Things Done: How To Achieve Stress-free Productivity"
## [3] "Moneyball: The Art of Winning an Unfair Game"                
## [4] "The Time Machine"

Evaluating the predictions

Accuracy comparision by using error matrix between SVD and UBCF

dimension_names <- list(user_id = sort(unique(ratings$user_id)), book_id = sort(unique(ratings$book_id)))
ratingmat <- spread(select(ratings, book_id, user_id, rating), book_id, rating) %>% select(-user_id)
ratingmat <- as.matrix(ratingmat)
dimnames(ratingmat) <- dimension_names
ratingmat[1:5, 1:5]

##        book_id
## user_id  1  2  3  4  5
##       1 NA NA NA NA NA
##       2 NA NA NA NA NA
##       3 NA NA NA NA NA
##       4 NA NA NA NA NA
##       5 NA NA NA NA NA

Restructuring rating data from the dataset

ratingmat0 <- ratingmat
# Removing NAs 
ratingmat0[is.na(ratingmat0)] <- 0
sparse_ratings <- as(ratingmat0, "sparseMatrix")
rm(ratingmat0)
gc()

##             used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells   2815655  150.4    7122362   380.4         NA    7122362   380.4
## Vcells 314900731 2402.6 1326853016 10123.1      16384 1382049457 10544.3

Creating realRatingMatrix

real_ratings <- new("realRatingMatrix", data = sparse_ratings)
real_ratings

## 53380 x 10000 rating matrix of class 'realRatingMatrix' with 977269 ratings.

# evaluation scheme
scheme <- evaluationScheme(real_ratings[1:500,], method = "cross-validation", k = 10, given = -1, goodRating = 5)

## Warning in .local(data, ...): The following users do not have enough items
## leaving no given items: 41, 108, 189, 302, 324, 329

SVD train

svd_train <- Recommender(getData(scheme, "train"), "SVD")
svd_train

## Recommender of type 'SVD' for 'realRatingMatrix' 
## learned using 450 users.

SVD prediction

svd_prd <- predict(svd_train, getData(scheme, "known"), type= "ratings")
svd_prd

## 50 x 10000 rating matrix of class 'realRatingMatrix' with 479281 ratings.

UBCF train

UBCF_train <- Recommender(getData(scheme, "train"), "UBCF")
UBCF_train

## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 450 users.

UBCF prediction

UBCF_prd <- predict(UBCF_train, getData(scheme, "known"), type= "ratings")
UBCF_prd

## 50 x 10000 rating matrix of class 'realRatingMatrix' with 335564 ratings.

Compare accuracy by metrics

error <- rbind(
    SVD = calcPredictionAccuracy(svd_prd, getData(scheme, "unknown")),
    UBCF = calcPredictionAccuracy(UBCF_prd, getData(scheme, "unknown"))
    )
error

##           RMSE       MSE       MAE
## SVD  1.1139352 1.2408517 0.9052800
## UBCF 0.9371241 0.8782016 0.7560911

According to RMSE of each method, UBCF is more accurate recommendation model than SVD.