library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.2
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
## Warning: package 'proxy' was built under R version 3.6.2
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1 ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(ggplot2)
library(stringr)
library(DT)
library(knitr)
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
library(methods)
library(Matrix)
For the final project I will be using goodbooks dataset from Kaggle.
The dataset contains ratings for ten thousand popular books. As to the source, let’s say that these ratings were found on the internet. Generally, there are 100 reviews for each book, although some have less - fewer - ratings. Ratings go from one to five. Both book IDs and user IDs are contiguous. For books, they are 1-10000, for users, 1-53424. All users have made at least two ratings. Median number of ratings per user is 8.There are also books marked to read by the users, book metadata (author, year, etc.) and tags.
books<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/books.csv")
ratings<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/ratings.csv")
btags<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/book_tags.csv")
tags<-fread("https://raw.githubusercontent.com/ekhahm/Data612/master/Final%20project/tags.csv")
head(books)
## id book_id best_book_id work_id books_count isbn isbn13
## 1: 1 2767052 2767052 2792775 272 439023483 9.780439e+12
## 2: 2 3 3 4640799 491 439554934 9.780440e+12
## 3: 3 41865 41865 3212258 226 316015849 9.780316e+12
## 4: 4 2657 2657 3275794 487 61120081 9.780061e+12
## 5: 5 4671 4671 245494 1356 743273567 9.780743e+12
## 6: 6 11870085 11870085 16827462 226 525478817 9.780525e+12
## authors original_publication_year
## 1: Suzanne Collins 2008
## 2: J.K. Rowling, Mary GrandPré 1997
## 3: Stephenie Meyer 2005
## 4: Harper Lee 1960
## 5: F. Scott Fitzgerald 1925
## 6: John Green 2012
## original_title
## 1: The Hunger Games
## 2: Harry Potter and the Philosopher's Stone
## 3: Twilight
## 4: To Kill a Mockingbird
## 5: The Great Gatsby
## 6: The Fault in Our Stars
## title language_code
## 1: The Hunger Games (The Hunger Games, #1) eng
## 2: Harry Potter and the Sorcerer's Stone (Harry Potter, #1) eng
## 3: Twilight (Twilight, #1) en-US
## 4: To Kill a Mockingbird eng
## 5: The Great Gatsby eng
## 6: The Fault in Our Stars eng
## average_rating ratings_count work_ratings_count work_text_reviews_count
## 1: 4.34 4780653 4942365 155254
## 2: 4.44 4602479 4800065 75867
## 3: 3.57 3866839 3916824 95009
## 4: 4.25 3198671 3340896 72586
## 5: 3.89 2683664 2773745 51992
## 6: 4.26 2346404 2478609 140739
## ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1: 66715 127936 560092 1481305 2706317
## 2: 75504 101676 455024 1156318 3011543
## 3: 456191 436802 793319 875073 1355439
## 4: 60427 117415 446835 1001952 1714267
## 5: 86236 197621 606158 936012 947718
## 6: 47994 92723 327550 698471 1311871
## image_url
## 1: https://images.gr-assets.com/books/1447303603m/2767052.jpg
## 2: https://images.gr-assets.com/books/1474154022m/3.jpg
## 3: https://images.gr-assets.com/books/1361039443m/41865.jpg
## 4: https://images.gr-assets.com/books/1361975680m/2657.jpg
## 5: https://images.gr-assets.com/books/1490528560m/4671.jpg
## 6: https://images.gr-assets.com/books/1360206420m/11870085.jpg
## small_image_url
## 1: https://images.gr-assets.com/books/1447303603s/2767052.jpg
## 2: https://images.gr-assets.com/books/1474154022s/3.jpg
## 3: https://images.gr-assets.com/books/1361039443s/41865.jpg
## 4: https://images.gr-assets.com/books/1361975680s/2657.jpg
## 5: https://images.gr-assets.com/books/1490528560s/4671.jpg
## 6: https://images.gr-assets.com/books/1360206420s/11870085.jpg
head(ratings)
## book_id user_id rating
## 1: 1 314 5
## 2: 1 439 3
## 3: 1 588 5
## 4: 1 1169 4
## 5: 1 1185 4
## 6: 1 2077 4
head(btags)
## goodreads_book_id tag_id count
## 1: 1 30574 167697
## 2: 1 11305 37174
## 3: 1 11557 34173
## 4: 1 8717 12986
## 5: 1 33114 12716
## 6: 1 11743 9954
head(tags)
## tag_id tag_name
## 1: 0 -
## 2: 1 --1-
## 3: 2 --10-
## 4: 3 --12-
## 5: 4 --122-
## 6: 5 --166-
books.csv has metadata for each book (goodreads IDs, authors, title, average rating, etc.).
ratings.csv contains book_id, user_id and ratings.
toread.csv provides IDs of the books marked “to read” by each user, as userid,book_id pairs.
book_tags.csv contains tags/shelves/genres assigned by users to books. Tags in this file are represented by their IDs.
tags.csv translates tag IDs to names.
# removing duplicate ratings
ratings[, N := .N, .(user_id, book_id)]
ratings %>% group_by(user_id, book_id) %>% mutate(n=n())
## # A tibble: 981,756 x 5
## # Groups: user_id, book_id [979,478]
## book_id user_id rating N n
## <int> <int> <int> <int> <int>
## 1 1 314 5 1 1
## 2 1 439 3 1 1
## 3 1 588 5 1 1
## 4 1 1169 4 1 1
## 5 1 1185 4 1 1
## 6 1 2077 4 1 1
## 7 1 2487 4 1 1
## 8 1 2900 5 1 1
## 9 1 3662 4 1 1
## 10 1 3922 5 1 1
## # … with 981,746 more rows
ratings %>% group_by(user_id)%>% mutate(n=n())
## # A tibble: 981,756 x 5
## # Groups: user_id [53,424]
## book_id user_id rating N n
## <int> <int> <int> <int> <int>
## 1 1 314 5 1 181
## 2 1 439 3 1 177
## 3 1 588 5 1 186
## 4 1 1169 4 1 187
## 5 1 1185 4 1 190
## 6 1 2077 4 1 180
## 7 1 2487 4 1 193
## 8 1 2900 5 1 190
## 9 1 3662 4 1 185
## 10 1 3922 5 1 188
## # … with 981,746 more rows
ratings <- ratings[N == 1]
ratings %>%
ggplot(aes(x = rating, fill = factor(rating))) +
geom_bar()+ guides(fill = FALSE) + scale_fill_brewer(palette="Dark2")
Acoording to chart, most of users rates books with 4 or 5.
ratings %>%
group_by(user_id) %>%
summarize(mean_user_rating = mean(rating)) %>%
ggplot(aes(mean_user_rating)) +
geom_histogram(fill = "grey", color = "grey20")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Anaylzing average of user’s ratings has similar results. More users rates books with 3 and higher.
books %>%
arrange(-average_rating) %>%
top_n(10,wt = ratings_count) %>%
select(title, ratings_count, average_rating) %>%
datatable(class = "nowrap hover row-border", escape = FALSE, options = list(dom = 't',scrollX = TRUE, autoWidth = TRUE))
ratings %>%
group_by(user_id) %>%
mutate(ind = row_number())
## # A tibble: 977,269 x 5
## # Groups: user_id [53,380]
## book_id user_id rating N ind
## <int> <int> <int> <int> <int>
## 1 1 314 5 1 1
## 2 1 439 3 1 1
## 3 1 588 5 1 1
## 4 1 1169 4 1 1
## 5 1 1185 4 1 1
## 6 1 2077 4 1 1
## 7 1 2487 4 1 1
## 8 1 2900 5 1 1
## 9 1 3662 4 1 1
## 10 1 3922 5 1 1
## # … with 977,259 more rows
bookrate<-merge(ratings,books,by="book_id")
head(bookrate)
## book_id user_id rating N id best_book_id work_id books_count isbn
## 1: 1 314 5 1 27 1 41335427 275 439785960
## 2: 1 439 3 1 27 1 41335427 275 439785960
## 3: 1 588 5 1 27 1 41335427 275 439785960
## 4: 1 1169 4 1 27 1 41335427 275 439785960
## 5: 1 1185 4 1 27 1 41335427 275 439785960
## 6: 1 2077 4 1 27 1 41335427 275 439785960
## isbn13 authors original_publication_year
## 1: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## 2: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## 3: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## 4: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## 5: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## 6: 9.78044e+12 J.K. Rowling, Mary GrandPré 2005
## original_title
## 1: Harry Potter and the Half-Blood Prince
## 2: Harry Potter and the Half-Blood Prince
## 3: Harry Potter and the Half-Blood Prince
## 4: Harry Potter and the Half-Blood Prince
## 5: Harry Potter and the Half-Blood Prince
## 6: Harry Potter and the Half-Blood Prince
## title language_code
## 1: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## 2: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## 3: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## 4: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## 5: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## 6: Harry Potter and the Half-Blood Prince (Harry Potter, #6) eng
## average_rating ratings_count work_ratings_count work_text_reviews_count
## 1: 4.54 1678823 1785676 27520
## 2: 4.54 1678823 1785676 27520
## 3: 4.54 1678823 1785676 27520
## 4: 4.54 1678823 1785676 27520
## 5: 4.54 1678823 1785676 27520
## 6: 4.54 1678823 1785676 27520
## ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1: 7308 21516 136333 459028 1161491
## 2: 7308 21516 136333 459028 1161491
## 3: 7308 21516 136333 459028 1161491
## 4: 7308 21516 136333 459028 1161491
## 5: 7308 21516 136333 459028 1161491
## 6: 7308 21516 136333 459028 1161491
## image_url
## 1: https://images.gr-assets.com/books/1361039191m/1.jpg
## 2: https://images.gr-assets.com/books/1361039191m/1.jpg
## 3: https://images.gr-assets.com/books/1361039191m/1.jpg
## 4: https://images.gr-assets.com/books/1361039191m/1.jpg
## 5: https://images.gr-assets.com/books/1361039191m/1.jpg
## 6: https://images.gr-assets.com/books/1361039191m/1.jpg
## small_image_url
## 1: https://images.gr-assets.com/books/1361039191s/1.jpg
## 2: https://images.gr-assets.com/books/1361039191s/1.jpg
## 3: https://images.gr-assets.com/books/1361039191s/1.jpg
## 4: https://images.gr-assets.com/books/1361039191s/1.jpg
## 5: https://images.gr-assets.com/books/1361039191s/1.jpg
## 6: https://images.gr-assets.com/books/1361039191s/1.jpg
bookrate1<-bookrate[,c(2,13,3)]
bookrate1$user_id<-as.character(bookrate1$user_id)
bookrate1$rating<-as.numeric(bookrate1$rating)
head(bookrate1)
## user_id original_title rating
## 1: 314 Harry Potter and the Half-Blood Prince 5
## 2: 439 Harry Potter and the Half-Blood Prince 3
## 3: 588 Harry Potter and the Half-Blood Prince 5
## 4: 1169 Harry Potter and the Half-Blood Prince 4
## 5: 1185 Harry Potter and the Half-Blood Prince 4
## 6: 2077 Harry Potter and the Half-Blood Prince 4
hist(bookrate1$rating)
# convert dataframe into "realRatingMatrix"
bookrate1$user_id<-as.factor(bookrate1$user_id)
bookrate1$rating<-as.numeric(bookrate1$rating)
head(bookrate1)
## user_id original_title rating
## 1: 314 Harry Potter and the Half-Blood Prince 5
## 2: 439 Harry Potter and the Half-Blood Prince 3
## 3: 588 Harry Potter and the Half-Blood Prince 5
## 4: 1169 Harry Potter and the Half-Blood Prince 4
## 5: 1185 Harry Potter and the Half-Blood Prince 4
## 6: 2077 Harry Potter and the Half-Blood Prince 4
bookrate2<-as(data.frame(bookrate1), 'realRatingMatrix')
head(bookrate2)
## 1 x 794 rating matrix of class 'realRatingMatrix' with 3 ratings.
model_SVD <- Recommender(bookrate2, method = "SVD", param = list(method = "pearson", nn = 4))
## Warning: Unknown parameters: method, nn
## Available parameter (with default values):
## k = 10
## maxiter = 100
## normalize = center
## verbose = FALSE
recommend 4 books for user 1004
pre<-predict(model_SVD,bookrate2[1004],n=4)
as(pre, "list")
## $`11564`
## [1] "The Power of One"
## [2] "Slouching Towards Bethlehem: Essays"
## [3] "Harry Potter and the Philosopher's Stone"
## [4] "Memoria de mis putas tristes"
model_UBCF <- Recommender(bookrate2, method = "UBCF", param = list(method = "pearson", nn = 4))
recommend 4 books for user 1004
pre1<-predict(model_UBCF,bookrate2[1004],n=4)
as(pre1, "list")
## $`11564`
## [1] "A Midsummer Night's Dream"
## [2] "Getting Things Done: How To Achieve Stress-free Productivity"
## [3] "Moneyball: The Art of Winning an Unfair Game"
## [4] "The Time Machine"
Accuracy comparision by using error matrix between SVD and UBCF
dimension_names <- list(user_id = sort(unique(ratings$user_id)), book_id = sort(unique(ratings$book_id)))
ratingmat <- spread(select(ratings, book_id, user_id, rating), book_id, rating) %>% select(-user_id)
ratingmat <- as.matrix(ratingmat)
dimnames(ratingmat) <- dimension_names
ratingmat[1:5, 1:5]
## book_id
## user_id 1 2 3 4 5
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
Restructuring rating data from the dataset
ratingmat0 <- ratingmat
# Removing NAs
ratingmat0[is.na(ratingmat0)] <- 0
sparse_ratings <- as(ratingmat0, "sparseMatrix")
rm(ratingmat0)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2815655 150.4 7122362 380.4 NA 7122362 380.4
## Vcells 314900731 2402.6 1326853016 10123.1 16384 1382049457 10544.3
Creating realRatingMatrix
real_ratings <- new("realRatingMatrix", data = sparse_ratings)
real_ratings
## 53380 x 10000 rating matrix of class 'realRatingMatrix' with 977269 ratings.
# evaluation scheme
scheme <- evaluationScheme(real_ratings[1:500,], method = "cross-validation", k = 10, given = -1, goodRating = 5)
## Warning in .local(data, ...): The following users do not have enough items
## leaving no given items: 41, 108, 189, 302, 324, 329
SVD train
svd_train <- Recommender(getData(scheme, "train"), "SVD")
svd_train
## Recommender of type 'SVD' for 'realRatingMatrix'
## learned using 450 users.
SVD prediction
svd_prd <- predict(svd_train, getData(scheme, "known"), type= "ratings")
svd_prd
## 50 x 10000 rating matrix of class 'realRatingMatrix' with 479281 ratings.
UBCF train
UBCF_train <- Recommender(getData(scheme, "train"), "UBCF")
UBCF_train
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 450 users.
UBCF prediction
UBCF_prd <- predict(UBCF_train, getData(scheme, "known"), type= "ratings")
UBCF_prd
## 50 x 10000 rating matrix of class 'realRatingMatrix' with 335564 ratings.
error <- rbind(
SVD = calcPredictionAccuracy(svd_prd, getData(scheme, "unknown")),
UBCF = calcPredictionAccuracy(UBCF_prd, getData(scheme, "unknown"))
)
error
## RMSE MSE MAE
## SVD 1.1139352 1.2408517 0.9052800
## UBCF 0.9371241 0.8782016 0.7560911
According to RMSE of each method, UBCF is more accurate recommendation model than SVD.