the recommender system

This system recommends 9 books to readers based on a short readers’ survey.

library(Matrix)
library(scales)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
# Dataset upload 
rating <-
  read.csv(
  'book_ratings.csv',
  na.strings = 'Did not read',
  stringsAsFactors = FALSE
  )[, -1]
  names(rating) <-
  c('name',
  'book1',
  'book2',
  'book3',
  'book4',
  'book5',
  'book6',
  'book7',
  'book8',
  'book9')

Create a user-item matrix, and split train and test dataset

  (rownames(rating) <- rating[, 1])
##  [1] "David Stern"    "Andy"           "Walt"           "Dan Fanelli"   
##  [5] "James T"        "Robert Sellers" "Tulasi "        "Logan"         
##  [9] "Shyam BV"       "Yun"            "Kumudini"       "Jason Joseph"
  (rating <- as.matrix(rating[, -1]))
##                book1 book2 book3 book4 book5 book6 book7 book8 book9
## David Stern        4    NA    NA     5    NA    NA    NA     5     4
## Andy              NA    NA    NA    NA    NA    NA    NA    NA    NA
## Walt               4     5     4     4     5    NA     5    NA     5
## Dan Fanelli        4     5     3    NA    NA    NA    NA    NA     4
## James T           NA    NA     4    NA    NA     4    NA    NA     3
## Robert Sellers    NA    NA    NA    NA    NA    NA    NA    NA    NA
## Tulasi            NA    NA    NA     5     5    NA    NA    NA     5
## Logan             NA    NA     4     5     5    NA     4    NA     4
## Shyam BV          NA     4     5    NA     3    NA    NA    NA    NA
## Yun               NA     5     4     4     5    NA    NA    NA    NA
## Kumudini          NA     4     4     4     5    NA    NA    NA     4
## Jason Joseph      NA     1     1    NA    NA    NA    NA    NA     1
  set.seed(123)
  
  (i <- sample(nrow(rating), ncol(rating), replace=F))
## [1]  4  9  5  8 11  1 12 10  3
  (j <- seq(ncol(rating)))
## [1] 1 2 3 4 5 6 7 8 9
  (test <- sparseMatrix(i, j, x=rating[cbind(i,j)]))
## 12 x 9 sparse Matrix of class "dgCMatrix"
##                           
##  [1,] . . . . . NA  .  . .
##  [2,] . . . . .  .  .  . .
##  [3,] . . . . .  .  .  . 5
##  [4,] 4 . . . .  .  .  . .
##  [5,] . . 4 . .  .  .  . .
##  [6,] . . . . .  .  .  . .
##  [7,] . . . . .  .  .  . .
##  [8,] . . . 5 .  .  .  . .
##  [9,] . 4 . . .  .  .  . .
## [10,] . . . . .  .  . NA .
## [11,] . . . . 5  .  .  . .
## [12,] . . . . .  . NA  . .
  (vals <- which(test !=0, arr.ind = T))
##      row col
## [1,]   4   1
## [2,]   9   2
## [3,]   5   3
## [4,]   8   4
## [5,]  11   5
## [6,]   3   9
  (rating[vals] <- 0)
## [1] 0
  train <- rating

Using training data, calculate the raw average (mean) rating for every user-item combination, and Calculate the RMSE for raw average for both your training data and your test data.

  (row.avg <- mean(train, na.rm = TRUE))
## [1] 3.439024
  (train.rmse <- sqrt(mean((train - row.avg)^2, na.rm=TRUE)))
## [1] 1.753725
  (test.rmse <- sqrt(mean((as(test, 'matrix') - row.avg)^2, na.rm=TRUE)))
## [1] 3.351071

Using training data, calculate the bias for each user and each item.

  (train.user.bias <- rowMeans(train, na.rm = T) - row.avg)
##    David Stern           Andy           Walt    Dan Fanelli        James T 
##     1.06097561            NaN     0.41811847    -0.43902439    -1.10569106 
## Robert Sellers        Tulasi           Logan       Shyam BV            Yun 
##            NaN     1.56097561    -0.03902439    -0.77235772     1.06097561 
##       Kumudini   Jason Joseph 
##    -0.23902439    -2.43902439
  (train.item.bias <- colMeans(train, na.rm = T) - row.avg)
##      book1      book2      book3      book4      book5      book6 
## -0.7723577 -0.1056911 -0.3140244  0.2276423  0.3943089  0.5609756 
##      book7      book8      book9 
##  1.0609756  1.5609756 -0.3140244
  (test.user.bias <- rowMeans(test, na.rm = T) - row.avg)
##  [1] -3.439024 -3.439024 -2.883469 -2.994580 -2.994580 -3.439024 -3.439024
##  [8] -2.883469 -2.994580 -3.439024 -2.883469 -3.439024

Calculate the baseline predictors for every user-item combination

  base.pred <- matrix(0, 12, 9)
  for (i in 1:length(train.user.bias)) {
    for(j in 1:length(train.item.bias)) {
      base.pred[i, j] <- row.avg + train.user.bias[[i]] + train.item.bias[[j]] + base.pred[i, j]
    }
  }
  
  base.pred <- ifelse(base.pred < 1, 1, base.pred)
  base.pred <- ifelse(base.pred > 5, 5, base.pred)

Calculate the RMSE for the baseline predictors for both your training data and your test data.

  (train.rmse.pred <- sqrt(mean((train - base.pred)^2, na.rm=TRUE)))
## [1] 1.406058
  (test.rmse.pred <- sqrt(mean((as(test, 'matrix') - base.pred)^2, na.rm=TRUE)))
## [1] 3.637483

Summarize your results.

  (train.rmse.pred < train.rmse)
## [1] TRUE
paste0("train.rmse.pred: ", round(train.rmse.pred, 3), ", and train.rmse: ", round(train.rmse, 3), ". The baseline prediction reduces the training dataset RMSE by ", percent((train.rmse.pred - train.rmse)/train.rmse))
## [1] "train.rmse.pred: 1.406, and train.rmse: 1.754. The baseline prediction reduces the training dataset RMSE by -19.8%"
  (test.rmse.pred < test.rmse)
## [1] FALSE
paste0("test.rmse.pred: ", round(test.rmse.pred, 3), ", and test.rmse: ", round(test.rmse, 3), ". The baseline prediction increases the test dataset RMSE by ", percent(abs((test.rmse.pred - test.rmse)/train.rmse)) )
## [1] "test.rmse.pred: 3.637, and test.rmse: 3.351. The baseline prediction increases the test dataset RMSE by 16.3%"

Conclusion:

Comparing to row average prediction, applying the baseline prediction algorithm reduced the RMSE for train dataset, but increased it for test dataset. I suspect that the problem is due to sample selection of slitting the training and testing dataset; I would adjuste the data split, and rerun the prediction in the future.

Reference:

https://stackoverflow.com/questions/6522134/r-return-position-of-element-in-matrix

https://stackoverflow.com/questions/28439487/convert-sparse-matrix-dgcmatrix-to-realratingmatrix

https://stackoverflow.com/questions/7719830/r-getting-attribute-values-as-a-vector

https://github.com/wwells/CUNY_DATA_643/blob/master/Project1/WWells_P1.Rmd