if (!require("knitr")) install.packages("knitr")
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("kableExtra")) install.packages("kableExtra")
if (!require("dplyr")) install.packages("dplyr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("recommenderlab")) install.packages("recommenderlab")
if (!require("tictoc")) install.packages("tictoc")
The dataset is taken from the website below.
● Amazon Reviews data (http://jmcauley.ucsd.edu/data/amazon/). The repository hosts several datasets; for this case study we use the Electronics ratings data.
● The dataset contains product ratings for Electronics products sold on Amazon.com.
Attribute Information:
● userId : Every user is identified with a unique id (first column)
● productId : Every product is identified with a unique id (second column)
● Rating : Rating of the corresponding product by the corresponding user (third column)
● timestamp : Time of the rating (fourth column)
● The original set contains 6,197,180 observations and 4 variables - User ID, Product ID, Rating (from 1 to 5), and Time Stamp. It covers 4,201,696 users and 476,001 products. To make the set more manageable, it has been reduced to a smaller subset.
● The final ratings dataset used is a 3573 x 19491 rating matrix of class 'realRatingMatrix' with 118551 ratings.
ratings_data <- read.csv('C:/Users/patel/Documents/Data612/Project4/ratings_Electronics.csv')
dim(ratings_data)
## [1] 6197180       4
| userId | productId | Rating | timestamp |
|---|---|---|---|
| AKM1MP6P0OYPR | 0132793040 | 5 | 1365811200 |
| A2CX7LUOHB2NDG | 0321732944 | 5 | 1341100800 |
| A2NWSAGRHCP8N5 | 0439886341 | 1 | 1367193600 |
| A2WNBOD3WNDNKT | 0439886341 | 3 | 1374451200 |
| A1GI0U4ZRJA8WN | 0439886341 | 1 | 1334707200 |
| A1QGNMC6O1VW39 | 0511189877 | 5 | 1397433600 |
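The preview table above can be produced with head() and kable; a minimal sketch, assuming the four columns were already named when the file was read:
head(ratings_data) %>%
  knitr::kable(format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))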
I used the realRatingMatrix class from 'recommenderlab' to transform the data.
ratings_data$userId <- as.factor(ratings_data$userId)
UI <- as(ratings_data, "realRatingMatrix")
dim(UI@data)
## [1] 3492876  343145
The full user-item matrix is 3492876 x 343145, far too large and sparse to work with directly. After an initial reduction (code not shown), the matrix looks like this:
## 8086 x 25197 rating matrix of class 'realRatingMatrix' with 189308 ratings.
amazonShort <- UI[ , colCounts(UI) > 40]
amazonShort <- amazonShort[rowCounts(amazonShort) > 20, ]
amazonShort
## 3573 x 25197 rating matrix of class 'realRatingMatrix' with 118551 ratings.
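The two frequency tables below look like table() summaries of the per-user and per-item rating counts; a minimal sketch of how they can be produced:
# Number of ratings per user (first table below)
table(rowCounts(amazonShort))
# Number of ratings per item (second table below); items can drop to zero
# ratings once the sparse users are removed
table(colCounts(amazonShort))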
Distribution of the number of ratings per user (every remaining user has at least 21 ratings):
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
## 401 325 282 232 224 203 172 163 143 106 105 95 86 76 47 70 46 60
## 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
## 41 49 44 39 25 24 31 26 28 24 16 24 26 17 18 20 14 11
## 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 11 9 8 15 10 6 6 10 13 6 7 12 2 5 8 5 5 4
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 91 92 93
## 4 1 3 4 7 6 3 3 4 4 3 2 1 1 1 2 1 2
## 94 95 96 97 98 100 101 103 104 105 106 107 109 110 112 113 114 117
## 4 2 2 1 1 2 2 1 2 1 2 2 1 1 1 1 2 2
## 118 119 121 122 123 125 126 127 128 129 130 131 133 138 139 140 158 159
## 1 3 1 4 4 1 1 1 1 2 1 2 1 1 1 2 1 1
## 171 181 219 244 256 263 284
## 1 1 1 1 1 1 1
Distribution of the number of ratings per item; note that 5706 items now have zero ratings because their raters were filtered out:
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 5706 5088 3568 2432 1712 1164 930 677 530 385 341 308 248 222 183
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 142 149 127 119 91 94 87 80 54 39 41 47 35 47 34
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 37 21 31 15 28 17 19 20 11 15 14 13 15 5 7
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 10 6 9 4 12 8 11 9 8 9 7 6 6 1 3
## 60 61 62 63 64 65 66 67 68 69 71 72 73 74 75
## 5 7 4 5 2 4 5 2 6 3 3 2 2 2 1
## 76 78 79 80 81 82 83 85 87 88 89 91 92 95 96
## 3 3 3 1 5 4 1 2 2 3 1 2 2 1 1
## 98 100 102 105 107 111 113 116 118 120 121 123 129 130 132
## 2 1 1 2 1 1 4 3 1 2 1 1 1 1 1
## 134 137 142 143 145 146 149 151 152 155 156 163 165 168 169
## 2 1 2 1 1 1 1 1 1 3 1 1 1 1 2
## 176 187 195 212 214 229 239 273 342 352
## 1 1 1 1 1 1 1 1 1 1
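Dropping the 5706 items that lost all their ratings leaves 25197 - 5706 = 19491 columns, which matches the final matrix printed below; a minimal sketch:
amazonShort <- amazonShort[, colCounts(amazonShort) > 0]
amazonShort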
## 3573 x 19491 rating matrix of class 'realRatingMatrix' with 118551 ratings.
Import the ratings_final dataset; the column classes and the matrix dimensions printed below confirm it matches the reduced set. The import chunk itself is not shown, so a minimal sketch follows (the file name and the object name 'amazon', which the evaluation code relies on, are assumptions):
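ratings_final <- read.csv('ratings_final.csv', stringsAsFactors = TRUE)  # file name assumed
class(ratings_final$userId)
class(ratings_final$productId)
class(ratings_final$Rating)
# Convert the user/item/rating triples back into a realRatingMatrix
amazon <- as(ratings_final, "realRatingMatrix")
dim(amazon)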
## [1] "factor"
## [1] "factor"
## [1] "integer"
## [1] 3573 19491
Split the dataset into training set (80%) and testing set (20%).
# Train/test split: 80% of the users go to training; for each test user,
# 5 ratings are given to the recommender and the rest are held out;
# a rating of 3 or higher counts as a "good" rating
set.seed(88)
eval <- evaluationScheme(amazon, method = "split", train = 0.8, given = 5, goodRating = 3)
# Training data
train <- getData(eval, "train")
# Known ratings of the test users (fed to the models)
known <- getData(eval, "known")
# Held-out ratings of the test users (used to measure accuracy)
unknown <- getData(eval, "unknown")
# Set up a data frame to record training and prediction times
timing <- data.frame(Model = factor(), Training = double(), Predicting = double())
Now, let's build three different models: user-based collaborative filtering (UBCF), a random baseline (RANDOM), and SVD.
model_method <- "UBCF"
# Training
tic()
modelUBCF <- Recommender(train, method = model_method)
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
# Predicting
tic()
predUBCF <- predict(modelUBCF, newdata = known, type = "ratings")
t <- toc(quiet = TRUE)
predict_time <- round(t$toc - t$tic, 2)
timing <- rbind(timing, data.frame(Model = as.factor(model_method),
Training = as.double(train_time),
Predicting = as.double(predict_time)))
# Accuracy
accUBCF <- calcPredictionAccuracy(predUBCF, unknown)
model_method <- "RANDOM"
# Training
tic()
modelRandom <- Recommender(train, method = model_method)
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
# Predicting
tic()
predRandom <- predict(modelRandom, newdata = known, type = "ratings")
t <- toc(quiet = TRUE)
predict_time <- round(t$toc - t$tic, 2)
timing <- rbind(timing, data.frame(Model = as.factor(model_method),
Training = as.double(train_time),
Predicting = as.double(predict_time)))
# Accuracy
accRandom <- calcPredictionAccuracy(predRandom, unknown)
model_method <- "SVD"
# Training
tic()
modelSVD <- Recommender(train, method = model_method, parameter = list(k = 50))
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
# Predicting
tic()
predSVD <- predict(modelSVD, newdata = known, type = "ratings")
t <- toc(quiet = TRUE)
predict_time <- round(t$toc - t$tic, 2)
timing <- rbind(timing, data.frame(Model = as.factor(model_method),
Training = as.double(train_time),
Predicting = as.double(predict_time)))
# Accuracy
accSVD <- calcPredictionAccuracy(predSVD, unknown)
Now that we have built all three models, we can compare their accuracy.
accuracy <- rbind(accUBCF, accRandom)
accuracy <- rbind(accuracy, accSVD)
rownames(accuracy) <- c("UBCF", "Random", "SVD")
knitr::kable(accuracy, format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| | RMSE | MSE | MAE |
|---|---|---|---|
| UBCF | 1.097196 | 1.203839 | 0.7901769 |
| Random | 1.353277 | 1.831358 | 0.9499848 |
| SVD | 1.092688 | 1.193967 | 0.7764771 |
Reviewing the accuracy scores above for the UBCF, Random, and SVD models, we see that the Random model has the lowest accuracy, whereas the UBCF and SVD figures are quite close to each other. It is not surprising that random recommendations are not as accurate as recommendations based on prior ratings.
Now we can review the ROC curve and the Precision-Recall plot for all three models.
models <- list(
"UBCF" = list(name = "UBCF", param = NULL),
"Random" = list(name = "RANDOM", param = NULL),
"SVD" = list(name = "SVD", param = list(k = 50))
)
evalResults <- evaluate(x = eval, method = models, n = c(1, 5, 10, 30, 60))
## UBCF run fold/sample [model time/prediction time]
## 1 [0.01sec/356.58sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/14.35sec]
## SVD run fold/sample [model time/prediction time]
## 1 [34.79sec/18.36sec]
# Precision-Recall plot
plot(evalResults, "prec/rec",
     annotate = TRUE, legend = "topright", main = "Precision-Recall")
UBCF performs better than SVD and considerably better than the Random model.
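The ROC curve mentioned above can be drawn with the same plot() call; for evaluation results, plot() defaults to the TPR/FPR (ROC) view:
# ROC curve for the same evaluation results
plot(evalResults, annotate = TRUE, legend = "topleft", main = "ROC Curve")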
• Now, let us look at the training and prediction times.
• From the table below we can see that the UBCF model can be created very quickly, but predicting results takes considerable time. The Random model is efficient all around. The SVD model takes longer to build than to predict, but altogether it is still much quicker than the UBCF model.
rownames(timing) <- timing$Model
knitr::kable(timing[, 2:3], format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| | Training | Predicting |
|---|---|---|
| UBCF | 0.03 | 348.48 |
| RANDOM | 0.00 | 8.18 |
| SVD | 38.23 | 11.81 |
• Since the UBCF and SVD models' accuracy scores were similar and both performed better than the Random model, a natural next step is a hybrid model; here we combine the UBCF model with the Random model.
• It may not always be desirable to recommend only the products that are likely to be most highly rated by a user. Recommending somewhat unexpected products may improve user experience, expand user preferences, and provide additional knowledge about a user.
• To make sure that most recommendations are still likely to be highly rated, we allow only a very minor influence of the Random model (0.99 vs. 0.01 weight between the UBCF and Random models).
model_Hybrid <- HybridRecommender(
modelUBCF,
modelRandom,
weights = c(0.99, 0.01))
pred_Hybrid <- predict(model_Hybrid, newdata = known, type = "ratings")
( accHybrid <- calcPredictionAccuracy(pred_Hybrid, unknown) )
##      RMSE       MSE       MAE
## 1.4057040 1.9760037 0.9335008
The accuracy has gone down. It is not as bad as with the purely random model, but clearly not as good as the UBCF or SVD models. However, the goal here is to influence user experience rather than to build the most accurate model, so we would need to employ different metrics to judge it properly.
Let us look at the top 10 recommendations for the first user in the test set.
pUBCF <- predict(modelUBCF, newdata = known[1], type = "topNList")
pHybrid <- predict(model_Hybrid, newdata = known[1], type = "topNList")
UBCF top 10:
## $A107CTGSINY3GJ
## [1] 3967 12353 12743 13029 15549 10903 13212 14521 1 2
Hybrid top 10:
## $A107CTGSINY3GJ
## [1] 3967 13029 12353 15549 12743 10903 13212 14521 1 2
As we can see, the Hybrid model recommends the same items as the UBCF model, but the order has changed slightly; with a larger weight on the Random component, genuinely new items would start to appear.
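For reference, the index lists shown above are obtained with the standard recommenderlab coercion to a plain list, and a quick check (a sketch) confirms that both top-10 lists contain exactly the same items:
as(pUBCF, "list")
as(pHybrid, "list")
# TRUE for the run shown above: same items, different order
setequal(as(pUBCF, "list")[[1]], as(pHybrid, "list")[[1]])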
In this project, three different recommender system algorithms were compared and the accuracy of all three models was measured.
As an additional experiment, we could create a second list of suggested products that are compatible with the recommended items. For example, if the recommended electronics product is a television, the suggestion list could include a TV stand or a wall mount.
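A toy sketch of that idea (the accessory table and the helper function below are entirely hypothetical):
# Hypothetical lookup table mapping product types to compatible accessories
accessories <- data.frame(
  product = c("television", "laptop", "camera"),
  suggestion = c("TV stand or wall mount", "laptop bag", "tripod"),
  stringsAsFactors = FALSE
)
# Return the accessory suggestions for a recommended product type
suggest_compatible <- function(item) {
  accessories$suggestion[accessories$product == item]
}
suggest_compatible("television")
## [1] "TV stand or wall mount"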