READ DATA

library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
# No setwd()/getwd() here: the analysis reads nothing from disk (MovieLense
# ships with recommenderlab), and a hard-coded personal path makes the script
# stop with an error on any other machine.

# Load the MovieLense ratings bundled with recommenderlab and check their size.
data(MovieLense)
# Plain matrix view of the ratings (users in rows, movies in columns).
movMatrix <- as(MovieLense, "matrix")
dim(movMatrix)
## [1]  943 1664

Visualizing Data

## User-user collaborative filtering
# Cosine similarity among the first 20 users, drawn as a heat map.
similarity_users <- similarity(
  MovieLense[1:20, ],
  method = "cosine",
  which = "users"
)
image(as.matrix(similarity_users), main = "User similarity")

## Similarity matrix for item-item collaborative filtering
# Cosine similarity among the first 20 items (columns), mirroring the
# 20-user example. Subsetting rows here would instead compare every item
# using only 20 users' ratings.
similarity_items <- similarity(
  MovieLense[, 1:20],
  method = "cosine",
  which = "items"
)
# Plot the item similarities (similarity_items), not similarity_users.
image(as.matrix(similarity_items), main = "item similarity")

# Fix the RNG seed so the train/test split is reproducible.
set.seed(4567)
# 90/10 train/test split. given = -5 withholds 5 ratings per test user as
# "unknown" and keeps the rest as "known"; goodRating = 4 is the threshold
# used when judging top-N recommendations.
evaldata <- evaluationScheme(
  MovieLense,
  method = "split",
  train = 0.9,
  given = -5,
  goodRating = 4
)

UBCF, Z-Score Normalization

# User-Based Collaborative Filtering (UBCF) model fitted on the training
# split: Z-score normalization, cosine similarity, nn = 35 neighbours.
UBCF_model <- Recommender(
  getData(evaldata, "train"),
  method = "UBCF",
  param = list(normalize = "Z-score", method = "Cosine", nn = 35)
)

# Top-4 movie recommendations for user 16.
UserSelectivePred <- predict(
  UBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
class(UserSelectivePred)
## [1] "topNList"
## attr(,"package")
## [1] "recommenderlab"
UserSelectivePred <- as(UserSelectivePred, "list")

class(UserSelectivePred)
## [1] "list"
# Only one user was passed to predict(), so the list has a single element:
# index it with [[1]], not [[16]].
UserSelectivePred[[1]]
## [1] "Titanic (1997)"               "Devil's Advocate, The (1997)"
## [3] "Good Will Hunting (1997)"     "Full Monty, The (1997)"
# Predicted ratings for the first six users (first three movies shown).
prediction <- predict(UBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction, "matrix")[, 1:3]
##   Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1               NA               NA                NA
## 2               NA         3.674310          3.707041
## 3         2.876188         2.718684          2.764706
## 4         4.435588         4.304348          4.304348
## 5               NA               NA          2.874286
## 6               NA         3.639423          3.639423
# Predict ratings for the test users from their "known" ratings, then score
# the predictions against the withheld "unknown" ratings.
recom <- predict(
  UBCF_model,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom, getData(evaldata, "unknown"))
##      RMSE       MSE       MAE 
## 1.0350109 1.0712476 0.8280937

UBCF, Center Normalization

# Same UBCF setup as before, but with "center" normalization instead of
# Z-score.
UBCF_model2 <- Recommender(
  getData(evaldata, "train"),
  method = "UBCF",
  param = list(normalize = "center", method = "Cosine", nn = 35)
)

# Top-4 movie recommendations for user 16.
UserSelectivePred2 <- predict(
  UBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)

UserSelectivePred2 <- as(UserSelectivePred2, "list")
# Single-user prediction, so the recommendations sit in element [[1]].
UserSelectivePred2[[1]]
## [1] "Titanic (1997)"               "Good Will Hunting (1997)"    
## [3] "Full Monty, The (1997)"       "Devil's Advocate, The (1997)"
# Predicted ratings for the first six users (first three movies shown).
prediction2 <- predict(UBCF_model2, MovieLense[1:6, ], type = "ratings")
as(prediction2, "matrix")[, 1:3]
##   Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1               NA               NA                NA
## 2               NA         3.678730          3.706735
## 3         2.799391         2.729695          2.764706
## 4         4.316401         4.304348          4.304348
## 5               NA               NA          2.874286
## 6               NA         3.639423          3.639423
# Test-set rating predictions and their accuracy against the withheld ratings.
recom2 <- predict(
  UBCF_model2,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom2, getData(evaldata, "unknown"))
##      RMSE       MSE       MAE 
## 1.0364472 1.0742228 0.8281376

IBCF, Z-Score Normalization

# Item-Based Collaborative Filtering (IBCF) model fitted on the training
# split: Z-score normalization, cosine similarity, k = 35.
IBCF_model <- Recommender(
  getData(evaldata, "train"),
  method = "IBCF",
  param = list(normalize = "Z-score", method = "Cosine", k = 35)
)

# Top-4 movie recommendations for user 16.
UserSelectivePred.IBCF <- predict(
  IBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)

UserSelectivePred.IBCF <- as(UserSelectivePred.IBCF, "list")
# Single-user prediction, so the recommendations sit in element [[1]].
UserSelectivePred.IBCF[[1]]
## [1] "White Balloon, The (1995)"                          
## [2] "Nadja (1994)"                                       
## [3] "Pillow Book, The (1995)"                            
## [4] "When the Cats Away (Chacun cherche son chat) (1996)"
# Predicted ratings for the first six users (first four movies shown).
prediction_IBCF <- predict(IBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction_IBCF, "matrix")[, 1:4]
##   Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 1               NA               NA                NA                NA
## 2               NA               NA                NA                NA
## 3               NA               NA                 4                NA
## 4               NA               NA                NA                NA
## 5               NA               NA                 4                NA
## 6               NA               NA                NA                NA
# Test-set rating predictions and their accuracy against the withheld ratings.
recom.IBCF <- predict(
  IBCF_model,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)

calcPredictionAccuracy(recom.IBCF, getData(evaldata, "unknown"))
##      RMSE       MSE       MAE 
## 1.2973360 1.6830808 0.9469697
# Same user-16 top-4 recommendation as earlier in this section, run again.
UserSelectivePred.IBCF <- predict(
  IBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF <- as(UserSelectivePred.IBCF, "list")
# Single-user prediction, so the recommendations sit in element [[1]].
UserSelectivePred.IBCF[[1]]
## [1] "White Balloon, The (1995)"                          
## [2] "Nadja (1994)"                                       
## [3] "Pillow Book, The (1995)"                            
## [4] "When the Cats Away (Chacun cherche son chat) (1996)"
# Predicted ratings for the first six users (first three movies shown).
prediction_IBCF3 <- predict(IBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction_IBCF3, "matrix")[, 1:3]
##   Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1               NA               NA                NA
## 2               NA               NA                NA
## 3               NA               NA                 4
## 4               NA               NA                NA
## 5               NA               NA                 4
## 6               NA               NA                NA

IBCF, Center Normalization

# Same IBCF setup as the Z-score model, but with "center" normalization
# (spelled in lower case, as in the UBCF center model).
IBCF_model2 <- Recommender(
  getData(evaldata, "train"),
  method = "IBCF",
  param = list(normalize = "center", method = "Cosine", k = 35)
)

# Top-4 movie recommendations for user 16.
UserSelectivePred.IBCF2 <- predict(
  IBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF2 <- as(UserSelectivePred.IBCF2, "list")

# Single-user prediction, so the recommendations sit in element [[1]].
UserSelectivePred.IBCF2[[1]]
## [1] "Brother Minister: The Assassination of Malcolm X (1994)"
## [2] "Flipper (1996)"                                         
## [3] "Maya Lin: A Strong Clear Vision (1994)"                 
## [4] "Pillow Book, The (1995)"
# Predicted ratings for the first six users (first three movies shown).
# Stored under its own name so the UBCF center predictions kept in
# `prediction2` are not overwritten.
prediction_IBCF2 <- predict(IBCF_model2, MovieLense[1:6, ], type = "ratings")
as(prediction_IBCF2, "matrix")[, 1:3]
##   Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1               NA               NA                NA
## 2               NA               NA                NA
## 3               NA               NA                NA
## 4               NA               NA                NA
## 5               NA               NA                NA
## 6               NA                4                NA
# Test-set rating predictions and their accuracy against the withheld ratings.
recom.IBCF2 <- predict(
  IBCF_model2,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)

calcPredictionAccuracy(recom.IBCF2, getData(evaldata, "unknown"))
##     RMSE      MSE      MAE 
## 1.422607 2.023810 1.000000
# Same user-16 top-4 recommendation as earlier in this section, run again.
UserSelectivePred.IBCF2 <- predict(
  IBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)

UserSelectivePred.IBCF2 <- as(UserSelectivePred.IBCF2, "list")
# Single-user prediction, so the recommendations sit in element [[1]].
UserSelectivePred.IBCF2[[1]]
## [1] "Brother Minister: The Assassination of Malcolm X (1994)"
## [2] "Flipper (1996)"                                         
## [3] "Maya Lin: A Strong Clear Vision (1994)"                 
## [4] "Pillow Book, The (1995)"

From the four model runs above (UBCF Z-score, UBCF center, IBCF Z-score, IBCF center), the RMSE values are 1.035, 1.036, 1.297, and 1.423, respectively. This indicates that UBCF performs better than IBCF (UBCF has a smaller RMSE than IBCF). Within each algorithm, Z-score normalization gives a lower RMSE than center normalization: the difference is negligible for UBCF (1.035 vs. 1.036) and larger for IBCF (1.297 vs. 1.423).

ROC curve, Prec/Rec Curve

# Candidate recommenders to compare: IBCF and UBCF with cosine similarity,
# plus a RANDOM recommender as a baseline.
models_to_evaluate <- list(
  IBCF_COS = list(name = "IBCF", param = list(method = "cosine")),
  UBCF_COS = list(name = "UBCF", param = list(method = "cosine")),
  random = list(name = "RANDOM", param = NULL)
)

# Top-N list lengths at which each model is evaluated.
n_recommendations <- c(1, 5, 10, 20, 25)
results <- evaluate(
  x = evaldata,
  method = models_to_evaluate,
  n = n_recommendations
)
## IBCF run fold/sample [model time/prediction time]
##   1  [27.83sec/0.05sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.02sec/0.62sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.11sec]
# ROC curves for all models, annotating the first one.
plot(results, y = "ROC", annotate = 1, legend = "topleft")
title("ROC Curve")

# Precision/recall curves for the same results.
plot(results, y = "prec/rec", annotate = 1)
title("Precision-Recall")

From the curves above, we see that the UBCF model performs better than the IBCF model when cosine similarity is used as the similarity measure. UBCF also runs much faster than IBCF overall, so it is the better choice of the two. Since UBCF is better, we next compare cosine similarity with Pearson's correlation within UBCF.

# UBCF with cosine similarity versus UBCF with Pearson correlation, plus a
# RANDOM recommender as a baseline.
models_to_evaluate2 <- list(
  UBCF_COS = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_pear = list(name = "UBCF", param = list(method = "pearson")),
  random = list(name = "RANDOM", param = NULL)
)

# Top-N list lengths for this comparison. evaluate() is given this vector,
# so the chunk does not depend on n_recommendations from the earlier one.
n_recommendations2 <- c(1, 5, 10, 20, 25)
results2 <- evaluate(
  x = evaldata,
  method = models_to_evaluate2,
  n = n_recommendations2
)
## UBCF run fold/sample [model time/prediction time]
##   1  [0.02sec/0.59sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.02sec/0.8sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.1sec]
# ROC curves for all models, annotating the first one.
plot(results2, y = "ROC", annotate = 1, legend = "topleft")
title("ROC Curve")

# Precision/recall curves for the same results.
plot(results2, y = "prec/rec", annotate = 1)
title("Precision-Recall")

We can see that, for the UBCF model, Pearson's correlation outperforms cosine similarity, with a larger area under the ROC curve. Their computation times are very similar.