library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
# NOTE(review): machine-specific absolute path; it only resolves on the
# original author's computer. Nothing visible in this chunk reads or writes a
# relative file path, so this call may be unnecessary -- confirm before removing.
setwd('C://Users//graci//Documents//01-612 Recommen System HH MyDoc SurfaceBook2//Proj 2 HH to Submit')
# Echo the working directory that is now in effect.
getwd()
## [1] "C:/Users/graci/Documents/01-612 Recommen System HH MyDoc SurfaceBook2/Proj 2 HH to Submit"
# Load the MovieLense ratings data set and check the dimensions of its
# dense-matrix form (rows x columns).
data("MovieLense")
movMatrix <- as(MovieLense, "matrix")
dim(movMatrix)
## [1] 943 1664
## User-user collaborative filtering
# Cosine similarity between the first 20 users, shown as a heat map.
similarity_users <- similarity(MovieLense[1:20, ], method = "cosine", which = "users")
image(as.matrix(similarity_users), main = "User similarity")
## Item-item collaborative filtering
# Cosine similarity between items, computed from the ratings of the same first
# 20 users. The heat map must show `similarity_items`; the original call
# re-plotted `similarity_users` under the "item similarity" title.
similarity_items <- similarity(MovieLense[1:20, ], method = "cosine", which = "item")
image(as.matrix(similarity_items), main = "item similarity")
# Fix the RNG seed so the random train/test split below is repeatable.
set.seed(4567)
## 90% of users go to training; given = -5 keeps 5 ratings per test user as
## "unknown" and treats the rest as "known".
evaldata <- evaluationScheme(
  MovieLense,
  method = "split",
  train = 0.9,
  given = -5,
  goodRating = 4
)
# Fit a User-Based Collaborative Filtering (UBCF) recommender on the training
# split: Z-score normalization, cosine similarity, 35 nearest neighbours.
UBCF_model <- Recommender(
  getData(evaldata, "train"),
  method = "UBCF",
  param = list(normalize = "Z-score", method = "Cosine", nn = 35)
)
## Top 4 movie recommendations for the user in row 16 of MovieLense
UserSelectivePred <- predict(
  UBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
class(UserSelectivePred)
## [1] "topNList"
## attr(,"package")
## [1] "recommenderlab"
UserSelectivePred <- as(UserSelectivePred, "list")
class(UserSelectivePred)
## [1] "list"
# Only one user was passed to predict(), so the converted list has a single
# element: index it with [[1]], not [[16]].
UserSelectivePred[[1]]
## [1] "Titanic (1997)" "Devil's Advocate, The (1997)"
## [3] "Good Will Hunting (1997)" "Full Monty, The (1997)"
# Predicted ratings for the first six users; show the first three movies.
prediction <- predict(UBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction, "matrix")[, 1:3]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1 NA NA NA
## 2 NA 3.674310 3.707041
## 3 2.876188 2.718684 2.764706
## 4 4.435588 4.304348 4.304348
## 5 NA NA 2.874286
## 6 NA 3.639423 3.639423
# Predict ratings from the "known" part of the test split and score them
# against the withheld "unknown" ratings.
recom <- predict(
  UBCF_model,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom, getData(evaldata, "unknown"))
## RMSE MSE MAE
## 1.0350109 1.0712476 0.8280937
## Same UBCF setup, but with "center" normalization instead of Z-score
UBCF_model2 <- Recommender(
  getData(evaldata, "train"),
  method = "UBCF",
  param = list(normalize = "center", method = "Cosine", nn = 35)
)
## Top 4 movie recommendations for the user in row 16 of MovieLense
UserSelectivePred2 <- predict(
  UBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred2 <- as(UserSelectivePred2, "list")
# Single-user prediction, so the list has exactly one element.
UserSelectivePred2[[1]]
## [1] "Titanic (1997)" "Good Will Hunting (1997)"
## [3] "Full Monty, The (1997)" "Devil's Advocate, The (1997)"
# Predicted ratings for the first six users; show the first three movies.
prediction2 <- predict(UBCF_model2, MovieLense[1:6, ], type = "ratings")
as(prediction2, "matrix")[, 1:3]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1 NA NA NA
## 2 NA 3.678730 3.706735
## 3 2.799391 2.729695 2.764706
## 4 4.316401 4.304348 4.304348
## 5 NA NA 2.874286
## 6 NA 3.639423 3.639423
# Score the test-split predictions against the withheld ratings.
recom2 <- predict(
  UBCF_model2,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom2, getData(evaldata, "unknown"))
## RMSE MSE MAE
## 1.0364472 1.0742228 0.8281376
## Item-Based Collaborative Filtering (IBCF) model:
## Z-score normalization, cosine similarity, k = 35
IBCF_model <- Recommender(
  getData(evaldata, "train"),
  method = "IBCF",
  param = list(normalize = "Z-score", method = "Cosine", k = 35)
)
# Top 4 movie recommendations for the user in row 16 of MovieLense
UserSelectivePred.IBCF <- predict(
  IBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF <- as(UserSelectivePred.IBCF, "list")
# Single-user prediction, so the list has exactly one element.
UserSelectivePred.IBCF[[1]]
## [1] "White Balloon, The (1995)"
## [2] "Nadja (1994)"
## [3] "Pillow Book, The (1995)"
## [4] "When the Cats Away (Chacun cherche son chat) (1996)"
# Predicted ratings for the first six users; show the first four movies.
prediction_IBCF <- predict(IBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction_IBCF, "matrix")[, 1:4]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA 4 NA
## 4 NA NA NA NA
## 5 NA NA 4 NA
## 6 NA NA NA NA
# Predict on the "known" part of the test split and compute RMSE/MSE/MAE
# against the withheld "unknown" ratings.
recom.IBCF <- predict(
  IBCF_model,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom.IBCF, getData(evaldata, "unknown"))
## RMSE MSE MAE
## 1.2973360 1.6830808 0.9469697
# Repeat of the top-4 recommendation for the user in row 16 with the same
# IBCF_model; the result matches the earlier call.
UserSelectivePred.IBCF <- predict(
  IBCF_model,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF <- as(UserSelectivePred.IBCF, "list")
# Single-user prediction, so the list has exactly one element.
UserSelectivePred.IBCF[[1]]
## [1] "White Balloon, The (1995)"
## [2] "Nadja (1994)"
## [3] "Pillow Book, The (1995)"
## [4] "When the Cats Away (Chacun cherche son chat) (1996)"
# Predicted ratings for the first six users again, this time showing only the
# first three movies.
prediction_IBCF3 <- predict(IBCF_model, MovieLense[1:6, ], type = "ratings")
as(prediction_IBCF3, "matrix")[, 1:3]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA 4
## 4 NA NA NA
## 5 NA NA 4
## 6 NA NA NA
## Same IBCF setup, but with center normalization instead of Z-score
IBCF_model2 <- Recommender(
  getData(evaldata, "train"),
  method = "IBCF",
  param = list(normalize = "Center", method = "Cosine", k = 35)
)
# Top 4 movie recommendations for the user in row 16 of MovieLense
UserSelectivePred.IBCF2 <- predict(
  IBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF2 <- as(UserSelectivePred.IBCF2, "list")
# Single-user prediction, so the list has exactly one element.
UserSelectivePred.IBCF2[[1]]
## [1] "Brother Minister: The Assassination of Malcolm X (1994)"
## [2] "Flipper (1996)"
## [3] "Maya Lin: A Strong Clear Vision (1994)"
## [4] "Pillow Book, The (1995)"
# Predicted ratings for the first six users; show the first three movies.
# NOTE(review): this reassigns `prediction2`, which earlier held the UBCF
# center-normalized predictions.
prediction2 <- predict(IBCF_model2, MovieLense[1:6, ], type = "ratings")
as(prediction2, "matrix")[, 1:3]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995)
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA 4 NA
# Predict on the "known" part of the test split and compute RMSE/MSE/MAE
# against the withheld "unknown" ratings.
recom.IBCF2 <- predict(
  IBCF_model2,
  getData(evaldata, "known"),
  n = 4,
  type = "ratings"
)
calcPredictionAccuracy(recom.IBCF2, getData(evaldata, "unknown"))
## RMSE MSE MAE
## 1.422607 2.023810 1.000000
# Repeat of the top-4 recommendation for the user in row 16 with the same
# IBCF_model2; the result matches the earlier call.
UserSelectivePred.IBCF2 <- predict(
  IBCF_model2,
  MovieLense[16, ],
  n = 4,
  type = "topNList"
)
UserSelectivePred.IBCF2 <- as(UserSelectivePred.IBCF2, "list")
# Single-user prediction, so the list has exactly one element.
UserSelectivePred.IBCF2[[1]]
## [1] "Brother Minister: The Assassination of Malcolm X (1994)"
## [2] "Flipper (1996)"
## [3] "Maya Lin: A Strong Clear Vision (1994)"
## [4] "Pillow Book, The (1995)"
From the four model runs above (UBCF Z-score, UBCF center, IBCF Z-score, IBCF center), the RMSE values are 1.035, 1.036, 1.297, and 1.423 respectively, which indicates that UBCF performs better than IBCF (UBCF has a smaller RMSE than IBCF). Z-score normalization gives a slightly lower RMSE than center normalization for both models, although for UBCF the difference is negligible.
# Compare IBCF and UBCF (both with cosine similarity) against a RANDOM
# baseline on the same evaluation scheme, at several top-N list sizes.
models_to_evaluate <- list(
  IBCF_COS = list(name = "IBCF", param = list(method = "cosine")),
  UBCF_COS = list(name = "UBCF", param = list(method = "cosine")),
  random = list(name = "RANDOM", param = NULL)
)
n_recommendations <- c(1, 5, 10, 20, 25)
results <- evaluate(
  x = evaldata,
  method = models_to_evaluate,
  n = n_recommendations
)
## IBCF run fold/sample [model time/prediction time]
## 1 [27.83sec/0.05sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.02sec/0.62sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.11sec]
# ROC curves for all three models.
plot(results, y = "ROC", annotate = 1, legend = "topleft")
title("ROC Curve")
# Precision/recall curves for all three models.
plot(results, y = "prec/rec", annotate = 1)
title("Precision-Recall")
From the above, we see that the UBCF model performs better than the IBCF model when cosine similarity is used as the similarity measure. UBCF also runs much faster than IBCF, so it is the better choice of the two. Since UBCF is better, we next compare cosine similarity against Pearson's correlation within UBCF.
# Compare UBCF with cosine similarity against UBCF with Pearson correlation,
# plus a RANDOM baseline, on the same evaluation scheme.
models_to_evaluate2 <- list(
  UBCF_COS = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_pear = list(name = "UBCF", param = list(method = "pearson")),
  random = list(name = "RANDOM", param = NULL)
)
n_recommendations2 <- c(1, 5, 10, 20, 25)
# Pass this section's own list sizes. The original call passed
# `n_recommendations` from the previous comparison, which left
# `n_recommendations2` unused and tied this block to the earlier one.
results2 <- evaluate(
  x = evaldata,
  method = models_to_evaluate2,
  n = n_recommendations2
)
## UBCF run fold/sample [model time/prediction time]
## 1 [0.02sec/0.59sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.02sec/0.8sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.1sec]
# ROC curves for the three models.
plot(results2, y = "ROC", annotate = 1, legend = "topleft")
title("ROC Curve")
# Precision/recall curves for the three models.
plot(results2, y = "prec/rec", annotate = 1)
title("Precision-Recall")
We can see that, for the UBCF model, Pearson's correlation outperforms cosine similarity, with a larger area under the ROC curve. Their computation times are very similar.