## Warning: package 'irlba' was built under R version 3.3.3
## Warning: package 'svd' was built under R version 3.3.3
## Warning: package 'recommenderlab' was built under R version 3.3.3
## Warning: package 'arules' was built under R version 3.3.3
## Warning: package 'proxy' was built under R version 3.3.3
#load data and take a look at it
data(MovieLense)
head(MovieLense)
## 1 x 1664 rating matrix of class 'realRatingMatrix' with 271 ratings.
ml <- as(MovieLense,"realRatingMatrix")
ml_n <- normalize(ml)
image(ml, main = "Raw Movielense Data")
image(ml_n, main = "Normalized Movielens Data")
image(MovieLense[1:100,1:100])
#Create the user-item matrix
ml_matrix <- as(MovieLense,"realRatingMatrix")
ml_matrix
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
head(ml_matrix)
## 1 x 1664 rating matrix of class 'realRatingMatrix' with 271 ratings.
#Normalize by subtracting the row mean from all ratings in the row
ml_n <- normalize(ml_matrix)
#SVD
ml_svd <- svd(ml_n@data)
ml_diag <- ml_svd$d
ml_U <- ml_svd$u
ml_V <- t(as.matrix(ml_svd$v))
ml_diag <- Diagonal(x = ml_diag[1:40])
ml_U <- ml_U[1:40, ]
ml_V <- ml_V[1:40, ]
ml_predict <- rowMeans(ml_n)[1:40] + (ml_U * sqrt(ml_diag)) * (sqrt(ml_diag)) * t(ml_V)
## Note: method with signature 'matrix#Matrix' chosen for function '*',
## target signature 'matrix#ddiMatrix'.
## "ANY#ddiMatrix" would also be valid
## Note: method with signature 'Matrix#matrix' chosen for function '*',
## target signature 'ddiMatrix#matrix'.
## "ddiMatrix#ANY" would also be valid
#Create prediction matrix
ml_prediction_matrix <- t(as.matrix(ml_predict))
#Build RMSE function
RMSE <- function(ml_predict, ml_V){
sqrt(mean((ml_predict - ml_V)^2, na.rm = T))
}
#Check number of rows since the last line is erroring out with a row count error - ran perfectly fine the first time, don't understand why it started erroring out with no change in the code
nrow(ml_U)
## [1] 40
nrow(ml_V)
## [1] 40
nrow(ml_predict)
## [1] 40
ncol(ml_U)
## [1] 943
ncol(ml_V)
## [1] 1664
ncol(ml_diag)
## [1] 40
#Calculate RMSE
#RMSE(ml_predict, ml_V)
The last line worked perfectly, giving me an RMSE of ~3.5, and then suddenly stopped working without any change in the code. I don’t understand why. I put the checks in above to check the number of rows to make sure they were equal, and they are. There are no solutions online when the error is searched online. I tried clearing my environment completely using rm(list=ls()) hoping that would clear it up, but nothing seems to be working.
#binarize
ml_b <- binarize(ml_matrix, minRating = 0.001)
#Create a random sample of 500 users
set.seed(1234)
ml.samp <- sample(MovieLense, 500)
ml.samp
## 500 x 1664 rating matrix of class 'realRatingMatrix' with 53136 ratings.
#show all of the ratings for one user
rowCounts(ml.samp[1,])
## 108
## 33
as(ml.samp[1,], "list")
## $`108`
## Toy Story (1995)
## 4
## Twelve Monkeys (1995)
## 5
## Richard III (1995)
## 5
## Mighty Aphrodite (1995)
## 3
## Postino, Il (1994)
## 5
## Muppet Treasure Island (1996)
## 3
## Star Wars (1977)
## 4
## Fargo (1996)
## 4
## Independence Day (ID4) (1996)
## 3
## Lone Star (1996)
## 4
## Phenomenon (1996)
## 3
## Godfather, The (1972)
## 4
## Big Night (1996)
## 5
## Return of the Jedi (1983)
## 3
## Star Trek: First Contact (1996)
## 2
## Jerry Maguire (1996)
## 3
## Lost World: Jurassic Park, The (1997)
## 3
## My Best Friend's Wedding (1997)
## 2
## Sense and Sensibility (1995)
## 5
## River Wild, The (1994)
## 4
## Time to Kill, A (1996)
## 3
## Tin Cup (1996)
## 3
## Fierce Creatures (1997)
## 4
## Liar Liar (1997)
## 4
## Fly Away Home (1996)
## 3
## Everyone Says I Love You (1996)
## 5
## Mission: Impossible (1996)
## 3
## Courage Under Fire (1996)
## 2
## Boot, Das (1981)
## 5
## In the Bleak Midwinter (1995)
## 4
## Jane Eyre (1996)
## 3
## Saint, The (1997)
## 3
## Island of Dr. Moreau, The (1996)
## 2
#show rating averages for the first 6 users in the randomized sample
#this data set is enormous so we'll just show the head
head(rowMeans(ml.samp))
## 108 587 574 586 809 601
## 3.575758 2.967742 3.500000 3.325301 3.450000 3.081081
#shows how many movies each user rated in the full data set
#this data set is enormous so we'll just show the head
head(rowCounts(MovieLense))
## 1 2 3 4 5 6
## 271 61 51 23 175 208
hist(rowCounts(MovieLense), main = "Number of Ratings per User")
#shows how many times each movie was rated in the full data set
#this data set is enormous so we'll just show the head
head(colCounts(MovieLense))
## Toy Story (1995)
## 452
## GoldenEye (1995)
## 131
## Four Rooms (1995)
## 90
## Get Shorty (1995)
## 209
## Copycat (1995)
## 86
## Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## 26
hist(colCounts(MovieLense), main = "Number of Ratings per Movie")
#shows the average rating per user in the full data set
#this data set is enormous so we'll just show the head
head(rowMeans(MovieLense))
## 1 2 3 4 5 6
## 3.605166 3.704918 2.764706 4.304348 2.874286 3.639423
#shows the average rating per movie in the full data set
#this data set is enormous so we'll just show the head
head(colMeans(MovieLense))
## Toy Story (1995)
## 3.878319
## GoldenEye (1995)
## 3.206107
## Four Rooms (1995)
## 3.033333
## Get Shorty (1995)
## 3.550239
## Copycat (1995)
## 3.302326
## Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## 3.576923
############################# POPULAR ##################################
#Build a popularity recommender using the first 1,000 users
r <- Recommender(MovieLense[1:800], method = "POPULAR")
r
## Recommender of type 'POPULAR' for 'realRatingMatrix'
## learned using 800 users.
getModel(r)$topN
## Recommendations as 'topNList' with n = 1664 for 1 users.
#Recommend 3 movies to 2 users
recom <- predict(r, MovieLense[809:810], n = 3)
as(recom, "list")
## $`809`
## [1] "Star Wars (1977)" "Fargo (1996)" "Godfather, The (1972)"
##
## $`810`
## [1] "Star Wars (1977)" "Fargo (1996)" "Godfather, The (1972)"
#gives us predicted ratings for these users
recom <- predict(r, MovieLense[809:810], type = "ratings")
as(recom, "matrix")[,1:10]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 809 3.741677 3.175188 3.100275 3.427367
## 810 4.731677 4.165188 4.090275 4.417367
## Copycat (1995) Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## 809 3.221336 3.6587
## 810 4.211336 4.6487
## Twelve Monkeys (1995) Babe (1995) Dead Man Walking (1995)
## 809 3.658617 3.834488 3.762438
## 810 4.648617 4.824488 4.752438
## Richard III (1995)
## 809 3.73027
## 810 4.72027
#if we want to include the original ratings for the users as well:
recom <- predict(r, MovieLense[809:810], type = "ratingMatrix")
as(recom, "matrix")[,1:10]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 809 3.741677 3.175188 3.100275 3.427367
## 810 4.731677 4.165188 4.090275 4.417367
## Copycat (1995) Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## 809 3.221336 3.6587
## 810 4.211336 4.6487
## Twelve Monkeys (1995) Babe (1995) Dead Man Walking (1995)
## 809 3.658617 3.834488 3.762438
## 810 4.648617 4.824488 4.752438
## Richard III (1995)
## 809 3.73027
## 810 4.72027
ml.cross <- evaluationScheme(MovieLense[1:800], method = "cross", k = 4, given = 3, goodRating = 5)
ml.cross
## Evaluation scheme with 3 items given
## Method: 'cross-validation' with 4 run(s).
## Good ratings: >=5.000000
## Data set: 800 x 1664 rating matrix of class 'realRatingMatrix' with 85001 ratings.
ml.cross.results <- evaluate(ml.cross, method = "POPULAR", type = "topNList", n = c(1, 3, 5, 10, 15, 20))
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/0.48sec]
## 2 [0sec/0.45sec]
## 3 [0.02sec/0.44sec]
## 4 [0sec/0.45sec]
plot(ml.cross.results, annotate = TRUE, main = "ROC Curve for POPULAR Recommender Method")
plot(ml.cross.results, "prec/rec", annotate = TRUE, main = "Precision-Recall for POPULAR Recommender Method")
######################### UBCF and IBCF ############################
#divide the data into 90% training 10% test
div <- evaluationScheme(MovieLense[1:943], method="split", train = 0.9, given = 15, goodRating = 5)
div
## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=5.000000
## Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
#Create the user-based collaborative filtering recommender using the training data
r.ubcf <- Recommender(getData(div, "train"), "UBCF")
r.ubcf
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 848 users.
#Create the item-based collaborative filtering recommender using the training data
r.ibcf <- Recommender(getData(div, "train"), "IBCF")
r.ibcf
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 848 users.
#Compute predicted ratings for test data that is known using the UBCF algorithm
p.ubcf <- predict(r.ubcf, getData(div, "known"), type = "ratings")
p.ubcf
## 95 x 1664 rating matrix of class 'realRatingMatrix' with 156655 ratings.
#Compute predicted ratings for test data that is known using the IBCF algorithm
p.ibcf <- predict(r.ibcf, getData(div, "known"), type = "ratings")
p.ibcf
## 95 x 1664 rating matrix of class 'realRatingMatrix' with 19363 ratings.
#Calculate the error between training prediction and unknown test data
error <- rbind(
UBCF = calcPredictionAccuracy(p.ubcf, getData(div, "unknown")),
IBCF = calcPredictionAccuracy(p.ibcf, getData(div, "unknown")))
error
## RMSE MSE MAE
## UBCF 1.085484 1.178276 0.8644544
## IBCF 1.265857 1.602394 0.9299493
#another less clean way to code the same thing
error.ubcf<-calcPredictionAccuracy(p.ubcf, getData(div, "unknown"))
error.ibcf<-calcPredictionAccuracy(p.ibcf, getData(div, "unknown"))
error <- rbind(error.ubcf,error.ibcf)
rownames(error) <- c("UBCF","IBCF")
error
## RMSE MSE MAE
## UBCF 1.085484 1.178276 0.8644544
## IBCF 1.265857 1.602394 0.9299493