Basic exploration of the raw data shows the dimensions of the data set, a summary of the number of jokes rated per user, and the minimum and maximum ratings:
## [1] 24938   101
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   15.00   20.00   25.00   24.92   30.00   35.00
## [1] -9.95
## [1] 9.81
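The code for these summaries is not shown above; what follows is a minimal sketch of commands that would produce them, assuming jester is a data frame with one row per user and NA for unrated jokes:
dim(jester)                       # number of users and columns
summary(rowSums(!is.na(jester)))  # jokes rated per user (assumed derivation)
min(jester, na.rm = TRUE)         # lowest rating in the data
max(jester, na.rm = TRUE)         # highest rating in the data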
# visualize the distribution of the raw ratings
hist(as.vector(as.matrix(jester)), main = "Distribution of Jester Ratings",
     col = "yellow", xlab = "Ratings")
boxplot(as.vector(as.matrix(jester)), col = "yellow",
        main = "Distribution of Jester Ratings", ylab = "Ratings")
average_ratings_per_user <- rowMeans(jester, na.rm = TRUE)
hist(average_ratings_per_user, main = "Distribution of the average rating per user",
     col = "yellow")
# memory cleanup
rm(average_ratings_per_user)
Prior to using any of the pre-built recommenderlab functions for collaborative filtering, we must first convert the data frame to a realRatingMatrix. This is done by first converting the data frame to an R matrix and then converting that matrix to a realRatingMatrix using the as() function.
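Since the conversion code itself does not appear above, the following is a minimal sketch of the two steps just described, producing the rmat object used below:
library(recommenderlab)
# data frame -> base R matrix -> realRatingMatrix
rmat <- as(as.matrix(jester), "realRatingMatrix")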
# 80/20 train/test split: 15 ratings from each test user are "given" to the
# models, and the remainder are withheld as the "unknown" set for evaluation
e <- evaluationScheme(rmat, method = "split", train = 0.8, given = 15, goodRating = 0)
# train UBCF cosine similarity models
# non-normalized
UBCF_N_C <- Recommender(getData(e, "train"), "UBCF",
                        param = list(normalize = NULL, method = "Cosine"))
# centered
UBCF_C_C <- Recommender(getData(e, "train"), "UBCF",
                        param = list(normalize = "center", method = "Cosine"))
# Z-score normalization
UBCF_Z_C <- Recommender(getData(e, "train"), "UBCF",
                        param = list(normalize = "Z-score", method = "Cosine"))
# compute predicted ratings for the test users' "known" profiles
p1 <- predict(UBCF_N_C, getData(e, "known"), type = "ratings")
p2 <- predict(UBCF_C_C, getData(e, "known"), type = "ratings")
p3 <- predict(UBCF_Z_C, getData(e, "known"), type = "ratings")
# aggregate the performance statistics
error_UCOS <- rbind(
  UBCF_N_C = calcPredictionAccuracy(p1, getData(e, "unknown")),
  UBCF_C_C = calcPredictionAccuracy(p2, getData(e, "unknown")),
  UBCF_Z_C = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_UCOS)
##              RMSE      MSE      MAE
## UBCF_N_C 4.938278 24.38659 3.950918
## UBCF_C_C 4.844106 23.46536 3.772790
## UBCF_Z_C 4.788592 22.93061 3.653038
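The three metrics reported by calcPredictionAccuracy() follow the standard definitions; as a quick illustration with hypothetical predicted and actual rating vectors p and r:
p <- c(2.5, -1.0, 4.0)      # hypothetical predicted ratings
r <- c(3.0, -2.0, 3.5)      # hypothetical actual ratings
mse  <- mean((p - r)^2)     # mean squared error
rmse <- sqrt(mse)           # root mean squared error
mae  <- mean(abs(p - r))    # mean absolute error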
boxplot(as.vector(as(p3, "matrix")), col = "yellow",
        main = "Distribution of Predicted Values for UBCF Z-Score/Cosine Model",
        ylab = "Ratings")
hist(as.vector(as(p3, "matrix")),
     main = "Distrib. of Predicted Values for UBCF Z-Score/Cosine Model",
     col = "yellow", xlab = "Predicted Ratings")
A direct comparison of the summary statistics for the raw data and for the predictions obtained from the UBCF_Z_C model shows that the predicted values fall within a narrower 1st-to-3rd-quartile range than the raw ratings do. Furthermore, a prediction has been generated for every joke the test users had not rated, as evidenced by the absence of NA values in the prediction results.
summary(as.vector(as.matrix(jester)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
##   -9.95   -4.13    0.92    0.37    4.90    9.81   75076
summary(as.vector(p3@data@x))
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## -10.00000  -2.78152   0.28162  -0.06342   2.95520  10.00000
Item-based collaborative filtering models using Euclidean distance as the similarity metric are generated following the approach outlined above for the cosine similarity models:
# non-normalized
IBCF_N_E <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = NULL, method = "Euclidean"))
# centered
IBCF_C_E <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = "center", method = "Euclidean"))
# Z-score normalization
IBCF_Z_E <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = "Z-score", method = "Euclidean"))
# compute predicted ratings
p1 <- predict(IBCF_N_E, getData(e, "known"), type = "ratings")
p2 <- predict(IBCF_C_E, getData(e, "known"), type = "ratings")
p3 <- predict(IBCF_Z_E, getData(e, "known"), type = "ratings")
# set all predictions that fall outside the valid rating range to the boundary values
p1@data@x[p1@data@x < -10] <- -10
p1@data@x[p1@data@x > 10] <- 10
p2@data@x[p2@data@x < -10] <- -10
p2@data@x[p2@data@x > 10] <- 10
p3@data@x[p3@data@x < -10] <- -10
p3@data@x[p3@data@x > 10] <- 10
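Because this clipping step is repeated for every model, it could be factored into a small helper; the function below is our illustration rather than part of the original code:
# hypothetical helper: clamp predicted ratings to the valid [-10, 10] scale
clip_ratings <- function(p, lo = -10, hi = 10) {
  p@data@x <- pmin(pmax(p@data@x, lo), hi)
  p
}
# usage: p1 <- clip_ratings(p1)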
# aggregate the performance statistics
error_IEUC <- rbind(
  IBCF_N_E = calcPredictionAccuracy(p1, getData(e, "unknown")),
  IBCF_C_E = calcPredictionAccuracy(p2, getData(e, "unknown")),
  IBCF_Z_E = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_IEUC)
##              RMSE      MSE      MAE
## IBCF_N_E 4.981684 24.81717 3.725146
## IBCF_C_E 4.984341 24.84366 3.728713
## IBCF_Z_E 4.998385 24.98385 3.744801
Item-based collaborative filtering models using Pearson correlation as the similarity metric are generated following the approach outlined above for our previous models:
# non-normalized
IBCF_N_P <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = NULL, method = "pearson"))
# centered
IBCF_C_P <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = "center", method = "pearson"))
# Z-score normalization
IBCF_Z_P <- Recommender(getData(e, "train"), "IBCF",
                        param = list(normalize = "Z-score", method = "pearson"))
# compute predicted ratings
p1 <- predict(IBCF_N_P, getData(e, "known"), type = "ratings")
p2 <- predict(IBCF_C_P, getData(e, "known"), type = "ratings")
p3 <- predict(IBCF_Z_P, getData(e, "known"), type = "ratings")
# set all predictions that fall outside the valid rating range to the boundary values
p1@data@x[p1@data@x < -10] <- -10
p1@data@x[p1@data@x > 10] <- 10
p2@data@x[p2@data@x < -10] <- -10
p2@data@x[p2@data@x > 10] <- 10
p3@data@x[p3@data@x < -10] <- -10
p3@data@x[p3@data@x > 10] <- 10
# aggregate the performance statistics
error_IPC <- rbind(
  IBCF_N_P = calcPredictionAccuracy(p1, getData(e, "unknown")),
  IBCF_C_P = calcPredictionAccuracy(p2, getData(e, "unknown")),
  IBCF_Z_P = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_IPC)
The table below summarizes the performance of the UBCF cosine and IBCF Euclidean models evaluated above, with the models sorted in ascending order according to their respective RMSE scores.
# collect the error metrics for the models being compared and sort by RMSE
c_res <- data.frame(rbind(error_UCOS, error_IEUC))
c_res <- c_res[order(c_res$RMSE), ]
head(c_res)
##              RMSE      MSE      MAE
## UBCF_Z_C 4.788592 22.93061 3.653038
## UBCF_C_C 4.844106 23.46536 3.772790
## UBCF_N_C 4.938278 24.38659 3.950918
## IBCF_N_E 4.981684 24.81717 3.725146
## IBCF_C_E 4.984341 24.84366 3.728713
## IBCF_Z_E 4.998385 24.98385 3.744801
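A barplot of the RMSE scores can accompany the table; the snippet below is a minimal sketch based on the c_res data frame built above, with illustrative styling choices:
barplot(c_res$RMSE, names.arg = rownames(c_res), col = "yellow",
        las = 2, main = "Model RMSE Comparison", ylab = "RMSE")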