Output from the preceding exploratory steps: the dimensions of the jester data frame, a summary of the number of jokes rated per user, and the minimum and maximum ratings in the data set:

## [1] 24938   101
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   20.00   25.00   24.92   30.00   35.00
## [1] -9.95
## [1] 9.81

Plot a histogram and a boxplot of the overall ratings distribution

hist(as.vector(as.matrix(jester)), main = "Distribution of Jester Ratings",
     col = "yellow", xlab = "Ratings")

boxplot(as.vector(as.matrix(jester)), col = "yellow",
        main = "Distribution of Jester Ratings", ylab = "Ratings")

Compute and plot the average rating per user

average_ratings_per_user <- rowMeans(jester, na.rm = TRUE)

hist(average_ratings_per_user, main = "Distribution of the Average Rating per User",
     col = "yellow", xlab = "Average Rating")

# memory cleanup
rm(average_ratings_per_user)

Creating Training and Testing Subsets

Prior to using any of the pre-built recommenderlab functions for collaborative filtering, we must first convert the data frame to a realRatingMatrix. This is done by converting the data frame to an R matrix and then coercing that matrix to a realRatingMatrix with the as() function.

Convert the jester data frame to a matrix, then coerce it to a realRatingMatrix (a minimal version of this step, producing the rmat object used below):
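
# convert the data frame to an R matrix, then to a realRatingMatrix
rmat <- as.matrix(jester)
rmat <- as(rmat, "realRatingMatrix")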

Next, split the data into training and testing subsets with evaluationScheme(): 80% of users are assigned to the training set, 15 ratings per test user are retained as "known" inputs for prediction, and ratings above 0 are treated as positive:

e <- evaluationScheme(rmat, method = "split", train = 0.8, given = 15, goodRating = 0)
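
The three partitions used throughout the remainder of this analysis are retrieved from the scheme with getData(); a quick reference sketch (the variable names here are illustrative only):

train_set   <- getData(e, "train")    # training users, used to fit each model
known_set   <- getData(e, "known")    # the 15 ratings per test user supplied to predict()
unknown_set <- getData(e, "unknown")  # remaining test ratings, held out for accuracy scoring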

# train UBCF cosine similarity models

# non-normalized
UBCF_N_C <- Recommender(getData(e, "train"), "UBCF",
      param = list(normalize = NULL, method = "Cosine"))

# centered
UBCF_C_C <- Recommender(getData(e, "train"), "UBCF",
      param = list(normalize = "center", method = "Cosine"))

# Z-score normalization
UBCF_Z_C <- Recommender(getData(e, "train"), "UBCF",
      param = list(normalize = "Z-score", method = "Cosine"))

Compute predicted ratings

p1 <- predict(UBCF_N_C, getData(e, "known"), type="ratings")

p2 <- predict(UBCF_C_C, getData(e, "known"), type="ratings")

p3 <- predict(UBCF_Z_C, getData(e, "known"), type="ratings")

Since the normalized models can produce predictions outside the original [-10, 10] rating scale, set any out-of-range predictions to the boundary values:
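
p1@data@x[p1@data@x[] < -10] <- -10
p1@data@x[p1@data@x[] > 10] <- 10

p2@data@x[p2@data@x[] < -10] <- -10
p2@data@x[p2@data@x[] > 10] <- 10

p3@data@x[p3@data@x[] < -10] <- -10
p3@data@x[p3@data@x[] > 10] <- 10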

Aggregate the performance statistics

error_UCOS <- rbind(
  UBCF_N_C = calcPredictionAccuracy(p1, getData(e, "unknown")),
  UBCF_C_C = calcPredictionAccuracy(p2, getData(e, "unknown")),
  UBCF_Z_C = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_UCOS)
##              RMSE      MSE      MAE
## UBCF_N_C 4.938278 24.38659 3.950918
## UBCF_C_C 4.844106 23.46536 3.772790
## UBCF_Z_C 4.788592 22.93061 3.653038
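
For reference, the error metrics reported by calcPredictionAccuracy() are the standard ones; a minimal sketch of how each column is computed (the residuals below are hypothetical):

# hypothetical (predicted - actual) residuals over the held-out ratings
err  <- c(-1.2, 0.5, 2.0)
MSE  <- mean(err^2)       # mean squared error
RMSE <- sqrt(MSE)         # root mean squared error
MAE  <- mean(abs(err))    # mean absolute error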

A boxplot and histogram of the Z-score model’s predicted values demonstrate that their distribution is nearly normal:

boxplot(as.vector(as(p3, "matrix")), col = "yellow",
        main = "Distribution of Predicted Values for UBCF Z-Score/Cosine Model",
        ylab = "Ratings")

hist(as.vector(as(p3, "matrix")),
     main = "Distribution of Predicted Values for UBCF Z-Score/Cosine Model",
     col = "yellow", xlab = "Predicted Ratings")

A direct comparison of the summary statistics for the raw data and the predictions obtained from the UBCF_Z_C model shows that the predicted values fall within a narrower interquartile range than the raw ratings. Furthermore, the prediction results contain no NA values, indicating that a rating has been predicted for every item the test users had not rated.

summary(as.vector(as.matrix(jester)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -9.95   -4.13    0.92    0.37    4.90    9.81   75076
summary(as.vector(p3@data@x))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -10.00000  -2.78152   0.28162  -0.06342   2.95520  10.00000

Item-Based Collaborative Filtering: Euclidean Distance

Item-based collaborative filtering models using Euclidean Distance as the similarity metric are generated following the approach outlined above for the cosine similarity models:

Train IBCF Euclidean Distance models

# non-normalized
IBCF_N_E <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = NULL, method = "Euclidean"))

# centered
IBCF_C_E <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = "center", method = "Euclidean"))

# Z-score normalization
IBCF_Z_E <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = "Z-score", method = "Euclidean"))

Evaluation of the models is performed as follows:

# compute predicted ratings
p1 <- predict(IBCF_N_E, getData(e, "known"), type="ratings")

p2 <- predict(IBCF_C_E, getData(e, "known"), type="ratings")

p3 <- predict(IBCF_Z_E, getData(e, "known"), type="ratings")

# set any predictions that fall outside the valid range to the boundary values

p1@data@x[p1@data@x[] < -10] <- -10
p1@data@x[p1@data@x[] > 10] <- 10

p2@data@x[p2@data@x[] < -10] <- -10
p2@data@x[p2@data@x[] > 10] <- 10

p3@data@x[p3@data@x[] < -10] <- -10
p3@data@x[p3@data@x[] > 10] <- 10

Aggregate the performance statistics

error_IEUC <- rbind(
  IBCF_N_E = calcPredictionAccuracy(p1, getData(e, "unknown")),
  IBCF_C_E = calcPredictionAccuracy(p2, getData(e, "unknown")),
  IBCF_Z_E = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_IEUC)
##              RMSE      MSE      MAE
## IBCF_N_E 4.981684 24.81717 3.725146
## IBCF_C_E 4.984341 24.84366 3.728713
## IBCF_Z_E 4.998385 24.98385 3.744801

Item-Based Collaborative Filtering: Pearson Correlation

Item-based collaborative filtering models using Pearson Correlation as the similarity metric are generated following the approach outlined above for our previous models:

Train IBCF Pearson correlation models

# non-normalized
IBCF_N_P <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = NULL, method = "pearson"))

# centered
IBCF_C_P <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = "center", method = "pearson"))

# Z-score normalization
IBCF_Z_P <- Recommender(getData(e, "train"), "IBCF",
      param = list(normalize = "Z-score", method = "pearson"))

# compute predicted ratings
p1 <- predict(IBCF_N_P, getData(e, "known"), type="ratings")

p2 <- predict(IBCF_C_P, getData(e, "known"), type="ratings")

p3 <- predict(IBCF_Z_P, getData(e, "known"), type="ratings")

As before, set any predictions that fall outside the valid range to the boundary values:

p1@data@x[p1@data@x[] < -10] <- -10
p1@data@x[p1@data@x[] > 10] <- 10

p2@data@x[p2@data@x[] < -10] <- -10
p2@data@x[p2@data@x[] > 10] <- 10

p3@data@x[p3@data@x[] < -10] <- -10
p3@data@x[p3@data@x[] > 10] <- 10

# aggregate the performance statistics
error_IPC <- rbind(
  IBCF_N_P = calcPredictionAccuracy(p1, getData(e, "unknown")),
  IBCF_C_P = calcPredictionAccuracy(p2, getData(e, "unknown")),
  IBCF_Z_P = calcPredictionAccuracy(p3, getData(e, "unknown"))
)
head(error_IPC)

Conclusions

The table and barplot below summarize the performance of the models evaluated above, sorted in ascending order by RMSE.

c_res <- data.frame(rbind(error_UCOS, error_IEUC))
c_res <- c_res[order(c_res$RMSE), ]

head(c_res)
##              RMSE      MSE      MAE
## UBCF_Z_C 4.788592 22.93061 3.653038
## UBCF_C_C 4.844106 23.46536 3.772790
## UBCF_N_C 4.938278 24.38659 3.950918
## IBCF_N_E 4.981684 24.81717 3.725146
## IBCF_C_E 4.984341 24.84366 3.728713
## IBCF_Z_E 4.998385 24.98385 3.744801
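
The barplot of the RMSE scores can be generated along the following lines (the original plotting code was not shown, so the styling here is an assumption):

# barplot of RMSE by model, in the sorted order of c_res
barplot(c_res$RMSE, names.arg = rownames(c_res), col = "yellow",
        main = "Barplot of Model RMSE Scores", ylab = "RMSE", las = 2)

By this measure, the Z-score normalized user-based cosine model (UBCF_Z_C) performs best among the models compared here.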