Data 612 Project 2
Introduction
For this project we will be comparing user-user collaborative filtering models with item-item collaborative filtering models on the Jester dataset from UC Berkeley. We will be using the recommenderlab library to train the models and compare the results of various tuning choices, such as the similarity metric, neighborhood size, and normalization method.
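The analysis assumes several packages are loaded in a setup chunk that is not shown; a minimal sketch of the likely library calls (the exact setup is an assumption):
library(recommenderlab) # collaborative filtering models and evaluation schemes
library(digest)         # hashing parameter combinations for the cache
library(foreach)        # grid search loop
library(doParallel)     # parallel backend for foreach's %dopar%
library(dplyr)          # data manipulation for the results plots
library(ggplot2)        # plotting
library(patchwork)      # combining the final plots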
Exploratory Data Analysis
data("Jester")
sparse_matrix <- Jester@data
n_ratings <- length(sparse_matrix@x)
basic_summary <- data.frame(
Metric = c("Number of Users", "Number of Items", "Total Ratings",
"Sparsity (%)", "Density (%)"),
Value = c(nrow(Jester),
ncol(Jester),
n_ratings,
round((1 - n_ratings / (nrow(Jester) * ncol(Jester))) * 100, 2),
round((n_ratings / (nrow(Jester) * ncol(Jester))) * 100, 2))
)
rating_stats_df <- data.frame(
Statistic = c("Mean", "Median", "Std Dev",
"Minimum", "Maximum", "Q1", "Q3"),
Value = c(round(mean(sparse_matrix@x), 3),
round(median(sparse_matrix@x), 3),
round(sd(sparse_matrix@x), 3),
round(min(sparse_matrix@x), 3),
round(max(sparse_matrix@x), 3),
round(quantile(sparse_matrix@x, 0.25), 3),
round(quantile(sparse_matrix@x, 0.75), 3))
)
We can see there are significantly more users than there are jokes, which will henceforth be referred to as items for simplicity. The dataset is actually very dense compared to more traditional recommender-system datasets. While there are other methods that would make better use of this density, we will stick to the previously mentioned models as per the assignment.
| Metric | Value |
|---|---|
| Number of Users | 24983 |
| Number of Items | 100 |
| Total Ratings | 1810455 |
| Sparsity (%) | 27.53 |
| Density (%) | 72.47 |
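To see how that density is distributed, per-user and per-item rating counts can be pulled directly from the rating matrix; a quick sketch using recommenderlab's count helpers:
# Ratings submitted by each user and received by each joke
summary(rowCounts(Jester))
summary(colCounts(Jester))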
The mean and median are both close to 0 given the total rating range of -10 to 10. This indicates the ratings are relatively centered, with a slight left skew since the mean falls below the median. The first and third quartile values lend further support to a left skew being present (a quick numeric check follows the table).
| Statistic | Value |
|---|---|
| Mean | 0.880 |
| Median | 1.500 |
| Std Dev | 5.236 |
| Minimum | -9.950 |
| Maximum | 10.000 |
| Q1 | -2.960 |
| Q3 | 5.100 |
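As that quick numeric check of the skew claim, two simple skewness measures can be computed from the same rating vector; a small sketch reusing sparse_matrix from the EDA chunk above:
x <- sparse_matrix@x
# Pearson's second skewness coefficient: negative values indicate left skew
3 * (mean(x) - median(x)) / sd(x)
# Bowley's quartile skewness: also negative for these ratings
q <- quantile(x, c(0.25, 0.5, 0.75))
(q[3] + q[1] - 2 * q[2]) / (q[3] - q[1])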
When we plot the ratings in a histogram we can see the left skew is indeed present, but otherwise the ratings are spread fairly evenly across the range, which is unexpected.
hist(getRatings(Jester), breaks = 50,
main = "Distribution of Joke Ratings",
xlab = "Rating", ylab = "Frequency")
Instead of the traditional train/test split, we will divide the data into three sets: train, test-known, and test-unknown. This split is commonly used for recommender systems to avoid a cold-start situation during evaluation: the model is shown some known ratings for each test user and is then asked to predict that user's remaining, withheld ratings.
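The evaluation scheme and training split are created in a chunk that is not shown; a minimal sketch of what the later code assumes, using recommenderlab's evaluationScheme() (the train proportion and the number of given ratings are assumptions):
set.seed(612)
# Hold out 20% of users; each test user reveals 15 ratings ("known")
# and the rest of their ratings are withheld ("unknown")
eval_scheme <- evaluationScheme(Jester, method = "split", train = 0.8, given = 15)
train_data <- getData(eval_scheme, "train")
test_known <- getData(eval_scheme, "known")
test_unknown <- getData(eval_scheme, "unknown")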
Model Training Functions
The model training process is easily parallelized, so much of the logic is broken out into functions. The functions are defined in this section, and the analysis continues in the next section.
Functions for caching and loading models/results to allow for quicker editing of non-modeling code and formatting.
# Create unique hash of parameter combinations
create_param_hash <- function(method, params) {
param_string <- paste(method, paste(names(params), params, collapse = "_"), sep = "_")
return(digest::digest(param_string, algo = "md5"))
}
# Function to save model results
save_model_results <- function(results, filename) {
filepath <- file.path(cache_dir, paste0(filename, ".rds"))
saveRDS(results, filepath)
cat("Saved results to:", filepath, "\n")
}
# Function to load cached results
load_cached_results <- function(filename) {
filepath <- file.path(cache_dir, paste0(filename, ".rds"))
if (file.exists(filepath)) {
return(readRDS(filepath))
}
return(NULL)
}
A function to train and evaluate a recommender model; it allows for easy parallelization when training models.
evaluate_model_parallel <- function(config, train_data, eval_scheme) {
method <- config$method
params <- config$params
result <- tryCatch({
# Basic modeling steps
model <- Recommender(train_data, method = method, parameter = params)
predictions <- predict(model, getData(eval_scheme, "known"), type = "ratings")
accuracy <- calcPredictionAccuracy(predictions, getData(eval_scheme, "unknown"))
data.frame(
method = method,
parameters = toString(params),
RMSE = accuracy["RMSE"],
MAE = accuracy["MAE"],
stringsAsFactors = FALSE
)
}, error = function(e) {
data.frame(
method = method,
parameters = toString(params),
RMSE = NA,
MAE = NA,
error = e$message,
stringsAsFactors = FALSE
)
})
return(result)
}
Exporting the custom functions for use by the parallelized cluster workers.
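The cluster setup itself happens in a chunk that is not shown; a minimal sketch of the assumed doParallel setup and export step (the worker count is an assumption):
# Register a parallel backend so foreach's %dopar% runs on multiple workers
n_cores <- max(1, parallel::detectCores() - 1)
cl <- parallel::makeCluster(n_cores)
doParallel::registerDoParallel(cl)
# PSOCK workers do not share the global environment, so the evaluation
# helper is exported explicitly before the grid search runs
parallel::clusterExport(cl, "evaluate_model_parallel")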
The majority of the model-training logic lives in this function, which is used for both the user-user and item-item models to cut down on repeated code. The default parameter values match how the function is called later and were included to aid debugging while the function was developed. Results are cached at the end of training to allow for quicker iteration on the analysis.
cf_grid_search <- function(train_data, eval_scheme, method = "UBCF",
similarity_methods = c("cosine", "pearson", "jaccard"),
neighbor_k_values = c(10, 20, 30, 40, 50),
sample_sizes = c(0, 50), # Only used for UBCF
normalize_methods = c("center", "Z-score", NULL), # note: NULL is dropped by c(), so no un-normalized baseline is actually included
normalize_row = c(TRUE, FALSE), # Row vs column normalization
use_cache = TRUE) {
# Set cache filename based on method
cache_file <- paste0(tolower(method), "_grid_results")
# Try to load cached results first
if (use_cache) {
cached_results <- load_cached_results(cache_file)
if (!is.null(cached_results)) {
return(cached_results)
}
}
# Create parameter grids based on method
if (method == "UBCF") {
param_grid <- expand.grid(
similarity = similarity_methods,
neighbor_k = neighbor_k_values,
sample = sample_sizes,
normalize = normalize_methods,
norm_row = normalize_row,
stringsAsFactors = FALSE
)
} else if (method == "IBCF") {
param_grid <- expand.grid(
similarity = similarity_methods,
neighbor_k = neighbor_k_values,
normalize = normalize_methods,
norm_row = normalize_row,
stringsAsFactors = FALSE
)
param_grid$sample <- 0 # IBCF doesn't use sampling, set to 0 for consistency
} else {
stop("Method must be either 'UBCF' or 'IBCF'")
}
# Prepare configurations for parallel processing
configs <- list()
for (i in 1:nrow(param_grid)) {
params_row <- param_grid[i, ]
# Build parameter list based on method
if (method == "UBCF") {
cf_params <- list(
method = params_row$similarity,
nn = params_row$neighbor_k # neighborhood size for UBCF
)
# Add sample parameter if not using all users
if (params_row$sample > 0) {
cf_params$sample <- params_row$sample
}
} else { # IBCF
cf_params <- list(
method = params_row$similarity,
k = params_row$neighbor_k # k similar items for IBCF
)
}
# Add normalization if specified (both methods)
if (!is.na(params_row$normalize)) {
cf_params$normalize <- params_row$normalize
cf_params$row <- params_row$norm_row # TRUE = row normalization, FALSE = column
}
configs[[i]] <- list(method = method, params = cf_params)
}
# Run parallel evaluation
results_list <- foreach(config = configs, .packages = c("recommenderlab")) %dopar% {
evaluate_model_parallel(config, train_data, eval_scheme)
}
# Combine results
results <- do.call(rbind, results_list)
# Add original parameter columns for analysis
results$similarity <- param_grid$similarity
# Add method-specific columns with consistent naming
if (method == "UBCF") {
results$neighbors <- param_grid$neighbor_k
results$sample <- param_grid$sample
} else { # IBCF
results$k <- param_grid$neighbor_k
results$sample <- 0 # For consistency in analysis functions
}
results$normalize <- ifelse(is.na(param_grid$normalize), "none", param_grid$normalize)
results$norm_row <- param_grid$norm_row
results$cf_method <- method
# Remove failed configurations
results <- results[!is.na(results$RMSE), ]
# Cache results
if (use_cache) {
save_model_results(results, cache_file)
}
return(results)
}
Wrapper functions, largely used for development.
ubcf_grid_search <- function(train_data, eval_scheme,
similarity_methods = c("cosine", "pearson", "jaccard"),
neighborhood_sizes = c(10, 20, 30, 40, 50),
sample_sizes = c(0, 50),
normalize_methods = c("center", "Z-score", NULL),
normalize_row = c(TRUE, FALSE),
use_cache = TRUE) {
return(cf_grid_search(train_data, eval_scheme, method = "UBCF",
similarity_methods = similarity_methods,
neighbor_k_values = neighborhood_sizes,
sample_sizes = sample_sizes,
normalize_methods = normalize_methods,
normalize_row = normalize_row,
use_cache = use_cache))
}
ibcf_grid_search <- function(train_data, eval_scheme,
similarity_methods = c("cosine", "pearson", "jaccard"),
k_values = c(10, 20, 30, 40, 50),
normalize_methods = c("center", "Z-score", NULL),
normalize_row = c(TRUE, FALSE),
use_cache = TRUE) {
return(cf_grid_search(train_data, eval_scheme, method = "IBCF",
similarity_methods = similarity_methods,
neighbor_k_values = k_values,
sample_sizes = c(0), # Not used for IBCF
normalize_methods = normalize_methods,
normalize_row = normalize_row,
use_cache = use_cache))
}
A generic function for analysis of model results.
analyze_cf_results <- function(results, method = NULL) {
# Determine method from results if not specified
if (is.null(method)) {
if ("cf_method" %in% colnames(results)) {
method <- results$cf_method[1]
} else if ("neighbors" %in% colnames(results)) {
method <- "UBCF"
} else if ("k" %in% colnames(results)) {
method <- "IBCF"
} else {
stop("Cannot determine method type from results")
}
}
cat(paste("\n", method, "RESULTS ANALYSIS\n"))
# Sort by RMSE
results <- results[order(results$RMSE), ]
# Overall best models
cat(paste("\nTOP 5", method, "CONFIGURATIONS:\n"))
for (i in 1:min(5, nrow(results))) {
if (method == "UBCF") {
cat(sprintf("%d. %s similarity, %d neighbors, sample=%s, normalize=%s (%s)\n",
i, results$similarity[i], results$neighbors[i],
ifelse(results$sample[i] == 0, "all", results$sample[i]),
results$normalize[i],
ifelse(results$norm_row[i], "row", "col")))
} else { # IBCF
cat(sprintf("%d. %s similarity, k=%d, normalize=%s (%s)\n",
i, results$similarity[i], results$k[i],
results$normalize[i],
ifelse(results$norm_row[i], "row", "col")))
}
cat(sprintf(" RMSE: %.4f, MAE: %.4f\n\n", results$RMSE[i], results$MAE[i]))
}
# Analysis by similarity method
cat("PERFORMANCE BY SIMILARITY METHOD:\n")
for (sim in unique(results$similarity)) {
sim_results <- results[results$similarity == sim, ]
best_sim <- sim_results[1, ] # Already sorted by RMSE
avg_rmse <- mean(sim_results$RMSE)
if (method == "UBCF") {
cat(sprintf("%s: Best RMSE %.4f (avg: %.4f) with %d neighbors\n",
toupper(sim), best_sim$RMSE, avg_rmse, best_sim$neighbors))
} else { # IBCF
cat(sprintf("%s: Best RMSE %.4f (avg: %.4f) with k=%d\n",
toupper(sim), best_sim$RMSE, avg_rmse, best_sim$k))
}
}
# Analysis by neighborhood/k size
if (method == "UBCF") {
cat("\nOPTIMAL NEIGHBORHOOD SIZES:\n")
size_analysis <- aggregate(RMSE ~ neighbors, data = results, FUN = mean)
size_analysis <- size_analysis[order(size_analysis$RMSE), ]
for (i in 1:min(5, nrow(size_analysis))) {
cat(sprintf("%d neighbors: Average RMSE %.4f\n",
size_analysis$neighbors[i], size_analysis$RMSE[i]))
}
} else { # IBCF
cat("\nOPTIMAL K VALUES:\n")
size_analysis <- aggregate(RMSE ~ k, data = results, FUN = mean)
size_analysis <- size_analysis[order(size_analysis$RMSE), ]
for (i in 1:min(5, nrow(size_analysis))) {
cat(sprintf("k=%d: Average RMSE %.4f\n",
size_analysis$k[i], size_analysis$RMSE[i]))
}
}
# Normalization impact analysis
if (length(unique(results$normalize)) > 1) {
cat("\nNORMALIZATION METHOD IMPACT:\n")
norm_analysis <- aggregate(RMSE ~ normalize, data = results, FUN = mean)
norm_analysis <- norm_analysis[order(norm_analysis$RMSE), ]
for (i in 1:nrow(norm_analysis)) {
cat(sprintf("%s: Average RMSE %.4f\n",
norm_analysis$normalize[i], norm_analysis$RMSE[i]))
}
}
# Row vs Column normalization comparison
if ("norm_row" %in% colnames(results) && length(unique(results$norm_row)) > 1) {
cat("\nROW vs COLUMN NORMALIZATION COMPARISON:\n")
# Overall comparison
row_norm_results <- results[results$norm_row == TRUE, ]
col_norm_results <- results[results$norm_row == FALSE, ]
if (nrow(row_norm_results) > 0 && nrow(col_norm_results) > 0) {
row_mean_rmse <- mean(row_norm_results$RMSE)
col_mean_rmse <- mean(col_norm_results$RMSE)
cat(sprintf("Row normalization: Average RMSE %.4f (%d configurations)\n",
row_mean_rmse, nrow(row_norm_results)))
cat(sprintf("Column normalization: Average RMSE %.4f (%d configurations)\n",
col_mean_rmse, nrow(col_norm_results)))
if (row_mean_rmse < col_mean_rmse) {
improvement <- ((col_mean_rmse - row_mean_rmse) / col_mean_rmse) * 100
cat(sprintf("→ Row normalization performs %.2f%% better on average\n", improvement))
} else {
improvement <- ((row_mean_rmse - col_mean_rmse) / row_mean_rmse) * 100
cat(sprintf("→ Column normalization performs %.2f%% better on average\n", improvement))
}
}
}
return(results)
}
More wrappers, mostly used for development.
analyze_ubcf_results <- function(results) {
return(analyze_cf_results(results, method = "UBCF"))
}
analyze_ibcf_results <- function(results) {
return(analyze_cf_results(results, method = "IBCF"))
}
A function to print a formatted comparison of the optimal UBCF and IBCF models.
compare_ubcf_ibcf <- function(ubcf_results, ibcf_results) {
cat("\nUBCF vs IBCF COMPARISON\n")
# Get best from each
best_ubcf <- ubcf_results[1, ]
best_ibcf <- ibcf_results[1, ]
cat("BEST UBCF MODEL:\n")
cat(sprintf(" %s similarity, %d neighbors, normalize=%s\n",
best_ubcf$similarity, best_ubcf$neighbors, best_ubcf$normalize))
cat(sprintf(" RMSE: %.4f, MAE: %.4f\n\n", best_ubcf$RMSE, best_ubcf$MAE))
cat("BEST IBCF MODEL:\n")
cat(sprintf(" %s similarity, k=%d, normalize=%s\n",
best_ibcf$similarity, best_ibcf$k, best_ibcf$normalize))
cat(sprintf(" RMSE: %.4f, MAE: %.4f\n\n", best_ibcf$RMSE, best_ibcf$MAE))
# Determine winner
if (best_ubcf$RMSE < best_ibcf$RMSE) {
cat("Best performing model: User-Based Collaborative Filtering\n")
cat(sprintf("UBCF achieves %.4f RMSE vs IBCF's %.4f RMSE\n",
best_ubcf$RMSE, best_ibcf$RMSE))
return("UBCF")
} else {
cat("Best performing model: Item-Based Collaborative Filtering\n")
cat(sprintf("IBCF achieves %.4f RMSE vs UBCF's %.4f RMSE\n",
best_ibcf$RMSE, best_ubcf$RMSE))
return("IBCF")
}
}
A function to fetch or build the best-performing user- and item-based models.
# Function to build and cache the best models (unified)
build_best_models <- function(train_data, ubcf_results, ibcf_results, use_cache = TRUE) {
# Check for cached models
ubcf_model_file <- "best_ubcf_model"
ibcf_model_file <- "best_ibcf_model"
if (use_cache) {
cached_ubcf <- load_cached_results(ubcf_model_file)
cached_ibcf <- load_cached_results(ibcf_model_file)
if (!is.null(cached_ubcf) && !is.null(cached_ibcf)) {
return(list(ubcf = cached_ubcf, ibcf = cached_ibcf))
}
}
# Get best configurations
best_ubcf_config <- ubcf_results[1, ]
best_ibcf_config <- ibcf_results[1, ]
# Build UBCF model
ubcf_params <- list(
method = best_ubcf_config$similarity,
nn = best_ubcf_config$neighbors
)
if (best_ubcf_config$sample > 0) {
ubcf_params$sample <- best_ubcf_config$sample
}
if (best_ubcf_config$normalize != "none") {
ubcf_params$normalize <- best_ubcf_config$normalize
}
ubcf_model <- Recommender(train_data, method = "UBCF", parameter = ubcf_params)
# Build IBCF model
ibcf_params <- list(
method = best_ibcf_config$similarity,
k = best_ibcf_config$k
)
if (best_ibcf_config$normalize != "none") {
ibcf_params$normalize <- best_ibcf_config$normalize
}
ibcf_model <- Recommender(train_data, method = "IBCF", parameter = ibcf_params)
# Cache models
if (use_cache) {
save_model_results(ubcf_model, ubcf_model_file)
save_model_results(ibcf_model, ibcf_model_file)
}
return(list(ubcf = ubcf_model, ibcf = ibcf_model))
}
Model Results
Looking at the top 5 UBCF configurations by RMSE, we can see the user-user models tend to perform best with larger neighborhoods, with the pearson and cosine similarity measures producing slightly better top configurations than jaccard. It is likely we would see further improvement with even larger neighborhoods, which is not surprising given the density of the dataset.
ubcf_results <- ubcf_grid_search(
train_data,
eval_scheme,
similarity_methods = c("cosine", "pearson", "jaccard"),
neighborhood_sizes = c(10, 20, 30, 40, 50),
sample_sizes = c(0, 50),
normalize_methods = c("center", "Z-score", NULL),
normalize_row = c(TRUE, FALSE),
use_cache = TRUE
)
# Analysis of UBCF results
ubcf_final <- analyze_ubcf_results(ubcf_results)
##
## UBCF RESULTS ANALYSIS
##
## TOP 5 UBCF CONFIGURATIONS:
## 1. pearson similarity, 40 neighbors, sample=50, normalize=Z-score (row)
## RMSE: 4.5011, MAE: 3.5080
##
## 2. cosine similarity, 40 neighbors, sample=50, normalize=center (row)
## RMSE: 4.5159, MAE: 3.5677
##
## 3. cosine similarity, 50 neighbors, sample=50, normalize=Z-score (col)
## RMSE: 4.5166, MAE: 3.5325
##
## 4. pearson similarity, 30 neighbors, sample=50, normalize=Z-score (row)
## RMSE: 4.5166, MAE: 3.5149
##
## 5. pearson similarity, 50 neighbors, sample=50, normalize=Z-score (col)
## RMSE: 4.5172, MAE: 3.5286
##
## PERFORMANCE BY SIMILARITY METHOD:
## PEARSON: Best RMSE 4.5011 (avg: 4.6761) with 40 neighbors
## COSINE: Best RMSE 4.5159 (avg: 4.6664) with 40 neighbors
## JACCARD: Best RMSE 4.5319 (avg: 4.6070) with 50 neighbors
##
## OPTIMAL NEIGHBORHOOD SIZES:
## 50 neighbors: Average RMSE 4.5590
## 40 neighbors: Average RMSE 4.5770
## 30 neighbors: Average RMSE 4.6132
## 20 neighbors: Average RMSE 4.6694
## 10 neighbors: Average RMSE 4.8306
##
## NORMALIZATION METHOD IMPACT:
## Z-score: Average RMSE 4.6464
## center: Average RMSE 4.6534
##
## ROW vs COLUMN NORMALIZATION COMPARISON:
## Row normalization: Average RMSE 4.6504 (60 configurations)
## Column normalization: Average RMSE 4.6493 (60 configurations)
## → Column normalization performs 0.03% better on average
The item-item models saw more separation based on the similarity measure, with jaccard similarity performing significantly worse than the other metrics. As with the user-user models, larger k values saw better performance, again as we would expect given the density of the dataset.
ibcf_results <- ibcf_grid_search(
train_data,
eval_scheme,
similarity_methods = c("cosine", "pearson", "jaccard"),
k_values = c(10, 20, 30, 40, 50),
normalize_methods = c("center", "Z-score", NULL),
normalize_row = c(TRUE, FALSE),
use_cache = TRUE
)
ibcf_final <- analyze_ibcf_results(ibcf_results)
##
## IBCF RESULTS ANALYSIS
##
## TOP 5 IBCF CONFIGURATIONS:
## 1. cosine similarity, k=50, normalize=center (row)
## RMSE: 4.5121, MAE: 3.5130
##
## 2. cosine similarity, k=50, normalize=center (col)
## RMSE: 4.5121, MAE: 3.5130
##
## 3. cosine similarity, k=50, normalize=Z-score (row)
## RMSE: 4.5147, MAE: 3.5133
##
## 4. cosine similarity, k=50, normalize=Z-score (col)
## RMSE: 4.5147, MAE: 3.5133
##
## 5. cosine similarity, k=40, normalize=center (row)
## RMSE: 4.5382, MAE: 3.5047
##
## PERFORMANCE BY SIMILARITY METHOD:
## COSINE: Best RMSE 4.5121 (avg: 4.7010) with k=50
## PEARSON: Best RMSE 4.5600 (avg: 4.7296) with k=50
## JACCARD: Best RMSE 5.0878 (avg: 5.4896) with k=50
##
## OPTIMAL K VALUES:
## k=50: Average RMSE 4.7252
## k=40: Average RMSE 4.8186
## k=30: Average RMSE 4.9207
## k=20: Average RMSE 5.0367
## k=10: Average RMSE 5.3659
##
## NORMALIZATION METHOD IMPACT:
## center: Average RMSE 4.9700
## Z-score: Average RMSE 4.9768
##
## ROW vs COLUMN NORMALIZATION COMPARISON:
## Row normalization: Average RMSE 4.9734 (30 configurations)
## Column normalization: Average RMSE 4.9734 (30 configurations)
## → Column normalization performs 0.00% better on average
For both model types the normalization method did not have a significant impact on performance: neither the normalization method nor the direction changed the average RMSE by a meaningful amount. As mentioned in the EDA, the joke ratings are already fairly centered and evenly spread, so we would expect normalization to have little effect.
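For reference, these options map onto recommenderlab's normalize() applied to the rating matrix; a small sketch (the row argument mirrors the norm_row flag from the grid search and is assumed to be supported by the installed recommenderlab version):
# Subtract each user's mean rating (row-wise centering)
centered <- normalize(Jester, method = "center", row = TRUE)
# Center and scale each user's ratings (row-wise Z-score)
zscored <- normalize(Jester, method = "Z-score", row = TRUE)
summary(getRatings(centered))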
Comparing the optimal user and item models, we can see they perform very similarly. The UBCF model performs slightly better on the test set, and would therefore be the recommended model in a real-world scenario if choosing between the two examined model classes. Given the density of the dataset, model-based approaches such as matrix factorization could likely improve performance further.
##
## UBCF vs IBCF COMPARISON
## BEST UBCF MODEL:
## pearson similarity, 40 neighbors, normalize=Z-score
## RMSE: 4.5011, MAE: 3.5080
##
## BEST IBCF MODEL:
## cosine similarity, k=50, normalize=center
## RMSE: 4.5121, MAE: 3.5130
##
## Best performing model: User-Based Collaborative Filtering
## UBCF achieves 4.5011 RMSE vs IBCF's 4.5121 RMSE
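As a final illustration of how the winning model would be used, a short sketch that builds the cached best models and scores a few test users (the user indices are arbitrary):
best_models <- build_best_models(train_data, ubcf_final, ibcf_final)
# Predict the held-out ratings for the first three test users
sample_known <- getData(eval_scheme, "known")[1:3]
pred_ratings <- predict(best_models$ubcf, sample_known, type = "ratings")
as(pred_ratings, "matrix")[, 1:5]
# Or produce a top-5 joke list per user instead of predicted ratings
top_jokes <- predict(best_models$ubcf, sample_known, n = 5)
as(top_jokes, "list")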
Finally, examining the impact of the different tuning parameters, we can see that the most influential parameter was the neighborhood size or k value, depending on the model class. The similarity measure did have a significant impact on the IBCF models, as jaccard similarity resulted in one of the largest performance differences observed. Interestingly, this was not seen in the UBCF models, where the different similarity measures showed little performance variation, especially for larger neighborhoods.
ubcf_plot_data <- ubcf_final |>
mutate(method = "UBCF",
size_param = neighbors,
# Fix NULL handling in normalize column
normalize = ifelse(is.na(normalize) | normalize == "none", "none", as.character(normalize)),
norm_type = paste(normalize, ifelse(norm_row, "(row)", "(col)"), sep=" "))
ibcf_plot_data <- ibcf_final |>
mutate(method = "IBCF",
size_param = k,
# Fix NULL handling in normalize column
normalize = ifelse(is.na(normalize) | normalize == "none", "none", as.character(normalize)),
norm_type = paste(normalize, ifelse(norm_row, "(row)", "(col)"), sep=" "))
combined_data <- rbind(
ubcf_plot_data |> select(method, similarity, size_param, RMSE, MAE, norm_type, normalize),
ibcf_plot_data |> select(method, similarity, size_param, RMSE, MAE, norm_type, normalize)
)
p1 <- combined_data |>
group_by(method, similarity) |>
summarise(mean_RMSE = mean(RMSE), .groups = 'drop') |>
ggplot(aes(x = similarity, y = mean_RMSE, fill = method)) +
geom_col(position = "dodge", alpha = 0.8) +
scale_fill_viridis_d(name = "Algorithm") +
labs(title = "Average RMSE by Similarity Method",
subtitle = "Comparing UBCF vs IBCF",
x = "Similarity Method", y = "RMSE") +
theme_minimal()
p2 <- combined_data |>
ggplot(aes(x = size_param, y = RMSE, color = method)) +
geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "loess", se = TRUE, alpha = 0.2) +
facet_wrap(~similarity, scales = "free_x") +
scale_color_viridis_d(name = "Algorithm") +
labs(title = "RMSE vs Neighborhood/K Size",
subtitle = "By similarity method",
x = "Neighborhood Size / K Value", y = "RMSE") +
theme_minimal()
p3 <- combined_data |>
filter(normalize != "none") |>
ggplot(aes(x = norm_type, y = RMSE, fill = method)) +
geom_boxplot(alpha = 0.7) +
scale_fill_viridis_d(name = "Algorithm") +
labs(title = "Normalization Impact on RMSE",
subtitle = "Row vs Column normalization",
x = "Normalization Type", y = "RMSE") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Combine with unified legend
layout <- (p1 + p3) / p2 +
plot_layout(heights = c(1, 1.5), guides = "collect") &
theme(legend.position = "bottom")
layout