options(scipen = 999)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(syuzhet)
library(stats)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(lsa)
## Loading required package: SnowballC
library(Matrix)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# --- Data Loading ---
# File paths
setwd("/Users/meihanhe/Downloads")
review_file <- "merged_yelp_data.csv"
tfidf_file <- "tfidf_matrix.csv"
# Load the review data with data.table::fread (fast CSV reader)
review_data_orig <- data.table::fread(review_file)
# Load the TF-IDF data
tfidf_matrix_orig <- data.table::fread(tfidf_file)
set.seed(123) # set the random seed so results are reproducible
# Sample 100,000 row indices
idx <- sample(1:nrow(review_data_orig), 100000, replace = FALSE)
# Extract the sampled rows
review_data <- review_data_orig[idx, ]
cat("Review data loaded. Dimensions:", dim(review_data), "\n")
## Review data loaded. Dimensions: 100000 18
print(head(review_data, 3))
## business_id review_id user_id stars.x
## <char> <char> <char> <int>
## 1: lTeoNvVo9cNvI5DLRkl9Sg 1RcX1_h4ngAYm74ObqDZHg zh4bVOKTUytds4u9HuNAiw 5
## 2: 469cDiOBw6Hs0dUdFQrEmg AsiPBSecONcHcqM6bx4j7w AdR-CWXalPXe7XqCm-BoQw 4
## 3: j99PtxlKTW_u5alE4jiqHQ P-I9byCkn4TlqEWDpM-VpA t-MbBRkNIR0eM_Ha2dAF0w 3
## useful funny cool
## <int> <int> <int>
## 1: 0 0 0
## 2: 0 0 0
## 3: 7 1 4
## text
## <char>
## 1: This really an Oasis. Great fresh food prepared just for you. There is something for everybody. Everything we ate was made just for us. A real treasure for downtown.
## 2: In need of a bit of help. I leave for my trip on Saturday morning March 4th but my appointment isn't until Thursday March 2nd. By going through this process will I receive my passport that same day, March 2nd? If not, is there any way to schedule another appointment otherwise? Thank you!
## 3: Ralph's is the oldest operating Italian restaurant in the country, so they're definitely doing some things right. Reservations were limited an we arrived to a packed house at 9pm. There are 3 floors of dining so you may end up climbing 2 flights of stairs. We only had to climb one set.\n\nFirst, it's small and cramped. The tables are small and you feel like you're in a china shop and if you move the wrong way, you're going to knock over your glass or drag your sleeve through some sauce. If you're good with that, then great. It wasn't relaxing though.\n\nWe started off by ordering a glass of wine. I had a cab and my wife had a pinot noir. The glasses are cute tiny wine glasses that were almost filled to the rim. A bigger glass would have been nice, not because it would have more wine, but because there probably wouldn't have been wine on the outside of the glass and around the base leaving little wine rings when you set it down. That's a waste of wine!\n\nBeing on the 2nd floor, everything is being brought up the small staircase. And the waiters are hustling and running and doing a great job. They are definitely in shape. They were pretty on top of things, but it was so busy that our water glasses sat empty for quite a while. Definitely not top notch service, but not horrible either.\n\nWe ordered the Antipasto which had a nice assortment of meats and cheeses and veggies. It wasn't swimming in dressing and it could have used a little more. It was my first antipasto with tuna (not ahi, more like the ""canned"" kind) and it wasn't bad. I wish there was more meat though. If you're sharing, you each get one piece of salami, prosciutto, and mortadella (all excellent). And it did have anchovies, so either rejoice or ask for it without!\n\nThe main courses came out and the the lasagna was excellent. It had many layers of pasta, ricotta, meat sauce, and a nice helping of their red gravy. I also ordered a side of meatballs and they were pretty good. Not too dense and not too light. Someone back there knows how to roll a meatball.\n\nThe veal piccata was flavorful and tasty, however, the meat itself was a bit tough. You really had to chew it and it really needed the sauce for the moisture. It was disappointing. There were 3 pieces and they were pretty tiny. The side of pasta was ok, but a little bland. We ended up mixing it with the lemon sauce and think we invented a new dish! It was much better. Lemon and red sauce! Who knew?\n\nRalph's was a mixed bag for us. It's definitely classic Italian. The lasagna portion was quite large, the veal not so much. It's probably something you should experience if you have the time and want to see what the fuss is about. We wouldn't go back again though. It was good, but not knock your socks off amazing.
## date name address
## <IDat> <char> <char>
## 1: 2014-04-01 The Oasis 821 W Main St
## 2: 2017-02-22 Philadelphia Passport Agency 200 Chestnut St, Room 103
## 3: 2020-03-15 Ralphs Italian Restaurant 760 S 9th St
## city state latitude longitude stars.y review_count
## <char> <char> <num> <num> <num> <int>
## 1: Lansdale PA 40.24766 -75.29350 4.5 127
## 2: Philadelphia PA 39.94782 -75.14514 4.0 100
## 3: Philadelphia PA 39.94006 -75.15787 3.5 653
## categories
## <char>
## 1: Mediterranean, Food, Specialty Food, Restaurants, Middle Eastern
## 2: Hotels & Travel, Travel Services, Passport & Visa Services, Public Services & Government, Embassy
## 3: Restaurants, Italian
# Subset the TF-IDF rows with the same indices (assumes the two files are row-aligned)
tfidf_matrix <- tfidf_matrix_orig[idx, ]
cat("TF-IDF data loaded. Dimensions:", dim(tfidf_matrix), "\n")
## TF-IDF data loaded. Dimensions: 100000 151
print(head(tfidf_matrix[, 1:5], 3))
## also always amazing another area
## <num> <num> <num> <num> <num>
## 1: 0.0000000 0 0.00000000 0.0000000 0
## 2: 0.0000000 0 0.00000000 0.1506528 0
## 3: 0.0102515 0 0.01408533 0.0000000 0
# --- 1. Sentiment Analysis vs. Star Rating ---
cat("\n--- Analysis 1: Sentiment Analysis vs. Star Rating ---\n")
##
## --- Analysis 1: Sentiment Analysis vs. Star Rating ---
# Hypothesis: review sentiment scores are clearly positively correlated with user star ratings.
# Extract text and ratings
sentiment_data <- review_data %>%
select(review_id, text, stars_review = stars.x) %>%
filter(!is.na(text) & !is.na(stars_review)) %>%
# Add a temporary row number to join the sentiment scores back later
mutate(temp_id = row_number())
# NA texts were already removed by the filter above; these guards keep syuzhet
# from erroring if any remain
text_to_analyze <- sentiment_data$text[!is.na(sentiment_data$text)]
ids_to_analyze <- sentiment_data$temp_id[!is.na(sentiment_data$text)]
# Compute sentiment scores
sentiment_scores_vec <- get_sentiment(text_to_analyze, method = "syuzhet") # one aggregate score per review
# Build a data frame of IDs and scores
sentiment_df <- data.frame(temp_id = ids_to_analyze, sentiment_score = sentiment_scores_vec)
# Join the scores back to the original data
sentiment_data <- left_join(sentiment_data, sentiment_df, by = "temp_id")
# Inspect the score distribution and its relationship with the ratings
summary(sentiment_data$sentiment_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -18.40 1.25 2.90 3.33 4.90 36.90
cor_test <- cor.test(sentiment_data$stars_review, sentiment_data$sentiment_score, method = "pearson")
cat("Correlation between review stars and sentiment score (Pearson):\n")
## Correlation between review stars and sentiment score (Pearson):
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: sentiment_data$stars_review and sentiment_data$sentiment_score
## t = 146.58, df = 99998, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4154356 0.4256392
## sample estimates:
## cor
## 0.4205507
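# Star ratings are ordinal, so a rank-based correlation is a natural complement
# to Pearson's r. A minimal sketch (not part of the original run):
cor(sentiment_data$stars_review, sentiment_data$sentiment_score,
    method = "spearman", use = "complete.obs")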
# Visualize the result
plot1 <- ggplot(sentiment_data, aes(x = as.factor(stars_review), y = sentiment_score)) +
geom_boxplot(na.rm = TRUE) +
labs(title = "Sentiment Score Distribution by Review Star Rating",
x = "Review Stars (stars.x)",
y = "Sentiment Score (Syuzhet Method)") +
theme_minimal()
print(plot1)
# Clean up temporary objects
rm(sentiment_scores_vec, sentiment_df, text_to_analyze, ids_to_analyze)
# --- 2. Clustering Analysis ---
cat("\n--- Analysis 2: User Clustering ---\n")
##
## --- Analysis 2: User Clustering ---
# Hypothesis: multi-feature clustering will reveal at least two user groups with
# clearly different consumption behavior.
# Prepare user-level feature data
# Join in the sentiment scores computed above (from sentiment_data)
review_data_with_sentiment <- left_join(review_data, select(sentiment_data, review_id, sentiment_score), by="review_id", relationship = "many-to-one")
user_features <- review_data_with_sentiment %>%
filter(!is.na(user_id) & !is.na(stars.x)) %>%
group_by(user_id) %>%
summarise(
review_count = n(),
avg_stars = mean(stars.x, na.rm = TRUE),
sd_stars = sd(stars.x, na.rm = TRUE),
avg_sentiment = mean(sentiment_score, na.rm = TRUE),
avg_useful = mean(useful, na.rm = TRUE),
avg_funny = mean(funny, na.rm = TRUE),
avg_cool = mean(cool, na.rm = TRUE),
.groups = 'drop' # drop the grouping after summarising
) %>%
filter(review_count > 1)
# Handle possible NA/NaN values (in sd_stars and avg_sentiment)
user_features <- user_features %>%
mutate(
sd_stars = ifelse(is.na(sd_stars) | is.nan(sd_stars), 0, sd_stars),
avg_sentiment = ifelse(is.na(avg_sentiment) | is.nan(avg_sentiment), 0, avg_sentiment)
)
# Select the features used for clustering
features_to_select <- c("avg_stars", "sd_stars", "review_count", "avg_useful")
features_to_select <- c(features_to_select, "avg_sentiment")
# Build a data frame with only the selected features and identify valid rows
features_for_clustering_df <- user_features %>%
select(user_id, all_of(features_to_select)) # Keep user_id for potential later use
# Identify rows valid for clustering (complete and finite cases)
# Check only the numeric columns
numeric_features_df <- features_for_clustering_df %>% select(where(is.numeric))
valid_rows_logical <- complete.cases(numeric_features_df) &
(rowSums(is.finite(as.matrix(numeric_features_df))) == ncol(numeric_features_df))
# Keep only the valid rows of the user features (for attaching cluster labels later)
user_features_valid <- user_features[valid_rows_logical, ]
# Build the numeric matrix used for clustering (valid rows only)
features_matrix_valid <- numeric_features_df[valid_rows_logical, ]
cat("Number of users eligible for clustering after filtering NAs/NaNs:", nrow(user_features_valid), "\n")
## Number of users eligible for clustering after filtering NAs/NaNs: 13626
# Standardize the feature matrix
scaled_features <- scale(features_matrix_valid) # scaled_features is now a matrix
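# The choice of k below is described as plot-based, but no diagnostic appears in
# this script. A minimal sketch for picking k, assuming the elbow (total
# within-cluster sum of squares) heuristic via factoextra:
set.seed(123)
fviz_nbclust(scaled_features, kmeans, method = "wss", k.max = 10)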
optimal_k <- 3 # k = 3 chosen from the diagnostic plot (see the sketch above)
if (nrow(scaled_features) < optimal_k) {
cat("Warning: Number of valid data points (", nrow(scaled_features), ") is less than the chosen k (", optimal_k, "). Reducing k.\n")
optimal_k <- max(1, nrow(scaled_features)) # Adjust k if necessary
}
# Run K-Means clustering
set.seed(123)
kmeans_result <- kmeans(scaled_features, centers = optimal_k, nstart = 25)
# Attach the cluster labels to the valid user-feature data frame; user_features_valid and scaled_features contain the same rows
user_features_valid$cluster <- as.factor(kmeans_result$cluster)
# Summarise the clustering result (using user_features_valid)
cluster_summary <- user_features_valid %>%
group_by(cluster) %>%
# Summarise across the clustering features (review_count is already among them)
summarise(
count = n(),
across(all_of(features_to_select), list(mean = ~mean(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE))),
.groups = 'drop'
)
cat("User Clustering Summary (k=", optimal_k, "):\n")
## User Clustering Summary (k= 3 ):
print.data.frame(cluster_summary)
## cluster count avg_stars_mean avg_stars_sd sd_stars_mean sd_stars_sd
## 1 1 7244 4.409013 0.5245015 0.4680043 0.4523396
## 2 2 744 3.789808 0.4618973 1.0321829 0.3398558
## 3 3 5638 2.916892 0.8073675 1.5703014 0.7966277
## review_count_mean review_count_sd avg_useful_mean avg_useful_sd
## 1 2.886113 1.477478 1.021186 1.254387
## 2 14.655914 10.978501 3.878483 7.664218
## 3 2.879212 1.480207 1.294681 1.603341
## avg_sentiment_mean avg_sentiment_sd
## 1 4.408676 2.245648
## 2 4.843043 2.152674
## 3 2.105555 1.767473
# Visualize the clusters (fviz_cluster projects the data onto the first two principal components)
pca_res <- prcomp(scaled_features, scale. = FALSE) # features are already scaled; kept for inspecting variance explained
plot_cluster <- fviz_cluster(kmeans_result, data = scaled_features,
geom = "point", ellipse.type = "convex",
ggtheme = theme_minimal(),
main = paste("User Clusters (k=", optimal_k, ")"))
print(plot_cluster)
# --- 3. Recommendation System ---
cat("\n--- Analysis 3: Recommendation System (Manual Implementation) ---\n")
##
## --- Analysis 3: Recommendation System (Manual Implementation) ---
# We implement Item-Based Collaborative Filtering (IBCF) and a content-based
# baseline, then combine the two into a simple hybrid model.
# Finally, all three are evaluated on a held-out test set (RMSE, MAE).
# Data preparation and split
set.seed(123) # for reproducibility
rating_data <- review_data %>%
select(user_id, business_id, rating = stars.x) %>%
filter(!is.na(user_id) & !is.na(business_id) & !is.na(rating)) %>%
# Create contiguous numeric IDs for users and items
mutate(user_idx = as.numeric(factor(user_id)),
item_idx = as.numeric(factor(business_id))) %>%
select(user_idx, item_idx, rating, user_id, business_id) # keep the original IDs for debugging
# Map the numeric indices back to the original IDs
user_map <- distinct(rating_data, user_idx, user_id)
item_map <- distinct(rating_data, item_idx, business_id)
# Split the data (80% train, 20% test)
n_ratings <- nrow(rating_data)
train_indices <- sample(1:n_ratings, size = floor(0.8 * n_ratings))
train_data <- rating_data[train_indices, ]
test_data <- rating_data[-train_indices, ]
cat("Data split: Train =", nrow(train_data), "ratings, Test =", nrow(test_data), "ratings\n")
## Data split: Train = 80000 ratings, Test = 20000 ratings
# Keep only test users/items that also appear in the training set
test_data <- test_data %>%
filter(user_idx %in% unique(train_data$user_idx) &
item_idx %in% unique(train_data$item_idx))
cat("Filtered Test set (users/items must be in train):", nrow(test_data), "ratings\n")
## Filtered Test set (users/items must be in train): 8218 ratings
# Build the sparse user-item rating matrix from the training set
n_users <- max(rating_data$user_idx)
n_items <- max(rating_data$item_idx)
train_matrix <- sparseMatrix(
i = train_data$user_idx,
j = train_data$item_idx,
x = train_data$rating,
dims = c(n_users, n_items),
dimnames = list(Users = 1:n_users, Items = 1:n_items)
)
cat("Training matrix created. Dimensions (users, items):", dim(train_matrix), "\n")
## Training matrix created. Dimensions (users, items): 65582 22187
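# How sparse is this matrix? A quick illustrative check of the fraction of
# observed user-item cells:
cat("Matrix density:", Matrix::nnzero(train_matrix) / prod(dim(train_matrix)), "\n")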
# Compute the global and per-user average ratings (from the training set)
global_avg_rating <- mean(train_data$rating)
user_avg_ratings <- train_data %>%
group_by(user_idx) %>%
summarise(user_avg = mean(rating), .groups = 'drop')
cat("Global average rating (train):", global_avg_rating, "\n")
## Global average rating (train): 3.69375
# Item-Based Collaborative Filtering (IBCF)
# Compute item-item cosine similarity
#
# Normalization: cosine similarity is already scale-invariant, so no separate
# normalization step is needed
cat("Calculating Item-Item similarity...\n")
## Calculating Item-Item similarity...
# Use tryCatch to guard against memory or computation failures. Note that
# lsa::cosine() expects a dense matrix and errors on a sparse Matrix, so the
# column-wise cosine similarity is computed with sparse operations instead:
# scale each item column to unit length, then take the cross-product.
item_similarity_matrix <- tryCatch({
# Unit-normalize the item columns; items with no ratings get norm 1 to avoid
# division by zero, leaving their similarities at 0 rather than NaN
col_norms <- sqrt(Matrix::colSums(train_matrix^2))
col_norms[col_norms == 0] <- 1
sim <- Matrix::crossprod(train_matrix %*% Matrix::Diagonal(x = 1 / col_norms))
# Set the diagonal to 0; an item's similarity to itself is not used for prediction
diag(sim) <- 0
sim
}, error = function(e) {
cat("Error calculating item similarity:", e$message, "\n")
cat("Returning an empty similarity matrix.\n")
matrix(0, nrow=ncol(train_matrix), ncol=ncol(train_matrix),
dimnames=list(Items=colnames(train_matrix), Items=colnames(train_matrix)))
})
cat("Item similarity matrix calculated. Dimensions:", dim(item_similarity_matrix), "\n")
## Item similarity matrix calculated. Dimensions: 22187 22187
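# A quick sanity check of the sparse cosine computation against lsa::cosine()
# on a tiny dense example (illustrative only; values chosen arbitrarily):
m <- matrix(c(5, 3, 0,
              4, 0, 1), nrow = 2, byrow = TRUE) # 2 users x 3 items
sm <- Matrix(m, sparse = TRUE)
nrm <- sqrt(Matrix::colSums(sm^2)); nrm[nrm == 0] <- 1
all.equal(as.matrix(Matrix::crossprod(sm %*% Matrix::Diagonal(x = 1 / nrm))),
          lsa::cosine(m), check.attributes = FALSE)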
# IBCF prediction function
predict_ibcf <- function(user_idx, item_idx, train_matrix, item_similarity,
user_avg_ratings, global_avg, k = 25) {
# Items the user has rated in the training set
items_rated_by_user <- which(train_matrix[user_idx, ] > 0)
if (length(items_rated_by_user) == 0) {
# Cold-start user: fall back to the user average, else the global average
user_avg <- user_avg_ratings$user_avg[user_avg_ratings$user_idx == user_idx]
return(ifelse(length(user_avg) > 0, user_avg, global_avg))
}
# Similarities between the target item and the user's rated items
similarities_to_target <- item_similarity[item_idx, items_rated_by_user]
# Keep the Top-K most similar items (with similarity > 0)
valid_indices <- which(similarities_to_target > 1e-6) # guard against floating-point noise and zero similarity
if (length(valid_indices) == 0) {
# No similar rated items: fall back to the user average or the global average
user_avg <- user_avg_ratings$user_avg[user_avg_ratings$user_idx == user_idx]
return(ifelse(length(user_avg) > 0, user_avg, global_avg))
}
# Sort and take the top k
top_k_indices_local <- order(similarities_to_target[valid_indices], decreasing = TRUE)[1:min(k, length(valid_indices))]
top_k_similarities <- similarities_to_target[valid_indices][top_k_indices_local]
top_k_items_global_idx <- items_rated_by_user[valid_indices][top_k_indices_local]
# The user's ratings for these top-k items
ratings_for_top_k <- train_matrix[user_idx, top_k_items_global_idx]
# Weighted-average prediction
numerator <- sum(top_k_similarities * ratings_for_top_k)
denominator <- sum(abs(top_k_similarities)) # absolute values guard against negative similarities
if (denominator == 0) {
# Zero denominator: cannot form a weighted average, fall back as above
user_avg <- user_avg_ratings$user_avg[user_avg_ratings$user_idx == user_idx]
return(ifelse(length(user_avg) > 0, user_avg, global_avg))
} else {
prediction <- numerator / denominator
# Clamp the prediction to the valid range (1-5 stars)
return(max(1, min(5, prediction)))
}
}
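# Illustrative usage: a single IBCF prediction for the first filtered test row
# (a sketch; any row index would do):
predict_ibcf(test_data$user_idx[1], test_data$item_idx[1], train_matrix,
             item_similarity_matrix, user_avg_ratings, global_avg_rating, k = 25)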
# Content-based baseline prediction function
# Prediction = the user's average rating in the training set
predict_content_baseline <- function(user_idx, item_idx, user_avg_ratings, global_avg) {
user_avg <- user_avg_ratings$user_avg[user_avg_ratings$user_idx == user_idx]
# New user: return the global average
return(ifelse(length(user_avg) > 0, user_avg, global_avg))
}
# Hybrid prediction function
predict_hybrid <- function(ibcf_pred, content_pred, alpha = 0.7) {
prediction <- alpha * ibcf_pred + (1 - alpha) * content_pred
# Clamp the prediction to the valid range (1-5 stars)
return(max(1, min(5, prediction)))
}
# Generate predictions on the test set and evaluate
cat("Generating predictions on the test set...\n")
## Generating predictions on the test set...
n_test <- nrow(test_data)
predictions <- data.frame(
actual = test_data$rating,
pred_ibcf = numeric(n_test),
pred_content = numeric(n_test),
pred_hybrid = numeric(n_test)
)
pb <- txtProgressBar(min = 0, max = n_test, style = 3) # progress bar
# Check whether item_similarity_matrix is usable (accept a dense or sparse matrix)
similarity_valid <- exists("item_similarity_matrix") &&
(is.matrix(item_similarity_matrix) || inherits(item_similarity_matrix, "Matrix")) &&
nrow(item_similarity_matrix) == n_items
for (i in 1:n_test) {
user <- test_data$user_idx[i]
item <- test_data$item_idx[i]
# IBCF Prediction
if (similarity_valid) {
predictions$pred_ibcf[i] <- predict_ibcf(user, item, train_matrix, item_similarity_matrix, user_avg_ratings, global_avg_rating, k = 25)
} else {
# If the similarity matrix could not be computed, fall back to the baseline in place of IBCF
user_avg <- user_avg_ratings$user_avg[user_avg_ratings$user_idx == user]
predictions$pred_ibcf[i] <- ifelse(length(user_avg) > 0, user_avg, global_avg_rating)
}
# Content Baseline Prediction
predictions$pred_content[i] <- predict_content_baseline(user, item, user_avg_ratings, global_avg_rating)
# Hybrid Prediction
predictions$pred_hybrid[i] <- predict_hybrid(predictions$pred_ibcf[i], predictions$pred_content[i], alpha = 0.7)
setTxtProgressBar(pb, i)
}
close(pb)
cat("\nPredictions generated.\n")
##
## Predictions generated.
# Compute evaluation metrics (RMSE, MAE)
calculate_metrics <- function(predictions, actual) {
valid_indices <- !is.na(predictions) & !is.na(actual)
if(sum(valid_indices) == 0) {
return(list(RMSE = NA, MAE = NA, N = 0))
}
predictions <- predictions[valid_indices]
actual <- actual[valid_indices]
error <- predictions - actual
rmse <- sqrt(mean(error^2))
mae <- mean(abs(error))
return(list(RMSE = rmse, MAE = mae, N = length(predictions)))
}
metrics_ibcf <- calculate_metrics(predictions$pred_ibcf, predictions$actual)
metrics_content <- calculate_metrics(predictions$pred_content, predictions$actual)
metrics_hybrid <- calculate_metrics(predictions$pred_hybrid, predictions$actual)
cat("\n--- Evaluation Results (on Test Set) ---\n")
##
## --- Evaluation Results (on Test Set) ---
results_table <- data.frame(
Method = c("Item-Based CF (Manual)", "Content Baseline (User Avg)", "Hybrid (Weighted)"),
RMSE = c(metrics_ibcf$RMSE, metrics_content$RMSE, metrics_hybrid$RMSE),
MAE = c(metrics_ibcf$MAE, metrics_content$MAE, metrics_hybrid$MAE),
N_Predictions = c(metrics_ibcf$N, metrics_content$N, metrics_hybrid$N)
)
print(results_table, row.names = FALSE)
##                       Method     RMSE      MAE N_Predictions
##       Item-Based CF (Manual) 1.464053 1.057748          8218
##  Content Baseline (User Avg) 1.464053 1.057748          8218
##            Hybrid (Weighted) 1.464053 1.057748          8218
# In this run every IBCF prediction fell back to the user-average baseline
# (the training set averages only ~1.2 ratings per user, so usable item
# neighbors are rare), which is why all three methods report identical metrics.
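# A minimal sketch for tuning the hybrid weight alpha on the stored test
# predictions (assumption: a simple grid search; a proper tuning run would use
# a separate validation split to avoid leaking test information):
alpha_grid <- seq(0, 1, by = 0.1)
alpha_rmse <- sapply(alpha_grid, function(a) {
  blended <- pmin(5, pmax(1, a * predictions$pred_ibcf + (1 - a) * predictions$pred_content))
  sqrt(mean((blended - predictions$actual)^2, na.rm = TRUE))
})
data.frame(alpha = alpha_grid, RMSE = alpha_rmse)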
# --- 4. Spatial Analysis ---
cat("\n--- Analysis 4: Spatial Analysis ---\n")
##
## --- Analysis 4: Spatial Analysis ---
# Hypothesis: business counts and average ratings differ significantly between
# developed and less-developed regions.
# Prepare business-level spatial data
business_data <- review_data %>%
select(business_id, name, address, city, state, latitude, longitude, stars_business = stars.y, review_count, categories) %>%
filter(!is.na(latitude) & !is.na(longitude) & !is.na(state) & !is.na(stars_business)) %>%
# Keep only coordinates within valid ranges
filter(latitude >= -90 & latitude <= 90 & longitude >= -180 & longitude <= 180) %>%
distinct(business_id, .keep_all = TRUE)
# Convert to an sf spatial object
business_sf <- st_as_sf(business_data, coords = c("longitude", "latitude"), crs = 4326, remove = FALSE)
cat("Number of unique businesses with valid spatial data:", nrow(business_sf), "\n")
## Number of unique businesses with valid spatial data: 22187
# Regional differences in business counts
state_counts <- business_sf %>%
st_drop_geometry() %>%
group_by(state) %>%
summarise(business_count = n()) %>%
arrange(desc(business_count))
cat("Business counts by state (Top 10):\n")
## Business counts by state (Top 10):
print(head(state_counts, 10))
## # A tibble: 1 × 2
## state business_count
## <chr> <int>
## 1 PA 22187
# Visualize the business counts (this sample covers a single state, PA, so the chart shows one bar)
if(nrow(state_counts) > 0) {
plot_state_counts <- ggplot(head(state_counts, 15), aes(x = reorder(state, business_count), y = business_count)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(title = "Number of Businesses per State (Top 15)", x = "State", y = "Number of Businesses") +
theme_minimal()
print(plot_state_counts)
}
# Regional differences in average business rating
state_avg_rating <- business_sf %>%
st_drop_geometry() %>%
group_by(state) %>%
summarise(
avg_rating = mean(stars_business, na.rm = TRUE),
business_count = n()
) %>%
filter(business_count >= 10) %>% # compare only states with a minimum number of businesses
arrange(desc(avg_rating))
cat("Average business rating by state (Min. 10 businesses):\n")
## Average business rating by state (Min. 10 businesses):
print(state_avg_rating)
## # A tibble: 1 × 3
## state avg_rating business_count
## <chr> <dbl> <int>
## 1 PA 3.58 22187
# Visualize the rating differences
top_states_for_plot <- state_avg_rating %>% arrange(desc(business_count)) %>% head(5) %>% pull(state) # the 5 states with the most businesses (only PA in this sample)
if (length(top_states_for_plot) > 0) {
plot_state_ratings <- ggplot(business_sf %>% filter(state %in% top_states_for_plot),
aes(x = reorder(state, stars_business, FUN = median), y = stars_business)) +
geom_boxplot(na.rm = TRUE) +
labs(title = paste("Business Rating Distribution in Top", length(top_states_for_plot), "States by Count"),
x = "State",
y = "Business Average Rating (stars.y)") +
theme_minimal()
print(plot_state_ratings)
}
# Spatial distribution map (a density-map sketch follows after the plot)
plot_map <- ggplot(business_sf) +
geom_sf(aes(color = stars_business), size = 0.5, alpha = 0.6, na.rm = TRUE) +
scale_color_viridis(option = "viridis", name = "Avg Rating") +
labs(title = "Spatial Distribution of Businesses by Average Rating") +
theme_minimal() +
theme(axis.text = element_blank(), axis.ticks = element_blank())
print(plot_map)
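# The comment above also mentions a density map; a minimal sketch, assuming a
# 2D kernel density estimate on the raw coordinates (acceptable over a small
# area such as a single metro region):
plot_density <- ggplot(business_data, aes(x = longitude, y = latitude)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon", alpha = 0.4) +
  scale_fill_viridis(option = "viridis", name = "Density") +
  labs(title = "2D Kernel Density of Business Locations") +
  theme_minimal()
print(plot_density)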