1.Data Preprocessing
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: US/Pacific
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.37 R6_2.6.1 fastmap_1.2.0 xfun_0.53
## [5] cachem_1.1.0 knitr_1.50 htmltools_0.5.8.1 rmarkdown_2.29
## [9] lifecycle_1.0.4 cli_3.6.5 sass_0.4.10 jquerylib_0.1.4
## [13] compiler_4.5.1 rstudioapi_0.17.1 tools_4.5.1 evaluate_1.0.4
## [17] bslib_0.9.0 yaml_2.3.10 rlang_1.1.6 jsonlite_2.0.0
2.EDA
library(skimr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Color palette used later
sentiment_colors <- c("0" = "#B00000", "1" = "#2ECC71")
# --- USER LOYALTY DATA (for EDA Plot 2) ---
# Positivity rate per user vs how many reviews they wrote
user_loyalty <- df %>%
group_by(user_id) %>%
summarise(
review_count = n(),
positivity_rate = mean(target == 1)
) %>%
ungroup()
# --- TF-IDF DATA (for EDA Plot 5) ---
tfidf_words <- df %>%
unnest_tokens(word, text_clean) %>%
anti_join(stop_words, by = "word") %>%
count(target, word, sort = TRUE) %>%
bind_tf_idf(word, target, n)
top_tfidf <- tfidf_words %>%
filter(nchar(word) > 3) %>%
group_by(target) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
mutate(word = reorder_within(word, tf_idf, target))
# ---------- EDA Plot 1: Business Variability by Sentiment ----------
# (keep your existing code from here)
# Create business-level variability feature
business_variability <- df %>%
group_by(business_id) %>%
summarise(business_review_variability = sd(target, na.rm = TRUE))
# Join back to main dataset
df <- df %>%
left_join(business_variability, by = "business_id")
ggplot(df, aes(x = factor(target),
y = business_review_variability,
fill = factor(target))) +
geom_boxplot(alpha = 0.7, color = "black") +
scale_fill_manual(values = c("0" = "#B00000", # Negative red
"1" = "#2ECC71")) + # Positive green
scale_x_discrete(labels = c("0" = "Negative", "1" = "Positive")) +
labs(
title = "Business Rating Variability by Sentiment Class",
subtitle = "Comparing variability of review sentiment within businesses",
x = "Sentiment (1 = Positive, 0 = Negative)",
y = "Standard Deviation of Review Sentiment per Business"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18),
legend.position = "none"
)
## Warning: Removed 11804 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# EDA Plot 2: User Loyalty & Sentiment Behavior
ggplot(user_loyalty, aes(x = review_count, y = positivity_rate)) +
geom_point(aes(color = positivity_rate), alpha = 0.6, size = 2) +
scale_x_log10() +
scale_color_gradient(low = "#CC0000", high = "#2ECC71") + # red → green
labs(
title = "User Loyalty & Sentiment Behavior",
subtitle = "More experienced reviewers leave a more balanced mix of positive and negative sentiment",
x = "Total Reviews Written (log scale)",
y = "Positivity Rate (0 = all negative, 1 = all positive)",
color = "Positivity Rate"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12, color = "gray30"),
axis.title = element_text(face = "bold")
)
# EDA Plot 3: Review Length By Star Rating
df %>%
mutate(review_length = nchar(text_clean)) %>%
group_by(stars) %>%
summarise(mean_length = mean(review_length)) %>%
ggplot(aes(x = stars, y = mean_length, color = stars)) +
geom_line(size = 1.6) +
geom_point(size = 4, color = "black") + # black points
scale_color_gradientn(
colors = c("#CC0000", "#FFCC00", "#2ECC71") # red -> yellow -> green
) +
labs(
title = "Average Review Length by Star Rating",
subtitle = "Strong emotional responses (negative) produce longer reviews",
x = "Star Rating",
y = "Average Review Length (characters)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(size = 12, color = "gray30"),
axis.title = element_text(face = "bold")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# EDA Plot 4: Review Engagement (Reactions) by Sentiment Class
data %>%
pivot_longer(cols = c(useful, funny, cool),
names_to = "reaction_type",
values_to = "value") %>%
ggplot(aes(x = factor(target), y = value, fill = factor(target))) +
geom_boxplot(alpha = 0.75, outlier.alpha = 0.25, color = "gray30") +
stat_summary(fun = mean, geom = "point",
shape = 21, size = 3.5, fill = "black", color = "white") +
scale_y_log10() +
facet_wrap(~ reaction_type, scales = "free_y") +
scale_fill_manual(values = sentiment_colors) +
labs(
title = "Review Engagement by Sentiment Class",
subtitle = "Negative reviews receive higher engagement across useful (log scale)",
x = "Sentiment (1 = Positive, 0 = Negative)",
y = "Reaction Count (log scale)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12, color = "gray30"),
axis.title = element_text(face = "bold"),
strip.text = element_text(face = "bold"),
legend.position = "none"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 209367 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 209367 rows containing non-finite outside the scale range
## (`stat_summary()`).
# EDA Plot 5: Most Distinguishing Words in Positive vs Negative Reviews (TF-IDF)
ggplot(top_tfidf, aes(tf_idf, word, fill = factor(target))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ target, scales = "free_y",
labeller = as_labeller(c(`0` = "Negative Reviews", `1` = "Positive Reviews"))) +
scale_y_reordered() +
scale_fill_manual(values = sentiment_colors) +
labs(
title = "Most Distinguishing Words in Positive vs. Negative Reviews (TF-IDF)",
subtitle = "Negative sentiment displays highly differentiating problem language; positive sentiment uses more generic descriptors",
x = "TF-IDF Score",
y = ""
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12, color = "gray30"),
axis.title = element_text(face = "bold")
)
# Summary Statistics Table
library(skimr)
library(knitr)
library(kableExtra)
summary_vars <- df %>%
select(
stars, target,
useful, funny, cool,
user_review_count, user_average_stars, user_fans,
business_stars, business_review_count
)
skim_table <- skim(summary_vars) %>%
select(-n_missing, -complete_rate) # drop unwanted columns
skim_table %>%
kable(format = "html", caption = "Summary Statistics for Key Numeric Variables") %>%
kable_styling(
bootstrap_options = c("hover", "striped"),
full_width = FALSE,
font_size = 13
) %>%
row_spec(0, bold = TRUE, color = "white", background = "#2ECC71")
| skim_type | skim_variable | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|
| numeric | stars | 3.7851474 | 1.3925425 | 1 | 3.00 | 4.00 | 5.00 | 5 | ▂▂▂▅▇ |
| numeric | target | 0.6755368 | 0.4681764 | 0 | 0.00 | 1.00 | 1.00 | 1 | ▃▁▁▁▇ |
| numeric | useful | 0.9831263 | 2.5022857 | 0 | 0.00 | 0.00 | 1.00 | 152 | ▇▁▁▁▁ |
| numeric | funny | 0.2985158 | 1.7885266 | 0 | 0.00 | 0.00 | 0.00 | 361 | ▇▁▁▁▁ |
| numeric | cool | 0.4773158 | 1.9531269 | 0 | 0.00 | 0.00 | 0.00 | 141 | ▇▁▁▁▁ |
| numeric | user_review_count | 121.6553474 | 331.9615460 | 0 | 8.00 | 27.00 | 103.00 | 17473 | ▇▁▁▁▁ |
| numeric | user_average_stars | 3.7792680 | 0.7996369 | 1 | 3.44 | 3.89 | 4.28 | 5 | ▁▁▃▇▅ |
| numeric | user_fans | 12.4285158 | 83.3267267 | 0 | 0.00 | 0.00 | 4.00 | 12497 | ▇▁▁▁▁ |
| numeric | business_stars | 3.7891474 | 0.6271263 | 1 | 3.50 | 4.00 | 4.00 | 5 | ▁▁▁▇▃ |
| numeric | business_review_count | 489.0101579 | 843.0648177 | 5 | 90.00 | 220.00 | 506.00 | 7568 | ▇▁▁▁▁ |
# Correlation heatmap
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
numeric_df <- df %>%
mutate(review_length = nchar(text_clean)) %>% # correct column name
select(
stars, target,
useful, funny, cool,
user_review_count, user_average_stars, user_fans,
business_stars, business_review_count,
review_length
)
corr_matrix <- round(cor(numeric_df, use = "complete.obs"), 2)
corr_melt <- melt(corr_matrix)
ggplot(corr_melt, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = value), color = "black", size = 3) +
scale_fill_gradient2(
low = "#CC0000", mid = "white", high = "#2ECC71", midpoint = 0,
limits = c(-1, 1)
) +
labs(
title = "Correlation Heatmap of Numeric Features",
subtitle = "Highlights relationships across review, user, and business metadata",
x = "",
y = "",
fill = "Correlation"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.x = element_text(angle = 45, hjust = 1)
)
3.Feature Engineering
# Text Features
data <- data %>%
mutate(
# 1) Review Length
review_length = nchar(text_clean),
# 2) Negative Keyword Count
neg_word_count = str_count(text_clean, "\\b(bad|worst|awful|terrible|rude|disgusting|dirty|slow|unhelpful)\\b"),
# 3) Positive Keyword Count
pos_word_count = str_count(text_clean, "\\b(good|great|amazing|excellent|friendly|love|perfect|wonderful|happy)\\b"),
# 4) Negation Count
negation_count = str_count(text_clean, "\\b(not|no|never|nothing|none|n't)\\b"),
# 5) Emphasis Score
emphasis_score = str_count(text_clean, "[!?]{2,}") +
str_count(text_clean, "([A-Za-z])\\1{2,}"),
# Metadata Features
# 6) Total Reactions
total_reactions = useful + funny + cool,
# 7) Useful Ratio
useful_ratio = ifelse(total_reactions == 0, 0, useful / total_reactions),
# 8) Log User Review Count
log_user_reviews = log1p(user_review_count),
# 9) Log Business Reviews
log_business_reviews = log1p(business_review_count)
)
#10 ) Business Varaibility
business_variability <- data %>%
group_by(business_id) %>%
summarise(
business_review_variability = sd(target, na.rm = TRUE)
)
data <- data %>%
left_join(business_variability, by = "business_id")
colnames(data)
## [1] "review_id" "user_id"
## [3] "business_id" "stars"
## [5] "date" "text"
## [7] "useful" "funny"
## [9] "cool" "user_review_count"
## [11] "user_average_stars" "user_fans"
## [13] "business_name" "business_city"
## [15] "business_state" "business_stars"
## [17] "business_review_count" "target"
## [19] "text_clean" "review_length"
## [21] "neg_word_count" "pos_word_count"
## [23] "negation_count" "emphasis_score"
## [25] "total_reactions" "useful_ratio"
## [27] "log_user_reviews" "log_business_reviews"
## [29] "business_review_variability"
# Scale features appropriately
features_to_scale <- c(
"review_length",
"neg_word_count",
"pos_word_count",
"negation_count",
"emphasis_score",
"total_reactions",
"useful_ratio",
"log_user_reviews",
"log_business_reviews",
"business_review_variability"
)
data[features_to_scale] <- scale(data[features_to_scale])
summary(data[features_to_scale])
## review_length neg_word_count pos_word_count negation_count
## Min. :-1.0837 Min. :-0.3596 Min. :-1.0588 Min. :-0.64925
## 1st Qu.:-0.6488 1st Qu.:-0.3596 1st Qu.:-0.4523 1st Qu.:-0.64925
## Median :-0.3108 Median :-0.3596 Median :-0.4523 Median :-0.64925
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.2992 3rd Qu.:-0.3596 3rd Qu.: 0.7607 3rd Qu.:-0.00614
## Max. : 8.9246 Max. :15.2595 Max. :12.2845 Max. :57.23103
##
## emphasis_score total_reactions useful_ratio log_user_reviews
## Min. :-0.1648 Min. :-0.31949 Min. :-0.7682 Min. :-2.1539
## 1st Qu.:-0.1648 1st Qu.:-0.31949 1st Qu.:-0.7682 1st Qu.:-0.7918
## Median :-0.1648 Median :-0.31949 Median :-0.7682 Median :-0.0882
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1648 3rd Qu.: 0.04378 3rd Qu.: 0.7229 3rd Qu.: 0.7252
## Max. :32.7685 Max. :75.42220 Max. : 1.7169 Max. : 3.9017
##
## log_business_reviews business_review_variability
## Min. :-2.72771 Min. :-1.7015
## 1st Qu.:-0.65157 1st Qu.:-0.3903
## Median : 0.02592 Median : 0.3527
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.65992 3rd Qu.: 0.6440
## Max. : 2.72400 Max. : 1.5102
## NA's :11804
feature_cols <- c(
"review_length",
"neg_word_count",
"pos_word_count",
"negation_count",
"emphasis_score",
"total_reactions",
"useful_ratio",
"log_user_reviews",
"log_business_reviews",
"business_review_variability"
)
4.Train/Test
set.seed(123) # ensures reproducible split
# 80/20 split index
train_index <- sample(seq_len(nrow(data)), size = 0.8 * nrow(data))
# Create train/test datasets
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
dim(train_data)
## [1] 76000 29
dim(test_data)
## [1] 19000 29
# Rebuild model datasets with only selected features + target
train_model <- train_data[, c(feature_cols, "target")]
test_model <- test_data[, c(feature_cols, "target")]
# Drop rows with missing values BEFORE training/testing
train_model <- drop_na(train_model)
test_model <- drop_na(test_model)
# Verify sizes
nrow(train_model)
## [1] 66584
nrow(test_model)
## [1] 16612
table(test_model$target)
##
## 0 1
## 5279 11333
#Why do we do this? : An 80/20 split ensures that the model is trained on the majority of the data while reserving a meaningful portion for unbiased evaluation. The training set allows the model to learn patterns in the data, while the test set simulates real-world "unseen"/"new" reviews to measure how well the model generalizes. Without a proper split, performance estimates would be overly optimistic because the model would be evaluated on data it has already seen.
#We set a seed to ensure reproducibility. Random operations such as train/test splitting can produce different results each time they run. Setting a seed locks in the randomization process so that the same rows are selected for training and testing every time the code is executed, allowing results to be consistently replicated and compared.
# ===========================
# MODEL 1: LOGISTIC REGRESSION
# ===========================
# Convert target to factor with explicit labels
train_data <- train_data %>%
mutate(target = ifelse(target == 1, "pos", "neg"),
target = factor(target, levels = c("neg", "pos")))
test_data <- test_data %>%
mutate(target = ifelse(target == 1, "pos", "neg"),
target = factor(target, levels = c("neg", "pos")))
# keep only features + target for modeling
train_model <- train_data[, c(feature_cols, "target")]
test_model <- test_data[, c(feature_cols, "target")]
train_model <- train_model %>% drop_na()
test_model <- test_model %>% drop_na()
library(caret)
library(pROC)
# Cross-validation setup for ROC
ctrl_roc <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
set.seed(123)
log_model <- train(
target ~ .,
data = train_model,
method = "glm",
family = "binomial",
trControl = ctrl_roc,
metric = "ROC"
)
# Weighted Logistic Regression
set.seed(123)
log_model_weighted <- train(
target ~ .,
data = train_model,
method = "glm",
family = "binomial",
trControl = ctrl_roc,
metric = "ROC", # still optimize by ROC
weights = ifelse(train_model$target == "neg", 3, 1)
)
# AUC on test set
log_pred_prob <- predict(log_model_weighted, newdata = test_model, type = "prob")[, 2]
log_roc <- roc(test_model$target, log_pred_prob)
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
log_auc <- as.numeric(auc(log_roc))
log_auc # ~0.84
## [1] 0.8421641
# ===========================
# MODEL 2: RANDOM FOREST (WEIGHTED & TUNED)
# ===========================
set.seed(123)
rf_grid_small <- expand.grid(
mtry = c(4, 6),
splitrule = "gini",
min.node.size = c(5, 10)
)
rf_model_weighted <- train(
target ~ .,
data = train_model,
method = "ranger",
metric = "ROC",
trControl = ctrl_roc,
tuneGrid = rf_grid_small,
class.weights = c(neg = 3, pos = 1),
num.trees = 400,
importance = "impurity"
)
rf_model_weighted
## Random Forest
##
## 66584 samples
## 10 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 53267, 53268, 53267, 53268, 53266
## Resampling results across tuning parameters:
##
## mtry min.node.size ROC Sens Spec
## 4 5 0.8517876 0.5936854 0.9028144
## 4 10 0.8536345 0.5945061 0.9033376
## 6 5 0.8477777 0.5964376 0.8971245
## 6 10 0.8502589 0.5947475 0.8999803
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
## and min.node.size = 10.
rfw_pred_prob <- predict(rf_model_weighted, newdata = test_model, type = "prob")[, 2]
rfw_roc <- roc(test_model$target, rfw_pred_prob)
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
rfw_auc <- as.numeric(auc(rfw_roc))
rfw_auc # ~0.85
## [1] 0.8536938
# ===========================
# MODEL 3: XGBOOST
# ===========================
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Use the same train_model / test_model (no redefinition)
xgb_train <- xgb.DMatrix(
data = as.matrix(train_model[, feature_cols]),
label = ifelse(train_model$target == "pos", 1, 0)
)
xgb_test <- xgb.DMatrix(
data = as.matrix(test_model[, feature_cols]),
label = ifelse(test_model$target == "pos", 1, 0)
)
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
set.seed(123)
xgb_model <- xgb.train(
params = params,
data = xgb_train,
nrounds = 150,
watchlist = list(val = xgb_test),
verbose = 1
)
## [1] val-auc:0.810508
## [2] val-auc:0.830280
## [3] val-auc:0.835386
## [4] val-auc:0.836768
## [5] val-auc:0.836858
## [6] val-auc:0.841557
## [7] val-auc:0.842659
## [8] val-auc:0.844684
## [9] val-auc:0.847355
## [10] val-auc:0.846860
## [11] val-auc:0.847291
## [12] val-auc:0.847199
## [13] val-auc:0.848977
## [14] val-auc:0.849693
## [15] val-auc:0.851784
## [16] val-auc:0.853015
## [17] val-auc:0.853595
## [18] val-auc:0.854361
## [19] val-auc:0.854587
## [20] val-auc:0.855387
## [21] val-auc:0.856106
## [22] val-auc:0.856589
## [23] val-auc:0.857023
## [24] val-auc:0.857156
## [25] val-auc:0.858688
## [26] val-auc:0.858979
## [27] val-auc:0.859381
## [28] val-auc:0.859599
## [29] val-auc:0.859856
## [30] val-auc:0.860019
## [31] val-auc:0.860212
## [32] val-auc:0.860578
## [33] val-auc:0.860649
## [34] val-auc:0.861149
## [35] val-auc:0.861122
## [36] val-auc:0.861201
## [37] val-auc:0.861913
## [38] val-auc:0.862160
## [39] val-auc:0.862414
## [40] val-auc:0.862396
## [41] val-auc:0.862790
## [42] val-auc:0.862910
## [43] val-auc:0.862953
## [44] val-auc:0.863293
## [45] val-auc:0.863426
## [46] val-auc:0.863483
## [47] val-auc:0.863763
## [48] val-auc:0.863915
## [49] val-auc:0.864146
## [50] val-auc:0.864245
## [51] val-auc:0.864460
## [52] val-auc:0.864551
## [53] val-auc:0.864562
## [54] val-auc:0.864737
## [55] val-auc:0.864734
## [56] val-auc:0.864967
## [57] val-auc:0.865003
## [58] val-auc:0.865083
## [59] val-auc:0.865258
## [60] val-auc:0.865450
## [61] val-auc:0.865422
## [62] val-auc:0.865421
## [63] val-auc:0.865523
## [64] val-auc:0.865686
## [65] val-auc:0.865722
## [66] val-auc:0.865780
## [67] val-auc:0.865806
## [68] val-auc:0.865936
## [69] val-auc:0.865930
## [70] val-auc:0.866047
## [71] val-auc:0.866170
## [72] val-auc:0.866385
## [73] val-auc:0.866403
## [74] val-auc:0.866508
## [75] val-auc:0.866546
## [76] val-auc:0.866571
## [77] val-auc:0.866594
## [78] val-auc:0.866627
## [79] val-auc:0.866659
## [80] val-auc:0.866639
## [81] val-auc:0.866634
## [82] val-auc:0.866776
## [83] val-auc:0.866877
## [84] val-auc:0.866905
## [85] val-auc:0.866940
## [86] val-auc:0.866931
## [87] val-auc:0.866897
## [88] val-auc:0.866932
## [89] val-auc:0.866933
## [90] val-auc:0.866950
## [91] val-auc:0.866944
## [92] val-auc:0.866973
## [93] val-auc:0.866998
## [94] val-auc:0.866974
## [95] val-auc:0.866982
## [96] val-auc:0.866971
## [97] val-auc:0.867051
## [98] val-auc:0.867102
## [99] val-auc:0.867127
## [100] val-auc:0.867136
## [101] val-auc:0.867135
## [102] val-auc:0.867106
## [103] val-auc:0.867145
## [104] val-auc:0.867168
## [105] val-auc:0.867177
## [106] val-auc:0.867170
## [107] val-auc:0.867176
## [108] val-auc:0.867169
## [109] val-auc:0.867179
## [110] val-auc:0.867208
## [111] val-auc:0.867245
## [112] val-auc:0.867290
## [113] val-auc:0.867312
## [114] val-auc:0.867371
## [115] val-auc:0.867377
## [116] val-auc:0.867478
## [117] val-auc:0.867510
## [118] val-auc:0.867566
## [119] val-auc:0.867526
## [120] val-auc:0.867501
## [121] val-auc:0.867503
## [122] val-auc:0.867520
## [123] val-auc:0.867531
## [124] val-auc:0.867522
## [125] val-auc:0.867518
## [126] val-auc:0.867570
## [127] val-auc:0.867597
## [128] val-auc:0.867589
## [129] val-auc:0.867629
## [130] val-auc:0.867624
## [131] val-auc:0.867630
## [132] val-auc:0.867665
## [133] val-auc:0.867639
## [134] val-auc:0.867651
## [135] val-auc:0.867638
## [136] val-auc:0.867680
## [137] val-auc:0.867689
## [138] val-auc:0.867679
## [139] val-auc:0.867758
## [140] val-auc:0.867791
## [141] val-auc:0.867791
## [142] val-auc:0.867842
## [143] val-auc:0.867834
## [144] val-auc:0.867829
## [145] val-auc:0.867839
## [146] val-auc:0.867852
## [147] val-auc:0.867824
## [148] val-auc:0.867829
## [149] val-auc:0.867824
## [150] val-auc:0.867782
xgb_pred <- predict(xgb_model, xgb_test)
xgb_auc <- roc(test_model$target, xgb_pred)$auc
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
xgb_auc # ~0.8656
## Area under the curve: 0.8678
# ===========================
# MODEL TRAINING SUMMARY TABLE
# ===========================
library(knitr)
library(kableExtra)
model_summary <- data.frame(
Model = c("Logistic Regression (Weighted)",
"Random Forest (Weighted, Tuned)",
"XGBoost (Tuned)"),
Tuning = c("class weights only (3:1 neg:pos)",
"mtry & min.node.size grid (5-fold CV) + class weights",
"depth, learning rate, nrounds, subsampling"),
Key_Parameters = c(
"glm (binomial link), weights(neg=3,pos=1)",
"ranger, mtry ∈ {4,6}, min.node.size ∈ {5,10}, class.weights(neg=3,pos=1), num.trees=400",
"nrounds=150, max_depth=4, eta=0.1, subsample=0.8, colsample_bytree=0.8"
),
AUC = c(round(log_auc, 4), round(rfw_auc, 4), round(as.numeric(xgb_auc), 4))
)
kable(model_summary, format = "html", caption = "Model Training Summary") %>%
kable_classic(full_width = FALSE, html_font = "Calibri") %>%
column_spec(1, bold = TRUE) %>%
column_spec(4, bold = TRUE, color = c("black", "black", "darkgreen"))
| Model | Tuning | Key_Parameters | AUC |
|---|---|---|---|
| Logistic Regression (Weighted) | class weights only (3:1 neg:pos) | glm (binomial link), weights(neg=3,pos=1) | 0.8422 |
| Random Forest (Weighted, Tuned) | mtry & min.node.size grid (5-fold CV) + class weights | ranger, mtry ∈ {4,6}, min.node.size ∈ {5,10}, class.weights(neg=3,pos=1), num.trees=400 | 0.8537 |
| XGBoost (Tuned) | depth, learning rate, nrounds, subsampling | nrounds=150, max_depth=4, eta=0.1, subsample=0.8, colsample_bytree=0.8 | 0.8678 |
library(caret)
library(pROC)
# 1) WEIGHTED LOGISTIC PREDICTIONS @ 0.5
log_pred_prob <- predict(log_model_weighted, newdata = test_model, type = "prob")[, 2]
log_pred_class <- ifelse(log_pred_prob > 0.5, "pos", "neg")
log_pred_class <- factor(log_pred_class, levels = c("neg", "pos"))
cm_log <- confusionMatrix(log_pred_class, test_model$target, positive = "pos")
log_roc <- roc(test_model$target, log_pred_prob)
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
log_auc <- as.numeric(auc(log_roc))
# 2) RANDOM FOREST (WEIGHTED) @ 0.5
rf_pred_prob <- predict(rf_model_weighted, newdata = test_model, type = "prob")[, 2]
rf_pred_class <- ifelse(rf_pred_prob > 0.5, "pos", "neg")
rf_pred_class <- factor(rf_pred_class, levels = c("neg", "pos"))
cm_rf <- confusionMatrix(rf_pred_class, test_model$target, positive = "pos")
rf_roc <- roc(test_model$target, rf_pred_prob)
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
rf_auc <- as.numeric(auc(rf_roc))
# 3) XGBOOST @ 0.5
test_matrix <- model.matrix(target ~ . - 1, data = test_model)
xgb_pred_prob <- predict(xgb_model, test_matrix)
xgb_pred_class <- ifelse(xgb_pred_prob > 0.5, "pos", "neg")
xgb_pred_class <- factor(xgb_pred_class, levels = c("neg", "pos"))
cm_xgb <- confusionMatrix(xgb_pred_class, test_model$target, positive = "pos")
xgb_roc <- roc(test_model$target, xgb_pred_prob)
## Setting levels: control = neg, case = pos
## Setting direction: controls < cases
xgb_auc <- as.numeric(auc(xgb_roc))
# 4) COLLECT METRICS INTO FINAL TABLE
results_test <- data.frame(
Model = c("Logistic Regression (0.5)",
"Random Forest (Tuned, 0.5)",
"XGBoost (0.5)"),
Accuracy = c(cm_log$overall["Accuracy"],
cm_rf$overall["Accuracy"],
cm_xgb$overall["Accuracy"]),
Precision = c(cm_log$byClass["Precision"],
cm_rf$byClass["Precision"],
cm_xgb$byClass["Precision"]),
Recall = c(cm_log$byClass["Recall"],
cm_rf$byClass["Recall"],
cm_xgb$byClass["Recall"]),
F1 = c(cm_log$byClass["F1"],
cm_rf$byClass["F1"],
cm_xgb$byClass["F1"]),
AUC = c(log_auc, rf_auc, xgb_auc)
)
results_test
## Model Accuracy Precision Recall F1 AUC
## 1 Logistic Regression (0.5) 0.7456056 0.8830441 0.7228448 0.7949539 0.8421641
## 2 Random Forest (Tuned, 0.5) 0.8022514 0.8228498 0.9049678 0.8619574 0.8536938
## 3 XGBoost (0.5) 0.8104984 0.8253956 0.9159975 0.8683396 0.8677819
library(kableExtra)
kable(results_test, format = "html", caption = "Model Evaluation on Test Set") %>%
kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover")) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:6, width = "8em")
| Model | Accuracy | Precision | Recall | F1 | AUC |
|---|---|---|---|---|---|
| Logistic Regression (0.5) | 0.7456056 | 0.8830441 | 0.7228448 | 0.7949539 | 0.8421641 |
| Random Forest (Tuned, 0.5) | 0.8022514 | 0.8228498 | 0.9049678 | 0.8619574 | 0.8536938 |
| XGBoost (0.5) | 0.8104984 | 0.8253956 | 0.9159975 | 0.8683396 | 0.8677819 |
# Build tidy ROC data for each model
log_df <- data.frame(
fpr = 1 - log_roc$specificities,
tpr = log_roc$sensitivities,
model = "Logistic"
)
rf_df <- data.frame(
fpr = 1 - rf_roc$specificities,
tpr = rf_roc$sensitivities,
model = "Random Forest"
)
xgb_df <- data.frame(
fpr = 1 - xgb_roc$specificities,
tpr = xgb_roc$sensitivities,
model = "XGBoost"
)
roc_df <- bind_rows(log_df, rf_df, xgb_df)
# Add AUC to legend labels
auc_vals <- c(
Logistic = log_auc,
`Random Forest` = rf_auc,
XGBoost = xgb_auc
)
roc_df$model <- factor(
roc_df$model,
levels = c("Logistic", "Random Forest", "XGBoost"),
labels = paste0(
c("Logistic", "Random Forest", "XGBoost"),
" (AUC = ",
round(auc_vals[c("Logistic","Random Forest","XGBoost")], 3),
")"
)
)
ggplot(roc_df, aes(x = fpr, y = tpr, color = model)) +
geom_line(size = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha = 0.5) +
labs(
title = "ROC Curves on Test Set",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model"
) +
coord_equal() +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "bottom"
)
#Find Feature Importance
# Extract coefficients from weighted logistic model
log_coef <- summary(log_model_weighted)$coefficients
# Convert to data frame
log_imp <- data.frame(
Feature = rownames(log_coef),
Coefficient = log_coef[, "Estimate"]
)
# Remove intercept
log_imp <- log_imp[log_imp$Feature != "(Intercept)", ]
# Add absolute magnitude for ranking
log_imp$Abs_Coefficient <- abs(log_imp$Coefficient)
# Sort by importance
log_imp <- log_imp[order(-log_imp$Abs_Coefficient), ]
# Plot
library(ggplot2)
ggplot(log_imp, aes(x = reorder(Feature, Abs_Coefficient), y = Abs_Coefficient, fill = Coefficient > 0)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = c("#B00000", "#2ECC71"), labels = c("Negative Effect", "Positive Effect")) +
labs(
title = "Logistic Regression Feature Importance",
subtitle = "Importance measured by absolute coefficient magnitude",
x = "Feature",
y = "Coefficient Magnitude",
fill = "Effect Direction"
) +
theme_minimal(base_size = 14)
rf_imp <- varImp(rf_model_weighted, scale = FALSE)
rf_imp
## ranger variable importance
##
## Overall
## negation_count 6458.5
## business_review_variability 6348.2
## review_length 6284.4
## log_business_reviews 6023.2
## neg_word_count 6001.4
## log_user_reviews 5609.0
## pos_word_count 5312.4
## total_reactions 1524.9
## useful_ratio 1458.6
## emphasis_score 260.5
plot(rf_imp, top = 10, main = "Random Forest Feature Importance")
library(xgboost)
library(ggplot2)
library(dplyr)
library(forcats)
# Extract importance
xgb_imp <- xgb.importance(model = xgb_model)
# Plot with new colors
ggplot(xgb_imp, aes(x = fct_rev(Feature), y = Gain)) +
geom_col(fill = "#3b82f6") + # blue
coord_flip() +
labs(title = "XGBoost Feature Importance",
x = "Feature",
y = "Gain (Contribution to Model Performance)") +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid.minor = element_blank()
)