HW4_Anna_Keough

1.Data Preprocessing

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: US/Pacific
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     xfun_0.53        
##  [5] cachem_1.1.0      knitr_1.50        htmltools_0.5.8.1 rmarkdown_2.29   
##  [9] lifecycle_1.0.4   cli_3.6.5         sass_0.4.10       jquerylib_0.1.4  
## [13] compiler_4.5.1    rstudioapi_0.17.1 tools_4.5.1       evaluate_1.0.4   
## [17] bslib_0.9.0       yaml_2.3.10       rlang_1.1.6       jsonlite_2.0.0

2.EDA

library(skimr)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Color palette used later
sentiment_colors <- c("0" = "#B00000", "1" = "#2ECC71")

# --- USER LOYALTY DATA (for EDA Plot 2) ---
# Positivity rate per user vs how many reviews they wrote
user_loyalty <- df %>%
  group_by(user_id) %>%
  summarise(
    review_count   = n(),
    positivity_rate = mean(target == 1)
  ) %>%
  ungroup()

# --- TF-IDF DATA (for EDA Plot 5) ---
tfidf_words <- df %>%
  unnest_tokens(word, text_clean) %>%
  anti_join(stop_words, by = "word") %>%
  count(target, word, sort = TRUE) %>%
  bind_tf_idf(word, target, n)

top_tfidf <- tfidf_words %>%
  filter(nchar(word) > 3) %>%
  group_by(target) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, target))

# ---------- EDA Plot 1: Business Variability by Sentiment ----------
# (keep your existing code from here)
# Create business-level variability feature
business_variability <- df %>%
  group_by(business_id) %>%
  summarise(business_review_variability = sd(target, na.rm = TRUE))

# Join back to main dataset
df <- df %>%
  left_join(business_variability, by = "business_id")

ggplot(df, aes(x = factor(target),
               y = business_review_variability,
               fill = factor(target))) +
  geom_boxplot(alpha = 0.7, color = "black") +
  scale_fill_manual(values = c("0" = "#B00000",   # Negative red
                               "1" = "#2ECC71")) +  # Positive green
  scale_x_discrete(labels = c("0" = "Negative", "1" = "Positive")) +
  labs(
    title = "Business Rating Variability by Sentiment Class",
    subtitle = "Comparing variability of review sentiment within businesses",
    x = "Sentiment (1 = Positive, 0 = Negative)",
    y = "Standard Deviation of Review Sentiment per Business"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    legend.position = "none"
  )

## Warning: Removed 11804 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# EDA Plot 2: User Loyalty & Sentiment Behavior
ggplot(user_loyalty, aes(x = review_count, y = positivity_rate)) +
  geom_point(aes(color = positivity_rate), alpha = 0.6, size = 2) +
  scale_x_log10() +
  scale_color_gradient(low = "#CC0000", high = "#2ECC71") +  # red → green
  labs(
    title = "User Loyalty & Sentiment Behavior",
    subtitle = "More experienced reviewers leave a more balanced mix of positive and negative sentiment",
    x = "Total Reviews Written (log scale)",
    y = "Positivity Rate (0 = all negative, 1 = all positive)",
    color = "Positivity Rate"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray30"),
    axis.title = element_text(face = "bold")
  )

# EDA Plot 3: Review Length By Star Rating
df %>%
  mutate(review_length = nchar(text_clean)) %>%
  group_by(stars) %>%
  summarise(mean_length = mean(review_length)) %>%
  ggplot(aes(x = stars, y = mean_length, color = stars)) +
  geom_line(size = 1.6) +
  geom_point(size = 4, color = "black") +   # black points
  scale_color_gradientn(
    colors = c("#CC0000", "#FFCC00", "#2ECC71")  # red -> yellow -> green
  ) +
  labs(
    title = "Average Review Length by Star Rating",
    subtitle = "Strong emotional responses (negative) produce longer reviews",
    x = "Star Rating",
    y = "Average Review Length (characters)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray30"),
    axis.title = element_text(face = "bold")
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# EDA Plot 4: Review Engagement (Reactions) by Sentiment Class
data %>%
  pivot_longer(cols = c(useful, funny, cool),
               names_to = "reaction_type",
               values_to = "value") %>%
  ggplot(aes(x = factor(target), y = value, fill = factor(target))) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.25, color = "gray30") +
  stat_summary(fun = mean, geom = "point",
               shape = 21, size = 3.5, fill = "black", color = "white") +
  scale_y_log10() +
  facet_wrap(~ reaction_type, scales = "free_y") +
  scale_fill_manual(values = sentiment_colors) +
  labs(
    title = "Review Engagement by Sentiment Class",
    subtitle = "Negative reviews receive higher engagement across useful (log scale)",
    x = "Sentiment (1 = Positive, 0 = Negative)",
    y = "Reaction Count (log scale)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray30"),
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    legend.position = "none"
  )

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 209367 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 209367 rows containing non-finite outside the scale range
## (`stat_summary()`).

# EDA Plot 5: Most Distinguishing Words in Positive vs Negative Reviews (TF-IDF)
ggplot(top_tfidf, aes(tf_idf, word, fill = factor(target))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ target, scales = "free_y",
             labeller = as_labeller(c(`0` = "Negative Reviews", `1` = "Positive Reviews"))) +
  scale_y_reordered() +
  scale_fill_manual(values = sentiment_colors) +
  labs(
    title = "Most Distinguishing Words in Positive vs. Negative Reviews (TF-IDF)",
    subtitle = "Negative sentiment displays highly differentiating problem language; positive sentiment uses more generic descriptors",
    x = "TF-IDF Score",
    y = ""
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray30"),
    axis.title = element_text(face = "bold")
  )

# Summary Statistics Table
library(skimr)
library(knitr)
library(kableExtra)

summary_vars <- df %>%
  select(
    stars, target,
    useful, funny, cool,
    user_review_count, user_average_stars, user_fans,
    business_stars, business_review_count
  )

skim_table <- skim(summary_vars) %>%
  select(-n_missing, -complete_rate)   # drop unwanted columns

skim_table %>%
  kable(format = "html", caption = "Summary Statistics for Key Numeric Variables") %>%
  kable_styling(
    bootstrap_options = c("hover", "striped"),
    full_width = FALSE,
    font_size = 13
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#2ECC71")

Summary Statistics for Key Numeric Variables
skim_type	skim_variable	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
numeric	stars	3.7851474	1.3925425	1	3.00	4.00	5.00	5	▂▂▂▅▇
numeric	target	0.6755368	0.4681764	0	0.00	1.00	1.00	1	▃▁▁▁▇
numeric	useful	0.9831263	2.5022857	0	0.00	0.00	1.00	152	▇▁▁▁▁
numeric	funny	0.2985158	1.7885266	0	0.00	0.00	0.00	361	▇▁▁▁▁
numeric	cool	0.4773158	1.9531269	0	0.00	0.00	0.00	141	▇▁▁▁▁
numeric	user_review_count	121.6553474	331.9615460	0	8.00	27.00	103.00	17473	▇▁▁▁▁
numeric	user_average_stars	3.7792680	0.7996369	1	3.44	3.89	4.28	5	▁▁▃▇▅
numeric	user_fans	12.4285158	83.3267267	0	0.00	0.00	4.00	12497	▇▁▁▁▁
numeric	business_stars	3.7891474	0.6271263	1	3.50	4.00	4.00	5	▁▁▁▇▃
numeric	business_review_count	489.0101579	843.0648177	5	90.00	220.00	506.00	7568	▇▁▁▁▁

# Correlation heatmap 
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

numeric_df <- df %>%
  mutate(review_length = nchar(text_clean)) %>%   # correct column name
  select(
    stars, target,
    useful, funny, cool,
    user_review_count, user_average_stars, user_fans,
    business_stars, business_review_count,
    review_length
  )

corr_matrix <- round(cor(numeric_df, use = "complete.obs"), 2)
corr_melt <- melt(corr_matrix)

ggplot(corr_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = value), color = "black", size = 3) +
  scale_fill_gradient2(
    low = "#CC0000", mid = "white", high = "#2ECC71", midpoint = 0,
    limits = c(-1, 1)
  ) +
  labs(
    title = "Correlation Heatmap of Numeric Features",
    subtitle = "Highlights relationships across review, user, and business metadata",
    x = "",
    y = "",
    fill = "Correlation"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

3.Feature Engineering

# Text Features 
data <- data %>%
  mutate(

    # 1) Review Length
    review_length = nchar(text_clean),

    # 2) Negative Keyword Count
    neg_word_count = str_count(text_clean, "\\b(bad|worst|awful|terrible|rude|disgusting|dirty|slow|unhelpful)\\b"),

    # 3) Positive Keyword Count
    pos_word_count = str_count(text_clean, "\\b(good|great|amazing|excellent|friendly|love|perfect|wonderful|happy)\\b"),

    # 4) Negation Count
    negation_count = str_count(text_clean, "\\b(not|no|never|nothing|none|n't)\\b"),

    # 5) Emphasis Score
    emphasis_score = str_count(text_clean, "[!?]{2,}") +
                     str_count(text_clean, "([A-Za-z])\\1{2,}"),
    
  # Metadata Features

    # 6) Total Reactions
    total_reactions = useful + funny + cool,

    # 7) Useful Ratio
    useful_ratio = ifelse(total_reactions == 0, 0, useful / total_reactions),

    # 8) Log User Review Count
    log_user_reviews = log1p(user_review_count),

    # 9) Log Business Reviews
    log_business_reviews = log1p(business_review_count)
  )

  #10 ) Business Varaibility 
business_variability <- data %>%
  group_by(business_id) %>%
  summarise(
    business_review_variability = sd(target, na.rm = TRUE)
  )

data <- data %>%
  left_join(business_variability, by = "business_id")

colnames(data)

##  [1] "review_id"                   "user_id"                    
##  [3] "business_id"                 "stars"                      
##  [5] "date"                        "text"                       
##  [7] "useful"                      "funny"                      
##  [9] "cool"                        "user_review_count"          
## [11] "user_average_stars"          "user_fans"                  
## [13] "business_name"               "business_city"              
## [15] "business_state"              "business_stars"             
## [17] "business_review_count"       "target"                     
## [19] "text_clean"                  "review_length"              
## [21] "neg_word_count"              "pos_word_count"             
## [23] "negation_count"              "emphasis_score"             
## [25] "total_reactions"             "useful_ratio"               
## [27] "log_user_reviews"            "log_business_reviews"       
## [29] "business_review_variability"

# Scale features appropriately 
features_to_scale <- c(
  "review_length",
  "neg_word_count",
  "pos_word_count",
  "negation_count",
  "emphasis_score",
  "total_reactions",
  "useful_ratio",
  "log_user_reviews",
  "log_business_reviews",
  "business_review_variability"
)

data[features_to_scale] <- scale(data[features_to_scale])

summary(data[features_to_scale])

##  review_length     neg_word_count    pos_word_count    negation_count    
##  Min.   :-1.0837   Min.   :-0.3596   Min.   :-1.0588   Min.   :-0.64925  
##  1st Qu.:-0.6488   1st Qu.:-0.3596   1st Qu.:-0.4523   1st Qu.:-0.64925  
##  Median :-0.3108   Median :-0.3596   Median :-0.4523   Median :-0.64925  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.2992   3rd Qu.:-0.3596   3rd Qu.: 0.7607   3rd Qu.:-0.00614  
##  Max.   : 8.9246   Max.   :15.2595   Max.   :12.2845   Max.   :57.23103  
##                                                                          
##  emphasis_score    total_reactions     useful_ratio     log_user_reviews 
##  Min.   :-0.1648   Min.   :-0.31949   Min.   :-0.7682   Min.   :-2.1539  
##  1st Qu.:-0.1648   1st Qu.:-0.31949   1st Qu.:-0.7682   1st Qu.:-0.7918  
##  Median :-0.1648   Median :-0.31949   Median :-0.7682   Median :-0.0882  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.1648   3rd Qu.: 0.04378   3rd Qu.: 0.7229   3rd Qu.: 0.7252  
##  Max.   :32.7685   Max.   :75.42220   Max.   : 1.7169   Max.   : 3.9017  
##                                                                          
##  log_business_reviews business_review_variability
##  Min.   :-2.72771     Min.   :-1.7015            
##  1st Qu.:-0.65157     1st Qu.:-0.3903            
##  Median : 0.02592     Median : 0.3527            
##  Mean   : 0.00000     Mean   : 0.0000            
##  3rd Qu.: 0.65992     3rd Qu.: 0.6440            
##  Max.   : 2.72400     Max.   : 1.5102            
##                       NA's   :11804

feature_cols <- c(
  "review_length",
  "neg_word_count",
  "pos_word_count",
  "negation_count",
  "emphasis_score",
  "total_reactions",
  "useful_ratio",
  "log_user_reviews",
  "log_business_reviews",
  "business_review_variability"
)

4.Train/Test

set.seed(123)  # ensures reproducible split

# 80/20 split index
train_index <- sample(seq_len(nrow(data)), size = 0.8 * nrow(data))

# Create train/test datasets
train_data <- data[train_index, ]
test_data  <- data[-train_index, ]

dim(train_data)

## [1] 76000    29

dim(test_data)

## [1] 19000    29

# Rebuild model datasets with only selected features + target
train_model <- train_data[, c(feature_cols, "target")]
test_model  <- test_data[, c(feature_cols, "target")]

# Drop rows with missing values BEFORE training/testing
train_model <- drop_na(train_model)
test_model  <- drop_na(test_model)

# Verify sizes
nrow(train_model)

## [1] 66584

nrow(test_model)

## [1] 16612

table(test_model$target)

## 
##     0     1 
##  5279 11333

#Why do we do this? : An 80/20  split ensures that the model is trained on the majority of the data while reserving a meaningful portion for unbiased evaluation. The training set allows the model to learn patterns in the data, while the test set simulates real-world "unseen"/"new" reviews to measure how well the model generalizes. Without a proper split, performance estimates would be overly optimistic because the model would be evaluated on data it has already seen.

#We set a seed to ensure reproducibility. Random operations such as train/test splitting can produce different results each time they run. Setting a seed locks in the randomization process so that the same rows are selected for training and testing every time the code is executed, allowing results to be consistently replicated and compared.

Model Training

# ===========================
# MODEL 1: LOGISTIC REGRESSION
# ===========================

# Convert target to factor with explicit labels
train_data <- train_data %>%
  mutate(target = ifelse(target == 1, "pos", "neg"),
         target = factor(target, levels = c("neg", "pos")))

test_data <- test_data %>%
  mutate(target = ifelse(target == 1, "pos", "neg"),
         target = factor(target, levels = c("neg", "pos")))

# keep only features + target for modeling
train_model <- train_data[, c(feature_cols, "target")]
test_model  <- test_data[,  c(feature_cols, "target")]

train_model <- train_model %>% drop_na()
test_model  <- test_model %>% drop_na()

library(caret)
library(pROC)

# Cross-validation setup for ROC
ctrl_roc <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

set.seed(123)
log_model <- train(
  target ~ .,
  data = train_model,
  method = "glm",
  family = "binomial",
  trControl = ctrl_roc,
  metric = "ROC"
)

# Weighted Logistic Regression 
set.seed(123)
log_model_weighted <- train(
  target ~ .,
  data = train_model,
  method = "glm",
  family = "binomial",
  trControl = ctrl_roc,
  metric = "ROC",                     # still optimize by ROC
  weights = ifelse(train_model$target == "neg", 3, 1)
)

# AUC on test set
log_pred_prob <- predict(log_model_weighted, newdata = test_model, type = "prob")[, 2]
log_roc <- roc(test_model$target, log_pred_prob)

## Setting levels: control = neg, case = pos

## Setting direction: controls < cases

log_auc <- as.numeric(auc(log_roc))
log_auc   # ~0.84

## [1] 0.8421641

# ===========================
# MODEL 2: RANDOM FOREST (WEIGHTED & TUNED)
# ===========================

set.seed(123)

rf_grid_small <- expand.grid(
  mtry = c(4, 6),
  splitrule = "gini",
  min.node.size = c(5, 10)
)

rf_model_weighted <- train(
  target ~ .,
  data = train_model,
  method = "ranger",
  metric = "ROC",
  trControl = ctrl_roc,
  tuneGrid = rf_grid_small,
  class.weights = c(neg = 3, pos = 1),
  num.trees = 400,
  importance = "impurity"
)

rf_model_weighted

## Random Forest 
## 
## 66584 samples
##    10 predictor
##     2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 53267, 53268, 53267, 53268, 53266 
## Resampling results across tuning parameters:
## 
##   mtry  min.node.size  ROC        Sens       Spec     
##   4      5             0.8517876  0.5936854  0.9028144
##   4     10             0.8536345  0.5945061  0.9033376
##   6      5             0.8477777  0.5964376  0.8971245
##   6     10             0.8502589  0.5947475  0.8999803
## 
## Tuning parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
##  and min.node.size = 10.

rfw_pred_prob  <- predict(rf_model_weighted, newdata = test_model, type = "prob")[, 2]
rfw_roc        <- roc(test_model$target, rfw_pred_prob)

## Setting levels: control = neg, case = pos
## Setting direction: controls < cases

rfw_auc        <- as.numeric(auc(rfw_roc))
rfw_auc   # ~0.85

## [1] 0.8536938

# ===========================
# MODEL 3: XGBOOST
# ===========================

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

# Use the same train_model / test_model (no redefinition)
xgb_train <- xgb.DMatrix(
  data = as.matrix(train_model[, feature_cols]),
  label = ifelse(train_model$target == "pos", 1, 0)
)

xgb_test <- xgb.DMatrix(
  data = as.matrix(test_model[, feature_cols]),
  label = ifelse(test_model$target == "pos", 1, 0)
)

params <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  max_depth = 4,
  eta = 0.1,
  subsample = 0.8,
  colsample_bytree = 0.8
)

set.seed(123)
xgb_model <- xgb.train(
  params = params,
  data = xgb_train,
  nrounds = 150,
  watchlist = list(val = xgb_test),
  verbose = 1
)

## [1]  val-auc:0.810508 
## [2]  val-auc:0.830280 
## [3]  val-auc:0.835386 
## [4]  val-auc:0.836768 
## [5]  val-auc:0.836858 
## [6]  val-auc:0.841557 
## [7]  val-auc:0.842659 
## [8]  val-auc:0.844684 
## [9]  val-auc:0.847355 
## [10] val-auc:0.846860 
## [11] val-auc:0.847291 
## [12] val-auc:0.847199 
## [13] val-auc:0.848977 
## [14] val-auc:0.849693 
## [15] val-auc:0.851784 
## [16] val-auc:0.853015 
## [17] val-auc:0.853595 
## [18] val-auc:0.854361 
## [19] val-auc:0.854587 
## [20] val-auc:0.855387 
## [21] val-auc:0.856106 
## [22] val-auc:0.856589 
## [23] val-auc:0.857023 
## [24] val-auc:0.857156 
## [25] val-auc:0.858688 
## [26] val-auc:0.858979 
## [27] val-auc:0.859381 
## [28] val-auc:0.859599 
## [29] val-auc:0.859856 
## [30] val-auc:0.860019 
## [31] val-auc:0.860212 
## [32] val-auc:0.860578 
## [33] val-auc:0.860649 
## [34] val-auc:0.861149 
## [35] val-auc:0.861122 
## [36] val-auc:0.861201 
## [37] val-auc:0.861913 
## [38] val-auc:0.862160 
## [39] val-auc:0.862414 
## [40] val-auc:0.862396 
## [41] val-auc:0.862790 
## [42] val-auc:0.862910 
## [43] val-auc:0.862953 
## [44] val-auc:0.863293 
## [45] val-auc:0.863426 
## [46] val-auc:0.863483 
## [47] val-auc:0.863763 
## [48] val-auc:0.863915 
## [49] val-auc:0.864146 
## [50] val-auc:0.864245 
## [51] val-auc:0.864460 
## [52] val-auc:0.864551 
## [53] val-auc:0.864562 
## [54] val-auc:0.864737 
## [55] val-auc:0.864734 
## [56] val-auc:0.864967 
## [57] val-auc:0.865003 
## [58] val-auc:0.865083 
## [59] val-auc:0.865258 
## [60] val-auc:0.865450 
## [61] val-auc:0.865422 
## [62] val-auc:0.865421 
## [63] val-auc:0.865523 
## [64] val-auc:0.865686 
## [65] val-auc:0.865722 
## [66] val-auc:0.865780 
## [67] val-auc:0.865806 
## [68] val-auc:0.865936 
## [69] val-auc:0.865930 
## [70] val-auc:0.866047 
## [71] val-auc:0.866170 
## [72] val-auc:0.866385 
## [73] val-auc:0.866403 
## [74] val-auc:0.866508 
## [75] val-auc:0.866546 
## [76] val-auc:0.866571 
## [77] val-auc:0.866594 
## [78] val-auc:0.866627 
## [79] val-auc:0.866659 
## [80] val-auc:0.866639 
## [81] val-auc:0.866634 
## [82] val-auc:0.866776 
## [83] val-auc:0.866877 
## [84] val-auc:0.866905 
## [85] val-auc:0.866940 
## [86] val-auc:0.866931 
## [87] val-auc:0.866897 
## [88] val-auc:0.866932 
## [89] val-auc:0.866933 
## [90] val-auc:0.866950 
## [91] val-auc:0.866944 
## [92] val-auc:0.866973 
## [93] val-auc:0.866998 
## [94] val-auc:0.866974 
## [95] val-auc:0.866982 
## [96] val-auc:0.866971 
## [97] val-auc:0.867051 
## [98] val-auc:0.867102 
## [99] val-auc:0.867127 
## [100]    val-auc:0.867136 
## [101]    val-auc:0.867135 
## [102]    val-auc:0.867106 
## [103]    val-auc:0.867145 
## [104]    val-auc:0.867168 
## [105]    val-auc:0.867177 
## [106]    val-auc:0.867170 
## [107]    val-auc:0.867176 
## [108]    val-auc:0.867169 
## [109]    val-auc:0.867179 
## [110]    val-auc:0.867208 
## [111]    val-auc:0.867245 
## [112]    val-auc:0.867290 
## [113]    val-auc:0.867312 
## [114]    val-auc:0.867371 
## [115]    val-auc:0.867377 
## [116]    val-auc:0.867478 
## [117]    val-auc:0.867510 
## [118]    val-auc:0.867566 
## [119]    val-auc:0.867526 
## [120]    val-auc:0.867501 
## [121]    val-auc:0.867503 
## [122]    val-auc:0.867520 
## [123]    val-auc:0.867531 
## [124]    val-auc:0.867522 
## [125]    val-auc:0.867518 
## [126]    val-auc:0.867570 
## [127]    val-auc:0.867597 
## [128]    val-auc:0.867589 
## [129]    val-auc:0.867629 
## [130]    val-auc:0.867624 
## [131]    val-auc:0.867630 
## [132]    val-auc:0.867665 
## [133]    val-auc:0.867639 
## [134]    val-auc:0.867651 
## [135]    val-auc:0.867638 
## [136]    val-auc:0.867680 
## [137]    val-auc:0.867689 
## [138]    val-auc:0.867679 
## [139]    val-auc:0.867758 
## [140]    val-auc:0.867791 
## [141]    val-auc:0.867791 
## [142]    val-auc:0.867842 
## [143]    val-auc:0.867834 
## [144]    val-auc:0.867829 
## [145]    val-auc:0.867839 
## [146]    val-auc:0.867852 
## [147]    val-auc:0.867824 
## [148]    val-auc:0.867829 
## [149]    val-auc:0.867824 
## [150]    val-auc:0.867782

xgb_pred <- predict(xgb_model, xgb_test)
xgb_auc  <- roc(test_model$target, xgb_pred)$auc

## Setting levels: control = neg, case = pos

## Setting direction: controls < cases

xgb_auc  # ~0.8656

## Area under the curve: 0.8678

# ===========================
# MODEL TRAINING SUMMARY TABLE
# ===========================

library(knitr)
library(kableExtra)

model_summary <- data.frame(
  Model = c("Logistic Regression (Weighted)", 
            "Random Forest (Weighted, Tuned)", 
            "XGBoost (Tuned)"),
  Tuning = c("class weights only (3:1 neg:pos)", 
             "mtry & min.node.size grid (5-fold CV) + class weights", 
             "depth, learning rate, nrounds, subsampling"),
  Key_Parameters = c(
    "glm (binomial link), weights(neg=3,pos=1)",
    "ranger, mtry ∈ {4,6}, min.node.size ∈ {5,10}, class.weights(neg=3,pos=1), num.trees=400",
    "nrounds=150, max_depth=4, eta=0.1, subsample=0.8, colsample_bytree=0.8"
  ),
  AUC = c(round(log_auc, 4), round(rfw_auc, 4), round(as.numeric(xgb_auc), 4))
)

kable(model_summary, format = "html", caption = "Model Training Summary") %>%
  kable_classic(full_width = FALSE, html_font = "Calibri") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(4, bold = TRUE, color = c("black", "black", "darkgreen"))

Model Training Summary
Model	Tuning	Key_Parameters	AUC
Logistic Regression (Weighted)	class weights only (3:1 neg:pos)	glm (binomial link), weights(neg=3,pos=1)	0.8422
Random Forest (Weighted, Tuned)	mtry & min.node.size grid (5-fold CV) + class weights	ranger, mtry ∈ {4,6}, min.node.size ∈ {5,10}, class.weights(neg=3,pos=1), num.trees=400	0.8537
XGBoost (Tuned)	depth, learning rate, nrounds, subsampling	nrounds=150, max_depth=4, eta=0.1, subsample=0.8, colsample_bytree=0.8	0.8678

Model Evaluation and Performance

library(caret)
library(pROC)

# 1) WEIGHTED LOGISTIC PREDICTIONS @ 0.5
log_pred_prob  <- predict(log_model_weighted, newdata = test_model, type = "prob")[, 2]
log_pred_class <- ifelse(log_pred_prob > 0.5, "pos", "neg")
log_pred_class <- factor(log_pred_class, levels = c("neg", "pos"))

cm_log  <- confusionMatrix(log_pred_class, test_model$target, positive = "pos")
log_roc <- roc(test_model$target, log_pred_prob)

## Setting levels: control = neg, case = pos

## Setting direction: controls < cases

log_auc <- as.numeric(auc(log_roc))

# 2) RANDOM FOREST (WEIGHTED) @ 0.5
rf_pred_prob  <- predict(rf_model_weighted, newdata = test_model, type = "prob")[, 2]
rf_pred_class <- ifelse(rf_pred_prob > 0.5, "pos", "neg")
rf_pred_class <- factor(rf_pred_class, levels = c("neg", "pos"))

cm_rf  <- confusionMatrix(rf_pred_class, test_model$target, positive = "pos")
rf_roc <- roc(test_model$target, rf_pred_prob)

## Setting levels: control = neg, case = pos
## Setting direction: controls < cases

rf_auc <- as.numeric(auc(rf_roc))

# 3) XGBOOST @ 0.5
test_matrix    <- model.matrix(target ~ . - 1, data = test_model)
xgb_pred_prob  <- predict(xgb_model, test_matrix)
xgb_pred_class <- ifelse(xgb_pred_prob > 0.5, "pos", "neg")
xgb_pred_class <- factor(xgb_pred_class, levels = c("neg", "pos"))

cm_xgb  <- confusionMatrix(xgb_pred_class, test_model$target, positive = "pos")
xgb_roc <- roc(test_model$target, xgb_pred_prob)

## Setting levels: control = neg, case = pos
## Setting direction: controls < cases

xgb_auc <- as.numeric(auc(xgb_roc))

# 4) COLLECT METRICS INTO FINAL TABLE
results_test <- data.frame(
  Model     = c("Logistic Regression (0.5)",
                "Random Forest (Tuned, 0.5)",
                "XGBoost (0.5)"),
  Accuracy  = c(cm_log$overall["Accuracy"],
                cm_rf$overall["Accuracy"],
                cm_xgb$overall["Accuracy"]),
  Precision = c(cm_log$byClass["Precision"],
                cm_rf$byClass["Precision"],
                cm_xgb$byClass["Precision"]),
  Recall    = c(cm_log$byClass["Recall"],
                cm_rf$byClass["Recall"],
                cm_xgb$byClass["Recall"]),
  F1        = c(cm_log$byClass["F1"],
                cm_rf$byClass["F1"],
                cm_xgb$byClass["F1"]),
  AUC       = c(log_auc, rf_auc, xgb_auc)
)

results_test

##                        Model  Accuracy Precision    Recall        F1       AUC
## 1  Logistic Regression (0.5) 0.7456056 0.8830441 0.7228448 0.7949539 0.8421641
## 2 Random Forest (Tuned, 0.5) 0.8022514 0.8228498 0.9049678 0.8619574 0.8536938
## 3              XGBoost (0.5) 0.8104984 0.8253956 0.9159975 0.8683396 0.8677819

library(kableExtra)

kable(results_test, format = "html", caption = "Model Evaluation on Test Set") %>%
  kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover")) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "8em")

Model Evaluation on Test Set
Model	Accuracy	Precision	Recall	F1	AUC
Logistic Regression (0.5)	0.7456056	0.8830441	0.7228448	0.7949539	0.8421641
Random Forest (Tuned, 0.5)	0.8022514	0.8228498	0.9049678	0.8619574	0.8536938
XGBoost (0.5)	0.8104984	0.8253956	0.9159975	0.8683396	0.8677819

# Build tidy ROC data for each model
log_df <- data.frame(
  fpr   = 1 - log_roc$specificities,
  tpr   = log_roc$sensitivities,
  model = "Logistic"
)

rf_df <- data.frame(
  fpr   = 1 - rf_roc$specificities,
  tpr   = rf_roc$sensitivities,
  model = "Random Forest"
)

xgb_df <- data.frame(
  fpr   = 1 - xgb_roc$specificities,
  tpr   = xgb_roc$sensitivities,
  model = "XGBoost"
)

roc_df <- bind_rows(log_df, rf_df, xgb_df)

# Add AUC to legend labels
auc_vals <- c(
  Logistic      = log_auc,
  `Random Forest` = rf_auc,
  XGBoost       = xgb_auc
)

roc_df$model <- factor(
  roc_df$model,
  levels = c("Logistic", "Random Forest", "XGBoost"),
  labels = paste0(
    c("Logistic", "Random Forest", "XGBoost"),
    " (AUC = ",
    round(auc_vals[c("Logistic","Random Forest","XGBoost")], 3),
    ")"
  )
)

ggplot(roc_df, aes(x = fpr, y = tpr, color = model)) +
  geom_line(size = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha = 0.5) +
  labs(
    title = "ROC Curves on Test Set",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  coord_equal() +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

#Find Feature Importance

# Extract coefficients from weighted logistic model
log_coef <- summary(log_model_weighted)$coefficients

# Convert to data frame
log_imp <- data.frame(
  Feature = rownames(log_coef),
  Coefficient = log_coef[, "Estimate"]
)

# Remove intercept
log_imp <- log_imp[log_imp$Feature != "(Intercept)", ]

# Add absolute magnitude for ranking
log_imp$Abs_Coefficient <- abs(log_imp$Coefficient)

# Sort by importance
log_imp <- log_imp[order(-log_imp$Abs_Coefficient), ]

# Plot
library(ggplot2)

ggplot(log_imp, aes(x = reorder(Feature, Abs_Coefficient), y = Abs_Coefficient, fill = Coefficient > 0)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("#B00000", "#2ECC71"), labels = c("Negative Effect", "Positive Effect")) +
  labs(
    title = "Logistic Regression Feature Importance",
    subtitle = "Importance measured by absolute coefficient magnitude",
    x = "Feature",
    y = "Coefficient Magnitude",
    fill = "Effect Direction"
  ) +
  theme_minimal(base_size = 14)

rf_imp <- varImp(rf_model_weighted, scale = FALSE)
rf_imp

## ranger variable importance
## 
##                             Overall
## negation_count               6458.5
## business_review_variability  6348.2
## review_length                6284.4
## log_business_reviews         6023.2
## neg_word_count               6001.4
## log_user_reviews             5609.0
## pos_word_count               5312.4
## total_reactions              1524.9
## useful_ratio                 1458.6
## emphasis_score                260.5

plot(rf_imp, top = 10, main = "Random Forest Feature Importance")

library(xgboost)
library(ggplot2)
library(dplyr)
library(forcats)

# Extract importance
xgb_imp <- xgb.importance(model = xgb_model)

# Plot with new colors
ggplot(xgb_imp, aes(x = fct_rev(Feature), y = Gain)) +
  geom_col(fill = "#3b82f6") +   # blue
  coord_flip() +
  labs(title = "XGBoost Feature Importance",
       x = "Feature",
       y = "Gain (Contribution to Model Performance)") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.minor = element_blank()
  )

HW4_Anna_Keough

Anna Keough

2025-11-19