## ----eda-packages, message=FALSE, warning=FALSE----
library(tidyverse)
library(tidytext)
library(ggplot2)
library(stringr)
library(forcats)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Install textdata for sentiment lexicons (AFINN)
if (!require(textdata)) {
install.packages("textdata")
library(textdata)
}
## Loading required package: textdata
## Warning: package 'textdata' was built under R version 4.3.3
## ----eda-tokenize----
# Tokenize text into words
tokens <- df %>%
unnest_tokens(word, clean_text) # uses the cleaned text created earlier
# Load sentiment lexicon (AFINN gives numeric scores)
afinn <- get_sentiments("afinn")
# Join tokens with sentiment scores
tokens_sent <- tokens %>%
left_join(afinn, by = "word")
## ----plot-star-dist----
ggplot(df, aes(x = y)) +
geom_bar(fill = "#87CEFA") +
labs(
title = "Distribution of Positive vs. Non-Positive Reviews",
x = "y (1 = stars > 3, 0 = stars ≤ 3)",
y = "Count"
) +
theme_minimal()
## ----sentiment-by-review----
# Get sentiment score for each review by summing token sentiment
review_sent <- tokens_sent %>%
group_by(review_id) %>%
summarize(sentiment = sum(value, na.rm = TRUE))
# Merge back into df
df_sent <- df %>%
left_join(review_sent, by = "review_id")
## ----plot-sentiment----
ggplot(df_sent, aes(x = y, y = sentiment, fill = factor(y))) +
geom_boxplot(alpha = 0.6) +
scale_fill_manual(values = c("#FF9999", "#99CCFF")) +
labs(
title = "Sentiment Score by Review Outcome",
x = "Review Outcome (y)",
y = "Sentiment Score",
fill = "y"
) +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## ----fp-pronoun-calc----
fp_words <- c("i", "me", "my", "mine", "we", "us", "our", "ours")
fp_counts <- tokens %>%
filter(word %in% fp_words) %>%
group_by(review_id) %>%
summarize(fp_count = n())
# Add to main df
df_fp <- df %>%
left_join(fp_counts, by = "review_id") %>%
mutate(fp_count = replace_na(fp_count, 0))
## ----plot-fp-pronouns----
ggplot(df_fp, aes(x = y, y = fp_count, fill = factor(y))) +
geom_boxplot(alpha = 0.6) +
scale_fill_manual(values = c("#F4A7A7", "#A7D3F4")) +
labs(
title = "First-Person Pronoun Usage by Review Outcome",
x = "Review Outcome (y)",
y = "First-Person Pronoun Count",
fill = "y"
) +
theme_minimal()
## ----plot-user-fans----
ggplot(df, aes(x = y, y = user_fans, fill = factor(y))) +
geom_boxplot(alpha = 0.6) +
scale_y_log10(labels = comma) + # log scale to handle skew
scale_fill_manual(values = c("#FFB3B3", "#B3D9FF")) +
labs(
title = "User Fans by Review Outcome (log scale)",
x = "Review Outcome (y)",
y = "User Fans (log scale)",
fill = "y"
) +
theme_minimal()
## Warning in scale_y_log10(labels = comma): log-10 transformation introduced
## infinite values.
## Warning: Removed 48232 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## ----city-positive-rate----
city_pos <- df %>%
group_by(business_city) %>%
summarize(
n = n(),
positive_rate = mean(y)
) %>%
filter(n > 50) %>% # filter to cities with enough data
arrange(desc(positive_rate)) %>%
slice(1:10)
ggplot(city_pos, aes(x = fct_reorder(business_city, positive_rate),
y = positive_rate)) +
geom_col(fill = "#87CEFA") +
coord_flip() +
scale_y_continuous(labels = percent) +
labs(
title = "Top 10 Cities by % of Positive Reviews",
x = "City",
y = "Positive Review Rate"
) +
theme_minimal()
## ----define-regions----
west_coast <- c("CA", "OR", "WA")
east_coast <- c("NY", "NJ", "MA", "MD", "VA", "PA", "CT", "RI", "NC", "SC", "GA", "FL")
df_region <- df %>%
mutate(region = case_when(
business_state %in% west_coast ~ "West Coast",
business_state %in% east_coast ~ "East Coast",
TRUE ~ "Other"
))
## ----plot-regions----
region_summary <- df_region %>%
group_by(region) %>%
summarize(positive_rate = mean(y))
ggplot(region_summary, aes(x = region, y = positive_rate, fill = region)) +
geom_col(alpha = 0.8) +
scale_y_continuous(labels = percent) +
labs(
title = "Positive Review Rates by Region",
x = "Region",
y = "Positive Review Rate"
) +
theme_minimal() +
theme(legend.position = "none")
## ----summary-stats----
df %>%
summarize(
avg_review_length = mean(str_count(clean_text, "\\S+")),
median_user_fans = median(user_fans),
avg_useful = mean(useful),
avg_funny = mean(funny),
avg_cool = mean(cool)
)
## # A tibble: 1 × 5
## avg_review_length median_user_fans avg_useful avg_funny avg_cool
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 103. 0 0.983 0.299 0.477
#Insights: ## Interesting trends and relationships emerged from the exploratory data analysis. Notably, reviews with higher sentiment scores tended to correspond with positive outcomes (y = 1). Additionally, first-person pronoun usage appeared more frequently in positive reviews, suggesting a more personal connection in favorable feedback(I vs. They). User engagement metrics, such as the number of fans, also showed a correlation with review positivity, indicating that more popular users may leave more positive reviews. Geographically, certain cities and regions exhibited higher rates of positive reviews, which could reflect local cultural attitudes or business practices. These findings provide valuable insights for further analysis and model development.
## ----feature-engineering, message=FALSE, warning=FALSE----
library(tidyverse)
library(tidytext)
library(stringr)
library(forcats)
library(textdata)
# ---------------------------------------------------------------
# TOKENIZE TEXT (clean_text already created in preprocessing)
# ---------------------------------------------------------------
tokens <- df %>%
unnest_tokens(word, clean_text) # splits text into individual words
# ---------------------------------------------------------------
# TEXT FEATURE 1: SENTIMENT SCORE (AFINN)
# ---------------------------------------------------------------
afinn <- get_sentiments("afinn")
tokens_sent <- tokens %>%
left_join(afinn, by = "word")
review_sent <- tokens_sent %>%
group_by(review_id) %>%
summarize(sentiment = sum(value, na.rm = TRUE))
df <- df %>%
left_join(review_sent, by = "review_id") %>%
mutate(sentiment = replace_na(sentiment, 0))
# ---------------------------------------------------------------
# TEXT FEATURE 2: FIRST-PERSON PRONOUN COUNT + DENSITY
# ---------------------------------------------------------------
fp_words <- c("i","me","my","mine","we","us","our","ours")
fp_counts <- tokens %>%
filter(word %in% fp_words) %>%
count(review_id, name = "fp_count")
df <- df %>%
left_join(fp_counts, by = "review_id") %>%
mutate(fp_count = replace_na(fp_count, 0))
# ---------------------------------------------------------------
# TEXT FEATURE 3: REVIEW LENGTH (WORD COUNT)
# ---------------------------------------------------------------
df <- df %>%
mutate(
word_count = str_count(clean_text, "\\S+"),
review_length = word_count
)
# ---------------------------------------------------------------
# TEXT FEATURE 4: PROPORTION OF POSITIVE WORDS (bing lexicon)
# ---------------------------------------------------------------
bing_pos <- get_sentiments("bing") %>%
filter(sentiment == "positive")
pos_counts <- tokens %>%
inner_join(bing_pos, by = "word") %>%
count(review_id, name = "pos_word_count")
df <- df %>%
left_join(pos_counts, by = "review_id") %>%
mutate(
pos_word_count = replace_na(pos_word_count, 0),
pos_prop = pos_word_count / pmax(word_count, 1)
)
# ---------------------------------------------------------------
# TEXT FEATURE 5: NEGATION COUNT
# ---------------------------------------------------------------
neg_words <- c("not","no","never","n't","didn't","dont","don't","cannot","can't")
neg_counts <- tokens %>%
filter(word %in% neg_words) %>%
count(review_id, name = "neg_count")
df <- df %>%
left_join(neg_counts, by = "review_id") %>%
mutate(neg_count = replace_na(neg_count, 0))
# ---------------------------------------------------------------
# METADATA FEATURE 1: log_user_fans
# ---------------------------------------------------------------
df <- df %>%
mutate(log_user_fans = log1p(user_fans))
# ---------------------------------------------------------------
# METADATA FEATURE 2: REVIEW AGE (in days)
# ---------------------------------------------------------------
df <- df %>%
mutate(date = as.Date(date))
max_date <- max(df$date, na.rm = TRUE)
df <- df %>%
mutate(
review_age_days = as.numeric(max_date - date),
log_review_age = log1p(review_age_days)
)
# ---------------------------------------------------------------
# METADATA FEATURE 3: REGION (West / East / Other)
# ---------------------------------------------------------------
west_coast <- c("CA","OR","WA")
east_coast <- c("NY","NJ","MA","MD","VA","PA","CT","RI","NC","SC","GA","FL")
df <- df %>%
mutate(
region = case_when(
business_state %in% west_coast ~ "West",
business_state %in% east_coast ~ "East",
TRUE ~ "Other"
),
region = factor(region)
)
# ---------------------------------------------------------------
# METADATA FEATURE 4: BUSINESS CITY GROUPING (Top 10 cities)
# ---------------------------------------------------------------
top_cities <- df %>%
count(business_city, sort = TRUE) %>%
slice(1:10) %>%
pull(business_city)
df <- df %>%
mutate(
city_group = if_else(business_city %in% top_cities,
business_city, "Other"),
city_group = factor(city_group)
)
# ---------------------------------------------------------------
# METADATA FEATURE 5: USER ENGAGEMENT SCORE
# (useful + funny + cool)
# ---------------------------------------------------------------
df <- df %>%
mutate(user_engagement = useful + funny + cool)
# ---------------------------------------------------------------
# FINAL TEXT METRIC: FIRST-PERSON PRONOUN DENSITY
# ---------------------------------------------------------------
df <- df %>%
mutate(fp_density = fp_count / pmax(word_count, 1))
## ----train-test-split, message=FALSE----
set.seed(123) # ensures reproducibility
# Create an index for an 80/20 split
train_index <- sample(seq_len(nrow(df)), size = 0.8 * nrow(df))
# Subset into training and testing sets
train_df <- df[train_index, ]
test_df <- df[-train_index, ]
# Check class balance in both sets
table(train_df$y) / nrow(train_df)
##
## 0 1
## 0.3229737 0.6770263
table(test_df$y) / nrow(test_df)
##
## 0 1
## 0.3304211 0.6695789
## ----model-training-fast, message=FALSE, warning=FALSE----
library(ranger)
## Warning: package 'ranger' was built under R version 4.3.3
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Ensure classification mode
train_df$y <- factor(train_df$y, levels = c(0,1))
test_df$y <- factor(test_df$y, levels = c(0,1))
# ---------------------------------------------------------------
# 1. LOGISTIC REGRESSION (Benchmark)
# ---------------------------------------------------------------
log_model <- glm(
y ~ sentiment + fp_density + review_length + pos_prop + neg_count +
log_user_fans + review_age_days + region + city_group + user_engagement,
data = train_df,
family = binomial
)
# Predict
log_probs <- predict(log_model, newdata = test_df, type = "response")
log_auc <- roc(test_df$y, log_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ---------------------------------------------------------------
# 2. RANDOM FOREST (FAST VERSION with ranger)
# ---------------------------------------------------------------
rf_model <- ranger(
y ~ sentiment + fp_density + review_length + pos_prop + neg_count +
log_user_fans + review_age_days + region + city_group + user_engagement,
data = train_df,
num.trees = 150, # extremely fast but accurate
mtry = 4,
probability = TRUE,
importance = "impurity"
)
rf_probs <- predict(rf_model, test_df)$predictions[, "1"]
rf_auc <- roc(test_df$y, rf_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ---------------------------------------------------------------
# 3. XGBOOST (Memory Safe)
# ---------------------------------------------------------------
# Convert factors to numeric for XGBoost
train_matrix <- model.matrix(
y ~ sentiment + fp_density + review_length + pos_prop + neg_count +
log_user_fans + review_age_days + region + city_group + user_engagement,
data = train_df
)[, -1]
test_matrix <- model.matrix(
y ~ sentiment + fp_density + review_length + pos_prop + neg_count +
log_user_fans + review_age_days + region + city_group + user_engagement,
data = test_df
)[, -1]
dtrain <- xgb.DMatrix(data = train_matrix, label = as.numeric(train_df$y) - 1)
dtest <- xgb.DMatrix(data = test_matrix, label = as.numeric(test_df$y) - 1)
xgb_model <- xgboost(
data = dtrain,
objective = "binary:logistic",
nrounds = 80,
max_depth = 4,
eta = 0.15,
subsample = 0.8,
colsample_bytree = 0.8,
verbose = 0
)
xgb_probs <- predict(xgb_model, dtest)
xgb_auc <- roc(test_df$y, xgb_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ---------------------------------------------------------------
# Print AUCs
# ---------------------------------------------------------------
log_auc
## Area under the curve: 0.8576
rf_auc
## Area under the curve: 0.8604
xgb_auc
## Area under the curve: 0.868
## ----model-eval-clean, message=FALSE, warning=FALSE----
library(pROC)
# ----------------------------------------
# 1. Ensure y is numeric 0/1
# ----------------------------------------
test_df$y <- as.numeric(as.character(test_df$y))
# ----------------------------------------
# 2. Probability Predictions
# ----------------------------------------
# Logistic Regression
log_probs <- predict(log_model, newdata = test_df, type = "response")
# Random Forest (ranger)
rf_probs <- predict(rf_model, data = test_df)$predictions[, "1"]
# XGBoost
xgb_probs <- predict(xgb_model, dtest)
# ----------------------------------------
# 3. CLASS Predictions (0/1)
# ----------------------------------------
log_pred <- ifelse(log_probs > 0.5, 1, 0)
rf_pred <- ifelse(rf_probs > 0.5, 1, 0)
xgb_pred <- ifelse(xgb_probs > 0.5, 1, 0)
# ----------------------------------------
# 4. METRICS: ACCURACY, PRECISION, RECALL, AUC
# ----------------------------------------
# Accuracy
log_acc <- mean(log_pred == test_df$y)
rf_acc <- mean(rf_pred == test_df$y)
xgb_acc <- mean(xgb_pred == test_df$y)
# Precision
log_prec <- sum(log_pred == 1 & test_df$y == 1) / sum(log_pred == 1)
rf_prec <- sum(rf_pred == 1 & test_df$y == 1) / sum(rf_pred == 1)
xgb_prec <- sum(xgb_pred == 1 & test_df$y == 1) / sum(xgb_pred == 1)
# Recall
log_rec <- sum(log_pred == 1 & test_df$y == 1) / sum(test_df$y == 1)
rf_rec <- sum(rf_pred == 1 & test_df$y == 1) / sum(test_df$y == 1)
xgb_rec <- sum(xgb_pred == 1 & test_df$y == 1) / sum(test_df$y == 1)
# AUC
log_auc <- roc(test_df$y, log_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
rf_auc <- roc(test_df$y, rf_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
xgb_auc <- roc(test_df$y, xgb_probs)$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ----------------------------------------
# 5. RESULTS TABLE
# ----------------------------------------
results <- data.frame(
Model = c("Logistic Regression","Random Forest (ranger)","XGBoost"),
Accuracy = c(log_acc, rf_acc, xgb_acc),
Precision = c(log_prec, rf_prec, xgb_prec),
Recall = c(log_rec, rf_rec, xgb_rec),
AUC = c(log_auc, rf_auc, xgb_auc)
)
results
## Model Accuracy Precision Recall AUC
## 1 Logistic Regression 0.8104737 0.8262394 0.9078761 0.8575527
## 2 Random Forest (ranger) 0.8111053 0.8246250 0.9118063 0.8603542
## 3 XGBoost 0.8158421 0.8255559 0.9191951 0.8680162
# ----------------------------------------
# 6. ROC Curves
# ----------------------------------------
plot(roc(test_df$y, log_probs), col="blue", lwd=2, main="ROC Curves")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lines(roc(test_df$y, rf_probs), col="forestgreen", lwd=2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lines(roc(test_df$y, xgb_probs), col="red", lwd=2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright",
legend=c("Logistic","Random Forest","XGBoost"),
col=c("blue","forestgreen","red"),
lwd=2)
# ----------------------------------------
# 7. FEATURE IMPORTANCE
# ----------------------------------------
# Random Forest importance
rf_importance <- ranger::importance(rf_model)
rf_importance
## sentiment fp_density review_length pos_prop neg_count
## 5881.5577 2239.7256 2531.7310 7877.0679 2366.3944
## log_user_fans review_age_days region city_group user_engagement
## 1437.3462 2871.5325 353.5957 1109.4808 840.6835
# XGBoost importance
xgb.importance(model = xgb_model)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: pos_prop 4.878552e-01 1.922690e-01 0.1401547721
## 2: sentiment 2.639299e-01 2.174835e-01 0.1590713672
## 3: neg_count 1.392676e-01 1.461668e-01 0.1238177128
## 4: review_length 3.674500e-02 1.126546e-01 0.1341358555
## 5: log_user_fans 3.084058e-02 6.625555e-02 0.1126397248
## 6: review_age_days 1.751576e-02 8.113323e-02 0.1281169390
## 7: fp_density 1.093917e-02 7.306532e-02 0.0920034394
## 8: user_engagement 8.284626e-03 6.633109e-02 0.0627687016
## 9: city_groupOther 1.829079e-03 1.211391e-02 0.0137575236
## 10: city_groupIndianapolis 7.069719e-04 4.621181e-03 0.0042992261
## 11: city_groupPhiladelphia 6.171121e-04 6.575239e-03 0.0085984523
## 12: city_groupNew Orleans 4.638605e-04 8.465354e-03 0.0060189166
## 13: city_groupTampa 3.921095e-04 9.116545e-03 0.0060189166
## 14: regionOther 1.598316e-04 2.871951e-03 0.0025795357
## 15: city_groupSaint Louis 1.253069e-04 1.743797e-04 0.0008598452
## 16: city_groupNashville 1.138589e-04 1.797755e-05 0.0008598452
## 17: regionWest 1.031801e-04 5.529552e-04 0.0017196905
## 18: city_groupTucson 6.286633e-05 5.265587e-05 0.0017196905
## 19: city_groupReno 4.797506e-05 7.872166e-05 0.0008598452
########AI Use Statement
########I used AI tools (ChatGPT and GitHub Copilot) to support parts of this assignment, including generating example R code snippets, debugging error messages, and drafting short explanatory text for several sections. All code was run, reviewed, and modified by me to ensure accuracy and fit the specific requirements of the homework. All interpretations, analyses, discussion of results, and final model evaluations were written and validated by me.