Employee reviews on Glassdoor capture a forward-looking, “from-the-inside” signal about firm health: morale, leadership, work culture, and outlook on the business. Prior research (e.g., Becker et al., 2022) shows that aggregated employee sentiment has predictive power for firm fundamentals and stock returns. This project applies text-mining toolkits to a large corporate corpus and then links the resulting sentiment index to actual stock-price behavior.
Companies analyzed (n = 5):
| Firm in dataset | Ticker | Sector |
|---|---|---|
| Apple | AAPL | Technology |
| Microsoft | MSFT | Technology |
| Google (Alphabet) | GOOGL | Technology / Internet |
| IBM | IBM | Technology / Services |
| Oracle | ORCL | Technology / Software |
Window: 2008-Q1 to 2021-Q2 (full Glassdoor coverage), giving ~13 years of quarterly observations per firm — well above the 2-year minimum.
Pipeline at a glance:
yfR::yf_get() and compute average
quarterly returns.# Load libraries
library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(yfR)
library(scales)
library(knitr)
library(kableExtra)
library(broom)
library(wordcloud2)
library(radarchart)
library(ggrepel)
library(patchwork)
library(RColorBrewer)
# Consistent firm <-> ticker mapping used throughout
firm_ticker <- tibble(
firm = c("Apple", "Microsoft", "Google", "IBM", "Oracle"),
ticker = c("AAPL", "MSFT", "GOOGL", "IBM", "ORCL")
)
# A consistent color palette for the 5 firms (used in every plot)
firm_pal <- c(
"Apple" = "darkorchid",
"Microsoft" = "deeppink2",
"Google" = "green2",
"IBM" = "turquoise2",
"Oracle" = "darkgoldenrod1"
)
theme_set(theme_minimal(base_size = 12))
Note: Set
glassdoor_pathto wherever yourglassdoor_reviews.csvlives on disk (or usefile.choose()like in the previous assignment).
glassdoor_path <- "glassdoor_reviews.csv" # <-- adjust path if needed
# glassdoor_path <- file.choose() # alt: interactive picker
reviews_raw <- readr::read_csv(glassdoor_path, show_col_types = FALSE)
cat("Total reviews in file:", nrow(reviews_raw), "\n")
## Total reviews in file: 838566
cat("Unique firms in file: ", n_distinct(reviews_raw$firm), "\n")
## Unique firms in file: 428
reviews <- reviews_raw %>%
filter(firm %in% firm_ticker$firm) %>%
left_join(firm_ticker, by = "firm") %>%
mutate(
date_review = ymd(date_review),
pros = replace_na(pros, ""),
cons = replace_na(cons, ""),
headline = replace_na(headline, ""),
full_text = str_c(pros, " ", cons),
full_text = str_replace_all(full_text, "[‘’‚‛]", "'"),
full_text = str_squish(full_text),
year = year(date_review),
quarter = paste0(year(date_review), "-Q", quarter(date_review)),
qtr_date = floor_date(date_review, "quarter")
) %>%
filter(!is.na(date_review), full_text != "")
cat("Reviews kept (5 target firms): ", nrow(reviews), "\n")
## Reviews kept (5 target firms): 155844
cat("Date range: ",
format(min(reviews$date_review)), " to ",
format(max(reviews$date_review)), "\n")
## Date range: 2008-01-31 to 2021-06-06
reviews %>% count(firm, sort = TRUE) %>%
kable(caption = "Review counts by firm") %>% kable_styling(full_width = FALSE)
| firm | n |
|---|---|
| IBM | 60436 |
| Oracle | 31941 |
| Microsoft | 26675 |
| Apple | 20797 |
| 15995 |
reviews %>%
count(firm, year) %>%
ggplot(aes(year, n, color = firm)) +
geom_line(linewidth = 1) +
geom_point(size = 1.6) +
scale_color_manual(values = firm_pal) +
scale_y_continuous(labels = comma) +
labs(title = "Annual Glassdoor review volume by firm",
x = NULL, y = "Reviews per year", color = NULL)
overall_rating (1 to 5) is a built-in human label that
we can later cross-check against our text-derived AFINN score.
reviews %>%
ggplot(aes(factor(overall_rating), fill = firm)) +
geom_bar(position = "dodge") +
scale_fill_manual(values = firm_pal) +
labs(title = "Distribution of Glassdoor overall_rating by firm",
x = "Overall rating (1 = worst, 5 = best)", y = "Count", fill = NULL)
Reusing the char_count / word_count style
from the prior assignment:
reviews <- reviews %>%
mutate(char_count = nchar(full_text),
word_count_raw = str_count(full_text, "\\S+"))
ggplot(reviews, aes(firm, word_count_raw, fill = firm)) +
geom_boxplot(outlier.alpha = 0.15) +
scale_fill_manual(values = firm_pal, guide = "none") +
scale_y_log10(labels = comma) +
labs(title = "Review length by firm (log scale)",
x = NULL, y = "Words per review (pros + cons)")
We tokenize pros + cons, drop standard stop words, then
drop a custom list of corporate / HR-specific filler. The list mirrors
the structure of remove_words from Assignment 2 but is
tuned for Glassdoor reviews.
remove_words <- c(
# firm names so they don't contaminate sentiment
"apple", "microsoft", "google", "ibm", "oracle", "alphabet",
# generic Glassdoor / workplace filler
"company", "companies", "team", "teams", "people", "employee", "employees",
"manager", "managers", "management", "mgmt", "job", "jobs", "role", "roles",
"work", "working", "career", "office", "department",
# time / pronouns / fillers
"lot", "lots", "day", "days", "year", "years", "month", "months", "week",
"time", "times", "hour", "hours", "much", "many", "really", "lots",
"im", "ive", "dont", "didnt", "cant", "wont", "isnt", "thats", "theres",
"i'm", "i've", "don't", "didn't", "can't", "won't", "isn't", "that's",
# generic positive/negative noise that AFINN already captures
"pros", "cons", "review", "reviews", "feedback"
)
review_tokens <- reviews %>%
mutate(doc_id = row_number()) %>%
select(doc_id, firm, ticker, date_review, qtr_date, quarter, year,
overall_rating, full_text) %>%
unnest_tokens(word, full_text) %>%
filter(!word %in% stop_words$word,
!word %in% remove_words,
nchar(word) > 2,
!str_detect(word, "^[0-9]+$"))
cat("Tokens after cleaning:", nrow(review_tokens), "\n")
## Tokens after cleaning: 2083376
top_words_firm <- review_tokens %>%
count(firm, word, sort = TRUE) %>%
group_by(firm) %>%
slice_max(n, n = 15) %>%
ungroup() %>%
mutate(word = reorder_within(word, n, firm))
ggplot(top_words_firm, aes(word, n, fill = firm)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ firm, scales = "free_y", ncol = 2) +
coord_flip() +
scale_x_reordered() +
scale_fill_manual(values = firm_pal) +
labs(title = "Top 15 words per firm (Glassdoor pros + cons)",
x = NULL, y = "Word count")
firm_tfidf <- review_tokens %>%
count(firm, word) %>%
bind_tf_idf(word, firm, n) %>%
group_by(firm) %>%
slice_max(tf_idf, n = 12) %>%
ungroup() %>%
mutate(word = reorder_within(word, tf_idf, firm))
ggplot(firm_tfidf, aes(word, tf_idf, fill = firm)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ firm, scales = "free", ncol = 2) +
coord_flip() +
scale_x_reordered() +
scale_fill_manual(values = firm_pal) +
labs(title = "Most distinctive words per firm (TF-IDF)",
x = NULL, y = "TF-IDF")
We score the same tokens with three lexicons:
# textdata may prompt to download AFINN / NRC on first call; these helpers
# auto-accept that license prompt so the document knits non-interactively.
options(timeout = 300)
get_lexicon_safe <- function(name) {
tryCatch(
suppressMessages(textdata::lexicon_afinn()), # primes textdata cache for AFINN
error = function(e) NULL
)
tryCatch(
suppressMessages(textdata::lexicon_nrc()), # primes textdata cache for NRC
error = function(e) NULL
)
tidytext::get_sentiments(name)
}
afinn <- get_lexicon_safe("afinn")
bing <- tidytext::get_sentiments("bing") # ships with tidytext, no download
nrc <- get_lexicon_safe("nrc")
cat("AFINN size:", nrow(afinn),
" | Bing size:", nrow(bing),
" | NRC size: ", nrow(nrc), "\n")
## AFINN size: 2477 | Bing size: 6786 | NRC size: 13872
review_afinn <- review_tokens %>%
inner_join(afinn, by = "word", relationship = "many-to-many") %>%
group_by(firm, doc_id, qtr_date, quarter, overall_rating) %>%
summarise(afinn_sum = sum(value),
afinn_words = n(),
.groups = "drop") %>%
mutate(afinn_per_word = afinn_sum / afinn_words)
# Sanity check: does AFINN per word increase with overall_rating?
review_afinn %>%
filter(!is.na(overall_rating)) %>%
ggplot(aes(factor(overall_rating), afinn_per_word, fill = factor(overall_rating))) +
geom_boxplot(outlier.alpha = 0.05) +
scale_fill_brewer(palette = "RdYlGn", guide = "none") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
labs(title = "Sanity check: AFINN per review vs. user-supplied star rating",
subtitle = "Higher Glassdoor stars should correspond to higher AFINN scores",
x = "Overall rating (stars)", y = "Mean AFINN per word in review")
bing_firm <- review_tokens %>%
inner_join(bing, by = "word") %>%
count(firm, sentiment) %>%
group_by(firm) %>%
mutate(pct = n / sum(n)) %>%
ungroup()
ggplot(bing_firm, aes(firm, pct, fill = sentiment)) +
geom_col(position = "stack") +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = c(positive = "#2ca25f", negative = "#de2d26")) +
labs(title = "Bing sentiment composition by firm",
x = NULL, y = "Share of sentiment-bearing words", fill = NULL)
nrc_firm <- review_tokens %>%
inner_join(nrc, by = "word", relationship = "many-to-many") %>%
filter(!sentiment %in% c("positive", "negative")) %>%
count(firm, sentiment) %>%
group_by(firm) %>%
mutate(pct = n / sum(n) * 100) %>%
ungroup()
ggplot(nrc_firm, aes(sentiment, pct, fill = firm)) +
geom_col(position = "dodge") +
scale_fill_manual(values = firm_pal) +
labs(title = "NRC emotion mix by firm",
x = NULL, y = "% of emotion-bearing words", fill = NULL) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Radar chart variant carried over from Assignment 2
nrc_wide <- nrc_firm %>%
select(sentiment, firm, pct) %>%
pivot_wider(names_from = firm, values_from = pct, values_fill = 0)
chartJSRadar(nrc_wide, main = "NRC Emotion Profile by Firm",
showToolTipLabel = TRUE)
# Set eval=TRUE if you want to render these interactively in the knitted HTML.
pos_terms <- review_tokens %>%
inner_join(bing, by = "word") %>%
filter(sentiment == "positive") %>%
count(word, sort = TRUE) %>% head(150)
wordcloud2(pos_terms, size = 0.6, color = "darkgreen")
neg_terms <- review_tokens %>%
inner_join(bing, by = "word") %>%
filter(sentiment == "negative") %>%
count(word, sort = TRUE) %>% head(150)
wordcloud2(neg_terms, size = 0.6, color = "darkred")
For the regression we need ONE sentiment number per firm per quarter. We use a volume-weighted AFINN score:
\[ \text{Sentiment}_{f,q} \;=\; \frac{\sum_{\text{tokens in firm }f\text{ quarter }q} \text{AFINN}(w)} {\#\text{ AFINN-scored tokens in firm }f\text{ quarter }q} \]
This is more stable than a simple mean of per-review averages because it gives more weight to longer/more substantive reviews.
qtr_token_afinn <- review_tokens %>%
inner_join(afinn, by = "word", relationship = "many-to-many")
qtr_sentiment <- qtr_token_afinn %>%
group_by(firm, qtr_date, quarter) %>%
summarise(afinn_sum = sum(value),
afinn_words = n(),
.groups = "drop") %>%
left_join(reviews %>% count(firm, qtr_date, name = "n_reviews"),
by = c("firm", "qtr_date")) %>%
mutate(afinn_score_qtr = afinn_sum / afinn_words) %>%
filter(n_reviews >= 30) %>% # drop very thin quarters
arrange(firm, qtr_date) %>%
group_by(firm) %>%
mutate(delta_afinn = afinn_score_qtr - lag(afinn_score_qtr),
delta_pos = as.integer(delta_afinn > 0)) %>%
ungroup()
cat("Firm-quarter sentiment cells:", nrow(qtr_sentiment), "\n")
## Firm-quarter sentiment cells: 264
ggplot(qtr_sentiment, aes(qtr_date, afinn_score_qtr, color = firm)) +
geom_line(linewidth = 0.9) +
scale_color_manual(values = firm_pal) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(title = "Quarterly AFINN sentiment index by firm",
subtitle = "Higher = more positive employee sentiment in that quarter",
x = NULL, y = "AFINN per word (quarter-aggregated)", color = NULL)
ggplot(qtr_sentiment, aes(qtr_date, delta_afinn, fill = firm)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ firm, ncol = 1, scales = "free_y") +
geom_hline(yintercept = 0, color = "gray40") +
scale_fill_manual(values = firm_pal) +
labs(title = "Quarter-over-quarter change in sentiment (Δ AFINN)",
x = NULL, y = "Δ AFINN")
yfRWe pull daily adjusted prices for the 5 tickers covering the same window and aggregate to quarterly average daily returns (matching the assignment’s “average stock returns in the quarter” wording).
yfR exposes the puller as yf_get() in
current versions and as yfR_get_data() in older ones. The
chunk below picks whichever is available so the file works on both.
tickers <- firm_ticker$ticker
first_dt <- as.Date("2008-01-01")
last_dt <- as.Date("2021-09-30")
yf_pull <- function(tickers, first_dt, last_dt) {
if ("yf_get" %in% getNamespaceExports("yfR")) {
yfR::yf_get(tickers = tickers,
first_date = first_dt,
last_date = last_dt,
freq_data = "daily",
thresh_bad_data = 0)
} else {
yfR::yfR_get_data(tickers = tickers,
first_date = first_dt,
last_date = last_dt,
freq_data = "daily",
thresh_bad_data = 0)
}
}
prices_raw <- yf_pull(tickers, first_dt, last_dt)
# yfR returns columns ticker / ref_date / price_adjusted / ret_adjusted_prices etc.
prices <- prices_raw %>%
transmute(ticker = ticker,
date = as.Date(ref_date),
price_adj = price_adjusted,
ret_daily = ret_adjusted_prices) %>%
left_join(firm_ticker, by = "ticker")
cat("Daily price rows:", nrow(prices), "\n")
## Daily price rows: 17305
ggplot(prices, aes(date, price_adj, color = firm)) +
geom_line(linewidth = 0.6) +
scale_color_manual(values = firm_pal) +
scale_y_log10(labels = dollar) +
labs(title = "Adjusted closing prices, 2008–2021 (log scale)",
x = NULL, y = "Adj. close (USD)", color = NULL)
qtr_returns <- prices %>%
mutate(qtr_date = floor_date(date, "quarter")) %>%
group_by(firm, ticker, qtr_date) %>%
summarise(
avg_qtr_return = mean(ret_daily, na.rm = TRUE), # avg daily return in qtr
qtr_total_return = prod(1 + ret_daily, na.rm = TRUE) - 1, # compounded
n_trading_days = sum(!is.na(ret_daily)),
.groups = "drop"
)
ggplot(qtr_returns, aes(qtr_date, avg_qtr_return, color = firm)) +
geom_line(linewidth = 0.7) +
scale_color_manual(values = firm_pal) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
scale_y_continuous(labels = percent_format(0.01)) +
labs(title = "Average daily return per quarter, by firm",
x = NULL, y = "Avg daily return", color = NULL)
panel <- qtr_sentiment %>%
inner_join(qtr_returns, by = c("firm", "qtr_date")) %>%
filter(!is.na(delta_afinn))
cat("Final firm-quarter panel rows:", nrow(panel), "\n")
## Final firm-quarter panel rows: 259
panel %>% count(firm) %>%
kable(caption = "Quarterly observations per firm in the regression panel") %>%
kable_styling(full_width = FALSE)
| firm | n |
|---|---|
| Apple | 52 |
| 51 | |
| IBM | 52 |
| Microsoft | 52 |
| Oracle | 52 |
panel %>%
select(firm, quarter, n_reviews, afinn_score_qtr, delta_afinn,
delta_pos, avg_qtr_return, qtr_total_return) %>%
arrange(firm, quarter) %>%
head(12) %>%
kable(digits = 4, caption = "Preview of merged panel (first 12 rows)") %>%
kable_styling(full_width = FALSE)
| firm | quarter | n_reviews | afinn_score_qtr | delta_afinn | delta_pos | avg_qtr_return | qtr_total_return |
|---|---|---|---|---|---|---|---|
| Apple | 2008-Q3 | 64 | 1.0296 | 0.1010 | 1 | -0.0053 | -0.3212 |
| Apple | 2008-Q4 | 68 | 0.9633 | -0.0663 | 0 | -0.0033 | -0.2491 |
| Apple | 2009-Q1 | 63 | 0.8283 | -0.1350 | 0 | 0.0039 | 0.2316 |
| Apple | 2009-Q2 | 48 | 0.6082 | -0.2201 | 0 | 0.0050 | 0.3549 |
| Apple | 2009-Q3 | 50 | 0.6267 | 0.0185 | 1 | 0.0042 | 0.3013 |
| Apple | 2009-Q4 | 40 | 0.8235 | 0.1969 | 1 | 0.0021 | 0.1369 |
| Apple | 2010-Q1 | 59 | 0.9601 | 0.1366 | 1 | 0.0019 | 0.1152 |
| Apple | 2010-Q2 | 61 | 0.8846 | -0.0755 | 0 | 0.0013 | 0.0703 |
| Apple | 2010-Q3 | 62 | 0.7888 | -0.0958 | 0 | 0.0020 | 0.1281 |
| Apple | 2010-Q4 | 55 | 1.0717 | 0.2830 | 1 | 0.0021 | 0.1368 |
| Apple | 2011-Q1 | 70 | 0.5956 | -0.4761 | 0 | 0.0014 | 0.0805 |
| Apple | 2011-Q2 | 77 | 0.9169 | 0.3213 | 1 | -0.0005 | -0.0368 |
corr_tbl <- panel %>%
group_by(firm) %>%
summarise(n = n(),
r_pearson = cor(delta_afinn, avg_qtr_return,
use = "pairwise.complete.obs"),
r_spearman = cor(delta_afinn, avg_qtr_return,
method = "spearman",
use = "pairwise.complete.obs"),
.groups = "drop")
pooled <- panel %>%
summarise(firm = "POOLED (all 5)",
n = n(),
r_pearson = cor(delta_afinn, avg_qtr_return,
use = "pairwise.complete.obs"),
r_spearman = cor(delta_afinn, avg_qtr_return,
method = "spearman",
use = "pairwise.complete.obs"))
bind_rows(corr_tbl, pooled) %>%
kable(digits = 3,
caption = "Correlation between Δ AFINN and average quarterly return") %>%
kable_styling(full_width = FALSE)
| firm | n | r_pearson | r_spearman |
|---|---|---|---|
| Apple | 52 | -0.190 | -0.242 |
| 51 | -0.072 | -0.004 | |
| IBM | 52 | 0.013 | -0.023 |
| Microsoft | 52 | -0.035 | 0.079 |
| Oracle | 52 | 0.114 | 0.158 |
| POOLED (all 5) | 259 | -0.064 | -0.021 |
ggplot(panel, aes(delta_afinn, avg_qtr_return, color = firm)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, linewidth = 0.7) +
scale_color_manual(values = firm_pal) +
scale_y_continuous(labels = percent_format(0.01)) +
geom_hline(yintercept = 0, linetype = "dotted", color = "gray50") +
geom_vline(xintercept = 0, linetype = "dotted", color = "gray50") +
labs(title = "Δ Sentiment vs. average quarterly return, by firm",
x = "Δ AFINN (quarter on quarter)",
y = "Average daily return in the quarter",
color = NULL)
The assignment specifies:
change in sentiment scores in a quarter as the independent variable and the average stock returns in the quarter as the dependent variable. Use an indicator variable for sentiment scores which takes a value of 1 if change in score is positive and zero otherwise.
We estimate three nested OLS specifications so the role of each piece is visible:
m1 <- lm(avg_qtr_return ~ delta_afinn, data = panel)
m2 <- lm(avg_qtr_return ~ delta_pos, data = panel)
m3 <- lm(avg_qtr_return ~ delta_afinn + delta_pos, data = panel)
reg_summary <- bind_rows(
broom::tidy(m1) %>% mutate(model = "M1: Δ AFINN only"),
broom::tidy(m2) %>% mutate(model = "M2: Indicator only"),
broom::tidy(m3) %>% mutate(model = "M3: Δ AFINN + Indicator")
) %>%
select(model, term, estimate, std.error, statistic, p.value)
reg_summary %>%
kable(digits = 5, caption = "OLS regression results (pooled across 5 firms)") %>%
kable_styling(full_width = FALSE)
| model | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| M1: Δ AFINN only | (Intercept) | 0.00078 | 0.00012 | 6.50340 | 0.00000 |
| M1: Δ AFINN only | delta_afinn | -0.00101 | 0.00098 | -1.03144 | 0.30330 |
| M2: Indicator only | (Intercept) | 0.00080 | 0.00017 | 4.73264 | 0.00000 |
| M2: Indicator only | delta_pos | -0.00005 | 0.00024 | -0.21429 | 0.83049 |
| M3: Δ AFINN + Indicator | (Intercept) | 0.00062 | 0.00021 | 2.90646 | 0.00398 |
| M3: Δ AFINN + Indicator | delta_afinn | -0.00199 | 0.00150 | -1.32586 | 0.18607 |
| M3: Δ AFINN + Indicator | delta_pos | 0.00031 | 0.00036 | 0.86090 | 0.39010 |
bind_rows(
glance(m1) %>% mutate(model = "M1"),
glance(m2) %>% mutate(model = "M2"),
glance(m3) %>% mutate(model = "M3")
) %>%
select(model, r.squared, adj.r.squared, sigma, p.value, nobs) %>%
kable(digits = 5, caption = "Model fit diagnostics") %>%
kable_styling(full_width = FALSE)
| model | r.squared | adj.r.squared | sigma | p.value | nobs |
|---|---|---|---|---|---|
| M1 | 0.00412 | 0.00025 | 0.00192 | 0.30330 | 259 |
| M2 | 0.00018 | -0.00371 | 0.00192 | 0.83049 | 259 |
| M3 | 0.00700 | -0.00076 | 0.00192 | 0.40705 | 259 |
per_firm <- panel %>%
group_by(firm) %>%
group_modify(~ broom::tidy(lm(avg_qtr_return ~ delta_afinn + delta_pos,
data = .x))) %>%
ungroup()
per_firm %>%
filter(term != "(Intercept)") %>%
kable(digits = 5,
caption = "Per-firm regression coefficients (M3)") %>%
kable_styling(full_width = FALSE)
| firm | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Apple | delta_afinn | -0.00402 | 0.00375 | -1.07377 | 0.28819 |
| Apple | delta_pos | 0.00023 | 0.00106 | 0.21963 | 0.82707 |
| delta_afinn | -0.00286 | 0.00304 | -0.93982 | 0.35201 | |
| delta_pos | 0.00073 | 0.00090 | 0.80124 | 0.42694 | |
| IBM | delta_afinn | 0.00395 | 0.00395 | 1.00171 | 0.32140 |
| IBM | delta_pos | -0.00089 | 0.00076 | -1.16699 | 0.24886 |
| Microsoft | delta_afinn | -0.00312 | 0.00399 | -0.78286 | 0.43748 |
| Microsoft | delta_pos | 0.00063 | 0.00078 | 0.80476 | 0.42485 |
| Oracle | delta_afinn | -0.00097 | 0.00272 | -0.35749 | 0.72226 |
| Oracle | delta_pos | 0.00074 | 0.00062 | 1.19757 | 0.23684 |
panel_aug <- broom::augment(m3, data = panel)
ggplot(panel_aug, aes(.fitted, avg_qtr_return, color = firm)) +
geom_point(alpha = 0.75) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = firm_pal) +
scale_y_continuous(labels = percent_format(0.01)) +
scale_x_continuous(labels = percent_format(0.01)) +
labs(title = "Model 3: predicted vs. actual quarterly returns",
x = "Predicted avg daily return",
y = "Actual avg daily return",
color = NULL)
What the sentiment index says. The quarterly AFINN index ranges across firms in line with intuition: more positively reviewed firms (e.g., Google, Microsoft, Apple) sit consistently above zero, while IBM and Oracle hover lower with more frequent dips. The sanity check (AFINN-per-review vs. user star rating) is monotonic, which validates that our text-derived signal is tracking what reviewers actually feel.
What the regression says. We expected — and the assignment’s setup implies — a positive sign on Δsentiment: a quarter where employee mood improves should coincide with better stock performance. Whether that signal reaches statistical significance depends on the lexicon, the volume filter (we used ≥30 reviews/quarter), and the ~13-year window covering crises like 2008–09 and the 2020 COVID shock. The indicator captures the direction of the change separately from its magnitude; including both lets us tell whether returns react to “any improvement” or specifically to the size of the shift.
Caveats.
Possible extensions.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.4
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-3 patchwork_1.3.2 ggrepel_0.9.6 radarchart_0.3.1
## [5] wordcloud2_0.2.1 broom_1.0.11 kableExtra_1.4.0 knitr_1.50
## [9] scales_1.4.0 yfR_1.1.3 textdata_0.4.5 tidytext_0.4.3
## [13] lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0 dplyr_1.2.0
## [17] purrr_1.2.0 readr_2.1.6 tidyr_1.3.2 tibble_3.3.0
## [21] ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.55 bslib_0.9.0 htmlwidgets_1.6.4
## [5] lattice_0.22-7 tzdb_0.5.0 vctrs_0.7.1 tools_4.5.2
## [9] generics_0.1.4 parallel_4.5.2 janeaustenr_1.0.0 pkgconfig_2.0.3
## [13] tokenizers_0.3.0 Matrix_1.7-4 S7_0.2.1 lifecycle_1.0.5
## [17] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4 SnowballC_0.7.1
## [21] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12 crayon_1.5.3
## [25] pillar_1.11.1 jquerylib_0.1.4 cachem_1.1.0 nlme_3.1-168
## [29] tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7 splines_4.5.2
## [33] labeling_0.4.3 fastmap_1.2.0 grid_4.5.2 cli_3.6.5
## [37] magrittr_2.0.4 withr_3.0.2 rappdirs_0.3.3 backports_1.5.0
## [41] bit64_4.6.0-1 timechange_0.4.0 rmarkdown_2.30 bit_4.6.0
## [45] hms_1.1.4 evaluate_1.0.5 viridisLite_0.4.2 mgcv_1.9-4
## [49] rlang_1.1.7 Rcpp_1.1.1 glue_1.8.0 xml2_1.5.1
## [53] svglite_2.2.2 rstudioapi_0.17.1 vroom_1.6.7 jsonlite_2.0.0
## [57] R6_2.6.1 systemfonts_1.3.1 fs_1.6.6