Liveo Research: focused correspondence analysis

Author

Liveo Research

1 Purpose

This report shows a clean, reader-friendly workflow for analyzing one sustained email correspondence. The focus is to surface tone, cadence, and thematic signals that are easy to interpret for marketing analysis.

2 Notes and scope

  • We focus on single-recipient emails to avoid group thread noise.
  • A “correspondence” is defined as two people exchanging at least 20 emails over more than 5 days.
  • Forwarded content and email addresses are scrubbed from the body text before analysis.
  • Times are normalized to UTC for consistent ordering.
  • Emails older than 1995 are excluded.
  • Plots show a midpoint window of +/- 5 days.

3 Global variables

Show code
data_root <- "data"
output_root <- "outputs"

min_emails <- 20
min_span_days <- 5
max_terms <- 15
plot_window_days <- 5
min_date <- as.POSIXct("1995-01-01", tz = "UTC")

brand_ink <- "#2f2f2f"
brand_teal <- "#1f4e5f"
brand_clay <- "#8c3b2a"
brand_sand <- "#e5d5b3"
brand_olive <- "#8a9a5b"

4 Load packages

Show code
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(tidytext)
library(igraph)
library(ggraph)

theme_set(theme_minimal(base_size = 12))
theme_update(panel.grid.minor = element_blank(), plot.title.position = "plot")

5 Helper functions

Show code
extract_header_field <- function(header, field) {
  pattern <- paste0("(?m)^", field, ":\\s*.*$")
  value <- str_extract(header, pattern)
  str_remove(value, paste0("(?m)^", field, ":\\s*"))
}

clean_body <- function(text) {
  text %>%
    str_replace_all("\\r", "") %>%
    str_replace("(?s)-----Original Message-----.*$", " FORWARDED_MESSAGE ") %>%
    str_replace("(?s)----- Forwarded by.*$", " FORWARDED_MESSAGE ") %>%
    str_replace_all("\\S+@\\S+", "EMAIL_ADDRESS") %>%
    str_replace_all("[\\n\\t]+", " ") %>%
    str_squish()
}

6 Data ingest and parsing

We parse the header fields, clean the body text, and keep only the metadata needed for correspondence analysis.

Show code
email_path <- file.path(data_root, "emails.csv")
if (!file.exists(email_path)) {
  stop("Missing input file: ", email_path)
}

emails_raw <- read_csv(email_path, show_col_types = FALSE)
required_cols <- c("file", "message")
missing_cols <- setdiff(required_cols, names(emails_raw))
if (length(missing_cols) > 0) {
  stop("Missing columns: ", paste(missing_cols, collapse = ", "))
}

emails_parsed <- emails_raw %>%
  mutate(message = str_replace_all(message, "\\r", "")) %>%
  mutate(
    header_end = str_locate(message, "\\n\\n")[, 1],
    header = if_else(!is.na(header_end), str_sub(message, 1, header_end - 1), ""),
    body_raw = if_else(!is.na(header_end), str_sub(message, header_end + 2), message),
    body = clean_body(body_raw),
    date_raw = extract_header_field(header, "Date"),
    from = str_to_lower(extract_header_field(header, "From")),
    to = str_to_lower(extract_header_field(header, "To")),
    subject = extract_header_field(header, "Subject"),
    date_raw = str_replace(date_raw, "\\s*\\(.*\\)$", "")
  ) %>%
  mutate(
    date = parse_date_time(
      date_raw,
      orders = c("a, d b Y H:M:S z", "a, d b Y H:M z", "d b Y H:M:S z", "d b Y H:M z"),
      tz = "UTC"
    )
  ) %>%
  select(file, date, from, to, subject, body) %>%
  filter(!is.na(date), date >= min_date)

7 Select one sustained correspondence

We keep single-recipient emails, build bidirectional pairs, and select the top pair that meets the threshold.

Show code
emails_clean <- emails_parsed %>%
  filter(!is.na(date), !is.na(from), !is.na(to)) %>%
  mutate(to_count = str_count(to, ",|;") + 1) %>%
  filter(to_count == 1) %>%
  separate_rows(to, sep = "\\s*,\\s*|\\s*;\\s*") %>%
  mutate(to = str_trim(to)) %>%
  filter(to != "", from != "", from != to)

emails_pairs <- emails_clean %>%
  mutate(
    pair_id = if_else(
      from < to,
      paste(from, to, sep = " <-> "),
      paste(to, from, sep = " <-> ")
    )
  )

pair_stats <- emails_pairs %>%
  group_by(pair_id) %>%
  summarise(
    email_count = n(),
    start_date = min(date),
    end_date = max(date),
    span_days = as.numeric(difftime(end_date, start_date, units = "days")),
    .groups = "drop"
  ) %>%
  filter(email_count >= min_emails, span_days > min_span_days) %>%
  arrange(desc(email_count), desc(span_days))

if (nrow(pair_stats) == 0) {
  stop("No correspondence pair meets the thresholds. Adjust min_emails or min_span_days.")
}

selected_pair_id <- pair_stats$pair_id[2]
pair_members <- str_split(selected_pair_id, " <-> ", simplify = TRUE)

selected_pair <- emails_pairs %>%
  filter(pair_id == selected_pair_id) %>%
  arrange(date) %>%
  mutate(email_id = row_number())

pair_summary <- tibble(
  pair_id = selected_pair_id,
  participant_a = pair_members[1],
  participant_b = pair_members[2],
  email_count = nrow(selected_pair),
  start_date = min(selected_pair$date),
  end_date = max(selected_pair$date),
  span_days = as.numeric(difftime(max(selected_pair$date), min(selected_pair$date), units = "days"))
)

knitr::kable(pair_summary)
pair_id participant_a participant_b email_count start_date end_date span_days
all.worldwide@enron.com <-> enron.announcements@enron.com all.worldwide@enron.com enron.announcements@enron.com 2206 1999-08-18 17:54:00 2001-07-08 17:31:00 689.984

8 Descriptive signals

The descriptive layer helps explain cadence and responsiveness before we interpret tone.

Show code
selected_pair <- selected_pair %>%
  mutate(
    day = as.Date(date),
    direction = if_else(
      from == pair_members[1],
      paste(pair_members[1], "->", pair_members[2]),
      paste(pair_members[2], "->", pair_members[1])
    )
  )

midpoint_date <- min(selected_pair$date) + (max(selected_pair$date) - min(selected_pair$date)) / 2
window_start <- midpoint_date - days(plot_window_days)
window_end <- midpoint_date + days(plot_window_days)
selected_pair_window <- selected_pair %>%
  filter(date >= window_start, date <= window_end)

gap_hours <- selected_pair %>%
  arrange(date) %>%
  mutate(gap_hours = as.numeric(difftime(date, lag(date), units = "hours"))) %>%
  filter(!is.na(gap_hours))

span_days <- as.numeric(difftime(max(selected_pair$date), min(selected_pair$date), units = "days"))

descriptive_summary <- tibble(
  metric = c("Total emails", "Date span (days)", "Emails per day", "Median gap (hours)", "Mean gap (hours)"),
  value = c(
    nrow(selected_pair),
    round(span_days, 1),
    round(nrow(selected_pair) / max(span_days, 1), 2),
    round(median(gap_hours$gap_hours), 2),
    round(mean(gap_hours$gap_hours), 2)
  )
)

knitr::kable(descriptive_summary)
metric value
Total emails 2206.00
Date span (days) 690.00
Emails per day 3.20
Median gap (hours) 0.00
Mean gap (hours) 7.51
Show code
daily_volume <- selected_pair %>%
  count(day)

ggplot(daily_volume, aes(day, n)) +
  geom_col(fill = brand_teal) +
  labs(
    title = "Email volume over time",
    x = "Date",
    y = "Emails per day"
  )

9 Sentiment analysis

We score each email using the AFINN lexicon and use that to track tone shifts over time.

Show code
email_tokens <- selected_pair %>%
  select(email_id, body) %>%
  unnest_tokens(word, body) %>%
  filter(!str_detect(word, "^\\d+$"))

sentiment_scores <- email_tokens %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(email_id) %>%
  summarise(sentiment = sum(value), .groups = "drop")

selected_pair <- selected_pair %>%
  left_join(sentiment_scores, by = "email_id") %>%
  mutate(
    sentiment = replace_na(sentiment, 0),
    word_count = str_count(body, "\\S+"),
    weekday = wday(date, label = TRUE)
  )

selected_pair_window <- selected_pair %>%
  filter(date >= as.POSIXct("2000-01-01 00:00:00", tz = "UTC"))
Show code
ggplot(selected_pair_window, aes(date, sentiment, color = direction)) +
  geom_hline(yintercept = 0, color = "grey75") +
  geom_line(alpha = 0.4) +
  geom_point(size = 2) +
  scale_color_manual(values = c(brand_teal, brand_clay)) +
  labs(
    title = "Sentiment over time",
    x = "Date",
    y = "AFINN sentiment score",
    color = "Direction"
  )

Show code
ggplot(selected_pair, aes(weekday, sentiment, fill = direction)) +
  geom_boxplot(alpha = 0.75, outlier.size = 1) +
  scale_fill_manual(values = c(brand_teal, brand_clay)) +
  labs(
    title = "Sentiment by weekday",
    x = "Weekday",
    y = "AFINN sentiment score",
    fill = "Direction"
  )

These numbers are document-level sentiment sums for each email in selected_pair_window.

With AFINN, each matched word has an integer sentiment value:

  • negative words get negative values
  • positive words get positive values
  • neutral or unmatched words contribute nothing

Your code does this:

10 Semantic timeline graph

This view treats each email as a node on a time axis. Node size represents length (semantic load), and color reflects sentiment score.

Show code
graph_nodes <- selected_pair_window %>%
  arrange(date) %>%
  mutate(
    node_id = row_number(),
    name = as.character(node_id),
    x = as.numeric(date),
    y = 0,
    date_label = format(as.Date(date), "%Y-%m-%d")
  ) %>%
  select(name, everything())

graph_edges <- graph_nodes %>%
  mutate(next_name = lead(name)) %>%
  filter(!is.na(next_name)) %>%
  transmute(from = name, to = next_name)

graph <- igraph::graph_from_data_frame(
  graph_edges,
  vertices = graph_nodes,
  directed = TRUE
)

layout <- create_layout(
  graph,
  layout = "manual",
  x = graph_nodes$x,
  y = graph_nodes$y
)

ggraph(layout) +
  geom_edge_link(
    color = "grey70",
    alpha = 0.6,
    arrow = grid::arrow(length = grid::unit(2, "mm")),
    end_cap = ggraph::circle(2, "mm")
  ) +
  geom_node_point(aes(size = word_count, color = sentiment)) +
  geom_node_text(
    aes(label = date_label),
    nudge_y = 0.08,
    size = 2.8,
    check_overlap = TRUE
  ) +
  scale_color_gradient2(low = "#7f1d1d", mid = brand_sand, high = brand_teal) +
  scale_size(range = c(2, 8)) +
  scale_x_continuous(
    breaks = function(x) pretty(x, n = 5),
    labels = function(x) as.Date(x, origin = "1970-01-01")
  ) +
  labs(
    title = "Semantic timeline of the correspondence",
    x = "Time",
    y = NULL,
    color = "Sentiment",
    size = "Words"
  ) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

11 Key terms snapshot

This is a fast, marketing-friendly view of the terms that dominate the conversation.

Show code
top_terms <- email_tokens %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  slice_head(n = max_terms)

ggplot(top_terms, aes(reorder(word, n), n)) +
  geom_col(fill = brand_olive) +
  coord_flip() +
  labs(
    title = "Top terms in the selected correspondence",
    x = NULL,
    y = "Occurrences"
  )

12 Question detection (LLM placeholder)

The following block is a placeholder for a Python + LLM pipeline that will detect questions raised within the correspondence.

Show code
# Load the selected correspondence emails exported from R
# Normalize text (strip signatures, quoted replies, and forwarded blocks)
# Use a classifier or prompt-based LLM to detect explicit and implicit questions
# Tag each email with question_type, urgency, and topic
# Aggregate question frequency over time and by sender
# Return a summary table and a trend chart back to the report

13 Conclusion

This focused correspondence analysis shows how Liveo can translate raw email text into clear marketing signals: cadence, tone shifts, dominant themes or identifiy explicit and implicit questions. The workflow is compact, interpretable, and easy to extend with LLM-based question detection or richer semantic models when needed.