Overview

Here’s a quick - and possibly not very informative - demo of what could be done with raw Brandwatch data capturing social media posts by FloRacing.

A query covering all FloRacing posts from Dec. 1, 2024 through yesterday returned the full text and metadata for 12,873 posts across several platforms:

[Interactive table: post counts by page type, generated in section 5 of the code below.]

Bigram frequency analysis showed that “late model” and “sprint” - references, I gather, to different categories of dirt-track racing - were prominent themes. So, I decided to see whether one category got mentioned more than the other. A difference might be notable in the context of agenda-setting theory, which, among other things, looks at how different issues or issue attributes compete for media attention. Here’s a week-by-week breakdown of mentions for each category:

[Interactive chart: weekly mention counts for "late model" and "sprint," generated in section 10 of the code below.]

Looks like “late models” might have gotten more attention than “sprints.” A paired-samples t-test can tell us whether the difference is statistically reliable:

Descriptive Statistics: Pair Differences

count    mean     sd       min       max
49.000   -8.673   19.803   -61.000   47.000

Normality Test (Shapiro-Wilk)

statistic   p.value   method
0.9672      0.1874    Shapiro-Wilk normality test

Paired-Samples t-Test

estimate     statistic   p.value   parameter   conf.low   conf.high   method          alternative
-8.673469    -3.0660     0.0036    48          -14.3614   -2.9855     Paired t-test   two.sided

Because the pair differences were computed as “sprint” mentions minus “late model” mentions, the significantly negative mean difference, t(48) = -3.07, p = .004, indicates that “late model” content did, on average, draw more weekly mentions than “sprint” content. The non-significant Shapiro-Wilk result (p = .19) suggests the differences are consistent with normality, so the paired t-test is appropriate here.

The code:

# ============================================================
# 1. INSTALL AND LOAD REQUIRED PACKAGES
# ============================================================

if (!require(tidyverse)) install.packages("tidyverse")
library(tidyverse)

if (!require(DT)) install.packages("DT")
library(DT)

if (!require(tidytext)) install.packages("tidytext")
library(tidytext)

if (!require(plotly)) install.packages("plotly")
library(plotly)

if (!require(gt)) install.packages("gt")
library(gt)

if (!require(gtExtras)) install.packages("gtExtras")
library(gtExtras)

if (!require(broom)) install.packages("broom")
library(broom)

# ============================================================
# 2. DATA IMPORT AND INITIAL CLEANING
# ============================================================

# Import CSV file
mydata <- read_csv(
  "FloRacing on X.csv",
  skip = 6,
  col_types = cols("Thread Id" = col_character())
)
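
# Illustrative sanity check (not part of the original analysis): the
# write-up reports 12,873 posts, so the row count of the import should
# match that figure for the same export.
nrow(mydata)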

# Standardize column names
names(mydata) <- make.names(names(mydata))

# Keep relevant columns
KeptData <- mydata %>%
  select(
    Date,
    Full.Name,
    Author,
    Url,
    Page.Type,
    Full.Text,
    Expanded.URLs,
    Thread.Id,
    Impressions,
    X.Replies,
    X.Reposts,
    X.Likes,
    Engagement.Type,
    Hashtags,
    Engagement.Score
  )

# Convert Date to POSIXct
KeptData$Date <- as.POSIXct(KeptData$Date, tz = "America/Chicago")

# Sort by Date
KeptData <- arrange(KeptData, Date)

# ============================================================
# 3. ADD WEEK VARIABLE
# ============================================================

KeptData <- KeptData %>%
  mutate(
    # Floor to previous Sunday
    WeekStart = as.Date(Date) - as.integer(format(as.Date(Date), "%w")),
    # Week number since earliest week
    Week = as.integer(difftime(WeekStart, min(WeekStart), units = "weeks")) + 1
  )
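
# Illustrative sanity check (not part of the original analysis): every
# WeekStart should fall on a Sunday, i.e. "%w" == "0" for all rows.
stopifnot(all(format(KeptData$WeekStart, "%w") == "0"))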

# ============================================================
# 4. SAVE CLEANED DATA
# ============================================================

saveRDS(KeptData, file = "FloRacingData.RDS")

# ============================================================
# 5. POST COUNTS BY CHANNEL
# ============================================================

PostsByChannel <- KeptData %>% 
  group_by(Page.Type) %>% 
  summarise(
    n = n(),
    pct = paste0(round(n / nrow(KeptData) * 100, 1), "%")
  )

# Display as interactive table
Channels <- datatable(
  PostsByChannel,
  caption = "Posts by Page Type",
  options = list(pageLength = 10)
)

Channels

# ============================================================
# 6. WORD FREQUENCY ANALYSIS
# ============================================================

# Cleaning regular expressions:
# - replace_reg strips shortened URLs, HTML entities, and "RT" markers
# - unnest_reg splits text on anything that isn't a word character, #, @,
#   or an in-word apostrophe (preserving hashtags, handles, contractions)
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

tidy_tweets <- KeptData %>% 
  mutate(text = str_replace_all(Full.Text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"))

WordFrequency <- tidy_tweets %>% 
  count(Page.Type, word, sort = TRUE) %>% 
  left_join(tidy_tweets %>% count(Page.Type, name = "total"),
            by = "Page.Type") %>%
  mutate(freq = n / total)
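
# Illustrative peek (not part of the original analysis): the most
# frequent words per channel.
WordFrequency %>%
  group_by(Page.Type) %>%
  slice_max(n, n = 10) %>%
  ungroup()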

# ============================================================
# 7. BIGRAM FREQUENCY ANALYSIS
# ============================================================

tidy_bigrams <- KeptData %>% 
  filter(!str_detect(Full.Text, "^RT")) %>%
  mutate(text = str_replace_all(Full.Text, replace_reg, "")) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    str_detect(word1, "[a-z]"),
    str_detect(word2, "[a-z]")
  ) %>%
  unite(bigram, word1, word2, sep = " ")

BigramFrequency <- tidy_bigrams %>% 
  count(Page.Type, bigram, sort = TRUE) %>% 
  # Totals come from the bigram counts (not word counts), so freq is a
  # proportion of all bigrams on that channel
  left_join(tidy_bigrams %>% count(Page.Type, name = "total"),
            by = "Page.Type") %>%
  mutate(freq = n / total)
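
# Illustrative peek (not part of the original analysis): the top bigrams,
# which surfaced the "late model" and "sprint" themes discussed above.
BigramFrequency %>%
  group_by(Page.Type) %>%
  slice_max(n, n = 10) %>%
  ungroup()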

# ============================================================
# 8. FLAG TOPICS (AGENDA-SETTING)
# ============================================================

TopicData <- KeptData

# Topic1: Sprint Car
phrases1 <- c("sprint car", "sprint cars", "sprint", "national sprint", "sprints", "sprintcar")
escaped_phrases1 <- str_replace_all(phrases1, "([\\^$.|?*+()\\[\\]{}\\\\])", "\\\\\\1")
pattern1 <- paste0("\\b", escaped_phrases1, "\\b", collapse = "|")

TopicData <- TopicData %>%
  mutate(
    Full.Text.clean = str_squish(Full.Text),
    Topic1 = if_else(str_detect(Full.Text.clean, regex(pattern1, ignore_case = TRUE)), "Yes", "No")
  )

# Topic2: Late Model
phrases2 <- c("late model", "late models", "pro late", "dirt late", "super late", "late")
escaped_phrases2 <- str_replace_all(phrases2, "([\\^$.|?*+()\\[\\]{}\\\\])", "\\\\\\1")
pattern2 <- paste0("\\b", escaped_phrases2, "\\b", collapse = "|")

TopicData <- TopicData %>%
  mutate(
    Full.Text.clean = str_squish(Full.Text),
    Topic2 = if_else(str_detect(Full.Text.clean, regex(pattern2, ignore_case = TRUE)), "Yes", "No")
  )
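
# Illustrative cross-tab (not part of the original analysis): how often
# the two topic flags co-occur in the same post.
table(Sprint = TopicData$Topic1, LateModel = TopicData$Topic2)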

# ============================================================
# 9. WEEKLY COUNTS OF TOPICS
# ============================================================

# Summarize weekly counts
Topic1_weekly <- TopicData %>%
  filter(Topic1 == "Yes") %>%
  group_by(Week) %>%
  summarize(Count = n(), .groups = "drop") %>%
  mutate(Topic = "Sprint")   # legend label for Topic1

Topic2_weekly <- TopicData %>%
  filter(Topic2 == "Yes") %>%
  group_by(Week) %>%
  summarize(Count = n(), .groups = "drop") %>%
  mutate(Topic = "Late")   # legend label for Topic2

Weekly_counts <- bind_rows(Topic1_weekly, Topic2_weekly) %>%
  tidyr::complete(
    Topic,
    Week = full_seq(range(Week), 1),
    fill = list(Count = 0)
  ) %>%
  arrange(Topic, Week)

# ============================================================
# 10. INTERACTIVE PLOTLY LINE CHART
# ============================================================

AS1 <- plot_ly(
  data = Weekly_counts,
  x = ~Week,
  y = ~Count,
  color = ~Topic,
  colors = c("steelblue", "firebrick"),
  type = "scatter",
  mode = "lines+markers",
  line = list(width = 2),
  marker = list(size = 6)
) %>%
  layout(
    title = "Weekly Counts of Topic1- and Topic2-Related AP News Articles",
    xaxis = list(title = "Week Number (starting with Week 1)", dtick = 1),
    yaxis = list(title = "Number of Articles"),
    legend = list(title = list(text = "Topic")),
    hovermode = "x unified"
  )

AS1

# ============================================================
# 11. ADDITIONAL ANALYSIS AND STATISTICS (HISTOGRAMS, TESTS)
# ============================================================

# Pivot data for repeated-measures analysis
df_long <- Weekly_counts %>%
  pivot_wider(names_from = Topic, values_from = Count, values_fill = 0) %>%
  pivot_longer(cols = c(Late, Sprint), names_to = "Measure", values_to = "Value")

group_means <- df_long %>%
  group_by(Measure) %>%
  summarise(mean_value = mean(Value), .groups = "drop")

# Interactive boxplot with means
boxplot_measures <- plot_ly() %>%
  add_trace(
    data = df_long,
    x = ~Measure, y = ~Value,
    type = "box",
    boxpoints = "outliers",
    marker = list(color = "red", size = 4),
    line = list(color = "black"),
    fillcolor = "royalblue",
    name = ""
  ) %>%
  add_trace(
    data = group_means,
    x = ~Measure, y = ~mean_value,
    type = "scatter", mode = "markers",
    marker = list(symbol = "diamond", size = 9, color = "black",
                  line = list(color = "white", width = 1)),
    text = ~paste0("Mean = ", round(mean_value, 2)),
    hoverinfo = "text",
    name = "Group Mean"
  ) %>%
  layout(
    title = "Boxplot of Weekly Counts (Late vs Sprint) with Means",
    xaxis = list(title = "Measure"),
    yaxis = list(title = "Value"),
    showlegend = FALSE
  )

boxplot_measures

# ============================================================
# 12. PAIR DIFFERENCE AND STATISTICAL TESTS
# ============================================================

# Compute pair differences
df_wide <- df_long %>%
  pivot_wider(names_from = Measure, values_from = Value)

df_wide$PairDifferences <- df_wide$Sprint - df_wide$Late

# Histogram
hist_plot <- plot_ly(
  data = df_wide,
  x = ~PairDifferences,
  type = "histogram",
  marker = list(color = "#1f78b4", line = list(color = "black", width = 1))
) %>%
  layout(
    title = "Distribution of Pair Differences",
    xaxis = list(title = "Pair Differences"),
    yaxis = list(title = "Count")
  )

hist_plot

# Descriptive statistics
desc_stats <- df_wide %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )

desc_table <- desc_stats %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Descriptive Statistics: Pair Differences") %>%
  fmt_number(columns = where(is.numeric), decimals = 3)

desc_table

# Shapiro-Wilk test for normality
shapiro_res <- shapiro.test(df_wide$PairDifferences)
shapiro_table <- tidy(shapiro_res) %>%
  select(statistic, p.value, method) %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Normality Test (Shapiro-Wilk)") %>%
  fmt_number(columns = c(statistic, p.value), decimals = 4)

shapiro_table

# Paired t-test
t_res <- t.test(df_wide$Sprint, df_wide$Late, paired = TRUE)
t_table <- tidy(t_res) %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Paired-Samples t-Test") %>%
  fmt_number(columns = c(statistic, p.value, conf.low, conf.high), decimals = 4)

t_table
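
# Illustrative effect size (not part of the original analysis): Cohen's d
# for paired samples is the mean of the pair differences divided by their
# standard deviation.
cohens_d <- mean(df_wide$PairDifferences, na.rm = TRUE) /
  sd(df_wide$PairDifferences, na.rm = TRUE)
cohens_d  # roughly -0.44 given the descriptives above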

# Wilcoxon signed-rank test
# Order matches the t-test above (Sprint vs. Late) so signs are comparable
wilcox_res <- wilcox.test(df_wide$Sprint, df_wide$Late, paired = TRUE, exact = FALSE)
wilcox_table <- tidy(wilcox_res) %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Wilcoxon Signed Rank Test") %>%
  fmt_number(columns = c(statistic, p.value), decimals = 4)

wilcox_table