Here’s a quick - and possibly not very informative - demo of what could be done with raw Brandwatch data capturing social media posts by FloRacing.
A query covering all posts by FloRacing between Dec. 1, 2024, and yesterday returned the full text and metadata for 12,873 posts across several platforms:
Bigram frequency analysis showed that “late model” and “sprint” - references, I gather, to different categories of dirt-track racing - were prominent themes. So, I decided to see whether one category got mentioned more than the other. A difference might be notable in the context of agenda-setting theory, which, among other things, looks at how different issues or issue attributes compete for media attention. Here’s a week-by-week breakdown of mentions for each category:
Looks like “late models” might have gotten more attention than “sprints.” A paired-samples t-test can tell us whether the difference is statistically reliable:
**Descriptive Statistics: Pair Differences**

| count | mean | sd | min | max |
|---|---|---|---|---|
| 49.000 | −8.673 | 19.803 | −61.000 | 47.000 |
**Normality Test (Shapiro-Wilk)**

| statistic | p.value | method |
|---|---|---|
| 0.9672 | 0.1874 | Shapiro-Wilk normality test |
**Paired-Samples t-Test**

| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| −8.673469 | −3.0660 | 0.0036 | 48 | −14.3614 | −2.9855 | Paired t-test | two.sided |
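It can: on average, “late model” mentions outnumbered “sprint” mentions by about 8.7 per week, and a difference that large is unlikely to be chance, t(48) = −3.07, p = .004, 95% CI [−14.36, −2.99].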
# ============================================================
# 1. INSTALL AND LOAD REQUIRED PACKAGES
# ============================================================
if (!require(tidyverse)) install.packages("tidyverse")
library(tidyverse)
if (!require(DT)) install.packages("DT")
library(DT)
if (!require(tidytext)) install.packages("tidytext")
library(tidytext)
if (!require(plotly)) install.packages("plotly")
library(plotly)
if (!require(gt)) install.packages("gt")
library(gt)
if (!require(gtExtras)) install.packages("gtExtras")
library(gtExtras)
if (!require(broom)) install.packages("broom")
library(broom)
# ============================================================
# 2. DATA IMPORT AND INITIAL CLEANING
# ============================================================
# Import CSV file
mydata <- read_csv(
"FloRacing on X.csv",
skip = 6,
col_types = cols("Thread Id" = col_character())
)
# Standardize column names
names(mydata) <- make.names(names(mydata))
# Keep relevant columns
KeptData <- mydata %>%
select(
Date,
Full.Name,
Author,
Url,
Page.Type,
Full.Text,
Expanded.URLs,
Thread.Id,
Impressions,
X.Replies,
X.Reposts,
X.Likes,
Engagement.Type,
Hashtags,
Engagement.Score
)
# Convert Date to POSIXct
KeptData$Date <- as.POSIXct(KeptData$Date, tz = "America/Chicago")
# Sort by Date
KeptData <- arrange(KeptData, Date)
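# Optional sanity check (a sketch, not part of the original pipeline):
# confirm the row count and date range match the query described above
# (12,873 posts, Dec. 1, 2024 through yesterday).
nrow(KeptData)
range(KeptData$Date)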
# ============================================================
# 3. ADD WEEK VARIABLE
# ============================================================
KeptData <- KeptData %>%
mutate(
# Floor to previous Sunday
WeekStart = as.Date(Date) - as.integer(format(as.Date(Date), "%w")),
# Week number since earliest week
Week = as.integer(difftime(WeekStart, min(WeekStart), units = "weeks")) + 1
)
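# Optional sanity check (sketch): every WeekStart should land on a
# Sunday, i.e. weekday code "%w" == "0".
stopifnot(all(format(KeptData$WeekStart, "%w") == "0"))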
# ============================================================
# 4. SAVE CLEANED DATA
# ============================================================
saveRDS(KeptData, file = "FloRacingData.RDS")
# ============================================================
# 5. POST COUNTS BY CHANNEL
# ============================================================
PostsByChannel <- KeptData %>%
group_by(Page.Type) %>%
summarise(
n = n(),
pct = paste0(round(n / nrow(KeptData) * 100, 1), "%")
)
# Display as interactive table
Channels <- datatable(
PostsByChannel,
caption = "Posts by Page Type",
options = list(pageLength = 10)
)
Channels
# ============================================================
# 6. WORD FREQUENCY ANALYSIS
# ============================================================
# Cleaning regular expressions
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_tweets <- KeptData %>%
mutate(text = str_replace_all(Full.Text, replace_reg, "")) %>%
unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
filter(!word %in% stop_words$word,
!word %in% str_remove_all(stop_words$word, "'"),
str_detect(word, "[a-z]"))
WordFrequency <- tidy_tweets %>%
count(Page.Type, word, sort = TRUE) %>%
left_join(tidy_tweets %>% count(Page.Type, name = "total"), by = "Page.Type") %>%
mutate(freq = n / total)
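# Illustrative peek (not in the original script): top 10 words per
# channel by relative frequency.
WordFrequency %>%
group_by(Page.Type) %>%
slice_max(freq, n = 10, with_ties = FALSE) %>%
arrange(Page.Type, desc(freq))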
# ============================================================
# 7. BIGRAM FREQUENCY ANALYSIS
# ============================================================
tidy_bigrams <- KeptData %>%
filter(!str_detect(Full.Text, "^RT")) %>%
mutate(text = str_replace_all(Full.Text, replace_reg, "")) %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(
!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
str_detect(word1, "[a-z]"),
str_detect(word2, "[a-z]")
) %>%
unite(bigram, word1, word2, sep = " ")
BigramFrequency <- tidy_bigrams %>%
count(Page.Type, bigram, sort = TRUE) %>%
left_join(tidy_bigrams %>% count(Page.Type, name = "total"), by = "Page.Type") %>%
mutate(freq = n / total)
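# Illustrative peek (not in the original script): top 10 bigrams per
# channel; this is where "late model" and "sprint" surface as themes.
BigramFrequency %>%
group_by(Page.Type) %>%
slice_max(freq, n = 10, with_ties = FALSE) %>%
arrange(Page.Type, desc(freq))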
# ============================================================
# 8. FLAG TOPICS (AGENDA-SETTING)
# ============================================================
FloPosts <- KeptData
# Topic1: Sprint car mentions
phrases1 <- c("sprint car", "sprint cars", "sprint", "national sprint", "sprints", "sprintcar")
escaped_phrases1 <- str_replace_all(phrases1, "([\\^$.|?*+()\\[\\]{}\\\\])", "\\\\\\1")
pattern1 <- paste0("\\b", escaped_phrases1, "\\b", collapse = "|")
FloPosts <- FloPosts %>%
mutate(
Full.Text.clean = str_squish(Full.Text),
Topic1 = if_else(str_detect(Full.Text.clean, regex(pattern1, ignore_case = TRUE)), "Yes", "No")
)
# Topic2: Late model mentions
# (Note: the bare "late" term will also match unrelated uses, e.g., "too late.")
phrases2 <- c("late model", "late models", "pro late", "dirt late", "super late", "late")
escaped_phrases2 <- str_replace_all(phrases2, "([\\^$.|?*+()\\[\\]{}\\\\])", "\\\\\\1")
pattern2 <- paste0("\\b", escaped_phrases2, "\\b", collapse = "|")
FloPosts <- FloPosts %>%
mutate(
# Full.Text.clean already exists from the Topic1 step above
Topic2 = if_else(str_detect(Full.Text.clean, regex(pattern2, ignore_case = TRUE)), "Yes", "No")
)
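# Optional overlap check (sketch): posts can mention both topics, and
# any flagged "Yes" on both are counted in both weekly series below.
FloPosts %>% count(Topic1, Topic2)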
# ============================================================
# 9. WEEKLY COUNTS OF TOPICS
# ============================================================
# Summarize weekly counts
Topic1_weekly <- FloPosts %>%
filter(Topic1 == "Yes") %>%
group_by(Week) %>%
summarize(Count = n(), .groups = "drop") %>%
mutate(Topic = "Sprint") # custom label for Topic 1
Topic2_weekly <- FloPosts %>%
filter(Topic2 == "Yes") %>%
group_by(Week) %>%
summarize(Count = n(), .groups = "drop") %>%
mutate(Topic = "Late") # custom label for Topic 2
Weekly_counts <- bind_rows(Topic1_weekly, Topic2_weekly) %>%
tidyr::complete(
Topic,
Week = full_seq(range(Week), 1),
fill = list(Count = 0)
) %>%
arrange(Topic, Week)
# ============================================================
# 10. INTERACTIVE PLOTLY LINE CHART
# ============================================================
AS1 <- plot_ly(
data = Weekly_counts,
x = ~Week,
y = ~Count,
color = ~Topic,
colors = c("steelblue", "firebrick"),
type = "scatter",
mode = "lines+markers",
line = list(width = 2),
marker = list(size = 6)
) %>%
layout(
title = "Weekly Counts of Topic1- and Topic2-Related AP News Articles",
xaxis = list(title = "Week Number (starting with Week 1)", dtick = 1),
yaxis = list(title = "Number of Articles"),
legend = list(title = list(text = "Topic")),
hovermode = "x unified"
)
AS1
# ============================================================
# 11. BOXPLOT OF WEEKLY COUNTS (LATE VS. SPRINT)
# ============================================================
# Pivot data for repeated-measures analysis
df_long <- Weekly_counts %>%
pivot_wider(names_from = Topic, values_from = Count, values_fill = 0) %>%
pivot_longer(cols = c(Late, Sprint), names_to = "Measure", values_to = "Value")
group_means <- df_long %>%
group_by(Measure) %>%
summarise(mean_value = mean(Value), .groups = "drop")
# Interactive boxplot with means
boxplot_measures <- plot_ly() %>%
add_trace(
data = df_long,
x = ~Measure, y = ~Value,
type = "box",
boxpoints = "outliers",
marker = list(color = "red", size = 4),
line = list(color = "black"),
fillcolor = "royalblue",
name = ""
) %>%
add_trace(
data = group_means,
x = ~Measure, y = ~mean_value,
type = "scatter", mode = "markers",
marker = list(symbol = "diamond", size = 9, color = "black",
line = list(color = "white", width = 1)),
text = ~paste0("Mean = ", round(mean_value, 2)),
hoverinfo = "text",
name = "Group Mean"
) %>%
layout(
title = "Boxplot of Weekly Counts (Late vs Sprint) with Means",
xaxis = list(title = "Measure"),
yaxis = list(title = "Value"),
showlegend = FALSE
)
boxplot_measures
# ============================================================
# 12. PAIR DIFFERENCE AND STATISTICAL TESTS
# ============================================================
# Compute pair differences
df_wide <- df_long %>%
pivot_wider(names_from = Measure, values_from = Value)
df_wide$PairDifferences <- df_wide$Sprint - df_wide$Late
# Histogram
hist_plot <- plot_ly(
data = df_wide,
x = ~PairDifferences,
type = "histogram",
marker = list(color = "#1f78b4", line = list(color = "black", width = 1))
) %>%
layout(
title = "Distribution of Pair Differences",
xaxis = list(title = "Pair Differences"),
yaxis = list(title = "Count")
)
hist_plot
# Descriptive statistics
desc_stats <- df_wide %>%
summarise(
count = n(),
mean = mean(PairDifferences, na.rm = TRUE),
sd = sd(PairDifferences, na.rm = TRUE),
min = min(PairDifferences, na.rm = TRUE),
max = max(PairDifferences, na.rm = TRUE)
)
desc_table <- desc_stats %>%
gt() %>%
gt_theme_538() %>%
tab_header(title = "Descriptive Statistics: Pair Differences") %>%
fmt_number(columns = where(is.numeric), decimals = 3)
desc_table
# Shapiro-Wilk test for normality
shapiro_res <- shapiro.test(df_wide$PairDifferences)
shapiro_table <- tidy(shapiro_res) %>%
select(statistic, p.value, method) %>%
gt() %>%
gt_theme_538() %>%
tab_header(title = "Normality Test (Shapiro-Wilk)") %>%
fmt_number(columns = c(statistic, p.value), decimals = 4)
shapiro_table
# Paired t-test
t_res <- t.test(df_wide$Sprint, df_wide$Late, paired = TRUE)
t_table <- tidy(t_res) %>%
gt() %>%
gt_theme_538() %>%
tab_header(title = "Paired-Samples t-Test") %>%
fmt_number(columns = c(statistic, p.value, conf.low, conf.high), decimals = 4)
t_table
# Wilcoxon signed-rank test
wilcox_res <- wilcox.test(df_wide$Sprint, df_wide$Late, paired = TRUE, exact = FALSE) # same order as the t-test
wilcox_table <- tidy(wilcox_res) %>%
gt() %>%
gt_theme_538() %>%
tab_header(title = "Wilcoxon Signed Rank Test") %>%
fmt_number(columns = c(statistic, p.value), decimals = 4)
wilcox_table
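# ============================================================
# 13. EFFECT SIZE (OPTIONAL SKETCH)
# ============================================================
# A minimal follow-up, not part of the original analysis: Cohen's d_z,
# the mean pair difference divided by the SD of the differences.
cohens_dz <- mean(df_wide$PairDifferences, na.rm = TRUE) /
sd(df_wide$PairDifferences, na.rm = TRUE)
cohens_dz # about -0.44 given the descriptives above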