knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(janitor)
files <- list.files(pattern = "\\.csv$")
all_data <- map_dfr(
files,
~ read_csv(.x, show_col_types = FALSE) %>%
clean_names(),
.id = "participant_id"
)
## New names:
## • `` -> `...4`
all_data <- all_data %>%
select(
participant_id,
start_time_s,
end_time_s,
duration_s
)
Participants watched the same boxing match (https://www.youtube.com/watch?v=IATUT2qzDmU) and held the space bar during moments they found interesting. Each participant file contains start times, end times, and durations of engagement.
The goal of this analysis is to identify moments during the match where many participants were engaged at the same time. These shared-engagement windows will be used to inform the placement of thought probes in a follow-up study.
##Load Participant Files At this point I have cleaned the data, and organized it.
#Construct crowd-level engagement signal
time_grid <- seq(
from = floor(min(all_data$start_time_s)),
to = ceiling(max(all_data$end_time_s)),
by = 1
)
All times are reported in seconds relative to the original video timestamps
crowd_engagement <- tibble(time_s = time_grid) %>%
rowwise() %>%
mutate(
n_engaged = sum(
all_data$start_time_s <= time_s &
all_data$end_time_s >= time_s
)
) %>%
ungroup()
summary(crowd_engagement$n_engaged)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.00 15.00 15.36 22.00 54.00
n_participants <- length(unique(all_data$participant_id))
crowd_engagement <- crowd_engagement %>%
mutate(prop_engaged = n_engaged / n_participants)
summary(crowd_engagement$prop_engaged)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.09211 0.19737 0.20215 0.28947 0.71053
#Identify shared-engagement threshold Shared engagement is defined as moments in the top 10% of overlap across participants
overlap_threshold <- quantile(
crowd_engagement$prop_engaged,
probs = 0.90
)
overlap_threshold
## 90%
## 0.3815789
#Extract shared-engagement windows
shared_windows <- crowd_engagement %>%
mutate(
high_overlap = prop_engaged >= overlap_threshold,
window_id = cumsum(high_overlap != lag(high_overlap, default = FALSE))
) %>%
filter(high_overlap) %>%
group_by(window_id) %>%
summarise(
start_time_s = min(time_s),
end_time_s = max(time_s),
duration_s = end_time_s - start_time_s + 1,
peak_n = max(n_engaged),
mean_n = mean(n_engaged),
.groups = "drop"
)
nrow(shared_windows)
## [1] 96
#Removing very short shared-engagement windows
probe_windows <- shared_windows %>%
filter(duration_s >= 8)
nrow(probe_windows)
## [1] 10
#Add MM:SS columns for video timing
probe_windows <- probe_windows %>%
mutate(
start_time_mmss = sprintf(
"%02d:%02d",
start_time_s %/% 60,
start_time_s %% 60
),
end_time_mmss = sprintf(
"%02d:%02d",
end_time_s %/% 60,
end_time_s %% 60
)
)
#Characterize shared-interest windows
summary(probe_windows$peak_n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 38.75 41.00 42.70 46.50 54.00
summary(probe_windows$duration_s)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.25 12.00 13.80 13.75 36.00
#Final output for probe placement
probe_windows
## # A tibble: 10 × 8
## window_id start_time_s end_time_s duration_s peak_n mean_n start_time_mmss
## <int> <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 25 1015 1023 9 37 35.2 16:55
## 2 49 1329 1342 14 41 34.8 22:09
## 3 51 1344 1354 11 41 34.5 22:24
## 4 63 1602 1637 36 54 39.9 26:42
## 5 67 1643 1650 8 38 31 27:23
## 6 109 2466 2478 13 48 39.2 41:06
## 7 111 2481 2493 13 41 36.2 41:21
## 8 177 3197 3204 8 35 33.2 53:17
## 9 181 3210 3225 16 42 37.2 53:30
## 10 191 3322 3331 10 50 40.5 55:22
## # ℹ 1 more variable: end_time_mmss <chr>
# Polished visualization with extended x-axis for duration clarity
# Video start time: 8:36
video_start_s <- 8 * 60 + 36
# Prepare probe window data
probe_plot <- probe_windows %>%
mutate(
start_time_min = (start_time_s + video_start_s) / 60,
end_time_min = (end_time_s + video_start_s) / 60
)
# Prepare smoothed overall engagement trend
trend_data <- crowd_engagement %>%
mutate(
video_time_min = (time_s + video_start_s) / 60
)
# Define x-axis limits with extra padding
x_min <- min(probe_plot$start_time_min) - 5
x_max <- max(probe_plot$end_time_min) + 5
ggplot() +
# Overall engagement trend (faint, contextual)
geom_smooth(
data = trend_data,
aes(
x = video_time_min,
y = n_engaged,
colour = "Overall engagement trend"
),
method = "loess",
span = 0.08,
se = FALSE,
linewidth = 1,
alpha = 0.6
) +
# Shared-interest duration bars
geom_segment(
data = probe_plot,
aes(
x = start_time_min,
xend = end_time_min,
y = peak_n,
yend = peak_n,
colour = "Shared-interest duration"
),
linewidth = 4,
lineend = "round"
) +
# End caps (start)
geom_segment(
data = probe_plot,
aes(
x = start_time_min,
xend = start_time_min,
y = peak_n - 1.2,
yend = peak_n + 1.2
),
linewidth = 1
) +
# End caps (end)
geom_segment(
data = probe_plot,
aes(
x = end_time_min,
xend = end_time_min,
y = peak_n - 1.2,
yend = peak_n + 1.2
),
linewidth = 1
) +
# Labels
labs(
title = "Shared-Interest Moments During Boxing Match",
x = "Video Time (minutes since 0:00)",
y = "Number of Participants Engaged",
colour = "",
linetype = ""
) +
# X-axis: extended limits + 5-minute ticks
scale_x_continuous(
limits = c(x_min, x_max),
breaks = seq(
floor(x_min / 5) * 5,
ceiling(x_max / 5) * 5,
by = 5
)
) +
# Colours and line types
scale_colour_manual(
values = c(
"Shared-interest duration" = "purple",
"Overall engagement trend" = "pink"
)
) +
scale_linetype_manual(
values = c("Video start (8:36)" = "dashed")
) +
coord_cartesian(
ylim = c(0, max(probe_plot$peak_n) + 5)
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "bottom",
panel.grid.minor = element_blank()
)
## Ignoring unknown labels:
## • linetype : ""
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 195 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's linetype values.
#Duration focused
# Shared-interest windows by number of engaged participants
duration_plot <- probe_windows %>%
mutate(
window_label = sprintf(
"%02d:%02d",
start_time_s %/% 60,
start_time_s %% 60
)
) %>%
arrange(peak_n)
ggplot(duration_plot, aes(y = reorder(window_label, peak_n))) +
# Bars show number of participants engaged
geom_col(
aes(x = peak_n,),
fill = "purple",
width = 0.6
) +
# Duration labels (seconds)
geom_text(
aes(
x = peak_n,
label = paste0(duration_s, " s")
),
hjust = -0.15,
size = 3
) +
# Labels
labs(
title = "Shared-Interest Windows by Participant Overlap",
x = "Number of Participants Engaged at Peak",
y = "Window Onset Gime (Minute:Second)"
) +
# Expand x-axis for label visibility
scale_x_continuous(
expand = expansion(mult = c(0, 0.15))
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)