knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(janitor)

List participant CSV files

files <- list.files(pattern = "\\.csv$")

Read and combine all participant files

all_data <- map_dfr(
  files,
  ~ read_csv(.x, show_col_types = FALSE) %>%
    clean_names(),
  .id = "participant_id"
)

## New names:
## • `` -> `...4`

Keep only relevant columns

all_data <- all_data %>%
  select(
    participant_id,
    start_time_s,
    end_time_s,
    duration_s
  )

Overview

Participants watched the same boxing match (https://www.youtube.com/watch?v=IATUT2qzDmU) and held the space bar during moments they found interesting. Each participant file contains start times, end times, and durations of engagement.

The goal of this analysis is to identify moments during the match where many participants were engaged at the same time. These shared-engagement windows will be used to inform the placement of thought probes in a follow-up study.

##Load Participant Files At this point I have cleaned the data, and organized it.

#Construct crowd-level engagement signal

time_grid <- seq(
  from = floor(min(all_data$start_time_s)),
  to   = ceiling(max(all_data$end_time_s)),
  by   = 1
)

Count number of participants engaged at each second

All times are reported in seconds relative to the original video timestamps

crowd_engagement <- tibble(time_s = time_grid) %>%
  rowwise() %>%
  mutate(
    n_engaged = sum(
      all_data$start_time_s <= time_s &
      all_data$end_time_s   >= time_s
    )
  ) %>%
  ungroup()

summary(crowd_engagement$n_engaged)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.00   15.00   15.36   22.00   54.00

Normalize by sample size

n_participants <- length(unique(all_data$participant_id))

crowd_engagement <- crowd_engagement %>%
  mutate(prop_engaged = n_engaged / n_participants)

summary(crowd_engagement$prop_engaged)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.09211 0.19737 0.20215 0.28947 0.71053

#Identify shared-engagement threshold Shared engagement is defined as moments in the top 10% of overlap across participants

overlap_threshold <- quantile(
  crowd_engagement$prop_engaged,
  probs = 0.90
)

overlap_threshold

##       90% 
## 0.3815789

#Extract shared-engagement windows

shared_windows <- crowd_engagement %>%
  mutate(
    high_overlap = prop_engaged >= overlap_threshold,
    window_id = cumsum(high_overlap != lag(high_overlap, default = FALSE))
  ) %>%
  filter(high_overlap) %>%
  group_by(window_id) %>%
  summarise(
    start_time_s = min(time_s),
    end_time_s   = max(time_s),
    duration_s   = end_time_s - start_time_s + 1,
    peak_n       = max(n_engaged),
    mean_n       = mean(n_engaged),
    .groups = "drop"
  )

nrow(shared_windows)

## [1] 96

#Removing very short shared-engagement windows

probe_windows <- shared_windows %>%
  filter(duration_s >= 8)

nrow(probe_windows)

## [1] 10

#Add MM:SS columns for video timing

probe_windows <- probe_windows %>%
  mutate(
    start_time_mmss = sprintf(
      "%02d:%02d",
      start_time_s %/% 60,
      start_time_s %% 60
    ),
    end_time_mmss = sprintf(
      "%02d:%02d",
      end_time_s %/% 60,
      end_time_s %% 60
    )
  )

#Characterize shared-interest windows

summary(probe_windows$peak_n)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.00   38.75   41.00   42.70   46.50   54.00

summary(probe_windows$duration_s)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.25   12.00   13.80   13.75   36.00

#Final output for probe placement

probe_windows

## # A tibble: 10 × 8
##    window_id start_time_s end_time_s duration_s peak_n mean_n start_time_mmss
##        <int>        <dbl>      <dbl>      <dbl>  <int>  <dbl> <chr>          
##  1        25         1015       1023          9     37   35.2 16:55          
##  2        49         1329       1342         14     41   34.8 22:09          
##  3        51         1344       1354         11     41   34.5 22:24          
##  4        63         1602       1637         36     54   39.9 26:42          
##  5        67         1643       1650          8     38   31   27:23          
##  6       109         2466       2478         13     48   39.2 41:06          
##  7       111         2481       2493         13     41   36.2 41:21          
##  8       177         3197       3204          8     35   33.2 53:17          
##  9       181         3210       3225         16     42   37.2 53:30          
## 10       191         3322       3331         10     50   40.5 55:22          
## # ℹ 1 more variable: end_time_mmss <chr>

Plot crowd engagement with probe windows

# Polished visualization with extended x-axis for duration clarity

# Video start time: 8:36
video_start_s <- 8 * 60 + 36

# Prepare probe window data
probe_plot <- probe_windows %>%
  mutate(
    start_time_min = (start_time_s + video_start_s) / 60,
    end_time_min   = (end_time_s   + video_start_s) / 60
  )

# Prepare smoothed overall engagement trend
trend_data <- crowd_engagement %>%
  mutate(
    video_time_min = (time_s + video_start_s) / 60
  )

# Define x-axis limits with extra padding
x_min <- min(probe_plot$start_time_min) - 5
x_max <- max(probe_plot$end_time_min) + 5

ggplot() +
  
  # Overall engagement trend (faint, contextual)
  geom_smooth(
    data = trend_data,
    aes(
      x = video_time_min,
      y = n_engaged,
      colour = "Overall engagement trend"
    ),
    method = "loess",
    span = 0.08,
    se = FALSE,
    linewidth = 1,
    alpha = 0.6
  ) +
  
  # Shared-interest duration bars
  geom_segment(
    data = probe_plot,
    aes(
      x = start_time_min,
      xend = end_time_min,
      y = peak_n,
      yend = peak_n,
      colour = "Shared-interest duration"
    ),
    linewidth = 4,
    lineend = "round"
  ) +
  
  # End caps (start)
  geom_segment(
    data = probe_plot,
    aes(
      x = start_time_min,
      xend = start_time_min,
      y = peak_n - 1.2,
      yend = peak_n + 1.2
    ),
    linewidth = 1
  ) +
  
  # End caps (end)
  geom_segment(
    data = probe_plot,
    aes(
      x = end_time_min,
      xend = end_time_min,
      y = peak_n - 1.2,
      yend = peak_n + 1.2
    ),
    linewidth = 1
  ) +
  
  # Labels
  labs(
    title = "Shared-Interest Moments During Boxing Match",
    x = "Video Time (minutes since 0:00)",
    y = "Number of Participants Engaged",
    colour = "",
    linetype = ""
  ) +
  
  # X-axis: extended limits + 5-minute ticks
  scale_x_continuous(
    limits = c(x_min, x_max),
    breaks = seq(
      floor(x_min / 5) * 5,
      ceiling(x_max / 5) * 5,
      by = 5
    )
  ) +
  
  # Colours and line types
  scale_colour_manual(
    values = c(
      "Shared-interest duration" = "purple",
      "Overall engagement trend" = "pink"
    )
  ) +
  
  scale_linetype_manual(
    values = c("Video start (8:36)" = "dashed")
  ) +
  
  coord_cartesian(
    ylim = c(0, max(probe_plot$peak_n) + 5)
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )

## Ignoring unknown labels:
## • linetype : ""
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 195 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's linetype values.

#Duration focused

# Shared-interest windows by number of engaged participants

duration_plot <- probe_windows %>%
  mutate(
    window_label = sprintf(
      "%02d:%02d",
      start_time_s %/% 60,
      start_time_s %% 60
    )
  ) %>%
  arrange(peak_n)

ggplot(duration_plot, aes(y = reorder(window_label, peak_n))) +
  
  # Bars show number of participants engaged
  geom_col(
    aes(x = peak_n,), 
    fill = "purple", 
    width = 0.6
  ) +
  
  # Duration labels (seconds)
  geom_text(
    aes(
      x = peak_n,
      label = paste0(duration_s, " s")
    ),
    hjust = -0.15,
    size = 3
  ) +
  
  # Labels
  labs(
    title = "Shared-Interest Windows by Participant Overlap",
    x = "Number of Participants Engaged at Peak",
    y = "Window Onset Gime (Minute:Second)"
  ) +
  
  # Expand x-axis for label visibility
  scale_x_continuous(
    expand = expansion(mult = c(0, 0.15))
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

Crowdsourcing Data (Analysis) Feb 3

Tanraj.G

2026-02-03