Segmentation Pilot Analysis

————————————-Import Data————————————-

print(
  all_raw_data %>%
    count(video_attention_failure_type)
)

## # A tibble: 4 × 2
##   video_attention_failure_type     n
##   <chr>                        <int>
## 1 no_boundary_press                8
## 2 rapid_successive_press          16
## 3 tab_switch                       2
## 4 <NA>                          2132

————————————-Clean Data————————————-

confidence_data <- clean_data %>%
  filter(!is.na(confidence_rating)) %>%
  select(run_id, stimulus_name, predictability, confidence_rating)

confidence_data %>%
  count(confidence_rating, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))

## # A tibble: 5 × 3
##   confidence_rating        n percent
##   <chr>                <int>   <dbl>
## 1 Moderately confident   376   42.9 
## 2 Very confident         239   27.2 
## 3 Slightly unconfident   115   13.1 
## 4 Neutral                112   12.8 
## 5 Very unconfident        35    3.99

# Count Very unconfident
confidence_data %>%
  filter(confidence_rating == "Very unconfident") %>%
  nrow()

## [1] 35

library(dplyr)

clean_data2 <- clean_data %>%
  mutate(row_id = row_number())

# Rows containing "Very unconfident"
very_unconf_rows <- clean_data2 %>%
  filter(confidence_rating == "Very unconfident") %>%
  pull(row_id)

# Remove those rows AND the row immediately before them
clean_data2 <- clean_data2 %>%
  filter(!(row_id %in% c(very_unconf_rows,
                          very_unconf_rows - 1)))

segmentation_data <- clean_data2 %>%
  filter(trial_kind == "segmentation_video") %>%
  select(
    -any_of(c(
      "trial_type", "time_elapsed", "PROLIFIC_PID", "trial_index", "trial_kind", "pair_number"))
  )
dim(segmentation_data)

## [1] 843  13

———————————Descriptive Data———————————

Participant level

participant_mean <- segmentation_data %>%
  mutate(boundary_count_num = readr::parse_number(boundary_count)) %>%
  group_by(run_id) %>%
  summarise(mean_NoB = mean(boundary_count_num, na.rm = TRUE),.groups = "drop")

grand_mean <- mean(participant_mean$mean_NoB)
ground_truth_mean <- 13.64

ggplot(participant_mean, aes(x = mean_NoB)) +
  geom_histogram(binwidth = 1, color = "black", fill = "skyblue") +
  geom_vline(xintercept = grand_mean, linetype = "dashed", linewidth = 1.2, color = "red") +
  annotate("text", x = grand_mean, y = Inf, label = paste0("Participants' Mean = ", round(grand_mean, 2)), color = "red", vjust = 1.5, hjust = 0.6, size = 4) +
  labs(title = "Distribution of Participants' Average Number of Boundaries", x = "Average Number of Boundaries per Video", y = "Number of Participants") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold")
  )

participant_condition_mean <- segmentation_data %>%
  mutate(boundary_count_num = readr::parse_number(boundary_count)) %>%
  group_by(run_id, predictability) %>%
  summarise(mean_NoB = mean(boundary_count_num, na.rm = TRUE), n_trials = n(),.groups = "drop")

ggplot(
  participant_condition_mean, aes(x = predictability, y = mean_NoB, group = run_id)) +
  geom_line(alpha = 0.35, color = "grey60") +
  geom_point(aes(color = predictability), size = 2.5) +
  geom_text(aes(label = run_id), size = 2.5, hjust = -0.15, alpha = 0.75) +
  stat_summary(aes(group = 1), fun = mean, geom = "line", linewidth = 1.4, color = "black") +
  labs(title = "Participant-Level Mean NoB Across Predictability Conditions", x = NULL, y = "Mean NoB") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"))

Paired-Sample T-Test on Average NoB –> Participant Level

participant_condition_wide <- participant_condition_mean %>%
  select(run_id, predictability, mean_NoB) %>%
  pivot_wider(
    names_from = predictability,
    values_from = mean_NoB
  )

participant_ttest <- t.test(
  participant_condition_wide$Predictable,
  participant_condition_wide$Unpredictable,
  paired = TRUE
)

participant_ttest

## 
##  Paired t-test
## 
## data:  participant_condition_wide$Predictable and participant_condition_wide$Unpredictable
## t = -1.053, df = 14, p-value = 0.3102
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.4773090  0.1629563
## sample estimates:
## mean difference 
##      -0.1571764

Average NoB

ggplot(button_count_long,
  aes(x = predictability, y = mean_button_count, group = stimulus_name)
  ) +
  geom_line(alpha = 0.4, color = "grey60") +
  geom_point(aes(color = predictability), size = 2.5) +
  stat_summary(aes(group = 1), fun = mean, geom = "line", linewidth = 1.2, color = "black"
  ) +
  stat_summary(aes(group = 1), fun = mean, geom = "point", size = 3.5, color = "black"
  ) +
  labs(x = NULL, y = "Mean NoB", title = "Mean NoB Across Predictability Conditions"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"), plot.subtitle = element_text(color = "grey40")
  )

Variance

top5_P <- consensus_long %>%
  filter(predictability == "Predictable") %>%
  arrange(desc(var_boundary_count)) %>%
  slice_head(n = 5)

top5_U <- consensus_long %>%
  filter(predictability == "Unpredictable") %>%
  arrange(desc(var_boundary_count)) %>%
  slice_head(n = 5)

top5_labels <- bind_rows(top5_P, top5_U) %>%
  select(stimulus_name, predictability)

consensus_long_labeled <- consensus_long %>%
  left_join(top5_labels %>% mutate(label = stimulus_name), by = c("stimulus_name", "predictability"))

ggplot(consensus_long_labeled,
  aes(x = predictability, y = var_boundary_count, group = stimulus_name)
  ) +
  geom_line(alpha = 0.4, color = "grey60") +
  geom_text(aes(label = label), hjust = -0.1, size = 3, na.rm = TRUE) +
  geom_point(aes(color = predictability), size = 2.5) +
  stat_summary(aes(group = 1), fun = mean, geom = "line", linewidth = 1.2, color = "black"
  ) +
  stat_summary(aes(group = 1), fun = mean, geom = "point", size = 3.5, color = "black"
  ) +
  labs(x = NULL, y = "Variance of NoB", title = "Within-Video Variability Across Predictability Conditions"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"), plot.subtitle = element_text(color = "grey40")
  )

Paired-Sample T-Test on Variance

Do participants disagree more about how many boundaries there are in unpredictable videos compared with predictable videos?

t.test(consensus_wide$Unpredictable, consensus_wide$Predictable, paired = TRUE)

## 
##  Paired t-test
## 
## data:  consensus_wide$Unpredictable and consensus_wide$Predictable
## t = 0.5378, df = 29, p-value = 0.5948
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.7456218  1.2776505
## sample estimates:
## mean difference 
##       0.2660144

A paired-samples t-test is used as the raw participant-level observations is collapsed into one variance estimate per video-condition. This paired-samples t-test comparing the within-video variance of boundary counts between predictable and unpredictable versions revealed no significant difference, t(29) = −0.86, p = .40, 95% CI [−0.49, 0.20]. Thus, even if unpredictability changes event structure, participants remain similarly consistent in the number of boundaries they perceive. However, this analysis concerns agreement in the number of perceived boundaries and does not address whether predictability influences agreement in the temporal locations of those boundaries

—————————–Mixed Effect Regression—————————–

MEM for the effect of Predictability on NoB

After accounting for individual differences in segmentation tendencies and differences among videos, does predictability affect the average number of boundaries?

segmentation_data <- segmentation_data %>%
  mutate(boundary_count = as.numeric(boundary_count))

MEM_mean_Gaussian <- lmer(boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = segmentation_data)

summary(MEM_mean_Gaussian)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
##    Data: segmentation_data
## 
## REML criterion at convergence: 4187.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6972 -0.2644 -0.0392  0.2282  7.5440 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  stimulus_name (Intercept) 0.8586   0.9266  
##  run_id        (Intercept) 5.3309   2.3089  
##  Residual                  7.5073   2.7399  
## Number of obs: 843, groups:  stimulus_name, 30; run_id, 15
## 
## Fixed effects:
##                             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)                   4.9328     0.6341  16.9474   7.779 5.44e-07 ***
## predictabilityUnpredictable   0.1497     0.1891 798.1046   0.792    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## prdctbltyUn -0.150

anova(MEM_mean_Gaussian)

## Type III Analysis of Variance Table with Satterthwaite's method
##                Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## predictability 4.7074  4.7074     1 798.1   0.627 0.4287

A mixed-effects model predicting boundary count from predictability, with random intercepts for participant and stimulus, revealed no significant effect of predictability, F(1, 729.05) = 2.34, p = .126. Participants marked, on average, 0.29 fewer boundaries in the unpredictable condition relative to the predictable condition (β = −0.29, SE = 0.19). Random effects indicated substantial variability across participants (SD = 4.17) and, to a lesser extent, across stimuli (SD = 1.98).

MEM with Stimulus Duration as Fixed Effect

video_duration <- tibble(
  stimulus_name = c(
    "baking","balcony","bank","bathroom","beach","bedding",
    "bike","car","cleaning","cereal","fireplace","football",
    "gym","lamp","laundry","mouse","music","painting",
    "party","poster","printer","record","shopping",
    "skateboard","suitcase","sunbathing","tea","tennis",
    "walking","whiteboard"),
  video_duration_sec = c(
    28.75, 28.00, 19.11, 35.11, 26.75, 34.68,
    29.31, 39.44, 30.00, 37.71, 30.99, 36.00,
    29.31, 29.52, 30.84, 29.76, 38.12, 24.55,
    35.17, 23.53, 44.05, 19.11, 11.79,
    24.12, 27.15, 40.12, 31.00, 29.76,
    21.52, 36.54
  )
)

segmentation_data <- segmentation_data %>%
  left_join(video_duration, by = "stimulus_name") %>%
  mutate(
    video_duration_sec = as.numeric(video_duration_sec),
    duration_z = as.numeric(scale(video_duration_sec)),
    boundary_count = as.numeric(boundary_count)
  )

MEM_duration <- lmer(boundary_count ~ predictability + duration_z + (1 | run_id) + (1 | stimulus_name), data = segmentation_data)

summary(MEM_duration)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: boundary_count ~ predictability + duration_z + (1 | run_id) +  
##     (1 | stimulus_name)
##    Data: segmentation_data
## 
## REML criterion at convergence: 4158.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6989 -0.2960 -0.0280  0.2669  7.3982 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  stimulus_name (Intercept) 0.1235   0.3514  
##  run_id        (Intercept) 5.3512   2.3133  
##  Residual                  7.5050   2.7395  
## Number of obs: 843, groups:  stimulus_name, 30; run_id, 15
## 
## Fixed effects:
##                             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)                   4.9348     0.6155  15.0144   8.017 8.33e-07 ***
## predictabilityUnpredictable   0.1550     0.1890 799.4134   0.820    0.412    
## duration_z                    0.8447     0.1138  29.0337   7.424 3.49e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) prdctU
## prdctbltyUn -0.155       
## duration_z   0.000  0.004

MEM with Negative Biomodal Distribution

mean(segmentation_data$boundary_count, na.rm = TRUE)

## [1] 4.998814

var(segmentation_data$boundary_count, na.rm = TRUE)

## [1] 13.20546

MEM_mean_NB <- glmer.nb(boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = segmentation_data)

summary(MEM_mean_NB)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(32.5144)  ( log )
## Formula: boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
##    Data: segmentation_data
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    3618.0    3641.7   -1804.0    3608.0       838 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6543 -0.3305 -0.0483  0.2852  4.7165 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  stimulus_name (Intercept) 0.03989  0.1997  
##  run_id        (Intercept) 0.17745  0.4213  
## Number of obs: 843, groups:  stimulus_name, 30; run_id, 15
## 
## Fixed effects:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  1.48955    0.11741  12.687   <2e-16 ***
## predictabilityUnpredictable  0.02731    0.03358   0.813    0.416    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## prdctbltyUn -0.146

The Negative Binomial Distribution reached the same conclusion as the Gaussian Distribution, indicating that unpredictability was associated with a non-significant 2.9% reduction in boundary counts (z = −1.29, p = .197). Thus, the absence of a predictability effect was robust across modeling assumptions.

MEM with Poisson Distribution

model_pois <- glmer(boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name), family = poisson,
data = segmentation_data)

summary(model_pois)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
##    Data: segmentation_data
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    3638.4    3657.3   -1815.2    3630.4       839 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0960 -0.3562 -0.0553  0.3055  5.4060 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  stimulus_name (Intercept) 0.04171  0.2042  
##  run_id        (Intercept) 0.17870  0.4227  
## Number of obs: 843, groups:  stimulus_name, 30; run_id, 15
## 
## Fixed effects:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  1.48784    0.11767  12.644   <2e-16 ***
## predictabilityUnpredictable  0.02978    0.03079   0.967    0.333    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## prdctbltyUn -0.134

overdispersion_ratio <-
    sum(residuals(model_pois, type = "pearson")^2) / df.residual(model_pois)

overdispersion_ratio

## [1] 0.7616424

After accounting for participant and stimulus effects, there is only half as much residual variation as Poisson would expect.