all_raw_data <- all_raw_data %>%
filter(run_id != "38")
clean_data_block1 <- clean_data %>%
filter(block == 1)
confidence_data <- clean_data_block1 %>%
filter(!is.na(confidence_rating)) %>%
select(run_id, stimulus_name, predictability, confidence_rating)
confidence_data %>%
count(confidence_rating, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 5 × 3
## confidence_rating n percent
## <chr> <int> <dbl>
## 1 Moderately confident 177 42.8
## 2 Very confident 89 21.5
## 3 Neutral 70 16.9
## 4 Slightly unconfident 62 15.0
## 5 Very unconfident 16 3.86
# Count Very unconfident
confidence_data %>%
filter(confidence_rating == "Very unconfident") %>%
nrow()
## [1] 16
# 1. Print attention-failure summary before removing anything
print(
clean_data_block1 %>%
count(video_attention_failure_type)
)
## # A tibble: 1 × 2
## video_attention_failure_type n
## <chr> <int>
## 1 <NA> 828
# 2. Remove trials with attention failures
clean_data_attn_clean <- clean_data_block1 %>%
filter(is.na(video_attention_failure_type) | video_attention_failure_type == "")
# 3. Add row id after attention-failure removal
clean_data2 <- clean_data_attn_clean %>%
mutate(row_id = row_number())
# 4. Find rows containing "Very unconfident"
very_unconf_rows <- clean_data2 %>%
filter(confidence_rating == "Very unconfident") %>%
pull(row_id)
# 5. Remove those confidence-rating rows AND the row immediately before them
clean_data2 <- clean_data2 %>%
filter(!(row_id %in% c(very_unconf_rows, very_unconf_rows - 1))) %>%
select(-row_id)
segmentation_data <- clean_data2 %>%
filter(trial_kind == "segmentation_video") %>%
select(
-any_of(c(
"trial_type", "time_elapsed", "PROLIFIC_PID", "trial_index", "trial_kind", "pair_number"))
)
dim(segmentation_data)
## [1] 398 13
participant_mean <- segmentation_data %>%
mutate(boundary_count_num = readr::parse_number(boundary_count)) %>%
group_by(run_id) %>%
summarise(mean_NoB = mean(boundary_count_num, na.rm = TRUE),.groups = "drop")
grand_mean <- mean(participant_mean$mean_NoB)
ground_truth_mean <- 13.64
ggplot(participant_mean, aes(x = mean_NoB)) +
geom_histogram(binwidth = 1, color = "black", fill = "skyblue") +
geom_vline(xintercept = grand_mean, linetype = "dashed", linewidth = 1.2, color = "red") +
annotate("text", x = grand_mean, y = Inf, label = paste0("Participants' Mean = ", round(grand_mean, 2)), color = "red", vjust = 1.5, hjust = 0.6, size = 4) +
labs(title = "Distribution of Participants' Average Number of Boundaries", x = "Average Number of Boundaries per Video", y = "Number of Participants") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold")
)
button_count_table <- segmentation_data %>%
mutate(boundary_count = as.numeric(boundary_count)) %>%
group_by(stimulus_name, predictability) %>%
summarise(mean_button_count = mean(boundary_count, na.rm = TRUE), n_trials = n(),.groups = "drop") %>%
pivot_wider(id_cols = stimulus_name, names_from = predictability, values_from = mean_button_count) %>%
mutate(Difference_UP = Unpredictable - Predictable) %>%
arrange(desc(Difference_UP))
button_count_long <- segmentation_data %>%
mutate(boundary_count = as.numeric(boundary_count)) %>%
group_by(stimulus_name, predictability) %>%
summarise(mean_button_count = mean(boundary_count, na.rm = TRUE), n_trials = n(),.groups = "drop")
ggplot(button_count_long, aes(x = predictability, y = mean_button_count, group = stimulus_name)) +
geom_line(alpha = 0.4, color = "grey60") +
geom_point(aes(color = predictability), size = 2.5) +
stat_summary(aes(group = 1), fun = mean, geom = "line", linewidth = 1.2, color = "black") +
stat_summary(aes(group = 1), fun = mean, geom = "point", size = 3.5, color = "black") +
labs(x = NULL, y = "Mean NoB", title = "Mean NoB Across Predictability Conditions") +
theme_minimal(base_size = 14) +
theme(legend.position = "none", plot.title = element_text(face = "bold"), plot.subtitle = element_text(color = "grey40"))
segmentation_data <- segmentation_data %>%
mutate(boundary_count = as.numeric(boundary_count))
MEM_mean_Gaussian <- lmer(boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = segmentation_data)
summary(MEM_mean_Gaussian)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
## Data: segmentation_data
##
## REML criterion at convergence: 1327.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5122 -0.5833 0.0007 0.5211 3.4958
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.9785 0.9892
## run_id (Intercept) 2.5360 1.5925
## Residual 1.1856 1.0889
## Number of obs: 398, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.4237 0.4697 18.2779 9.419 1.94e-08 ***
## predictabilityUnpredictable 0.1863 0.1134 357.3725 1.643 0.101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## prdctbltyUn -0.132
anova(MEM_mean_Gaussian)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## predictability 3.1998 3.1998 1 357.37 2.6988 0.1013
pred <- ggpredict(MEM_mean_Gaussian, terms = "predictability")
ggplot(pred, aes(x = x, y = predicted)) + geom_point(size = 3) + geom_line(aes(group = 1), linewidth = 1) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.1, linewidth = 0.8) +
labs(x = "Predictability", y = "Predicted mean boundary count", title = "Mixed-effects Model Predictions") +
theme_minimal(base_size = 14)
### MEM with Poisson Distribution
model_pois <- glmer(boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name), family = poisson,
data = segmentation_data)
summary(model_pois)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: boundary_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
## Data: segmentation_data
##
## AIC BIC logLik -2*log(L) df.resid
## 1516.3 1532.2 -754.1 1508.3 394
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.72822 -0.32129 -0.04172 0.26558 1.63957
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.0373 0.1931
## run_id (Intercept) 0.1098 0.3314
## Number of obs: 398, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.41610 0.10230 13.84 <2e-16 ***
## predictabilityUnpredictable 0.04116 0.04898 0.84 0.401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## prdctbltyUn -0.264
pred_pois <- ggpredict(model_pois, terms = "predictability")
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.
ggplot(pred_pois, aes(x = x, y = predicted)) + geom_point(size = 3) + geom_line(aes(group = 1), linewidth = 1) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.1, linewidth = 0.8) +
labs(x = "Predictability", y = "Predicted mean boundary count", title = "Poisson Mixed-effects Model Predictions")+
theme_minimal(base_size = 14)
overdispersion_ratio <-
sum(residuals(model_pois, type = "pearson")^2) / df.residual(model_pois)
overdispersion_ratio
## [1] 0.2428319
window_counts <- clean_data %>%
filter(trial_kind == "segmentation_video") %>%
select(run_id, stimulus_name, predictability, boundary_times_sec) %>%
left_join(critical_windows, by = "stimulus_name") %>%
left_join(video_duration, by = "stimulus_name") %>% rowwise() %>%
mutate(
boundary_times = list(jsonlite::fromJSON(boundary_times_sec)),
pre_count = sum(boundary_times >= 0 & boundary_times < critical_start),
critical_count = sum(boundary_times >= critical_start & boundary_times <= critical_end),
post_count = sum(boundary_times > critical_end & boundary_times <= video_duration_sec)) %>%
ungroup() %>%
mutate(predictability = factor(predictability), run_id = factor(run_id), stimulus_name = factor(stimulus_name))
model_pre <- lmer(pre_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = window_counts)
model_critical <- lmer(critical_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = window_counts)
model_post <- lmer(post_count ~ predictability + (1 | run_id) + (1 | stimulus_name), data = window_counts)
summary(model_pre)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pre_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
## Data: window_counts
##
## REML criterion at convergence: 2098
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8271 -0.5986 -0.0260 0.5526 4.7852
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.4732 0.6879
## run_id (Intercept) 0.6177 0.7859
## Residual 0.6305 0.7941
## Number of obs: 820, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.891e+00 2.478e-01 2.295e+01 7.628 9.76e-08 ***
## predictabilityUnpredictable 7.763e-03 5.549e-02 7.761e+02 0.140 0.889
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## prdctbltyUn -0.112
summary(model_critical)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: critical_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
## Data: window_counts
##
## REML criterion at convergence: 1482.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0713 -0.6845 -0.0376 0.6520 6.3505
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.13793 0.3714
## run_id (Intercept) 0.07269 0.2696
## Residual 0.30891 0.5558
## Number of obs: 820, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.66233 0.10268 34.29174 6.451 2.17e-07 ***
## predictabilityUnpredictable 0.04392 0.03884 776.06592 1.131 0.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## prdctbltyUn -0.189
summary(model_post)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: post_count ~ predictability + (1 | run_id) + (1 | stimulus_name)
## Data: window_counts
##
## REML criterion at convergence: 2126.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9520 -0.5874 -0.0382 0.6044 5.5076
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.4324 0.6576
## run_id (Intercept) 0.4811 0.6936
## Residual 0.6592 0.8119
## Number of obs: 820, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.00600 0.22446 24.61226 8.937 3.38e-09 ***
## predictabilityUnpredictable -0.04839 0.05674 776.05930 -0.853 0.394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## prdctbltyUn -0.126
#Plot
window_counts_long <- window_counts %>%
select(run_id, stimulus_name, predictability, pre_count, critical_count, post_count) %>%
pivot_longer( cols = c(pre_count, critical_count, post_count), names_to = "window", values_to = "boundary_count" ) %>%
mutate(
window = recode( window, pre_count = "Pre-critical", critical_count = "Critical", post_count = "Post-critical"),
window = factor(window, levels = c("Pre-critical", "Critical", "Post-critical")),
predictability = factor(predictability, levels = c("Predictable", "Unpredictable")))
window_means <- window_counts_long %>%
group_by(window, predictability) %>%
summarise( mean_boundary_count = mean(boundary_count, na.rm = TRUE), se = sd(boundary_count, na.rm = TRUE) / sqrt(n()), .groups = "drop")
ggplot(window_means, aes(x = predictability, y = mean_boundary_count, group = window, color = window)) +
geom_line(linewidth = 1.3) + geom_point(size = 3.5) +
geom_errorbar(aes(ymin = mean_boundary_count - se, ymax = mean_boundary_count + se), width = 0.08, linewidth = 0.7) +
scale_color_manual(values = c("Pre-critical" = "#1b9e77", "Critical" = "#d95f02", "Post-critical" = "#7570b3")) +
labs(x = NULL, y = "Mean boundary count", color = "Video window", title = "Boundary Counts Across Predictability Conditions") +
theme_minimal(base_size = 14)
# Create counts for each video window
window_counts <- clean_data %>% filter(trial_kind == "segmentation_video") %>%
select(run_id, stimulus_name, predictability, boundary_times_sec) %>%
left_join(critical_windows, by = "stimulus_name") %>% left_join(video_duration, by = "stimulus_name") %>% rowwise() %>%
mutate(boundary_times = list(jsonlite::fromJSON(boundary_times_sec)),
pre_count = sum(boundary_times >= 0 & boundary_times < critical_start),
critical_count = sum(boundary_times >= critical_start & boundary_times <= critical_end),
post_count = sum(boundary_times > critical_end & boundary_times <= video_duration_sec) ) %>%
ungroup()
# Convert to long format: one row per trial × interval
window_counts_long <- window_counts %>%
select(run_id, stimulus_name, predictability, pre_count, critical_count, post_count) %>%
pivot_longer(cols = c(pre_count, critical_count, post_count), names_to = "Interval", values_to = "boundary_count") %>%
mutate(
Interval = recode(Interval, pre_count = "Pre-critical", critical_count = "Critical", post_count = "Post-critical"),
Interval = factor(Interval, levels = c("Pre-critical", "Critical", "Post-critical")),
predictability = factor(predictability, levels = c("Predictable", "Unpredictable")),
run_id = factor(run_id), stimulus_name = factor(stimulus_name))
# One mixed-effects model with Interval and predictability as fixed effects
model_interval_int <- lmer(boundary_count ~ predictability * Interval + (1 | run_id) + (1 | stimulus_name), data = window_counts_long)
model_interval_add <- lmer(boundary_count ~ predictability + Interval + (1 | run_id) + (1 | stimulus_name), data = window_counts_long)
summary(model_interval_int)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: boundary_count ~ predictability * Interval + (1 | run_id) + (1 |
## stimulus_name)
## Data: window_counts_long
##
## REML criterion at convergence: 6723.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8209 -0.6609 -0.0937 0.5387 5.4577
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.09109 0.3018
## run_id (Intercept) 0.31593 0.5621
## Residual 0.85061 0.9223
## Number of obs: 2460, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 1.88539 0.16635
## predictabilityUnpredictable 0.01299 0.06443
## IntervalCritical -1.22628 0.06434
## IntervalPost-critical 0.12895 0.06434
## predictabilityUnpredictable:IntervalCritical 0.03557 0.09110
## predictabilityUnpredictable:IntervalPost-critical -0.07027 0.09110
## df t value Pr(>|t|)
## (Intercept) 18.78034 11.334 7.73e-10
## predictabilityUnpredictable 2412.07880 0.202 0.8402
## IntervalCritical 2411.94645 -19.060 < 2e-16
## IntervalPost-critical 2411.94645 2.004 0.0451
## predictabilityUnpredictable:IntervalCritical 2411.94645 0.390 0.6962
## predictabilityUnpredictable:IntervalPost-critical 2411.94645 -0.771 0.4405
##
## (Intercept) ***
## predictabilityUnpredictable
## IntervalCritical ***
## IntervalPost-critical *
## predictabilityUnpredictable:IntervalCritical
## predictabilityUnpredictable:IntervalPost-critical
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) prdctU IntrvC IntrP- prU:IC
## prdctbltyUn -0.193
## IntrvlCrtcl -0.193 0.499
## IntrvlPst-c -0.193 0.499 0.500
## prdctblU:IC 0.137 -0.707 -0.706 -0.353
## prdctbU:IP- 0.137 -0.707 -0.353 -0.706 0.500
summary(model_interval_add)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: boundary_count ~ predictability + Interval + (1 | run_id) + (1 |
## stimulus_name)
## Data: window_counts_long
##
## REML criterion at convergence: 6719
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7897 -0.6592 -0.0948 0.5406 5.4264
##
## Random effects:
## Groups Name Variance Std.Dev.
## stimulus_name (Intercept) 0.09109 0.3018
## run_id (Intercept) 0.31593 0.5621
## Residual 0.85039 0.9222
## Number of obs: 2460, groups: stimulus_name, 30; run_id, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.891e+00 1.643e-01 1.786e+01 11.512 1.08e-09
## predictabilityUnpredictable 1.423e-03 3.720e-02 2.414e+03 0.038 0.9695
## IntervalCritical -1.209e+00 4.554e-02 2.414e+03 -26.536 < 2e-16
## IntervalPost-critical 9.390e-02 4.554e-02 2.414e+03 2.062 0.0393
##
## (Intercept) ***
## predictabilityUnpredictable
## IntervalCritical ***
## IntervalPost-critical *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) prdctU IntrvC
## prdctbltyUn -0.113
## IntrvlCrtcl -0.139 0.000
## IntrvlPst-c -0.139 0.000 0.500
model_interval_add_ml <- lmer(boundary_count ~ predictability + Interval + (1 | run_id) + (1 | stimulus_name), data = window_counts_long, REML = FALSE)
model_interval_int_ml <- lmer(boundary_count ~ predictability * Interval + (1 | run_id) + (1 | stimulus_name), data = window_counts_long, REML = FALSE)
anova(model_interval_add_ml, model_interval_int_ml)
## Data: window_counts_long
## Models:
## model_interval_add_ml: boundary_count ~ predictability + Interval + (1 | run_id) + (1 | stimulus_name)
## model_interval_int_ml: boundary_count ~ predictability * Interval + (1 | run_id) + (1 | stimulus_name)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## model_interval_add_ml 7 6717.4 6758.1 -3351.7 6703.4
## model_interval_int_ml 9 6720.0 6772.3 -3351.0 6702.0 1.4008 2 0.4964
pred_Int_add <- ggpredict(model_interval_add, terms = c("predictability", "Interval"))
ggplot(pred_Int_add, aes(x = x, y = predicted, color = group, group = group)) +
geom_point(size = 3) +
geom_line(linewidth = 1.2) + geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = .08) +
labs(x = "Predictability", y = "Predicted boundary count", color = "Interval", title = "Predicted Boundary Count by Predictability and Interval") +
theme_minimal(base_size = 14)
emm <- emmeans(model_interval_int, ~ predictability * Interval)
## Cannot use mode = "kenward-roger" because *pbkrtest* package is not installed
emm_df <- as.data.frame(emm)
ggplot(emm_df, aes(x = predictability, y = emmean, color = Interval, group = Interval)) +
geom_point(size = 3) +
geom_line(linewidth = 1.2) +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.08) +
labs(x = "Predictability", y = "Predicted boundary count", color = "Interval", title = "Predicted Boundary Count by Predictability x Interval") +
theme_minimal(base_size = 14)