BabyView Objects Prolific Validation

Code
library(tidyverse)
library(ggplot2)
library(scales)
library(irr)
library(ggrepel)

OOD_CLASSES <- c("__OOD_BV_CROP__", "__OOD_SYNTHETIC__")

theme_set(
  theme_minimal(base_size = 12) +
    theme(
      plot.title    = element_text(hjust = 0.5, face = "bold", size = 13),
      plot.subtitle = element_text(hjust = 0.5, color = "gray40", size = 10),
      strip.text    = element_text(face = "bold")
    )
)
Code
df_raw <- read.csv("bv_objects.prolific_responses.csv") |>
  filter(!class %in% OOD_CLASSES) |>
  mutate(
    order_index   = as.integer(order_index),
    reaction_time = as.numeric(reaction_time)
  ) |>
  mutate(
    rater = {
      long_ids <- unique(worker_id[nchar(worker_id) > 10])
      id_to_label <- setNames(
        paste0("Rater ", seq_along(long_ids)),
        long_ids
      )
      ifelse(nchar(worker_id) > 10, id_to_label[worker_id], NA_character_)
    }
  ) |>
  filter(!is.na(rater))

# Ground-truth universe: the canonical (class, filename) pairs both raters evaluated.
# Both raters saw shuffled_index == 1. Note: categories do NOT all have 25 images —
# n_total varies per category and is computed from this table.
all_annotations <- read.csv("annotations.csv") |>
  filter(shuffled_index == 1) |>
  filter(!class %in% OOD_CLASSES) |>
  select(class, filename) |>
  distinct()

# True denominator per category — varies, do not assume 25
images_per_cat <- all_annotations |>
  count(class, name = "n_total")

invalid_df <- df_raw |> filter(trial_type == "invalid_drawing")
attn_df    <- df_raw |> filter(trial_type == "attention_check")

Overview

Code
df_raw |>
  count(rater, trial_type) |>
  pivot_wider(names_from = trial_type, values_from = n, values_fill = 0) |>
  rename(
    Rater              = rater,
    `Invalid drawings` = invalid_drawing,
    `Attention checks` = attention_check
  ) |>
  knitr::kable(caption = "Rows per rater by trial type")
Rows per rater by trial type
Rater Attention checks Invalid drawings
Rater 1 12 1488
Rater 2 6 2449
Code
images_per_cat |>
  summarise(
    `# categories`  = n(),
    `Min images`    = min(n_total),
    `Max images`    = max(n_total),
    `Median images` = median(n_total),
    `Mean images`   = round(mean(n_total), 1)
  ) |>
  knitr::kable(caption = "Ground-truth universe summary (annotations.csv, shuffled_index == 1)")
Ground-truth universe summary (annotations.csv, shuffled_index == 1)
# categories Min images Max images Median images Mean images
163 1 25 25 23.9
Code
# Distribution of images per category
images_per_cat |>
  ggplot(aes(x = n_total)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of images per category",
    subtitle = "From annotations.csv, shuffled_index == 1",
    x = "# images in category", y = "# categories"
  )


1. Invalid Image Rate per Category

The denominator for each category is n_total from all_annotations — the actual number of images in that category’s ground-truth bucket, which varies across categories.

Code
n_invalid_per_rater <- invalid_df |>
  group_by(rater, class) |>
  summarise(n_invalid = n_distinct(filename), .groups = "drop")

# Join true denominator; categories a rater marked 0 invalid won't appear above,
# All raters that exist in the data
raters <- df_raw |> distinct(rater)

# Full grid: every rater × every category in the annotation universe
full_grid <- raters |>
  cross_join(images_per_cat |> select(class)) # <-- ground truth universe

invalid_rate <- full_grid |>
  left_join(n_invalid_per_rater, by = c("rater", "class")) |>
  replace_na(list(n_invalid = 0L)) |>
  left_join(images_per_cat, by = "class") |>
  mutate(prop_invalid = n_invalid / n_total)

avg_valid <- invalid_rate |>
  group_by(rater) |>
  summarise(
    mean_n    = round(mean(n_invalid), 2),
    mean_prop = round(1 - mean(prop_invalid), 3),
    sd_prop   = round(sd(prop_invalid), 3),
    .groups   = "drop"
  )

avg_valid |>
  knitr::kable(
    caption   = "Average valid image proportion per rater first summarized within category",
    col.names = c("Rater", "Mean # invalid", "Mean proportion valid", "SD proportion valid")
  )
Average valid image proportion per rater first summarized within category
Rater Mean # invalid Mean proportion valid SD proportion valid
Rater 1 9.13 0.625 0.265
Rater 2 15.02 0.377 0.312
Code
invalid_rate |> group_by(rater) |>
  summarize(n_total = sum(n_total), n_valid = n_total - sum(n_invalid), prop_valid = round(n_valid / n_total, 3)) |>
    knitr::kable(
    caption   = "Average valid image proportion per rater summed across images",
    col.names = c("Rater", "Total images", "Total valid images", "Proportion valid")
  )
Average valid image proportion per rater summed across images
Rater Total images Total valid images Proportion valid
Rater 1 3901 2413 0.619
Rater 2 3901 1452 0.372
Code
cat_order <- invalid_rate |>
  group_by(class) |>
  summarise(mean_prop = mean(prop_invalid), .groups = "drop") |>
  arrange(desc(mean_prop)) |>
  pull(class)

invalid_rate |>
  mutate(class = factor(class, levels = rev(cat_order))) |>
  ggplot(aes(x = prop_invalid, y = class, fill = rater)) +
  geom_col(position = position_dodge(0.7), width = 0.6, alpha = 0.85) +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "firebrick", linewidth = 0.5) +
  scale_x_continuous(labels = percent_format(), limits = c(0, 1.05)) +
  scale_fill_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  labs(
    title    = "Invalid Image Rate per Category",
    subtitle = "Dashed line = 50% threshold; denominator = actual # images per category",
    x        = "Proportion marked invalid",
    y        = NULL,
    fill     = NULL
  ) +
  theme(
    legend.position = "top",
    axis.text.y     = element_text(size = 7)
  )


2. Inter-Rater Reliability (IRR)

The binary matrix is built by left-joining rater judgments onto the all_annotations universe. Every (class, filename) pair gets 1 if that rater marked it invalid, 0 otherwise. This means:

  • The 0s are genuine valid judgments, not missing data.
  • Each category contributes its true n_total rows, not a fixed 25.
  • Only categories seen by both raters are included.
Code
cats  <- full_grid |> distinct(class) |> pull(class)

universe <- all_annotations |> filter(class %in% cats)

r1_invalid <- invalid_df |>
  filter(rater == "Rater 1") |>
  select(class, filename) |> distinct() |> mutate(r1 = 1L)

r2_invalid <- invalid_df |>
  filter(rater == "Rater 2") |>
  select(class, filename) |> distinct() |> mutate(r2 = 1L)

irr_df <- universe |>
  left_join(r1_invalid, by = c("class", "filename")) |>
  left_join(r2_invalid, by = c("class", "filename")) |>
  replace_na(list(r1 = 0L, r2 = 0L))

# Sanity: show actual row counts vs n_total per category
irr_df |>
  count(class, name = "n_rows") |>
  left_join(images_per_cat, by = "class") |>
  summarise(
    all_match = all(n_rows == n_total),
    n_mismatch = sum(n_rows != n_total)
  )
  all_match n_mismatch
1      TRUE          0

2a. Overall Cohen’s Kappa

Code
kappa_result <- kappa2(irr_df |> select(r1, r2) |> as.matrix())
pct_agree    <- mean(irr_df$r1 == irr_df$r2)

tibble(
  Metric = c("# images in universe", "% agreement", "Cohen's κ", "z"),
  Value  = c(
    nrow(irr_df),
    percent(pct_agree, accuracy = 0.1),
    round(kappa_result$value, 3),
    round(kappa_result$statistic, 2)
  )
) |>
  knitr::kable(caption = "Overall IRR across all shared categories")
Overall IRR across all shared categories
Metric Value
# images in universe 3901
% agreement 67.4%
Cohen’s κ 0.385
z 27.13

2b. Per-Category Kappa Heatmap

Code
kappa_per_cat <- irr_df |>
  group_by(class) |>
  summarise(
    n_total   = n(),
    n_r1      = sum(r1),
    n_r2      = sum(r2),
    pct_agree = mean(r1 == r2),
    kappa     = tryCatch(kappa2(cbind(r1, r2))$value, error = \(e) NA_real_),
    .groups   = "drop"
  )

kappa_per_cat |>
  filter(!is.na(kappa), kappa != 0) |>
  slice_max(order_by = kappa, n = 10) |>
  bind_rows(
    kappa_per_cat |>
      filter(!is.na(kappa), kappa != 0) |>
      slice_min(order_by = kappa, n = 15)
  ) |>
  distinct(class, .keep_all = TRUE) |>
  mutate(class = fct_reorder(class, kappa)) |>
  ggplot(aes(x = 1, y = class, fill = kappa)) +
  geom_tile(color = "white", linewidth = 0.3) +
  geom_text(aes(label = round(kappa, 2)), size = 4) +
  scale_fill_gradient2(
    low = "#D73027", mid = "white", high = "#1A9850",
    midpoint = 0, limits = c(-1, 1), name = "κ"
  ) +
  scale_x_continuous(breaks = NULL) +
  labs(
    title    = "Per-Category Cohen's κ (Top & Bottom 10-15)",
    subtitle = "Green = high agreement, Red = systematic disagreement",
    x = NULL, y = NULL
  ) +
  theme(axis.text.y = element_text(size = 14), axis.ticks = element_blank())

2c. Confusion Matrix: Rater Decisions

Code
irr_df |>
  mutate(
    r1_label = if_else(r1 == 1L, "Invalid", "Valid"),
    r2_label = if_else(r2 == 1L, "Invalid", "Valid")
  ) |>
  count(r1_label, r2_label) |>
  group_by(r1_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = r2_label, y = r1_label, fill = prop)) +
  geom_tile(color = "gray80", linewidth = 0.5) +
  geom_text(aes(label = paste0(n, "\n(", percent(prop, 1), ")")),
            size = 4, lineheight = 1.3) +
  scale_fill_gradient(low = "white", high = "steelblue", name = "Row\nprop.") +
  labs(
    title    = "Rater Agreement Confusion Matrix",
    subtitle = "Rows = Rater 1, Cols = Rater 2; 0s from ground-truth universe",
    x = "Rater 2", y = "Rater 1"
  ) +
  coord_fixed()

2d. Invalid Count Heatmap (Both Raters, Shared Categories)

Labels show n_invalid / n_total so you can see the varying denominators directly.

Code
mean_prop_cat <- kappa_per_cat |>
  select(class, n_total, n_r1, n_r2) |>
  pivot_longer(c(n_r1, n_r2), names_to = "rater", values_to = "n_invalid") |>
  mutate(
    rater    = recode(rater, n_r1 = "Rater 1", n_r2 = "Rater 2"),
    prop     = n_invalid / n_total
  ) |>
  group_by(class) |>
  mutate(mean_prop = mean(prop)) |>
  ungroup() |>
  mutate(class = fct_reorder(class, mean_prop)) 
Code
mean_prop_cat |> 
  slice_max(order_by = mean_prop, n = 40) |>
  bind_rows(
    mean_prop_cat |>
      slice_min(order_by = mean_prop, n = 40)
  ) |>
  ggplot(aes(x = rater, y = class, fill = prop)) +
  geom_tile(color = "white", linewidth = 0.3) +
  geom_text(aes(label = paste0(n_invalid, "/", n_total)), size = 4) +
  scale_fill_gradient(
    low = "#F7F7F7", high = "#D73027",
    labels = percent_format(), name = "prop\ninvalid"
  ) +
  labs(
    title    = "Invalid Image Count per Category per Rater (Top and bottom 10)",
    subtitle = "Labels = n_invalid / n_total",
    x = NULL, y = NULL
  ) +
  theme(axis.text.y = element_text(size = 12), axis.ticks = element_blank())


3. Time-Series: order_index vs. Invalid Images & Attention Checks

3a. Invalid Images Over Time

Code
invalid_per_block <- invalid_df |>
  group_by(rater, order_index) |>
  summarise(n_invalid = n_distinct(filename), .groups = "drop")

attn_per_block <- attn_df |>
  group_by(rater, order_index) |>
  summarise(n_attn = n(), .groups = "drop")

ggplot(invalid_per_block, aes(x = order_index, y = n_invalid, color = rater)) +
  geom_point(alpha = 0.45, size = 1.5) +
  geom_smooth(method = "loess", span = 0.35, se = TRUE, linewidth = 1.1) +
  scale_color_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  labs(
    title    = "Invalid Images per Block Over Time",
    subtitle = "Each point = one category block; ribbon = loess ± SE",
    x = "Order index (block number)", y = "# images marked invalid", color = NULL
  ) +
  theme(legend.position = "top")

3b. Attention Checks Over Time

Code
ggplot(attn_per_block, aes(x = order_index, y = n_attn, color = rater)) +
  geom_segment(aes(xend = order_index, yend = 0), alpha = 0.5) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  scale_y_continuous(breaks = 0:8) +
  facet_wrap(~rater, ncol = 1) +
  labs(
    title    = "Attention Checks per Block Over Time",
    subtitle = "Each spike = a block where attention checks were recorded",
    x = "Order index", y = "# attention checks", color = NULL
  ) +
  theme(legend.position = "none")

investigating the 6 attention check fail block

Code
attn_df |> group_by(rater, order_index, class, filename) |> summarize(n = n()) |> filter(n >= 4)
# A tibble: 2 × 5
# Groups:   rater, order_index, class [2]
  rater   order_index class    filename                                        n
  <chr>         <int> <chr>    <chr>                                       <int>
1 Rater 1          90 sandwich Z                                               6
2 Rater 2         164 window   nonobject_tile_0163.3ee48aea1ad9f03274f9.j…     4

Filename ‘Z’ indicates some bug within our attention check code..manual inspection of nonobject_tile_0163.jpg shows that it’s a random crop that includes a window and chairs among other things which made it hard to identify.

Todos: 1) Look through attention checks and remove any that match our existing categories, figure out if there’s a bug with filename ‘Z’ in attention checks. 2) ~80 minutes for the entire study. Break up into two parts? 3) Run through the study on my own to see if one of the raters is more reliable 4) Think about using the AI taskers mode on Prolific