BabyView Objects Prolific Validation

Code

library(tidyverse)
library(ggplot2)
library(scales)
library(irr)
library(ggrepel)

OOD_CLASSES <- c("__OOD_BV_CROP__", "__OOD_SYNTHETIC__")

theme_set(
  theme_minimal(base_size = 12) +
    theme(
      plot.title    = element_text(hjust = 0.5, face = "bold", size = 13),
      plot.subtitle = element_text(hjust = 0.5, color = "gray40", size = 10),
      strip.text    = element_text(face = "bold")
    )
)

Code

df_raw <- read.csv("bv_objects.prolific_responses.csv") |>
  filter(!class %in% OOD_CLASSES) |>
  mutate(
    order_index   = as.integer(order_index),
    reaction_time = as.numeric(reaction_time)
  ) |>
  mutate(
    rater = {
      long_ids <- unique(worker_id[nchar(worker_id) > 10])
      id_to_label <- setNames(
        paste0("Rater ", seq_along(long_ids)),
        long_ids
      )
      ifelse(nchar(worker_id) > 10, id_to_label[worker_id], NA_character_)
    }
  ) |>
  filter(!is.na(rater))

# Ground-truth universe: the canonical (class, filename) pairs both raters evaluated.
# Both raters saw shuffled_index == 1. Note: categories do NOT all have 25 images —
# n_total varies per category and is computed from this table.
all_annotations <- read.csv("annotations.csv") |>
  filter(shuffled_index == 1) |>
  filter(!class %in% OOD_CLASSES) |>
  select(class, filename) |>
  distinct()

# True denominator per category — varies, do not assume 25
images_per_cat <- all_annotations |>
  count(class, name = "n_total")

invalid_df <- df_raw |> filter(trial_type == "invalid_drawing")
attn_df    <- df_raw |> filter(trial_type == "attention_check")

Overview

Code

df_raw |>
  count(rater, trial_type) |>
  pivot_wider(names_from = trial_type, values_from = n, values_fill = 0) |>
  rename(
    Rater              = rater,
    `Invalid drawings` = invalid_drawing,
    `Attention checks` = attention_check
  ) |>
  knitr::kable(caption = "Rows per rater by trial type")

Rows per rater by trial type
Rater	Attention checks	Invalid drawings
Rater 1	12	1488
Rater 2	6	2449

Code

images_per_cat |>
  summarise(
    `# categories`  = n(),
    `Min images`    = min(n_total),
    `Max images`    = max(n_total),
    `Median images` = median(n_total),
    `Mean images`   = round(mean(n_total), 1)
  ) |>
  knitr::kable(caption = "Ground-truth universe summary (annotations.csv, shuffled_index == 1)")

Ground-truth universe summary (annotations.csv, shuffled_index == 1)
# categories	Min images	Max images	Median images	Mean images
163	1	25	25	23.9

Code

# Distribution of images per category
images_per_cat |>
  ggplot(aes(x = n_total)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of images per category",
    subtitle = "From annotations.csv, shuffled_index == 1",
    x = "# images in category", y = "# categories"
  )

1. Invalid Image Rate per Category

The denominator for each category is n_total from all_annotations — the actual number of images in that category’s ground-truth bucket, which varies across categories.

Code

n_invalid_per_rater <- invalid_df |>
  group_by(rater, class) |>
  summarise(n_invalid = n_distinct(filename), .groups = "drop")

# Join true denominator; categories a rater marked 0 invalid won't appear above,
# All raters that exist in the data
raters <- df_raw |> distinct(rater)

# Full grid: every rater × every category in the annotation universe
full_grid <- raters |>
  cross_join(images_per_cat |> select(class)) # <-- ground truth universe

invalid_rate <- full_grid |>
  left_join(n_invalid_per_rater, by = c("rater", "class")) |>
  replace_na(list(n_invalid = 0L)) |>
  left_join(images_per_cat, by = "class") |>
  mutate(prop_invalid = n_invalid / n_total)

avg_valid <- invalid_rate |>
  group_by(rater) |>
  summarise(
    mean_n    = round(mean(n_invalid), 2),
    mean_prop = round(1 - mean(prop_invalid), 3),
    sd_prop   = round(sd(prop_invalid), 3),
    .groups   = "drop"
  )

avg_valid |>
  knitr::kable(
    caption   = "Average valid image proportion per rater first summarized within category",
    col.names = c("Rater", "Mean # invalid", "Mean proportion valid", "SD proportion valid")
  )

Average valid image proportion per rater first summarized within category
Rater	Mean # invalid	Mean proportion valid	SD proportion valid
Rater 1	9.13	0.625	0.265
Rater 2	15.02	0.377	0.312

Code

invalid_rate |> group_by(rater) |>
  summarize(n_total = sum(n_total), n_valid = n_total - sum(n_invalid), prop_valid = round(n_valid / n_total, 3)) |>
    knitr::kable(
    caption   = "Average valid image proportion per rater summed across images",
    col.names = c("Rater", "Total images", "Total valid images", "Proportion valid")
  )

Average valid image proportion per rater summed across images
Rater	Total images	Total valid images	Proportion valid
Rater 1	3901	2413	0.619
Rater 2	3901	1452	0.372

Code

cat_order <- invalid_rate |>
  group_by(class) |>
  summarise(mean_prop = mean(prop_invalid), .groups = "drop") |>
  arrange(desc(mean_prop)) |>
  pull(class)

invalid_rate |>
  mutate(class = factor(class, levels = rev(cat_order))) |>
  ggplot(aes(x = prop_invalid, y = class, fill = rater)) +
  geom_col(position = position_dodge(0.7), width = 0.6, alpha = 0.85) +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "firebrick", linewidth = 0.5) +
  scale_x_continuous(labels = percent_format(), limits = c(0, 1.05)) +
  scale_fill_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  labs(
    title    = "Invalid Image Rate per Category",
    subtitle = "Dashed line = 50% threshold; denominator = actual # images per category",
    x        = "Proportion marked invalid",
    y        = NULL,
    fill     = NULL
  ) +
  theme(
    legend.position = "top",
    axis.text.y     = element_text(size = 7)
  )

2. Inter-Rater Reliability (IRR)

The binary matrix is built by left-joining rater judgments onto the all_annotations universe. Every (class, filename) pair gets 1 if that rater marked it invalid, 0 otherwise. This means:

The 0s are genuine valid judgments, not missing data.
Each category contributes its true n_total rows, not a fixed 25.
Only categories seen by both raters are included.

Code

cats  <- full_grid |> distinct(class) |> pull(class)

universe <- all_annotations |> filter(class %in% cats)

r1_invalid <- invalid_df |>
  filter(rater == "Rater 1") |>
  select(class, filename) |> distinct() |> mutate(r1 = 1L)

r2_invalid <- invalid_df |>
  filter(rater == "Rater 2") |>
  select(class, filename) |> distinct() |> mutate(r2 = 1L)

irr_df <- universe |>
  left_join(r1_invalid, by = c("class", "filename")) |>
  left_join(r2_invalid, by = c("class", "filename")) |>
  replace_na(list(r1 = 0L, r2 = 0L))

# Sanity: show actual row counts vs n_total per category
irr_df |>
  count(class, name = "n_rows") |>
  left_join(images_per_cat, by = "class") |>
  summarise(
    all_match = all(n_rows == n_total),
    n_mismatch = sum(n_rows != n_total)
  )

  all_match n_mismatch
1      TRUE          0

2a. Overall Cohen’s Kappa

Code

kappa_result <- kappa2(irr_df |> select(r1, r2) |> as.matrix())
pct_agree    <- mean(irr_df$r1 == irr_df$r2)

tibble(
  Metric = c("# images in universe", "% agreement", "Cohen's κ", "z"),
  Value  = c(
    nrow(irr_df),
    percent(pct_agree, accuracy = 0.1),
    round(kappa_result$value, 3),
    round(kappa_result$statistic, 2)
  )
) |>
  knitr::kable(caption = "Overall IRR across all shared categories")

Overall IRR across all shared categories
Metric	Value
# images in universe	3901
% agreement	67.4%
Cohen’s κ	0.385
z	27.13

2b. Per-Category Kappa Heatmap

Code

kappa_per_cat <- irr_df |>
  group_by(class) |>
  summarise(
    n_total   = n(),
    n_r1      = sum(r1),
    n_r2      = sum(r2),
    pct_agree = mean(r1 == r2),
    kappa     = tryCatch(kappa2(cbind(r1, r2))$value, error = \(e) NA_real_),
    .groups   = "drop"
  )

kappa_per_cat |>
  filter(!is.na(kappa), kappa != 0) |>
  slice_max(order_by = kappa, n = 10) |>
  bind_rows(
    kappa_per_cat |>
      filter(!is.na(kappa), kappa != 0) |>
      slice_min(order_by = kappa, n = 15)
  ) |>
  distinct(class, .keep_all = TRUE) |>
  mutate(class = fct_reorder(class, kappa)) |>
  ggplot(aes(x = 1, y = class, fill = kappa)) +
  geom_tile(color = "white", linewidth = 0.3) +
  geom_text(aes(label = round(kappa, 2)), size = 4) +
  scale_fill_gradient2(
    low = "#D73027", mid = "white", high = "#1A9850",
    midpoint = 0, limits = c(-1, 1), name = "κ"
  ) +
  scale_x_continuous(breaks = NULL) +
  labs(
    title    = "Per-Category Cohen's κ (Top & Bottom 10-15)",
    subtitle = "Green = high agreement, Red = systematic disagreement",
    x = NULL, y = NULL
  ) +
  theme(axis.text.y = element_text(size = 14), axis.ticks = element_blank())

2c. Confusion Matrix: Rater Decisions

Code

irr_df |>
  mutate(
    r1_label = if_else(r1 == 1L, "Invalid", "Valid"),
    r2_label = if_else(r2 == 1L, "Invalid", "Valid")
  ) |>
  count(r1_label, r2_label) |>
  group_by(r1_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = r2_label, y = r1_label, fill = prop)) +
  geom_tile(color = "gray80", linewidth = 0.5) +
  geom_text(aes(label = paste0(n, "\n(", percent(prop, 1), ")")),
            size = 4, lineheight = 1.3) +
  scale_fill_gradient(low = "white", high = "steelblue", name = "Row\nprop.") +
  labs(
    title    = "Rater Agreement Confusion Matrix",
    subtitle = "Rows = Rater 1, Cols = Rater 2; 0s from ground-truth universe",
    x = "Rater 2", y = "Rater 1"
  ) +
  coord_fixed()

2d. Invalid Count Heatmap (Both Raters, Shared Categories)

Labels show n_invalid / n_total so you can see the varying denominators directly.

Code

mean_prop_cat <- kappa_per_cat |>
  select(class, n_total, n_r1, n_r2) |>
  pivot_longer(c(n_r1, n_r2), names_to = "rater", values_to = "n_invalid") |>
  mutate(
    rater    = recode(rater, n_r1 = "Rater 1", n_r2 = "Rater 2"),
    prop     = n_invalid / n_total
  ) |>
  group_by(class) |>
  mutate(mean_prop = mean(prop)) |>
  ungroup() |>
  mutate(class = fct_reorder(class, mean_prop))

Code

mean_prop_cat |> 
  slice_max(order_by = mean_prop, n = 40) |>
  bind_rows(
    mean_prop_cat |>
      slice_min(order_by = mean_prop, n = 40)
  ) |>
  ggplot(aes(x = rater, y = class, fill = prop)) +
  geom_tile(color = "white", linewidth = 0.3) +
  geom_text(aes(label = paste0(n_invalid, "/", n_total)), size = 4) +
  scale_fill_gradient(
    low = "#F7F7F7", high = "#D73027",
    labels = percent_format(), name = "prop\ninvalid"
  ) +
  labs(
    title    = "Invalid Image Count per Category per Rater (Top and bottom 10)",
    subtitle = "Labels = n_invalid / n_total",
    x = NULL, y = NULL
  ) +
  theme(axis.text.y = element_text(size = 12), axis.ticks = element_blank())

3. Time-Series: order_index vs. Invalid Images & Attention Checks

3a. Invalid Images Over Time

Code

invalid_per_block <- invalid_df |>
  group_by(rater, order_index) |>
  summarise(n_invalid = n_distinct(filename), .groups = "drop")

attn_per_block <- attn_df |>
  group_by(rater, order_index) |>
  summarise(n_attn = n(), .groups = "drop")

ggplot(invalid_per_block, aes(x = order_index, y = n_invalid, color = rater)) +
  geom_point(alpha = 0.45, size = 1.5) +
  geom_smooth(method = "loess", span = 0.35, se = TRUE, linewidth = 1.1) +
  scale_color_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  labs(
    title    = "Invalid Images per Block Over Time",
    subtitle = "Each point = one category block; ribbon = loess ± SE",
    x = "Order index (block number)", y = "# images marked invalid", color = NULL
  ) +
  theme(legend.position = "top")

3b. Attention Checks Over Time

Code

ggplot(attn_per_block, aes(x = order_index, y = n_attn, color = rater)) +
  geom_segment(aes(xend = order_index, yend = 0), alpha = 0.5) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("Rater 1" = "#4393C3", "Rater 2" = "#D6604D")) +
  scale_y_continuous(breaks = 0:8) +
  facet_wrap(~rater, ncol = 1) +
  labs(
    title    = "Attention Checks per Block Over Time",
    subtitle = "Each spike = a block where attention checks were recorded",
    x = "Order index", y = "# attention checks", color = NULL
  ) +
  theme(legend.position = "none")

investigating the 6 attention check fail block

Code

attn_df |> group_by(rater, order_index, class, filename) |> summarize(n = n()) |> filter(n >= 4)

# A tibble: 2 × 5
# Groups:   rater, order_index, class [2]
  rater   order_index class    filename                                        n
  <chr>         <int> <chr>    <chr>                                       <int>
1 Rater 1          90 sandwich Z                                               6
2 Rater 2         164 window   nonobject_tile_0163.3ee48aea1ad9f03274f9.j…     4

Filename ‘Z’ indicates some bug within our attention check code..manual inspection of nonobject_tile_0163.jpg shows that it’s a random crop that includes a window and chairs among other things which made it hard to identify.

Todos: 1) Look through attention checks and remove any that match our existing categories, figure out if there’s a bug with filename ‘Z’ in attention checks. 2) ~80 minutes for the entire study. Break up into two parts? 3) Run through the study on my own to see if one of the raters is more reliable 4) Think about using the AI taskers mode on Prolific