# Distribution of images per category ----
# Histogram of `n_total` from `images_per_cat`, using one-unit-wide bins so
# each bar's height is the number of categories falling in that bin.
ggplot(images_per_cat, aes(x = n_total)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of images per category",
    subtitle = "From annotations.csv, shuffled_index == 1",
    x = "# images in category",
    y = "# categories"
  )
1. Invalid Image Rate per Category
The denominator for each category is n_total from all_annotations — the actual number of images in that category’s ground-truth bucket, which varies across categories.
Code
# Invalid-image rate per rater and category ----

# Number of distinct filenames each rater flagged as invalid, per category.
# (rater, class) pairs with nothing flagged are absent from this table; they
# are restored as explicit zeros further down via the full grid.
n_invalid_per_rater <- group_by(invalid_df, rater, class) |>
  summarise(n_invalid = n_distinct(filename), .groups = "drop")

# All raters that exist in the data.
raters <- distinct(df_raw, rater)

# Full grid: every rater crossed with every category listed in
# `images_per_cat` (the ground-truth universe of categories).
full_grid <- cross_join(raters, select(images_per_cat, class))

# Attach invalid counts (missing -> 0), then the true denominator `n_total`
# for each category, and compute the per-category invalid proportion.
invalid_rate <- full_grid |>
  left_join(n_invalid_per_rater, by = join_by(rater, class)) |>
  mutate(n_invalid = coalesce(n_invalid, 0L)) |>
  left_join(images_per_cat, by = join_by(class)) |>
  mutate(prop_invalid = n_invalid / n_total)

# Per-rater averages over categories. The mean is reported as proportion
# *valid* (1 - mean invalid); the SD of `prop_invalid` is numerically the same
# as the SD of the valid proportion, so it is reported under that label.
avg_valid <- group_by(invalid_rate, rater) |>
  summarise(
    mean_n = round(mean(n_invalid), 2),
    mean_prop = round(1 - mean(prop_invalid), 3),
    sd_prop = round(sd(prop_invalid), 3),
    .groups = "drop"
  )

knitr::kable(
  avg_valid,
  caption = "Average valid image proportion per rater first summarized within category",
  col.names = c(
    "Rater",
    "Mean # invalid",
    "Mean proportion valid",
    "SD proportion valid"
  )
)
Average valid image proportion per rater first summarized within category
The binary matrix is built by left-joining rater judgments onto the all_annotations universe. Every (class, filename) pair gets 1 if that rater marked it invalid, 0 otherwise. This means:
The 0s are genuine valid judgments, not missing data.
Each category contributes its true n_total rows, not a fixed 25.
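Only categories present in full_grid are included. Note that full_grid is built from every category in images_per_cat, so this equals "categories seen by both raters" only if images_per_cat is itself restricted to those categories.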
Code
# Binary invalid/valid matrix for inter-rater reliability ----

# Categories in scope: every class that appears in the rater x category grid.
cats <- unique(full_grid$class)

# One row per annotation in those categories; this is the set of rows each
# rater's judgments get joined onto.
universe <- filter(all_annotations, class %in% cats)

# Distinct (class, filename) pairs that the given rater marked invalid.
invalid_pairs_for <- function(rater_label) {
  invalid_df |>
    filter(rater == rater_label) |>
    select(class, filename) |>
    distinct()
}

r1_invalid <- mutate(invalid_pairs_for("Rater 1"), r1 = 1L)
r2_invalid <- mutate(invalid_pairs_for("Rater 2"), r2 = 1L)

# Left-join both raters' flags onto the universe; a pair with no flag from a
# rater is coded 0 for that rater.
irr_df <- universe |>
  left_join(r1_invalid, by = join_by(class, filename)) |>
  left_join(r2_invalid, by = join_by(class, filename)) |>
  mutate(across(c(r1, r2), \(flag) coalesce(flag, 0L)))

# Sanity: compare actual row counts per category against `n_total`.
irr_df |>
  count(class, name = "n_rows") |>
  left_join(images_per_cat, by = join_by(class)) |>
  summarise(
    all_match = all(n_rows == n_total),
    n_mismatch = sum(n_rows != n_total)
  )
# A tibble: 2 × 5
# Groups: rater, order_index, class [2]
rater order_index class filename n
<chr> <int> <chr> <chr> <int>
1 Rater 1 90 sandwich Z 6
2 Rater 2 164 window nonobject_tile_0163.3ee48aea1ad9f03274f9.j… 4
Filename ‘Z’ indicates a bug somewhere in our attention-check code. Manual inspection of nonobject_tile_0163.jpg shows that it is a random crop that includes a window and chairs, among other things, which made it hard to identify.
Todos: 1) Look through the attention checks and remove any that match our existing categories; also figure out whether there is a bug behind filename ‘Z’ in the attention checks. 2) The entire study takes ~80 minutes. Break it up into two parts? 3) Run through the study myself to see whether one of the raters is more reliable. 4) Think about using the AI taskers mode on Prolific.