Choosing stimuli for experiment 2 of VVI

Published

August 4, 2025

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
here() starts at /Users/visuallearninglab/Documents/visvocab


Attaching package: 'jsonlite'


The following object is masked from 'package:purrr':

    flatten

Stimuli selection and formatting

# Storing stimuli on our Ubuntu server
STIMULI_PATH = "https://ucsdlearninglabs.org/stimuli/"
PROJECT_VERSION = Sys.getenv("PROJECT_VERSION")
# The directory the current file is located in
SOURCE_CODE_PATH = here("stimuli", "lookit")
DEST_PATH <- file.path(SOURCE_CODE_PATH, "exp2")
# Using regular destination path so that our file converter script can convert them and store them in mp3 and ogg folders
DEST_AUDIO_PATH <- file.path(DEST_PATH)

Grabbing trial, AoA and THINGS metadata

Grabbing all THINGSplus data

things_stimuli_metadata <- read_tsv(file.path(SOURCE_CODE_PATH, "preprocessing", "older_stimuli", "_concepts-metadata_things.tsv")) |> mutate(is_animate = grepl("animal", `Top-down Category (WordNet)`), food_item = grepl("food|fruit|vegetable", `Top-down Category (WordNet)`))

Rows: 1854 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (14): Word, uniqueID, WordNet ID, Wordnet ID2, Wordnet ID3, Wordnet ID4,...
dbl (11): Bigram, Percent_known, Rank (combining COCA/concreteness), Concret...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

things_images_metadata <- read_tsv(file.path(SOURCE_CODE_PATH, "preprocessing", "older_stimuli", "_images-metadata_things.tsv")) |> filter(grepl("1b", image))

Rows: 27961 Columns: 21
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (16): image, Word, uniqueID, display_label, answer, RT, recognizability_...
dbl  (5): index, recognizability_N-ratings, recognizability_N-correct, recog...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

exp1_metadata <- read_csv(here("data", "metadata", "level-trialtype_data.csv")) |> filter(!grepl("distractor", Trials.trialID))

Rows: 32 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): Trials.trialID, Trials.targetImage, Trials.distractorImage, Trials...
dbl (17): image_similarity, text_similarity, multimodal_similarity, stimuli_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

aoa_ratings_kuperman <- read_csv(file.path(SOURCE_CODE_PATH, "preprocessing", "older_stimuli", "aoa_ratings_kuperman.csv"))

Rows: 31318 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Word, Freq_pm, PoS
dbl (5): OccurTotal, OccurNum, Rating.Mean, Rating.SD, Dunno

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

aoa_ratings_wordbank <- read_csv(file.path(SOURCE_CODE_PATH,
                                           "preprocessing", "older_stimuli",
                                           "wordbank_aoa_ratings.csv"))

Rows: 1077 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): item_definition, measure
dbl (3): intercept, slope, aoa

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

things_images_dir <- here("data", "raw", "THINGS")

Looking at existing similarity values, it’s clear that the distribution is neither uniform nor normal. Our goal is to try to make this distribution more uniform with our stimuli choices for experiment 2.

exp1_long <- exp1_metadata %>%
  pivot_longer(cols = c(text_similarity, image_similarity),
               names_to = "similarity_type",
               values_to = "similarity")


ggplot(exp1_long, aes(x = similarity)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white") +
  labs(x = "Similarity", y = "Count") +
  theme_minimal() +
  facet_wrap(~ similarity_type, scales = "free_x")

Filtering down stimuli based on AoAs and category

aoa_ratings_wordbank <- aoa_ratings_wordbank %>% 
  ##--- 1.  normalise the item names ------------------------------------------
  mutate(
    # drop parentheses and what’s inside, e.g. "call (on phone)" -> "call"
    item_definition = str_remove(item_definition, "\\s*\\([^)]*\\)"),
    # tidy whitespace that might be left behind
    item_definition = str_trim(item_definition)
  ) %>% 
  # split on “/”, making extra rows: "soda/pop" -> "soda" + "pop"
  separate_rows(item_definition, sep = "/") %>% 
  
  ##--- 2.  deal with duplicates *within* each measure -------------------------
  group_by(item_definition, measure) %>% 
  summarise(aoa = mean(aoa, na.rm = TRUE), .groups = "drop") %>% 
  
  ##--- 3.  pivot wider --------------------------------------------------------
  pivot_wider(
    names_from   = measure,
    values_from  = aoa,
    names_prefix = "aoa_"
  )

stimuli_with_aoa <- things_stimuli_metadata |> left_join(aoa_ratings_kuperman, by="Word") |> left_join(aoa_ratings_wordbank, by=c("Word"="item_definition")) |> filter(!is.na(Rating.Mean) | !is.na(aoa_produces)) |> filter(!(Word %in% c(exp1_metadata$Trials.targetImage, exp1_metadata$Trials.distractorImage)))

library(tidyverse)
library(fuzzyjoin)   # install.packages("fuzzyjoin") if needed

# 1. Select only the columns we need
stim_sub <- stimuli_with_aoa %>% 
  filter(!food_item) |>
  select(
    Word, is_animate,
    Rating.Mean,
    aoa_produces,
    uniqueID
  ) 

# 2. ------------------ KUPERMAN PAIRS (Rating.Mean)
kuper_pairs <- stim_sub %>% 
  filter(!is.na(Rating.Mean), Rating.Mean < 6) -> kuper_df

kuper_pairs <- difference_inner_join(
    kuper_df, kuper_df,
    by = "Rating.Mean",
    max_dist = 0.5, distance_col = NULL
  ) %>% 
  filter(is_animate.x == is_animate.y,
         Word.x < Word.y) %>% 
  transmute(
    text1 = Word.x,
    text2 = Word.y,
    source = "Kuperman",
    aoa_word1 = Rating.Mean.x,
    aoa_word2 = Rating.Mean.y,
    is_animate = is_animate.x,
    image1 = paste0(uniqueID.x, ".jpg"),
    image2= paste0(uniqueID.y, ".jpg")
  )

# 3. ------------------ WORDBANK PAIRS (aoa_produces) 
wb_df <- stim_sub %>% filter(!is.na(aoa_produces))

wb_pairs <- difference_inner_join(
    wb_df, wb_df,
    by = "aoa_produces",
    # 2 months
    max_dist = 2, distance_col = NULL
  ) %>%
  filter(is_animate.x == is_animate.y,
         Word.x < Word.y) %>% 
  transmute(
    text1 = Word.x,
    text2 = Word.y,
    source = "Wordbank",
    aoa_word1 = round(aoa_produces.x/12, 2),
    aoa_word2 = round(aoa_produces.y/12, 2),
    is_animate = is_animate.x,
    image1 = paste0(uniqueID.x, ".jpg"),
    image2= paste0(uniqueID.y, ".jpg")
  )

# ------------------ COMBINE
paired_words <- bind_rows(kuper_pairs, wb_pairs)

paired_words

# A tibble: 29,301 × 8
   text1    text2   source   aoa_word1 aoa_word2 is_animate image1       image2 
   <chr>    <chr>   <chr>        <dbl>     <dbl> <lgl>      <chr>        <chr>  
 1 airplane baby    Kuperman      3.94      3.84 FALSE      airplane.jpg baby.j…
 2 airplane bag     Kuperman      3.94      4.28 FALSE      airplane.jpg bag.jpg
 3 airplane balloon Kuperman      3.94      4.37 FALSE      airplane.jpg balloo…
 4 airplane bell    Kuperman      3.94      3.89 FALSE      airplane.jpg bell.j…
 5 airplane bench   Kuperman      3.94      4.21 FALSE      airplane.jpg bench.…
 6 airplane blanket Kuperman      3.94      3.61 FALSE      airplane.jpg blanke…
 7 airplane boat    Kuperman      3.94      3.84 FALSE      airplane.jpg boat.j…
 8 airplane book    Kuperman      3.94      3.68 FALSE      airplane.jpg book.j…
 9 airplane boot    Kuperman      3.94      3.89 FALSE      airplane.jpg boot.j…
10 airplane bottle  Kuperman      3.94      3.56 FALSE      airplane.jpg bottle…
# ℹ 29,291 more rows

Make sure the .jpg files exist

# Initialize list to store valid row indices
valid_indices <- c()

# Loop to check if both image files exist
for (i in seq_len(nrow(paired_words))) {
  file1 <- file.path(things_images_dir, paired_words$image1[i])
  file2 <- file.path(things_images_dir, paired_words$image2[i])
  
  if (file.exists(file1) && file.exists(file2)) {
    valid_indices <- c(valid_indices, i)
  }
}

# Subset to only rows with both images present
paired_words <- paired_words[valid_indices, ] |> distinct(image1, image2, .keep_all=TRUE)

write.csv(paired_words, "exp2_pairs.csv")

Grabbing similarity values for further filtering

# Read and deduplicate
sims <- read.csv("exp2_sims.csv") |> 
  distinct(image1, image2, .keep_all = TRUE)

# Create symmetric lookup table once (cleaner for multiple 1/2 columns)
flip_12_columns <- function(df) {
  df |> rename_with(
    ~ str_replace_all(., c("1$" = "TEMP", "2$" = "1", "TEMP" = "2")),
    .cols = matches("(1|2)$")
  )
}

sims_expanded <- bind_rows(sims, flip_12_columns(sims)) |> distinct()
paired_words_expanded <- bind_rows(paired_words, flip_12_columns(paired_words)) |> distinct()

# Simple join
sim_vals <- paired_words_expanded |>
  left_join(sims_expanded)

Joining with `by = join_by(text1, text2, image1, image2)`

# images that are either too hard, are food, or have some weird saliency issuess
bad_images = c("cassette", "lollipop", "cornbread", "pitcher", "hopscotch", "milkshake", "skin", "puddle", "wall", "treasure", "pump", "elbow", "scoop", "tadpole", "screwdriver", "sparkler", "dessert", "vegetable", "raspberry", "face", "tongue", "dryer", "stair", "tongue",  "nose", "chipmunk", "rim", "pinball", "ruby", "toe", "quilt", "asparagus", "bedpost", "gun", "shoulder", "broccoli", "brownie", "bacon", "shovel", "donut", "meat", "thumb", "bottle", "corn", "mouth", "wrist", "ashtray", "cabbage", "pasta", "tooth", "strap",
               "birdbath", "sandcastle", "baseball", "football", "glove", "mitten", "snowsuit", "handprint", "clover", "boot", "stockings", "drink", "jacket", "sword", "coat", "finger", "ant", "arm", "leg", "bird", "splinter", "girl", "boy", "flyswatter", "vest", "baby")

bad_pairs <- tibble(text1 = c("sunglasses", "bark", "ant", "dandelion", "ant", "shirt", "lamb", "seahorse", "bin", "book", "bathtub"), 
                    text2 = c("tool", "block", "bull", "fireworks", "elephant", "clothes", "sheep", "shark", "trashcan", "paper", "crayon"))

sim_vals_filtered_some <- sim_vals %>%
    filter(!(text1 %in% bad_images) & !(text2 %in% bad_images)) |> 
  anti_join(bad_pairs, by = c("text1", "text2")) |>
  group_by(source, text1) %>%
  arrange(desc(text_sim)) %>%  # sort descending first
  slice_head(n = 3) %>%  # top 3
  mutate(sim_type = "high") |>
  bind_rows(
    sim_vals %>%
        filter(!(text1 %in% bad_images) & !(text2 %in% bad_images)) |> 
      anti_join(bad_pairs, by = c("text1", "text2")) |>
      group_by(source, text1) %>%
      arrange(text_sim) %>%
      slice_head(n = 3) |>
      mutate(sim_type="low") # bottom 3
    # making sure that the range is big enough 
  ) %>% filter(any(text_sim < 0.75) | any(text_sim > 0.85)) |> distinct() |> filter(n() >= 2) |>
  ungroup() 

sim_vals_wb_only <- sim_vals_filtered_some |>
  filter(source == "Wordbank" & !(text1 %in% bad_images) & !(text2 %in% bad_images))


ggplot(sim_vals_wb_only, aes(x = text_sim)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white") +
  labs(x = "Similarity", y = "Count") +
  theme_minimal()

Sampling to make a uniform distribution

# 1. Bin individual values
breaks <- seq(0.6, 0.95, by = 0.05)
individual_vals <- sim_vals_filtered_some %>%
  mutate(bin = cut(text_sim, breaks = breaks, include.lowest = TRUE))

# 2. Define bin targets
exp1_hist <- hist(exp1_metadata$text_similarity, breaks = breaks, plot = FALSE)
target_count <- ceiling((nrow(exp1_metadata) + 128) / length(breaks))

bin_info <- data.frame(
  bin = levels(cut(exp1_metadata$text_similarity, breaks, include.lowest = TRUE)),
  current = exp1_hist$counts,
  needed = pmax(0, target_count - exp1_hist$counts)
)

# 3. Filter individual_vals to only keep those that can fill needed bins
bin_contributions <- individual_vals %>%
  left_join(bin_info, by = "bin") %>%
  filter(needed > 0)

# 4. Initialize sampled pair tracking
sampled_pairs <- data.frame()
used_text2_high <- character(0)
used_text2_low <- character(0)

# 5. Make all possible pairs (text1 x high x low)
possible_pairs <- sim_vals_filtered_some %>%
  filter(sim_type == "high") %>%
  select(text1, source, text2_high = text2) %>%
  inner_join(
    sim_vals_filtered_some %>%
      filter(sim_type == "low") %>%
      select(text1, source, text2_low = text2),
    by = c("text1", "source")
  ) %>%
  mutate(pair_id = row_number())

Warning in inner_join(., sim_vals_filtered_some %>% filter(sim_type == "low") %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

# 6. Score pairs by contribution to needed bins
pair_bins <- individual_vals %>%
  select(text1, text2, bin) %>%
  distinct()

# Get all bins contributed by each pair (text1 + high/low)
pair_priorities <- possible_pairs %>%
  left_join(pair_bins, by = c("text1", "text2_high" = "text2")) %>%
  rename(bin_high = bin) %>%
  left_join(pair_bins, by = c("text1", "text2_low" = "text2")) %>%
  rename(bin_low = bin) %>%
  mutate(all_bins = map2(bin_high, bin_low, ~na.omit(c(.x, .y)))) %>%
  mutate(
    priority_score = map_dbl(all_bins, function(bins) {
      if (length(bins) == 0) return(0)
      sum(bin_info$needed[match(bins, bin_info$bin)], na.rm = TRUE)
    })
  ) %>%
  filter(priority_score > 0) %>%
  arrange(desc(priority_score))

# 7. Get text1 priority order
text1_priority <- pair_priorities %>%
  group_by(text1, source) %>%
  summarise(priority_score = sum(priority_score), .groups = "drop") %>%
  arrange(desc(priority_score))

# 8. Track text1 contributions
text1_bin_contributions <- individual_vals %>%
  group_by(text1, bin) %>%
  summarise(count = n(), .groups = 'drop') %>%
  filter(!is.na(bin)) %>%
  left_join(bin_info, by = "bin") %>%
  filter(needed > 0)

# 9. Simple sampling using pair_priorities
sampled_pairs <- data.frame()
used_text2_high <- character(0)
used_text2_low <- character(0)

while (sum(bin_info$needed) > 0) {
  # Recalculate pair_priorities with current bin_info
  available_pairs <- possible_pairs %>%
    filter(!(text2_high %in% used_text2_high), !(text2_low %in% used_text2_low))
  
  if (nrow(available_pairs) == 0) break
  
  pair_priorities <- available_pairs %>%
    left_join(pair_bins, by = c("text1", "text2_high" = "text2")) %>%
    rename(bin_high = bin) %>%
    left_join(pair_bins, by = c("text1", "text2_low" = "text2")) %>%
    rename(bin_low = bin) %>%
    mutate(all_bins = map2(bin_high, bin_low, ~na.omit(c(.x, .y)))) %>%
    mutate(
      priority_score = map_dbl(all_bins, function(bins) {
        if (length(bins) == 0) return(0)
        sum(bin_info$needed[match(bins, bin_info$bin)], na.rm = TRUE)
      })
    ) %>%
    filter(priority_score > 0) %>%
    arrange(desc(priority_score))
  
  if (nrow(pair_priorities) == 0) break
  
  # Select top priority pair
  current_pair <- pair_priorities[1, ]
  sampled_pairs <- bind_rows(sampled_pairs, current_pair)
  used_text2_high <- c(used_text2_high, current_pair$text2_high)
  used_text2_low <- c(used_text2_low, current_pair$text2_low)
  
  # Update bin_info
  contributions <- individual_vals %>% 
    filter((text1 == current_pair$text1 & text2 == current_pair$text2_high) |
           (text1 == current_pair$text1 & text2 == current_pair$text2_low)) %>%
    count(bin) %>%
    filter(!is.na(bin))
  
  for (j in 1:nrow(contributions)) {
    bin_info$needed[bin_info$bin == contributions$bin[j]] <- 
      pmax(0, bin_info$needed[bin_info$bin == contributions$bin[j]] - contributions$n[j])
  }
}


# 10. Final sampled values
sim_vals_sampled <- sim_vals_filtered_some %>%
  inner_join(
    sampled_pairs %>%
      pivot_longer(cols = c(text2_high, text2_low),
                   names_to = "sim_type_selected", 
                   values_to = "text2") %>%
      select(text1, source, text2),
    by = c("text1", "source", "text2")
  )

# 11. Combine with exp1 metadata
things_stim_path <- "https://ucsdlearninglabs.org/all_stimuli/THINGSplus/"
exp1_metadata_transmuted <- exp1_metadata %>%
    filter(!grepl("distractor", Trials.trialID)) %>%
    transmute(
      text_similarity,
      text1 = Trials.targetImage,
      text2 = Trials.distractorImage,
      aoa_word1 = AoA_Est_target,
      aoa_word2 = AoA_Est_distractor,
      image_similarity,
      source = "exp1"
    )

combined_stimuli <- bind_rows(
 exp1_metadata_transmuted,
  sim_vals_sampled %>%
    transmute(
      text1,
      text2,
      image1 = paste0(things_stim_path, image1),
      image2 = paste0(things_stim_path, image2),
      text_similarity = text_sim,
      image_similarity = image_sim,
      aoa_source = source,
      aoa_word1,
      aoa_word2,
      source = "exp2"
    )
)

breaks <- seq(0.4, 0.95, by = 0.05)

# 12. Plot histogram
ggplot(combined_stimuli |> pivot_longer(cols=c(text_similarity, image_similarity), names_to="sim_type", values_to="sim_value"), aes(x = sim_value, fill = source)) +
  geom_histogram(breaks = breaks, color = "white", position = "stack", alpha = 0.6) +
  scale_x_continuous(breaks = breaks) +
  labs(x = "Cosine similarity", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~sim_type)

# 13. Print summary
cat("Selected", nrow(sampled_pairs), "pairs total\n")

Selected 56 pairs total

cat("From", length(unique(sampled_pairs$text1)), "unique text1 values\n")

From 46 unique text1 values

cat("Average pairs per text1:", round(nrow(sampled_pairs) / length(unique(sampled_pairs$text1)), 2), "\n")

Average pairs per text1: 1.22

cat("Total sampled rows:", nrow(sim_vals_sampled), "\n")

Total sampled rows: 112

library(magick)

Linking to ImageMagick 6.9.12.93
Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
Disabled features: fftw, ghostscript, x11

library(purrr)

# Example data frame: update with your actual image paths
# df <- data.frame(image1 = c("img1a.png", "img2a.png"),
#                  image2 = c("img1b.png", "img2b.png"))

# Function to combine two images with gray background and spacing
combine_images <- function(image1, image2, height = 300, gap = 30, bg_color = "gray90") {
  img1 <- image_read(image1) |> image_scale(paste0("x", height))
  img2 <- image_read(image2) |> image_scale(paste0("x", height))

  info1 <- image_info(img1)
  info2 <- image_info(img2)

  total_width <- info1$width + info2$width + gap
  max_height <- max(info1$height, info2$height)
  canvas <- image_blank(width = total_width, height = max_height, color = bg_color)

  result <- canvas |>
    image_composite(img1, offset = "+0+0") |>
    image_composite(img2, offset = paste0("+", info1$width + gap, "+0"))

  return(result)
} 

# Apply across rows of df and get list of combined images
stim_exp2 <- combined_stimuli |> 
  filter(source == "exp2")
combined_images <- pmap(stim_exp2 |> select(image1, image2), combine_images)

# Optionally display first combined image
print(combined_images[[8]])

# A tibble: 1 × 7
  format width height colorspace matte filesize density
  <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
1 png      630    300 sRGB       FALSE        0 72x72

# Optionally save each combined image
filenames <- stim_exp2 |> 
  mutate(filename = paste0("exp2_images_v4/",text1,
                           "_",
                           text2,
                           ".png")) |> 
  pull(filename)

# Step 4: Save each image
walk2(combined_images, filenames, ~image_write(.x, .y))

write.csv(combined_stimuli, here("stimuli/lookit/preprocessing/combined.csv"))
ggplot(combined_stimuli, aes(x=image_similarity, y=text_similarity, color=source)) +
  geom_point() +
  ggrepel::geom_label_repel(aes(label=paste(text1,text2))) +
  geom_smooth(method="lm") +
  ggpubr::stat_cor()

`geom_smooth()` using formula = 'y ~ x'

Warning: ggrepel: 102 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Now choosing stimuli that I think are good based on filtering

# pairs I'd want to include
good_pairs <- tibble(text1 = c("duck", "lizard", "bus", "cloud", "camel", "fork", "flashlight", "crow", "airplane", "truck", "bed", "bathtub", "pencil", "balloon", "seahorse", "ball", "backpack", "beaver"), 
                    text2_high = c("chicken", "snake", "truck", "tree", "elephant", "spoon", "lightbulb", "penguin", "train", "car", "bathtub", "shower", "broom", "lamp", "shark", "rock", "purse", "seal"),
                    text2_low = c("ladybug", "peacock", "washcloth", "zipper", "ladybug", "slide", "bathmat", "seahorse", "washcloth", "swing", "soap", "sock", "bib", "dresser", "pony", "toothpaste", "icicle", "zebra"))


sim_vals_adhoc <- sim_vals %>%
  inner_join(
    good_pairs %>%
      pivot_longer(cols = c(text2_high, text2_low),
                   names_to = "sim_type_selected", 
                   values_to = "text2") %>%
      select(text1, text2),
    by = c("text1", "text2")
  ) |> distinct()

combined_stimuli_adhoc <- bind_rows(
 exp1_metadata_transmuted,
  sim_vals_adhoc %>%
    transmute(
      text1,
      text2,
      image1 = paste0(things_stim_path, image1),
      image2 = paste0(things_stim_path, image2),
      text_similarity = text_sim,
      image_similarity = image_sim,
      aoa_source = source,
      aoa_word1,
      aoa_word2,
      source = "exp2"
    )
) |> mutate(filename = ifelse(source == "exp2", paste0("vvi_exp2_images/",text1,
                           "_",
                           text2,
                           ".png"), ""))


ggplot(combined_stimuli_adhoc |> pivot_longer(cols=c(text_similarity, image_similarity), names_to="sim_type", values_to="sim_value"), aes(x = sim_value, fill = source)) +
  geom_histogram(breaks = breaks, color = "white", position = "stack", alpha = 0.6) +
  scale_x_continuous(breaks = breaks) +
  labs(x = "Cosine similarity", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~sim_type)

stim_exp2 <- combined_stimuli_adhoc |> 
  filter(source == "exp2")
combined_images <- pmap(stim_exp2 |> select(image1, image2), combine_images)

# Optionally save each combined image
filenames <- stim_exp2 |> 
  pull(filename)

# Step 4: Save each image
walk2(combined_images, filenames, ~image_write(.x, .y))

vvi_path = "https://ucsdlearninglabs.org/all_stimuli/"
write.csv(combined_stimuli_adhoc |> mutate(filename = paste0(vvi_path, filename)), here("stimuli/lookit/preprocessing/combined_adhoc.csv"))

image vs text sim

ggplot(combined_stimuli_adhoc, aes(x=image_similarity, y=text_similarity, color=source)) +
  geom_point() +
  ggrepel::geom_label_repel(aes(label=paste(text1,text2))) +
  geom_smooth(method="lm") +
  ggpubr::stat_cor()

`geom_smooth()` using formula = 'y ~ x'

Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Looking back at exp1 z-scoring

unique_trials <- exp1_metadata |> distinct(Trials.imagePair, .keep_all=TRUE)
model <- lm(image_sim_zscore ~ image_similarity, data = unique_trials)

# Compute residuals and add to dataframe
unique_trials$resid <- abs(residuals(model))

# Flag outliers (e.g., residuals > 2 SD from mean)
threshold <- mean(unique_trials$resid) + sd(unique_trials$resid)
unique_trials$outlier <- unique_trials$resid > threshold | unique_trials$Trials.imagePair == "bulldozer-orange"


ggplot(unique_trials, aes(x = image_similarity, y = image_sim_zscore)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Raw embeddings") +
  ylab("Z-scored embeddings") +
  ggtitle("LWL study-specific Image Embeddings") +
  ggpubr::stat_cor(method = "pearson") +
  geom_label_repel(
    data = subset(unique_trials, outlier),
    aes(label = Trials.imagePair),
    box.padding = 0.35,
    point.padding = 0.3,
    segment.color = 'grey50'
  )

`geom_smooth()` using formula = 'y ~ x'