── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
here() starts at /Users/visuallearninglab/Documents/visvocab

Note: The package "relayer" is highly experimental. Use at your own risk.

Loading required package: viridisLite

Loading required package: lme4

Loading required package: Matrix


Attaching package: 'Matrix'


The following objects are masked from 'package:tidyr':

    expand, pack, unpack



Attaching package: 'lmerTest'


The following object is masked from 'package:lme4':

    lmer


The following object is masked from 'package:stats':

    step



Attaching package: 'cowplot'


The following object is masked from 'package:lubridate':

    stamp



Attaching package: 'scales'


The following object is masked from 'package:viridis':

    viridis_pal


The following object is masked from 'package:purrr':

    discard


The following object is masked from 'package:readr':

    col_factor



Attaching package: 'rlang'


The following objects are masked from 'package:purrr':

    %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice

Load data

PROCESSED_DATA_PATH = here("data",PROJECT_VERSION,"processed_data")
looking_time_resampled_clean <- read.csv(file.path(PROCESSED_DATA_PATH, "level-looks_data.csv"))
all_looking_times <- read.csv(file.path(PROCESSED_DATA_PATH,"level-looks_added-metadata_data.csv"))
trial_metadata <- read.csv(here("data","metadata","level-trialtype_data.csv"))
trial_summary_data <- read.csv(file.path(PROCESSED_DATA_PATH,"level-trials_data.csv"))
# TODO: move similarity correlations to exploratory qmd file
cvcl_similarities <- read.csv(here("data", "embeddings", "similarities-cvcl_data.csv"))
openclip_similarities <- read.csv(here("data", "embeddings", "similarities-openclip_data.csv"))
saycamvit_similarities <- read.csv(here("data", "embeddings", "similarities-saycamvit_data.csv"))
imagenetvit_similarities <- read.csv(here("data", "embeddings", "similarities-imagenetvit_data.csv"))
looking_data_summarized <- trial_summary_data |>
  filter(trial_exclusion == 0 & exclude_participant == 0 & exclude_participant_insufficient_data == 0) |>
  left_join(trial_metadata) |>
  arrange(AoA_Est_target)
Joining with `by = join_by(Trials.trialID, Trials.targetImage,
Trials.distractorImage, Trials.imagePair)`
median_age <- looking_data_summarized %>%
  distinct(SubjectInfo.subjID, .keep_all = TRUE) %>%  # Keep only distinct subjects
  summarize(overall_median_age = median(SubjectInfo.testAge, na.rm = TRUE))  %>%
  pull(overall_median_age) / 30

median_age <- round(median_age, 2)

aoa_summary <- trial_metadata |>
  summarize(median_aoa = median(AoA_Est_target), 
            mean_aoa = mean(AoA_Est_target))
median_aoa <- aoa_summary$median_aoa

util function to see what the order of trials were for individual participants

order_of_trials <- function(data, subjID) {
  curr_order <- data |>
  filter(SubjectInfo.subjID == subjID) |>
  distinct(Trials.trialID, .keep_all=TRUE) |>
  arrange(Trials.ordinal) |>
  select(Trials.ordinal, Trials.trialID, Trials.targetImage, Trials.distractorImage, Trials.trialType, Trials.leftImage, Trials.rightImage, Trials.targetAudio)
  write_csv(curr_order, paste0(subjID,".csv"))
}

#order_of_trials(all_looking_times, "L7Y4Y6")

Overall timecourse plot of proportion target looking

#summarizing within subject for each time point
summarize_subj <- looking_time_resampled_clean %>%
  filter(trial_exclusion == 0 & exclude_participant ==0 & exclude_participant_insufficient_data == 0 & SubjectInfo.subjID != "PH2RNZ") %>%
  filter(!is.na(accuracy_transformed)) %>%
  group_by(SubjectInfo.subjID, time_normalized_corrected, SubjectInfo.testAge) %>%
  summarized_data(., "SubjectInfo.subjID", "accuracy_transformed", c("SubjectInfo.testAge", "time_normalized_corrected")) %>%
  rename(mean_accuracy = mean_value)
Warning: There were 281 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `SubjectInfo.subjID = "34XMAG"`, `SubjectInfo.testAge = 480`,
  `time_normalized_corrected = -3400`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 280 remaining warnings.
#summarizing across subjects for each time point
summarize_across_subj <- summarize_subj %>%
  group_by(time_normalized_corrected) %>%
  summarized_data(., "time_normalized_corrected", "mean_accuracy", c("time_normalized_corrected")) |>
  rename(accuracy = mean_value)
Warning: There were 21 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `time_normalized_corrected = -4567`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 20 remaining warnings.
looking_times <- ggplot(summarize_across_subj,aes(time_normalized_corrected,accuracy))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=accuracy-ci,ymax=accuracy+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
looking_times
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 104 rows containing non-finite outside the scale range
(`stat_smooth()`).

ggsave(here("figures",PROJECT_VERSION,"prop_looking_across_time.png"),looking_times,width=9,height=6,bg = "white")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 104 rows containing non-finite outside the scale range
(`stat_smooth()`).

Descriptive plots

Usable trial stats

#Overall baseline-corrected proportion target looking by condition
looking_data_exclusions <- trial_summary_data |>
  mutate(trial_exclusion_reason = as.factor(ifelse(is.na(trial_exclusion_reason), "included", trial_exclusion_reason))) |>
  mutate(trial_exclusion_reason = factor(trial_exclusion_reason, levels=c(setdiff(unique(trial_exclusion_reason), "included"), "included")))

# Compute the number of included trials per subject
included_counts <- looking_data_exclusions |>
  filter(trial_exclusion_reason == "included") |>
  count(SubjectInfo.subjID, name = "included_trials")


excluded_subjects <- setdiff(unique(looking_data_exclusions$SubjectInfo.subjID), 
                             unique(looking_data_summarized$SubjectInfo.subjID))

# Ensure all subjects are included and replace NA with 0
exclusions_by_subject <- looking_data_exclusions |>
  summarize(N = n(), .by = c(SubjectInfo.subjID, trial_exclusion_reason)) |>
  left_join(included_counts, by = "SubjectInfo.subjID") |>
  mutate(included_trials = replace_na(included_trials, 0)) |>  # Replace NA values
  # Ordering subject IDs so that the 'red' colored label to mark excluded subjects is placed on the subjects who are actually excluded
  mutate(SubjectInfo.subjID = factor(SubjectInfo.subjID, 
                                     levels = unique(SubjectInfo.subjID[order(-included_trials)])),
         color = ifelse(SubjectInfo.subjID %in% excluded_subjects, "red", "black")) |>
  arrange(color)

looking_data_exclusions_combined <- looking_data_exclusions |>
  summarize(N = n(),
    .by=c(trial_exclusion_reason))
# Format labels with HTML <span> for colors

exclusions_color_labels <- exclusions_by_subject |> 
  distinct(SubjectInfo.subjID, .keep_all = TRUE) |>
  mutate(colored_labels = paste0("<span style='color:", color, "'>", SubjectInfo.subjID, "</span>")) |>
  arrange(desc(included_trials)) |>
  mutate(colored_labels = factor(colored_labels, levels = colored_labels))

ggplot(exclusions_by_subject, aes(x = SubjectInfo.subjID, y = N, fill = trial_exclusion_reason)) +
  geom_bar(stat = "identity", position = "stack") +  
  labs(x = "Subject ID", y = "Count (N)", title = "Trial Exclusions by Subject", fill = "Trial Exclusion Reason") +
  theme_minimal() +  
  theme(
    axis.text.x = ggtext::element_markdown(angle = 45, hjust = 1),  # Enable HTML-styled labels
    strip.text = element_text(size = 8)
  ) +
  scale_x_discrete(labels = exclusions_color_labels$colored_labels) +  # Apply colored labels
  scale_fill_brewer(palette = "Set3")

#TODO: we have a participant who participated twice but their first session only had 2 trials. figure out how to deal with this/discard first session trials. Also have to figure out how to deal with participants who are excluded by default

Estimating item and subject-level noise

target_looking_item_subject_level <- summarized_data(looking_data_summarized, "Trials.targetImage", "corrected_target_looking", c("SubjectInfo.subjID", "SubjectInfo.testAge", "AoA_Est_target"))
Warning: There were 1396 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 2: `Trials.targetImage = "acorn"`, `SubjectInfo.subjID = "4GSNJd"`,
  `SubjectInfo.testAge = 570`, `AoA_Est_target = 5.95`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1395 remaining warnings.
target_looking_item_level <- summarized_data(target_looking_item_subject_level |> rename(mean_target_looking = mean_value), "Trials.targetImage", "mean_target_looking", "AoA_Est_target")

target_looking_item_age_level <- summarized_data(target_looking_item_subject_level |> rename(mean_target_looking = mean_value) |> add_age_split(), "Trials.targetImage", "mean_target_looking", c("AoA_Est_target", "age_half"))

target_looking_trial_subject_level <- summarized_data(looking_data_summarized, "Trials.trialID", "corrected_target_looking", c("SubjectInfo.subjID", "SubjectInfo.testAge", "AoA_Est_distractor", "AoA_Est_target", "Trials.targetImage", "Trials.distractorImage", "Trials.imagePair"))
Warning: There were 2476 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `Trials.trialID = "easy-acorn-key"`, `SubjectInfo.subjID =
  "34XMAG"`, `SubjectInfo.testAge = 480`, `AoA_Est_distractor = 3.58`,
  `AoA_Est_target = 5.95`, `Trials.targetImage = "acorn"`,
  `Trials.distractorImage = "key"`, `Trials.imagePair = "acorn-key"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2475 remaining warnings.
target_looking_trial_level <- summarized_data(target_looking_trial_subject_level |> rename(mean_target_looking = mean_value), "Trials.trialID", "mean_target_looking", c("AoA_Est_distractor", "AoA_Est_target", "Trials.targetImage", "Trials.distractorImage", "Trials.imagePair"))

target_looking_subject_level <- summarized_data(looking_data_summarized, "SubjectInfo.subjID", "corrected_target_looking", "SubjectInfo.testAge")

Item-level performance

item_performances <- ggplot(target_looking_item_level, aes(reorder(Trials.targetImage, mean_value), mean_value)) +
   geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.2)+
  (geom_point(aes(aoa_size=AoA_Est_target)) |> rename_geom_aes(new_aes = c("size" = "aoa_size")))+
  (geom_jitter(data=target_looking_item_subject_level, aes(x=Trials.targetImage, y=mean_value, color=SubjectInfo.subjID, age_size = SubjectInfo.testAge/30), alpha=0.3, width=0.2) |> rename_geom_aes(new_aes = c("size" = "age_size"))) +
  xlab("Target image")+
  ylab("Proportion of target looking")+
  ggtitle("Mean proportion of target looking across target items") +
  theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=10,angle=0,vjust=0.5),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold"),
        plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
        ) +
  scale_y_continuous(breaks = seq(-1, 1, by = 0.2)) +
  scale_size_c(aesthetics = "age_size",name = "Age of participant in months", range=c(2,4), guide = guide_legend(order = 2)) +
  scale_size_c( aesthetics = "aoa_size",name = "Est. AoA of target words in years", guide = guide_legend(order = 1)) +
  scale_color_viridis_d(name = "Participant IDs",option="D") +
  guides(
    color = "none"
  ) 
Warning in geom_point(aes(aoa_size = AoA_Est_target)): Ignoring unknown
aesthetics: aoa_size
Warning in geom_jitter(data = target_looking_item_subject_level, aes(x =
Trials.targetImage, : Ignoring unknown aesthetics: age_size
Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
3.5.0.
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.
item_performances

ggsave(here("figures",PROJECT_VERSION,"item_performances.png"),item_performances,width=15,height=10,bg = "white")

trial_performances <- ggplot(summarized_data(looking_data_summarized, "Trials.trialID", "corrected_target_looking", c("Trials.trialID", "AoA_Est_target")), aes(reorder(Trials.trialID, mean_value), mean_value)) +
   geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.2)+
  (geom_point(aes(aoa_size=AoA_Est_target)) |> rename_geom_aes(new_aes = c("size" = "aoa_size")))+
  (geom_jitter(data=looking_data_summarized,aes(x=Trials.trialID, y=corrected_target_looking, color=SubjectInfo.subjID, age_size = SubjectInfo.testAge/30), alpha=0.3, width=0.2) |> rename_geom_aes(new_aes = c("size" = "age_size"))) +
  xlab("Trial type")+
  ylab("Proportion of target looking")+
  ggtitle("Mean proportion of target looking across trial types") +
  scale_size_c(aesthetics = "age_size",name = "Age of participant in months", range=c(2,4), guide = guide_legend(order = 2)) +
  scale_size_c( aesthetics = "aoa_size",name = "Est. AoA of target words in years", guide = guide_legend(order = 1)) +
  scale_color_viridis_d(name = "Participant IDs",option="D") +
  coord_cartesian(ylim = c(-1, 1)) +
    theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=8,angle=45,hjust=1),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=15),
        strip.text.x = element_text(size = 10,face="bold"),
        aspect.ratio = 1,
        plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
        ) +
  guides(
    color = "none"
  ) 
Warning in geom_point(aes(aoa_size = AoA_Est_target)): Ignoring unknown
aesthetics: aoa_size
Warning in geom_jitter(data = looking_data_summarized, aes(x = Trials.trialID,
: Ignoring unknown aesthetics: age_size
trial_performances

####Item-level timecourses

# First, let's make sure we have the correct data
summarize_target <- looking_time_resampled_clean %>%
  filter(trial_exclusion == 0 & exclude_participant == 0 & exclude_participant_insufficient_data == 0) %>%
  left_join(trial_metadata) %>%
  filter(!is.na(accuracy_transformed)) %>%
  group_by(Trials.targetImage, time_normalized_corrected) %>%
  #filter(Trials.targetImage %in% c("acorn", "bulldozer", "cheese", "potato", "squirrel", "snail", "turkey", "turtle")) %>%
  summarized_data(., "Trials.targetImage", "accuracy_transformed", c("Trials.targetImage", "AoA_Est_target", "time_normalized_corrected")) %>%
  rename(mean_accuracy = mean_value)
Joining with `by = join_by(Trials.trialID, Trials.targetImage,
Trials.distractorImage, Trials.imagePair)`
Warning: There were 317 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `Trials.targetImage = "acorn"`, `AoA_Est_target = 5.95`,
  `time_normalized_corrected = -3800`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 316 remaining warnings.
# Create a mapping of target images to their AoA values
target_aoa_mapping <- summarize_target %>%
  select(Trials.targetImage, AoA_Est_target) %>%
  distinct() %>%
  arrange(AoA_Est_target)  # Sort by AoA

# Convert Trials.targetImage to a factor with levels ordered by AoA (descending)
summarize_target <- summarize_target %>%
  mutate(Trials.targetImage = factor(Trials.targetImage, 
                                     levels = target_aoa_mapping$Trials.targetImage))
aoa_values <- target_aoa_mapping$AoA_Est_target
# Normalize aoa_values to range between 0 and 1
aoa_scaled <- rescale(aoa_values, to = c(0, 1))

# Generate a high-resolution color gradient (e.g., 1000 colors)
num_colors <- 1000
color_palette <- viridis(num_colors, option = "E",alpha=0.8)

# Map scaled values to colors using interpolation
color_values <- color_palette[ceiling((1-aoa_scaled) * (num_colors - 1)) + 1]


# Create named color vector
target_colors <- setNames(color_values, target_aoa_mapping$Trials.targetImage)


# Create the plot with a discrete legend but gradient colors
looking_times_target <- ggplot(summarize_target, 
                              aes(time_normalized_corrected, 
                                  mean_accuracy, 
                                  color = AoA_Est_target, group=Trials.targetImage)) +  # Group by target image
  annotate("rect", xmin = 300, xmax = 3500, ymin = 0, ymax = 1, 
          fill = "gray95", alpha = 0.5) +  
 annotate("rect", xmin = -2000, xmax = 0, ymin = 0, ymax = 1, 
           fill = "gray95", alpha = 0.5) + 

  coord_cartesian(xlim = c(-2000, 3500), ylim = c(0, 1)) +
     geom_line (alpha=0.2, size=0.8) +
   geom_smooth(data=summarize_across_subj,
              aes(x=time_normalized_corrected,
                  y=accuracy,
                  ymin=lower_ci,  # Assuming 'se' is your standard error column
                  ymax=upper_ci,  # Adjust these to match your error metric
                  group=NA),
              stat="identity",  # Use identity instead of a smoothing method
              color="black",
              fill="gray40",
              size=1.5) +
   scale_color_viridis_c(option="B",direction=-1,name="Estimated target\nword AoA") +
  guides(color = guide_colorbar(reverse = TRUE))+
  geom_vline(xintercept = 0, size = 1.5) +
  geom_hline(yintercept = 0.5, size = 1.2, linetype = "dashed") +
  geom_vline(xintercept = 300, linetype = "dotted", size=1.2) +
  xlab("Time (normalized to target word onset) in ms") +
  ylab("Proportion\nTarget Looking") +
    # Add arrows
  annotate("segment", x = -2000, xend = -50, y = 0.85, yend = 0.85, 
           arrow = arrow(length = unit(0.2, "cm"), type="closed", ends="both"), size = 1, color = "black") + 
  annotate("segment", x = 340, xend = 3490, y = 0.85, yend = 0.85, 
           arrow = arrow(length = unit(0.2, "cm"),type="closed", ends="both"), size = 1, color = "black") + 

  # Add text above arrows
  annotate("text", x = -1000, y = 0.93, label = "Baseline window", 
           size = 6, fontface = "bold") +
  annotate("text", x = 2000, y = 0.93, label = "Critical window", 
           size = 6, fontface = "bold") +
  theme_minimal() +
  scale_x_continuous(breaks=seq(-2000,3500,by=1000)) +
  scale_y_continuous(breaks=seq(0,1,by=0.5))+
  theme(
    text = element_text(size=10,face = "bold"),              # All text bold
    # Increase space between x-axis and its title
    axis.title.x = element_text(
      face = "bold", 
      size = 18,
      margin = margin(t = 10, r = 0, b = 0, l = 0)
    ),
    
    # Increase space between y-axis and its title
    axis.title.y = element_text(
      face = "bold", 
      size= 18,
      margin = margin(t=0,r = 5, b = 0, l = 0)
    ),      
    axis.text = element_text(size=14,face = "bold"),         # Axis text bold
    legend.title = element_text(size = 14, face = "bold"),  # Small bold legend title
    legend.text = element_text(size = 14, face = "bold")    # Small bold legend text
  )

# Display the plot
looking_times_target

ggsave(here("figures",PROJECT_VERSION,"timecourse_plot.svg"),looking_times_target, device="pdf",width=8.5,height=3.3,bg = "white")
ggsave(here("figures",PROJECT_VERSION,"timecourse_plot.png"),looking_times_target,width=9.5,height=5.3,bg = "white")

Age plots

Age-split timecourses

summarize_across_age_halves <- summarize_subj |> add_age_split() |>
  group_by(time_normalized_corrected, age_half) |>
  dplyr::summarize(n=n(),
            non_na_n = sum(!is.na(mean_accuracy)),
            mean_value=mean(mean_accuracy,na.rm=TRUE),
            sd_accuracy=sd(mean_accuracy,na.rm=TRUE),
            se_accuracy=sd_accuracy/sqrt(n),
            ci = ifelse(n > 1, qt(0.975, n - 1) * sd_accuracy / sqrt(n), NA)) %>%
  filter(n > 5)
`summarise()` has grouped output by 'time_normalized_corrected'. You can
override using the `.groups` argument.
looking_times_age <- ggplot(summarize_across_age_halves,aes(time_normalized_corrected,mean_value,color=age_half))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=mean_value-ci,ymax=mean_value+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking") +
  labs(title="Proportion of looking time across age and time", caption = (paste0("14-24 month olds age split at median=",median_age," months"))) +
  scale_color_brewer(palette = "Set1", name="Age half")

looking_times_age
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 96 rows containing non-finite outside the scale range
(`stat_smooth()`).

ggsave(here("figures",PROJECT_VERSION,"prop_looking_across_time_age.png"),looking_times,width=9,height=6,bg = "white")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 104 rows containing non-finite outside the scale range
(`stat_smooth()`).

Age-split performance

# note: using shorter window just because there's clearly no age-related effect with longer window 
participant_age_halves = summarized_data(looking_data_summarized |> left_join(target_looking_item_level) |> rename(mean_looking = mean_value) |>  add_age_split() |> mutate(acc = corrected_target_looking), "age_half", "acc", 
                                         c("age_half", "SubjectInfo.subjID", "SubjectInfo.testAge")) 
Joining with `by = join_by(Trials.targetImage, AoA_Est_target)`
participant_age_halves$age_half <- factor(participant_age_halves$age_half, levels = rev(levels(factor(participant_age_halves$age_half))))

participant_age_halves_summarized = summarized_data(participant_age_halves |> rename(looking_time = mean_value), "age_half", "looking_time", c("age_half"))
accuracy_age <- ggplot(participant_age_halves,
       aes(x = age_half, y = mean_value, color = age_half)) +
    geom_violin(aes(color=age_half), alpha=0.3) +
  geom_jitter(size = 3, alpha = 0.3, width=0.2) +
  geom_hline(yintercept=0,size=1.2,linetype="dashed")+ 
  geom_point(data = participant_age_halves_summarized, size = 3) +
  geom_linerange(data = participant_age_halves_summarized, aes(ymin = mean_value - ci, ymax = mean_value + ci), alpha = 0.9) + 
  scale_color_brewer(palette = "Set1", name="Age half", direction=-1) +  # Using RColorBrewer for colors
  ylab("Baseline-corrected proportion of target looking") +
  xlab("Age half of child") +
  ggpubr::stat_cor(method = "pearson") +
  labs(caption = (paste0("14-24 month olds age split at median=",median_age, " months")), title="Word recognition acuracy by age")
accuracy_age

ggsave(here("figures",PROJECT_VERSION,"accuracy_age.png"),accuracy_age,width=9,height=6,bg = "white")

Age continuous

participant_age_jittered <- participant_age_halves %>%
  rowwise() %>%
  mutate(x_jittered = SubjectInfo.testAge/30 + runif(n(), -0.25, 0.25))
jitterer <- position_jitter(width = .5,seed=1)
age_continuous <- ggplot(participant_age_jittered, aes(x = x_jittered, y = mean_value)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(size = 4, alpha = 0.4, color="#2171B5") +
  geom_smooth(method="lm",color="#2171B5") +
  geom_linerange(aes(ymin = mean_value - ci, ymax = mean_value + ci), alpha = 0.1) + 
  scale_y_continuous(breaks = seq(-0.4, 0.3, by = 0.1), limits = c(-0.3, 0.3)) +
  ggpubr::stat_cor()+
  labs(x = "Age in months", y = "Baseline-corrected proportion target looking") +
  #ggtitle("Corrected proportion of target looking by age") +
  theme_minimal()
age_continuous
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 13 rows containing missing values or values outside the scale range
(`geom_segment()`).

younger group mean

younger_mean <- participant_age_halves |>
  filter(age_half == "younger") |>
  summarize(mean_value = mean(SubjectInfo.testAge)/30) |>
  pull(mean_value)

Order effects

First vs second instances for primary targets

first_instance_target <- looking_data_summarized |>
  group_by(SubjectInfo.subjID, Trials.targetImage) |>
  arrange(Trials.ordinal, .by_group = TRUE) |>
  filter(Trials.trialType %in% c("easy", "hard")) |>
  slice(1) |>
  ungroup() |>
  mutate(order_id = "one")
  
second_instance_target <- looking_data_summarized |>
  group_by(SubjectInfo.subjID, Trials.targetImage) |>
  arrange(Trials.ordinal, .by_group = TRUE) |>
  filter(Trials.trialType %in% c("easy", "hard")) |>
  slice(2) |>
  ungroup() |>
  mutate(order_id = "two")

ordered_conditions <- rbind(first_instance_target, second_instance_target)

order_plot <- half_violins_plot(ordered_conditions, "order_id", "corrected_target_looking", "SubjectInfo.subjID", c("one", "two"), input_xlab = "Target image order")
Warning: There were 2 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 121: `order_id = "two"` `SubjectInfo.subjID = "FHRCdd"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Warning in geom_path(aes(group = SubjectInfo.subjID), color = "black", fill =
NA, : Ignoring unknown parameters: `fill`
order_plot

Easy vs hard trial plots

Proportion of target looking for easy vs hard trials

avg_corrected_target_looking_by_condition <- summarized_data(looking_data_summarized,
                                                             "Trials.trialType", "corrected_target_looking", "SubjectInfo.subjID")
high_accuracy_subjects <- avg_corrected_target_looking_by_condition |>
  filter(lower_ci > 0 & N > 5) |>
  distinct(SubjectInfo.subjID)

overall_condition_plot <- half_violins_plot(looking_data_summarized |> group_by(SubjectInfo.subjID, Trials.targetImage) |>
  arrange(Trials.ordinal, .by_group = TRUE) |>
  slice(2) |>
  ungroup() |> mutate(
  Trials.trialType = ifelse(Trials.trialType == "easy-distractor", "easy", Trials.trialType),
  Trials.trialType = ifelse(Trials.trialType == "hard-distractor", "hard", Trials.trialType)) |> filter(Trials.trialType == "easy" | Trials.trialType == "hard"), "Trials.trialType", "corrected_target_looking", "SubjectInfo.subjID", c("easy", "hard"))
Warning: There were 22 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 3: `Trials.trialType = "easy"` `SubjectInfo.subjID = "4GSNJd"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 21 remaining warnings.
Warning in geom_path(aes(group = SubjectInfo.subjID), color = "black", fill =
NA, : Ignoring unknown parameters: `fill`
overall_condition_plot
`geom_path()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?

Target image difficulty

# summarize average accuracy within participant (by word alone)
condition_based_looking <- looking_data_summarized  |>
  filter(!grepl("distractor", Trials.trialType)) |>
  distinct(SubjectInfo.subjID, Trials.trialID, Trials.targetImage, corrected_target_looking, Trials.trialType, AoA_Est_target)

target_looking_by_target_word <- looking_data_summarized  |>
  filter(!grepl("distractor", Trials.trialType)) |>
  group_by(SubjectInfo.subjID, Trials.targetImage) |>
  summarize(target_looking_diff = corrected_target_looking[match("easy", Trials.trialType)] - corrected_target_looking[match("hard", Trials.trialType)],
            baseline_window_looking_diff = mean_target_looking_baseline_window[match("easy", Trials.trialType)] - mean_target_looking_baseline_window[match("hard", Trials.trialType)], 
            critical_window_looking_diff = mean_target_looking_critical_window[match("easy", Trials.trialType)] - mean_target_looking_critical_window[match("hard", Trials.trialType)]) |>
  filter(!is.na(target_looking_diff))
`summarise()` has grouped output by 'SubjectInfo.subjID'. You can override
using the `.groups` argument.
#clean names for individual images for plot
overall_target_looking_by_word <- target_looking_by_target_word %>%
  filter(!is.na(target_looking_diff)) %>%
  group_by(Trials.targetImage) %>%
  summarize(N=n(),
            corrected_target_looking=mean(target_looking_diff,na.rm=TRUE),
            ci=qt(0.975, N-1)*sd(target_looking_diff,na.rm=T)/sqrt(N),
            lower_ci=corrected_target_looking-ci,
            upper_ci=corrected_target_looking+ci
          ) 


word_prefs <- ggplot(overall_target_looking_by_word,aes(reorder(Trials.targetImage,corrected_target_looking),corrected_target_looking))+
  geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.3)+
  geom_jitter(data=target_looking_by_target_word, aes(x=Trials.targetImage, y=target_looking_diff, color=SubjectInfo.subjID), size = 2.5, alpha = 0.3, width=0.1) +
  geom_point(size = 2.4)+
  xlab("Target Word")+
  ylab("Easy trial prefered target looking")+
  theme(axis.title.x = element_text(face="bold", size=15),
        axis.text.x  = element_text(size=10,angle=90,vjust=0.5,hjust=1),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold")
        ) +
  scale_y_continuous(breaks = seq(-1, 1, by = 0.2)) +
  scale_size_continuous(name = "Number of participants") + 
  scale_color_viridis_d(name = "Participant IDs",option="D") +
  guides(color = "none")
  
word_prefs

Baseline image-pair preferences

saliency_effects <- looking_data_summarized |>
  # Calculating the proportion of time looking at the target word even if it isn't the target word in that particular study
 mutate(original_target_looking_baseline_window = ifelse(grepl("distractor", Trials.trialType), 1 - mean_target_looking_baseline_window,   mean_target_looking_baseline_window),
        original_target_looking_critical_window = ifelse(grepl("distractor", Trials.trialType), 1 - mean_target_looking_critical_window, mean_target_looking_critical_window)) |>
  add_age_split() |>
 summarize(mean_baseline_looking = mean(original_target_looking_baseline_window),
           mean_critical_looking = mean(original_target_looking_critical_window),
           .by = c(Trials.imagePair, age_half))

CLIP similarity plots

Comparing proportion of target looking time to CLIP and other model cosine similarities

# CLIP
clip_data_summarized <- summarize_similarity_data(looking_data_summarized)
clip_data_summarized_low_aoa <- summarize_similarity_data(looking_data_summarized |> filter(AoA_Est_target < 4.78))
# |>filter(SubjectInfo.subjID %in% high_accuracy_subjects$SubjectInfo.subjID)))
clip_age_half_summarized <- looking_data_summarized |>
  add_age_split() |>
  summarize_similarity_data(extra_fields = c("age_half", "AoA_Est_target"))

clip_plots <- generate_multimodal_plots(clip_data_summarized, "CLIP")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 20 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
clip_plots_low_aoa <- generate_multimodal_plots(clip_data_summarized_low_aoa, "CLIP AoA < 4.78")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
input_median_age = median_age
clip_age_plots <- generate_multimodal_age_effect_plots(clip_age_half_summarized, model_type="CLIP", median_age=input_median_age)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
clip_plots
Warning: ggrepel: 31 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 31 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 31 unlabeled data points (too many overlaps). Consider increasing max.overlaps

clip_age_plots

# CVCL
create_model_plots(cvcl_similarities, name="CVCL", median_age=input_median_age)
Warning in left_join(select(looking_data_summarized, -text_similarity, -multimodal_similarity, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
Warning: ggrepel: 60 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 58 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

create_model_plots(imagenetvit_similarities, name="ImageNetVIT", median_age=input_median_age)
`geom_smooth()` using formula = 'y ~ x'
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
`geom_smooth()` using formula = 'y ~ x'

create_model_plots(saycamvit_similarities, name="SayCamVIT",median_age=input_median_age)
`geom_smooth()` using formula = 'y ~ x'
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
`geom_smooth()` using formula = 'y ~ x'

Create CLIP model plot with all three similarities

clip_data_long <- clip_data_summarized |>
  pivot_longer(cols = c("text_similarity", "image_similarity"), names_to = "sim_type", values_to = "similarity")
all_clip_plot <- multiple_similarity_effects_plot(clip_data_long, "similarity", group_var="sim_type", input_title="Looking time and CLIP embedding correlations")
all_clip_plot
Warning: `is.ggproto()` was deprecated in ggplot2 3.5.2.
ℹ Please use `is_ggproto()` instead.
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("figures",PROJECT_VERSION,"clip_plot.svg"),all_clip_plot,width=10, height=10.5,bg = "white",device="pdf")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
ggsave(here("figures",PROJECT_VERSION,"clip_plot.png"),all_clip_plot,width=20, height=10.5,bg = "white")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Some exploratory age effects with three age bins

clip_age_bins_summarized <- looking_data_summarized |>
  mutate(age_bin = case_when(
    SubjectInfo.testAge < 19*30 ~ "14-18",
    SubjectInfo.testAge < 22*30 ~ "18-21",
    TRUE ~ "21-24")) |>
  summarize_similarity_data(extra_fields = c("age_bin", "AoA_Est_target"))

similarity_age_half_plot(clip_age_bins_summarized, "image_similarity", group_var="age_bin", model_type="CLIP")
`geom_smooth()` using formula = 'y ~ x'

AoA effects

input_median_age = median_age
facet_plot <- generate_aoa_facet_plots(
  data = clip_age_half_summarized, 
  model_type = "CLIP",
  median_age = input_median_age,  
  age_grouping = "age_half",
  include_multimodal = FALSE
)
`geom_smooth()` using formula = 'y ~ x'
# For separate plots by AoA group:
#low_aoa_plots <- generate_multimodal_age_effect_plots(
#  data = clip_age_half_summarized %>% filter(AoA_Est_target <= 4.44),
#  model_type = "CLIP",
#  median_age = input_median_age,  
#  age_grouping = "age_half",
#  include_multimodal = FALSE
#)
facet_plot
`geom_smooth()` using formula = 'y ~ x'

OpenCLIP training

openclip_similarities_combined <- openclip_similarities |>
  rename(word_a = word1, word_b = word2) |>
  bind_rows(
    openclip_similarities |>
      rename(word_a = word2, word_b = word1)
  )
looking_data_w_openclip <- looking_data_summarized |>
  select(-text_similarity, -multimodal_similarity, -image_similarity) |>
  left_join(openclip_similarities_combined, by = c("Trials.distractorImage"="word_a", "Trials.targetImage"="word_b"))
Warning in left_join(select(looking_data_summarized, -text_similarity, -multimodal_similarity, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
openclip_data_summarized <- summarize_similarity_data(looking_data_w_openclip, extra_fields=c("epoch"))
openclip_age_half_summarized <- looking_data_w_openclip |>
  add_age_split() |>
  summarize_similarity_data(extra_fields = c("age_half", "epoch"))

# Function to calculate Pearson's correlation per epoch
calculate_correlations <- function(data, x_var, y_var, group_var = c("epoch"), conf_level = 0.95) {
  data |>
    group_by(across(all_of(group_var))) |>
    summarize(
      {
        cor_test <- cor.test(.data[[x_var]], .data[[y_var]], method = "pearson", conf.level = conf_level)
        tibble(
          pearson_cor = cor_test$estimate,
          p_value = cor_test$p.value,
          ci_lower = cor_test$conf.int[1],
          ci_upper = cor_test$conf.int[2]
        )
      },
      .groups = "drop"
    )
}

image_correlation_results_age_split <- calculate_correlations(openclip_age_half_summarized, "image_similarity", "mean_value", group_var=c("epoch", "age_half"))

text_correlation_results <- calculate_correlations(openclip_data_summarized, "text_similarity", "mean_value")
image_correlation_results <- calculate_correlations(openclip_data_summarized, "image_similarity", "mean_value")

ggplot(image_correlation_results, aes(x = epoch, y = pearson_cor)) +
  geom_point(aes(color = p_value < 0.05), size = 3) + 
  geom_smooth(span = 2) +
  labs(title = "Image similarity correlation across Open-CLIP training",
       x = "Epoch",
       y = "Pearson Correlation") +
  scale_color_manual(values = c("TRUE" = "black", "FALSE" = "gray")) +  # Set color for significance
  theme_minimal() +
  theme(legend.position = "none")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(text_correlation_results, aes(x = epoch, y = pearson_cor)) +
  geom_point(aes(color = p_value < 0.05), size = 3) + 
  geom_smooth(span = 2) +
  labs(title = "Word similarity correlation across Open-CLIP training",
       x = "Epoch",
       y = "Pearson Correlation") +
  scale_color_manual(values = c("TRUE" = "black", "FALSE" = "gray")) +  # Set color for significance
  theme_minimal() +
  theme(legend.position = "none")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

image_openclip_age <- epoch_age_half_plot(calculate_correlations(openclip_age_half_summarized, "image_similarity", "mean_value", group_var=c("epoch", "age_half")), "image_similarity")
text_openclip_age <- epoch_age_half_plot(calculate_correlations(openclip_age_half_summarized, "text_similarity", "mean_value", group_var=c("epoch", "age_half")), "text_similarity")
image_openclip_age
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

text_openclip_age
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggsave(here("figures",PROJECT_VERSION,"openclip_image_embeddings_age.png"),image_openclip_age,width=9,height=6,bg = "white")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Examining linear correlation between similarity types

cor(data.frame(trial_metadata$multimodal_similarity, trial_metadata$image_similarity, trial_metadata$text_similarity, trial_metadata$cor))
                                     trial_metadata.multimodal_similarity
trial_metadata.multimodal_similarity                            1.0000000
trial_metadata.image_similarity                                 0.4004366
trial_metadata.text_similarity                                  0.4690952
trial_metadata.cor                                              0.4866477
                                     trial_metadata.image_similarity
trial_metadata.multimodal_similarity                       0.4004366
trial_metadata.image_similarity                            1.0000000
trial_metadata.text_similarity                             0.7655238
trial_metadata.cor                                         0.6097675
                                     trial_metadata.text_similarity
trial_metadata.multimodal_similarity                      0.4690952
trial_metadata.image_similarity                           0.7655238
trial_metadata.text_similarity                            1.0000000
trial_metadata.cor                                        0.8812508
                                     trial_metadata.cor
trial_metadata.multimodal_similarity          0.4866477
trial_metadata.image_similarity               0.6097675
trial_metadata.text_similarity                0.8812508
trial_metadata.cor                            1.0000000
cor.test(trial_metadata$image_similarity, trial_metadata$text_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$image_similarity and trial_metadata$text_similarity
t = 6.5168, df = 30, p-value = 3.321e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5686112 0.8794603
sample estimates:
      cor 
0.7655238 
cor.test((imagenetvit_similarities |> arrange(word1, word2))$image_similarity, (trial_metadata |> distinct(Trials.imagePair, .keep_all=TRUE) |> arrange(Trials.imagePair))$image_similarity, method="spearman")

    Spearman's rank correlation rho

data:  (arrange(imagenetvit_similarities, word1, word2))$image_similarity and (arrange(distinct(trial_metadata, Trials.imagePair, .keep_all = TRUE), Trials.imagePair))$image_similarity
S = 272, p-value = 0.01597
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho 
0.6 
# TODO: add direct similarity matrix 

AoA plots

aoa_jittered <- target_looking_item_level %>%
  rowwise() %>%
  mutate(aoa_x_jittered = AoA_Est_target + runif(n(), -0.01, 0.01))

aoa_across_time <- ggplot(aoa_jittered, aes(x=aoa_x_jittered, y=mean_value))+
  geom_hline(yintercept=0,linetype="dashed")+
  geom_point(size=8, alpha=0.5, color="#CF784B")+
  #geom_linerange(aes(ymin = mean_value - ci, ymax = mean_value + ci), alpha = 0.1) + 
  geom_smooth(method="lm", color="#CF784B", alpha=0.5)+
  geom_label_repel(size=5,aes(label=Trials.targetImage),force_pull=10,nudge_y=-0.03) +
  xlab("Estimated target word age-of-acquisition (AoA)")+
  ylab("Baseline-corrected\nprop. target looking")+
  #ggtitle("Prop. of target looking by estimated AoA of target words")+
  theme_minimal()+
 theme(
    text = element_text(size=10,face = "bold"),              # All text bold
    # Increase space between x-axis and its title
    axis.title.x = element_text(
      face = "bold", 
      size = 20,
      margin = margin(t = 10, r = 0, b = 0, l = 0)
    ),
    
    # Increase space between y-axis and its title
    axis.title.y = element_text(
      face = "bold", 
      size= 20,
      margin = margin(t = 10, r = 5, b = 0, l = 0)
    ),      
    axis.text = element_text(size=16,face = "bold"),         # Axis text bold
    legend.title = element_text(size = 12, face = "bold"),  # Small bold legend title
    legend.text = element_text(size = 12, face = "bold")    # Small bold legend text
  )+
 scale_x_continuous(breaks = seq(3, 6, by = 0.5)) +
   coord_cartesian(xlim = c(3.2,6.5), ylim = c(-0.12, 0.18))
aoa_across_time
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("figures",PROJECT_VERSION,"aoa_plot.svg"),aoa_across_time,width=8.5,height=3.3,bg = "white",device="pdf")
`geom_smooth()` using formula = 'y ~ x'
  #ggpubr::stat_cor(method = "spearman", label.y = max(target_looking_item_level$mean_value) + 0.05)


age_half_plot(target_looking_item_age_level, "AoA_Est_target", x_label="AoA of target word", title="Target looking by estimated AoA of target word across age half")
`geom_smooth()` using formula = 'y ~ x'

ggplot(target_looking_trial_level, aes(x=AoA_Est_distractor, y=mean_value)) +
  geom_point()+
  geom_label_repel(aes(label=Trials.imagePair)) +
  geom_smooth(method="gam")+
  geom_hline(yintercept=0,linetype="dashed")+
  xlab("Estimated AoA of distractor words")+
  ylab("Baseline-corrected prop. of target looking")+
  ggtitle("Prop. of target looking by estimated AoA of distractor words")+
  theme_minimal()+
  theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=10,angle=0,vjust=0.5),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold"),
        
        ) +
  ggpubr::stat_cor(method = "spearman")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

hist(target_looking_item_level$AoA_Est_target)

aoa_data <- looking_data_summarized |> add_aoa_split() 
aoa_data$aoa_half <- factor(aoa_data$aoa_half, levels = c("lower", "higher"))

aoa_plot <- half_violins_plot(aoa_data, "aoa_half", "corrected_target_looking", "SubjectInfo.subjID", c("lower", "higher"), input_xlab = "Age-of-acquisition of target word", input_caption=paste0("AoA split at median=", median_aoa))
Warning in geom_path(aes(group = SubjectInfo.subjID), color = "black", fill =
NA, : Ignoring unknown parameters: `fill`
aoa_plot

summarize_trial <- looking_time_resampled_clean %>%
  filter(trial_exclusion == 0 & exclude_participant ==0 & exclude_participant_insufficient_data == 0) %>%
  filter(!is.na(accuracy_transformed)) %>%
  group_by(SubjectInfo.subjID, time_normalized_corrected, SubjectInfo.testAge) %>%
  summarized_data(., "Trials.trialID", "accuracy_transformed", c("time_normalized_corrected")) %>%
  rename(mean_accuracy = mean_value)
Warning: There were 417 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `Trials.trialID = "easy-acorn-key"` `time_normalized_corrected =
  -3800`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 416 remaining warnings.
 summarize_across_aoa <- summarize_trial |> left_join(trial_metadata) |> add_aoa_split() |>
  group_by(time_normalized_corrected, aoa_half) |>
  dplyr::summarize(n=n(),
            non_na_n = sum(!is.na(mean_accuracy)),
            mean_value=mean(mean_accuracy,na.rm=TRUE),
            sd_accuracy=sd(mean_accuracy,na.rm=TRUE),
            se_accuracy=sd_accuracy/sqrt(n),
            ci = ifelse(n > 1, qt(0.975, n - 1) * sd_accuracy / sqrt(n), NA)) 
Joining with `by = join_by(Trials.trialID)`
`summarise()` has grouped output by 'time_normalized_corrected'. You can
override using the `.groups` argument.
looking_times_aoa <- ggplot(summarize_across_aoa,aes(time_normalized_corrected,mean_value,color=aoa_half))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=mean_value-ci,ymax=mean_value+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking") +
 # labs(title="Proportion of looking time across time, split at mean age-of-acquisition") +
  scale_color_manual(values=c("#6BAED6", "#4292C6"), name="Target word \nage-of-acquisition (AoA)", labels=c( "AoA >= 4.78", "AoA < 4.78"))+
   guides(color = guide_legend(reverse = TRUE))

looking_times_aoa
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 191 rows containing non-finite outside the scale range
(`stat_smooth()`).

AoA correlations

aoa_wb_trials <- trial_metadata |> filter(!is.na(AoA_target_WB))
cor.test(aoa_wb_trials$AoA_target_WB, aoa_wb_trials$AoA_Est_target, method="spearman")
Warning in cor.test.default(aoa_wb_trials$AoA_target_WB,
aoa_wb_trials$AoA_Est_target, : Cannot compute exact p-value with ties

    Spearman's rank correlation rho

data:  aoa_wb_trials$AoA_target_WB and aoa_wb_trials$AoA_Est_target
S = 603.22, p-value = 0.3121
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.2607651 
cor.test(trial_metadata$AoA_Est_target, trial_metadata$image_similarity, method="pearson")

    Pearson's product-moment correlation

data:  trial_metadata$AoA_Est_target and trial_metadata$image_similarity
t = 1.4523, df = 30, p-value = 0.1568
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1014633  0.5553600
sample estimates:
      cor 
0.2562984 
subjects_data <- read.csv(here("data/main/processed_data/level-participant_added-trials_data.csv")) |> filter(exclude_participant_insufficient_data == 0)
trial_data <- looking_data_summarized |> distinct(SubjectInfo.subjID, SubjectInfo.testAge)
subjects_data <- subjects_data |> 
  left_join(trial_data)
Joining with `by = join_by(SubjectInfo.subjID)`
hist(subjects_data$SubjectInfo.testAge/30, breaks=10)

Saliency correlations

target_looking_trial_age_level <- summarized_data(target_looking_trial_subject_level |> rename(mean_target_looking = mean_value) |> add_age_split(), "Trials.trialID", "mean_target_looking", c("AoA_Est_target", "age_half"))
trialid_age_level_summary <- target_looking_trial_age_level |> left_join(trial_metadata)
Joining with `by = join_by(Trials.trialID, AoA_Est_target)`
trialid_level_summary <- target_looking_trial_level |> left_join(trial_metadata)
Joining with `by = join_by(Trials.trialID, AoA_Est_distractor, AoA_Est_target,
Trials.targetImage, Trials.distractorImage, Trials.imagePair)`
age_half_plot(trialid_age_level_summary, "MeanTargetSaliency", x_label="Mean target saliency")
`geom_smooth()` using formula = 'y ~ x'

age_half_plot(trialid_age_level_summary, "MeanSaliencyDiff", x_label="Mean saliency difference", title="Target looking by mean saliency difference across age and trial types")
`geom_smooth()` using formula = 'y ~ x'

target_looking_image_pair <- summarized_data(looking_data_summarized, "Trials.imagePair", "corrected_target_looking", c("SubjectInfo.subjID", "SubjectInfo.testAge"))
Warning: There were 338 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 2: `Trials.imagePair = "acorn-coconut"`, `SubjectInfo.subjID =
  "3EXdF6"`, `SubjectInfo.testAge = 720`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 337 remaining warnings.
target_looking_imagepair_level <- summarized_data(target_looking_image_pair |> rename("mean_looking"="mean_value") |> add_age_split(), "Trials.imagePair", "mean_looking", c("age_half")) |> left_join(trial_metadata |> distinct(Trials.imagePair, .keep_all=TRUE)) |> mutate(MeanSaliencyDiff = abs(MeanSaliencyDiff))
Joining with `by = join_by(Trials.imagePair)`
age_half_plot(target_looking_imagepair_level, "MeanSaliencyDiff", x_label="Mean saliency difference", title="Target looking by mean saliency difference across age half and image pairs")
`geom_smooth()` using formula = 'y ~ x'

Saliency vs baseline

baseline_looking_trial_age_level <- summarized_data(looking_data_summarized |> add_age_split(), "Trials.trialID", "mean_target_looking_baseline_window", c("age_half")) |> left_join(trial_metadata)
Joining with `by = join_by(Trials.trialID)`
baseline_looking_trial_level <- summarized_data(looking_data_summarized |> rename("mean_looking"="mean_target_looking_critical_window"), "Trials.trialID", "mean_looking", c("Trials.trialID")) |> left_join(trial_metadata)
Joining with `by = join_by(Trials.trialID)`
age_half_plot(baseline_looking_trial_age_level, "MeanSaliencyDiff", x_label="Mean saliency difference", y_label = "Target looking in baseline window")
`geom_smooth()` using formula = 'y ~ x'

ggplot(baseline_looking_trial_level, aes(x=MeanSaliencyDiff, y=mean_value)) +
  geom_smooth(method="lm") +
  geom_point() +
  ylab("Baseline looking") +
  xlab("Mean target-distractor saliency difference") +
  ggpubr::stat_cor()
`geom_smooth()` using formula = 'y ~ x'

Embedding type comparisons

Luce vs logit

luce_vs_logit10 <- read.csv(here("data/embeddings/similarities_clip.csv"))
ggplot(luce_vs_logit10, aes(x=luce, y=softmax_logit_scale_10)) +
  geom_point() +
  geom_smooth() +
  ggpubr::stat_cor() +
  labs(title="Luce vs Logit10 similarity", x="Luce similarity", y="Logit10 similarity")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

cor.test(trial_metadata$text_similarity, trial_metadata$image_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$text_similarity and trial_metadata$image_similarity
t = 6.5168, df = 30, p-value = 3.321e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5686112 0.8794603
sample estimates:
      cor 
0.7655238 
cor.test(trial_metadata$multimodal_similarity, trial_metadata$image_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$multimodal_similarity and trial_metadata$image_similarity
t = 2.3936, df = 30, p-value = 0.02314
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.06013991 0.65734550
sample estimates:
      cor 
0.4004366 
cor.test(trial_metadata$AoA_Est_target, trial_metadata$text_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$AoA_Est_target and trial_metadata$text_similarity
t = 0.42649, df = 30, p-value = 0.6728
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2786047  0.4150887
sample estimates:
       cor 
0.07763097 
cor.test(trial_metadata$AoA_Est_target, trial_metadata$image_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$AoA_Est_target and trial_metadata$image_similarity
t = 1.4523, df = 30, p-value = 0.1568
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1014633  0.5553600
sample estimates:
      cor 
0.2562984 
cor.test(trial_metadata$MeanSaliencyDiff, trial_metadata$AoA_Est_target)

    Pearson's product-moment correlation

data:  trial_metadata$MeanSaliencyDiff and trial_metadata$AoA_Est_target
t = -0.037373, df = 30, p-value = 0.9704
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.3546734  0.3426861
sample estimates:
         cor 
-0.006823239 

Z-scoring

zscore <- read.csv(here("data/embeddings/zscore_similarities.csv"))
l2_zscore <- read.csv(here("data/embeddings/l2_zscore_similarities.csv"))
raw <- read.csv(here("data/embeddings/raw_similarities.csv"))
cor_df <- data.frame(
  zscore = zscore$image_sim,
  raw = raw$image_sim,
  l2_zscore = l2_zscore$image_sim,
  text = paste(zscore$text1,"-",zscore$text2)
)

cor_df_text <- data.frame(
  zscore = zscore$text_sim,
  raw = raw$text_sim,
  l2_zscore = l2_zscore$text_sim
)

cor(cor_df |> select(-text))
             zscore       raw l2_zscore
zscore    1.0000000 0.7133655 0.9986330
raw       0.7133655 1.0000000 0.6843753
l2_zscore 0.9986330 0.6843753 1.0000000
cor.test(cor_df_text$zscore, cor_df_text$raw)

    Pearson's product-moment correlation

data:  cor_df_text$zscore and cor_df_text$raw
t = 14.009, df = 274, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5715425 0.7099364
sample estimates:
      cor 
0.6460174 
cor(cor_df |> select(-text), method="spearman")
             zscore       raw l2_zscore
zscore    1.0000000 0.6278530 0.9975741
raw       0.6278530 1.0000000 0.5905256
l2_zscore 0.9975741 0.5905256 1.0000000
cor.test(trial_metadata$text_similarity, trial_metadata$cor)

    Pearson's product-moment correlation

data:  trial_metadata$text_similarity and trial_metadata$cor
t = 10.212, df = 30, p-value = 2.799e-11
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.7687985 0.9408377
sample estimates:
      cor 
0.8812508 
model <- lm(zscore ~ raw, data = cor_df)

# 2. Compute residuals and add to dataframe
cor_df$resid <- abs(residuals(model))

# 3. Flag outliers (e.g., residuals > 2 SD from mean)
threshold <- mean(cor_df$resid) + 2 * sd(cor_df$resid)
cor_df$outlier <- cor_df$resid > threshold | cor_df$text == "bulldozer - orange"

# 4. Plot
ggplot(cor_df, aes(x = raw, y = zscore)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Raw embeddings") +
  ylab("Z-scored embeddings") +
  ggtitle("LWL Pairwise Image Embeddings") +
  ggpubr::stat_cor(method = "pearson") +
  geom_label_repel(
    data = subset(cor_df, outlier),
    aes(label = text),
    box.padding = 0.35,
    point.padding = 0.3,
    segment.color = 'grey50'
  )
`geom_smooth()` using formula = 'y ~ x'

unique_trials <- trial_metadata |> distinct(Trials.imagePair, .keep_all=TRUE)
model <- lm(image_sim ~ image_similarity, data = unique_trials)

# 2. Compute residuals and add to dataframe
unique_trials$resid <- abs(residuals(model))

# 3. Flag outliers (e.g., residuals > 2 SD from mean)
threshold <- mean(unique_trials$resid) + sd(unique_trials$resid)
unique_trials$outlier <- unique_trials$resid > threshold | unique_trials$Trials.imagePair == "bulldozer-orange"

# 4. Plot
ggplot(unique_trials, aes(x = image_similarity, y = image_sim)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Raw embeddings") +
  ylab("Z-scored embeddings") +
  ggtitle("LWL study-specific Image Embeddings") +
  ggpubr::stat_cor(method = "pearson") +
  geom_label_repel(
    data = subset(unique_trials, outlier),
    aes(label = Trials.imagePair),
    box.padding = 0.35,
    point.padding = 0.3,
    segment.color = 'grey50'
  )
`geom_smooth()` using formula = 'y ~ x'