library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

here() starts at /Users/visuallearninglab/Documents/visvocab

all_garden_roar_data_with_age <- read_csv(here("data/garden/GARDEN-phase1_study-VO_type-rawtrials_data.csv"))

Rows: 24824 Columns: 69
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (26): trialId, answerWord, assessment_stage, block, internal_node_id, i...
dbl  (22): correct, correctSide, numAFC, replay, response, rt, start_time_un...
lgl  (12): save_trial, completed, scores.raw.composite.practice.thetaEstimat...
dttm  (9): serverTimestamp, start_time, CreateTime_x, timeFinished, timeStar...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

low_counts <- all_garden_roar_data_with_age |>
  filter(assessment_stage == 'test_response') |>
  group_by(targetWord) |>
  mutate(num_participants = n()) |>
  filter(num_participants < 5) |> # correct trial types should have at least 10 participants 
  select(-starts_with("scores."), -starts_with("user.")) |>
  ungroup()

summarized_low_counts <- low_counts |>
  summarize(n = n(), child_ids = list(child_id), .by=c("targetWord", "itemSource"))

filtered_garden_roar_data_with_age <- all_garden_roar_data_with_age |>
  filter(assessment_stage == 'test_response') |>
  filter(!(targetWord %in% summarized_low_counts$targetWord)) |>
   filter(assessment_stage == 'test_response') |>
  select(child_id,correct, correctSide, child__age_rounded, targetWord, answerWord, trial_index, rt, options, itemSource, trialId)

garden_similarities <- read_csv(here("data/garden/vv_clip_similarities.csv"))

Rows: 476 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): target, option, image
dbl (6): trial, img_idx, tgt_idx, sim_img_img, sim_img_txt, sim_txt_txt

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

all_children <- filtered_garden_roar_data_with_age |> distinct(child_id, .keep_all = TRUE)
age_check <- filtered_garden_roar_data_with_age |> 
  group_by(child_id) |> 
  summarise(n_ages = n_distinct(child__age_rounded), .groups = "drop") |> 
  filter(n_ages > 1)
print(age_check)

# A tibble: 0 × 2
# ℹ 2 variables: child_id <chr>, n_ages <int>

hist(all_children$child__age_rounded/30, breaks=30)

min(all_children$child__age_rounded/30)

[1] 37

prop_correct <- filtered_garden_roar_data_with_age |>
  group_by(child_id) |>
   mutate(percentage_correct = round(mean(correct), 4), num_trials = length(unique(trialId)),
          age_months = child__age_rounded/30
          )  |>
  distinct(percentage_correct, child_id, age_months, num_trials)

library(ggthemes)
ggplot(prop_correct, aes(x=age_months, y=percentage_correct, size=num_trials)) +
  geom_point(alpha=0.3) +
  ylab("Percent correct") +
  xlab("Age in years") +
  geom_smooth(aes(weight=num_trials)) +
  theme(legend.position='none')

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: size.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

library(purrr)
library(jsonlite)


Attaching package: 'jsonlite'

The following object is masked from 'package:purrr':

    flatten

filtered_garden_roar_data_with_age <- filtered_garden_roar_data_with_age |> mutate(parsed = map(options, ~ fromJSON(gsub("'", '"', .x))),, 
    sorted_items = map(parsed, ~ sort(as.character(unname(.x)))),
    image1 = map_chr(sorted_items, 1),
    image2 = map_chr(sorted_items, 2),
    image3 = map_chr(sorted_items, 3),
    image4 = map_chr(sorted_items, 4)
  ) %>%
  select(-sorted_items, -parsed)

garden_similarities_sorted <- garden_similarities |> 
  group_by(trial) %>%
  arrange(image, .by_group = TRUE) %>%
  mutate(option = paste0("image", row_number())) %>%
  ungroup()

garden_with_sims <- filtered_garden_roar_data_with_age |>
  # first add in trial ids
  left_join(garden_similarities_sorted |> distinct(target, trial), by=c("targetWord"="target")) |>
  rename(full_trial_id = trial) |>
  filter(!is.na(full_trial_id)) |>
  left_join(garden_similarities_sorted |> select(image, sim_img_img, sim_img_txt, sim_txt_txt), by=c("answerWord"="image")) |>
  mutate(age_in_months = child__age_rounded/30)

Replicating error pattern graph I think

total_trials <- garden_with_sims |> 
  filter(correct == 0) |> 
  count(targetWord, name = "totalTargetTrials")

total_trials_full <- garden_with_sims |> count(targetWord)

garden_error_patterns <- garden_with_sims |> 
  filter(correct == 0) |> 
  count(targetWord, answerWord, sim_img_img, sim_img_txt, sim_txt_txt, name = "n") |> 
  left_join(total_trials, by = "targetWord") |> 
  mutate(pc = n / totalTargetTrials) 
  # |> filter(totalTargetTrials > 20)

ggplot(garden_error_patterns, aes(x=sim_img_img, y=pc)) +
  geom_jitter(alpha=0.4) + 
  geom_smooth(method="lm") +
  ggpubr::stat_cor()

`geom_smooth()` using formula = 'y ~ x'

VisVocab plot?

most_similar_distractors <- garden_with_sims %>%
  filter(answerWord != targetWord) %>%
  group_by(targetWord) %>%
  filter(if_any(
    c("sim_img_img", "sim_img_txt", "sim_txt_txt"),
    ~ . == max(c_across(c("sim_img_img", "sim_img_txt", "sim_txt_txt")), na.rm = TRUE)
  )) %>%
  slice(1) %>%  # in case of ties, keep just one
  select(most_similar = answerWord, targetWord, max_sim_img_img = sim_img_img, max_sim_img_txt = sim_img_txt, max_sim_txt_txt = sim_txt_txt)


# Step 4: Summarize percent correct when distractor was present (per targetWord)
summary_stats <- garden_with_sims %>%
  left_join(most_similar_distractors, by = "targetWord") %>%
  group_by(targetWord, most_similar) %>%
  filter(age_in_months < 48) |>
  summarise(
    n_trials = n(),
    percent_correct = mean(correct, na.rm = TRUE) * 100,
    across(starts_with("max_sim"), unique),
    .groups = "drop"
  )

ggplot(summary_stats |> filter(!(targetWord %in% c("net", "kimono", "artichoke"))), aes(x=max_sim_txt_txt, y=percent_correct, label=paste(targetWord, most_similar, sep="-"))) +
  ggrepel::geom_label_repel() +
  geom_point() + 
  geom_smooth(method="lm") +
  ggpubr::stat_cor() +
  labs(title="Text similarity effects in 3 year olds")

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Merge in AoA data

aoa_data <- read.csv(here("stimuli/lookit/preprocessing/older_stimuli/exp1_all_trials2023-04-11.csv")) %>%
  select(Word1, Word2, AoA_Est_Word1, AoA_Est_Word2) %>%
  pivot_longer(
    cols = everything(),
    names_to = c(".value", "which"),
    names_pattern = "(Word|AoA_Est_Word)(\\d+)"
  ) %>%
  rename(word = Word, aoa = AoA_Est_Word) %>%
  distinct(word, aoa)

aoa_comps <- summary_stats |> left_join(aoa_data, by=c("targetWord"="word"))
cor.test(aoa_comps$aoa, aoa_comps$max_sim_txt_txt)


    Pearson's product-moment correlation

data:  aoa_comps$aoa and aoa_comps$max_sim_txt_txt
t = -5.1041, df = 87, p-value = 1.933e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.6257423 -0.3019746
sample estimates:
       cor 
-0.4800423

cor.test(aoa_comps$aoa, aoa_comps$max_sim_img_img)


    Pearson's product-moment correlation

data:  aoa_comps$aoa and aoa_comps$max_sim_img_img
t = -2.7361, df = 87, p-value = 0.007536
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.46261629 -0.07778073
sample estimates:
      cor 
-0.281478

ok..can’t make any strong conclusions then.

Merge in VisVocab data

trial_metadata <- read.csv(here("data","metadata","level-trialtype_data.csv"))
trial_summary_data <- read.csv(here("data","main", "processed_data","level-trials_data.csv"))

usable_trials <- trial_summary_data |>
  # excluding scam participants
  filter(exclude_participant_insufficient_data == 0 & trial_exclusion == 0 & exclude_participant == 0 & SubjectInfo.subjID != "PH2RNZ") 

# Merging with similarity information and mean-centering main effects
trials_with_effect_vars <- usable_trials |>
  left_join(trial_metadata) |>
  mutate(age_in_months = SubjectInfo.testAge/30)

Joining with `by = join_by(Trials.trialID, Trials.targetImage,
Trials.distractorImage, Trials.imagePair)`

source("helpers.R")


Attaching package: 'rlang'

The following objects are masked from 'package:jsonlite':

    flatten, unbox

The following objects are masked from 'package:purrr':

    %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice

trial_summary_data_summarized <- summarized_data(trials_with_effect_vars, "Trials.targetImage", "corrected_target_looking", c("Trials.distractorImage", "Trials.trialType", "Trials.trialID", "image_similarity", "text_similarity")) |>
  mutate(mean_value = mean_value+0.5)

garden_visvocab_corr_base <- trial_summary_data_summarized |> 
  inner_join(summary_stats, 
            by = c("Trials.targetImage" = "targetWord", 
                   "Trials.distractorImage" = "most_similar"))

cor.test(garden_visvocab_corr_base$mean_value, garden_visvocab_corr_base$percent_correct, method="spearman")


    Spearman's rank correlation rho

data:  garden_visvocab_corr_base$mean_value and garden_visvocab_corr_base$percent_correct
S = 32, p-value = 0.115
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6190476

garden_multiple <- garden_with_sims |> 
  count(targetWord, answerWord, sim_img_img, sim_img_txt, sim_txt_txt, name = "n") |> 
  left_join(total_trials_full |> rename(total = n), by = "targetWord") |> 
  mutate(pc = n / total)

garden_visvocab_error <- trial_summary_data_summarized |> 
  inner_join(garden_error_patterns, 
            by = c("Trials.targetImage" = "targetWord", 
                   "Trials.distractorImage" = "answerWord"))
cor.test(garden_visvocab_error$mean_value, garden_visvocab_error$pc, method="spearman")

Warning in cor.test.default(garden_visvocab_error$mean_value,
garden_visvocab_error$pc, : Cannot compute exact p-value with ties


    Spearman's rank correlation rho

data:  garden_visvocab_error$mean_value and garden_visvocab_error$pc
S = 1028.8, p-value = 0.04219
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.5128973

are these 16 pairs significant?

ggplot(garden_visvocab_error, aes(x=sim_img_img, y=pc, label=paste(Trials.targetImage, Trials.distractorImage, sep="-"))) +
  geom_jitter(alpha=0.4, size=4) + 
  ggrepel::geom_label_repel() +
  geom_smooth(method="lm") +
  ggpubr::stat_cor() +
  xlab("Image similarity") +
  ylab("Percentage of errors")

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Well, in a certain sense, our effects are getting stronger with age..restricting to 3 year olds.

total_trials_3 <- garden_with_sims |> 
  filter(correct == 0 & age_in_months < 48) |> 
  count(targetWord, name = "totalTargetTrials")

total_trials_full_3 <- garden_with_sims |> count(targetWord)

garden_error_patterns_3 <- garden_with_sims |> 
  filter(correct == 0) |> 
  filter(age_in_months < 48) |>
  count(targetWord, answerWord, sim_img_img, sim_img_txt, sim_txt_txt, name = "n") |> 
  left_join(total_trials_3, by = "targetWord") |> 
  mutate(pc = n / totalTargetTrials)

garden_visvocab_error_3 <- trial_summary_data_summarized |> 
  inner_join(garden_error_patterns_3, 
            by = c("Trials.targetImage" = "targetWord", 
                   "Trials.distractorImage" = "answerWord"))
cor.test(garden_visvocab_error_3$mean_value, garden_visvocab_error_3$pc, method="spearman")

Warning in cor.test.default(garden_visvocab_error_3$mean_value,
garden_visvocab_error_3$pc, : Cannot compute exact p-value with ties


    Spearman's rank correlation rho

data:  garden_visvocab_error_3$mean_value and garden_visvocab_error_3$pc
S = 857.27, p-value = 0.04175
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.5308313

ggplot(garden_visvocab_error_3, aes(x=sim_img_img, y=pc, label=paste(Trials.targetImage, Trials.distractorImage, sep="-"))) +
  geom_jitter(alpha=0.4, size=4) + 
  ggrepel::geom_label_repel() +
  geom_smooth(method="lm") +
  ggpubr::stat_cor() +
  xlab("Image similarity") +
  ylab("Percentage of errors") +
  labs(title="Error patterns for GARDEN 3-year-olds filtered to VisVocab stimuli")

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

This is very clean. Lets see what new stimuli are good fits here..

lm_fit <- lm(pc ~ sim_img_img, data = garden_visvocab_error_3)
slope <- coef(lm_fit)[["sim_img_img"]]
intercept <- coef(lm_fit)[["(Intercept)"]]
garden_error_patterns_3$predicted_pc <- intercept + slope * garden_error_patterns_3$sim_img_img
garden_error_patterns_3$within_threshold <- abs(garden_error_patterns_3$pc - garden_error_patterns_3$predicted_pc) < 0.5  # example: within 5%

ggplot(garden_error_patterns_3, aes(x = sim_img_img, y = pc, 
                             label = paste(targetWord, answerWord, sep = "-"),
                             color = within_threshold)) +
  geom_jitter(alpha = 0.6, size = 4) +
  ggrepel::geom_label_repel() +
  geom_smooth(method = "lm", color = "black") +
  ggpubr::stat_cor() +
  xlab("Image similarity") +
  ylab("Percentage of errors") +
  labs(title = "Stimuli fitting regression slope (GARDEN 3-year-olds, VisVocab)") +
  scale_color_manual(values = c("TRUE" = "lightblue", "FALSE" = "gray"))

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Warning: ggrepel: 238 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

ggplot(garden_visvocab_error_3, aes(x=sim_img_img, y=mean_value, label=paste(Trials.targetImage, Trials.distractorImage, sep="-"))) +
  geom_jitter(alpha=0.4, size=4) + 
  ggrepel::geom_label_repel() +
  geom_smooth(method="lm") +
  ggpubr::stat_cor() +
  xlab("Image similarity") +
  ylab("Corrected target looking") +
  labs(title="VisVocab data")

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?