── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.2          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
here() starts at /Users/visuallearninglab/Documents/visvocab

Note: The package "relayer" is highly experimental. Use at your own risk.

Loading required package: viridisLite
# Function to summarize whether a trial is usable, based on whether the subject is looking at the screen for at least 50% of the critical window
summarize_subj_usable_trials <- function(data, critical_window, suffix, additional_fields = NULL) {
  additional_fields <- additional_fields %||% list()
  
  data %>%
    filter(time_normalized_corrected >= critical_window[1] &
             time_normalized_corrected <= critical_window[2]) %>%
    group_by(SubjectInfo.subjID, Trials.trialID, Trials.ordinal, Trials.trialType) %>%
    summarize(
      length = n(),
      useable_frames = sum(not_looking_away, na.rm = TRUE),
      percent_usable = useable_frames / length,
      usable = ifelse(percent_usable >= 0.5, 1, 0), # usable if at least 50% looking
      mean_target_looking = mean(accuracy, na.rm = TRUE),
      !!!additional_fields,
    ) %>%
    rename_with(~ paste0(., "_", suffix), -c(SubjectInfo.subjID, Trials.trialID, Trials.ordinal, Trials.trialType))
}

# Function to compute whether a trial is usable based on whether both the critical window and the baseline window are usable
compute_usable_trial <- function(baseline_col, critical_col) {
  case_when(
    is.na(baseline_col) ~ 0,
    is.na(critical_col) ~ 0,
    baseline_col == 1 & critical_col == 1 ~ 1,
    TRUE ~ 0
  )
}
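
A minimal usage sketch of the two helpers above. Everything here is illustrative: per_frame_looking_data is a hypothetical per-frame data frame containing the columns the helper references, and the window bounds are placeholders, not the study's actual analysis windows.

# Hypothetical sketch: flag usability per window, join, then combine the flags
baseline_usable <- summarize_subj_usable_trials(per_frame_looking_data, c(-2000, 0), "baseline_window")
critical_usable <- summarize_subj_usable_trials(per_frame_looking_data, c(300, 2300), "critical_window")
trial_usability <- baseline_usable |>
  left_join(critical_usable,
            by = c("SubjectInfo.subjID", "Trials.trialID", "Trials.ordinal", "Trials.trialType")) |>
  mutate(usable_trial = compute_usable_trial(usable_baseline_window, usable_critical_window))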

# Calculate mean, standard deviation, standard error, and confidence intervals for data grouped across two variables
summarized_data <- function(data, x_var, y_var, group_var) {
  data |>
    group_by(across(all_of(c(x_var, group_var)))) |>
    summarize(
      #across(everything(), ~ if (n_distinct(.) == 1) first(.) else NA),
      mean_value = mean(.data[[y_var]], na.rm = TRUE),
      sd_value = sd(.data[[y_var]], na.rm = TRUE),
      N = n(),
      se = sd_value / sqrt(N),
      ci = qt(0.975, N - 1) * sd_value / sqrt(N),
      lower_ci = mean_value - ci,
      upper_ci = mean_value + ci,
      .groups = "drop") |>
    select(where(~ !all(is.na(.))))
}

# Make an aesthetics-aware size scale with better (area-based) rescaling
scale_size_c <- function(name = waiver(), breaks = waiver(), labels = waiver(), 
                         limits = NULL, range = c(1, 6), trans = "identity", guide = "legend", aesthetics = "size") 
{
  continuous_scale(aesthetics, "area", scales::rescale_pal(range), name = name, 
                   breaks = breaks, labels = labels, limits = limits, trans = trans, 
                   guide = guide)
}
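
The deprecation warnings that appear with the subject- and item-level plots below come from this scale: ggplot2 3.5.0 deprecated the `scale_name` and `trans` arguments of continuous_scale(). Below is a sketch of an updated version, assuming ggplot2 >= 3.5.0; it is an alternative, not the definition used for the figures in this document.

# Sketch for ggplot2 >= 3.5.0: pass the palette by name and use `transform` instead of `trans`
scale_size_c <- function(name = waiver(), breaks = waiver(), labels = waiver(),
                         limits = NULL, range = c(1, 6), transform = "identity",
                         guide = "legend", aesthetics = "size") {
  continuous_scale(aesthetics, palette = scales::rescale_pal(range), name = name,
                   breaks = breaks, labels = labels, limits = limits,
                   transform = transform, guide = guide)
}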

# summarize target looking by input condition (note: summary_field is currently unused)
summarize_data <- function(data, summary_field) {
  data |>
    summarize(N = n(),
              #mean_age = mean(age),
              #mean_age_mo = mean(age_mo),
              average_corrected_target_looking = mean(corrected_target_looking, na.rm = TRUE),
              se = sd(corrected_target_looking, na.rm = TRUE) / sqrt(N),
              ci = qt(0.975, N - 1) * sd(corrected_target_looking, na.rm = TRUE) / sqrt(N),
              lower_ci = average_corrected_target_looking - ci,
              upper_ci = average_corrected_target_looking + ci,
              lower_se = average_corrected_target_looking - se,
              upper_se = average_corrected_target_looking + se,
              average_critical_window_looking = mean(mean_target_looking_critical_window, na.rm = TRUE),
              critical_window_ci = qt(0.975, N - 1) * sd(mean_target_looking_critical_window, na.rm = TRUE) / sqrt(N),
              critical_window_lower_ci = average_critical_window_looking - critical_window_ci,
              critical_window_upper_ci = average_critical_window_looking + critical_window_ci)
}


# Adapted from peekbank/peekds:
# https://github.com/langcog/peekds/blob/master/R/generate_aoi.R

resample_aoi_trial <- function(df_trial, sample_duration=1000/30) {
  
  print(paste0("Subject Number: ",unique(df_trial$sub_num), "; Trial Number: ", unique(df_trial$Trials.ordinal)))
  
  t_origin <- df_trial$t_norm
  data_origin <- df_trial$aoi
  
  # create the new timestamps for resampling
  t_start <- min(t_origin) - (min(t_origin) %% sample_duration)
  t_resampled <- seq(from = t_start, to = max(t_origin),
                     by = sample_duration)
  
  # exchange string values for integers for resampling;
  # this step is critical for interpolating missing values quickly and correctly
  aoi_num <- data_origin %>%
    dplyr::recode(target = 1, distractor = 2, other = 3, missing = 4)
  
  # start resampling with approx
  aoi_resampled <- stats::approx(x = t_origin, y = aoi_num, xout = t_resampled,
                                 method = "constant", rule = 2,
                                 ties = "ordered")$y
  aoi_resampled_recoded <- aoi_resampled %>%
    dplyr::recode("1" = "target", "2" = "distractor",
                  "3" = "other", "4" = "missing")
  
  
  # adding back the columns to match schema
  dplyr::tibble(t_norm = t_resampled,
                aoi = aoi_resampled_recoded,
                trial_id = df_trial$trial_id[1],
                administration_id = df_trial$administration_id[1])
}

resample_times <- function(df_table, sample_duration) {
  
  # first check if this data frame has all the correct columns required for
  # re-sampling
  required_columns <- c("trial_id", "administration_id", "t_norm", "aoi")
  
  # times must be re-zeroed and normalized before resampling;
  # decoupling resampling from time centering causes problems downstream
  if (!all(required_columns %in% colnames(df_table))) {
    stop(paste0("Resample times function requires the following columns to be ",
                "present in the dataframe: ",
                paste(required_columns, collapse = ", "),
                ". Times should be re-zeroed and normalized before being resampled!"))
  }
  
  # main resampling call
  # start resampling process by iterating through every trial within every
  # administration
  df_out <- df_table %>%
    dplyr::mutate(admin_trial_id = paste(.data$administration_id,
                                         .data$trial_id, sep = "_")) %>%
    split(.$admin_trial_id) %>%
    purrr::map_df(resample_aoi_trial, sample_duration=sample_duration) %>%
    dplyr::arrange(.data$administration_id, .data$trial_id)
  
  return(df_out)
}
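
A toy call illustrating the expected input shape; demo_samples and its values are made up for illustration (sub_num and Trials.ordinal are only needed for the per-trial progress message).

# Made-up single-trial input with irregular timestamps, resampled to a 30 Hz grid
demo_samples <- dplyr::tibble(
  sub_num = "demo", administration_id = 1, trial_id = 1, Trials.ordinal = 1,
  t_norm = c(0, 12, 25, 41, 60, 78),
  aoi = c("target", "target", "missing", "distractor", "distractor", "target"))
resample_times(demo_samples, sample_duration = 1000/30)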

Load data

looking_time_resampled_clean <- read.csv(here("data","processed_data","looking_times_resampled_clean.csv"))
all_looking_times <- read.csv(here("data","processed_data","looking_times_with_exclusion_info.csv"))
trial_metadata <- read.csv(here("data","metadata","trial_metadata.csv"))
trial_summary_data <- read.csv(here("data","processed_data","trial_summary_data.csv"))

Overall timecourse plot of proportion target looking

#summarizing within subject for each time point
summarize_subj <- looking_time_resampled_clean %>%
  filter(trial_exclusion == 0 & exclude_participant ==0 & exclude_participant_insufficient_data == 0) %>%
  group_by(SubjectInfo.subjID, time_normalized_corrected) %>%
  summarize(N=n(),
            non_na_n = sum(!is.na(accuracy_transformed)), 
            mean_accuracy=mean(accuracy_transformed,na.rm=TRUE),
            sd_accuracy=sd(accuracy_transformed,na.rm=TRUE),
            se_accuracy=sd_accuracy/sqrt(non_na_n),
            ci=qt(0.975, non_na_n-1)*sd(accuracy_transformed,na.rm=T)/sqrt(non_na_n),
            lower_ci=mean_accuracy-ci,
            upper_ci=mean_accuracy+ci) %>%
  filter(non_na_n > 10) %>%
  ungroup()
Warning: There were 88 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, non_na_n - 1) * sd(accuracy_transformed, na.rm =
  T)/sqrt(non_na_n)`.
ℹ In group 1: `SubjectInfo.subjID = "AWMRW7"` and `time_normalized_corrected =
  -3700`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 87 remaining warnings.
`summarise()` has grouped output by 'SubjectInfo.subjID'. You can override
using the `.groups` argument.
#summarizing across subjects for each time point
summarize_across_subj <- summarize_subj %>%
  group_by(time_normalized_corrected) %>%
  dplyr::summarize(n=n(),
            accuracy=mean(mean_accuracy,na.rm=TRUE),
            sd_accuracy=sd(mean_accuracy,na.rm=TRUE),
            se_accuracy=sd_accuracy/sqrt(n),
            ci=qt(0.975, n-1)*sd_accuracy/sqrt(n)) %>%
  filter(n > 2)
looking_times <- ggplot(summarize_across_subj,aes(time_normalized_corrected,accuracy))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=accuracy-ci,ymax=accuracy+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
looking_times
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 127 rows containing non-finite outside the scale range
(`stat_smooth()`).

ggsave(here("data","figures","prop_looking_across_time.png"),looking_times,width=9,height=6,bg = "white")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 127 rows containing non-finite outside the scale range
(`stat_smooth()`).

Descriptive pilot plots

Usable trial stats

#Overall baseline-corrected proportion target looking by condition
looking_data_summarized <- trial_summary_data |>
  filter(trial_exclusion == 0 & exclude_participant == 0 & exclude_participant_insufficient_data == 0) |>
  left_join(trial_metadata) |>
  arrange(AoA_Est_target)
Joining with `by = join_by(Trials.trialID, Trials.targetImage,
Trials.distractorImage, Trials.imagePair)`
looking_data_by_subject_trial_type <- trial_summary_data |>
  mutate(trial_exclusion_reason = as.factor(ifelse(is.na(trial_exclusion_reason), "included", trial_exclusion_reason))) |>
  summarize(N = n(),
    .by=c(SubjectInfo.subjID, Trials.trialType, trial_exclusion_reason))

looking_data_by_subject_trial_type$trial_exclusion_reason <- factor(
  looking_data_by_subject_trial_type$trial_exclusion_reason,
  levels = c(setdiff(unique(looking_data_by_subject_trial_type$trial_exclusion_reason), "included"), "included"))
ggplot(looking_data_by_subject_trial_type, aes(x = Trials.trialType, y = N, fill = trial_exclusion_reason)) +
  geom_bar(stat = "identity", position = "stack") +  # Use stacked bar chart
  facet_wrap(~ SubjectInfo.subjID, ncol = 5) +  # One panel per subject
  labs(x = "Trial Type", y = "Count (N)", title = "Trial Exclusions by Trial Type and Subject", fill = "Trial Exclusion Reason") +
  theme_minimal() +  # Apply minimal theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(size = 8)) +
  scale_fill_brewer(palette = "Set3")

Subject-level timecourses

looking_times <- ggplot(summarize_subj,aes(time_normalized_corrected,mean_accuracy,color=SubjectInfo.subjID))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=mean_accuracy-ci,ymax=mean_accuracy+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking") +
  scale_color_brewer(palette = "Set3", name="Participant ID")
looking_times
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 133 rows containing non-finite outside the scale range
(`stat_smooth()`).

ggsave(here("data","figures","prop_looking_across_time_and_subject.png"),looking_times,width=9,height=6,bg = "white")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 133 rows containing non-finite outside the scale range
(`stat_smooth()`).

Estimating item and subject-level noise

target_looking_item_subject_level <- summarized_data(looking_data_summarized, "Trials.targetImage", "corrected_target_looking", c("SubjectInfo.subjID", "SubjectInfo.testAge", "AoA_Est_target"))
Warning: There were 67 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd_value/sqrt(N)`.
ℹ In group 1: `Trials.targetImage = "acorn"`, `SubjectInfo.subjID = "AWMRW7"`,
  `SubjectInfo.testAge = 720`, `AoA_Est_target = 5.95`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 66 remaining warnings.
target_looking_item_level <- summarized_data(target_looking_item_subject_level |> rename(mean_target_looking = mean_value), "Trials.targetImage", "mean_target_looking", "AoA_Est_target")

target_looking_subject_level <- summarized_data(looking_data_summarized, "SubjectInfo.subjID", "corrected_target_looking", "SubjectInfo.testAge")

Subject-level performance

subj_performances <- ggplot(target_looking_subject_level,aes(reorder(SubjectInfo.subjID, mean_value),mean_value))+
  geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0)+
  (geom_point(aes(age_size=SubjectInfo.testAge/30)) |> rename_geom_aes(new_aes = c("size" = "age_size")))+
  (geom_jitter(data=target_looking_item_subject_level |> mutate(Trials.targetImage = reorder(Trials.targetImage, AoA_Est_target)), aes(x=SubjectInfo.subjID, y=mean_value, color=Trials.targetImage, aoa_size = AoA_Est_target), alpha=0.3, width=0.2) |> rename_geom_aes(new_aes = c("size" = "aoa_size"))) +
  xlab("Participant ID")+
  ylab("Proportion of time looking at the target over the distractor")+
  ggtitle("Mean proportion of target looking across subjects") +
  theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=10,angle=0,vjust=0.5),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold")
        ) +
  scale_y_continuous(breaks = seq(-1, 1, by = 0.2)) +
  scale_size_c(aesthetics = "age_size",name = "Age of participant in months", range=c(2,4), guide = guide_legend(order = 2)) +
  scale_size_c( aesthetics = "aoa_size",name = "Est. AoA of target words in years", guide = guide_legend(order = 1)) +
  scale_color_viridis_d(name = "Target words",option="D") +
  guides(
    size = guide_legend(position = "bottom", order = 2),
    color = guide_legend(position = "right", order = 1)
  )
Warning in geom_point(aes(age_size = SubjectInfo.testAge/30)): Ignoring unknown
aesthetics: age_size
Warning in geom_jitter(data = mutate(target_looking_item_subject_level, :
Ignoring unknown aesthetics: aoa_size
Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
3.5.0.
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.
subj_performances

ggsave(here("data","figures","subj_performances.png"),subj_performances,width=9,height=7,bg = "white")

Item-level performance

item_performances <- ggplot(target_looking_item_level, aes(reorder(Trials.targetImage, mean_value), mean_value)) +
   geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.2)+
  (geom_point(aes(aoa_size=AoA_Est_target)) |> rename_geom_aes(new_aes = c("size" = "aoa_size")))+
  (geom_jitter(data=target_looking_item_subject_level, aes(x=Trials.targetImage, y=mean_value, color=SubjectInfo.subjID, age_size = SubjectInfo.testAge/30), alpha=0.3, width=0.2) |> rename_geom_aes(new_aes = c("size" = "age_size"))) +
  xlab("Target image")+
  ylab("Proportion of target looking")+
  ggtitle("Mean proportion of target looking across target items") +
  theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=10,angle=0,vjust=0.5),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold"),
        plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
        ) +
  scale_y_continuous(breaks = seq(-1, 1, by = 0.2)) +
  scale_size_c(aesthetics = "age_size",name = "Age of participant in months", range=c(2,4), guide = guide_legend(order = 2)) +
  scale_size_c( aesthetics = "aoa_size",name = "Est. AoA of target words in years", guide = guide_legend(order = 1)) +
  scale_color_viridis_d(name = "Participant IDs",option="D") +
  guides(
    color = "none"
  ) 
Warning in geom_point(aes(aoa_size = AoA_Est_target)): Ignoring unknown
aesthetics: aoa_size
Warning in geom_jitter(data = target_looking_item_subject_level, aes(x =
Trials.targetImage, : Ignoring unknown aesthetics: age_size
item_performances

ggsave(here("data","figures","item_performances.png"),item_performances,width=15,height=10,bg = "white")

trial_performances <- ggplot(summarized_data(looking_data_summarized, "Trials.trialID", "corrected_target_looking", c("Trials.trialID", "AoA_Est_target")), aes(reorder(Trials.trialID, mean_value), mean_value)) +
   geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.2)+
  (geom_point(aes(aoa_size=AoA_Est_target)) |> rename_geom_aes(new_aes = c("size" = "aoa_size")))+
  (geom_jitter(data=looking_data_summarized,aes(x=Trials.trialID, y=corrected_target_looking, color=SubjectInfo.subjID, age_size = SubjectInfo.testAge/30), alpha=0.3, width=0.2) |> rename_geom_aes(new_aes = c("size" = "age_size"))) +
  xlab("Trial type")+
  ylab("Proportion of target looking")+
  ggtitle("Mean proportion of target looking across trial types") +
  scale_size_c(aesthetics = "age_size",name = "Age of participant in months", range=c(2,4), guide = guide_legend(order = 2)) +
  scale_size_c( aesthetics = "aoa_size",name = "Est. AoA of target words in years", guide = guide_legend(order = 1)) +
  scale_color_viridis_d(name = "Participant IDs",option="D") +
  coord_cartesian(ylim = c(-1, 1)) +
    theme(axis.title.x = element_text(face="bold", size=15, vjust=-1),
        axis.text.x  = element_text(size=8,angle=45,hjust=1),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=15),
        strip.text.x = element_text(size = 10,face="bold"),
        aspect.ratio = 1,
        plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
        ) +
  guides(
    color = "none"
  ) 
Warning in geom_point(aes(aoa_size = AoA_Est_target)): Ignoring unknown
aesthetics: aoa_size
Warning in geom_jitter(data = looking_data_summarized, aes(x = Trials.trialID,
: Ignoring unknown aesthetics: age_size
trial_performances

ggsave(here("data","figures","trial_performances.png"),trial_performances,width=15,height=10,bg = "white")

Condition-based performance by subject

looking_data_by_condition <- summarized_data(looking_data_summarized, "Trials.trialType", "corrected_target_looking", "SubjectInfo.subjID")

looking_by_part_condition <- ggplot(looking_data_by_condition, aes(x = Trials.trialType, y = mean_value)) +
  geom_bar(stat = "identity", aes(fill = ifelse(!xor(lower_ci > 0, upper_ci > 0), "Different from chance", "Chance"))) +
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0, alpha=0.3) +
  geom_jitter(data = looking_data_summarized, aes(x=Trials.trialType, y=corrected_target_looking, aoa_size = AoA_Est_target), width=0.1, alpha=0.3, size=2) +
  facet_wrap(~ SubjectInfo.subjID, ncol = 5) +  # One panel per subject
  labs(x = "Trial type", y = "Prop. of target looking over the distractor", title = "Proportion of target looking by participant and condition type", fill = "Directional looking pattern") +
  theme_minimal() +  # Apply minimal theme
  theme(axis.text.x = element_text(), strip.text = element_text(size = 8)) +
  scale_fill_brewer(palette = "Set3")
Warning in geom_jitter(data = looking_data_summarized, aes(x =
Trials.trialType, : Ignoring unknown aesthetics: aoa_size
looking_by_part_condition

ggsave(here("data","figures","looking_by_participant_condition.png"),looking_by_part_condition,width=15,height=5,bg = "white")

Easy vs hard trial plots

Proportion of target looking for easy vs hard trials

avg_corrected_target_looking_by_condition <- looking_data_summarized  %>%
  group_by(SubjectInfo.subjID, Trials.trialType) %>%
  summarize(N=n(),
            #mean_age = mean(age),
            #mean_age_mo = mean(age_mo),
            average_corrected_target_looking=mean(corrected_target_looking,na.rm=TRUE),
            se=sd(corrected_target_looking,na.rm=T)/sqrt(N),
            ci=qt(0.975, N-1)*sd(corrected_target_looking,na.rm=T)/sqrt(N),
            lower_ci=average_corrected_target_looking-ci,
            upper_ci=average_corrected_target_looking+ci,
            lower_se=average_corrected_target_looking-se,
            upper_se=average_corrected_target_looking+se,
            average_critical_window_looking=mean(mean_target_looking_critical_window,na.rm=TRUE),
            critical_window_ci = qt(0.975, N-1)*sd(mean_target_looking_critical_window,na.rm=T)/sqrt(N),
            critical_window_lower_ci=average_critical_window_looking-critical_window_ci,
            critical_window_upper_ci=average_critical_window_looking+critical_window_ci)
`summarise()` has grouped output by 'SubjectInfo.subjID'. You can override
using the `.groups` argument.
#baseline-corrected target looking summarized overall
overall_corrected_target_looking_by_condition <- avg_corrected_target_looking_by_condition %>%
  group_by(Trials.trialType) %>%
  summarize(N=n(),
            corrected_target_looking=mean(average_corrected_target_looking,na.rm=TRUE),
            ci=qt(0.975, N-1)*sd(average_corrected_target_looking,na.rm=T)/sqrt(N),
            lower_ci=corrected_target_looking-ci,
            upper_ci=corrected_target_looking+ci)

overall_corrected_target_looking_by_condition %>%
  knitr::kable()
Trials.trialType  N  corrected_target_looking         ci    lower_ci   upper_ci
easy              4                -0.0536085  0.3000025  -0.3536110  0.2463940
easy-distractor   4                 0.1586855  0.3223870  -0.1637015  0.4810725
hard              4                 0.0856846  0.1124564  -0.0267717  0.1981410
hard-distractor   4                 0.0503629  0.3174820  -0.2671192  0.3678449
set.seed(2)
avg_corrected_target_looking_by_condition <- avg_corrected_target_looking_by_condition |> filter(Trials.trialType == "easy" | Trials.trialType == "hard")
overall_corrected_target_looking_by_condition <- overall_corrected_target_looking_by_condition |>
  filter(Trials.trialType == "easy" | Trials.trialType == "hard")
jitterer <- position_jitter(width = .05,seed=1)

overall_condition_plot <- ggplot(avg_corrected_target_looking_by_condition, aes(x=Trials.trialType,y=average_corrected_target_looking, fill=Trials.trialType))+
  geom_half_violin(data=filter(avg_corrected_target_looking_by_condition, Trials.trialType=="easy"),position = position_nudge(x = -.1, y = 0), width=1,trim = FALSE, alpha = .8,color=NA,side="l")+
  geom_half_violin(data=filter(avg_corrected_target_looking_by_condition, Trials.trialType=="hard"),position = position_nudge(x = .1, y = 0), width=1,trim = FALSE, alpha = .8,color=NA,side="r")+
  geom_path(aes(group=SubjectInfo.subjID),color="black",alpha=0.15,size=0.75,position=jitterer)+
  geom_point(aes(color=Trials.trialType,group=SubjectInfo.subjID), size = 2.5, alpha=0.15,position=jitterer)+
  geom_point(data=overall_corrected_target_looking_by_condition,aes(y=corrected_target_looking),color="black",size=5)+
  geom_line(data=overall_corrected_target_looking_by_condition,aes(y=corrected_target_looking,group=1),color="black",size=3)+
  geom_errorbar(data=overall_corrected_target_looking_by_condition,aes(y=corrected_target_looking,ymin=lower_ci,ymax=upper_ci),width=0,size=1.2,color="black")+
  geom_hline(yintercept=0,linetype="dashed")+
  theme(legend.position="none")+
  xlab("Distractor Difficulty Condition")+
  ylab("Baseline-Corrected\nProportion Target Looking")+
  theme(axis.title.x = element_text(face="bold", size=20),
        axis.text.x  = element_text(size=14),
        axis.title.y = element_text(face="bold", size=20),
        axis.text.y  = element_text(size=16),
        strip.text.x = element_text(size = 16,face="bold"))
overall_condition_plot

ggsave(here("data","figures","condition_based_looking.png"),overall_condition_plot,width=9,height=6,bg = "white")

Target image difficulty

# summarize average accuracy within participant (by word alone)
condition_based_looking <- looking_data_summarized  |>
  filter(!grepl("distractor", Trials.trialType)) |>
  distinct(SubjectInfo.subjID, Trials.trialID, Trials.targetImage, corrected_target_looking)

target_looking_by_target_word <- looking_data_summarized  |>
  filter(!grepl("distractor", Trials.trialType)) |>
  group_by(SubjectInfo.subjID, Trials.targetImage) |>
  summarize(target_looking_diff = corrected_target_looking[match("easy", Trials.trialType)] - corrected_target_looking[match("hard", Trials.trialType)],
            baseline_window_looking_diff = mean_target_looking_baseline_window[match("easy", Trials.trialType)] - mean_target_looking_baseline_window[match("hard", Trials.trialType)], 
            critical_window_looking_diff = mean_target_looking_critical_window[match("easy", Trials.trialType)] - mean_target_looking_critical_window[match("hard", Trials.trialType)])
`summarise()` has grouped output by 'SubjectInfo.subjID'. You can override
using the `.groups` argument.
#clean names for individual images for plot
overall_target_looking_by_word <- target_looking_by_target_word %>%
  filter(!is.na(target_looking_diff)) %>%
  group_by(Trials.targetImage) %>%
  summarize(N=n(),
            corrected_target_looking=mean(target_looking_diff,na.rm=TRUE),
            ci=qt(0.975, N-1)*sd(target_looking_diff,na.rm=T)/sqrt(N),
            lower_ci=corrected_target_looking-ci,
            upper_ci=corrected_target_looking+ci
          )
Warning: There were 2 warnings in `summarize()`.
The first warning was:
ℹ In argument: `ci = qt(0.975, N - 1) * sd(target_looking_diff, na.rm =
  T)/sqrt(N)`.
ℹ In group 1: `Trials.targetImage = "acorn"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
word_prefs <- ggplot(overall_target_looking_by_word,aes(reorder(Trials.targetImage,corrected_target_looking),corrected_target_looking))+
  geom_hline(yintercept=0,linetype="dashed")+
  geom_errorbar(aes(ymin=lower_ci,ymax=upper_ci),width=0)+
  geom_point(aes(size=N))+
  xlab("Target Word")+
  ylab("Easy trial prefered target looking")+
  theme(axis.title.x = element_text(face="bold", size=15),
        axis.text.x  = element_text(size=10,angle=90,vjust=0.5,hjust=1),
        axis.title.y = element_text(face="bold", size=15),
        axis.text.y  = element_text(size=10),
        strip.text.x = element_text(size = 10,face="bold")
        ) +
  scale_y_continuous(breaks = seq(-1, 1, by = 0.2)) +
  scale_size_continuous(name = "Number of participants")
  
word_prefs

ggsave(here("data","figures","easy_trial_pref_by_word.png"),word_prefs,width=9,height=6,bg = "white")

Baseline image-pair preferences

saliency_effects <- looking_data_summarized |>
  # Calculate the proportion of time looking at the original target image, even on trials where that image serves as the distractor
 mutate(original_target_looking_baseline_window = ifelse(grepl("distractor", Trials.trialType), 1 - mean_target_looking_baseline_window,   mean_target_looking_baseline_window),
        original_target_looking_critical_window = ifelse(grepl("distractor", Trials.trialType), 1 - mean_target_looking_critical_window, mean_target_looking_critical_window)) |>
 summarize(mean_baseline_looking = mean(original_target_looking_baseline_window),
           mean_critical_looking = mean(original_target_looking_critical_window),
           .by = Trials.imagePair)
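
A made-up illustration of the flip above (the numbers are not study data): on a "hard-distractor" trial, baseline looking of 0.3 to the named target means 0.7 to the image that is the target in the non-distractor version of the pair, so both trial directions feed the same image-pair preference.

# Toy check of the flip: grepl() picks out the distractor-swapped trial types
ifelse(grepl("distractor", "hard-distractor"), 1 - 0.3, 0.3)  # returns 0.7
ifelse(grepl("distractor", "hard"), 1 - 0.3, 0.3)             # returns 0.3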

CLIP similarity plots

Comparing proportion of looking time to CLIP cosine similarities

proportions <- looking_data_summarized |>
  filter(!is.na(corrected_target_looking)) |>
  summarize(N=n(), prop_target_looking = mean(corrected_target_looking+0.5),
            prop_distractor_looking = mean(-corrected_target_looking+0.5),
            .by = c(Trials.trialID, Trials.targetImage, Trials.distractorImage)) |>
  filter(N > 2)

ggplot(summarized_data(looking_data_summarized, "Trials.trialID", "corrected_target_looking", c("Trials.trialID", "Trials.targetImage", "Trials.distractorImage", "text_similarity", "cor")), aes(x=cor, y = mean_value)) +
  geom_point() +
  geom_smooth(method="lm") +
    geom_label_repel(aes(label=paste(Trials.targetImage, "-", Trials.distractorImage))) +
  ylab("Probability of looking at target") +
  xlab("CLIP text similarity") +
  ggpubr::stat_cor(method = "pearson")
`geom_smooth()` using formula = 'y ~ x'

Examining linear correlation between similarity types

cor(data.frame(trial_metadata$multimodal_similarity, trial_metadata$image_similarity, trial_metadata$text_similarity, trial_metadata$cor))
                                     trial_metadata.multimodal_similarity
trial_metadata.multimodal_similarity                            1.0000000
trial_metadata.image_similarity                                 0.9421492
trial_metadata.text_similarity                                  0.9069870
trial_metadata.cor                                              0.7947394
                                     trial_metadata.image_similarity
trial_metadata.multimodal_similarity                       0.9421492
trial_metadata.image_similarity                            1.0000000
trial_metadata.text_similarity                             0.7721763
trial_metadata.cor                                         0.6225654
                                     trial_metadata.text_similarity
trial_metadata.multimodal_similarity                      0.9069870
trial_metadata.image_similarity                           0.7721763
trial_metadata.text_similarity                            1.0000000
trial_metadata.cor                                        0.8804124
                                     trial_metadata.cor
trial_metadata.multimodal_similarity          0.7947394
trial_metadata.image_similarity               0.6225654
trial_metadata.text_similarity                0.8804124
trial_metadata.cor                            1.0000000
cor.test(trial_metadata$image_similarity, trial_metadata$text_similarity)

    Pearson's product-moment correlation

data:  trial_metadata$image_similarity and trial_metadata$text_similarity
t = 6.6562, df = 30, p-value = 2.264e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5795199 0.8830943
sample estimates:
      cor 
0.7721763