Preliminaries and data loading

library(peekbankr)
library(tidyverse)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(tictoc)
library(langcog)
## 
## Attaching package: 'langcog'
## The following object is masked from 'package:base':
## 
##     scale
library(here)
## here() starts at /Users/mcfrank/Projects/peekbank/peekbank-paper
library(assertthat)
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
t_range <- c(-1000,3000)
knitr::opts_chunk$set(cache = TRUE, warn = FALSE, message = FALSE)
load(file = "data/aoi_data_joined.Rds")

Reaction time computation

For kicks we tried out test-driven development.

get_rt <- function (rle_data, SAMPLING_RATE = 40) {
  # end if no data
  if (is.null(rle_data$values) | is.null(rle_data$lengths)) {
    return(tibble(rt = NA, 
                  shift_type = NA))
  }
  
  onset_aoi <- rle_data$values[1] # zero point AOI
  
  # end if missing for start
  if (!(onset_aoi %in% c("target","distractor"))) {
    return(tibble(rt = NA, 
                  shift_type = "other"))
  }

  first_landing <- rle_data$values[rle_data$values != onset_aoi &
                                    rle_data$values %in% c("target","distractor")][1]

  # end if no shift
  if (is.na(first_landing)) {
    return(tibble(rt = NA, 
                  shift_type = "no shift"))
  }
  
  shift_type <- case_when(onset_aoi == "distractor" &
                           first_landing == "target" ~ "D-T",
                         onset_aoi == "target" &
                           first_landing == "distractor" ~ "T-D",
                         TRUE ~ "other")

  first_landing_idx <- which(rle_data$values == first_landing)[1]
  
  values_before_first_landing <- rle_data$lengths[1:(first_landing_idx-1)]

  # rt is the number of samples happening before arrival + 1 
  # (first sample of arrival)
  # times the length of a sample
  rt <- (sum(values_before_first_landing) + 1) * (1000/SAMPLING_RATE)

  return(tibble(rt = rt, 
                shift_type = shift_type))
}


#### tests #### 
# friendly case: RT
friendly <- tibble(lengths = c(10,1,10),
                   values =  c("target","missing","distractor"))
assert_that(get_rt(friendly)$rt == 300)
## [1] TRUE
assert_that(get_rt(friendly)$shift_type == "T-D")
## [1] TRUE
# RT of missing is NA
no_data <- tibble(lengths = NULL,
                  values = NULL)
assert_that(is.na(get_rt(no_data)$rt))
## Warning: Unknown or uninitialised column: `values`.
## Warning: Unknown or uninitialised column: `lengths`.
## [1] TRUE
assert_that(is.na(get_rt(no_data)$shift_type))
## Warning: Unknown or uninitialised column: `values`.

## Warning: Unknown or uninitialised column: `lengths`.
## [1] TRUE
# RT of something with no shift is NA
no_shift <- tibble(lengths = 100,
                   values =  "target")
assert_that(is.na(get_rt(no_shift)$rt))
## [1] TRUE
assert_that(get_rt(no_shift)$shift_type == "no shift")
## [1] TRUE
# RT of something with no landing is NA
no_landing <- tibble(lengths = c(10,10),
                     values =  c("target", "missing"))
assert_that(is.na(get_rt(no_landing)$rt))
## [1] TRUE
assert_that(get_rt(no_landing)$shift_type == "no shift")
## [1] TRUE
# RT where we start on missing is NA, shift_type is "no"
start_on_missing <- tibble(lengths = c(1, 10, 10),
                           values =  c("missing","target","distractor"))
assert_that(is.na(get_rt(start_on_missing)$rt))
## [1] TRUE
assert_that(get_rt(start_on_missing)$shift_type == "other")
## [1] TRUE

Now process the RTs and join back in the relevant data.

rt_data <- aoi_data_joined %>%
  filter(any(t_norm == 0), # must have data at 0
         t_norm >= 0) %>% # only pass data after 0
  group_by(administration_id, trial_id) %>%
  summarise(lengths = rle(aoi)$lengths, 
            values = rle(aoi)$values) 

rts <- rt_data %>%
  group_by(administration_id, trial_id) %>%
  nest() %>%
  mutate(data = lapply(data, get_rt)) %>%
  unnest(cols = c(data)) 

rts <- left_join(rts, 
                 aoi_data_joined %>%
                   select(administration_id, trial_id, 
                          age, dataset_name, 
                          english_stimulus_label, 
                          stimulus_novelty, trial_order) %>%
                   distinct())

Shift statistics. Why are there more D-T than T-D? Looks like that’s true across several datasets (perhaps some with novel words)?

Also looks like lots of “other” in the eye-tracking datasets?

ggplot(rts, aes(x = shift_type)) + 
  geom_histogram(stat = "count") + 
  facet_wrap(~dataset_name)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

corr_fam_rts <- rts %>%
  filter(stimulus_novelty == "familiar",
         shift_type == "D-T", 
         !is.na(rt))
  

 # group_by(dataset_name) %>%
  # filter(rt_value < max(rt_value)-100, !is.na(rt_value)) # note sloppy exclusion of bad RTs (no shift trials)

Descriptives

Descriptive histograms.

ggplot(corr_fam_rts, 
       aes(x = rt)) + 
  geom_histogram() + 
  facet_wrap(~dataset_name, scales = "free_y") 

Logs? This is cool as it suggests that the < 300 ms RTs really are junk.

ggplot(corr_fam_rts, 
       aes(x = rt)) + 
  geom_histogram() + 
  scale_x_log10() + 
  facet_wrap(~dataset_name, scales = "free_y") 

So let’s get rid of them.

rts <- filter(rts, rt > 300)
corr_fam_rts <- filter(corr_fam_rts, rt > 300)

RT by item?

rt_means <- corr_fam_rts %>%
  filter(age > 6, age < 66) %>%
  mutate(age_binned = cut(age, seq(6,66,12))) %>%
  group_by(administration_id, trial_id, english_stimulus_label, age_binned, shift_type) %>%
  summarise(sub_mean_rt = mean(rt)) %>%
  group_by(english_stimulus_label, age_binned, shift_type) %>%
  summarise(mean_rt = mean(sub_mean_rt), 
            sd_rt = sd(sub_mean_rt), 
            n = n(),
            ci = 1.96 * sd_rt / sqrt(n)) %>%
  filter(n > 10) %>%
  mutate(significant = (mean_rt - ci > .5 & mean_rt + ci > .5) |
           (mean_rt - ci < .5 & mean_rt + ci < .5)) %>%
  # group_by(age_binned) %>%
  mutate(sorted_stimulus_label = fct_reorder(as.factor(english_stimulus_label), 
                                             mean_rt, mean))

ggplot(rt_means, 
       aes(y = sorted_stimulus_label, x = mean_rt)) + 
  ggstance::geom_linerangeh(aes(xmin = mean_rt - ci, xmax = mean_rt + ci)) + 
  geom_point(aes(col = age_binned), size = 2) + 
  scale_shape_manual(values = c(1, 20)) + 
  geom_vline(xintercept = .5, lty = 2) + 
  facet_wrap(~shift_type) + 
  theme_mikabr()

Overall RT curve across age.

rt_subs <- corr_fam_rts %>%
  filter(stimulus_novelty == "familiar", rt > 300, rt < 3000) %>%
  group_by(administration_id, trial_id, english_stimulus_label, age) %>%
  summarise(sub_mean_rt = mean(rt)) 


ggplot(rt_subs, 
       aes(x = age, y = sub_mean_rt)) + 
  geom_jitter(alpha = .03, width = .3, height = 0) +
  geom_smooth(method = "lm") + 
  scale_y_log10() +
  xlim(12,60) + 
  theme_mikabr() 
## Warning: Removed 90 rows containing non-finite values (stat_smooth).
## Warning: Removed 125 rows containing missing values (geom_point).

Summary RT curves across age

corr_fam_rts %>%
  filter(age > 6, age < 66) %>%
  mutate(age_binned = cut(age, seq(6,66,12))) %>%
  group_by(age_binned) %>%
  ggplot(aes(x = age_binned, y = rt)) + 
  stat_summary(fun.data = "mean_cl_boot") +
  facet_wrap(~dataset_name)

corr_fam_rts %>%
  filter(age > 6, age < 66) %>%
  mutate(age_binned = cut(age, seq(6,66,12))) %>%
  group_by(age_binned) %>%
  ggplot(aes(x = age_binned, y = rt)) + 
  stat_summary(fun.data = "mean_cl_boot") 

Overall RT curve across trial order

rt_subs <- corr_fam_rts %>%
  group_by(administration_id, trial_id, english_stimulus_label, dataset_name, 
           trial_order) %>%
  summarise(sub_mean_rt = mean(rt)) 


ggplot(rt_subs, 
       aes(x = trial_order, y = sub_mean_rt)) + 
  geom_point(alpha = .1) +
  geom_smooth(method = "lm") + 
  scale_y_log10() + 
  xlim(12,60) + 
  theme_mikabr() 
## Warning: Removed 4034 rows containing non-finite values (stat_smooth).
## Warning: Removed 4034 rows containing missing values (geom_point).

rt_subs <- corr_fam_rts %>%
  group_by(administration_id, trial_id, 
           english_stimulus_label, age, dataset_name) %>%
  summarise(sub_mean_rt = mean(rt)) %>%
  group_by(english_stimulus_label) %>%
  filter(n() > 100) 

ggplot(rt_subs, 
       aes(x = age, y = sub_mean_rt, col = dataset_name)) + 
  geom_point(alpha = .1) +
  geom_smooth(method = "lm", formula = y ~ x) + 
  facet_wrap(~english_stimulus_label) + 
  ylim(0,4000) +
  theme_mikabr()
## Warning: Removed 26 rows containing non-finite values (stat_smooth).
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

Models

mod_data <- corr_fam_rts %>%
  filter(!is.na(age), age < 66)
mod_data$log_rt <- log(mod_data$rt)
mod_data$age_centered <- scale(mod_data$age, scale = FALSE)
mod_data$age_binned <- as.numeric(as.character(cut(mod_data$age, 
                                      breaks = seq(6,66,12), 
                                      labels = seq(12, 60, 12))))

FWIW, no obvious big contributions for polynomials degree > 2. Adding age slopes decreases convergence and doesn’t seem to capture that much variance?

mod <- lmer(log_rt ~ poly(age_centered, 2) + 
              (1 | dataset_name) + 
              (1 | english_stimulus_label) +
              (1 | administration_id),
  data = mod_data)

summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## log_rt ~ poly(age_centered, 2) + (1 | dataset_name) + (1 | english_stimulus_label) +  
##     (1 | administration_id)
##    Data: mod_data
## 
## REML criterion at convergence: 9607.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2247 -0.5953 -0.0482  0.5492  3.5278 
## 
## Random effects:
##  Groups                 Name        Variance Std.Dev.
##  administration_id      (Intercept) 0.03502  0.1871  
##  english_stimulus_label (Intercept) 0.01494  0.1222  
##  dataset_name           (Intercept) 0.02099  0.1449  
##  Residual                           0.16868  0.4107  
## Number of obs: 8062, groups:  
## administration_id, 1223; english_stimulus_label, 155; dataset_name, 11
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)               6.75784    0.04708   10.86358 143.532  < 2e-16 ***
## poly(age_centered, 2)1  -13.41498    1.22461 1258.48701 -10.954  < 2e-16 ***
## poly(age_centered, 2)2    2.54411    0.70376 1728.58402   3.615 0.000309 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) p(_,2)1
## ply(g_c,2)1 -0.058        
## ply(g_c,2)2  0.052 -0.491
mod_data$fitted <- fitted(mod)

ggplot(mod_data, 
       aes(x = age_binned, y = log_rt)) + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  stat_summary(aes(y = fitted), fun = mean, geom = "line") +
  xlab("Age (months)") +
  ylab("Reaction time (ms)") 

Just for kicks, let’s look at the linear version.

mod_linear <- lmer(rt ~ poly(age_centered, 2) + 
              (1 | dataset_name) + 
              (1 | english_stimulus_label) +
              (1 | administration_id),
  data = mod_data)

summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## log_rt ~ poly(age_centered, 2) + (1 | dataset_name) + (1 | english_stimulus_label) +  
##     (1 | administration_id)
##    Data: mod_data
## 
## REML criterion at convergence: 9607.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2247 -0.5953 -0.0482  0.5492  3.5278 
## 
## Random effects:
##  Groups                 Name        Variance Std.Dev.
##  administration_id      (Intercept) 0.03502  0.1871  
##  english_stimulus_label (Intercept) 0.01494  0.1222  
##  dataset_name           (Intercept) 0.02099  0.1449  
##  Residual                           0.16868  0.4107  
## Number of obs: 8062, groups:  
## administration_id, 1223; english_stimulus_label, 155; dataset_name, 11
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)               6.75784    0.04708   10.86358 143.532  < 2e-16 ***
## poly(age_centered, 2)1  -13.41498    1.22461 1258.48701 -10.954  < 2e-16 ***
## poly(age_centered, 2)2    2.54411    0.70376 1728.58402   3.615 0.000309 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) p(_,2)1
## ply(g_c,2)1 -0.058        
## ply(g_c,2)2  0.052 -0.491
mod_data$fitted_linear <- fitted(mod_linear)

ggplot(mod_data, 
       aes(x = age_binned, y = rt)) + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  stat_summary(aes(y = fitted_linear), fun = mean, geom = "line") +
  xlab("Age (months)") +
  ylab("Reaction time (ms)") 

Previously, we had this issue that the random effects are very substantial so the fitted pattern looks kind of odd. Let’s plot the model predictions…

uncentering_factor <- mean(mod_data$age)
newdata <- tibble(age_centered = c(12, 24, 36, 48, 60) - uncentering_factor)

newdata$pred <- predict(mod, newdata = newdata, re.form = NA)

ggplot(newdata, 
       aes(x = age_centered + uncentering_factor, y = pred)) + 
    stat_summary(data = mod_data, 
                 aes(x = age_binned, y = log_rt), 
                 fun.data = mean_se, geom = "pointrange") +
  geom_line(lty = 2)

Again, show in linear space:

newdata$pred_linear <- predict(mod_linear, newdata = newdata, re.form = NA)

ggplot(newdata, 
       aes(x = age_centered + uncentering_factor, y = pred_linear)) + 
    stat_summary(data = mod_data, 
                 aes(x = age_binned, y = rt), 
                 fun.data = mean_se, geom = "pointrange") +
  geom_line(lty = 2)

Seems like the data seemingly under-estimate developmental change in RTs because of item selection and dataset/method differences…