library(peekbankr)
library(tidyverse)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(tictoc)
library(langcog)
##
## Attaching package: 'langcog'
## The following object is masked from 'package:base':
##
## scale
library(here)
## here() starts at /Users/mcfrank/Projects/peekbank/peekbank-paper
t_range <- c(-1000,3000)
knitr::opts_chunk$set(cache = TRUE, warn = FALSE, message = FALSE)
get data
con <- connect_to_peekbank(dbname = "peekbank_dev")
datasets <- get_datasets(connection = con) %>% collect()
administrations <- get_administrations(connection = con) %>% collect()
tic()
aoi_timepoints <- get_aoi_timepoints(connection = con) %>% collect()
toc()
## 32.411 sec elapsed
stimuli <- get_stimuli(connection = con) %>% collect()
trial_types <- get_trial_types(connection = con) %>% collect()
trials <- get_trials(connection = con) %>% collect()
aoi_data_joined <- aoi_timepoints %>%
right_join(administrations) %>%
right_join(trials) %>%
right_join(trial_types) %>%
right_join(datasets) %>%
mutate(stimulus_id = target_id) %>%
right_join(stimuli) %>%
filter(t_norm > t_range[1],
t_norm < t_range[2])
# save(file = "data/aoi_data_joined.Rds", aoi_data_joined)
load cached data
# load(file = "data/aoi_data_joined.Rds")
subinfo <- aoi_data_joined %>%
group_by(subject_id, dataset_id, lab_dataset_id, age) %>%
summarise(trials = length(unique(trial_id)))
subinfo %>%
ggplot(aes(x = age, fill = lab_dataset_id)) +
geom_histogram(binwidth = 1) +
theme_mikabr() +
scale_fill_solarized(name = "Dataset") +
xlab("Age (months)") +
scale_x_continuous(breaks = c(12,24,36,48,60))
## Warning: Removed 14 rows containing non-finite values (stat_bin).
Time series
means <- aoi_data_joined %>%
filter(age > 12, age <= 60) %>%
mutate(age_binned = cut(age, seq(0,60,12))) %>%
group_by(t_norm, dataset_name, age_binned, stimulus_novelty) %>%
summarise(n = sum(aoi %in% c("target","distractor"), na.rm = TRUE),
p = sum(aoi == "target", na.rm = TRUE),
prop_looking = p / n,
ci_lower = binom::binom.confint(p, n, method = "bayes")$lower,
ci_upper = binom::binom.confint(p, n, method = "bayes")$upper)
ggplot(means,
aes(x = t_norm, y = prop_looking)) +
geom_line(aes(col = dataset_name)) +
geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper,
fill = dataset_name), alpha = .5) +
facet_grid(age_binned~stimulus_novelty) +
geom_hline(yintercept = .5, lty = 2) +
geom_vline(xintercept = 0, lty = 2) +
ylab("Proportion Target Looking") +
xlab("Time (msec)") +
theme_classic() +
scale_color_solarized() +
scale_fill_solarized()
Familiar trials.
window_min <- 300
window_max <- 2300
familiar_means <- aoi_data_joined %>%
filter(age > 12, age <= 24,
stimulus_novelty == "familiar") %>%
filter(t_norm > window_min, t_norm < window_max) %>%
group_by(administration_id, trial_id, english_stimulus_label) %>%
summarise(n = sum(aoi %in% c("target","distractor"), na.rm = TRUE),
p = sum(aoi == "target", na.rm = TRUE),
prop_looking = p / n) %>%
group_by(english_stimulus_label) %>%
summarise(mean_prop = mean(prop_looking, na.rm=TRUE),
sd_prop = sd(prop_looking, na.rm=TRUE),
n = n(),
ci = 1.96 * sd_prop / sqrt(n)) %>%
mutate(significant = (mean_prop - ci > .5 & mean_prop + ci > .5) |
(mean_prop - ci < .5 & mean_prop + ci < .5)) %>%
ungroup %>%
mutate(sorted_stimulus_label = fct_reorder(as.factor(english_stimulus_label),
mean_prop, mean))
ggplot(familiar_means,
aes(y = sorted_stimulus_label, x = mean_prop)) +
ggstance::geom_linerangeh(aes(xmin = mean_prop - ci, xmax = mean_prop + ci)) +
geom_point(aes(shape = significant), size = 2) +
scale_shape_manual(values = c(1, 20)) +
geom_vline(xintercept = .5, lty = 2) +
theme_mikabr()
SAMPLING_RATE <- 40
rts <- aoi_data_joined %>%
# filter(!(dataset_name %in% "adams_marchman_2018")) %>%
group_by(administration_id, trial_id, age, english_stimulus_label,
dataset_name, stimulus_novelty, trial_order) %>%
filter(!all(aoi == "missing"), !all(t_norm < 0)) %>%
mutate(crit_onset_aoi = aoi[t_norm == 0],
fst_shift_land_aoi = rle(aoi[t_norm >= 0])$values[3],
shift_type = case_when(
crit_onset_aoi == "distractor" & fst_shift_land_aoi == "target" ~ "D-T",
crit_onset_aoi == "distractor" & fst_shift_land_aoi == "other" ~ "D-O",
crit_onset_aoi == "target" & fst_shift_land_aoi == "distractor" ~ "T-D",
crit_onset_aoi == "target" & fst_shift_land_aoi == "other" ~ "T-O",
TRUE ~ "other shift")) %>%
summarise(rt_value = rle(aoi[t_norm >= 0])$lengths[1] * SAMPLING_RATE,
fst_shift_gap_ms = rle(aoi[t_norm >= 0])$lengths[2] * SAMPLING_RATE) %>%
group_by(dataset_name) %>%
filter(rt_value < max(rt_value)-100, !is.na(rt_value)) # note sloppy exclusion of bad RTs (no shift trials)
Descriptive histograms.
ggplot(rts,
aes(x = rt_value)) +
geom_histogram() +
facet_wrap(~dataset_name, scales = "free_y")
Logs? This is cool as it suggests that the < 300 ms RTs really are junk.
ggplot(rts,
aes(x = rt_value)) +
geom_histogram() +
scale_x_log10() +
facet_wrap(~dataset_name, scales = "free_y")
RT by item?
rt_means <- rts %>%
filter(age > 6, age < 66) %>%
mutate(age_binned = cut(age, seq(6,66,12))) %>%
group_by(administration_id, trial_id, english_stimulus_label, age_binned) %>%
summarise(sub_mean_rt = mean(rt_value)) %>%
group_by(english_stimulus_label, age_binned) %>%
summarise(mean_rt = mean(sub_mean_rt),
sd_rt = sd(sub_mean_rt),
n = n(),
ci = 1.96 * sd_rt / sqrt(n)) %>%
filter(n > 10) %>%
mutate(significant = (mean_rt - ci > .5 & mean_rt + ci > .5) |
(mean_rt - ci < .5 & mean_rt + ci < .5)) %>%
# group_by(age_binned) %>%
mutate(sorted_stimulus_label = fct_reorder(as.factor(english_stimulus_label),
mean_rt, mean))
ggplot(rt_means,
aes(y = sorted_stimulus_label, x = mean_rt)) +
ggstance::geom_linerangeh(aes(xmin = mean_rt - ci, xmax = mean_rt + ci)) +
geom_point(aes(col = age_binned), size = 2) +
scale_shape_manual(values = c(1, 20)) +
geom_vline(xintercept = .5, lty = 2) +
theme_mikabr()
Overall RT curve across age.
rt_subs <- rts %>%
filter(stimulus_novelty == "familiar", rt_value > 300, rt_value < 3000) %>%
group_by(administration_id, trial_id, english_stimulus_label, age) %>%
summarise(sub_mean_rt = mean(rt_value))
ggplot(rt_subs,
aes(x = age, y = sub_mean_rt)) +
geom_jitter(alpha = .03, width = .3, height = 0) +
geom_smooth() +
scale_y_log10() +
xlim(12,60) +
theme_mikabr()
## Warning: Removed 228 rows containing non-finite values (stat_smooth).
## Warning: Removed 297 rows containing missing values (geom_point).
Overall RT curve across trial order
rt_subs <- rts %>%
filter(stimulus_novelty == "familiar", rt_value > 300, rt_value < 3000) %>%
group_by(administration_id, trial_id, english_stimulus_label, dataset_name,
trial_order) %>%
summarise(sub_mean_rt = mean(rt_value))
ggplot(rt_subs,
aes(x = trial_order, y = sub_mean_rt)) +
geom_point(alpha = .1) +
geom_smooth(method = "lm") +
scale_y_log10() +
xlim(12,60) +
theme_mikabr()
## Warning: Removed 5354 rows containing non-finite values (stat_smooth).
## Warning: Removed 5354 rows containing missing values (geom_point).
rt_subs <- rts %>%
filter(stimulus_novelty == "familiar", rt_value > 300, rt_value < 3000) %>%
group_by(administration_id, trial_id, english_stimulus_label, age, dataset_name) %>%
summarise(sub_mean_rt = mean(rt_value)) %>%
group_by(english_stimulus_label) %>%
filter(n() > 100)
ggplot(rt_subs,
aes(x = age, y = sub_mean_rt, col = dataset_name)) +
geom_point(alpha = .1) +
geom_smooth(method = "lm", formula = y ~ x) +
facet_wrap(~english_stimulus_label) +
ylim(0,4000) +
theme_mikabr()
## Warning: Removed 79 rows containing non-finite values (stat_smooth).
## Warning: Removed 79 rows containing missing values (geom_point).
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
Create dataframe.
rt_model_data <- rts %>%
filter(rt_value > 300) %>%
mutate(log_rt = log(rt_value),
age_centered = scale(age, scale = FALSE))
Sanity check the lm model with no random effects.
summary(lm(log_rt ~ age_centered * stimulus_novelty + age_centered * trial_order,
data = rt_model_data))
##
## Call:
## lm(formula = log_rt ~ age_centered * stimulus_novelty + age_centered *
## trial_order, data = rt_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49238 -0.49623 -0.00881 0.51165 1.55089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.081e+00 1.043e-02 679.052 < 2e-16 ***
## age_centered -4.426e-03 1.178e-03 -3.756 0.000173 ***
## stimulus_noveltynovel -5.428e-02 1.218e-02 -4.457 8.36e-06 ***
## trial_order 2.868e-03 5.871e-04 4.886 1.04e-06 ***
## age_centered:stimulus_noveltynovel 3.487e-03 1.078e-03 3.236 0.001215 **
## age_centered:trial_order -1.377e-04 9.535e-05 -1.444 0.148823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6812 on 17406 degrees of freedom
## (184 observations deleted due to missingness)
## Multiple R-squared: 0.006905, Adjusted R-squared: 0.00662
## F-statistic: 24.21 on 5 and 17406 DF, p-value: < 2.2e-16
Go for it.
rt_mod <- lmer(log_rt ~ age_centered * stimulus_novelty +
age_centered * trial_order +
(1 | english_stimulus_label) + (1 | dataset_name),
data = rt_model_data)
summary(rt_mod)$coef %>%
knitr::kable(digits = 3)
Estimate | Std. Error | df | t value | Pr(>|t|) | |
---|---|---|---|---|---|
(Intercept) | 7.097 | 0.041 | 7.819 | 174.277 | 0.000 |
age_centered | -0.005 | 0.001 | 17369.014 | -3.946 | 0.000 |
stimulus_noveltynovel | 0.055 | 0.022 | 12.058 | 2.476 | 0.029 |
trial_order | 0.001 | 0.001 | 16115.894 | 1.676 | 0.094 |
age_centered:stimulus_noveltynovel | 0.004 | 0.001 | 17363.013 | 3.304 | 0.001 |
age_centered:trial_order | 0.000 | 0.000 | 17374.935 | -1.240 | 0.215 |
Start by summarizing.
t_min <- 300
t_max <- 2300
by_trial_means <- aoi_data_joined %>%
filter(age > 12, age <= 60, stimulus_novelty == "familiar",
t_norm > t_min, t_norm < t_max) %>%
mutate(age_binned = cut(age, seq(12,60,12))) %>%
rename(target_label = english_stimulus_label) %>%
group_by(administration_id, trial_id, target_label, distractor_id,
age, age_binned) %>%
summarise(prop_looking = sum(aoi == "target", na.rm = TRUE) /
(sum(aoi == "target", na.rm=TRUE) +
sum(aoi=="distractor", na.rm=TRUE)),
prop_missing = mean(aoi == "missing", na.rm = TRUE)) %>%
left_join(stimuli, by = c("distractor_id" = "stimulus_id")) %>%
rename(distractor_label = english_stimulus_label)
Missingness.
hist(by_trial_means$prop_missing)
Preliminary item analysis.
item_means <- by_trial_means %>%
group_by(target_label, age_binned) %>%
summarise(mean_prop = mean(prop_looking),
sd_prop = sd(prop_looking),
ci = 1.96 * sd_prop / sqrt(n()))
ggplot(item_means, aes(x = mean_prop, y = target_label, col = age_binned)) +
ggstance::geom_pointrangeh(aes(xmin = mean_prop - ci, xmax = mean_prop + ci)) +
geom_vline(xintercept = .5, lty = 2) +
xlim(0,1) +
theme_mikabr()
## Warning: Removed 99 rows containing missing values (geom_pointrangeh).
Make some models!
acc_mod_data <- by_trial_means %>%
filter(prop_missing < .3)
acc_mod_data$age_centered <- acc_mod_data$age - mean(acc_mod_data$age)
m1 <- lmer(prop_looking ~ age_centered
+ (1| administration_id) + (1 | target_label),
data = acc_mod_data)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: prop_looking ~ age_centered + (1 | administration_id) + (1 |
## target_label)
## Data: acc_mod_data
##
## REML criterion at convergence: 5864.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7883 -0.6100 0.1299 0.7793 2.3271
##
## Random effects:
## Groups Name Variance Std.Dev.
## administration_id (Intercept) 0.005283 0.07269
## target_label (Intercept) 0.005629 0.07503
## Residual 0.082712 0.28760
## Number of obs: 14792, groups: administration_id, 1097; target_label, 115
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.173e-01 1.060e-02 8.065e+01 58.22 <2e-16 ***
## age_centered 4.336e-03 3.452e-04 2.943e+03 12.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.056
add age slopes!
m2 <- lmer(prop_looking ~ age_centered
+ (1 | administration_id) + (age_centered || target_label),
data = acc_mod_data)
summary(m2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## prop_looking ~ age_centered + (1 | administration_id) + (age_centered ||
## target_label)
## Data: acc_mod_data
##
## REML criterion at convergence: 5835.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7976 -0.6119 0.1265 0.7795 2.3444
##
## Random effects:
## Groups Name Variance Std.Dev.
## administration_id (Intercept) 5.275e-03 0.072627
## target_label (Intercept) 5.770e-03 0.075957
## target_label.1 age_centered 1.531e-05 0.003913
## Residual 8.231e-02 0.286896
## Number of obs: 14792, groups: administration_id, 1097; target_label, 115
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.125e-01 1.090e-02 8.326e+01 56.170 < 2e-16 ***
## age_centered 3.296e-03 9.317e-04 3.245e+01 3.537 0.00124 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.062
distractors
m3 <- lmer(prop_looking ~ age_centered
+ (1 | administration_id) + (age_centered || target_label) +
+ (1 | distractor_label),
data = acc_mod_data)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00466064 (tol = 0.002, component 1)
summary(m3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## prop_looking ~ age_centered + (1 | administration_id) + (age_centered ||
## target_label) + +(1 | distractor_label)
## Data: acc_mod_data
##
## REML criterion at convergence: 5779.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8153 -0.6092 0.1304 0.7718 2.3441
##
## Random effects:
## Groups Name Variance Std.Dev.
## administration_id (Intercept) 5.190e-03 0.072042
## target_label (Intercept) 5.426e-03 0.073660
## target_label.1 age_centered 6.594e-06 0.002568
## distractor_label (Intercept) 3.552e-03 0.059600
## Residual 8.178e-02 0.285975
## Number of obs: 14792, groups:
## administration_id, 1097; target_label, 115; distractor_label, 115
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.114e-01 1.315e-02 8.418e+01 46.482 < 2e-16 ***
## age_centered 4.205e-03 7.742e-04 1.800e+01 5.432 3.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.012
## convergence code: 0
## Model failed to converge with max|grad| = 0.00466064 (tol = 0.002, component 1)
age for distractors
m4 <- lmer(prop_looking ~ age_centered
+ (1 | administration_id) + (age_centered || target_label) +
+ (age_centered || distractor_label),
data = acc_mod_data)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0157822 (tol = 0.002, component 1)
summary(m4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## prop_looking ~ age_centered + (1 | administration_id) + (age_centered ||
## target_label) + +(age_centered || distractor_label)
## Data: acc_mod_data
##
## REML criterion at convergence: 5779.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8151 -0.6094 0.1303 0.7717 2.3444
##
## Random effects:
## Groups Name Variance Std.Dev.
## administration_id (Intercept) 5.192e-03 0.0720562
## target_label (Intercept) 5.448e-03 0.0738096
## target_label.1 age_centered 6.498e-06 0.0025491
## distractor_label (Intercept) 3.541e-03 0.0595093
## distractor_label.1 age_centered 8.861e-08 0.0002977
## Residual 8.178e-02 0.2859714
## Number of obs: 14792, groups:
## administration_id, 1097; target_label, 115; distractor_label, 115
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.114e-01 1.316e-02 8.398e+01 46.461 < 2e-16 ***
## age_centered 4.196e-03 7.754e-04 1.742e+01 5.412 4.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.012
## convergence code: 0
## Model failed to converge with max|grad| = 0.0157822 (tol = 0.002, component 1)
Question: do distractors matter in word recognition? Here’s the idea: test between a no-distractor model and a distractor-informed model.
Let’s assume each target \(T\) and distractor \(D\) has a recognizability score, with lower being more difficult.
No-distractor model: target accuracy should be about target difficulty \(T\).
Distractor model: target accuracy should be a luce choice of \(T / (T + D)\).
Two plausible models of recognizability would be 1) childes frequency data and 2) wordbank age of acquisition data.
Let’s start with the wordbank data because it’s easy.
We’ll use Wordbank AOAs estimated in the wordbank book with a bayesian GLM.
Larger AOAs are harder, that’s the opposite of recognizability. Let’s try using inverse AoA.
aoas <- read_csv(here("data/bglm_aoas_english.csv"))
aoa_data <- left_join(acc_mod_data,
aoas %>%
transmute(target_label = definition,
target_aoa = bglm_aoa,
target_category = category)) %>%
left_join(aoas %>%
transmute(distractor_label = definition,
distractor_aoa = bglm_aoa)) %>%
filter(!is.na(target_aoa), !is.na(distractor_aoa)) %>%
ungroup() %>%
mutate(target_aoa_centered = target_aoa - mean(target_aoa),
distractor_aoa_centered = distractor_aoa - mean(distractor_aoa))
aoa_data <- aoa_data %>%
mutate(inverse_target = 1/target_aoa,
inverse_distractor = 1/distractor_aoa,
luce = inverse_target / (inverse_target + inverse_distractor))
m1 <- lmer(prop_looking ~ luce + (1|administration_id) + (1|dataset_name),
data = aoa_data)
summary(m1)$coef %>%
knitr::kable(digits = 3)
Estimate | Std. Error | df | t value | Pr(>|t|) | |
---|---|---|---|---|---|
(Intercept) | 0.718 | 0.035 | 205.467 | 20.572 | 0.000 |
luce | -0.126 | 0.063 | 8991.056 | -1.984 | 0.047 |
m2 <- lmer(prop_looking ~ target_aoa + (1|administration_id) + (1|dataset_name),
data = aoa_data)
summary(m2)$coef %>%
knitr::kable(digits = 3)
Estimate | Std. Error | df | t value | Pr(>|t|) | |
---|---|---|---|---|---|
(Intercept) | 0.665 | 0.024 | 42.548 | 27.973 | 0.000 |
target_aoa | -0.001 | 0.001 | 6189.088 | -0.513 | 0.608 |
m3 <- lmer(prop_looking ~ inverse_target + (1|administration_id) +
(1|dataset_name),
data = aoa_data)
summary(m3)$coef %>%
knitr::kable(digits = 3)
Estimate | Std. Error | df | t value | Pr(>|t|) | |
---|---|---|---|---|---|
(Intercept) | 0.664 | 0.023 | 36.232 | 28.784 | 0.000 |
inverse_target | -0.156 | 0.296 | 7121.531 | -0.525 | 0.599 |
Add age.
m1_age <- lmer(prop_looking ~ luce * age + (1 | administration_id) +
(1 | dataset_name),
data = aoa_data)
summary(m1_age)$coef %>%
knitr::kable(digits = 3)
Estimate | Std. Error | df | t value | Pr(>|t|) | |
---|---|---|---|---|---|
(Intercept) | 0.310 | 0.083 | 2784.277 | 3.743 | 0.000 |
luce | 0.402 | 0.162 | 9122.254 | 2.490 | 0.013 |
age | 0.017 | 0.003 | 9311.413 | 5.255 | 0.000 |
luce:age | -0.023 | 0.007 | 9184.104 | -3.545 | 0.000 |
And visualize.
ggplot(aoa_data %>%
mutate(luce_binned = cut(luce, quantile(luce, c(0,.25, .5, .75, 1)),
include.lowest = TRUE)),
aes(x = age, y = prop_looking, col = luce_binned)) +
geom_jitter(alpha = .1, width = .2) +
geom_smooth(method = "lm")
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).