suppressMessages(library(here))
suppressMessages(library(jpeg))
suppressMessages(library(grid))
suppressMessages(library(lmerTest))
## Warning: package 'lme4' was built under R version 3.5.2
suppressMessages(library(car))
source(here::here("helper/common.R"))
## Warning: package 'purrr' was built under R version 3.5.2
##
## Attaching package: 'purrr'
## The following object is masked from 'package:car':
##
## some
## Warning: package 'readxl' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'rlang' was built under R version 3.5.2
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl,
## flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
## modify, prepend, splice
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:rlang':
##
## set_names
## The following object is masked from 'package:purrr':
##
## set_names
## Warning: package 'knitr' was built under R version 3.5.2
## Warning: package 'DT' was built under R version 3.5.2
## Warning: package 'assertthat' was built under R version 3.5.2
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:rlang':
##
## has_name
## ── Attaching packages ─────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## ── Conflicts ────────────────────────── tidyverse_conflicts() ──
## ✖ rlang::%@%() masks purrr::%@%()
## ✖ rlang::as_function() masks purrr::as_function()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ rlang::flatten() masks purrr::flatten()
## ✖ rlang::flatten_chr() masks purrr::flatten_chr()
## ✖ rlang::flatten_dbl() masks purrr::flatten_dbl()
## ✖ rlang::flatten_int() masks purrr::flatten_int()
## ✖ rlang::flatten_lgl() masks purrr::flatten_lgl()
## ✖ rlang::flatten_raw() masks purrr::flatten_raw()
## ✖ tibble::has_name() masks assertthat::has_name(), rlang::has_name()
## ✖ rlang::invoke() masks purrr::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ rlang::list_along() masks purrr::list_along()
## ✖ rlang::modify() masks purrr::modify()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ rlang::prepend() masks purrr::prepend()
## ✖ dplyr::recode() masks car::recode()
## ✖ magrittr::set_names() masks rlang::set_names(), purrr::set_names()
## ✖ purrr::some() masks car::some()
## ✖ rlang::splice() masks purrr::splice()
## ✖ tidyr::unpack() masks Matrix::unpack()
source(here("helper/preprocessing_helper.R"))
knitr::opts_chunk$set(cache = TRUE, warn = FALSE, message = FALSE)
Intro
Pilot data analysis for MB2.
We can re-preprocess all of our data, this is set not to evaluate by default. In order to do this, you will need to register with Eyelink to get their binary package and then install edfR
.
labs <- dir(here::here("pilot_data"))
for (lab in labs) {
print(lab)
source(here::here("pilot_data", lab, "import_scripts", "import.R"))
}
File reading
labs <- dir(here::here("pilot_data"))
d <- labs %>%
map_df(function(lab) {
aoi_data <- read_csv(here(paste0("pilot_data/",
lab,"/processed_data/aoi_data.csv")))
subjects <- read_csv(here(paste0("pilot_data/",
lab,"/processed_data/subjects.csv")))
trials <- read_csv(here(paste0("pilot_data/",
lab,"/processed_data/trials.csv")))
datasets <- read_csv(here(paste0("pilot_data/",
lab,"/processed_data/datasets.csv")))
left_join(aoi_data, subjects) %>%
left_join(trials) %>%
left_join(datasets) %>%
select(lab_subject_id, lab_dataset_id, lab_trial_id, trial_id,
age, t, aoi, trial_num, error) %>%
rename(subid = lab_subject_id,
lab = lab_dataset_id,
stimulus = lab_trial_id)
})
Exclusions
# exclude subject marked with any error and/or less than 8 trials
d <- d %>%
group_by(lab, subid) %>%
mutate(error_subj = any(error)) %>%
exclude_by(quo(error_subj), quiet=FALSE)
##
## filtering by error_subj
## [1] "This variable excludes 50 trials, which is 8.3 % of all trials."
## [1] "9 subjects, 11.4 %, have all trials where error_subj is true: exclude"
## [1] "Excluded: "
##
##
## lab subid
## ------------------- --------
## CEU_babylab mb2_p14
## CEU_babylab mb2_p8
## copenhagen_babylab mb2p01
## copenhagen_babylab mb2p05
## copenhagen_babylab mb2p13
# exclude trials under 35s (which are not complete trials)
d <- ungroup(d) %>%
group_by(lab, trial_id, subid) %>%
mutate(time_range = (max(t) - min(t))/1000) %>%
exclude_by(quo(time_range <= 35), quiet=FALSE)
##
## filtering by time_range <= 35
## [1] "This variable excludes 2 trials, which is 0.4 % of all trials."
## [1] "0 subjects, 0 %, have all trials where time_range <= 35 is true: exclude"
## [1] "Excluded: "
##
##
## lab trial_id subid
## -------------- --------- -------
## CEU_babylab 86 mb2_p5
## ubc_infantlab 11 AC15
# print trial time ranges by lab
ungroup(d) %>%
group_by(lab) %>%
summarise(shortest_trial=min(time_range),
longest_trial=max(time_range)) %>%
kable(digits=2)
CEU_babylab |
38.58 |
38.62 |
copenhagen_babylab |
38.58 |
38.62 |
goettingen_babylab |
38.30 |
38.73 |
lmu_babylab |
38.58 |
38.73 |
trento_babylab |
38.70 |
38.75 |
ubc_infantlab |
36.45 |
38.70 |
uchicago_babylab |
38.55 |
38.62 |
# exclude subjects which did not complete all 8 trials
d <- ungroup(d) %>%
group_by(lab, subid) %>%
mutate(trials_completed = length(unique(trial_id))) %>%
exclude_by(quo(trials_completed < 8),quiet=FALSE)
##
## filtering by trials_completed < 8
## [1] "This variable excludes 10 trials, which is 1.8 % of all trials."
## [1] "2 subjects, 2.9 %, have all trials where trials_completed < 8 is true: exclude"
## [1] "Excluded: "
##
##
## lab subid
## -------------- -------
## CEU_babylab mb2_p5
## ubc_infantlab AC15
Analysis
Descriptives
d %>%
group_by(lab, subid) %>%
summarise(age = mean(age)) %>%
summarise(n = n(),
age = mean(age)/30.25) %>%
kable(digits = 2)
CEU_babylab |
13 |
24.73 |
copenhagen_babylab |
10 |
22.18 |
goettingen_babylab |
6 |
24.79 |
lmu_babylab |
13 |
22.99 |
trento_babylab |
5 |
21.41 |
ubc_infantlab |
10 |
21.22 |
uchicago_babylab |
11 |
23.70 |
Anticipation plot across all trials.
ms <- d %>%
group_by(t, trial_num) %>%
summarise(target = mean(aoi == "target", na.rm=TRUE),
distractor = mean(aoi == "distractor", na.rm=TRUE)) %>%
gather(region, looking, target, distractor)
ggplot(ms, aes(x = t, y = looking, col = region)) +
geom_line() +
geom_vline(xintercept = 120, col = "red", lty = 2)

In the primary time period of interest
ms <- d %>%
group_by(t) %>%
summarise(target = mean(aoi == "target", na.rm = TRUE),
distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
gather(region, looking, target, distractor)
ggplot(ms, aes(x = t, y = looking, col = region)) +
geom_point() +
xlim(-4000 + 120, 4000 + 120) +
geom_vline(xintercept = 120, col = "red", lty = 2) +
geom_text(x = -4000, y = .95, group = 1, col = "black",
label = "Anticipation", hjust = 0) +
geom_text(x = 200, y = .95, group = 1, col = "black",
label = "Reaction", hjust = 0)

Now, broken down by trial. Summary across anticipation window.
ms <- d %>%
filter(t > -4000, t < 120) %>%
group_by(lab, subid, trial_num) %>%
summarise(target = mean(aoi == "target", na.rm = TRUE)) %>%
group_by(trial_num) %>%
langcog::multi_boot_standard(col = "target", na.rm = TRUE)
ggplot(ms, aes(x = trial_num, y = mean)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_line()

Binned for cleaner curves
ms <- d %>%
mutate(block = ifelse(trial_num < 5, "Trials 1-4", "Trials 5-8")) %>%
group_by(t, block) %>%
summarise(target = mean(aoi == "target", na.rm = TRUE),
distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
gather(region, looking, target, distractor)
ggplot(ms, aes(x = t, y = looking, col = region)) +
geom_point() +
# geom_smooth(span = 2, se = FALSE) +
xlim(-4000 + 120, 4000 + 120) +
geom_vline(xintercept = 120, col = "black", lty = 3) +
annotate("text", x = -3800, y = 1, col = "black",
label = "Anticipation", hjust = 0) +
annotate("text", x = 200, y = 1, col = "black",
label = "Reaction", hjust = 0) +
ggthemes::scale_color_solarized(name = "Area of Interest") +
xlab("Time (msec)") +
ylab("Proportion gaze in AOI") +
theme(legend.position = "bottom") +
facet_wrap(~block)

And by lab:
ms <- d %>%
mutate(block = ifelse(trial_num < 5, 1, 2)) %>%
group_by(t, lab, block) %>%
summarise(target = mean(aoi == "target", na.rm = TRUE),
distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
gather(region, looking, target, distractor)
ggplot(ms, aes(x = t, y = looking, col = region)) +
geom_point() +
# geom_smooth(span = 2, se = FALSE) +
xlim(-4000 + 120, 4000 + 120) +
geom_vline(xintercept = 120, col = "red", lty = 2) +
facet_grid(lab~block)

Main analysis 1: First anticipatory look
First, create a function that extract the time stamp of first aoi.
t_first_aoi <- function(df){
df <- df %>%
filter(t >= -4000, t <= 120, aoi == c("target", "distractor"))
df = df %>% group_by(subid, trial_num) %>% mutate(first_look_time = t[1])
return(df)
}
Second, create a dataframe to identify the first look and see if the first look is >100ms
Note that this is done ad-hoc for a number of participants. TODO - cleanup!
df_t_first_aoi <- d %>% split(.$subid, .$trial_num) %>%
map_df(t_first_aoi) %>%
mutate(aoi_dummy = case_when(aoi == "target" ~ 1, aoi == "distractor" ~ 0),
subid_trial = paste0(subid, trial_num))
df_t_first_aoi <- df_t_first_aoi %>%
group_by(subid, trial_num) %>%
mutate(diff = aoi_dummy - lag(aoi_dummy))
head_4 <- function(df){df %>% slice(1:4)}
df_t_4 <- df_t_first_aoi %>% split(.$subid, .$trial_num) %>%
map_df(head_4) %>%
filter(diff != 0) %>%
mutate(subid_trial = paste0(subid, trial_num))
unique(df_t_4$subid_trial)
## [1] "lmu_child044" "lmu_child111" "lmu_child115" "lmu_child121"
## [5] "mb2_p123" "mb2_p164" "mb2_p37" "MB2_P46"
## [9] "MB2_P56" "MB2-108" "MB2-117" "MB2-124"
## [13] "MB2-126" "mb2p067" "mb2p103" "mb2p114"
## [17] "mb2p126" "mb2p127"
df_t_change_kids <- df_t_first_aoi %>%
mutate(subid_trial = paste0(subid, trial_num)) %>%
filter(subid_trial %in% df_t_4$subid_trial) %>%
select(subid_trial, subid, trial_num, t, first_look_time, aoi, aoi_dummy)
df_t_first_aoi <- df_t_first_aoi %>%
select(subid_trial, subid, lab, stimulus, age, trial_num, t, first_look_time, aoi, aoi_dummy) %>%
filter(t == first_look_time) %>%
mutate(first_look_location = case_when(subid_trial == "lmu_child044" ~ 1,
subid_trial == "lmu_child111" ~ 0,
subid_trial == "lmu_child115" ~ 0,
subid_trial == "lmu_child121" ~ 1,
subid_trial == "mb2_p123" ~ 1,
subid_trial == "mb2_p164" ~ 1,
subid_trial == "mb2_p37" ~ 0,
subid_trial == "MB2_P46" ~ 0,
subid_trial == "MB2_P56" ~ 0,
subid_trial == "MB2-108" ~ 0,
subid_trial == "MB2-117" ~ 1,
subid_trial == "MB2-124" ~ 1,
subid_trial == "MB2-126" ~ 1,
subid_trial == "mb2p067" ~ 1,
subid_trial == "mb2p103" ~ 1,
subid_trial == "mb2p114" ~ NA_real_,
subid_trial == "mb2p126" ~ 0,
subid_trial == "mb2p127" ~ 1,
TRUE ~ aoi_dummy)) %>%
filter(!is.na(first_look_location))
Data visualization across different test trials
ggplot(df_t_first_aoi, aes(x = first_look_location)) +
geom_bar() +
facet_wrap(.~trial_num, ncol = 4)

Descriptive stats
mean and sd of first look location in different trials. All numbers seem to be above 50%
df_t_first_aoi %>%
select(trial_num, first_look_location) %>%
group_by(trial_num) %>%
summarize(number_of_kids_each_trial = n(),
mean(first_look_location),
sd(first_look_location))
## # A tibble: 8 x 4
## trial_num number_of_kids_each_… `mean(first_look_lo… `sd(first_look_loca…
## <dbl> <int> <dbl> <dbl>
## 1 1 61 0.852 0.358
## 2 2 60 0.633 0.486
## 3 3 59 0.746 0.439
## 4 4 55 0.836 0.373
## 5 5 60 0.55 0.502
## 6 6 60 0.75 0.437
## 7 7 51 0.667 0.476
## 8 8 55 0.6 0.494
Mixed-level analysis: First look location
first_look_logistic <- glmer(first_look_location ~ trial_num + (1|subid), data = df_t_first_aoi, family = "binomial")
summary(first_look_logistic)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: first_look_location ~ trial_num + (1 | subid)
## Data: df_t_first_aoi
##
## AIC BIC logLik deviance df.resid
## 557.5 569.9 -275.8 551.5 458
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0057 -1.2246 0.5488 0.6507 0.9099
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid (Intercept) 0.2019 0.4493
## Number of obs: 461, groups: subid, 63
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.41505 0.24904 5.682 1.33e-08 ***
## trial_num -0.11355 0.04637 -2.449 0.0143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## trial_num -0.872
Mean level.
mean(df_t_first_aoi$first_look_location)
## [1] 0.7049892
Main analysis 2: Differential analysis
Data visualization across different test trials
ms_proportion_trials <- d %>%
group_by(t, trial_num) %>%
summarise(target = mean(aoi == "target", na.rm = TRUE),
distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
mutate(proportion_to_target = target / (target + distractor)) %>%
filter(t >= -4000 & t <= 120+ 4000)
ggplot(ms_proportion_trials,
aes(x = t, y = proportion_to_target)) +
geom_point(alpha = 0.3) +
ylim(0,1) +
# geom_smooth(span = 2, se = FALSE) +
xlim(-4000 + 120, 4000 + 120) +
geom_vline(xintercept = 120, col = "red", lty = 2) +
geom_text(x = -4000, y = .95, group = 1, col = "black",
label = "Anticipation", hjust = 0 ) +
geom_text(x = 200, y = .95, group = 1, col = "black",
label = "Reaction", hjust = 0 )+
facet_wrap(.~trial_num, ncol = 4)

Create data-set for the mixed-level analysis
ms_proportion_trials <- d %>%
filter (t >= -4000 & t <= 120) %>%
group_by(trial_num, subid, lab) %>%
summarise(mean_target = mean(aoi == "target", na.rm = TRUE),
sem_target = sd(aoi == "target", na.rm = TRUE) / sqrt(length(!is.na(aoi))),
mean_distractor = mean(aoi == "distractor", na.rm = TRUE),
sem_distractor = sd(aoi == "distractor", na.rm = TRUE) / sqrt(length(!is.na(aoi))),
up_ci_target = mean_target + (sem_target * 1.96),
low_ci_target = mean_target - (sem_target * 1.96),
up_ci_distractor = mean_distractor + (sem_distractor * 1.96),
low_ci_distractor = mean_distractor - (sem_distractor * 1.96)) %>%
mutate(proportion_to_target = mean_target / (mean_target + mean_distractor))
Mixed-level analysis for diff-score analysis
Note that technically we had 68 participants, but this analysis was based on 63 kids because not all kids looked at either target/distractor between -4000 ms to 120 ms (i.e., they either looked away (missing data) or looked at other parts of the screen (other))
# Lmer_originally, I tried the following but it didn't converge: lmer(proportion_to_target ~ trial_num + + (1|lab) + (trial_num|subid), data = ms_proportion_trials), so I changed the model to a simpler one below
prop_model <- lmer(proportion_to_target ~ trial_num + (1|subid), data = ms_proportion_trials)
summary(prop_model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: proportion_to_target ~ trial_num + (1 | subid)
## Data: ms_proportion_trials
##
## REML criterion at convergence: 372.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1939 -0.6319 0.3596 0.8207 1.1991
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid (Intercept) 0.004365 0.06607
## Residual 0.117512 0.34280
## Number of obs: 494, groups: subid, 63
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.752455 0.034608 406.548470 21.74 <2e-16 ***
## trial_num -0.014235 0.006746 441.916651 -2.11 0.0354 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## trial_num -0.860
# did a linear hypothesis test to see if the intercept of the prop_model is signficantly higher than chance level (0.5)
linearHypothesis(prop_model, "(Intercept) = 0.5")
## Linear hypothesis test
##
## Hypothesis:
## (Intercept) = 0.5
##
## Model 1: restricted model
## Model 2: proportion_to_target ~ trial_num + (1 | subid)
##
## Df Chisq Pr(>Chisq)
## 1
## 2 1 53.212 2.995e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the linear hypothesis test is also confirmed by calculating the z score of this intercept, I used the coefficient and the sd of the random intercept
z_intercept <- 0.7525/0.066 #much higher than 1.96,