suppressMessages(library(here))
suppressMessages(library(jpeg))
suppressMessages(library(grid))
suppressMessages(library(lmerTest))
suppressMessages(library(car))

source(here::here("helper/common.R"))
source(here("helper/preprocessing_helper.R"))

knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE)

1 Intro

Pilot data analysis for MB2.

We can re-preprocess all of our data; this chunk is set not to evaluate by default. To do so, you will need to register with Eyelink to get their binary package and then install edfR.
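For reference, a minimal sketch of that installation step (the GitHub repository path is an assumption, not taken from this project):

# Not run: edfR is not on CRAN. It requires the Eyelink binary package,
# available after registering with Eyelink, and is then installed from GitHub.
# The repository path below is an assumption -- check the edfR documentation.
# install.packages("devtools")
# devtools::install_github("jashubbard/edfR")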

labs <- dir(here::here("pilot_data"))

for (lab in labs) {
  print(lab)
  source(here::here("pilot_data", lab, "import_scripts", "import.R"))
}

2 File reading

labs <- dir(here::here("pilot_data"))

d <- labs %>%
  map_df(function(lab) {
    aoi_data <- read_csv(here(paste0("pilot_data/",
                                     lab,"/processed_data/aoi_data.csv"))) 
    subjects <- read_csv(here(paste0("pilot_data/",
                                     lab,"/processed_data/subjects.csv"))) 
    trials <- read_csv(here(paste0("pilot_data/",
                                   lab,"/processed_data/trials.csv"))) 
    datasets <- read_csv(here(paste0("pilot_data/",
                                     lab,"/processed_data/datasets.csv")))
    
    left_join(aoi_data, subjects) %>%
      left_join(trials) %>%
      left_join(datasets) %>%
      select(lab_subject_id, lab_dataset_id, lab_trial_id, trial_id, 
             age, t, aoi, trial_num, error) %>%
      rename(subid = lab_subject_id, 
             lab = lab_dataset_id, 
             stimulus = lab_trial_id)
  })

3 Exclusions

# exclude subjects marked with any error (subjects with fewer than 8 completed trials are excluded below)
d <- d %>% 
  group_by(lab, subid) %>%
  mutate(error_subj = any(error)) %>%
  exclude_by(quo(error_subj), quiet = FALSE) 
## 
## filtering by error_subj 
## [1] "This variable excludes 50 trials, which is  8.3 % of all trials."
## [1] "9  subjects, 11.4 %, have all trials where error_subj is true: exclude"
## [1] "Excluded: "
## 
## 
## lab                  subid   
## -------------------  --------
## CEU_babylab          mb2_p14 
## CEU_babylab          mb2_p8  
## copenhagen_babylab   mb2p01  
## copenhagen_babylab   mb2p05  
## copenhagen_babylab   mb2p13
# exclude trials under 35 s (which are not complete trials)
d <- ungroup(d) %>% 
  group_by(lab, trial_id, subid) %>%
  mutate(time_range = (max(t) - min(t)) / 1000) %>%
  exclude_by(quo(time_range <= 35), quiet = FALSE)
## 
## filtering by time_range <= 35 
## [1] "This variable excludes 2 trials, which is  0.4 % of all trials."
## [1] "0  subjects, 0 %, have all trials where time_range <= 35 is true: exclude"
## [1] "Excluded: "
## 
## 
## lab              trial_id  subid  
## --------------  ---------  -------
## CEU_babylab            86  mb2_p5 
## ubc_infantlab          11  AC15
# print trial time ranges by lab
ungroup(d) %>%
  group_by(lab) %>% 
  summarise(shortest_trial=min(time_range),
            longest_trial=max(time_range)) %>%
  kable(digits=2)
lab                  shortest_trial   longest_trial
-------------------  ---------------  --------------
CEU_babylab                    38.58           38.62
copenhagen_babylab             38.58           38.62
goettingen_babylab             38.30           38.73
lmu_babylab                    38.58           38.73
trento_babylab                 38.70           38.75
ubc_infantlab                  36.45           38.70
uchicago_babylab               38.55           38.62
# exclude subjects who did not complete all 8 trials
d <- ungroup(d) %>% 
  group_by(lab, subid) %>%
  mutate(trials_completed = length(unique(trial_id))) %>%
  exclude_by(quo(trials_completed < 8), quiet = FALSE)
## 
## filtering by trials_completed < 8 
## [1] "This variable excludes 10 trials, which is  1.8 % of all trials."
## [1] "2  subjects, 2.9 %, have all trials where trials_completed < 8 is true: exclude"
## [1] "Excluded: "
## 
## 
## lab             subid  
## --------------  -------
## CEU_babylab     mb2_p5 
## ubc_infantlab   AC15

4 Analysis

Descriptives

d %>%
  group_by(lab, subid) %>%
  summarise(age = mean(age)) %>%
  summarise(n = n(), 
            age = mean(age)/30.25) %>%
  kable(digits = 2)
lab                    n      age
-------------------  ---   ------
CEU_babylab           13    24.73
copenhagen_babylab    10    22.18
goettingen_babylab     6    24.79
lmu_babylab           13    22.99
trento_babylab         5    21.41
ubc_infantlab         10    21.22
uchicago_babylab      11    23.70

Anticipation plot across all trials.

ms <- d %>%
  group_by(t, trial_num) %>%
  summarise(target = mean(aoi == "target", na.rm=TRUE),
            distractor = mean(aoi == "distractor", na.rm=TRUE)) %>%
  gather(region, looking, target, distractor) 

ggplot(ms, aes(x = t, y = looking, col = region)) + 
  geom_line() + 
  geom_vline(xintercept = 120, col = "red", lty = 2)

In the primary time period of interest (the 4000 ms on either side of the anticipation/reaction boundary at t = 120 ms):

ms <- d %>%
  group_by(t) %>%
  summarise(target = mean(aoi == "target", na.rm = TRUE),
            distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
  gather(region, looking, target, distractor) 
  
ggplot(ms, aes(x = t, y = looking, col = region)) +
  geom_point() + 
  xlim(-4000 + 120, 4000 + 120) + 
  geom_vline(xintercept = 120, col = "red", lty = 2) + 
  annotate("text", x = -4000, y = .95, col = "black", 
           label = "Anticipation", hjust = 0) + 
  annotate("text", x = 200, y = .95, col = "black", 
           label = "Reaction", hjust = 0)

Now, broken down by trial: a summary of target looking across the anticipation window.

ms <- d %>%
  filter(t > -4000, t < 120) %>%
  group_by(lab, subid, trial_num) %>%
  summarise(target = mean(aoi == "target", na.rm = TRUE)) %>%
  group_by(trial_num) %>%
  langcog::multi_boot_standard(col = "target", na.rm = TRUE)


ggplot(ms, aes(x = trial_num, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) + 
  geom_line()

Binned into blocks of trials (1-4 vs. 5-8) for cleaner curves.

ms <- d %>%
  mutate(block = ifelse(trial_num < 5, "Trials 1-4", "Trials 5-8")) %>%
  group_by(t, block) %>%
  summarise(target = mean(aoi == "target", na.rm = TRUE),
            distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
  gather(region, looking, target, distractor) 
  
ggplot(ms, aes(x = t, y = looking, col = region)) +
  geom_point() + 
  # geom_smooth(span = 2, se = FALSE) + 
  xlim(-4000 + 120, 4000 + 120) +
  geom_vline(xintercept = 120, col = "black", lty = 3) + 
  annotate("text", x = -3800, y = 1, col = "black",
            label = "Anticipation", hjust = 0) +
  annotate("text", x = 200, y = 1, col = "black", 
            label = "Reaction", hjust = 0) + 
  ggthemes::scale_color_solarized(name = "Area of Interest") + 
  xlab("Time (msec)") + 
  ylab("Proportion gaze in AOI") + 
  theme(legend.position = "bottom") + 
  facet_wrap(~block)

And by lab:

ms <- d %>%
  mutate(block = ifelse(trial_num < 5, 1, 2)) %>%
  group_by(t, lab, block) %>%
  summarise(target = mean(aoi == "target", na.rm = TRUE),
            distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
  gather(region, looking, target, distractor) 
  
ggplot(ms, aes(x = t, y = looking, col = region)) +
  geom_point() + 
  # geom_smooth(span = 2, se = FALSE) + 
  xlim(-4000 + 120, 4000 + 120) + 
  geom_vline(xintercept = 120, col = "red", lty = 2) +
  facet_grid(lab~block)

5 Main analysis 1: First anticipatory look

First, create a function that extracts the time stamp of the first AOI look.

t_first_aoi <- function(df){
  
  df <- df %>% 
    filter(t >= -4000, t <= 120, aoi %in% c("target", "distractor")) 
  
  df <- df %>% 
    group_by(subid, trial_num) %>% 
    mutate(first_look_time = t[1])
  
  return(df)
}

Second, create a data frame that identifies the first look and checks whether that first look lasts more than 100 ms.

Note that this is currently done ad hoc for a number of participants (a more systematic rule is sketched after this pipeline). TODO: clean up!

df_t_first_aoi <- d %>% split(list(.$subid, .$trial_num)) %>% 
  map_df(t_first_aoi) %>% 
  mutate(aoi_dummy = case_when(aoi == "target" ~ 1, aoi == "distractor" ~ 0),
         subid_trial = paste0(subid, trial_num)) 

df_t_first_aoi <- df_t_first_aoi %>% 
                  group_by(subid, trial_num) %>% 
                  mutate(diff = aoi_dummy - lag(aoi_dummy))

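# helper: keep only the first four rows of each split, used below to flag
# trials whose AOI changes within the first few samples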
head_4 <- function(df){df %>% slice(1:4)}

df_t_4 <- df_t_first_aoi %>% split(list(.$subid, .$trial_num)) %>%
                  map_df(head_4) %>%
                  filter(diff != 0) %>% 
                  mutate(subid_trial = paste0(subid, trial_num))

unique(df_t_4$subid_trial) 
##  [1] "lmu_child044" "lmu_child111" "lmu_child115" "lmu_child121"
##  [5] "mb2_p123"     "mb2_p164"     "mb2_p37"      "MB2_P46"     
##  [9] "MB2_P56"      "MB2-108"      "MB2-117"      "MB2-124"     
## [13] "MB2-126"      "mb2p067"      "mb2p103"      "mb2p114"     
## [17] "mb2p126"      "mb2p127"
df_t_change_kids <- df_t_first_aoi %>% 
                    mutate(subid_trial = paste0(subid, trial_num)) %>% 
                    filter(subid_trial %in% df_t_4$subid_trial) %>% 
                    select(subid_trial, subid, trial_num, t, first_look_time, aoi, aoi_dummy)

df_t_first_aoi <- df_t_first_aoi %>% 
  select(subid_trial, subid, lab, stimulus, age, trial_num, t, first_look_time, aoi, aoi_dummy) %>% 
  filter(t == first_look_time) %>% 
  mutate(first_look_location = case_when(subid_trial == "lmu_child044" ~ 1,
                                       subid_trial == "lmu_child111" ~ 0,
                                       subid_trial == "lmu_child115" ~ 0,
                                       subid_trial == "lmu_child121" ~ 1,
                                       subid_trial == "mb2_p123" ~ 1,
                                       subid_trial == "mb2_p164" ~ 1,
                                       subid_trial == "mb2_p37" ~ 0,
                                       subid_trial == "MB2_P46" ~ 0,
                                       subid_trial == "MB2_P56" ~ 0,
                                       subid_trial == "MB2-108" ~ 0,
                                       subid_trial == "MB2-117" ~ 1,
                                       subid_trial == "MB2-124" ~ 1,
                                       subid_trial == "MB2-126" ~ 1,
                                       subid_trial == "mb2p067" ~ 1,
                                       subid_trial == "mb2p103" ~ 1,
                                       subid_trial == "mb2p114" ~ NA_real_,
                                       subid_trial == "mb2p126" ~ 0,
                                       subid_trial == "mb2p127" ~ 1,
                                       TRUE ~ aoi_dummy)) %>% 
  filter(!is.na(first_look_location))
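The ad-hoc corrections above could eventually be replaced by a programmatic rule. The following is a minimal sketch, not the analysis used here: within the anticipation window, find each subject's first run of consecutive samples on the target or distractor lasting more than 100 ms (the run-duration definition and threshold placement are assumptions).

first_look_100ms <- d %>%
  filter(t >= -4000, t <= 120, aoi %in% c("target", "distractor")) %>%
  arrange(lab, subid, trial_num, t) %>%
  group_by(lab, subid, trial_num) %>%
  # label runs of consecutive samples in the same AOI
  mutate(run = cumsum(aoi != lag(aoi, default = first(aoi)))) %>%
  group_by(lab, subid, trial_num, run, aoi) %>%
  summarise(run_start = min(t),
            run_dur = max(t) - min(t)) %>%
  # keep the earliest run longer than 100 ms per subject and trial
  group_by(lab, subid, trial_num) %>%
  filter(run_dur > 100) %>%
  arrange(run_start) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(first_look_location = as.numeric(aoi == "target"))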

5.1 Data visualization across different test trials

ggplot(df_t_first_aoi, aes(x = first_look_location)) + 
  geom_bar() + 
  facet_wrap(.~trial_num, ncol = 4)

5.2 Descriptive stats

Mean and SD of first-look location across trials. All means are above 50%.

df_t_first_aoi %>% 
  select(trial_num, first_look_location) %>% 
  group_by(trial_num) %>% 
  summarize(number_of_kids_each_trial = n(),
            mean(first_look_location),
            sd(first_look_location))
## # A tibble: 8 x 4
##   trial_num number_of_kids_each_… `mean(first_look_lo… `sd(first_look_loca…
##       <dbl>                 <int>                <dbl>                <dbl>
## 1         1                    61                0.852                0.358
## 2         2                    60                0.633                0.486
## 3         3                    59                0.746                0.439
## 4         4                    55                0.836                0.373
## 5         5                    60                0.55                 0.502
## 6         6                    60                0.75                 0.437
## 7         7                    51                0.667                0.476
## 8         8                    55                0.6                  0.494

5.3 Mixed-level analysis: First look location

first_look_logistic <- glmer(first_look_location ~ trial_num + (1|subid), data = df_t_first_aoi, family = "binomial")

summary(first_look_logistic)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: first_look_location ~ trial_num + (1 | subid)
##    Data: df_t_first_aoi
## 
##      AIC      BIC   logLik deviance df.resid 
##    557.5    569.9   -275.8    551.5      458 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0057 -1.2246  0.5488  0.6507  0.9099 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subid  (Intercept) 0.2019   0.4493  
## Number of obs: 461, groups:  subid, 63
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.41505    0.24904   5.682 1.33e-08 ***
## trial_num   -0.11355    0.04637  -2.449   0.0143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## trial_num -0.872

Overall mean first-look location (i.e., the proportion of first looks to the target).

mean(df_t_first_aoi$first_look_location)
## [1] 0.7049892

6 Main analysis 2: Differential analysis

6.1 Data visualization across different test trials

ms_proportion_trials <- d %>%
  group_by(t, trial_num) %>%
  summarise(target = mean(aoi == "target", na.rm = TRUE),
            distractor = mean(aoi == "distractor", na.rm = TRUE)) %>%
  mutate(proportion_to_target = target / (target + distractor)) %>% 
  filter(t >= -4000 & t <= 120 + 4000)


ggplot(ms_proportion_trials, 
       aes(x = t, y = proportion_to_target)) +
  geom_point(alpha = 0.3) + 
  ylim(0,1) + 
  # geom_smooth(span = 2, se = FALSE) + 
  xlim(-4000 + 120, 4000 + 120) + 
  geom_vline(xintercept = 120, col = "red", lty = 2) + 
  annotate("text", x = -4000, y = .95, col = "black", 
           label = "Anticipation", hjust = 0) + 
  annotate("text", x = 200, y = .95, col = "black", 
           label = "Reaction", hjust = 0) +
  facet_wrap(.~trial_num, ncol = 4)   

Create the data set for the mixed-level analysis.

ms_proportion_trials <- d %>%
  filter(t >= -4000 & t <= 120) %>% 
  group_by(trial_num, subid, lab) %>%
  summarise(mean_target = mean(aoi == "target", na.rm = TRUE),
            sem_target = sd(aoi == "target", na.rm = TRUE) / sqrt(sum(!is.na(aoi))),
            mean_distractor = mean(aoi == "distractor", na.rm = TRUE), 
            sem_distractor = sd(aoi == "distractor", na.rm = TRUE) / sqrt(sum(!is.na(aoi))),
            up_ci_target = mean_target + (sem_target * 1.96),
            low_ci_target = mean_target - (sem_target * 1.96),
            up_ci_distractor = mean_distractor + (sem_distractor * 1.96),
            low_ci_distractor = mean_distractor - (sem_distractor * 1.96)) %>% 
  mutate(proportion_to_target = mean_target / (mean_target + mean_distractor))

6.2 Mixed-level analysis for diff-score analysis

Note that technically we had 68 participants, but this analysis is based on 63 kids because not all kids looked at either the target or the distractor between -4000 ms and 120 ms (i.e., they either looked away, yielding missing data, or looked at other parts of the screen, coded as "other").

# Originally I tried lmer(proportion_to_target ~ trial_num + (1 | lab) + (trial_num | subid),
# data = ms_proportion_trials), but it did not converge, so I changed to the simpler model below.
prop_model <- lmer(proportion_to_target ~ trial_num + (1|subid), data = ms_proportion_trials)
summary(prop_model) 
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: proportion_to_target ~ trial_num + (1 | subid)
##    Data: ms_proportion_trials
## 
## REML criterion at convergence: 372.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1939 -0.6319  0.3596  0.8207  1.1991 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  subid    (Intercept) 0.004365 0.06607 
##  Residual             0.117512 0.34280 
## Number of obs: 494, groups:  subid, 63
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   0.752455   0.034608 406.548470   21.74   <2e-16 ***
## trial_num    -0.014235   0.006746 441.916651   -2.11   0.0354 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## trial_num -0.860
# linear hypothesis test to see whether the intercept of prop_model is significantly higher than chance level (0.5)
linearHypothesis(prop_model, "(Intercept) = 0.5")
## Linear hypothesis test
## 
## Hypothesis:
## (Intercept) = 0.5
## 
## Model 1: restricted model
## Model 2: proportion_to_target ~ trial_num + (1 | subid)
## 
##   Df  Chisq Pr(>Chisq)    
## 1                         
## 2  1 53.212  2.995e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the linear hypothesis test can also be checked with an approximate z score for the
# intercept against chance: (estimate - 0.5) divided by its standard error
z_intercept <- (0.752455 - 0.5) / 0.034608 # roughly 7.3, much higher than 1.96
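For reference, the maximal model mentioned in the comment above can be written out as below; the optimizer switch is only an illustrative convergence workaround, not part of the original analysis, and the model may still fail to converge.

# Not run: the maximal model that originally failed to converge.
# The bobyqa optimizer is an assumption/illustration, not the original setting.
prop_model_max <- lmer(proportion_to_target ~ trial_num + (1 | lab) + (trial_num | subid),
                       data = ms_proportion_trials,
                       control = lmerControl(optimizer = "bobyqa"))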