02_trial_merge.Qmd

library(tidyverse)
library(janitor)

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(here)
here() starts at /Users/mcfrank/Projects/manybabies/mb1-africa
library(lme4)
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
library(lmerTest)

Attaching package: 'lmerTest'
The following object is masked from 'package:lme4':

    lmer
The following object is masked from 'package:stats':

    step

Let’s read in trial data from the different labs.

Labs right now are: Senegal, Uganda, Malawi, Rwanda, Ghana, Kenya, and Nigeria.

Ghana

Ghana is in long form.

# Ghana is delivered in long form: standardise the column names, coerce the
# looking-time column to numeric, tag the lab, and keep the shared trial schema.
ghana <- here("processed_data", "trials_cleaned", "Omane - Ghana.xlsx") |>
  readxl::read_xlsx() |>
  clean_names() |>
  mutate(
    # The cleaned source column is spelled `lookin_time_s`; entries that are
    # not numeric become NA (as.numeric warns about the coercion).
    looking_time_s = as.numeric(lookin_time_s),
    lab = "Omane - Ghana"
  ) |>
  rename(order = test_order) |>
  select(
    lab, subid, order, trial_type, stimulus, trial_num, looking_time_s,
    trial_error, trial_error_type
  )
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `looking_time_s = as.numeric(lookin_time_s)`.
Caused by warning:
! NAs introduced by coercion

Just for fun.

# Ghana looking time by trial number, coloured by trial type.
# trial_num only takes a few integer values per trial type, which makes the
# default loess smoother near-singular (the simpleLoess/predLoess warnings).
# A linear fit avoids that and matches the cross-lab plots later in this file.
ggplot(ghana, 
       aes(x = trial_num, y = looking_time_s, col = trial_type)) + 
  geom_jitter(width = .2, height = 0, alpha = .5) + 
  geom_smooth(method = "lm")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 28 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at -2.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 0
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1.01
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
-2.005
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
1.005
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
number 0
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : There are other near
singularities as well. 1.01
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_point()`).

# Histogram of Ghana looking times in 1-second bins, filled by trial type.
ghana |>
  ggplot(aes(x = looking_time_s, fill = trial_type)) +
  geom_histogram(binwidth = 1)
Warning: Removed 28 rows containing non-finite outside the scale range
(`stat_bin()`).

Uganda

Uganda is parsed in parse_uganda.Qmd.

# Load the Uganda trials, which were already parsed in parse_uganda.Qmd.
uganda <- here("processed_data", "trials_cleaned", "Kizito - Uganda parsed.csv") |>
  read_csv()
Rows: 579 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): trial_type, lab, subid
dbl (2): trial_num, looking_time_s

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Uganda looking time by trial number, coloured by trial type.
# trial_num only takes a few integer values per trial type, which makes the
# default loess smoother near-singular (the simpleLoess/predLoess warnings).
# A linear fit avoids that and matches the cross-lab plots later in this file.
ggplot(uganda, 
       aes(x = trial_num, y = looking_time_s, col = trial_type)) + 
  geom_jitter(width = .2, height = 0, alpha = .5) + 
  geom_smooth(method = "lm")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at -2.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 0
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1.01
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
-2.005
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
1.005
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
number 0
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : There are other near
singularities as well. 1.01

# Histogram of Uganda looking times in 1-second bins, filled by trial type.
uganda |>
  ggplot(aes(x = looking_time_s, fill = trial_type)) +
  geom_histogram(binwidth = 1)

Senegal, Malawi, Rwanda, Kenya, Nigeria

Next is a set in event format.

# Labs whose trial data are stored as event-format logs; each lab has its own
# sub-directory under processed_data/trials_cleaned.
log_labs <- c("Diop - Senegal", "Lamba - Malawi", 
              "Mushimiyimana - Rwanda", "Ziedler - Kenya", "Bentu - Nigeria")

# File name(s) found in each lab's directory, in the same order as log_labs.
files <- lapply(log_labs, function (x) dir(path = here("processed_data", "trials_cleaned", x)))

# Read every lab's log, tag rows with the lab, and stack them.
# Iterate over seq_along(log_labs) rather than a hard-coded 1:5 so that adding
# or removing a lab cannot silently skip data or index out of range, and use
# `[[` so the path is built from the file-name character vector itself rather
# than from a length-one list.
log_labs_data_raw <- map_df(seq_along(log_labs), \(x) {
  read_csv(here("processed_data", "trials_cleaned", log_labs[x], files[[x]])) |>
    mutate(lab = log_labs[x])
  }) |>
  # Drop rows whose SubjectID is "Phase" or "MBG_IDS"
  # (presumably repeated header / metadata rows in the logs -- TODO confirm).
  filter(!(SubjectID %in% c("Phase", "MBG_IDS"))) |>
  janitor::clean_names()
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 883 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): SubjectID, Phase, Order (randomization), Trial, Repeat, EndType, H...
dbl  (9): Trial Start, Trial End, TotalLook, TotalLookAway, TotalLeft, Total...
lgl  (4): StimLabel, Left, Right, ISS

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 1787 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): SubjectID, Phase, Order (randomization), Trial, Repeat, EndType, H...
dbl  (9): Trial Start, Trial End, TotalLook, TotalLookAway, TotalLeft, Total...
lgl  (4): StimLabel, Left, Right, ISS

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 2886 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): SubjectID, Phase, Order (randomization), Trial, Repeat, EndType, H...
dbl  (9): Trial Start, Trial End, TotalLook, TotalLookAway, TotalLeft, Total...
lgl  (4): StimLabel, Left, Right, ISS

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 1858 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): SubjectID, Phase, Order (randomization), Trial, Repeat, EndType, H...
dbl  (9): Trial Start, Trial End, TotalLook, TotalLookAway, TotalLeft, Total...
lgl  (4): StimLabel, Left, Right, ISS

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 491 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): SubjectID, Phase, Order (randomization), Trial, Repeat, EndType, H...
dbl  (9): Trial Start, Trial End, TotalLook, TotalLookAway, TotalLeft, Total...
lgl  (5): StimLabel, Left, Right, ISS, Looks

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The goal is to end up with the same columns as the other labs: select(lab, subid, order, trial_type, stimulus, trial_num, looking_time_s, trial_error, trial_error_type)

# Bring the event-format labs onto the shared trial schema, keeping a few
# extra timing columns (looking_time_diff, total_center, enabled_diff).
log_labs_data <- log_labs_data_raw |>
  # Drop rows with end type "AGAbort"
  # (presumably aborted attention-getter events -- TODO confirm).
  filter(end_type != "AGAbort") |>
  # Identifiers and stimulus coding.
  mutate(
    subid = subject_id,
    # The order number is read from the 7th character of the randomization
    # label; labels without a digit there become NA (with a coercion warning).
    order = as.numeric(str_sub(order_randomization, 7, 7)),
    stimulus = str_replace(str_to_lower(stim_name), "\\_", ""),
    trial_type = case_when(
      str_detect(stim_name, "IDS") ~ "IDS",
      str_detect(stim_name, "ADS") ~ "ADS",
      TRUE ~ "training"
    )
  ) |>
  # Trial numbering: logged trials 1-2 map to -2 and -1, trial 3 onward to 1+.
  mutate(
    trial = as.numeric(trial),
    trial_num = trial - ifelse(trial < 3, 3, 2)
  ) |>
  # Timing columns are divided by 1000
  # (presumably milliseconds to seconds -- TODO confirm units).
  mutate(
    looking_time_s = total_look / 1000,
    looking_time_diff = (trial_end - trial_start) / 1000,
    total_center = total_center / 1000,
    enabled_diff = (look_disabled - look_enabled) / 1000
  ) |>
  # These logs carry no trial-error coding; add empty placeholders.
  mutate(
    trial_error = NA,
    trial_error_type = NA
  ) |>
  select(
    lab, subid, order, trial_type, stimulus, trial_num,
    looking_time_s, looking_time_diff, total_center, enabled_diff,
    trial_error, trial_error_type
  )
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `order = as.numeric(str_sub(order_randomization, 7, 7))`.
Caused by warning:
! NAs introduced by coercion

Joint analysis

# Stack all labs into one trial-level data frame; columns that a lab lacks
# are filled with NA by bind_rows().
d <- list(ghana, uganda, log_labs_data) |>
  bind_rows()

We investigate what the different columns mean. Our hypotheses:

- looking_time_s: total_look = looking time, not including lookaways
- enabled_diff: look_disabled - look_enabled = looking time, including lookaways but not attention getters
- looking_time_diff: trial_end - trial_start = looking time, including lookaways AND attention getters

# One panel per lab: looking time against trial number with a linear trend
# for each trial type.
d |>
  ggplot(aes(x = trial_num, y = looking_time_s, colour = trial_type)) +
  facet_wrap(~lab) +
  geom_jitter(width = .2, height = 0, alpha = .5) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 28 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_point()`).

We have issues with two labs.

  1. Senegal has a lot of zero looking times. This is plausibly due to two intersecting issues:
     a. a bug in Habit such that if you hold down the key forever, you get LT = 0.
     b. probable misuse of the software by holding down the key a lot.
  2. Nigeria has no looking times and their enabled_diff is always 18s. I worry that they basically didn’t press any keys and just let the experiment run. I also think that they may not have used the orders that were assigned. (This is a separate problem.)

Senegal

Can we rescue Senegal data? Let’s look at their data. They have a few looking times that are not 0 or 18.

# Senegal only: looking time by trial number with a linear trend per trial type.
d |>
  filter(lab == "Diop - Senegal") |>
  ggplot(aes(x = trial_num, y = looking_time_s, colour = trial_type)) +
  facet_wrap(~lab) +
  geom_jitter(width = .2, height = 0, alpha = .5) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

Let’s look at the trial lengths (enabled_diff).

# Senegal only: trial length (enabled_diff) by trial number with a linear
# trend per trial type.
d |>
  filter(lab == "Diop - Senegal") |>
  ggplot(aes(x = trial_num, y = enabled_diff, colour = trial_type)) +
  facet_wrap(~lab) +
  geom_jitter(width = .2, height = 0, alpha = .5) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

Reluctantly, I think we can’t include these data.

Data passing minimal quality checks

# Minimal quality filter: drop the two problem labs, keep only positive
# looking times (rows with NA looking time are dropped too), and remove
# training trials.
d_clean <- d |>
  filter(!(lab %in% c("Diop - Senegal", "Bentu - Nigeria"))) |>
  filter(looking_time_s > 0) |>
  filter(trial_type != "training")
# Cleaned data per lab on a log10 looking-time axis, with a linear trend for
# each trial type.
d_clean |>
  ggplot(aes(x = trial_num, y = looking_time_s, colour = trial_type)) +
  geom_jitter(width = .2, height = 0, alpha = .5) +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  facet_wrap(~lab)
`geom_smooth()` using formula = 'y ~ x'

Models

# Centre trial number so the intercept and the trial_type effect are evaluated
# mid-experiment (8.5 is presumably the midpoint of 16 test trials -- confirm).
d_clean$trial_num_centered <- d_clean$trial_num - 8.5

# Log looking time as a function of trial number, trial type, and their
# interaction, with a random intercept per subject and a random intercept and
# trial_type slope per lab. lmer() here is the lmerTest version (it masks
# lme4::lmer), so summary() reports Satterthwaite t-tests.
# The data are passed directly; the former `filter(d_clean)` had no conditions
# and was a no-op.
mod <- lmer(log(looking_time_s) ~ trial_num_centered * trial_type +
              (1 | subid) +
              (trial_type | lab),
            data = d_clean)
summary(mod)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: log(looking_time_s) ~ trial_num_centered * trial_type + (1 |  
    subid) + (trial_type | lab)
   Data: filter(d_clean)

REML criterion at convergence: 5391.6

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.2385 -0.5768  0.1161  0.6837  3.2203 

Random effects:
 Groups   Name          Variance  Std.Dev. Corr
 subid    (Intercept)   0.1376398 0.37100      
 lab      (Intercept)   0.1072212 0.32745      
          trial_typeIDS 0.0008757 0.02959  0.29
 Residual               0.4081313 0.63885      
Number of obs: 2598, groups:  subid, 175; lab, 5

Fixed effects:
                                   Estimate Std. Error         df t value
(Intercept)                       1.692e+00  1.502e-01  4.016e+00  11.263
trial_num_centered               -4.671e-02  4.006e-03  2.372e+03 -11.660
trial_typeIDS                     8.610e-02  2.859e-02  3.744e+00   3.012
trial_num_centered:trial_typeIDS -9.339e-04  5.697e-03  2.391e+03  -0.164
                                 Pr(>|t|)    
(Intercept)                      0.000346 ***
trial_num_centered                < 2e-16 ***
trial_typeIDS                    0.042996 *  
trial_num_centered:trial_typeIDS 0.869803    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) trl_n_ tr_IDS
trl_nm_cntr  0.021              
tril_typIDS  0.058 -0.092       
trl_n_:_IDS -0.014 -0.704  0.080