library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## here() starts at /Users/caoanjie/Desktop/projects/looking_time/preschooler-analysis

source(here("helper/clean_data.R"))

MERGED_DATA_PATH <- here("data/01_merged_data/merged_data.csv")
bing_d <- read_csv(here("data/bing_info.csv"))

## Rows: 212 Columns: 8

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): bing_id, redcap_event_name, redcap_repeat_instrument
## dbl  (4): redcap_repeat_instance, study_name_frank, child_age_today_scheduli...
## date (1): date_of_test
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

bing_d <- bing_d %>% filter(study_name_frank == 16) %>% 
  select(bing_id, child_age_today_scheduling) %>% 
  mutate(child_age_group = case_when(
    child_age_today_scheduling > 3 & child_age_today_scheduling <= 4 ~ "3", 
    child_age_today_scheduling > 4 & child_age_today_scheduling <= 5 ~ "4",
    child_age_today_scheduling > 5 & child_age_today_scheduling <= 6 ~ "5",
  )) %>% 
  ## make sure each kid only participated once 
  distinct(bing_id, .keep_all = TRUE)



raw_df <- read_csv(MERGED_DATA_PATH) %>% 
  left_join(bing_d, by = "bing_id")

## Rows: 9472 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): trial_type, internal_node_id, subject, responses, key_press, block...
## dbl (10): trial_index, time_elapsed, rt, minimum_viewing_duration, trial_loo...
## lgl  (2): success, trial_stimulus
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

preprocessed df

main_d <- tidy_all_rt_task_data(raw_df)

## Adding missing grouping variables: `subject`

adult_d <- read_csv(here("data/adult_data.csv")) %>% mutate(child_age_group = "adult")

## Rows: 18198 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): subject, block_type, task_type, trial_type, stimulus_displayed, tas...
## dbl (5): block_number, deviant_position, trial_number, trial_looking_time, t...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

main_d <- main_d %>% 
  bind_rows(adult_d) %>% 
  mutate(participant_type = case_when(
    child_age_group == "adult" ~ "adult", 
    TRUE ~ "kids"
  )) %>% 
  mutate(deviant_position_print = case_when(
    deviant_position == 2 ~ "Deviant at 2nd trial", 
    deviant_position == 4 ~ "Deviant at 4th trial",
    deviant_position == 6 ~ "Deviant at 6th trial",
    TRUE ~ "No Deviant"
  ))
# missing: memory question

exclusion criteria:

memory question and memory practice

failed_memory_test_kids <- raw_df %>% 
  select(subject, 
         stimulus_type, 
         memory_block_index, button_pressed, correct_answer) %>% 
  filter(stimulus_type == "memory_test") %>% 
  mutate(block_number = memory_block_index + 1, 
         memory_correct = ((button_pressed == 0 & correct_answer == "left") | 
                            (button_pressed == 1 & correct_answer == "right"))) %>% 
  group_by(subject) %>% 
  summarise(sum_correct = sum(as.numeric(memory_correct))) %>% 
  filter(sum_correct <= 4) %>% 
  pull(subject)

failed_memory_practice_kids <- raw_df %>% 
  select(subject, 
         stimulus_type, 
         memory_block_index, button_pressed, correct_answer) %>% 
  filter(stimulus_type == "memory_practice") %>% 
  mutate(block_number = memory_block_index + 1, 
         memory_correct = ((button_pressed == 0 & correct_answer == "left") | 
                            (button_pressed == 1 & correct_answer == "right"))) %>% 
  group_by(subject) %>% 
  summarise(sum_correct = sum(as.numeric(memory_correct))) %>% 
  filter(sum_correct == 0) %>% 
  pull(subject)

looking time

flat_looking_time <- main_d %>% 
  filter(participant_type == "kids") %>% 
  group_by(subject) %>% 
  mutate(sd_lt = (log(sd(trial_looking_time)))) %>% 
  filter(sd_lt < 0.15) %>% 
  pull(subject)

exclude participants

main_d <- main_d %>% 
  filter(!(subject %in% failed_memory_test_kids)) %>% 
  filter(!(subject %in% failed_memory_practice_kids))

exclude trial

summary_lt_d <- main_d %>% 
  filter(participant_type == "kids") %>% 
  summarise(
     median = median(log(trial_looking_time)),
    mad = mad(log(trial_looking_time)), 
    upper = median + 3 * mad, 
    lower = median - 3 * mad) 


main_d <- main_d %>% 
  filter((participant_type == "kids" & log(trial_looking_time) > summary_lt_d$lower 
          & log(trial_looking_time) < summary_lt_d$upper) | participant_type == "adult"
          )

final sample sie

main_d %>% 
  distinct(subject, child_age_group) %>% 
  group_by(child_age_group) %>% 
  count()

## # A tibble: 4 × 2
## # Groups:   child_age_group [4]
##   child_age_group     n
##   <chr>           <int>
## 1 3                  18
## 2 4                  26
## 3 5                  20
## 4 adult             380

 main_d %>% 
  filter(participant_type == "kids") %>% 
  ggplot(aes(x = log(trial_looking_time), fill = as.factor(child_age_group))) + 
  geom_density(alpha = .3)   + 
  facet_wrap(~trial_type)

kids only

 main_d %>% 
  filter(participant_type == "kids") %>% 
  #filter(child_age_group > 3) %>% 
  ggplot(aes(x = trial_number, y = log(trial_looking_time))) +
  stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width = .2)) + 
  stat_summary(geom = "line", fun.data = "mean_cl_boot", position = position_dodge(width = .2)) + 
  facet_wrap(~deviant_position_print)+  
  xlab("Trial Number") + 
    ylab("Looking Time (Log msc)")+

     theme_classic()+
  langcog::scale_color_solarized(name = "Participant Type")

kids vs adults

 main_d %>% 
  #filter(participant_type == "kids") %>% 
  #filter(child_age_group > 3) %>% 
  ggplot(aes(x = trial_number, y = log(trial_looking_time))) +
  stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width = .2),
               aes(color = as.factor(participant_type))) + 
  stat_summary(geom = "line", fun.data = "mean_cl_boot", position = position_dodge(width = .2),
               aes(color = as.factor(participant_type))
              ) + 
  facet_wrap(~deviant_position_print)+  
  xlab("Trial Number") + 
    ylab("Looking Time (Log msc)")+

     theme_classic()+
  langcog::scale_color_solarized(name = "Participant Type")

3 vs 4 vs 5

 main_d %>% 
  filter(participant_type == "kids") %>% 
  ggplot(aes(x = trial_number, y = log(trial_looking_time))) +
  stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width = .2), 
               aes(color = as.factor(child_age_group))) + 
  stat_summary(geom = "line", fun.data = "mean_cl_boot", position = position_dodge(width = .2), 
               aes(color = as.factor(child_age_group))) + 
  facet_wrap(~deviant_position_print)+  
  xlab("Trial Number") + 
  ylab("Looking Time (Log msc)")+
     theme_classic()+
  langcog::scale_color_solarized(name = "Age Group")

3 vs 4 vs 5 vs adults

 main_d %>% 
  #filter(participant_type == "kids") %>% 
  ggplot(aes(x = trial_number, y = log(trial_looking_time))) +
  stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width = .2), 
               aes(color = as.factor(child_age_group))) + 
  stat_summary(geom = "line", fun.data = "mean_cl_boot", position = position_dodge(width = .2), 
               aes(color = as.factor(child_age_group))) + 
  facet_wrap(~deviant_position_print)+  
  xlab("Trial Number") + 
  ylab("Looking Time (Log msc)")+
     theme_classic()+
  langcog::scale_color_solarized(name = "Age Group")

01_preprocessing.rmd

anjie

preprocessed df

exclusion criteria:

memory question and memory practice

looking time

exclude participants

exclude trial

final sample sie

kids only

kids vs adults

3 vs 4 vs 5

3 vs 4 vs 5 vs adults