Aggregate the Raw Data

# Read every file in RAW_DATA_DIR, stack the results into one data frame, and
# keep only the columns and trial types used by the rest of the analysis.
raw_df <- list.files(RAW_DATA_DIR, full.names = TRUE) %>%
  map_df(read_csv) %>%
  select(
    rt, trial_type, trial_index, subject, trial_stimulus, trial_stimulus_type,
    task_type, task_target_stimulus, task_background_stimulus,
    task_deviant_stimuli, task_order_number, block_order_number,
    trial_looking_time, trial_pressed_space_bar, trial_space_bar_rt,
    responses, question_order
  ) %>%
  # Keep the stimulus trials plus the four demographic question types.
  filter(
    trial_type %in% c(
      "stimulus-presentation",
      "demog-age",
      "demog-gender-and-education",
      "demog-ethnic-US",
      "demog-disorder-history"
    )
  ) %>%
  mutate(
    # Label a trial by the substring found in trial_stimulus; rows matching
    # neither pattern get NA.
    stimuli_type = case_when(
      grepl("complex", trial_stimulus) ~ "complex",
      grepl("simple", trial_stimulus) ~ "simple"
    )
  )

# Save the aggregated data so later steps can start from a single file.
write_csv(raw_df, AGGREGATED_DATA_PATH)

raw_df %>% datatable()

Extract the Demog Information

# Build a table with one row of demographic answers per subject.
demog_df <- raw_df %>%
  filter(grepl("demog", trial_type)) %>%
  select(subject, trial_type, responses) %>%
  # JSON round trip kept from the original pipeline.
  # NOTE(review): presumably this converts the tibble to a plain data frame --
  # confirm it is still needed.
  toJSON() %>%
  fromJSON() %>%
  mutate(
    # Parse each JSON `responses` string into its own small data frame.
    demog_question = map(responses, ~ fromJSON(.) %>% as.data.frame())
  ) %>%
  unnest(demog_question) %>%
  group_by(subject) %>%
  # Copy each subject's first non-missing value of every column onto all of
  # that subject's rows, so the rows become identical and distinct() can
  # collapse them to one.
  mutate(across(everything(), ~ .x[!is.na(.x)][1])) %>%
  ungroup() %>%
  distinct() %>%
  # Drop both helper columns. The original `select(-trial_type, responses)`
  # kept `responses`, which then clashed with raw_df's `responses` on join.
  select(-trial_type, -responses)

demog_df %>% datatable()

joining back

# Attach each subject's demographic answers to all of that subject's rows.
data_with_demog <- raw_df %>%
  left_join(demog_df, by = "subject")
datatable(data_with_demog)

Exclusion criteria applied

filter out based on demog (neurological disorder question)

# Keep only rows from subjects who did not answer "Yes" to any of the four
# disorder-history questions.
# filter() needs element-wise conditions: the scalar `&&` used before tested
# only the first row (and is an error for length > 1 input in R >= 4.3).
# Comma-separated conditions are combined with an element-wise AND.
# Rows where an answer is NA are dropped, because the comparison yields NA.
demog_filtered <- data_with_demog %>%
  filter(
    current_neuro != "Yes",
    past_neuro != "Yes",
    current_ld != "Yes",
    past_ld != "Yes"
  )

demog_filtered

## # A tibble: 0 x 27
## # … with 27 variables: rt <chr>, trial_type <chr>, trial_index <dbl>,
## #   subject <chr>, trial_stimulus <chr>, trial_stimulus_type <chr>,
## #   task_type <chr>, task_target_stimulus <chr>,
## #   task_background_stimulus <chr>, task_deviant_stimuli <chr>,
## #   task_order_number <chr>, block_order_number <chr>,
## #   trial_looking_time <dbl>, trial_pressed_space_bar <chr>,
## #   trial_space_bar_rt <dbl>, responses.x <chr>, question_order <chr>,
## #   stimuli_type <chr>, responses.y <chr>, age <fct>, ethnicity <fct>,
## #   gender <fct>, education <fct>, current_neuro <fct>, past_neuro <fct>,
## #   current_ld <fct>, past_ld <fct>

participant-level exclusion: pressing space bar

## per-subject proportion of target trials on which the space bar was NOT pressed
prop_press_space_summary <- data_with_demog %>%
  select(subject, trial_pressed_space_bar, trial_stimulus_type) %>%
  # A missing press value is treated as "no".
  mutate(
    trial_pressed_space_bar = if_else(
      is.na(trial_pressed_space_bar), "no", trial_pressed_space_bar
    )
  ) %>%
  filter(trial_stimulus_type == "target") %>%
  group_by(subject) %>%
  # Count both outcomes directly. Pivoting the counts wide would leave out the
  # `no` (or `yes`) column whenever no subject had that outcome, and the
  # proportion step would then fail with "object not found".
  summarize(
    no = sum(trial_pressed_space_bar == "no"),
    yes = sum(trial_pressed_space_bar == "yes"),
    no_press_prop = no / (no + yes),
    .groups = "drop"
  ) %>%
  select(subject, no_press_prop)

## `summarise()` regrouping output by 'subject' (override with `.groups` argument)

## per-subject proportion of non-target trials on which the space bar WAS pressed
prop_wrong_press_summary <- data_with_demog %>%
  select(subject, trial_pressed_space_bar, trial_stimulus_type) %>%
  # A missing press value is treated as "no".
  mutate(
    trial_pressed_space_bar = if_else(
      is.na(trial_pressed_space_bar), "no", trial_pressed_space_bar
    )
  ) %>%
  filter(trial_stimulus_type != "target") %>%
  group_by(subject) %>%
  # Count both outcomes directly. Pivoting the counts wide would leave out the
  # `yes` column whenever no subject ever pressed on a non-target trial, and
  # the proportion step would then fail with "object not found".
  summarize(
    no = sum(trial_pressed_space_bar == "no"),
    yes = sum(trial_pressed_space_bar == "yes"),
    wrong_press_prop = yes / (no + yes),
    .groups = "drop"
  ) %>%
  select(subject, wrong_press_prop)

## `summarise()` regrouping output by 'subject' (override with `.groups` argument)

# Print the per-subject non-target press proportions for inspection.
prop_wrong_press_summary

## # A tibble: 2 x 2
## # Groups:   subject [2]
##   subject         wrong_press_prop
##   <chr>                      <dbl>
## 1 SS1602727692346          0.00481
## 2 SS1602730104581          0.0481

# Add both per-subject space-bar proportions to the trial-level data.
data_with_demog_spacebar <- data_with_demog %>%
  left_join(prop_press_space_summary, by = "subject") %>%
  left_join(prop_wrong_press_summary, by = "subject")

# Rows from subjects whose target no-press proportion or non-target press
# proportion is above 0.25.
# NOTE(review): this keeps the subjects above the threshold rather than
# removing them -- confirm which direction is intended.
filtered_spacebar <- filter(
  data_with_demog_spacebar,
  no_press_prop > 0.25 | wrong_press_prop > 0.25
)

trial-level exclusion:????

# Mean and SD of looking time over all stimulus-presentation trials, plus the
# mean +/- 3 SD bounds used as trial-level cutoffs.
summary_lt <- data_with_demog %>%
  filter(trial_type == "stimulus-presentation") %>%
  # Coerce once so that the mean and the SD are computed on the same numeric
  # values (previously only the mean was coerced).
  mutate(looking_time = as.numeric(trial_looking_time)) %>%
  summarize(
    mean_lt = mean(looking_time, na.rm = TRUE),
    sd_lt = sd(looking_time, na.rm = TRUE),
    upper_lt = mean_lt + 3 * sd_lt,
    lower_lt = mean_lt - 3 * sd_lt
  )

# Scalar cutoffs reused by the trial-level exclusion and the histogram.
UPPER_LT <- pull(summary_lt, upper_lt)
LOWER_LT <- pull(summary_lt, lower_lt)
summary_lt %>% kable()

mean_lt	sd_lt	upper_lt	lower_lt
1077.792	628.1518	2962.248	-806.6632

# Stimulus-presentation trials whose looking time falls outside the
# LOWER_LT / UPPER_LT bounds.
trial_lt_to_cut <- data_with_demog %>%
  filter(
    trial_type == "stimulus-presentation",
    trial_looking_time < LOWER_LT | trial_looking_time > UPPER_LT
  )

# check if the participants are missing 25% trials after applying the trial-based exclusion criteria
# Total number of stimulus-presentation trials per subject.
num_total_trials <- data_with_demog %>%
  filter(trial_type == "stimulus-presentation") %>%
  group_by(subject) %>%
  count()

# Number of flagged trials per subject. trial_lt_to_cut already contains only
# stimulus-presentation rows, so no further trial_type filter is needed.
num_to_cut_trials <- trial_lt_to_cut %>%
  group_by(subject) %>%
  count()

kable(num_total_trials)

subject	n
SS1602727692346	244
SS1602730104581	244

# Show how many trials per subject fall outside the looking-time bounds.
num_to_cut_trials %>% kable()

subject	n
SS1602727692346	9
SS1602730104581	1

transformation needed?

# Histogram of looking times, with a vertical line at the upper cutoff.
ggplot(data_with_demog, aes(x = as.numeric(trial_looking_time))) +
  geom_histogram() +
  geom_vline(xintercept = UPPER_LT)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 8 rows containing non-finite values (stat_bin).

plots

# Mean looking time with a 95% t-based confidence interval for every
# stimulus type, shown in one panel per task type.
data_with_demog %>%
  filter(trial_type == "stimulus-presentation") %>%
  mutate(looking_time = as.numeric(trial_looking_time)) %>%
  group_by(trial_stimulus_type, task_type) %>%
  summarize(
    mean = mean(looking_time, na.rm = TRUE),
    sd = sd(looking_time, na.rm = TRUE),
    # Count only non-missing values. mean and sd ignore NAs, so using the raw
    # row count would make the interval too narrow when values are missing.
    n = sum(!is.na(looking_time)),
    ci_range_95 = qt(1 - (0.05 / 2), n - 1) * (sd / sqrt(n)),
    ci_lower = mean - ci_range_95,
    ci_upper = mean + ci_range_95,
    .groups = "drop"
  ) %>%
  ggplot(aes(x = trial_stimulus_type, y = mean)) +
  # geom_point still shows the mean for cells whose interval is undefined.
  geom_point() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~task_type)

## `summarise()` regrouping output by 'trial_stimulus_type' (override with `.groups` argument)

original plot?

# Display order for the task types on the x-axis.
task_type_order <- c("all_simple", "all_complex", "mixed_simple_deviant", "mixed_complex_deviant")

# Mean rt with a 95% t-based confidence interval for every task type on
# non-target trials, shown in one panel per stimulus type.
data_with_demog %>%
  filter(trial_type == "stimulus-presentation") %>%
  filter(trial_stimulus_type != "target") %>%
  mutate(
    rt_num = as.numeric(rt),
    # Apply task_type_order, which was previously defined but never used, so
    # the axis followed alphabetical order. Task types missing from the
    # vector are appended after it rather than turned into NA.
    task_type = factor(task_type, levels = union(task_type_order, task_type))
  ) %>%
  group_by(trial_stimulus_type, task_type) %>%
  summarize(
    mean = mean(rt_num, na.rm = TRUE),
    sd = sd(rt_num, na.rm = TRUE),
    # Count only non-missing values. mean and sd ignore NAs, so using the raw
    # row count would make the interval too narrow when values are missing.
    n = sum(!is.na(rt_num)),
    ci_range_95 = qt(1 - (0.05 / 2), n - 1) * (sd / sqrt(n)),
    ci_lower = mean - ci_range_95,
    ci_upper = mean + ci_range_95,
    .groups = "drop"
  ) %>%
  ggplot(aes(x = task_type, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~trial_stimulus_type) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

## `summarise()` regrouping output by 'trial_stimulus_type' (override with `.groups` argument)

final clean data

(Note: no participants or trials are actually excluded yet, so this is not truly the final data set; once the exclusions are applied, the cleaned output should look similar to this.)

# Build the analysis-ready table: stimulus-presentation trials only, with the
# output columns renamed and ordered in a single transmute() step.
clean_data <- data_with_demog_spacebar %>%
  filter(trial_type == "stimulus-presentation") %>%
  transmute(
    subject,
    task_type,
    task_order_number,
    # Remove the surrounding <img> markup so only the image path remains.
    trial_stimulus_path = gsub(
      "' width ='500' height = '500' style='border:5px solid black'>",
      "",
      gsub("<img src='", "", trial_stimulus)
    ),
    trial_stimulus_type,
    trial_stimulus_complexity = stimuli_type,
    trial_looking_time,
    # A missing press value is treated as "no".
    trial_pressed_space_bar = if_else(
      is.na(trial_pressed_space_bar), "no", trial_pressed_space_bar
    ),
    trial_space_bar_rt,
    target_no_press_percent = no_press_prop,
    non_target_press_percent = wrong_press_prop,
    demog_age = age,
    demog_ethnicity = ethnicity,
    demog_gender = gender,
    demog_education = education
  )

datatable(clean_data)

Data Preprocessing and Excluding

anjie

10/14/2020

Aggregate the Raw Data

Extract the Demog Information

Exclusion criteria applied

filter out based on demog (neurological disorder question)

participant-level exclusion: pressing space bar

trial-level exclusion:????

plots

final clean data