Thanks to a reproduction attempt by Hartshorne et al., I discovered a bug in the distributed data. I believe that the code to tidy and create CSVs from the original data was flawed: in particular, when de-identifying the subject IDs (yes, some were identifiable; I was young and inexperienced), I appear to have incorrectly renamed subjects, leading to redundancy.

Further, I identified a bug in the subject exclusion code: I was iteratively removing subjects, meaning that the mean and SD used for exclusion changed as the loop below progressed.

% For each condition, remove scores more than 2 SD below that condition's
% mean; the cutoff is recomputed from whatever data survive, so repeated
% application keeps shifting the mean and SD.
for i = 1:length(vals)
    vals{i}(vals{i} < mean(vals{i}) - (2 * std(vals{i}))) = [];
end
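
To see why this is a problem, here’s a toy demonstration in R (made-up accuracy values, not the real data): removing the lowest score raises the mean and shrinks the SD, so a score that survived the first cutoff can fail the recomputed one.

x <- c(0.1, 0.55, rep(0.9, 8))
x[x < mean(x) - 2 * sd(x)]          # a one-shot pass excludes only 0.1
x2 <- x[x >= mean(x) - 2 * sd(x)]   # remove it and recompute the cutoff
x2[x2 < mean(x2) - 2 * sd(x2)]      # now 0.55 falls below the new cutoff too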

Anonymization

library(tidyverse)
library(magrittr) # needed for the %<>% assignment pipe used below
library(digest)
library(here)
## here() starts at /Users/mcfrank/Old Projects/Segmentation/word_seg

First, read in the data.

Note that these paths are absolute rather than relative, and that they point to the non-anonymized raw data, which is not available for distribution.
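
If you want to reproduce from the distributed files instead, the corrected CSVs written out below should load as a drop-in replacement (a sketch, assuming the repo’s updated_data directory):

# my addition: load the de-identified CSV produced later in this script
d1 <- read_csv(here("updated_data/FGGT-E1-corrected-data.csv"))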

d1 <- read_tsv("~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt",
               col_names = c("subid","sent.len","timestamp","word.len","rt","keypress","correct"), 
               col_types = "cncnncn")
## Warning: 3022 parsing failures.
## row col  expected    actual                                                                                        file
##   1  -- 7 columns 8 columns '~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt'
##   2  -- 7 columns 8 columns '~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt'
##   3  -- 7 columns 8 columns '~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt'
##   4  -- 7 columns 8 columns '~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt'
##   5  -- 7 columns 8 columns '~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E1-data.txt'
## ... ... ......... ......... ...........................................................................................
## See problems(...) for more details.
d2 <- read_tsv("~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E2-data-mod-11-18-20.txt",
               col_names = c("subid","n.tokens","timestamp","word.len","rt","keypress","correct"),
               col_types = "cncnncn")
d3 <- read_tsv("~/Old Projects/Segmentation/seg_compare/expts/data/data_for_distribution/FGGT-E3-data-mod-11-18-20.txt",
               col_names = c("subid","n.types","timestamp","word.len","rt","keypress","correct"),
               col_types = "cncnncn")
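
Those parsing failures look benign: the raw E1 file evidently carries an extra eighth field (probably a trailing tab), and readr keeps only the seven named columns. To inspect the affected rows:

# peek at the rows behind the 7-vs-8-column warnings
problems(d1) %>% head()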

Now output the data in de-identified form via hashing.

d1 %<>%
  group_by(subid, sent.len) %>%
  # replace each subject's ID with an md5 hash of their original ID plus
  # their first timestamp (digest() defaults to md5)
  mutate(subid = digest::digest(str_c(subid[1], timestamp[1])))

write_csv(d1, here("updated_data/FGGT-E1-corrected-data.csv"))

d2 %<>%
  group_by(subid, n.tokens) %>%
  mutate(subid = digest::digest(str_c(subid[1], timestamp[1])))

write_csv(d2, here("updated_data/FGGT-E2-corrected-data.csv"))

d3 %<>%
  group_by(subid, n.types) %>%
  mutate(subid = digest::digest(str_c(subid[1], timestamp[1])))

write_csv(d3, here("updated_data/FGGT-E3-corrected-data.csv"))
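
As a quick sanity check (my addition, not part of the original pipeline): digest() defaults to md5, so every de-identified ID should now be a 32-character hex string.

# confirm every subject ID was replaced by a well-formed md5 hash
stopifnot(all(str_detect(d1$subid, "^[0-9a-f]{32}$")),
          all(str_detect(d2$subid, "^[0-9a-f]{32}$")),
          all(str_detect(d3$subid, "^[0-9a-f]{32}$")))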

E1 analysis

Should have 101 subjects, with 3 excluded.

length(unique(d1$subid))
## [1] 101

Check the number of trials.

d1_trials <- d1 %>%
  group_by(subid) %>%
  count %>%
  pull

all(d1_trials <= 30)
## [1] TRUE

Reproduce the incorrect exclusions from E1. This function is NOT the right way to exclude, but it follows my old MATLAB code.

exclude_iteratively <- function (df) {
  exclusions <- c()
  
  # Walk the rows one at a time; whenever a subject falls more than 2 SD
  # below the mean, NA out their score before moving on. Because mean() and
  # sd() are recomputed (with na.rm = TRUE) on every iteration, the cutoff
  # shifts as subjects drop out; hence the order dependence demonstrated below.
  for (i in 1:length(df$subid)) {
    if (df$correct[i] < mean(df$correct, na.rm=TRUE) - 2 * sd(df$correct, na.rm=TRUE)) {
      df$correct[i] <- NA
      exclusions <- c(exclusions, df$subid[i])
    }
  }
  
  return(exclusions)
}

Subject means

d1_ms <- d1 %>%
  group_by(subid, sent.len) %>%
  summarise(correct = mean(correct))
## `summarise()` regrouping output by 'subid' (override with `.groups` argument)

Running the bad exclusion function still doesn’t reproduce the 3 exclusions from the original code; that’s likely because it is actually ORDER DEPENDENT, unfortunately.

d1_ms %>%
  split(.$sent.len) %>%
  map(exclude_iteratively) 
## $`1`
## [1] "29b8bf7ee10071dccbdde5818a3b4158"
## 
## $`2`
## [1] "2f74a2982d79513963d841a14aa7ba92" "b0f94505890f822b634777945a7c5b57"
## 
## $`3`
## [1] "8c97f9ddaa787c949f70544814c94678" "a5fb9d7117e3a561b7871a2f9d162ab4"
## 
## $`4`
## NULL
## 
## $`6`
## NULL
## 
## $`8`
## NULL
## 
## $`12`
## NULL
## 
## $`24`
## [1] "fc10ccdc6269971068fae07a62030c6b"

Demonstration that if you arrange the data differently, this exclusion does different things.

d1_ms %>%
  group_by(sent.len) %>%
  arrange(correct) %>%
  split(.$sent.len) %>%
  map(exclude_iteratively) 
## $`1`
## [1] "29b8bf7ee10071dccbdde5818a3b4158"
## 
## $`2`
## [1] "2f74a2982d79513963d841a14aa7ba92" "b0f94505890f822b634777945a7c5b57"
## [3] "41ab246772f8ade11537f963ab44d398"
## 
## $`3`
## [1] "8c97f9ddaa787c949f70544814c94678" "a5fb9d7117e3a561b7871a2f9d162ab4"
## 
## $`4`
## NULL
## 
## $`6`
## NULL
## 
## $`8`
## NULL
## 
## $`12`
## NULL
## 
## $`24`
## [1] "fc10ccdc6269971068fae07a62030c6b" "ab4d859dd238e5d0b446eebbd64e6d16"

Sad. Here’s a tidier (and more correct) way to do it. Now we get 4 exclusions.

d1_exclusions <- d1 %>%
  group_by(subid, sent.len) %>%
  summarise(correct = mean(correct)) %>%
  group_by(sent.len) %>% 
  mutate(cond_mean = mean(correct), 
         cond_sd = sd(correct), 
         exclude = correct < cond_mean - 2*cond_sd) %>%
  filter(exclude) %>%
  pull(subid)
## `summarise()` regrouping output by 'subid' (override with `.groups` argument)
d1_exclusions
## [1] "29b8bf7ee10071dccbdde5818a3b4158" "2f74a2982d79513963d841a14aa7ba92"
## [3] "8c97f9ddaa787c949f70544814c94678" "fc10ccdc6269971068fae07a62030c6b"
d1_paper_mss <- d1 %>%
  filter(!(subid %in% d1_exclusions)) %>%
  group_by(sent.len, subid) %>%
  summarise(correct = mean(correct)) 
## `summarise()` regrouping output by 'sent.len' (override with `.groups` argument)
d1_paper_ms <- d1_paper_mss %>%
  group_by(sent.len) %>%
  summarise(mean = mean(correct))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(d1_paper_mss, 
       aes(x = sent.len, y = correct)) + 
  geom_jitter(height = 0, width = .1, alpha = .5) + 
  geom_line(data = d1_paper_ms, aes(x = sent.len, y = mean))

E2 analysis

Should have 72 subjects, with 0 excluded. Instead we have 73, because subject ID 74 got used twice sequentially in the same condition. I modified the data on 11/18/20 to rename this subject and make the IDs unique.
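
For the record, this kind of reuse is easy to surface from trial counts. A sketch (on the corrected file it should return zero rows, since each subject now contributes at most 30 trials):

# a reused ID in the raw file would carry more than one subject's worth of
# trials; after the 11/18/20 fix, nothing should exceed 30
d2 %>%
  ungroup() %>%
  count(subid) %>%
  filter(n > 30)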

length(unique(d2$subid))
## [1] 73

Check the number of trials.

d2_trials <- d2 %>%
  group_by(subid) %>%
  count %>%
  pull

all(d2_trials <= 30)
## [1] TRUE

We reported 0 exclusions. Here’s the current exclusion list.

d2_exclusions <- d2 %>%
  group_by(subid, n.tokens) %>%
  summarise(correct = mean(correct)) %>%
  group_by(n.tokens) %>% 
  mutate(cond_mean = mean(correct), 
         cond_sd = sd(correct), 
         exclude = correct < cond_mean - 2*cond_sd) %>%
  filter(exclude) %>%
  pull(subid)
## `summarise()` regrouping output by 'subid' (override with `.groups` argument)
d2_exclusions
## character(0)
d2_paper_mss <- d2 %>%
  filter(!(subid %in% d2_exclusions)) %>%
  group_by(n.tokens, subid) %>%
  summarise(correct = mean(correct)) 
## `summarise()` regrouping output by 'n.tokens' (override with `.groups` argument)
d2_paper_ms <- d2_paper_mss %>%
  group_by(n.tokens) %>%
  summarise(mean = mean(correct))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(d2_paper_mss, 
       aes(x = n.tokens, y = correct)) + 
  geom_jitter(height = 0, width = .1, alpha = .5) + 
  geom_line(data = d2_paper_ms, aes(x = n.tokens, y = mean))

E3 analysis

Should have 64 participants, with 3 excluded. But in fact we have the same subid reuse issue, this time with six different subjects. Ugh. So we have:

length(unique(d3$subid))
## [1] 69

Check the number of trials.

d3_trials <- d3 %>%
  group_by(subid) %>%
  count %>%
  pull

all(d3_trials <= 30)
## [1] TRUE

We reported two exclusions. Here’s what we get with the current dataset.

d3_exclusions <- d3 %>%
  group_by(subid, n.types) %>%
  summarise(correct = mean(correct)) %>%
  group_by(n.types) %>% 
  mutate(cond_mean = mean(correct), 
         cond_sd = sd(correct), 
         exclude = correct < cond_mean - 2*cond_sd) %>%
  filter(exclude) %>%
  pull(subid)
## `summarise()` regrouping output by 'subid' (override with `.groups` argument)
d3_exclusions
## [1] "30905b7c381c105a94ab7eadc97f061a"
d3_paper_mss <- d3 %>%
  filter(!(subid %in% d3_exclusions)) %>%
  group_by(n.types, subid) %>%
  summarise(correct = mean(correct)) 
## `summarise()` regrouping output by 'n.types' (override with `.groups` argument)
d3_paper_ms <- d3_paper_mss %>%
  group_by(n.types) %>%
  summarise(mean = mean(correct))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(d3_paper_mss, 
       aes(x = n.types, y = correct)) + 
  geom_jitter(height = 0, width = .1, alpha = .5) + 
  geom_line(data = d3_paper_ms, aes(x = n.types, y = mean))