library(tidyverse)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(knitr)

This Rmd produces cached data files.

Load Typically-Developing Data

# CDI:WS -- typically-developing (TD) sample
# NOTE(review): `eng_ws` is not defined in this script; presumably it is
# restored by this load() call. The file is read with load(), not readRDS(),
# despite its .Rds extension.
base::load("data/eng_ws_raw_data.Rds")

# Drop rows without a category, flag whether `value` equals "produces",
# keep the columns used later and tag every row with the form name.
data_TD_WS <- eng_ws %>%
  filter(!is.na(category)) %>%
  mutate(produces = value == "produces") %>%
  dplyr::select(
    data_id, produces, age, production, sex,
    ethnicity, mom_ed, category, definition
  ) %>%
  mutate(form = "WS")

# Cache the cleaned TD data.
save(data_TD_WS, file = "data/TD_WS.Rdata")

Clean ASD CDI:WS data

Lots of data munging

# Read the tab-delimited CDI:WS export and keep rows whose mcs_vc_total is
# not "999" (presumably a missing-data code -- TODO confirm). Rows where the
# comparison is NA are dropped by filter() as well.
data_AS_WS <- filter(
  read.delim("data/mci_sentences02.txt", header = TRUE, sep = "\t", dec = "."),
  mcs_vc_total != "999"
)

# The first row is used as a description of each column: transpose it into a
# one-column data frame (one row per original column) named "description".
description_WS <- setNames(as.data.frame(t(data_AS_WS[1, ])), "description")


# Columns that are not carried forward (positions 701-709 and 780-850),
# listed as (column name, first-row description) pairs.
eliminated_WS <- gather(
  slice(dplyr::select(data_AS_WS, c(701:709, 780:850)), 1),
  key = "Column names",
  value = "Description"
)
# head(eliminated_WS, 10)

# Descriptions of the dropped columns, printed for reference.
# Grammar items (past tense, future, not present, etc.)
description_WS[["description"]][701:709]
# Complexity and examples (e.g. longest sentences)
description_WS[["description"]][780:850]

# Keep the identifying columns (selected by name) plus columns 21-779 of the
# raw export (selected by position). mci_sentences02_id is used as the
# distinctive row id, alongside the NDAR Global Unique Identifier.
# Original note: starting from 785 is complexity; we kept vocabs, word
# endings, word forms.
data_raw_AS_WS <- data_AS_WS %>%
  dplyr::select(c("mci_sentences02_id", "subjectkey", "interview_age",
                  "collection_id", "dataset_id", "interview_date",
                  "src_subject_id", "sex", 21:779)) %>%
  # These positions refer to the reduced frame produced by the select() above,
  # not to data_AS_WS. (Before interview_age -> sex were added to the
  # selection the equivalent range was 684:692.)
  dplyr::select(-(689:697))

# The first row holds the column descriptions: promote it to the header,
# then remove it from the data.
colnames(data_raw_AS_WS) <- as.character(unlist(data_raw_AS_WS[1, ]))
data_raw_AS_WS <- data_raw_AS_WS[-1, ]

# Replace the long description-style headers of the identifier columns with
# short names.
data_raw_AS_WS <- rename(
  data_raw_AS_WS,
  id = "mci_sentences02_id",
  GUID = "The NDAR Global Unique Identifier (GUID) for research subject",
  age = "Age in months at the time of the interview/test/sampling/imaging.",
  test_date = "Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY",
  src_subject_id = "Subject ID how it's defined in lab/project",
  sex = "Sex of the subject"
)

# Go through as.character() first so a factor column yields its labels
# rather than its level codes.
data_raw_AS_WS$age <- as.numeric(as.character(data_raw_AS_WS$age))

# Reshape to long format: one row per (child, item) with the item header in
# `definition` and the response in `value`. The eight identifier columns are
# kept as-is.
data_clean_AS_WS <- data_raw_AS_WS %>%
  gather(key = "definition", value = "value",
         -c(id, GUID, age, collection_id, dataset_id, test_date,
            src_subject_id, sex)) %>%
  # Item headers look like "<category>. <word>": split at the period + space.
  separate(definition, c("category", "definition"), sep = "\\. ") %>%
  # Blank strings become NA. Restricted to character columns: applying a
  # character "" to the numeric `age` column fails under dplyr >= 1.1, where
  # na_if() casts `y` to the type of `x`.
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  # 0 -> FALSE, any other non-missing entry -> TRUE, NA stays NA.
  mutate(value = value != 0)

# Per (category, item, age): number of TRUE responses, number of remaining
# rows (FALSE and NA alike, since it is computed as n() minus num_true), and
# the proportion of TRUE responses among all rows in the cell.
data_all_AS_WS <- summarise(
  group_by(data_clean_AS_WS, category, definition, age),
  num_true = sum(value, na.rm = TRUE),
  num_false = n() - num_true,
  prop = num_true / n()
)

# unique(data_all_AS_WS$category)

# Keep vocabulary items only and harmonise category / item labels.
#
# 1. Drop the word-form and word-ending sections.
#    NOTE(review): "Word forms, noun" is singular while the other three labels
#    are plural -- confirm it matches the label actually present in the data.
# 2. Map the section headings to short category codes. Several headings appear
#    with misread characters (e.g. "Clothinq", "Outside Thlnqs"), so those
#    spellings are matched literally; headings not listed fall through
#    unchanged and are lower-cased afterwards.
# 3. Map item labels to their target spellings. case_when() uses the first
#    matching branch, so for ambiguous words the category-specific branch must
#    come before the generic fallback for the same word. (A second, unreachable
#    "daddy" branch was removed.)
data_clean_AS_WS <- data_clean_AS_WS %>%
  filter(category != "Word forms, noun",
         category != "Word forms, verbs",
         category != "Word endings, nouns",
         category != "Word endings, verbs") %>%
  mutate(category = case_when(
    category == "Sound Effects and Animal Sounds" ~ "sounds",
    category == "Animals (Real or Toy)" ~ "animals",
    category == "Vehicles (Real or Toy)" ~ "vehicles",
    category == "Food and Drink" ~ "food_drink",
    category == "Clothinq" ~ "clothing",
    category == "Body Parts" ~ "body_parts",
    category == "Small Household Items" ~ "household",
    category == "Furniture and Rooms" ~ "furniture_rooms",
    category == "Outside Thlnqs" ~ "outside",
    category == "Action Words" ~ "action_words",
    category == "Places to Go" ~ "places",
    category == "Helping Verbs" ~ "helping_verbs",
    category == "Connectinq Words" ~ "connecting_words",
    category == "Descriptive Words" ~ "descriptive_words",
    category == "Words About Time" ~ "time_words",
    category == "Quantifiers and Articles" ~ "quantifiers",
    category == "Games and Routines" ~ "games_routines",
    category == "Question Words" ~ "question_words",
    category == "Prepositions and Locations" ~ "locations",
    category == "Helpinq Verbs" ~ "helping_verbs",
    TRUE ~ category
  )) %>%
  mutate(
    category = tolower(category),
    # `category` already holds the short codes here, so the disambiguating
    # branches below can test against them.
    definition = case_when(
      definition == "baa" ~ "baa baa",
      definition == "cockadoodle" ~ "cockadoodledoo",
      definition == "quack" ~ "quack quack",
      definition == "uhoh" ~ "uh oh",
      definition == "woof" ~ "woof woof",
      definition == "yum" ~ "yum yum",
      definition == "chicken" & category == "food_drink" ~ "chicken (food)",
      definition == "chicken" ~ "chicken (animal)",
      definition == "fish" & category == "food_drink" ~ "fish (food)",
      definition == "fish" ~ "fish (animal)",
      definition == "playdough" ~ "play dough",
      definition == "vagina" ~ "vagina*",
      definition == "penis" ~ "penis*",
      definition == "frenchfries" ~ "french fries",
      definition == "greenbeans" ~ "green beans",
      definition == "toy" ~ "toy (object)",
      definition == "drink" & category == "action_words" ~ "drink (action)",
      definition == "drink" ~ "drink (beverage)",
      definition == "gasstation" ~ "gas station",
      definition == "orange" & category == "food_drink" ~ "orange (food)",
      definition == "orange" ~ "orange (description)",
      definition == "allgone" ~ "all gone",
      definition == "water" & category == "food_drink" ~ "water (beverage)",
      definition == "water" ~ "water (not beverage)",
      definition == "feet" ~ "foot",
      definition == "callph" ~ "call (on phone)",
      definition == "clean" & category == "action_words" ~ "clean (action)",
      definition == "clean" ~ "clean (description)",
      definition == "owie  booboo" ~ "owie/boo boo",
      definition == "dont" ~ "don't",
      definition == "5bowl" ~ "bowl",
      definition == "can" & category == "household" ~ "can (object)",
      definition == "can" & category == "helping_verbs" ~ "can (auxiliary)",
      definition == "rockingchair" ~ "rocking chair",
      definition == "alot" ~ "a lot",
      definition == "buttocks/bottom" ~ "buttocks/bottom*",
      definition == "daddy" ~ "daddy*",
      definition == "childname" ~ "child's own name",
      definition == "washingmachine" ~ "washing machine",
      definition == "try" ~ "try/try to",
      definition == "work" & category == "places" ~ "work (place)",
      definition == "work" ~ "work (action)",
      definition == "giveme five" ~ "give me five!",
      definition == "mommy" ~ "mommy*",
      definition == "grandma" ~ "grandma*",
      definition == "church" ~ "church*",
      definition == "grandpa" ~ "grandpa*",
      definition == "patty cake" ~ "pattycake",
      definition == "dry" & category == "action_words" ~ "dry (action)",
      definition == "dry" ~ "dry (description)",
      definition == "lemme" ~ "lemme/let me",
      definition == "tissklee" ~ "tissue/kleenex",
      definition == "did" ~ "did/did ya",
      definition == "gonna get  you" ~ "gonna get you!",
      definition == "peanutbutter" ~ "peanut butter",
      definition == "playpen" ~ "play pen",
      definition == "potatochip" ~ "potato chip",
      definition == "wanna" ~ "wanna/want to",
      definition == "watch" & category == "action_words" ~ "watch (action)",
      definition == "watch" ~ "watch (object)",
      definition == "dress" ~ "dress (object)",
      definition == "gonna" ~ "gonna/going to",
      definition == "gotta" ~ "gotta/got to",
      definition == "hafta" ~ "hafta/have to",
      definition == "highchair" ~ "high chair",
      definition == "lawnmower" ~ "lawn mower",
      definition == "little" ~ "little (description)",
      definition == "petname" ~ "pet's name",
      definition == "so big" ~ "so big!",
      definition == "need" ~ "need/need to",
      definition == "shush" ~ "shh/shush/hush",
      definition == "swing" & category == "action_words" ~ "swing (action)",
      definition == "swing" ~ "swing (object)",
      definition == "slide" & category == "action_words" ~ "slide (action)",
      definition == "slide" ~ "slide (object)",
      TRUE ~ definition
    )
  )

# Cache the cleaned ASD CDI:WS data (reloaded further down with load()).
save(list = "data_clean_AS_WS", file = "data/ASD_WS.Rdata")
# summary(data_all_AS_WS)

Clean ASD CDI:WG data

Note: found a problem in the NDAR description file – mcg_vc18_back is given the definition “backyard” (making a duplicate) instead of “back”. Also, “throw” (col 517) was left out of previous semantic network growth analyses.

# Read the tab-delimited CDI:WG export and keep rows whose mcg_vc_totcom is
# not "999" (presumably a missing-data code -- TODO confirm). Rows where the
# comparison is NA are dropped by filter() as well.
data_AS_WG <- filter(
  read.delim("data/mci_words_gestures01.txt",
             header = TRUE, sep = "\t", dec = "."),
  mcg_vc_totcom != "999"
)

# The first row is used as a description of each column: transpose it into a
# one-column data frame (one row per original column) named "description".
description_WG <- setNames(as.data.frame(t(data_AS_WG[1, ])), "description")

# Only vocabulary is kept; these column positions are the ones set aside.
# (Column 517 lies inside this range but is still selected for the cleaned
# data below.)
eliminated_WG <- dplyr::select(data_AS_WG, c(23:58, 454:520))

# Keep the identifying columns (by name) plus the item columns at positions
# 59-453 and 517. mci_words_gestures01_id is used as the distinctive row id,
# alongside the NDAR Global Unique Identifier.
data_raw_AS_WG <- data_AS_WG %>%
  dplyr::select(c("collection_id", "dataset_id", "sex",
                  "mci_words_gestures01_id", "subjectkey", "interview_age",
                  59:453, 517))

# The first row holds the column descriptions: promote it to the header,
# then remove it from the data.
colnames(data_raw_AS_WG) <- as.character(unlist(data_raw_AS_WG[1, ]))
data_raw_AS_WG <- data_raw_AS_WG[-1, ]

# Columns whose header repeats one seen earlier, kept for inspection.
AS_WG_duplicated <- data_raw_AS_WG[duplicated(names(data_raw_AS_WG))]

# Make the headers unique: repeats get "_1", "_2", ... appended, and the
# recoding further down relies on these suffixes (e.g. "chicken_1").
colnames(data_raw_AS_WG) <- make.unique(colnames(data_raw_AS_WG), sep = "_")


# Replace the long description-style headers with short names.
data_raw_AS_WG <- rename(
  data_raw_AS_WG,
  id = "mci_words_gestures01_id",
  GUID = "The NDAR Global Unique Identifier (GUID) for research subject",
  age = "Age in months at the time of the interview/test/sampling/imaging.",
  house = "MacArthur Words and Gestures: Vocabulary Checklist: House",
  sex = "Sex of the subject"
)

# Go through as.character() first so a factor column yields its labels
# rather than its level codes.
data_raw_AS_WG$age <- as.numeric(as.character(data_raw_AS_WG$age))


# Reshape to long format: one row per (child, item) with the item header in
# `definition` and the response in `value`. The six identifier columns are
# kept as-is.
data_clean_AS_WG <- data_raw_AS_WG %>%
  gather(key = "definition", value = "value",
         -c(id, GUID, age, sex, dataset_id, collection_id)) %>%
  # separate(definition, c("category", "definition"), sep = "\\. ") %>%
  # Blank strings become NA. Restricted to character columns: applying a
  # character "" to the numeric `age` column fails under dplyr >= 1.1, where
  # na_if() casts `y` to the type of `x`.
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  # 0 -> FALSE, any other non-missing entry -> TRUE, NA stays NA.
  mutate(value = value != 0)

# Per (item, age): number of TRUE responses, number of remaining rows (FALSE
# and NA alike, since it is computed as n() minus num_true), and the
# proportion of TRUE responses among all rows in the cell.
data_all_AS_WG <- summarise(
  group_by(data_clean_AS_WG, definition, age),
  num_true = sum(value, na.rm = TRUE),
  num_false = n() - num_true,
  prop = num_true / n()
)

# Harmonise the WG item labels with a lookup table (old label -> new label).
# Labels not listed are left unchanged. Entries ending in "_1" are the second
# occurrence of a repeated header, as produced by make.unique() above.
# Assumes `definition` is a character vector (the lookup is by name).
data_clean_AS_WG <- data_clean_AS_WG %>%
  mutate(
    definition = coalesce(
      unname(c(
        "bye or bye bye" = "bye",
        "chicken" = "chicken (animal)",
        "chicken_1" = "chicken (food)",
        "peek-a-boo" = "peekaboo",
        "water" = "water (beverage)",
        "water_1" = "water (not beverage)",
        "church" = "church*",
        "clean" = "clean (action)",
        "clean_1" = "clean (description)",
        "daddy" = "daddy*",
        "dress" = "dress (object)",
        "towl" = "towel",
        "grandpa" = "grandpa*",
        "grandma" = "grandma*",
        "mommy" = "mommy*",
        "owie/ boo boo" = "owie/boo boo",
        "little" = "little (description)",
        "drink" = "drink (beverage)",
        "drink_1" = "drink (action)",
        "dry" = "dry (description)",
        "fire truck" = "firetruck",
        "fish" = "fish (animal)",
        "fish_1" = "fish (food)",
        "toy" = "toy (object)",
        "teddy bear" = "teddybear",
        "swing" = "swing (object)",
        "swing_1" = "swing (action)",
        "work" = "work (place)",
        "orange" = "orange (food)",
        "patty cake" = "pattycake",
        "slide" = "slide (object)",
        "watch" = "watch (object)",
        "watch_1" = "watch (action)",
        "backyard_1" = "back" # "mcg_vc18_back"
      )[definition]),
      definition # fallback: keep the original label
    )
  )


# Cache the cleaned ASD CDI:WG data (reloaded further down with load()).
save(list = "data_clean_AS_WG", file = "data/ASD_WG.Rdata")

# summary(data_all_AS_WG)

Merging in dataset ID information and cleaning

We need to consider which ASD studies are relevant to include (e.g., some may be from more severe cases than others), as well as which have oddities (e.g., duplicate/mismatched subject IDs). At first glance at the summary by dataset_id, a few of the datasets have all 0 production scores (e.g., dataset_id = c(9137, 17935, 21697, 17999, 17151)). Are these true 0 CDI scores, or are the CDI data for these studies somehow missing? As these studies comprise 1170 participants (of our 4488 total ASD sample), it is important to know what’s going on, and whether or not they should be excluded. I also note that studies 17935 and 21697 suspiciously have the same age range, mean_age, and number of subjects – are these 194 subjects duplicated under different dataset_ids?

For now, I remove all of the datasets without any non-zero production scores.

# Reload the cached intermediate files written earlier in this document.
load("data/ASD_WS.Rdata") # data_clean_AS_WS
load("data/ASD_WG.Rdata") # data_clean_AS_WG
load("data/TD_WS.Rdata") # data_TD_WS

# Per-collection overview of the ASD CDI:WS data.
# Step 1 sums `value` within each (collection_id, age, sex, dataset_id) cell;
# step 2 summarises those cells per collection, so `n` counts cells.
# NOTE(review): `id` is not a grouping key, so children sharing age, sex and
# dataset are pooled into one `production` total. Confirm whether a per-child
# summary (grouping by `id` as well) was intended; changing it would alter
# this table and the zero-production screen derived from it.
AS_WS_summary <- data_clean_AS_WS %>%
  group_by(collection_id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE)) %>%
  group_by(collection_id) %>%
  summarise(min_age = min(age),
            max_age = max(age),
            mean_age = mean(age),
            n = n(),
            mean_prod = mean(production)) %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'collection_id', 'age', 'sex'. You can
## override using the `.groups` argument.
# Render the WS collection summary as a half-width HTML table, one decimal.
kable(AS_WS_summary,
      format = "html",
      table.attr = "style='width:50%;'",
      digits = 1)
collection_id min_age max_age mean_age n mean_prod
2024 12 39 23.6 285 2521.9
2368 17 122 60.5 120 924.4
9 1 48 31.1 64 0.0
2666 17 34 24.8 60 0.0
2664 19 64 36.5 54 500.2
1856 23 47 31.1 25 282.4
2026 23 60 31.8 17 2542.6
2355 17 26 21.3 15 0.0
# Per-collection overview of the ASD CDI:WG data.
# Step 1 sums `value` within each (collection_id, age, sex, dataset_id) cell;
# step 2 summarises those cells per collection, so `n` counts cells.
# NOTE(review): `id` is not a grouping key, so children sharing age, sex and
# dataset are pooled into one `production` total. Confirm whether a per-child
# summary (grouping by `id` as well) was intended; changing it would alter
# this table and the zero-production screen derived from it.
AS_WG_summary <- data_clean_AS_WG %>%
  group_by(collection_id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE)) %>%
  group_by(collection_id) %>%
  summarise(min_age = min(age),
            max_age = max(age),
            mean_age = mean(age),
            n = n(),
            mean_prod = mean(production)) %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'collection_id', 'age', 'sex'. You can
## override using the `.groups` argument.
# Render the WG collection summary as a half-width HTML table, one decimal.
kable(AS_WG_summary,
      format = "html",
      table.attr = "style='width:50%;'",
      digits = 1)
collection_id min_age max_age mean_age n mean_prod
2024 7 20 13.2 193 505.8
2368 13 132 61.5 123 436.8
1885 8 63 35.6 120 0.0
2026 7 21 14.2 83 0.0
1600 0 45 19.9 74 0.0
9 3 48 26.7 70 0.0
19 11 38 22.0 67 3188.0
16 15 30 23.4 56 0.0
8 11 48 27.2 50 0.0
2192 60 173 81.1 30 131.0
2355 11 20 14.0 23 21.1
2666 12 22 16.8 19 0.0
2878 8 19 12.7 19 101.6
2510 59 101 76.1 18 171.2
1952 49 85 68.1 16 0.0
6 10 18 14.1 15 0.0
2503 27 53 36.9 14 0.0
2027 12 29 21.8 13 964.8
2089 17 22 19.5 12 0.0
2169 74 124 106.3 3 0.0

Remove all datasets with all 0 production.

# Collections with no production at all.
# `production` is a sum of logicals and therefore never negative, so its mean
# is exactly 0 only when every summary cell is 0. The former `mean_prod < 1`
# cutoff would also have dropped a collection with small but non-zero
# production, which is not the stated rule.
zero_prod_WG <- filter(AS_WG_summary, mean_prod == 0) %>% pull(collection_id)
zero_prod_WS <- filter(AS_WS_summary, mean_prod == 0) %>% pull(collection_id)

# Drop every row belonging to one of those collections.
data_clean_AS_WG %<>% filter(!(collection_id %in% zero_prod_WG))
data_clean_AS_WS %<>% filter(!(collection_id %in% zero_prod_WS))
# Collection metadata; collection_id and collection_title are joined onto the
# CDI data below.
data_sources <- readr::read_csv(file = "data/NDA data sources.csv")
## Rows: 10 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): collection_title, PI, NDA link, Likely paper link, Notes
## dbl (5): collection_id, n, n_unique, n_unique_id, prop_male
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the collection title to the WS data.
# collection_id goes through as.character() before as.numeric() so that a
# factor column yields the printed id rather than its level code, matching
# how `age` is converted earlier in this file. select() is namespace-qualified
# like the other select() calls in this file.
data_clean_AS_WS %<>%
  mutate(collection_id = as.numeric(as.character(collection_id))) %>%
  left_join(dplyr::select(data_sources, collection_id, collection_title))
## Joining, by = "collection_id"
# Attach the collection title to the WG data.
# collection_id goes through as.character() before as.numeric() so that a
# factor column yields the printed id rather than its level code, matching
# how `age` is converted earlier in this file. select() is namespace-qualified
# like the other select() calls in this file.
data_clean_AS_WG %<>%
  mutate(collection_id = as.numeric(as.character(collection_id))) %>%
  left_join(dplyr::select(data_sources, collection_id, collection_title))
## Joining, by = "collection_id"

We will constrain our analysis to those children close to the intended age ranges, extending a wider margin for older children to capture the attenuated language learning of children with ASD.

# Total production per (id, age, sex, dataset_id): the number of TRUE
# `value` entries, ignoring NA. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
asd_ws <- data_clean_AS_WS %>%
  group_by(id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE))
## `summarise()` has grouped output by 'id', 'age', 'sex'. You can override using
## the `.groups` argument.
# Total production per (id, age): the number of TRUE `value` entries,
# ignoring NA. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
asd_wg <- data_clean_AS_WG %>%
  group_by(id, age) %>%
  summarise(production = sum(value, na.rm = TRUE))
## `summarise()` has grouped output by 'id'. You can override using the `.groups`
## argument.

Note that there are many CDI:WS administrations for ASD children outside the intended age range (16-30 months): 352 children aged <16 months (mean production = 22 words), and 894 children aged >30 months (mean production = 476 words).

Similarly, there are many CDI:WG administrations for ASD children outside the intended age range (8-16 months): 2 children aged <12 months (mean production = 9 words), and 1313 children aged >16 months (mean production = 195 words).

Thus, for the CDI:WS we include 3148 children aged 12-48 months (removing only 5.1% of the data). For the CDI:WG we include 3750 children aged 8-36 months (removing only 7% of the data).

# Restrict each form to its analysis age window (bounds inclusive, in months).
# Rows with a missing age are dropped.
data_clean_AS_WG <- filter(data_clean_AS_WG, age >= 8 & age <= 36)
data_clean_AS_WS <- filter(data_clean_AS_WS, age >= 12 & age <= 48)

Final dataset stats

# Number of distinct ids per collection in the final WG data.
data_clean_AS_WG %>%
  distinct(collection_id, collection_title, id) %>%
  count(collection_id, collection_title) %>%
  kable()
collection_id collection_title n
19 NA 1409
2024 Divergent biases for conspecifics as early markers for Autism Spectum Disorders 1955
2027 NA 61
2355 Neurobehavioral Research on Infants at Risk for Language Delay and ASD 213
2368 Clinical and Immunological Investigations of Subtypes of Autism 62
2878 NA 50
# Number of distinct ids per collection in the final WS data.
data_clean_AS_WS %>%
  distinct(collection_id, collection_title, id) %>%
  count(collection_id, collection_title) %>%
  kable()
collection_id collection_title n
1856 Early Language Development within the Autism Spectrum 90
2024 Divergent biases for conspecifics as early markers for Autism Spectum Disorders 2785
2026 Biomarkers of Developmental Trajectories and Treatment in ASD 114
2368 Clinical and Immunological Investigations of Subtypes of Autism 109
2664 How Autism Affects Speech in Multitalker Environments 50

Save data

# Short aliases under which the final objects are cached.
d_td_ws <- data_TD_WS
d_asd_ws <- data_clean_AS_WS
d_asd_wg <- data_clean_AS_WG

# NOTE(review): this file is written with save(), so despite its .Rds
# extension it must be read back with load(), not readRDS().
save(list = c("d_asd_wg", "d_asd_ws", "d_td_ws"),
     file = "data/cached_data.Rds")