library(tidyverse)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(knitr)
This Rmd produces cached data files.
# CDI:WS
# Load the cached raw CDI:WS data; the pipeline below expects this file to
# provide `eng_ws` (presumably the typically-developing sample -- confirm).
base::load("data/eng_ws_raw_data.Rds")

# Keep only items that have a category, add a logical `produces` flag,
# narrow to the analysis columns and tag every row with the form name.
data_TD_WS <- eng_ws %>%
  filter(!is.na(category)) %>%
  mutate(produces = value == "produces") %>%
  dplyr::select(
    data_id, produces, age, production, sex,
    ethnicity, mom_ed, category, definition
  ) %>%
  mutate(form = "WS")

save(data_TD_WS, file = "data/TD_WS.Rdata")
Lots of data munging
# Read the tab-separated CDI:WS export and drop every row whose mcs_vc_total
# is "999" (presumably a missing-data code -- confirm against the NDAR data
# dictionary). read.delim() already defaults to header = TRUE, sep = "\t",
# dec = ".".
data_AS_WS <- read.delim("data/mci_sentences02.txt") %>%
  filter(mcs_vc_total != "999")

# The first data row carries a text description of each column. Transpose it
# into a one-column lookup table whose row names are the raw column names.
description_WS <- as.data.frame(t(data_AS_WS[1, ]))
names(description_WS) <- "description"

# what we get rid of: the descriptions of the columns dropped further down
eliminated_WS <- data_AS_WS %>%
  dplyr::select(701:709, 780:850) %>%
  slice(1) %>%
  gather(key = "Column names", value = "Description")

# grammar items (past tense, future, not present, etc.)
description_WS[701:709, ]
# complexity and examples (e.g. longest sentences)
description_WS[780:850, ]
# Use mci_sentences02_id as the distinctive row id, alongside the NDAR Global
# Unique Identifier (subjectkey). Keep eight identifier columns plus raw
# columns 21:779 (vocabulary, word endings, word forms); the complexity
# section that follows is left out.
data_raw_AS_WS <- data_AS_WS %>%
  dplyr::select(
    "mci_sentences02_id", "subjectkey", "interview_age",
    "collection_id", "dataset_id", "interview_date",
    "src_subject_id", "sex", 21:779
  ) %>%
  # NOTE(review): positions 689:697 of the narrowed frame presumably map back
  # to raw columns 701:709 (the grammar items) -- confirm that none of the
  # named identifier columns also falls inside 21:779.
  dplyr::select(-(689:697))
#dplyr::select(-(684:692)) # before adding interview_age -> sex...

# Promote the description row to column names, then drop that row.
names(data_raw_AS_WS) <- as.character(unlist(data_raw_AS_WS[1, ]))
data_raw_AS_WS <- data_raw_AS_WS[-1, ]

# Give the identifier columns short names and convert age to numeric.
data_raw_AS_WS <- data_raw_AS_WS %>%
  rename(
    id = "mci_sentences02_id",
    GUID = "The NDAR Global Unique Identifier (GUID) for research subject",
    age = "Age in months at the time of the interview/test/sampling/imaging.",
    test_date = "Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY",
    src_subject_id = "Subject ID how it's defined in lab/project",
    sex = "Sex of the subject"
  ) %>%
  mutate(age = as.numeric(as.character(age)))
# Reshape to long format: one row per administration x checklist item.
# The item column names are split on ". " into a category and a word
# (presumably they have the form "<category>. <word>" -- confirm).
data_clean_AS_WS <- data_raw_AS_WS %>%
  gather(
    key = "definition", value = "value",
    -c(id, GUID, age, collection_id, dataset_id, test_date, src_subject_id, sex)
  ) %>%
  separate(definition, c("category", "definition"), sep = "\\. ") %>%
  # Blank strings become NA. Only character columns are touched: dplyr >= 1.1
  # casts the replacement target to the column type, so applying
  # na_if(., "") to every column (as mutate_all() did) errors on the numeric
  # `age` column. Assumes text columns are character vectors, the
  # read.delim() default in R >= 4.0.
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  # A value of 0 means "not produced"; anything else counts as produced and
  # NA stays NA.
  mutate(value = value != 0)

# Per category x word x age: number of TRUE responses, number of remaining
# rows (FALSE or missing), and the proportion of TRUE over all rows.
data_all_AS_WS <- data_clean_AS_WS %>%
  group_by(category, definition, age) %>%
  summarise(
    num_true = sum(value, na.rm = TRUE),
    num_false = n() - num_true,
    prop = num_true / n()
  )
# unique(data_all_AS_WS$category)
# Keep vocabulary items only, then normalise category and word labels.
# The duplicated (unreachable) "daddy" branch of the original mapping has been
# removed; every other mapping is unchanged and order is preserved, because
# category-specific branches must precede their catch-all counterparts.
data_clean_AS_WS <- data_clean_AS_WS %>%
  # Drop the word-form and word-ending sections.
  filter(
    category != "Word forms, noun",
    category != "Word forms, verbs",
    category != "Word endings, nouns",
    category != "Word endings, verbs"
  ) %>%
  # Map the section titles found in the description row (including their
  # misspellings such as "Clothinq") onto short snake_case category names.
  mutate(category = case_when(
    category == "Sound Effects and Animal Sounds" ~ "sounds",
    category == "Animals (Real or Toy)" ~ "animals",
    category == "Vehicles (Real or Toy)" ~ "vehicles",
    category == "Food and Drink" ~ "food_drink",
    category == "Clothinq" ~ "clothing",
    category == "Body Parts" ~ "body_parts",
    category == "Small Household Items" ~ "household",
    category == "Furniture and Rooms" ~ "furniture_rooms",
    category == "Outside Thlnqs" ~ "outside",
    category == "Action Words" ~ "action_words",
    category == "Places to Go" ~ "places",
    category == "Helping Verbs" ~ "helping_verbs",
    category == "Connectinq Words" ~ "connecting_words",
    category == "Descriptive Words" ~ "descriptive_words",
    category == "Words About Time" ~ "time_words",
    category == "Quantifiers and Articles" ~ "quantifiers",
    category == "Games and Routines" ~ "games_routines",
    category == "Question Words" ~ "question_words",
    category == "Prepositions and Locations" ~ "locations",
    category == "Helpinq Verbs" ~ "helping_verbs",
    TRUE ~ category
  )) %>%
  # `category` is lower-cased first; the definition branches below then test
  # against the already-normalised category values.
  mutate(
    category = tolower(category),
    definition = case_when(
      definition == "baa" ~ "baa baa",
      definition == "cockadoodle" ~ "cockadoodledoo",
      definition == "quack" ~ "quack quack",
      definition == "uhoh" ~ "uh oh",
      definition == "woof" ~ "woof woof",
      definition == "yum" ~ "yum yum",
      definition == "chicken" & category == "food_drink" ~ "chicken (food)",
      definition == "chicken" ~ "chicken (animal)",
      definition == "fish" & category == "food_drink" ~ "fish (food)",
      definition == "fish" ~ "fish (animal)",
      definition == "playdough" ~ "play dough",
      definition == "vagina" ~ "vagina*",
      definition == "penis" ~ "penis*",
      definition == "frenchfries" ~ "french fries",
      definition == "greenbeans" ~ "green beans",
      definition == "toy" ~ "toy (object)",
      definition == "drink" & category == "action_words" ~ "drink (action)",
      definition == "drink" ~ "drink (beverage)",
      definition == "gasstation" ~ "gas station",
      definition == "orange" & category == "food_drink" ~ "orange (food)",
      definition == "orange" ~ "orange (description)",
      definition == "allgone" ~ "all gone",
      definition == "water" & category == "food_drink" ~ "water (beverage)",
      definition == "water" ~ "water (not beverage)",
      definition == "feet" ~ "foot",
      definition == "callph" ~ "call (on phone)",
      definition == "clean" & category == "action_words" ~ "clean (action)",
      definition == "clean" ~ "clean (description)",
      definition == "owie booboo" ~ "owie/boo boo",
      definition == "dont" ~ "don't",
      definition == "5bowl" ~ "bowl",
      definition == "can" & category == "household" ~ "can (object)",
      definition == "can" & category == "helping_verbs" ~ "can (auxiliary)",
      definition == "rockingchair" ~ "rocking chair",
      definition == "alot" ~ "a lot",
      definition == "buttocks/bottom" ~ "buttocks/bottom*",
      definition == "daddy" ~ "daddy*",
      definition == "childname" ~ "child's own name",
      definition == "washingmachine" ~ "washing machine",
      definition == "try" ~ "try/try to",
      definition == "work" & category == "places" ~ "work (place)",
      definition == "work" ~ "work (action)",
      definition == "giveme five" ~ "give me five!",
      definition == "mommy" ~ "mommy*",
      definition == "grandma" ~ "grandma*",
      definition == "church" ~ "church*",
      definition == "grandpa" ~ "grandpa*",
      definition == "patty cake" ~ "pattycake",
      definition == "dry" & category == "action_words" ~ "dry (action)",
      definition == "dry" ~ "dry (description)",
      definition == "lemme" ~ "lemme/let me",
      definition == "tissklee" ~ "tissue/kleenex",
      definition == "did" ~ "did/did ya",
      definition == "gonna get you" ~ "gonna get you!",
      definition == "peanutbutter" ~ "peanut butter",
      definition == "playpen" ~ "play pen",
      definition == "potatochip" ~ "potato chip",
      definition == "wanna" ~ "wanna/want to",
      definition == "watch" & category == "action_words" ~ "watch (action)",
      definition == "watch" ~ "watch (object)",
      definition == "dress" ~ "dress (object)",
      definition == "gonna" ~ "gonna/going to",
      definition == "gotta" ~ "gotta/got to",
      definition == "hafta" ~ "hafta/have to",
      definition == "highchair" ~ "high chair",
      definition == "lawnmower" ~ "lawn mower",
      definition == "little" ~ "little (description)",
      definition == "petname" ~ "pet's name",
      definition == "so big" ~ "so big!",
      definition == "need" ~ "need/need to",
      definition == "shush" ~ "shh/shush/hush",
      definition == "swing" & category == "action_words" ~ "swing (action)",
      definition == "swing" ~ "swing (object)",
      definition == "slide" & category == "action_words" ~ "slide (action)",
      definition == "slide" ~ "slide (object)",
      TRUE ~ definition
    )
  )

save(data_clean_AS_WS, file = "data/ASD_WS.Rdata")
# summary(data_all_AS_WS)
Note: found a problem in the NDAR description file – mcg_vc18_back is given the definition “backyard” (making a duplicate) instead of “back”. Also, “throw” (col 517) was left out of previous semantic network growth analyses.
# Read the tab-separated CDI:WG export and drop every row whose mcg_vc_totcom
# is "999" (presumably a missing-data code -- confirm). read.delim() already
# defaults to header = TRUE, sep = "\t", dec = ".".
data_AS_WG <- read.delim("data/mci_words_gestures01.txt") %>%
  filter(mcg_vc_totcom != "999")

# The first data row carries a text description of each column. Transpose it
# into a one-column lookup table whose row names are the raw column names.
description_WG <- as.data.frame(t(data_AS_WG[1, ]))
names(description_WG) <- "description"

# we only kept vocab: these column ranges are not selected below (apart from
# column 517, which is kept)
eliminated_WG <- data_AS_WG %>%
  dplyr::select(23:58, 454:520)
# Use mci_words_gestures01_id as the distinctive row id, alongside the NDAR
# Global Unique Identifier (subjectkey). Keep six identifier columns plus raw
# columns 59:453 and 517.
data_raw_AS_WG <- data_AS_WG %>%
  dplyr::select(
    "collection_id", "dataset_id", "sex",
    "mci_words_gestures01_id", "subjectkey", "interview_age",
    59:453, 517
  )

# Promote the description row to column names, then drop that row.
names(data_raw_AS_WG) <- as.character(unlist(data_raw_AS_WG[1, ]))
data_raw_AS_WG <- data_raw_AS_WG[-1, ]

# what are the duplicated: columns whose description repeats an earlier one
AS_WG_duplicated <- data_raw_AS_WG[duplicated(names(data_raw_AS_WG))]

# making column names unique: a repeated name gets a "_1", "_2", ... suffix
names(data_raw_AS_WG) <- make.unique(names(data_raw_AS_WG), sep = "_")

# Give the identifier columns short names, shorten the "House" item label and
# convert age to numeric.
data_raw_AS_WG <- data_raw_AS_WG %>%
  rename(
    id = "mci_words_gestures01_id",
    GUID = "The NDAR Global Unique Identifier (GUID) for research subject",
    age = "Age in months at the time of the interview/test/sampling/imaging.",
    house = "MacArthur Words and Gestures: Vocabulary Checklist: House",
    sex = "Sex of the subject"
  ) %>%
  mutate(age = as.numeric(as.character(age)))
# Reshape to long format: one row per administration x checklist item.
data_clean_AS_WG <- data_raw_AS_WG %>%
  gather(
    key = "definition", value = "value",
    -c(id, GUID, age, sex, dataset_id, collection_id)
  ) %>%
  #separate(definition, c("category","definition"),sep = "\\. ") %>%
  # Blank strings become NA. Only character columns are touched: dplyr >= 1.1
  # casts the replacement target to the column type, so applying
  # na_if(., "") to every column (as mutate_all() did) errors on the numeric
  # `age` column. Assumes text columns are character vectors, the
  # read.delim() default in R >= 4.0.
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  # A value of 0 means "not produced"; anything else counts as produced and
  # NA stays NA.
  mutate(value = value != 0)

# Per word x age: number of TRUE responses, number of remaining rows (FALSE
# or missing), and the proportion of TRUE over all rows.
data_all_AS_WG <- data_clean_AS_WG %>%
  group_by(definition, age) %>%
  summarise(
    num_true = sum(value, na.rm = TRUE),
    num_false = n() - num_true,
    prop = num_true / n()
  )
# Normalise the CDI:WG word labels. Every branch is an exact match on a
# distinct key, so the branches are grouped by purpose rather than by
# position in the checklist.
data_clean_AS_WG <- data_clean_AS_WG %>%
  mutate(definition = case_when(
    # Words that appear twice on the form: make.unique() left the first
    # occurrence bare and suffixed the second with "_1".
    definition == "chicken" ~ "chicken (animal)",
    definition == "chicken_1" ~ "chicken (food)",
    definition == "water" ~ "water (beverage)",
    definition == "water_1" ~ "water (not beverage)",
    definition == "clean" ~ "clean (action)",
    definition == "clean_1" ~ "clean (description)",
    definition == "drink" ~ "drink (beverage)",
    definition == "drink_1" ~ "drink (action)",
    definition == "fish" ~ "fish (animal)",
    definition == "fish_1" ~ "fish (food)",
    definition == "swing" ~ "swing (object)",
    definition == "swing_1" ~ "swing (action)",
    definition == "watch" ~ "watch (object)",
    definition == "watch_1" ~ "watch (action)",
    definition == "backyard_1" ~ "back", # "mcg_vc18_back"

    # Single-occurrence words that still get a disambiguating label.
    definition == "dress" ~ "dress (object)",
    definition == "little" ~ "little (description)",
    definition == "dry" ~ "dry (description)",
    definition == "toy" ~ "toy (object)",
    definition == "work" ~ "work (place)",
    definition == "orange" ~ "orange (food)",
    definition == "slide" ~ "slide (object)",

    # Words written with a trailing asterisk.
    definition == "church" ~ "church*",
    definition == "daddy" ~ "daddy*",
    definition == "grandpa" ~ "grandpa*",
    definition == "grandma" ~ "grandma*",
    definition == "mommy" ~ "mommy*",

    # Spelling and spacing normalisations.
    definition == "bye or bye bye" ~ "bye",
    definition == "peek-a-boo" ~ "peekaboo",
    definition == "towl" ~ "towel",
    definition == "owie/ boo boo" ~ "owie/boo boo",
    definition == "fire truck" ~ "firetruck",
    definition == "teddy bear" ~ "teddybear",
    definition == "patty cake" ~ "pattycake",

    # Everything else is kept as is.
    TRUE ~ definition
  ))

save(data_clean_AS_WG, file = "data/ASD_WG.Rdata")
# summary(data_all_AS_WG)
We need to consider which ASD studies are relevant to include (e.g., some may be from more severe cases than others), as well as which have oddities (e.g., duplicate/mismatched subject IDs). At first glance at the summary by dataset_id, a few of the datasets have all 0 production scores (e.g., dataset_id = c(9137, 17935, 21697, 17999, 17151)). Are these true 0 CDI scores, or are the CDI data for these studies somehow missing? As these studies comprise 1170 participants (of our 4488 total ASD sample), it is important to know what’s going on, and whether or not they should be excluded. I also note that studies 17935 and 21697 suspiciously have the same age range, mean_age, and number of subjects – are these 194 subjects duplicated under different dataset_ids?
For now, I remove all of the datasets without any non-zero production scores.
load("data/ASD_WS.Rdata") # data_clean_AS_WS
load("data/ASD_WG.Rdata") # data_clean_AS_WG
load("data/TD_WS.Rdata") # data_TD_WS

# Per-collection overview of the CDI:WS administrations.
# `id` is part of the first grouping so that `production` is the number of
# words produced in ONE administration. Without it, production was summed
# over every child sharing the same collection/age/sex/dataset, which
# inflated mean_prod and made `n` a count of age/sex/dataset combinations
# rather than of administrations.
# NOTE(review): the table rendered below this chunk predates this fix and
# will change on the next knit.
AS_WS_summary <- data_clean_AS_WS %>%
  group_by(collection_id, id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE), .groups = "drop") %>%
  group_by(collection_id) %>%
  summarise(
    min_age = min(age),
    max_age = max(age),
    mean_age = mean(age),
    n = n(),
    mean_prod = mean(production)
  ) %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'collection_id', 'age', 'sex'. You can
## override using the `.groups` argument.
# Render the CDI:WS collection summary as a half-width HTML table, rounded to
# one decimal place.
kable(
  AS_WS_summary,
  format = "html",
  table.attr = "style='width:50%;'",
  digits = 1
)
collection_id | min_age | max_age | mean_age | n | mean_prod |
---|---|---|---|---|---|
2024 | 12 | 39 | 23.6 | 285 | 2521.9 |
2368 | 17 | 122 | 60.5 | 120 | 924.4 |
9 | 1 | 48 | 31.1 | 64 | 0.0 |
2666 | 17 | 34 | 24.8 | 60 | 0.0 |
2664 | 19 | 64 | 36.5 | 54 | 500.2 |
1856 | 23 | 47 | 31.1 | 25 | 282.4 |
2026 | 23 | 60 | 31.8 | 17 | 2542.6 |
2355 | 17 | 26 | 21.3 | 15 | 0.0 |
# Per-collection overview of the CDI:WG administrations.
# `id` is part of the first grouping so that `production` is the number of
# words produced in ONE administration. Without it, production was summed
# over every child sharing the same collection/age/sex/dataset, which
# inflated mean_prod and made `n` a count of age/sex/dataset combinations
# rather than of administrations.
# NOTE(review): the table rendered below this chunk predates this fix and
# will change on the next knit.
AS_WG_summary <- data_clean_AS_WG %>%
  group_by(collection_id, id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE), .groups = "drop") %>%
  group_by(collection_id) %>%
  summarise(
    min_age = min(age),
    max_age = max(age),
    mean_age = mean(age),
    n = n(),
    mean_prod = mean(production)
  ) %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'collection_id', 'age', 'sex'. You can
## override using the `.groups` argument.
# Render the CDI:WG collection summary as a half-width HTML table, rounded to
# one decimal place.
kable(
  AS_WG_summary,
  format = "html",
  table.attr = "style='width:50%;'",
  digits = 1
)
collection_id | min_age | max_age | mean_age | n | mean_prod |
---|---|---|---|---|---|
2024 | 7 | 20 | 13.2 | 193 | 505.8 |
2368 | 13 | 132 | 61.5 | 123 | 436.8 |
1885 | 8 | 63 | 35.6 | 120 | 0.0 |
2026 | 7 | 21 | 14.2 | 83 | 0.0 |
1600 | 0 | 45 | 19.9 | 74 | 0.0 |
9 | 3 | 48 | 26.7 | 70 | 0.0 |
19 | 11 | 38 | 22.0 | 67 | 3188.0 |
16 | 15 | 30 | 23.4 | 56 | 0.0 |
8 | 11 | 48 | 27.2 | 50 | 0.0 |
2192 | 60 | 173 | 81.1 | 30 | 131.0 |
2355 | 11 | 20 | 14.0 | 23 | 21.1 |
2666 | 12 | 22 | 16.8 | 19 | 0.0 |
2878 | 8 | 19 | 12.7 | 19 | 101.6 |
2510 | 59 | 101 | 76.1 | 18 | 171.2 |
1952 | 49 | 85 | 68.1 | 16 | 0.0 |
6 | 10 | 18 | 14.1 | 15 | 0.0 |
2503 | 27 | 53 | 36.9 | 14 | 0.0 |
2027 | 12 | 29 | 21.8 | 13 | 964.8 |
2089 | 17 | 22 | 19.5 | 12 | 0.0 |
2169 | 74 | 124 | 106.3 | 3 | 0.0 |
Remove all datasets with all 0 production.
# Collections whose mean production in the summary tables is below 1.
zero_prod_WG <- AS_WG_summary %>%
  filter(mean_prod < 1) %>%
  pull(collection_id)
zero_prod_WS <- AS_WS_summary %>%
  filter(mean_prod < 1) %>%
  pull(collection_id)

# Drop every administration that belongs to one of those collections.
data_clean_AS_WG <- data_clean_AS_WG %>%
  filter(!(collection_id %in% zero_prod_WG))
data_clean_AS_WS <- data_clean_AS_WS %>%
  filter(!(collection_id %in% zero_prod_WS))

# Metadata about the NDA collections (titles, PIs, links, ...).
data_sources <- read_csv("data/NDA data sources.csv")
## Rows: 10 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): collection_title, PI, NDA link, Likely paper link, Notes
## dbl (5): collection_id, n, n_unique, n_unique_id, prop_male
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the collection title to every CDI:WS row. collection_id is converted
# to numeric first so that it matches the numeric key in data_sources; the
# join key is inferred from the shared column name.
data_clean_AS_WS <- data_clean_AS_WS %>%
  mutate(collection_id = as.numeric(collection_id)) %>%
  left_join(dplyr::select(data_sources, collection_id, collection_title))
## Joining, by = "collection_id"
# Attach the collection title to every CDI:WG row. collection_id is converted
# to numeric first so that it matches the numeric key in data_sources; the
# join key is inferred from the shared column name.
data_clean_AS_WG <- data_clean_AS_WG %>%
  mutate(collection_id = as.numeric(collection_id)) %>%
  left_join(dplyr::select(data_sources, collection_id, collection_title))
## Joining, by = "collection_id"
We will constrain our analysis to those children close to the intended age ranges, extending a wider margin for older children to capture the attenuated language learning of children with ASD.
# Words produced per CDI:WS administration (one row per id/age/sex/dataset).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
asd_ws <- data_clean_AS_WS %>%
  group_by(id, age, sex, dataset_id) %>%
  summarise(production = sum(value, na.rm = TRUE))
## `summarise()` has grouped output by 'id', 'age', 'sex'. You can override using
## the `.groups` argument.
# Words produced per CDI:WG administration (one row per id/age).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
asd_wg <- data_clean_AS_WG %>%
  group_by(id, age) %>%
  summarise(production = sum(value, na.rm = TRUE))
## `summarise()` has grouped output by 'id'. You can override using the `.groups`
## argument.
Note that there are many CDI:WS administrations for ASD children outside the intended age range (16-30 months): 352 children aged <16 months (mean production = 22 words), and 894 children aged >30 months (mean production = 476 words).
Similarly, there are many CDI:WG administrations for ASD children outside the intended age range (8-16 months): 2 children aged <8 months (mean production = 9 words), and 1313 children aged >16 months (mean production = 195 words).
Thus, for the CDI:WS we include 3148 children aged 12-48 months (removing only 5.1% of the data). For the CDI:WG we include 3750 children aged 8-36 months (removing only 7% of the data).
# Restrict each form to its analysis age window (inclusive, in months).
data_clean_AS_WG <- data_clean_AS_WG %>% filter(between(age, 8, 36))
data_clean_AS_WS <- data_clean_AS_WS %>% filter(between(age, 12, 48))

# Number of distinct CDI:WG ids per collection.
data_clean_AS_WG %>%
  distinct(collection_id, collection_title, id) %>%
  count(collection_id, collection_title) %>%
  kable()
collection_id | collection_title | n |
---|---|---|
19 | NA | 1409 |
2024 | Divergent biases for conspecifics as early markers for Autism Spectum Disorders | 1955 |
2027 | NA | 61 |
2355 | Neurobehavioral Research on Infants at Risk for Language Delay and ASD | 213 |
2368 | Clinical and Immunological Investigations of Subtypes of Autism | 62 |
2878 | NA | 50 |
# Number of distinct CDI:WS ids per collection.
data_clean_AS_WS %>%
  distinct(collection_id, collection_title, id) %>%
  count(collection_id, collection_title) %>%
  kable()
collection_id | collection_title | n |
---|---|---|
1856 | Early Language Development within the Autism Spectrum | 90 |
2024 | Divergent biases for conspecifics as early markers for Autism Spectum Disorders | 2785 |
2026 | Biomarkers of Developmental Trajectories and Treatment in ASD | 114 |
2368 | Clinical and Immunological Investigations of Subtypes of Autism | 109 |
2664 | How Autism Affects Speech in Multitalker Environments | 50 |
# Short aliases for the three cleaned data sets that make up the cache.
d_asd_wg <- data_clean_AS_WG
d_asd_ws <- data_clean_AS_WS
d_td_ws <- data_TD_WS

# The cache is written with save(), so despite the .Rds extension it has to be
# read back with load(), not readRDS().
save(
  list = c("d_asd_wg", "d_asd_ws", "d_td_ws"),
  file = "data/cached_data.Rds"
)