General data cleaning
# set factor levels where appropriate
abcd <- abcd %>%
mutate(
family_id = factor(family_id),
site_id = factor(site_id),
Rship2child=factor(Rship2child,
levels = c(1,2,3,4,5),
labels = c("BioMom", "BioDad", "AdoptParent", "CustodParent", "Other")),
anyfamilyinABCD = factor(anyfamilyinABCD,
levels = c(0,1,2,3),
labels = c("no other fam in ABCD", "sibling in ABCD", "twin in ABCD", "triplets in ABCD")),
sex_atbirth = factor(sex_atbirth,
levels = c(1,2,3,4),
labels = c("male", "female", "intersex-male", "intersex-female")),
sex = factor(sex,
levels = c("F", "M"),
labels = c("F", "M")),
gender_identity = factor(gender_identity,
levels = c(1,2,3,4,5,6),
labels = c("male", "female", "trans-male", "trans-female", "queer", "other")),
language = factor(language,
levels = c(0,1),
labels = c("English", "Spanish")),
plannedpreg = factor(plannedpreg,
levels = c(0,1),
labels = c("No", "Yes")),
motordev = factor(motordev,
levels = c(1,2,3,4,5),
labels = c("much earlier","somewhat earlier","about average","somewhat later", "much later")),
speechdev = factor(speechdev,
levels = c(1,2,3,4,5),
labels = c("much earlier","somewhat earlier","about average","somewhat later", "much later")),
race_ethnicity = factor(race_ethnicity,
levels = c(1,2,3,4,5),
labels = c("White", "Black", "Hispanic", "Asian", "Other")),
demo_comb_income_v2 = factor(demo_comb_income_v2,
levels = c(1,2,3,4,5,6,7,8,9,10),
labels = c("< $5k", "$5k-11,999", "$12k-15,999", "$16k-24,999", "$25k-34,999",
"$35k-49,999", "$50k-74,999", "$75k-99,999", "$100k-199,999", "$200k +")),
grades= factor(grades,
levels = c(1,2,3,4,5,6,-1),
labels = c("As, excellent", "Bs, good", "Cs, average", "Ds, below ave", "Fs, struggling", "ungraded", NA_character_)),
school_type= factor(school_type,
levels = c(1:9),
labels = c("Not in school", "Regular public", "Regular private", "Voc/Tech", "Cyber", "Home School", "Special. for emot/beh probs", "Other", "Charter")),
cat_medhx = factor(cat_medhx,
levels = c(0,1,2,3),
labels = c("none", "1", "2", "3 or more")),
mhserv_ever = factor(mhserv_ever,
levels = c(0,1,3),
labels = c("No", "Yes", NA_character_)),
demo_prnt_ed_v2 = factor(demo_prnt_ed_v2),
demo_prtnr_ed_v2 = factor(demo_prtnr_ed_v2),
demo_prnt_empl_v2 = factor(demo_prnt_empl_v2),
demo_prtnr_empl_v2 = factor(demo_prtnr_empl_v2),
demo_prnt_marital_v2 = factor(demo_prnt_marital_v2,
levels = c(1:6),
labels = c("married", "widowed", "divorced", "separated", "never married", "living with partner")),
devhx_10_prenatvitamins = factor(devhx_10_prenatvitamins,
levels = c(-1,0,1),
labels = c(NA_character_, "No", "Yes")),
demo_prnt_prtnr_v2 = factor(demo_prnt_prtnr_v2,
levels = c(1,2),
labels = c("Yes", "No"))) %>%
dplyr::rename(
combined_income = demo_comb_income_v2,
resps_partner = demo_prnt_prtnr_v2,
parent1_ed = demo_prnt_ed_v2 ,
parent2_ed =demo_prtnr_ed_v2,
parent1_emp = demo_prnt_empl_v2 ,
parent2_emp = demo_prtnr_empl_v2
)
# declare all remaining categorical variables as such
cols <- c('twin','twinortriplet', 'biomom', 'car_accid', 'other_accid', 'fire', 'natur_disast', 'terrorism', 'war_death', 'community_viol', 'attack_bynonfam', 'attack_byfam', 'beaten_byfam', 'deaththreat_nonfam', 'deaththreat_fam', 'witness_dom_viol', 'sexabuse_fam', 'sexabuse_nonfam', 'sexabuse_peer', 'death_loved', 'mhserv_mh_outpat', 'mhserv_mh_parthosp' , 'mhserv_mh_hosp' , 'mhserv_subs_outpat' , 'mhserv_subs_parthosp' , 'mhserv_subs_hosp' , 'mhserv_therapy' , 'mhserv_meds' , 'mhserv_other' , 'mhserv_none', 'bullying', 'ed_ft_emot' , 'ed_ft_learn', 'ed_ft_aid', 'ed_specif_subj', 'ed_pt_aid', 'ed_resource_rm', 'ed_tutor', 'ed_gifted','ed_other', 'ed_none', 'brain_injury', 'bronchitis', 'cancer_leukemia', 'cereb_palsy', 'diabetes', 'epilepsy_seizures', 'kidney_disease', 'lead_poison', 'musc_dystrophy', 'multi_sclerosis', 'heart_problems', 'sickle_cell_anemia', 'bad_headaches','highfever', 'famhx_alcoholism', 'famhx_drugabuse', 'famhx_depression', 'famhx_manic', 'famhx_psychotic', 'famhx_conduct', 'famhx_nerves', 'famhx_clinic_support', 'famhx_hosp', 'famhx_suicide', 'premature', 'csection', 'knowofbiofam')
# removed: 'dep', 'bip', 'psychos', 'anx', 'ocd', 'eat', 'att', 'beh', 'neurodev', 'subs', 'trauma', 'sleep', 'selfharm', 'suic_idea','suic_beh', 'homic'
abcd[cols] <- lapply(abcd[cols], factor)
# import variable descriptions/labels
labels = read.csv("/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/ABCD/Project 1- BW and CBCL/DownloadedData/ABCDbeh1/abcd_labels.csv")
labelst <- t(labels) #transpose
abcd <- sjlabelled::set_label(abcd, labelst) # from package sjlabelled
# clean workspace
rm(list=c("labels"))
abcd<- move_columns(abcd, twinortriplet, .after = "anyfamilyinABCD")
#label(abcd$twinortriplet) <- "Does child have any twin/triplet (in or not in ABCD)"
head(abcd[,c(6, 7, 13, 24)]) # show first 5 entries for some key variables
## tp_cbcl Rship2child sex syn_attention_r
## 1 baseline_year_1_arm_1 BioMom M 17
## 2 baseline_year_1_arm_1 BioMom F 3
## 3 baseline_year_1_arm_1 BioMom M 1
## 4 baseline_year_1_arm_1 BioMom F 8
## 5 baseline_year_1_arm_1 BioMom M 2
## 6 baseline_year_1_arm_1 BioMom F 0
abcd1<-abcd
# clean up KSADS summary vars further:
ind1 = which(names(abcd1)=="dep")
ind2 = which(names(abcd1)=="homic")
for (var in ind1:ind2) {
# for all instances of 2+ in that dx group, give a 1
rows <- which(abcd1[,var]>1)
abcd1[rows, var] <- 1
# make that var a factor with 2 levels
abcd1[,var] <- factor(abcd1[,var],
levels=c(1,0),
labels = c("Dx", "No Dx"))
}
# note: when you run subset() you lose var labels, get them back from the source with copy_labels
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1", "labelst"))
# some missing values coded as "NaN" rather than NA
# swap factor level "NaN" for an actual NA.
abcd1 <- abcd
for (v in 1:length(abcd1)) {
if (is.factor(abcd1[,v])) {
rows <- which(abcd1[,v]=="NaN")
abcd1[rows,v] = NA_character_
}
}
# do similar thing for famhx variables: replace response 7 ("refuse to answer") with missing
ind1= which(names(abcd)=="famhx_alcoholism")
ind2= which(names(abcd)=="famhx_suicide")
for (v in ind1:ind2) {
rows <- which(abcd1[,v]==7)
abcd1[rows,v] = NA_character_
}
# do same thing for "grades"-- replace "ungraded" with missing
rows <- which(abcd1$grades == "ungraded")
abcd1$grades[rows] = NA_character_
# "NaN" and 7 etc, are still listed as levels, but they are empty
abcd1 <- droplevels(abcd1) # drops all empty levels of factors
# replace var labels (deleted after) subsetting
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1"))
# set levels of the binary variable "premature" (note: this var has been over-rided by premature.37orless)
levels(abcd$premature) <- list("Full Term"=0 , "Premature"=1)
Specific variable cleaning
Family units
# we should retain only 1 child per family (can't use mixed model/nesting in my elastic net function), keep the eldest of each household.
a<- as.data.frame(table(abcd$family_id))
ids <- a[a$Freq>1,] # some have 2 kids in same hs, few have 3, no more
tokeep <- c()
for (x in 1:length(ids$Var1)) {
rows <- which(abcd$family_id==ids$Var1[x])
# chose the oldest kid
ind <- which.max(abcd$child_age[rows])
tokeep <-c(tokeep, rows[ind])
}
abcd.1sib <- abcd[c(tokeep),]
abcd.1sib <- sjlabelled::copy_labels(abcd.1sib, abcd)
# check
b<- as.data.frame(table(abcd.1sib$family_id))
b.ids <- b[b$Freq>1,]
# add these now unrelated subjects to abcd$anyfamilyinABCD = "no other fam in ABCD"
abcd <- rbind(abcd[abcd$anyfamilyinABCD=="no other fam in ABCD",], abcd.1sib)
# now 9,987 unrelated kids
abcd <- sjlabelled::copy_labels(abcd, abcd.1sib)
rm(list=c("abcd.1sib"))
# Remove any duplicate rows
abcd1 <- abcd
a<-as.data.frame(table(abcd$src_subject_id))
row=which(a$Freq>1)
id<- as.character(a$Var1[row])
rows= which(abcd$src_subject_id==id)
abcd <- abcd[-c(rows[1]),]
abcd <- sjlabelled::copy_labels(abcd, abcd1)
# there should be 9,988 individuals in the df
# note: there were two variables denoting whether child was a twin or not, they don't match up perfectly...
# "twin" was a binary variable. "anyfamilyinABCD" defined what their relationship was with their other family member taking parrt in ABCD (twin, sibling, etc). Use the "twin" and "anyfamilyinABCD" variables in conjunction to define who was a multiple birth.
abcd$multibirth <- 0
rows <- which(abcd$twin==1 | abcd$anyfamilyinABCD=="twin in ABCD" | abcd$anyfamilyinABCD=="triplets in ABCD")
abcd$multibirth[rows] <- 1
abcd$multibirth[is.na(abcd$twins)] <- NA
abcd<- move_columns(abcd, twin, .after = "anyfamilyinABCD")
Birth weight & gestational age
abcd$lbw <- ifelse(abcd$bw_kgs<2.5, 1, 0) # create "low" birth weight
abcd$bw.z <- scale(abcd$bw_kgs, scale=F) # demean birth weight
abcd<- move_columns(abcd, bw.z, .after = "bw_kgs")
# check gestational age variables
table(abcd$premature, abcd$premature_wks)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## Full Term 0 0 0 0 0 0 0 0 0 0 0 0 0
## Premature 55 163 242 411 170 167 68 123 27 26 4 10 30
# NOTE: I found out, from private correspondance with one of the ABCD PIs and data curators that premature here means anything under 40 weeks (and not the classic definition of premature/preterm as <37 weeks)
# relabel cases who are premature by just 1/2 weeks (i.e. born at 38 or 39 weeks) as Full-Term. Create new var to do so.
abcd$premature.37orless <- abcd$premature
rows <- which((abcd$premature_wks==1) | (abcd$premature_wks==2))
abcd$premature.37orless[rows] = "Full Term"
table(abcd$premature.37orless, abcd$premature_wks) # check
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## Full Term 55 163 0 0 0 0 0 0 0 0 0 0 0
## Premature 0 0 242 411 170 167 68 123 27 26 4 10 30
# make a factor var of premature_wks
abcd$premature.cat <- cut(abcd$premature_wks, c(-Inf,2,3,4,7,Inf),
labels = c("1/2 wks early", "3 wks early (37w)", "4 wks early (36w)", "5-7 wks early (33-35w)", "8+ wks early (<=32w)"))
summary(abcd$premature.cat)
## 1/2 wks early 3 wks early (37w) 4 wks early (36w)
## 218 242 411
## 5-7 wks early (33-35w) 8+ wks early (<=32w) NA's
## 405 220 8491
# make another 'prematurity in weeks' var which calls any "on time" = 0.
# note: its flawed in the sense its winsorized/capped at values 0 and 13.
abcd$prem_wks <- abcd$premature_wks
abcd$prem_wks[abcd$premature=="Full Term"] = 0
# now convert this to actual gestational age (0=40 "full term")
abcd$ga_wks <- 40-abcd$prem_wks
table(abcd$ga_wks, abcd$prem_wks)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 27 0 0 0 0 0 0 0 0 0 0 0 0 0 30
## 28 0 0 0 0 0 0 0 0 0 0 0 0 10 0
## 29 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## 30 0 0 0 0 0 0 0 0 0 0 26 0 0 0
## 31 0 0 0 0 0 0 0 0 0 27 0 0 0 0
## 32 0 0 0 0 0 0 0 0 123 0 0 0 0 0
## 33 0 0 0 0 0 0 0 68 0 0 0 0 0 0
## 34 0 0 0 0 0 0 167 0 0 0 0 0 0 0
## 35 0 0 0 0 0 170 0 0 0 0 0 0 0 0
## 36 0 0 0 0 411 0 0 0 0 0 0 0 0 0
## 37 0 0 0 242 0 0 0 0 0 0 0 0 0 0
## 38 0 0 163 0 0 0 0 0 0 0 0 0 0 0
## 39 0 55 0 0 0 0 0 0 0 0 0 0 0 0
## 40 8344 0 0 0 0 0 0 0 0 0 0 0 0 0
ggplot(data = abcd) +
geom_point(aes(x = prem_wks, y = bw_kgs, colour=prem_wks)) +
labs(y = "BW (kgs)", x = "Prematurity (weeks)") +
theme_minimal()
## Warning: Removed 523 rows containing missing values (geom_point).

# **outlier observed: preterm baby born at 6kg unlikely/extreme outlier. Remove.
row <- which(abcd$bw_kgs>6 & abcd$prem_wks>5)
abcd1 <- abcd[-c(row),]
# when you run subset() you lose var labels, get them back from the source with copy_labels
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1"))
Parent psychiatric history
# get a sum of mental illnesses among the biological parents of the child
abcd$parent_mh_n <- rowSums(cbind(abcd$fam1, abcd$fam2))
summary(abcd$parent_mh_n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.676 3.000 18.000
# replace cells with missing if the respondant does not know about the biological family of the child (e.g. if adoptive parent).
rows1 <- which(abcd$knowofbiofam==0)
rows2 <- which(is.na(abcd$knowofbiofam))
rows <- c(rows1, rows2)
abcd$parent_mh_n[rows] <- NA
ggplot(data=abcd)+
geom_histogram(aes(parent_mh_n), binwidth=1)
## Warning: Removed 113 rows containing non-finite values (stat_bin).

Parent education level
# create "household education" variable via the dominance criterion (household = the highest edu rank available)
abcd$hsd_ed <- NA
for (sub in 1:length(abcd$parent1_ed)) {
if (is.na(abcd$parent2_ed[sub])) { #if parent2 has no edu avail, select parent1's ed
abcd$hsd_ed[sub] = abcd$parent1_ed[sub]}
else if (is.na(abcd$parent1_ed[sub])) { #if parent1 has no edu avail, select parent2's ed
abcd$hsd_ed[sub] = abcd$parent2_ed[sub]}
else if (!is.na(abcd$parent1_ed[sub]) && !is.na(abcd$parent2_ed[sub])) { #if both avail, select largest
abcd$hsd_ed[sub] = max(as.numeric(paste(abcd$parent1_ed[sub])), as.numeric(paste(abcd$parent2_ed[sub])))}
}
abcd<- move_columns(abcd, hsd_ed, .after = "parent2_ed")
abcd<- move_columns(abcd, parent1_ed, .before = "parent2_ed")
# make it a factor and re-level hsd education
abcd <- abcd %>%
mutate(
hsd_ed = factor(hsd_ed))
levels(abcd$hsd_ed) <- list("None/kinder"=0 ,
"Incomplete Schooling"=c(1:12),
"High School Degree/GED"=c(13:14),
"Some college"=15,
"Associate Degree"=c(16,17),
"Primary/Profess Degree"=c(18,20),
"Masters"=19,
"Doctoral" = c(21)) # no hsd had level 0
# nobody had "none/kinder" as their household max
abcd$hsd_ed <- droplevels(abcd$hsd_ed)
# create a numeric version of the variable
abcd$hsd_ed.n <- as.numeric(abcd$hsd_ed)
abcd$hsd_ed.n.z <- scale(abcd$hsd_ed.n, scale=F) # centre so that mean=0, don't alter the unit
# check the numerical variable has correct #s in each "group"
summary(abcd$hsd_ed)
## Incomplete Schooling High School Degree/GED Some college
## 511 981 1296
## Associate Degree Primary/Profess Degree Masters
## 1302 2963 2357
## Doctoral NA's
## 561 15
summary(as.factor(abcd$hsd_ed.n))
## 1 2 3 4 5 6 7 NA's
## 511 981 1296 1302 2963 2357 561 15
## re-label "some college" as high school/GED only
## merge masters and doctoral degrees as postgrad
## make sure label notes primary & professional degrees
abcd$hsd_ed5 <- abcd$hsd_ed
levels(abcd$hsd_ed5) <- c("Incomplete Schooling", "High School Degree/GED", "High School Degree/GED" ,"Associate Degree", "Primary/Professional Degree","Postgrad" ,"Postgrad")
abcd$hsd_ed5 <- droplevels(abcd$hsd_ed5)
abcd$hsd_ed5_n <- as.numeric(abcd$hsd_ed5)
abcd$hsd_ed5.n.z <- scale(abcd$hsd_ed5_n, scale=F) # centre so that mean=0, don't alter the unit
Parent employment & income
# Note: this var was not used in the end
# simplify into employed (1), unemployed (2:11)
levels(abcd$parent1_emp) <- list("Employed"= c(1,9,10) , "Unemployed"=c(2,3,11,6,7,5,4,8))
levels(abcd$parent2_emp) <- list("Employed"= c(1,9,10) , "Unemployed"=c(2,3,11,6,7,5,4,8))
# for income, simply make a numeric version of the var
abcd$income.n <- as.numeric(abcd$combined_income)
abcd$income.n.z <- scale(abcd$income.n, scale=F) # centre so that mean=0, don't alter the unit
# check that worked
summary(as.factor(abcd$income.n))
## 1 2 3 4 5 6 7 8 9 10 NA's
## 360 371 241 449 572 790 1247 1321 2728 1032 875
summary(abcd$combined_income)
## < $5k $5k-11,999 $12k-15,999 $16k-24,999 $25k-34,999
## 360 371 241 449 572
## $35k-49,999 $50k-74,999 $75k-99,999 $100k-199,999 $200k +
## 790 1247 1321 2728 1032
## NA's
## 875
Parental age
abcd$matage.z <- scale(abcd$matage_birth, scale=F) # demean mum's age
abcd<- move_columns(abcd, matage.z, .after = "matage_birth")
# clean paternal age variable (there are 2 extremely high, improbable, age values, 300+ , replace with NA)
rows <- which(abcd$patage_birth>150)
abcd$patage_birth[rows] <- NA
# "young" less than 20
# "old" 35<
ggplot(data=abcd)+
geom_histogram(aes(x=matage_birth), fill="darkblue", colour="blue", binwidth=1)
## Warning: Removed 227 rows containing non-finite values (stat_bin).

ggplot(data=abcd)+
geom_histogram(aes(x=patage_birth), fill="darkblue", colour="blue", binwidth=1)
## Warning: Removed 558 rows containing non-finite values (stat_bin).

# create "years over 35", "years under 20"
# mother
abcd$mumold <- abcd$matage_birth-35
abcd$mumold[abcd$mumold<0] <- 0
abcd$mumyoung <- abcd$matage_birth-20
abcd$mumyoung[abcd$mumyoung>0] <- 0
abcd$mumyoung <- abs(abcd$mumyoung)
# father
abcd$dadold <- abcd$patage_birth-35
abcd$dadold[abcd$dadold<0] <- 0
abcd$dadyoung <- abcd$patage_birth-20
abcd$dadyoung[abcd$dadyoung>0] <- 0
abcd$dadyoung <- abs(abcd$dadyoung)
Maternal substance-use during pregnancy
Smoking
# create var for "smoked at all during preg"
abcd$smokedpreg <- NA
for (i in 1:length(abcd$devhx_8_tobacco))
{if ( is.na(abcd$devhx_8_tobacco[i]) && is.na(abcd$devhx_9_tobacco[i]) ) {
abcd$smokedpreg[i] = NA
}else if (is.na(abcd$devhx_8_tobacco[i]) && abcd$devhx_9_tobacco[i]==0 ) {
abcd$smokedpreg[i] = 1
}else if (abcd$devhx_8_tobacco[i]==0 && is.na(abcd$devhx_9_tobacco[i]) ) {
abcd$smokedpreg[i] = 2
}else if (is.na(abcd$devhx_8_tobacco[i]) && abcd$devhx_9_tobacco[i]==1 ) {
abcd$smokedpreg[i] = 3
}else if (abcd$devhx_8_tobacco[i]==1 && is.na(abcd$devhx_9_tobacco[i]) ) {
abcd$smokedpreg[i] = 4
}else if (abcd$devhx_8_tobacco[i]==0 && abcd$devhx_9_tobacco[i]==1 ) {
abcd$smokedpreg[i] = 5
}else if (abcd$devhx_8_tobacco[i]==1 && abcd$devhx_9_tobacco[i]==0 ) {
abcd$smokedpreg[i] = 6
}else if ((abcd$devhx_8_tobacco[i]==1) && (abcd$devhx_9_tobacco[i]==1) ) {
abcd$smokedpreg[i] = 7
}else if (abcd$devhx_8_tobacco[i]==0 && abcd$devhx_9_tobacco[i]==0 ) {
abcd$smokedpreg[i] = 0}}
abcd<- move_columns(abcd, devhx_9_tobacco, .after = "devhx_8_tobacco")
abcd<- move_columns(abcd, smokedpreg, .after = "devhx_9_tobacco")
abcd$smokedpreg <- factor(abcd$smokedpreg,
levels = c(0,1,2,3,4,5,6,7),
labels = c("no, never" , # clear no
"not after pregtest-before unknown", # NA
"not before pregtest-after unknown", # NA
"yes after pregtest, before unknown", # yes
"yes beforepregtest, after unknown", # yes
"no before pregtest, yes after", # yes
"yes before pregtest, not after", # yes
"yes, continuously")) # yes
summary(abcd$smokedpreg)
## no, never not after pregtest-before unknown
## 8333 60
## not before pregtest-after unknown yes after pregtest, before unknown
## 13 5
## yes beforepregtest, after unknown no before pregtest, yes after
## 32 6
## yes before pregtest, not after yes, continuously
## 833 520
## NA's
## 184
# replace some levels with NAs
levels(abcd$smokedpreg)[levels(abcd$smokedpreg)=="not after pregtest-before unknown"] <- NA
levels(abcd$smokedpreg)[levels(abcd$smokedpreg)=="not before pregtest-after unknown"] <- NA
levels(abcd$smokedpreg) <- c("No", "Yes", "Yes", "Yes", "Yes", "Yes")
summary(abcd$smokedpreg)
## No Yes NA's
## 8333 1396 257
Drinking
abcd$drankpreg <- NA
for (i in 1:length(abcd$sex))
{if ( is.na(abcd$devhx_8_alcohol[i]) && is.na(abcd$devhx_9_alcohol[i]) ) {
abcd$drankpreg[i] = NA
}else if (is.na(abcd$devhx_8_alcohol[i]) && abcd$devhx_9_alcohol[i]==0 ) {
abcd$drankpreg[i] = 1
}else if (abcd$devhx_8_alcohol[i]==0 && is.na(abcd$devhx_9_alcohol[i]) ) {
abcd$drankpreg[i] = 2
}else if (is.na(abcd$devhx_8_alcohol[i]) && abcd$devhx_9_alcohol[i]==1 ) {
abcd$drankpreg[i] = 3
}else if (abcd$devhx_8_alcohol[i]==1 && is.na(abcd$devhx_9_alcohol[i]) ) {
abcd$drankpreg[i] = 4
}else if (abcd$devhx_8_alcohol[i]==0 && abcd$devhx_9_alcohol[i]==1 ) {
abcd$drankpreg[i] = 5
}else if (abcd$devhx_8_alcohol[i]==1 && abcd$devhx_9_alcohol[i]==0 ) {
abcd$drankpreg[i] = 6
}else if ((abcd$devhx_8_alcohol[i]==1) && (abcd$devhx_9_alcohol[i]==1) ) {
abcd$drankpreg[i] = 7
}else if (abcd$devhx_8_alcohol[i]==0 && abcd$devhx_9_alcohol[i]==0 ) {
abcd$drankpreg[i] = 0}}
abcd$drankpreg <- factor(abcd$drankpreg,
levels = c(0,1,2,3,4,5,6,7),
labels = c("no, never" , # 0.easy no
"not after pregtest-before unknown", # 1. NA
"not before pregtest-after unknown", # 2. NA
"yes after pregtest, before unknown", # 3. yes*
"yes beforepregtest, after unknown", # 4. yes*
"no before pregtest, yes after", # 5. yes
"yes before pregtest, not after", # 6. yes
"yes, continuously")) # 7. yes
summary(abcd$drankpreg)
## no, never not after pregtest-before unknown
## 6847 370
## not before pregtest-after unknown yes after pregtest, before unknown
## 8 11
## yes beforepregtest, after unknown no before pregtest, yes after
## 39 24
## yes before pregtest, not after yes, continuously
## 2232 247
## NA's
## 208
# a "yes" at any time (e.g. only before she knew she was pregnancy, etc) get's you a "yes" to the variable drank in pregnancy
# unknown (potential) drinking level-- mark as NA
levels(abcd$drankpreg)[levels(abcd$drankpreg)=="not after pregtest-before unknown"] <- NA
levels(abcd$drankpreg)[levels(abcd$drankpreg)=="not before pregtest-after unknown"] <- NA
levels(abcd$drankpreg) <- c("No", "Yes", "Yes", "Yes", "Yes", "Yes")
summary(abcd$drankpreg)
## No Yes NA's
## 6847 2553 586
abcd<- move_columns(abcd, devhx_9_alcohol, .after = "devhx_8_alcohol")
abcd<- move_columns(abcd, drankpreg, .after = "devhx_9_alcohol")
Drugs
# illegal
abcd$preg_marijuana <- NA
abcd$preg_coc_crack <- NA
abcd$preg_morph_her <- NA
abcd$preg_oxy <- NA
abcd$preg_other<- NA
# simplify time-based data to "<drugX> any time in preg"
for (i in 1:length(abcd$src_subject_id)) {
# MARIJUANA
if (is.na(abcd$devhx_8_marijuana[i]) && is.na(abcd$devhx_9_marijuana[i])) {
abcd$preg_marijuana[i] <- NA
} else {
abcd$preg_marijuana[i] <- colSums(rbind(abcd$devhx_8_marijuana[i], abcd$devhx_9_marijuana[i]))}
# COCAINE
if (is.na(abcd$devhx_8_coc_crack[i]) && is.na(abcd$devhx_9_coc_crack[i])) {
abcd$preg_coc_crack[i] <- NA
} else {
abcd$preg_coc_crack[i] <- colSums(rbind(abcd$devhx_8_coc_crack[i], abcd$devhx_9_coc_crack[i]))}
# MORPHINE
if (is.na(abcd$devhx_8_her_morph[i]) && is.na(abcd$devhx_9_her_morph[i])) {
abcd$preg_morph_her[i] <- NA
} else {
abcd$preg_morph_her[i] <- colSums(rbind(abcd$devhx_8_her_morph[i], abcd$devhx_9_her_morph[i]))}
# OXYCONTIN
if (is.na(abcd$devhx_8_oxycont[i]) && is.na(abcd$devhx_9_oxycont[i])) {
abcd$preg_oxy[i] <- NA
} else {
abcd$preg_oxy[i] <- colSums(rbind(abcd$devhx_8_oxycont[i], abcd$devhx_9_oxycont[i]))}
# OTHER
if (is.na(abcd$devhx_8_other_drugs[i]) && is.na(abcd$devhx_9_other_drugs[i])) {
abcd$preg_other[i] <- NA
} else {
abcd$preg_other[i] <- colSums(rbind(abcd$devhx_8_other_drugs[i], abcd$devhx_9_other_drugs[i]))}
}
# make double instances of substance-use (before & during known preg) into just "yes"
abcd$preg_marijuana[abcd$preg_marijuana==2] <- 1
abcd$preg_coc_crack[abcd$preg_coc_crack==2] <- 1
abcd$preg_morph_her[abcd$preg_morph_her==2] <- 1
abcd$preg_oxy[abcd$preg_oxy==2] <- 1
abcd$preg_other[abcd$preg_other==2] <- 1
# make count variable of drugs used
abcd$preg_numdrugs <- NA
for (i in 1:length(abcd$src_subject_id)) {
if (is.na(abcd$preg_marijuana[i]) &
is.na(abcd$preg_coc_crack[i]) &
is.na(abcd$preg_morph_her[i]) &
is.na(abcd$preg_oxy[i]) &
is.na(abcd$preg_other[i])) {
abcd$preg_numdrugs[i] <- NA
}
else {
abcd$preg_numdrugs[i] <- colSums(
rbind(
abcd$preg_marijuana[i] ,
abcd$preg_coc_crack[i] ,
abcd$preg_morph_her[i] ,
abcd$preg_oxy[i] ,
abcd$preg_other[i]),na.rm = T)}
}
# Make a sum var for (any drug other than marijuana)
abcd$drug_notweed <- NA
for (i in 1:length(abcd$src_subject_id)) {
if (is.na(abcd$preg_coc_crack[i]) &
is.na(abcd$preg_morph_her[i]) &
is.na(abcd$preg_oxy[i]) &
is.na(abcd$preg_other[i])) {
abcd$drug_notweed[i] <- NA
}
else {
abcd$drug_notweed[i] <- colSums(
rbind(
abcd$preg_coc_crack[i] ,
abcd$preg_morph_her[i] ,
abcd$preg_oxy[i] ,
abcd$preg_other[i]),na.rm = T)}
}
abcd$drug_notweed[abcd$drug_notweed>1] <- 1
abcd <- abcd %>%
mutate(
preg_marijuana=factor(preg_marijuana,
levels = c(0,1),
labels = c("No", "Yes")),
preg_coc_crack=factor(preg_coc_crack,
levels = c(0,1),
labels = c("No", "Yes")),
preg_morph_her=factor(preg_morph_her,
levels = c(0,1),
labels = c("No", "Yes")),
preg_oxy=factor(preg_oxy,
levels = c(0,1),
labels = c("No", "Yes")),
preg_other=factor(preg_other,
levels = c(0,1),
labels = c("No", "Yes")))
# make binary var
abcd$drugspreg<-NA
for (i in 1:length(abcd$devhx_8_marijuana)) {
if (is.na(abcd$devhx_8_marijuana[i]) &
is.na(abcd$devhx_9_marijuana[i]) &
is.na(abcd$devhx_8_coc_crack[i]) &
is.na(abcd$devhx_9_coc_crack[i]) &
is.na(abcd$devhx_8_her_morph[i]) &
is.na(abcd$devhx_9_her_morph[i]) &
is.na(abcd$devhx_8_oxycont[i]) &
is.na(abcd$devhx_9_oxycont[i]) &
is.na(abcd$devhx_8_other_drugs[i]) &
is.na(abcd$devhx_9_other_drugs[i])) {
abcd$drugspreg[i] <- NA}
else {abcd$drugspreg[i] <-colSums(rbind(
abcd$devhx_8_marijuana[i] ,
abcd$devhx_9_marijuana[i] ,
abcd$devhx_8_coc_crack[i] ,
abcd$devhx_9_coc_crack[i] ,
abcd$devhx_8_her_morph[i] ,
abcd$devhx_9_her_morph[i] ,
abcd$devhx_8_oxycont[i] ,
abcd$devhx_9_oxycont[i] ,
abcd$devhx_8_other_drugs[i] ,
abcd$devhx_9_other_drugs[i]),na.rm = T)}}
# simplify the var into "took illegal drugs *ever* during pregnancy Y/N"
for (i in 1:length(abcd$drugspreg)) {
if (!is.na(abcd$drugspreg[i]) & abcd$drugspreg[i] >= 1)
{abcd$drugspreg[i] <- 1}}
#the problem with doing 1,2+ categories here is that it could mean cocaine when they both did and didnt know of preg, or it could mean cocaine not knowing of preg, marijuana knowing of.
abcd<- move_columns(abcd, drugspreg, .before = "devhx_8_marijuana")
# make factor of it
abcd$drugspreg <- factor(abcd$drugspreg,
levels = c(0, 1),
labels = c("No", "Yes"))
# plot maternal smoking
g1 <- ggplot(data = abcd, aes(x = smokedpreg, fill = smokedpreg)) +
geom_bar() +
scale_fill_brewer(palette = "Set3") +
labs(y = "Count", x = "Gestational Smoking") +
theme_classic() +
theme(legend.position ="none")+
geom_text(stat="count", aes(label=..count..), vjust=-0.3)
# plot maternal drinking
g2<- ggplot(data = abcd, aes(x = drankpreg, fill = drankpreg)) +
geom_bar() +
scale_fill_brewer(palette = "Set3") +
labs(y = "Count", x = "Gestational Alcohol") +
theme_classic() +
theme(legend.position ="none")+
geom_text(stat="count", aes(label=..count..), vjust=-0.3)
# plot maternal drug use
g3<-ggplot(data = abcd, aes(x = drugspreg, fill = drugspreg)) +
geom_bar() +
scale_fill_brewer(palette = "Set3") +
labs(y = "Count", x = "Gestational Illegal Drugs") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90), legend.position ="none")+
geom_text(stat="count", aes(label=..count..), vjust=-0.3)
ggarrange(g1, g2, g3, nrow=2, ncol=2)

options(qwraps2_markup = "markdown")
stats <-
list("Marijuana"=
list("Yes"= ~ n_perc(preg_marijuana=="Yes" , na_rm=TRUE)),
"Cocaine"=
list("Yes"= ~ n_perc(preg_coc_crack=="Yes" , na_rm=TRUE)),
"Oxycontin"=
list("Yes"= ~ n_perc(preg_oxy=="Yes" , na_rm=TRUE)),
"Morphine/Heroin"=
list("Yes"= ~ n_perc(preg_morph_her=="Yes" , na_rm=TRUE)),
"Other drug"=
list("Yes"= ~ n_perc(preg_other=="Yes" , na_rm=TRUE))
)
desc <- qwraps2::summary_table(abcd, stats)
print(desc)
##
##
## | |abcd (N = 9,986) |
## |:-------------------|:-----------------|
## |**Marijuana** | |
## | Yes |585/9,653 (6.06%) |
## |**Cocaine** | |
## | Yes |68/9,712 (0.70%) |
## |**Oxycontin** | |
## | Yes |32/9,695 (0.33%) |
## |**Morphine/Heroin** | |
## | Yes |20/9,709 (0.21%) |
## |**Other drug** | |
## | Yes |91/9,615 (0.95%) |
# 'other' drugs: Amphetamines or methamphetamine (meth); Barbituates; Benzodiazepines; Caffeine; Cathinones (bath salts; Fake or synthetic marijuana (like spice or K2); GHB (liquid G or Georgia home boy); Hallucinogens (LSD or acid; Inhalants; Ketamine (special K); MDMA (ecstasy); Opioids; Other; Don't Know
Medication
# prescription meds (note, what this var represents is "took meds at *any* point in preg")
abcd$medspreg <- NA
for (i in 1:length(abcd$preg1_presc))
{if ( is.na(abcd$preg1_presc[i]) & is.na(abcd$preg2_presc[i]) ) {
abcd$medspreg[i] = NA
}else if (is.na(abcd$preg1_presc[i]) & abcd$preg2_presc[i]==0 ) {
abcd$medspreg[i] = NA
}else if (abcd$preg1_presc[i]==0 & is.na(abcd$preg2_presc[i])) {
abcd$medspreg[i] = NA
}else if (is.na(abcd$preg1_presc[i]) & abcd$preg2_presc[i]==1 ) {
abcd$medspreg[i] = 1
}else if (abcd$preg1_presc[i]==1 & is.na(abcd$preg2_presc[i])) {
abcd$medspreg[i] = 1
}else if (!is.na(abcd$preg1_presc[i]) & is.na(abcd$preg2_presc[i]) ) {
abcd$medspreg[i] = abcd$preg1_presc[i]
}else if ((abcd$preg1_presc[i]==1) | (abcd$preg2_presc[i]==1) ) {
abcd$medspreg[i] = 1
}else {abcd$medspreg[i]= 0}}
abcd<- move_columns(abcd, medspreg, .after = "preg2_presc")
abcd<- move_columns(abcd, preg1_presc_n, .after = "medspreg")
# make factor of it
abcd$medspreg <- factor(abcd$medspreg,
levels = c(0, 1),
labels = c("No", "Yes"))
ggplot(data = abcd, aes(x = medspreg, fill = medspreg)) +
geom_bar() +
scale_fill_brewer(palette = "Set3") +
labs(y = "Count", x = "Gestational Prescription Meds") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90), legend.position ="none")+
geom_text(stat="count", aes(label=..count..), vjust=-0.3)

Obstetric complications
in pregnancy
options(qwraps2_markup = "markdown")
stats <-
list("1. gest_nausea"=
list("Yes"= ~ n_perc(gest_nausea==1 , na_rm=TRUE)),
"2. gest_hibloodpres"=
list("Yes"= ~ n_perc(gest_hibloodpres==1 , na_rm=TRUE)),
"3. gest_uti"=
list("Yes"= ~ n_perc(gest_uti==1 , na_rm=TRUE)),
"4. gest_proteinuria"=
list("Yes"= ~ n_perc(gest_proteinuria==1 , na_rm=TRUE)),
"5. gest_bleed"=
list("Yes"= ~ n_perc(gest_bleed==1 , na_rm=TRUE)),
"6. gest_preeclampsia"=
list("Yes"= ~ n_perc(gest_preeclampsia==1 , na_rm=TRUE)),
"7. gest_anemia"=
list("Yes"= ~ n_perc(gest_anemia==1 , na_rm=TRUE)),
"8. gest_rubella"=
list("Yes"= ~ n_perc(gest_rubella==1 , na_rm=TRUE)),
"9. gest_gallbladder"=
list("Yes"= ~ n_perc(gest_gallbladder==1 , na_rm=TRUE)),
"10. gest_placenta"=
list("Yes"= ~ n_perc(gest_placenta==1 , na_rm=TRUE)),
"11. gest_diabetes"=
list("Yes"= ~ n_perc(gest_diabetes==1 , na_rm=TRUE)),
"12. gest_accident"=
list("Yes"= ~ n_perc(gest_accident==1 , na_rm=TRUE)),
"13. gest_other"=
list("Yes"= ~ n_perc(gest_other==1 , na_rm=TRUE))
)
desc <- qwraps2::summary_table(abcd, stats)
print(desc)
##
##
## | |abcd (N = 9,986) |
## |:------------------------|:--------------------|
## |**1. gest_nausea** | |
## | Yes |1,354/9,642 (14.04%) |
## |**2. gest_hibloodpres** | |
## | Yes |916/9,631 (9.51%) |
## |**3. gest_uti** | |
## | Yes |728/9,575 (7.60%) |
## |**4. gest_proteinuria** | |
## | Yes |47/9,624 (0.49%) |
## |**5. gest_bleed** | |
## | Yes |391/9,698 (4.03%) |
## |**6. gest_preeclampsia** | |
## | Yes |672/9,641 (6.97%) |
## |**7. gest_anemia** | |
## | Yes |396/9,647 (4.10%) |
## |**8. gest_rubella** | |
## | Yes |12/9,703 (0.12%) |
## |**9. gest_gallbladder** | |
## | Yes |114/9,686 (1.18%) |
## |**10. gest_placenta** | |
## | Yes |283/9,669 (2.93%) |
## |**11. gest_diabetes** | |
## | Yes |679/9,658 (7.03%) |
## |**12. gest_accident** | |
## | Yes |188/9,702 (1.94%) |
## |**13. gest_other** | |
## | Yes |759/9,679 (7.84%) |
# as proteinurea rates are so low (<1%), merge pre-eclampsia & proteinuria
abcd$preeclamp_protein <- rowSums(cbind(abcd$gest_preeclampsia, abcd$gest_proteinuria), na.rm=T)
abcd$preeclamp_protein[is.na(abcd$gest_preeclampsia) & is.na(abcd$gest_proteinuria)] <- NA
abcd$preeclamp_protein[abcd$preeclamp_protein>1] <- 1
for (i in 1:length(abcd$sex)) {
if (is.na(abcd$gest_nausea[i]) &&
is.na(abcd$gest_hibloodpres[i]) &&
is.na(abcd$gest_uti[i]) &&
is.na(abcd$preeclamp_protein[i]) &&
is.na(abcd$gest_bleed[i]) &&
is.na(abcd$gest_diabetes[i]) &&
is.na(abcd$gest_anemia[i]) &&
is.na(abcd$gest_rubella[i]) &&
is.na(abcd$gest_placenta[i]) &&
is.na(abcd$gest_gallbladder[i]) &&
is.na(abcd$gest_accident[i]) &&
is.na(abcd$gest_other[i])) {
abcd$gest_anyOG_n[i] <- NA
}
else {
abcd$gest_anyOG_n[i] <-
colSums(
rbind(
abcd$gest_nausea[i],
abcd$gest_hibloodpres[i],
abcd$gest_uti[i],
abcd$preeclamp_protein[i],
abcd$gest_bleed[i],
abcd$gest_diabetes[i],
abcd$gest_anemia[i],
abcd$gest_rubella[i],
abcd$gest_placenta[i],
abcd$gest_gallbladder[i],
abcd$gest_accident[i],
abcd$gest_other[i]
),
na.rm = T
)
}
}
abcd<- move_columns(abcd, gest_anyOG_n, .after = "gest_other")
summary(abcd$gest_anyOG_n)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.6675 1.0000 12.0000 233
table(abcd$gest_anyOG_n)
##
## 0 1 2 3 4 5 6 7 8 10 11 12
## 5658 2576 986 329 132 45 10 7 1 1 1 7
at the delivery/birth
# birthcomp_blue, birthcomp_slowheart, birthcomp_nobreathe, birthcomp_convuls, birthcomp_jaundice, birthcomp_oxygen, birthcomp_blood, birthcomp_rhincomp,
options(qwraps2_markup = "markdown")
stats <-
list("1. birthcomp_jaundice"=
list("Yes"= ~ n_perc(birthcomp_jaundice==1 , na_rm=TRUE)),
"2. birthcomp_oxygen"=
list("Yes"= ~ n_perc(birthcomp_oxygen==1 , na_rm=TRUE)),
"3. birthcomp_blue"=
list("Yes"= ~ n_perc(birthcomp_blue==1 , na_rm=TRUE)),
"4. birthcomp_slowheart"=
list("Yes"= ~ n_perc(birthcomp_slowheart==1 , na_rm=TRUE)),
"5. birthcomp_nobreathe"=
list("Yes"= ~ n_perc(birthcomp_nobreathe==1 , na_rm=TRUE)),
"6. birthcomp_rhincomp"=
list("Yes"= ~ n_perc(birthcomp_rhincomp==1 , na_rm=TRUE)),
"7. birthcomp_blood"=
list("Yes"= ~ n_perc(birthcomp_blood==1 , na_rm=TRUE)),
"8. birthcomp_convuls"=
list("Yes"= ~ n_perc(birthcomp_convuls==1 , na_rm=TRUE))
)
desc <- qwraps2::summary_table(abcd, stats)
print(desc)
##
##
## | |abcd (N = 9,986) |
## |:--------------------------|:--------------------|
## |**1. birthcomp_jaundice** | |
## | Yes |1,528/9,739 (15.69%) |
## |**2. birthcomp_oxygen** | |
## | Yes |800/9,734 (8.22%) |
## |**3. birthcomp_blue** | |
## | Yes |302/9,714 (3.11%) |
## |**4. birthcomp_slowheart** | |
## | Yes |273/9,697 (2.82%) |
## |**5. birthcomp_nobreathe** | |
## | Yes |443/9,730 (4.55%) |
## |**6. birthcomp_rhincomp** | |
## | Yes |243/9,639 (2.52%) |
## |**7. birthcomp_blood** | |
## | Yes |37/9,805 (0.38%) |
## |**8. birthcomp_convuls** | |
## | Yes |13/9,795 (0.13%) |
#birthcomp_blue, birthcomp_slowheart, birthcomp_nobreathe, birthcomp_convuls, birthcomp_jaundice, birthcomp_oxygen, birthcomp_blood, birthcomp_rhincomp,
for (i in 1:length(abcd$sex)) {
if (is.na(abcd$birthcomp_blue[i]) &&
is.na(abcd$birthcomp_slowheart[i]) &&
is.na(abcd$birthcomp_nobreathe[i]) &&
is.na(abcd$birthcomp_convuls[i]) &&
is.na(abcd$birthcomp_jaundice[i]) &&
is.na(abcd$birthcomp_oxygen[i]) &&
is.na(abcd$birthcomp_blood[i]) &&
is.na(abcd$birthcomp_rhincomp[i]))
{abcd$birthcomp_any_n[i] <- NA}
else {
abcd$birthcomp_any_n[i] <-
colSums(
rbind(
abcd$birthcomp_blue[i],
abcd$birthcomp_slowheart[i],
abcd$birthcomp_nobreathe[i],
abcd$birthcomp_convuls[i],
abcd$birthcomp_jaundice[i],
abcd$birthcomp_oxygen[i],
abcd$birthcomp_blood[i],
abcd$birthcomp_rhincomp[i]),
na.rm = T)
}
}
abcd<- move_columns(abcd, birthcomp_any_n, .before = "birthcomp_blue")
summary(abcd$birthcomp_any_n)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.3697 1.0000 8.0000 144
table(abcd$birthcomp_any_n)
##
## 0 1 2 3 4 5 6 8
## 7263 1871 476 152 52 20 6 2
ind1 = which(names(abcd)=="birthcomp_blue")
ind2 = which(names(abcd)=="birthcomp_rhincomp")
# there are 13 types of gestational complication
tidypreg <- abcd[,ind1:ind2] %>%
gather(key = "gest", value = "YesNo")
tidypreg$gest <- ordered(tidypreg$gest,
levels = c("birthcomp_jaundice","birthcomp_oxygen","birthcomp_nobreathe","birthcomp_blue", "birthcomp_slowheart" , "birthcomp_blood", "birthcomp_rhincomp", "birthcomp_convuls"))
ggplot(tidypreg, aes(x=gest, fill=as.factor(YesNo)))+
geom_bar() +
theme(axis.text.x = element_text(angle = 90))

Early life milestones
# firstyear_fever (# days in yr1), firstyear_ill (# days in yr1), breastfed_nmths (# of mths child was breastfed), age_roll (in mths), age_sit, age_walk, age_word, motordev, speechdev,
# replace any value for firstyear_fever > 365 as NA (there was one = 999)
abcd$firstyear_fever[abcd$firstyear_fever>365] <- NA
# replace age_roll, age_sit, age_walk & age_word > 100 (thats ~8 years old) with NA (v high values here, could be data mis-entry or severe developmental delays)
abcd$age_roll[abcd$age_roll>100] <- NA
abcd$age_sit[abcd$age_sit>100] <- NA
abcd$age_walk[abcd$age_walk>100] <- NA
abcd$age_word[abcd$age_word>100] <- NA
# recode motor & speech dev string levels as about average=0, somewhat earlier/later ±1, much earlier/later ±2.
# recode motordev
levels(abcd$motordev)[levels(abcd$motordev)=="about average"] <- 0
levels(abcd$motordev)[levels(abcd$motordev)=="somewhat earlier"] <- -1
levels(abcd$motordev)[levels(abcd$motordev)=="much earlier"] <- -2
levels(abcd$motordev)[levels(abcd$motordev)=="somewhat later"] <- 1
levels(abcd$motordev)[levels(abcd$motordev)=="much later"] <- 2
# recode speechdec
levels(abcd$speechdev)[levels(abcd$speechdev)=="about average"] <- 0
levels(abcd$speechdev)[levels(abcd$speechdev)=="somewhat earlier"] <- -1
levels(abcd$speechdev)[levels(abcd$speechdev)=="much earlier"] <- -2
levels(abcd$speechdev)[levels(abcd$speechdev)=="somewhat later"] <- 1
levels(abcd$speechdev)[levels(abcd$speechdev)=="much later"] <- 2
roll <- ggplot(data = abcd, aes(x = age_roll)) +
geom_bar(fill = "darkseagreen") +
labs(y = "Density", x = "age first rolled (months)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
sit <- ggplot(data = abcd, aes(x = age_sit)) +
geom_bar(fill = "cornsilk2") +
labs(y = "Density", x = "age first sat (months)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
walk <- ggplot(data = abcd, aes(x = age_walk)) +
geom_bar(fill = "darkslategray3") +
labs(y = "Density", x = "age first walked (months)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
talk <- ggplot(data = abcd, aes(x = age_word)) +
geom_bar(fill = "pink3") +
labs(y = "Density", x = "age first word (months)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
ggarrange(roll, sit, walk, talk, nrow=2, ncol=2)

# despite these being interesting vars, there are many (n ~ 1000) missing values for them. Same for gest_ndocvisits unfortunately.
# let's go with subjective rating of whether motor and speech dev was average early or late
motor <- ggplot(data = abcd, aes(x = motordev)) +
geom_bar(fill = "darkslategray3") +
labs(y = "Density", x = "Motor Development\n0=Average, negative=earlier (advanced),\n positive=later (delayed)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
speech <- ggplot(data = abcd, aes(x = speechdev)) +
geom_bar(fill = "pink3") +
labs(y = "Density", x = "Speech Development\n0=Average, negative=earlier (advanced),\n positive=later (delayed)") +
theme_classic()+
geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)
ggarrange(motor, speech, nrow=1, ncol=2)

# Item wording:
# - Would you say his/her motor development (sitting, crawling, walking) was earlier, average, or later than most other children?
# - Would you say his/her speech development was earlier, average, or later than most other children?
CBCL scores
Parent’s attention problems
ggplot(data=abcd) +
geom_histogram((aes(asr_scr_attention_r)), binwidth=1, colour="black")
## Warning: Removed 7 rows containing non-finite values (stat_bin).

# compare parent and child attention problems (split by child's sex)
ggplot(data=abcd, aes(asr_scr_attention_r, syn_attention_r, colour=sex)) +
geom_smooth(method="lm", formula= y~poly(x,2))
## Warning: Removed 14 rows containing non-finite values (stat_smooth).

Child’s attention problems
ggplot(data=abcd)+
geom_histogram(aes(syn_attention_r), binwidth =1,
fill="grey50", colour="black")+
labs(x="",
title="Cohort1")+
scale_x_continuous(name="CBCL attention problems",
breaks=c(0:20))+
geom_vline(xintercept= mean(abcd$syn_attention_r, na.rm=T),
colour = "cyan3", linetype="dashed")+
geom_vline(xintercept= quantile(abcd$syn_attention_r, .80, na.rm=T),
colour = "orange", linetype="dashed")+
geom_vline(xintercept= quantile(abcd$syn_attention_r, .90, na.rm=T),
colour = "red", linetype="dashed")+
geom_vline(xintercept= 9,
colour = "red")+
ylim(c(0,4700))+
theme_classic()
## Warning: Removed 11 rows containing non-finite values (stat_bin).

# distribution of child's attention problems, view cut-offs of the mean (cyan), 80th percentile (orange), 90th percentile (red dashed), and raw score of 9/20 (red solid)
#table(abcd$syn_attention_r)
# observe relationship between t-scores and raw scores for child attenion problems
# note T >= 65 A CUT-OFF FOR "DEVIANT" (ACHENBACH ET AL., 2003)
# note T > 62 WAS OPTIMAL CUT-OFF SCORE TO DISTINGUIGH ADHD FROM NO ADHD (GOMEZ ET AL., 2019)
ggplot(data=abcd)+
geom_smooth(aes(x=syn_attention_r, y = syn_attention_t), formula=y~poly(x,2), method="lm")+
scale_x_continuous(name="CBCL attention problems",
breaks=c(0:20))+
geom_hline(yintercept= 65,linetype="dashed")+
geom_hline(yintercept= 62,linetype="dotted", colour="blue")+
geom_vline(xintercept= 9.6,linetype="dashed")+
geom_vline(xintercept= 9.2,linetype="dotted", colour="blue")
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
