imports

library(tidyverse)
library(dplyr)
library(sjlabelled)
library(ggplot2)
library(ggpubr)
library(Hmisc)
library(sjmisc)
library(caTools)
library(qwraps2)

# import csv. Some minimal data cleaning done in MATLAB to the initial csv from NDA. Contact niamhdooley@rcsi.come for further details. 
abcd = read.csv("/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/ABCD/Project 1- BW and CBCL/DownloadedData/ABCDbeh1/abcd.csv")
# num vars : 252

General data cleaning

# set factor levels where appropriate
abcd <- abcd %>%
    mutate(
     family_id = factor(family_id),
     site_id = factor(site_id),
     Rship2child=factor(Rship2child,
          levels = c(1,2,3,4,5),
          labels = c("BioMom", "BioDad", "AdoptParent", "CustodParent", "Other")),
     anyfamilyinABCD = factor(anyfamilyinABCD,
          levels = c(0,1,2,3),
          labels = c("no other fam in ABCD", "sibling in ABCD", "twin in ABCD", "triplets in ABCD")),
     sex_atbirth = factor(sex_atbirth,
          levels = c(1,2,3,4),
          labels = c("male", "female", "intersex-male", "intersex-female")),
     sex = factor(sex,
          levels = c("F", "M"),
          labels = c("F", "M")),
     gender_identity = factor(gender_identity,
          levels = c(1,2,3,4,5,6),
          labels = c("male", "female", "trans-male", "trans-female", "queer", "other")),
     language = factor(language,
          levels = c(0,1),
          labels = c("English", "Spanish")),
     plannedpreg = factor(plannedpreg,
          levels = c(0,1),
          labels = c("No", "Yes")),
     motordev = factor(motordev,
          levels = c(1,2,3,4,5),
          labels = c("much earlier","somewhat earlier","about average","somewhat later", "much later")),
     speechdev = factor(speechdev,
          levels = c(1,2,3,4,5),
          labels = c("much earlier","somewhat earlier","about average","somewhat later", "much later")),     
     race_ethnicity = factor(race_ethnicity,
          levels = c(1,2,3,4,5),
          labels = c("White", "Black", "Hispanic", "Asian", "Other")),
    demo_comb_income_v2 = factor(demo_comb_income_v2,
          levels = c(1,2,3,4,5,6,7,8,9,10),
          labels = c("< $5k", "$5k-11,999", "$12k-15,999", "$16k-24,999", "$25k-34,999",
                     "$35k-49,999", "$50k-74,999", "$75k-99,999", "$100k-199,999", "$200k +")),
    grades= factor(grades,
           levels = c(1,2,3,4,5,6,-1),
           labels = c("As, excellent", "Bs, good", "Cs, average", "Ds, below ave", "Fs, struggling", "ungraded", NA_character_)),
    school_type= factor(school_type,
           levels = c(1:9),
           labels = c("Not in school", "Regular public", "Regular private", "Voc/Tech", "Cyber", "Home School", "Special. for emot/beh probs", "Other", "Charter")),
    cat_medhx = factor(cat_medhx,
           levels = c(0,1,2,3),
           labels = c("none", "1", "2", "3 or more")),
    mhserv_ever = factor(mhserv_ever,
           levels = c(0,1,3),
           labels = c("No", "Yes", NA_character_)),
    demo_prnt_ed_v2 = factor(demo_prnt_ed_v2),
    demo_prtnr_ed_v2 = factor(demo_prtnr_ed_v2),
    demo_prnt_empl_v2 = factor(demo_prnt_empl_v2),
    demo_prtnr_empl_v2 = factor(demo_prtnr_empl_v2),
    demo_prnt_marital_v2 = factor(demo_prnt_marital_v2,
          levels = c(1:6),
          labels = c("married", "widowed", "divorced", "separated", "never married", "living with partner")),
    devhx_10_prenatvitamins = factor(devhx_10_prenatvitamins,
                          levels = c(-1,0,1),
                          labels = c(NA_character_, "No", "Yes")),
    demo_prnt_prtnr_v2 = factor(demo_prnt_prtnr_v2,
          levels = c(1,2),
          labels = c("Yes", "No")))  %>%
dplyr::rename(
    combined_income = demo_comb_income_v2,
    resps_partner = demo_prnt_prtnr_v2,
    parent1_ed = demo_prnt_ed_v2 ,
    parent2_ed =demo_prtnr_ed_v2,
    parent1_emp = demo_prnt_empl_v2 ,
    parent2_emp = demo_prtnr_empl_v2
    )


# declare all remaining categorical variables as such 

cols <- c('twin','twinortriplet', 'biomom', 'car_accid', 'other_accid', 'fire', 'natur_disast', 'terrorism', 'war_death', 'community_viol', 'attack_bynonfam', 'attack_byfam', 'beaten_byfam', 'deaththreat_nonfam', 'deaththreat_fam', 'witness_dom_viol', 'sexabuse_fam', 'sexabuse_nonfam', 'sexabuse_peer', 'death_loved',  'mhserv_mh_outpat', 'mhserv_mh_parthosp' , 'mhserv_mh_hosp' , 'mhserv_subs_outpat' , 'mhserv_subs_parthosp' , 'mhserv_subs_hosp' , 'mhserv_therapy' , 'mhserv_meds' , 'mhserv_other' , 'mhserv_none', 'bullying', 'ed_ft_emot' , 'ed_ft_learn', 'ed_ft_aid', 'ed_specif_subj', 'ed_pt_aid', 'ed_resource_rm', 'ed_tutor',  'ed_gifted','ed_other', 'ed_none', 'brain_injury', 'bronchitis', 'cancer_leukemia', 'cereb_palsy', 'diabetes', 'epilepsy_seizures', 'kidney_disease', 'lead_poison', 'musc_dystrophy', 'multi_sclerosis', 'heart_problems', 'sickle_cell_anemia', 'bad_headaches','highfever', 'famhx_alcoholism', 'famhx_drugabuse', 'famhx_depression', 'famhx_manic', 'famhx_psychotic', 'famhx_conduct', 'famhx_nerves', 'famhx_clinic_support', 'famhx_hosp', 'famhx_suicide',  'premature', 'csection', 'knowofbiofam')

# removed: 'dep', 'bip', 'psychos', 'anx', 'ocd', 'eat', 'att', 'beh', 'neurodev', 'subs', 'trauma', 'sleep', 'selfharm', 'suic_idea','suic_beh', 'homic'

abcd[cols] <- lapply(abcd[cols], factor)

# import variable descriptions/labels
labels = read.csv("/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/ABCD/Project 1- BW and CBCL/DownloadedData/ABCDbeh1/abcd_labels.csv")

labelst <- t(labels) #transpose

abcd <- sjlabelled::set_label(abcd, labelst) # from package sjlabelled

# clean workspace
rm(list=c("labels"))

abcd<- move_columns(abcd, twinortriplet, .after = "anyfamilyinABCD")
#label(abcd$twinortriplet) <- "Does child have any twin/triplet (in or not in ABCD)"

head(abcd[,c(6, 7, 13, 24)]) # show first 5 entries for some key variables

##                 tp_cbcl Rship2child sex syn_attention_r
## 1 baseline_year_1_arm_1      BioMom   M              17
## 2 baseline_year_1_arm_1      BioMom   F               3
## 3 baseline_year_1_arm_1      BioMom   M               1
## 4 baseline_year_1_arm_1      BioMom   F               8
## 5 baseline_year_1_arm_1      BioMom   M               2
## 6 baseline_year_1_arm_1      BioMom   F               0

abcd1<-abcd
# clean up KSADS summary vars further: 
ind1 = which(names(abcd1)=="dep")
ind2 = which(names(abcd1)=="homic")
for (var in ind1:ind2) {
  # for all instances of 2+ in that dx group, give a 1
  rows <- which(abcd1[,var]>1)
  abcd1[rows, var] <- 1 

  # make that var a factor with 2 levels
  abcd1[,var] <- factor(abcd1[,var],
                   levels=c(1,0),
                   labels = c("Dx", "No Dx"))
}

# note: when you run subset() you lose var labels, get them back from the source with copy_labels
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1", "labelst"))

# some missing values coded as "NaN" rather than NA
# swap factor level "NaN" for an actual NA.
abcd1 <- abcd
for (v in 1:length(abcd1)) {
   if (is.factor(abcd1[,v])) {
      rows <- which(abcd1[,v]=="NaN")
      abcd1[rows,v] = NA_character_
   }
}

# do similar thing for famhx variables: replace response 7 ("refuse to answer") with missing
ind1= which(names(abcd)=="famhx_alcoholism")
ind2= which(names(abcd)=="famhx_suicide")
for (v in ind1:ind2) {
      rows <- which(abcd1[,v]==7)
      abcd1[rows,v] = NA_character_
}

# do same thing for "grades"-- replace "ungraded" with missing
rows <- which(abcd1$grades == "ungraded")
abcd1$grades[rows] = NA_character_

# "NaN" and 7 etc, are still listed as levels, but they are empty
abcd1 <- droplevels(abcd1) # drops all empty levels of factors

# replace var labels (deleted after) subsetting
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1"))

# set levels of the binary variable "premature" (note: this var has been over-rided by premature.37orless)
levels(abcd$premature) <- list("Full Term"=0 ,  "Premature"=1)

Specific variable cleaning

Family units

# we should retain only 1 child per family (can't use mixed model/nesting in my elastic net function), keep the eldest of each household.
a<- as.data.frame(table(abcd$family_id))
ids <- a[a$Freq>1,] # some have 2 kids in same hs, few have 3, no more

tokeep <- c()
for (x in 1:length(ids$Var1)) {
  rows <- which(abcd$family_id==ids$Var1[x])
  
  # chose the oldest kid
  ind <- which.max(abcd$child_age[rows])
  tokeep <-c(tokeep, rows[ind]) 
}

abcd.1sib <- abcd[c(tokeep),]
abcd.1sib <- sjlabelled::copy_labels(abcd.1sib, abcd)

# check
b<- as.data.frame(table(abcd.1sib$family_id))
b.ids <- b[b$Freq>1,]

# add these now unrelated subjects to abcd$anyfamilyinABCD = "no other fam in ABCD"
abcd <- rbind(abcd[abcd$anyfamilyinABCD=="no other fam in ABCD",], abcd.1sib)
# now 9,987 unrelated kids

abcd <- sjlabelled::copy_labels(abcd, abcd.1sib)

rm(list=c("abcd.1sib"))

# Remove any duplicate rows

abcd1 <- abcd
a<-as.data.frame(table(abcd$src_subject_id))
row=which(a$Freq>1)
id<- as.character(a$Var1[row])

rows= which(abcd$src_subject_id==id)
abcd <- abcd[-c(rows[1]),]
abcd <- sjlabelled::copy_labels(abcd, abcd1)

# there should be 9,988 individuals in the df

# note: there were two variables denoting whether child was a twin or not, they don't match up perfectly...

# "twin" was a binary variable. "anyfamilyinABCD" defined what their relationship was with their other family member taking parrt in ABCD (twin, sibling, etc). Use the "twin" and "anyfamilyinABCD" variables in conjunction to define who was a multiple birth. 
abcd$multibirth <- 0
rows <- which(abcd$twin==1 | abcd$anyfamilyinABCD=="twin in ABCD" | abcd$anyfamilyinABCD=="triplets in ABCD")
abcd$multibirth[rows] <- 1
abcd$multibirth[is.na(abcd$twins)] <- NA
   
abcd<- move_columns(abcd, twin, .after = "anyfamilyinABCD")

Birth weight & gestational age

abcd$lbw <- ifelse(abcd$bw_kgs<2.5, 1, 0)  # create "low" birth weight
abcd$bw.z <- scale(abcd$bw_kgs, scale=F) # demean birth weight
abcd<- move_columns(abcd, bw.z, .after = "bw_kgs")


# check gestational age variables
table(abcd$premature, abcd$premature_wks)

##            
##               1   2   3   4   5   6   7   8   9  10  11  12  13
##   Full Term   0   0   0   0   0   0   0   0   0   0   0   0   0
##   Premature  55 163 242 411 170 167  68 123  27  26   4  10  30

# NOTE: I found out, from private correspondance with one of the ABCD PIs and data curators that premature here means anything under 40 weeks (and not the classic definition of premature/preterm as <37 weeks)

# relabel cases who are premature by just 1/2 weeks (i.e. born at 38 or 39 weeks) as Full-Term. Create new var to do so.
abcd$premature.37orless <- abcd$premature
rows <- which((abcd$premature_wks==1) | (abcd$premature_wks==2))
abcd$premature.37orless[rows] = "Full Term"

table(abcd$premature.37orless, abcd$premature_wks) # check

##            
##               1   2   3   4   5   6   7   8   9  10  11  12  13
##   Full Term  55 163   0   0   0   0   0   0   0   0   0   0   0
##   Premature   0   0 242 411 170 167  68 123  27  26   4  10  30

# make a factor var of premature_wks
abcd$premature.cat <- cut(abcd$premature_wks, c(-Inf,2,3,4,7,Inf), 
                     labels = c("1/2 wks early", "3 wks early (37w)", "4 wks early (36w)", "5-7 wks early (33-35w)", "8+ wks early (<=32w)"))
summary(abcd$premature.cat)

##          1/2 wks early      3 wks early (37w)      4 wks early (36w) 
##                    218                    242                    411 
## 5-7 wks early (33-35w)   8+ wks early (<=32w)                   NA's 
##                    405                    220                   8491

# make another 'prematurity in weeks' var which calls any "on time" = 0.
   # note: its flawed in the sense its winsorized/capped at values 0 and 13.
abcd$prem_wks <- abcd$premature_wks
abcd$prem_wks[abcd$premature=="Full Term"] = 0

# now convert this to actual gestational age (0=40 "full term")
abcd$ga_wks <- 40-abcd$prem_wks

table(abcd$ga_wks, abcd$prem_wks)

##     
##         0    1    2    3    4    5    6    7    8    9   10   11   12   13
##   27    0    0    0    0    0    0    0    0    0    0    0    0    0   30
##   28    0    0    0    0    0    0    0    0    0    0    0    0   10    0
##   29    0    0    0    0    0    0    0    0    0    0    0    4    0    0
##   30    0    0    0    0    0    0    0    0    0    0   26    0    0    0
##   31    0    0    0    0    0    0    0    0    0   27    0    0    0    0
##   32    0    0    0    0    0    0    0    0  123    0    0    0    0    0
##   33    0    0    0    0    0    0    0   68    0    0    0    0    0    0
##   34    0    0    0    0    0    0  167    0    0    0    0    0    0    0
##   35    0    0    0    0    0  170    0    0    0    0    0    0    0    0
##   36    0    0    0    0  411    0    0    0    0    0    0    0    0    0
##   37    0    0    0  242    0    0    0    0    0    0    0    0    0    0
##   38    0    0  163    0    0    0    0    0    0    0    0    0    0    0
##   39    0   55    0    0    0    0    0    0    0    0    0    0    0    0
##   40 8344    0    0    0    0    0    0    0    0    0    0    0    0    0

ggplot(data = abcd) + 
   geom_point(aes(x = prem_wks, y = bw_kgs, colour=prem_wks)) +
   labs(y = "BW (kgs)", x = "Prematurity (weeks)") +
   theme_minimal()

## Warning: Removed 523 rows containing missing values (geom_point).

# **outlier observed: preterm baby born at 6kg unlikely/extreme outlier. Remove.
row <- which(abcd$bw_kgs>6 & abcd$prem_wks>5)
abcd1 <- abcd[-c(row),]

# when you run subset() you lose var labels, get them back from the source with copy_labels
abcd1 <- sjlabelled::copy_labels(abcd1, abcd)
abcd <- abcd1
rm(list=c("abcd1"))

Parent psychiatric history

# get a sum of mental illnesses among the biological parents of the child
abcd$parent_mh_n <- rowSums(cbind(abcd$fam1, abcd$fam2))
summary(abcd$parent_mh_n)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.676   3.000  18.000

# replace cells with missing if the respondant does not know about the biological family of the child (e.g. if adoptive parent). 
rows1 <- which(abcd$knowofbiofam==0)
rows2 <- which(is.na(abcd$knowofbiofam))
rows <- c(rows1, rows2)
abcd$parent_mh_n[rows] <- NA

ggplot(data=abcd)+
   geom_histogram(aes(parent_mh_n), binwidth=1)

## Warning: Removed 113 rows containing non-finite values (stat_bin).

Parent education level

# create "household education" variable via the dominance criterion (household = the highest edu rank available)
   abcd$hsd_ed <- NA
   for (sub in 1:length(abcd$parent1_ed)) {
   if (is.na(abcd$parent2_ed[sub])) {           #if parent2 has no edu avail, select parent1's ed
      abcd$hsd_ed[sub] = abcd$parent1_ed[sub]}
   else if (is.na(abcd$parent1_ed[sub])) {      #if parent1 has no edu avail, select parent2's ed
      abcd$hsd_ed[sub] = abcd$parent2_ed[sub]}
   else if (!is.na(abcd$parent1_ed[sub]) && !is.na(abcd$parent2_ed[sub])) {    #if both avail, select largest
      abcd$hsd_ed[sub] = max(as.numeric(paste(abcd$parent1_ed[sub])), as.numeric(paste(abcd$parent2_ed[sub])))}
   }
   abcd<- move_columns(abcd, hsd_ed, .after = "parent2_ed")
   abcd<- move_columns(abcd, parent1_ed, .before = "parent2_ed")
   
   # make it a factor and re-level hsd education 
   abcd <- abcd %>%
       mutate(
        hsd_ed = factor(hsd_ed))
   
   levels(abcd$hsd_ed) <- list("None/kinder"=0 ,  
                               "Incomplete Schooling"=c(1:12),  
                               "High School Degree/GED"=c(13:14), 
                               "Some college"=15, 
                               "Associate Degree"=c(16,17),  
                               "Primary/Profess Degree"=c(18,20),  
                               "Masters"=19, 
                               "Doctoral" = c(21)) # no hsd had level 0

   # nobody had "none/kinder" as their household max
    abcd$hsd_ed <- droplevels(abcd$hsd_ed) 
    
   # create a numeric version of the variable
   abcd$hsd_ed.n <- as.numeric(abcd$hsd_ed) 
   abcd$hsd_ed.n.z <- scale(abcd$hsd_ed.n, scale=F) # centre so that mean=0, don't alter the unit

   
   # check the numerical variable has correct #s in each "group"
   summary(abcd$hsd_ed)

##   Incomplete Schooling High School Degree/GED           Some college 
##                    511                    981                   1296 
##       Associate Degree Primary/Profess Degree                Masters 
##                   1302                   2963                   2357 
##               Doctoral                   NA's 
##                    561                     15

   summary(as.factor(abcd$hsd_ed.n))

##    1    2    3    4    5    6    7 NA's 
##  511  981 1296 1302 2963 2357  561   15

   ## re-label "some college" as high school/GED only
   ## merge masters and doctoral degrees as postgrad
   ## make sure label notes primary & professional degrees
abcd$hsd_ed5 <- abcd$hsd_ed
levels(abcd$hsd_ed5) <- c("Incomplete Schooling", "High School Degree/GED", "High School Degree/GED"  ,"Associate Degree", "Primary/Professional Degree","Postgrad" ,"Postgrad")
abcd$hsd_ed5 <- droplevels(abcd$hsd_ed5)
abcd$hsd_ed5_n <- as.numeric(abcd$hsd_ed5)
abcd$hsd_ed5.n.z <- scale(abcd$hsd_ed5_n, scale=F) # centre so that mean=0, don't alter the unit

Parent employment & income

# Note: this var was not used in the end
# simplify into employed (1), unemployed (2:11)
   levels(abcd$parent1_emp) <- list("Employed"= c(1,9,10) , "Unemployed"=c(2,3,11,6,7,5,4,8))
   levels(abcd$parent2_emp) <- list("Employed"= c(1,9,10) , "Unemployed"=c(2,3,11,6,7,5,4,8))


# for income, simply make a numeric version of the var
   abcd$income.n <- as.numeric(abcd$combined_income) 
   
   abcd$income.n.z <- scale(abcd$income.n, scale=F) # centre so that mean=0, don't alter the unit
   
   # check that worked
   summary(as.factor(abcd$income.n))

##    1    2    3    4    5    6    7    8    9   10 NA's 
##  360  371  241  449  572  790 1247 1321 2728 1032  875

   summary(abcd$combined_income)

##         < $5k    $5k-11,999   $12k-15,999   $16k-24,999   $25k-34,999 
##           360           371           241           449           572 
##   $35k-49,999   $50k-74,999   $75k-99,999 $100k-199,999       $200k + 
##           790          1247          1321          2728          1032 
##          NA's 
##           875

Parental age

abcd$matage.z <- scale(abcd$matage_birth, scale=F) # demean mum's age
abcd<- move_columns(abcd, matage.z, .after = "matage_birth")

# clean paternal age variable (there are 2 extremely high, improbable, age values, 300+ , replace with NA)
rows <- which(abcd$patage_birth>150)
abcd$patage_birth[rows] <- NA

# "young" less than 20
# "old" 35<

ggplot(data=abcd)+
   geom_histogram(aes(x=matage_birth), fill="darkblue", colour="blue", binwidth=1)

## Warning: Removed 227 rows containing non-finite values (stat_bin).

ggplot(data=abcd)+
   geom_histogram(aes(x=patage_birth), fill="darkblue", colour="blue", binwidth=1)

## Warning: Removed 558 rows containing non-finite values (stat_bin).

# create "years over 35", "years under 20"
# mother
abcd$mumold <- abcd$matage_birth-35
abcd$mumold[abcd$mumold<0] <- 0

abcd$mumyoung <- abcd$matage_birth-20
abcd$mumyoung[abcd$mumyoung>0] <- 0
abcd$mumyoung <- abs(abcd$mumyoung)

# father
abcd$dadold <- abcd$patage_birth-35
abcd$dadold[abcd$dadold<0] <- 0

abcd$dadyoung <- abcd$patage_birth-20
abcd$dadyoung[abcd$dadyoung>0] <- 0
abcd$dadyoung <- abs(abcd$dadyoung)

Maternal substance-use during pregnancy

Smoking

# create var for "smoked at all during preg"
abcd$smokedpreg <- NA
for (i in 1:length(abcd$devhx_8_tobacco)) 
   {if ( is.na(abcd$devhx_8_tobacco[i]) && is.na(abcd$devhx_9_tobacco[i]) ) {
      abcd$smokedpreg[i] = NA
   }else if (is.na(abcd$devhx_8_tobacco[i]) && abcd$devhx_9_tobacco[i]==0 ) {
      abcd$smokedpreg[i] = 1
   }else if (abcd$devhx_8_tobacco[i]==0 && is.na(abcd$devhx_9_tobacco[i]) ) {
      abcd$smokedpreg[i] = 2
   }else if (is.na(abcd$devhx_8_tobacco[i]) && abcd$devhx_9_tobacco[i]==1 ) {
      abcd$smokedpreg[i] = 3
   }else if (abcd$devhx_8_tobacco[i]==1 && is.na(abcd$devhx_9_tobacco[i]) ) {
      abcd$smokedpreg[i] = 4
   }else if (abcd$devhx_8_tobacco[i]==0 && abcd$devhx_9_tobacco[i]==1 ) {
      abcd$smokedpreg[i] = 5
   }else if (abcd$devhx_8_tobacco[i]==1 && abcd$devhx_9_tobacco[i]==0 ) {
      abcd$smokedpreg[i] = 6
   }else if ((abcd$devhx_8_tobacco[i]==1) && (abcd$devhx_9_tobacco[i]==1) ) {
      abcd$smokedpreg[i] = 7
   }else if (abcd$devhx_8_tobacco[i]==0 && abcd$devhx_9_tobacco[i]==0 ) {
      abcd$smokedpreg[i] = 0}}

abcd<- move_columns(abcd, devhx_9_tobacco, .after = "devhx_8_tobacco")
abcd<- move_columns(abcd, smokedpreg, .after = "devhx_9_tobacco")

abcd$smokedpreg <- factor(abcd$smokedpreg,
          levels = c(0,1,2,3,4,5,6,7),
          labels = c("no, never" ,                         # clear no
                     "not after pregtest-before unknown",  # NA
                     "not before pregtest-after unknown",  # NA
                     "yes after pregtest, before unknown", # yes
                     "yes beforepregtest, after unknown",  # yes
                     "no before pregtest, yes after",      # yes
                     "yes before pregtest, not after",     # yes
                     "yes, continuously"))                 # yes

summary(abcd$smokedpreg)

##                          no, never  not after pregtest-before unknown 
##                               8333                                 60 
##  not before pregtest-after unknown yes after pregtest, before unknown 
##                                 13                                  5 
##  yes beforepregtest, after unknown      no before pregtest, yes after 
##                                 32                                  6 
##     yes before pregtest, not after                  yes, continuously 
##                                833                                520 
##                               NA's 
##                                184

# replace some levels with NAs
levels(abcd$smokedpreg)[levels(abcd$smokedpreg)=="not after pregtest-before unknown"] <- NA
levels(abcd$smokedpreg)[levels(abcd$smokedpreg)=="not before pregtest-after unknown"] <- NA

levels(abcd$smokedpreg) <- c("No", "Yes", "Yes", "Yes", "Yes", "Yes")
summary(abcd$smokedpreg)

##   No  Yes NA's 
## 8333 1396  257

Drinking

abcd$drankpreg <- NA
for (i in 1:length(abcd$sex)) 
   {if ( is.na(abcd$devhx_8_alcohol[i]) && is.na(abcd$devhx_9_alcohol[i]) ) {
      abcd$drankpreg[i] = NA
   }else if (is.na(abcd$devhx_8_alcohol[i]) && abcd$devhx_9_alcohol[i]==0 ) {
      abcd$drankpreg[i] = 1
   }else if (abcd$devhx_8_alcohol[i]==0 && is.na(abcd$devhx_9_alcohol[i]) ) {
      abcd$drankpreg[i] = 2
   }else if (is.na(abcd$devhx_8_alcohol[i]) && abcd$devhx_9_alcohol[i]==1 ) {
      abcd$drankpreg[i] = 3
   }else if (abcd$devhx_8_alcohol[i]==1 && is.na(abcd$devhx_9_alcohol[i]) ) {
      abcd$drankpreg[i] = 4
   }else if (abcd$devhx_8_alcohol[i]==0 && abcd$devhx_9_alcohol[i]==1 ) {
      abcd$drankpreg[i] = 5
   }else if (abcd$devhx_8_alcohol[i]==1 && abcd$devhx_9_alcohol[i]==0 ) {
      abcd$drankpreg[i] = 6
   }else if ((abcd$devhx_8_alcohol[i]==1) && (abcd$devhx_9_alcohol[i]==1) ) {
      abcd$drankpreg[i] = 7
   }else if (abcd$devhx_8_alcohol[i]==0 && abcd$devhx_9_alcohol[i]==0 ) {
      abcd$drankpreg[i] = 0}}

abcd$drankpreg <- factor(abcd$drankpreg,
          levels = c(0,1,2,3,4,5,6,7),
          labels = c("no, never" ,                         # 0.easy no
                     "not after pregtest-before unknown",  # 1. NA
                     "not before pregtest-after unknown",  # 2. NA 
                     "yes after pregtest, before unknown", # 3. yes*
                     "yes beforepregtest, after unknown",  # 4. yes*
                     "no before pregtest, yes after",      # 5. yes
                     "yes before pregtest, not after",     # 6. yes
                     "yes, continuously"))                 # 7. yes

summary(abcd$drankpreg)

##                          no, never  not after pregtest-before unknown 
##                               6847                                370 
##  not before pregtest-after unknown yes after pregtest, before unknown 
##                                  8                                 11 
##  yes beforepregtest, after unknown      no before pregtest, yes after 
##                                 39                                 24 
##     yes before pregtest, not after                  yes, continuously 
##                               2232                                247 
##                               NA's 
##                                208

# a "yes" at any time (e.g. only before she knew she was pregnancy, etc) get's you a "yes" to the variable drank in pregnancy


# unknown (potential) drinking level-- mark as NA
levels(abcd$drankpreg)[levels(abcd$drankpreg)=="not after pregtest-before unknown"] <- NA
levels(abcd$drankpreg)[levels(abcd$drankpreg)=="not before pregtest-after unknown"] <- NA

levels(abcd$drankpreg) <- c("No", "Yes", "Yes", "Yes", "Yes", "Yes")

summary(abcd$drankpreg)

##   No  Yes NA's 
## 6847 2553  586

abcd<- move_columns(abcd, devhx_9_alcohol, .after = "devhx_8_alcohol")
abcd<- move_columns(abcd, drankpreg, .after = "devhx_9_alcohol")

Drugs

# illegal
abcd$preg_marijuana <- NA
abcd$preg_coc_crack <- NA
abcd$preg_morph_her <- NA
abcd$preg_oxy <- NA
abcd$preg_other<- NA

# simplify time-based data to "<drugX> any time in preg" 
for (i in 1:length(abcd$src_subject_id)) {
   # MARIJUANA
   if (is.na(abcd$devhx_8_marijuana[i]) && is.na(abcd$devhx_9_marijuana[i])) {
         abcd$preg_marijuana[i] <- NA
   } else {
            abcd$preg_marijuana[i] <- colSums(rbind(abcd$devhx_8_marijuana[i], abcd$devhx_9_marijuana[i]))}
   # COCAINE
   if (is.na(abcd$devhx_8_coc_crack[i]) && is.na(abcd$devhx_9_coc_crack[i])) {
         abcd$preg_coc_crack[i] <- NA
   } else {
            abcd$preg_coc_crack[i] <- colSums(rbind(abcd$devhx_8_coc_crack[i], abcd$devhx_9_coc_crack[i]))}
   # MORPHINE
   if (is.na(abcd$devhx_8_her_morph[i]) && is.na(abcd$devhx_9_her_morph[i])) {
         abcd$preg_morph_her[i] <- NA
   } else {
            abcd$preg_morph_her[i] <- colSums(rbind(abcd$devhx_8_her_morph[i], abcd$devhx_9_her_morph[i]))}
   # OXYCONTIN
   if (is.na(abcd$devhx_8_oxycont[i]) && is.na(abcd$devhx_9_oxycont[i])) {
         abcd$preg_oxy[i] <- NA
   } else {
            abcd$preg_oxy[i] <- colSums(rbind(abcd$devhx_8_oxycont[i], abcd$devhx_9_oxycont[i]))}
   # OTHER
   if (is.na(abcd$devhx_8_other_drugs[i]) && is.na(abcd$devhx_9_other_drugs[i])) {
         abcd$preg_other[i] <- NA
   } else {
            abcd$preg_other[i] <- colSums(rbind(abcd$devhx_8_other_drugs[i], abcd$devhx_9_other_drugs[i]))}
}

# make double instances of substance-use (before & during known preg) into just "yes"
abcd$preg_marijuana[abcd$preg_marijuana==2] <- 1
abcd$preg_coc_crack[abcd$preg_coc_crack==2] <- 1
abcd$preg_morph_her[abcd$preg_morph_her==2] <- 1
abcd$preg_oxy[abcd$preg_oxy==2] <- 1
abcd$preg_other[abcd$preg_other==2] <- 1

# make count variable of drugs used
abcd$preg_numdrugs <- NA
for (i in 1:length(abcd$src_subject_id)) {
   if (is.na(abcd$preg_marijuana[i]) & 
       is.na(abcd$preg_coc_crack[i]) & 
       is.na(abcd$preg_morph_her[i]) & 
       is.na(abcd$preg_oxy[i]) & 
       is.na(abcd$preg_other[i])) {
      abcd$preg_numdrugs[i] <- NA
   }
   else {   
   abcd$preg_numdrugs[i] <- colSums(
            rbind(
               abcd$preg_marijuana[i] ,
               abcd$preg_coc_crack[i] ,
               abcd$preg_morph_her[i] ,
               abcd$preg_oxy[i] ,
               abcd$preg_other[i]),na.rm = T)}
}

# Make a sum var for (any drug other than marijuana)
abcd$drug_notweed <- NA
for (i in 1:length(abcd$src_subject_id)) {
   if (is.na(abcd$preg_coc_crack[i]) & 
       is.na(abcd$preg_morph_her[i]) & 
       is.na(abcd$preg_oxy[i]) & 
       is.na(abcd$preg_other[i])) {
   abcd$drug_notweed[i] <- NA
   }
   else {   
   abcd$drug_notweed[i] <- colSums(
            rbind(
               abcd$preg_coc_crack[i] ,
               abcd$preg_morph_her[i] ,
               abcd$preg_oxy[i] ,
               abcd$preg_other[i]),na.rm = T)}
}
abcd$drug_notweed[abcd$drug_notweed>1] <- 1

abcd <- abcd %>%
    mutate(
     preg_marijuana=factor(preg_marijuana,
          levels = c(0,1),
          labels = c("No", "Yes")),
     preg_coc_crack=factor(preg_coc_crack,
          levels = c(0,1),
          labels = c("No", "Yes")),
     preg_morph_her=factor(preg_morph_her,
          levels = c(0,1),
          labels = c("No", "Yes")),
     preg_oxy=factor(preg_oxy,
          levels = c(0,1),
          labels = c("No", "Yes")),
     preg_other=factor(preg_other,
          levels = c(0,1),
          labels = c("No", "Yes")))
     
# make binary var
abcd$drugspreg<-NA
for (i in 1:length(abcd$devhx_8_marijuana)) {
   if (is.na(abcd$devhx_8_marijuana[i]) &
       is.na(abcd$devhx_9_marijuana[i]) &
       is.na(abcd$devhx_8_coc_crack[i]) &
       is.na(abcd$devhx_9_coc_crack[i]) &
       is.na(abcd$devhx_8_her_morph[i]) &
       is.na(abcd$devhx_9_her_morph[i]) &
       is.na(abcd$devhx_8_oxycont[i]) &
       is.na(abcd$devhx_9_oxycont[i]) &
       is.na(abcd$devhx_8_other_drugs[i]) &
       is.na(abcd$devhx_9_other_drugs[i])) {
      abcd$drugspreg[i] <- NA}
   else {abcd$drugspreg[i] <-colSums(rbind(
               abcd$devhx_8_marijuana[i] ,
               abcd$devhx_9_marijuana[i] ,
               abcd$devhx_8_coc_crack[i] ,
               abcd$devhx_9_coc_crack[i] ,
               abcd$devhx_8_her_morph[i] ,
               abcd$devhx_9_her_morph[i] ,
               abcd$devhx_8_oxycont[i] ,
               abcd$devhx_9_oxycont[i] ,
               abcd$devhx_8_other_drugs[i] ,
               abcd$devhx_9_other_drugs[i]),na.rm = T)}}

# simplify the var into "took illegal drugs *ever* during pregnancy Y/N"
for (i in 1:length(abcd$drugspreg)) {
if (!is.na(abcd$drugspreg[i]) & abcd$drugspreg[i] >= 1)  
   {abcd$drugspreg[i] <- 1}}
#the problem with doing 1,2+ categories here is that it could mean cocaine when they both did and didnt know of preg, or it could mean cocaine not knowing of preg, marijuana knowing of.

abcd<- move_columns(abcd, drugspreg, .before = "devhx_8_marijuana")

# make factor of it
abcd$drugspreg <- factor(abcd$drugspreg,
          levels = c(0, 1),
          labels = c("No", "Yes"))

# plot maternal smoking
g1 <- ggplot(data = abcd, aes(x = smokedpreg, fill = smokedpreg)) + 
            geom_bar() +
            scale_fill_brewer(palette = "Set3") +
            labs(y = "Count", x = "Gestational Smoking") +
            theme_classic() +
            theme(legend.position ="none")+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3)

# plot maternal drinking
g2<- ggplot(data = abcd, aes(x = drankpreg, fill = drankpreg)) + 
            geom_bar() +
            scale_fill_brewer(palette = "Set3") +
            labs(y = "Count", x = "Gestational Alcohol") +
            theme_classic() +
            theme(legend.position ="none")+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3)

# plot maternal drug use
g3<-ggplot(data = abcd, aes(x = drugspreg, fill = drugspreg)) + 
            geom_bar() +
            scale_fill_brewer(palette = "Set3") +
            labs(y = "Count", x = "Gestational Illegal Drugs") +
            theme_classic() +
            theme(axis.text.x = element_text(angle = 90), legend.position ="none")+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3)

ggarrange(g1, g2, g3, nrow=2, ncol=2)

options(qwraps2_markup = "markdown")

stats <-  
  list("Marijuana"=
         list("Yes"= ~ n_perc(preg_marijuana=="Yes" , na_rm=TRUE)),
       "Cocaine"=
         list("Yes"= ~ n_perc(preg_coc_crack=="Yes" , na_rm=TRUE)),
       "Oxycontin"=
         list("Yes"= ~ n_perc(preg_oxy=="Yes" , na_rm=TRUE)),
       "Morphine/Heroin"=
         list("Yes"= ~ n_perc(preg_morph_her=="Yes" , na_rm=TRUE)),
       "Other drug"=
         list("Yes"= ~ n_perc(preg_other=="Yes" , na_rm=TRUE))
      )

desc <- qwraps2::summary_table(abcd, stats)
print(desc)

## 
## 
## |                    |abcd (N = 9,986)  |
## |:-------------------|:-----------------|
## |**Marijuana**       |&nbsp;&nbsp;      |
## |&nbsp;&nbsp; Yes    |585/9,653 (6.06%) |
## |**Cocaine**         |&nbsp;&nbsp;      |
## |&nbsp;&nbsp; Yes    |68/9,712 (0.70%)  |
## |**Oxycontin**       |&nbsp;&nbsp;      |
## |&nbsp;&nbsp; Yes    |32/9,695 (0.33%)  |
## |**Morphine/Heroin** |&nbsp;&nbsp;      |
## |&nbsp;&nbsp; Yes    |20/9,709 (0.21%)  |
## |**Other drug**      |&nbsp;&nbsp;      |
## |&nbsp;&nbsp; Yes    |91/9,615 (0.95%)  |

# 'other' drugs: Amphetamines or methamphetamine (meth); Barbituates; Benzodiazepines; Caffeine; Cathinones (bath salts; Fake or synthetic marijuana (like spice or K2); GHB (liquid G or Georgia home boy); Hallucinogens (LSD or acid; Inhalants; Ketamine (special K); MDMA (ecstasy); Opioids; Other; Don't Know

Medication

# prescription meds (note, what this var represents is "took meds at *any* point in preg")
abcd$medspreg <- NA
for (i in 1:length(abcd$preg1_presc)) 
   {if ( is.na(abcd$preg1_presc[i]) &  is.na(abcd$preg2_presc[i]) ) {
      abcd$medspreg[i] = NA
   }else if (is.na(abcd$preg1_presc[i]) & abcd$preg2_presc[i]==0 ) {
      abcd$medspreg[i] = NA
   }else if (abcd$preg1_presc[i]==0 & is.na(abcd$preg2_presc[i])) {
      abcd$medspreg[i] = NA
   }else if (is.na(abcd$preg1_presc[i]) & abcd$preg2_presc[i]==1 ) {
      abcd$medspreg[i] = 1
   }else if (abcd$preg1_presc[i]==1 & is.na(abcd$preg2_presc[i])) {
      abcd$medspreg[i] = 1
   }else if (!is.na(abcd$preg1_presc[i]) & is.na(abcd$preg2_presc[i]) ) {
      abcd$medspreg[i] = abcd$preg1_presc[i]
   }else if ((abcd$preg1_presc[i]==1) | (abcd$preg2_presc[i]==1) ) {
      abcd$medspreg[i] = 1
   }else {abcd$medspreg[i]= 0}}

abcd<- move_columns(abcd, medspreg, .after = "preg2_presc")
abcd<- move_columns(abcd, preg1_presc_n, .after = "medspreg")

# make factor of it
abcd$medspreg <- factor(abcd$medspreg,
          levels = c(0, 1),
          labels = c("No", "Yes"))

ggplot(data = abcd, aes(x = medspreg, fill = medspreg)) + 
            geom_bar() +
            scale_fill_brewer(palette = "Set3") +
            labs(y = "Count", x = "Gestational Prescription Meds") +
            theme_classic() +
            theme(axis.text.x = element_text(angle = 90), legend.position ="none")+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3)

Obstetric complications

in pregnancy

options(qwraps2_markup = "markdown")

stats <-  
  list("1. gest_nausea"=
         list("Yes"= ~ n_perc(gest_nausea==1 , na_rm=TRUE)),
       "2. gest_hibloodpres"=
         list("Yes"= ~ n_perc(gest_hibloodpres==1 , na_rm=TRUE)),
       "3. gest_uti"=
         list("Yes"= ~ n_perc(gest_uti==1 , na_rm=TRUE)),
       "4. gest_proteinuria"=
         list("Yes"= ~ n_perc(gest_proteinuria==1 , na_rm=TRUE)),
       "5. gest_bleed"=
         list("Yes"= ~ n_perc(gest_bleed==1 , na_rm=TRUE)),
       "6. gest_preeclampsia"=
         list("Yes"= ~ n_perc(gest_preeclampsia==1 , na_rm=TRUE)),
       "7. gest_anemia"=
         list("Yes"= ~ n_perc(gest_anemia==1 , na_rm=TRUE)),
       "8. gest_rubella"=
         list("Yes"= ~ n_perc(gest_rubella==1 , na_rm=TRUE)),
       "9. gest_gallbladder"=
         list("Yes"= ~ n_perc(gest_gallbladder==1 , na_rm=TRUE)),
       "10. gest_placenta"=
         list("Yes"= ~ n_perc(gest_placenta==1 , na_rm=TRUE)),
       "11. gest_diabetes"=
         list("Yes"= ~ n_perc(gest_diabetes==1 , na_rm=TRUE)),
       "12. gest_accident"=
         list("Yes"= ~ n_perc(gest_accident==1 , na_rm=TRUE)),
       "13. gest_other"=
         list("Yes"= ~ n_perc(gest_other==1 , na_rm=TRUE))
      )

                                    
desc <- qwraps2::summary_table(abcd, stats)
print(desc)

## 
## 
## |                         |abcd (N = 9,986)     |
## |:------------------------|:--------------------|
## |**1. gest_nausea**       |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |1,354/9,642 (14.04%) |
## |**2. gest_hibloodpres**  |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |916/9,631 (9.51%)    |
## |**3. gest_uti**          |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |728/9,575 (7.60%)    |
## |**4. gest_proteinuria**  |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |47/9,624 (0.49%)     |
## |**5. gest_bleed**        |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |391/9,698 (4.03%)    |
## |**6. gest_preeclampsia** |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |672/9,641 (6.97%)    |
## |**7. gest_anemia**       |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |396/9,647 (4.10%)    |
## |**8. gest_rubella**      |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |12/9,703 (0.12%)     |
## |**9. gest_gallbladder**  |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |114/9,686 (1.18%)    |
## |**10. gest_placenta**    |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |283/9,669 (2.93%)    |
## |**11. gest_diabetes**    |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |679/9,658 (7.03%)    |
## |**12. gest_accident**    |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |188/9,702 (1.94%)    |
## |**13. gest_other**       |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes         |759/9,679 (7.84%)    |

# as proteinurea rates are so low (<1%), merge pre-eclampsia & proteinuria
abcd$preeclamp_protein <- rowSums(cbind(abcd$gest_preeclampsia, abcd$gest_proteinuria), na.rm=T)
abcd$preeclamp_protein[is.na(abcd$gest_preeclampsia) & is.na(abcd$gest_proteinuria)] <- NA
abcd$preeclamp_protein[abcd$preeclamp_protein>1] <- 1

for (i in 1:length(abcd$sex)) {
   if (is.na(abcd$gest_nausea[i]) &&
       is.na(abcd$gest_hibloodpres[i]) &&
       is.na(abcd$gest_uti[i]) &&
       is.na(abcd$preeclamp_protein[i]) &&
       is.na(abcd$gest_bleed[i]) &&
       is.na(abcd$gest_diabetes[i]) &&
       is.na(abcd$gest_anemia[i]) &&
       is.na(abcd$gest_rubella[i]) &&
       is.na(abcd$gest_placenta[i]) &&
       is.na(abcd$gest_gallbladder[i]) &&
       is.na(abcd$gest_accident[i]) &&
       is.na(abcd$gest_other[i])) {
      abcd$gest_anyOG_n[i] <- NA
   }
   else {
      abcd$gest_anyOG_n[i] <-
         colSums(
            rbind(
               abcd$gest_nausea[i],
               abcd$gest_hibloodpres[i],
               abcd$gest_uti[i],
               abcd$preeclamp_protein[i],
               abcd$gest_bleed[i],
               abcd$gest_diabetes[i],
               abcd$gest_anemia[i],
               abcd$gest_rubella[i],
               abcd$gest_placenta[i],
               abcd$gest_gallbladder[i],
               abcd$gest_accident[i],
               abcd$gest_other[i]
            ),
            na.rm = T
         )
   }
}


abcd<- move_columns(abcd, gest_anyOG_n, .after = "gest_other")

summary(abcd$gest_anyOG_n)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.6675  1.0000 12.0000     233

table(abcd$gest_anyOG_n)

## 
##    0    1    2    3    4    5    6    7    8   10   11   12 
## 5658 2576  986  329  132   45   10    7    1    1    1    7

at the delivery/birth

# birthcomp_blue, birthcomp_slowheart, birthcomp_nobreathe, birthcomp_convuls, birthcomp_jaundice, birthcomp_oxygen, birthcomp_blood, birthcomp_rhincomp,

options(qwraps2_markup = "markdown")

stats <-  
  list("1. birthcomp_jaundice"=
         list("Yes"= ~ n_perc(birthcomp_jaundice==1 , na_rm=TRUE)),
       "2. birthcomp_oxygen"=
         list("Yes"= ~ n_perc(birthcomp_oxygen==1 , na_rm=TRUE)),
       "3. birthcomp_blue"=
         list("Yes"= ~ n_perc(birthcomp_blue==1 , na_rm=TRUE)),
       "4. birthcomp_slowheart"=
         list("Yes"= ~ n_perc(birthcomp_slowheart==1 , na_rm=TRUE)),
       "5. birthcomp_nobreathe"=
         list("Yes"= ~ n_perc(birthcomp_nobreathe==1 , na_rm=TRUE)),
       "6. birthcomp_rhincomp"=
         list("Yes"= ~ n_perc(birthcomp_rhincomp==1 , na_rm=TRUE)),
       "7. birthcomp_blood"=
         list("Yes"= ~ n_perc(birthcomp_blood==1 , na_rm=TRUE)),
       "8. birthcomp_convuls"=
         list("Yes"= ~ n_perc(birthcomp_convuls==1 , na_rm=TRUE))
      )
                                    
desc <- qwraps2::summary_table(abcd, stats)
print(desc)

## 
## 
## |                           |abcd (N = 9,986)     |
## |:--------------------------|:--------------------|
## |**1. birthcomp_jaundice**  |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |1,528/9,739 (15.69%) |
## |**2. birthcomp_oxygen**    |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |800/9,734 (8.22%)    |
## |**3. birthcomp_blue**      |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |302/9,714 (3.11%)    |
## |**4. birthcomp_slowheart** |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |273/9,697 (2.82%)    |
## |**5. birthcomp_nobreathe** |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |443/9,730 (4.55%)    |
## |**6. birthcomp_rhincomp**  |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |243/9,639 (2.52%)    |
## |**7. birthcomp_blood**     |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |37/9,805 (0.38%)     |
## |**8. birthcomp_convuls**   |&nbsp;&nbsp;         |
## |&nbsp;&nbsp; Yes           |13/9,795 (0.13%)     |

#birthcomp_blue, birthcomp_slowheart, birthcomp_nobreathe, birthcomp_convuls, birthcomp_jaundice, birthcomp_oxygen, birthcomp_blood, birthcomp_rhincomp,

for (i in 1:length(abcd$sex)) {
   if (is.na(abcd$birthcomp_blue[i]) &&
       is.na(abcd$birthcomp_slowheart[i]) &&
       is.na(abcd$birthcomp_nobreathe[i]) &&
       is.na(abcd$birthcomp_convuls[i]) &&
       is.na(abcd$birthcomp_jaundice[i]) &&
       is.na(abcd$birthcomp_oxygen[i]) &&
       is.na(abcd$birthcomp_blood[i]) &&
       is.na(abcd$birthcomp_rhincomp[i])) 
    {abcd$birthcomp_any_n[i] <- NA}
   else {
      abcd$birthcomp_any_n[i] <-
         colSums(
            rbind(
               abcd$birthcomp_blue[i],
               abcd$birthcomp_slowheart[i],
               abcd$birthcomp_nobreathe[i],
               abcd$birthcomp_convuls[i],
               abcd$birthcomp_jaundice[i],
               abcd$birthcomp_oxygen[i],
               abcd$birthcomp_blood[i],
               abcd$birthcomp_rhincomp[i]),
            na.rm = T)
   }
}

abcd<- move_columns(abcd, birthcomp_any_n, .before = "birthcomp_blue")

summary(abcd$birthcomp_any_n)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.3697  1.0000  8.0000     144

table(abcd$birthcomp_any_n)

## 
##    0    1    2    3    4    5    6    8 
## 7263 1871  476  152   52   20    6    2

ind1 = which(names(abcd)=="birthcomp_blue")
ind2 = which(names(abcd)=="birthcomp_rhincomp")

# there are 13 types of gestational complication

tidypreg <- abcd[,ind1:ind2] %>%
  gather(key = "gest", value = "YesNo")

tidypreg$gest <- ordered(tidypreg$gest,
                         levels = c("birthcomp_jaundice","birthcomp_oxygen","birthcomp_nobreathe","birthcomp_blue", "birthcomp_slowheart" ,   "birthcomp_blood", "birthcomp_rhincomp",  "birthcomp_convuls"))

ggplot(tidypreg, aes(x=gest, fill=as.factor(YesNo)))+
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

Early life milestones

# firstyear_fever (# days in yr1), firstyear_ill (# days in yr1), breastfed_nmths (# of mths child was breastfed), age_roll (in mths), age_sit, age_walk, age_word, motordev, speechdev,

# replace any value for firstyear_fever > 365 as NA (there was one = 999)
abcd$firstyear_fever[abcd$firstyear_fever>365] <- NA 

# replace age_roll, age_sit, age_walk & age_word > 100 (thats ~8 years old) with NA (v high values here, could be data mis-entry or severe developmental delays)
abcd$age_roll[abcd$age_roll>100] <- NA
abcd$age_sit[abcd$age_sit>100] <- NA
abcd$age_walk[abcd$age_walk>100] <- NA
abcd$age_word[abcd$age_word>100] <- NA

# recode motor & speech dev string levels as about average=0, somewhat earlier/later ±1, much earlier/later ±2.

# recode motordev
levels(abcd$motordev)[levels(abcd$motordev)=="about average"] <-     0
levels(abcd$motordev)[levels(abcd$motordev)=="somewhat earlier"] <- -1
levels(abcd$motordev)[levels(abcd$motordev)=="much earlier"] <-     -2
levels(abcd$motordev)[levels(abcd$motordev)=="somewhat later"] <-    1
levels(abcd$motordev)[levels(abcd$motordev)=="much later"] <-        2

# recode speechdec
levels(abcd$speechdev)[levels(abcd$speechdev)=="about average"] <-     0
levels(abcd$speechdev)[levels(abcd$speechdev)=="somewhat earlier"] <- -1
levels(abcd$speechdev)[levels(abcd$speechdev)=="much earlier"] <-     -2
levels(abcd$speechdev)[levels(abcd$speechdev)=="somewhat later"] <-    1
levels(abcd$speechdev)[levels(abcd$speechdev)=="much later"] <-        2

roll <- ggplot(data = abcd, aes(x = age_roll)) + 
            geom_bar(fill = "darkseagreen") +
            labs(y = "Density", x = "age first rolled (months)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

sit <- ggplot(data = abcd, aes(x = age_sit)) + 
            geom_bar(fill = "cornsilk2") +
            labs(y = "Density", x = "age first sat (months)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

walk <- ggplot(data = abcd, aes(x = age_walk)) + 
            geom_bar(fill = "darkslategray3") +
            labs(y = "Density", x = "age first walked (months)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

talk <- ggplot(data = abcd, aes(x = age_word)) + 
            geom_bar(fill = "pink3") +
            labs(y = "Density", x = "age first word (months)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

ggarrange(roll, sit, walk, talk, nrow=2, ncol=2)

# despite these being interesting vars, there are many (n ~ 1000) missing values for them. Same for gest_ndocvisits unfortunately.

# let's go with subjective rating of whether motor and speech dev was average early or late
motor <- ggplot(data = abcd, aes(x = motordev)) + 
            geom_bar(fill = "darkslategray3") +
            labs(y = "Density", x = "Motor Development\n0=Average, negative=earlier (advanced),\n positive=later (delayed)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

speech <- ggplot(data = abcd, aes(x = speechdev)) + 
            geom_bar(fill = "pink3") +
            labs(y = "Density", x = "Speech Development\n0=Average, negative=earlier (advanced),\n positive=later (delayed)") +
            theme_classic()+
            geom_text(stat="count", aes(label=..count..), vjust=-0.3, size=2)

ggarrange(motor, speech, nrow=1, ncol=2)

# Item wording: 
# - Would you say his/her motor development (sitting, crawling, walking) was earlier, average, or later than most other children? 
# - Would you say his/her speech development was earlier, average,  or later than most other children?

CBCL scores

Parent’s attention problems

ggplot(data=abcd) +
   geom_histogram((aes(asr_scr_attention_r)), binwidth=1, colour="black")

## Warning: Removed 7 rows containing non-finite values (stat_bin).

# compare parent and child attention problems (split by child's sex)
ggplot(data=abcd, aes(asr_scr_attention_r, syn_attention_r, colour=sex)) +
   geom_smooth(method="lm", formula= y~poly(x,2))

## Warning: Removed 14 rows containing non-finite values (stat_smooth).

Child’s attention problems

ggplot(data=abcd)+
   geom_histogram(aes(syn_attention_r), binwidth =1, 
                  fill="grey50", colour="black")+
   labs(x="",
        title="Cohort1")+
   scale_x_continuous(name="CBCL attention problems", 
                      breaks=c(0:20))+
   geom_vline(xintercept= mean(abcd$syn_attention_r, na.rm=T), 
              colour = "cyan3", linetype="dashed")+
   geom_vline(xintercept= quantile(abcd$syn_attention_r, .80, na.rm=T), 
              colour = "orange", linetype="dashed")+
   geom_vline(xintercept= quantile(abcd$syn_attention_r, .90, na.rm=T), 
              colour = "red", linetype="dashed")+
   geom_vline(xintercept= 9, 
              colour = "red")+
   ylim(c(0,4700))+
   theme_classic()

## Warning: Removed 11 rows containing non-finite values (stat_bin).

# distribution of child's attention problems, view cut-offs of the mean (cyan), 80th percentile (orange), 90th percentile (red dashed), and raw score of 9/20 (red solid)
#table(abcd$syn_attention_r)


# observe relationship between t-scores and raw scores for child attenion problems
# note T >= 65 A CUT-OFF FOR "DEVIANT" (ACHENBACH ET AL., 2003)
# note T > 62 WAS OPTIMAL CUT-OFF SCORE TO DISTINGUIGH ADHD FROM NO ADHD (GOMEZ ET AL., 2019)

ggplot(data=abcd)+
   geom_smooth(aes(x=syn_attention_r, y = syn_attention_t), formula=y~poly(x,2), method="lm")+
   scale_x_continuous(name="CBCL attention problems", 
                      breaks=c(0:20))+
   geom_hline(yintercept= 65,linetype="dashed")+
   geom_hline(yintercept= 62,linetype="dotted", colour="blue")+
   geom_vline(xintercept= 9.6,linetype="dashed")+
   geom_vline(xintercept= 9.2,linetype="dotted", colour="blue")

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

Save data to csv

abcd_labels <- as.data.frame(get_label(abcd)) # save var labels
names(abcd_labels)<- "var_desc"
rownames(abcd_labels) <- c()

write.csv(abcd,"/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/Prediction and explanation of BW-ADHD/data/abcd_clean.csv", row.names = FALSE)

save(abcd, file="/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/Prediction and explanation of BW-ADHD/data/abcd_clean.Rda")

write.csv(abcd_labels, "/Users/niamhdooley/OneDrive - Royal College of Surgeons in Ireland/AAAPhD/Prediction and explanation of BW-ADHD/data/abcd_labels.csv", row.names = F)

Clean ABCD data

Niamh Dooley

imports

General data cleaning

Specific variable cleaning

Family units

Birth weight & gestational age

Parent psychiatric history

Parent education level

Parent employment & income

Parental age

Maternal substance-use during pregnancy

Smoking

Drinking

Drugs

Medication

Obstetric complications

in pregnancy

at the delivery/birth

Early life milestones

CBCL scores

Parent’s attention problems

Child’s attention problems

Save data to csv