This is the first of 3 scripts/programs that I ran to create our analysis. This script corresponds to the flowchart portion of our paper / presentation. The results & code from the other portions of the analysis can be found in the links below.

  1. Data wrangling step
  2. Exploratory data analysis & descriptive statistics
  3. Regression models

Workflow

My workflow before this point was to download the two source files, 2015_2017_FemRespData.dat and 2015_2017_FemPregData.dat, from the NSFG website. Because these are .dat files (and very large), I exported our potential variables of interest from SAS to two CSVs. These CSVs are essentially the raw data from the NSFG website, maintaining the same variable names and formats. These CSVs can be downloaded from GitHub.

In this file, I take the data from these two CSVs (preg.csv and resp.csv) and create the datasets used in our analysis. You can run the code if you like or just download the datasets it produces from my GitHub.

Simplify the CSVs to variables of interest

Read & handle the female response csv

# Female respondent file: keep only the variables of interest and clean up
# the SAS-exported codes/labels.
# If you want to download this data from my GitHub:
# read_csv("https://github.com/HunterRatliff1/PSH6253-MPH-Intro-Biostats/raw/master/Project/data/resp.csv")
resp <- read_csv("~/SASUniversityEdition/myfolders/NSFG/csv/resp.csv") %>%
  select(CASEID, EDUCAT, HISPRACE, BMI, PREGNUM, USUALCAR, HIEDUC) %>%

  mutate(
    # BMIs of 95 and above are coded as missing
    BMI = ifelse(BMI > 94, NA, BMI),

    # Recode some of the default labels from SAS.
    # TRUE/FALSE are spelled out because T and F are ordinary variables
    # that can be reassigned elsewhere in a session.
    USUALCAR = recode(USUALCAR,
                      "No" = FALSE,
                      "Yes" = TRUE,
                      "Don't know" = NA),
    HISPRACE = recode(HISPRACE,
                      "Non-Hispanic Black" = "Black",
                      "Non-Hispanic White" = "White",
                      "Non-Hispanic Other" = "Other")
  ) %>%

  # Make characters factors
  mutate_if(is.character, factor)

Read & handle the pregnancy csv

# Pregnancy file: keep only the variables of interest (renaming a few),
# blank out the "missing" codes, and clean up the SAS-exported labels.
# If you want to download this data from my GitHub:
# read_csv("https://github.com/HunterRatliff1/PSH6253-MPH-Intro-Biostats/raw/master/Project/data/preg.csv")
preg <- read_csv("~/SASUniversityEdition/myfolders/NSFG/csv/preg.csv") %>%
  select(CASEID, KNEWPREG, BGNPRENA, LBW = LBW1,
         OUTCOME, AGECON, PMARPREG, POVERTY, GA = WKSGEST,
         Wantedness = NEWWANTR, TRYSCALE, WANTSCAL,
         wgt = WGT2015_2017, SECU, strata = SEST) %>%

  mutate(
    # Anything above 94 is missing data per codebook
    GA       = ifelse(GA > 94, NA, GA),
    BGNPRENA = ifelse(BGNPRENA > 94, NA, BGNPRENA),
    KNEWPREG = ifelse(KNEWPREG > 94, NA, KNEWPREG),

    # Recode some of the default labels from SAS.
    # TRUE/FALSE are spelled out because T and F are ordinary variables
    # that can be reassigned elsewhere in a session.
    LBW = recode(LBW,
                 "NO, NOT LOW BIRTH WEIGHT" = FALSE,
                 "YES, LOW BIRTH WEIGHT" = TRUE),
    # Lower-case first so the match does not depend on the label's case
    PMARPREG = recode(str_to_lower(PMARPREG),
                      "no" = FALSE,
                      "yes" = TRUE)
  ) %>%

  # A gestational age over 44 weeks isn't biologically reasonable.
  # NOTE: filter() keeps only rows where the condition is TRUE, so rows whose
  # GA is NA (including the >94 codes blanked out above) are dropped here too.
  filter(GA < 45) %>%

  # Make characters factors
  mutate_if(is.character, factor)

Join data

Here’s how many observations were present in the steps leading up to the creation of data0:

# raw number of rows
# Join on CASEID explicitly (the one column selected into both preg and resp)
# instead of relying on left_join()'s implicit natural join, which would
# silently change if a second shared column were ever added.
df <- left_join(preg, resp, by = "CASEID")
nrow(df)
## [1] 9327

# number of obs with live birth
df <- df %>%
  filter(OUTCOME == "LIVE BIRTH")
nrow(df)
## [1] 6688

# Same as above, but with ages 20-40 only
df <- df %>%
  filter(AGECON >= 20, AGECON <= 40)
nrow(df)
## [1] 5089

Now we create data0

# data0: one row per pregnancy that ended in a live birth to a woman aged
# 20-40 at conception, with derived outcome and predictor variables.
data0 <-
  # Explicit key: CASEID is the one column selected into both preg and resp
  left_join(preg, resp, by = "CASEID") %>%
  # must be between 20 - 40 years old and have had a live birth
  filter(AGECON >= 20, AGECON <= 40, OUTCOME == "LIVE BIRTH") %>%

  # OUTCOMES
  mutate(
    # Know if pregnant by 6 weeks
    KnowPreg = factor(if_else(KNEWPREG <= 6, "Yes", "No")),

    # Got prenatal care in first trimester
    gotPNcare = factor(if_else(BGNPRENA < 13, "Yes", "No")),

    # Premature delivery
    PreMe = factor(if_else(GA < 37, "Premature", "Term"))
  ) %>%

  # PREDICTORS
  mutate(
    # include.lowest = TRUE keeps a score of exactly 0 in the first bin;
    # without it cut() uses (0, 5] and turns 0 into NA.
    # (Trying_S / Wanted_S are not carried into the final select() below.)
    Trying_S = cut(TRYSCALE, breaks = c(0, 5, 10),
                   labels = c("No", "Yes"), include.lowest = TRUE),
    Wanted_S = cut(WANTSCAL, breaks = c(0, 2, 7, 10),
                   labels = c("No", "Maybe", "Yes"), include.lowest = TRUE),

    # Logical flag; stays NA where Wantedness is missing
    Wanted   = Wantedness == "Right time",

    # Collapse the detailed education labels into six broader categories
    HIEDUC   = recode(HIEDUC,
                      "9TH GRADE OR LESS" = "<HS",
                      "10TH GRADE" = "<HS", "11TH GRADE" = "<HS",
                      "12TH GRADE, NO DIPLOMA (NOR GED)" = "<HS",
                      "HIGH SCHOOL GRADUATE (DIPLOMA OR GED)" = "HS or GED",
                      "SOME COLLEGE BUT NO DEGREE" = "Some college",
                      "ASSOCIATE DEGREE IN COLLEGE/UNIVERSITY" = "Associates",
                      "BACHELOR'S DEGREE" = "Bachelors",
                      "MASTER'S DEGREE" = "Grad/prof school",
                      "PROFESSIONAL DEGREE" = "Grad/prof school",
                      "DOCTORATE DEGREE" = "Grad/prof school"
                      )
  ) %>%

  # Make characters factors
  mutate_if(is.character, factor) %>%

  select(CASEID,
         # Outcome vars
         LBW, PreMe, gotPNcare, KnowPreg,

         # Covariates, renamed to friendlier names
         age = AGECON, income = POVERTY, YrEdu = EDUCAT, race = HISPRACE,
         BMI, PregNum = PREGNUM,
         eduCat = HIEDUC,
         # PMARPREG, USUALCAR,
         GA, Wanted,
         # Survey design variables, named explicitly so the selection does
         # not depend on column order
         wgt, SECU, strata)
         # everything()

Patterns of missing data

library(naniar)

# Plot missingness across the outcome and covariate columns (LBW through Wanted)
vis_miss(select(data0, LBW:Wanted))

# Upset plot of missingness combinations, with as many sets as there are
# variables containing missing values
data0 %>%
  gg_miss_upset(nsets = n_var_miss(.))

Make two data sets

Make two datasets, BIRTH with birth outcomes and PN with prenatal care outcomes

# BIRTH: birth-outcome variables plus covariates, complete cases only
BIRTH <- na.omit(
  select(data0,
         CASEID, LBW, PreMe,
         GA, BMI, age, income, race, YrEdu, eduCat, Wanted)
)
nrow(BIRTH)

# PN: prenatal-care outcomes (plus the birth outcomes and covariates),
# complete cases only
PN <- na.omit(
  select(data0,
         CASEID, KnowPreg, gotPNcare, LBW, PreMe,
         GA, BMI, PregNum, age, income, race, YrEdu, eduCat, Wanted)
)
nrow(PN)

# Write out for use in other scripts.
# Create the output folder first: write_rds() errors if "data/" does not
# exist (e.g. on a fresh checkout). No-op when the folder is already there.
dir.create("data", showWarnings = FALSE)
write_rds(BIRTH, "data/BIRTH.RDS")
write_rds(PN, "data/PN.RDS")
write_rds(data0, "data/data0.RDS")