This is the first of 3 scripts/programs that I ran to create our analysis. This script corresponds to the flowchart portion of our paper / presentation. The results & code from the other portions of the analysis can be found in the links below.
My workflow before this point was to download the two source files, 2015_2017_FemRespData.dat and 2015_2017_FemPregData.dat, from the NSFG website. Because these are .dat files (and very large), I exported our potential variables of interest from SAS to two CSVs. These CSVs are essentially the raw data from the NSFG website, maintaining the same variable names and formats. These CSVs can be downloaded from GitHub.
In this file, I take the data from these two CSVs (preg.csv and resp.csv) and create the datasets used in our analysis. You can run the code if you like or just download the datasets it produces from my GitHub.
Read & handle the female response csv
# Read the female respondent CSV, keep the columns used later on, and clean
# up a few SAS-exported codes/labels.
# If you want to download this data from my GitHub:
# read_csv("https://github.com/HunterRatliff1/PSH6253-MPH-Intro-Biostats/raw/master/Project/data/resp.csv")
resp <- read_csv("~/SASUniversityEdition/myfolders/NSFG/csv/resp.csv") %>%
  select(CASEID, EDUCAT, HISPRACE, BMI, PREGNUM, USUALCAR, HIEDUC) %>%
  mutate(
    # BMI's 95 and above are coded as missing
    BMI = ifelse(BMI > 94, NA, BMI),
    # Recode some of the default labels from SAS.
    # TRUE/FALSE are spelled out because T and F are ordinary variables that
    # can be reassigned.
    USUALCAR = recode(USUALCAR, "No" = FALSE, "Yes" = TRUE, "Don't know" = NA),
    HISPRACE = recode(
      HISPRACE,
      "Non-Hispanic Black" = "Black",
      "Non-Hispanic White" = "White",
      "Non-Hispanic Other" = "Other"
    )
  ) %>%
  # Make characters factors
  mutate_if(is.character, factor)
Read & handle the pregnancy csv
# Read the pregnancy CSV, keep/rename the columns used later on, blank out
# missing-data codes, and drop implausible gestational ages.
# If you want to download this data from my GitHub:
# read_csv("https://github.com/HunterRatliff1/PSH6253-MPH-Intro-Biostats/raw/master/Project/data/preg.csv")
preg <- read_csv("~/SASUniversityEdition/myfolders/NSFG/csv/preg.csv") %>%
  select(
    CASEID, KNEWPREG, BGNPRENA, LBW = LBW1,
    OUTCOME, AGECON, PMARPREG, POVERTY, GA = WKSGEST,
    Wantedness = NEWWANTR, TRYSCALE, WANTSCAL,
    wgt = WGT2015_2017, SECU, strata = SEST
  ) %>%
  mutate(
    # Anything above 94 is missing data per codebook
    GA = ifelse(GA > 94, NA, GA),
    BGNPRENA = ifelse(BGNPRENA > 94, NA, BGNPRENA),
    KNEWPREG = ifelse(KNEWPREG > 94, NA, KNEWPREG),
    # Recode some of the default labels from SAS.
    # TRUE/FALSE are spelled out because T and F are ordinary variables that
    # can be reassigned.
    LBW = recode(
      LBW,
      "NO, NOT LOW BIRTH WEIGHT" = FALSE,
      "YES, LOW BIRTH WEIGHT" = TRUE
    ),
    # Lower-case first so the match does not depend on the label's casing
    PMARPREG = recode(
      str_to_lower(PMARPREG),
      "no" = FALSE,
      "yes" = TRUE
    )
  ) %>%
  # A gestational age over 44 weeks isn't biologically reasonable.
  # Note: filter() only keeps rows where the condition is TRUE, so rows whose
  # GA is NA (including the codes blanked out above) are dropped here as well.
  filter(GA < 45) %>%
  # Make characters factors
  mutate_if(is.character, factor)
Here’s how many observations were present in the steps leading up to the creation of data0:
# raw number of rows
# Join pregnancies to their respondent record. CASEID is the only column the
# two tables have in common, so it is named explicitly instead of relying on
# the implicit natural join (which also prints a "Joining, by" message).
df <- left_join(preg, resp, by = "CASEID")
nrow(df)
## [1] 9327
# Restrict to pregnancies that ended in a live birth, then count them
df <- filter(df, OUTCOME == "LIVE BIRTH")
nrow(df)
## [1] 6688
# Further restrict to AGECON between 20 and 40 (inclusive), then count
df <- filter(df, AGECON >= 20 & AGECON <= 40)
nrow(df)
## [1] 5089
Now we create data0
# Build data0: one row per live birth with AGECON 20-40, carrying the derived
# outcome variables, the predictors, and the survey design columns.
data0 <-
  # CASEID is the only column shared by preg and resp; name it explicitly
  # rather than relying on the implicit natural join.
  left_join(preg, resp, by = "CASEID") %>%
  # must be between 20 - 40 years old and have had a live birth
  filter(AGECON >= 20, AGECON <= 40, OUTCOME == "LIVE BIRTH") %>%
  # OUTCOMES
  mutate(
    # Know if pregnant by 6 weeks
    KnowPreg = factor(if_else(KNEWPREG <= 6, "Yes", "No")),
    # Got prenatal care in first trimester
    gotPNcare = factor(if_else(BGNPRENA < 13, "Yes", "No")),
    # Premature delivery
    PreMe = factor(if_else(GA < 37, "Premature", "Term"))
  ) %>%
  # PREDICTORS
  mutate(
    # Trying_S / Wanted_S are not part of the final select() below.
    # include.lowest = TRUE puts a score of exactly 0 into the first bin;
    # with cut()'s default right-closed intervals it would otherwise be NA.
    Trying_S = cut(
      TRYSCALE,
      breaks = c(0, 5, 10),
      labels = c("No", "Yes"),
      include.lowest = TRUE
    ),
    Wanted_S = cut(
      WANTSCAL,
      breaks = c(0, 2, 7, 10),
      labels = c("No", "Maybe", "Yes"),
      include.lowest = TRUE
    ),
    # The comparison already yields TRUE / FALSE / NA, so no if_else() needed
    Wanted = Wantedness == "Right time",
    # Collapse the detailed education labels into six categories
    HIEDUC = recode(
      HIEDUC,
      "9TH GRADE OR LESS" = "<HS",
      "10TH GRADE" = "<HS",
      "11TH GRADE" = "<HS",
      "12TH GRADE, NO DIPLOMA (NOR GED)" = "<HS",
      "HIGH SCHOOL GRADUATE (DIPLOMA OR GED)" = "HS or GED",
      "SOME COLLEGE BUT NO DEGREE" = "Some college",
      "ASSOCIATE DEGREE IN COLLEGE/UNIVERSITY" = "Associates",
      "BACHELOR'S DEGREE" = "Bachelors",
      "MASTER'S DEGREE" = "Grad/prof school",
      "PROFESSIONAL DEGREE" = "Grad/prof school",
      "DOCTORATE DEGREE" = "Grad/prof school"
    )
  ) %>%
  # Make characters factors
  mutate_if(is.character, factor) %>%
  select(
    CASEID,
    # Outcome vars
    LBW, PreMe, gotPNcare, KnowPreg,
    # Predictors, renamed for readability
    age = AGECON, income = POVERTY, YrEdu = EDUCAT, race = HISPRACE, BMI,
    PregNum = PREGNUM,
    eduCat = HIEDUC,
    # PMARPREG, USUALCAR,
    GA, Wanted,
    # Survey design columns (weight, SECU, strata)
    wgt:strata
  )
# everything()
library(naniar)
# Missingness overview for the analysis columns (LBW through Wanted)
vis_miss(select(data0, LBW:Wanted))
# Upset plot of missingness in data0; nsets is set to the count returned by
# n_var_miss(data0)
data0 %>%
  gg_miss_upset(nsets = n_var_miss(data0))
Make two datasets, BIRTH with birth outcomes and PN with prenatal care outcomes
# BIRTH: the columns used for the birth outcomes, restricted to rows with no
# missing value in any of them.
BIRTH <- na.omit(
  select(
    data0,
    CASEID, LBW, PreMe,
    GA, BMI, age, income, race, YrEdu, eduCat, Wanted
  )
)
nrow(BIRTH)
# PN: the columns used for the prenatal care outcomes, restricted to rows
# with no missing value in any of them.
PN <- na.omit(
  select(
    data0,
    CASEID, KnowPreg, gotPNcare, LBW, PreMe,
    GA, BMI, PregNum, age, income, race, YrEdu, eduCat, Wanted
  )
)
nrow(PN)
# Save the three datasets as RDS files so the other scripts can load them
write_rds(BIRTH, "data/BIRTH.RDS")
write_rds(PN, "data/PN.RDS")
write_rds(data0, "data/data0.RDS")