# Set the working directory. Relative paths used later in this script
# (e.g. the output file passed to saveRDS()) resolve against this folder.
# NOTE(review): this is an absolute, machine-specific path; setwd() stops with
# an error if the folder does not exist, so the script will not run unchanged
# on another computer.
setwd("C:/Users/Sferg/Desktop/JABSOM Grad School/Secondary Data Analysis/Week 3/Assignment 3")
# Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Import Raw Data Files
Import the files to be analyzed. These were pulled from the publicly available NHANES data
P_DEMO is demographic data
P_SMQ is data from a survey with questions pertaining to smoking
P_HUQ is “general health status”
P_PBCD is “smoking status”
Select Relevant Variables
These files have a lot of extra variables: many we’re not interested in.
Always keep your raw data files as is (good practice, I think).
Instead, create new data frames that select only the variables (columns) we are interested in.
More information about these specific variables is located in the “Assignment 3 Codebook” file
# Build trimmed copies of the raw tables. Every subset keeps the seqn key so
# the pieces can be joined later; the raw P_* data frames are not modified.

# Demographics: the listed demographic columns plus wtmecprp, sdmvpsu and
# sdmvstra.
P_DEMO_new <- select(
  P_DEMO,
  seqn, riagendr, ridreth3, ridageyr, dmdborn4, dmdmartz,
  wtmecprp, sdmvpsu, sdmvstra
)

# One variable of interest from the HUQ table.
P_HUQ_new <- select(P_HUQ, seqn, huq010)

# The two smoking questions used to derive Smoking_Status.
P_SMQ_new <- select(P_SMQ, seqn, smq040, smq020)

# One variable of interest from the PBCD table.
P_PBCD_new <- select(P_PBCD, seqn, lbxbse)
where:
wtintprp - full sample interview weight (listed for reference; it is not among the columns selected above)
wtmecprp - full sample MEC exam weight
sdmvpsu - Masked variance pseudo-PSU
sdmvstra - Masked variance pseudo-stratum
Join Variables of interest into a data frame
Now, join the tables together on seqn. Call the result “mydata”.
# Fold the four trimmed tables into one data frame, full-joining each table
# onto the running result by the shared seqn key. A full join keeps every
# seqn that appears in any table, filling the columns it lacks with NA.
mydata <- Reduce(
  function(joined, next_table) full_join(joined, next_table, by = "seqn"),
  list(P_DEMO_new, P_HUQ_new, P_PBCD_new, P_SMQ_new)
)
Recode the Variables
For each of the variables, recode “Refused” (7 or 77) or “Don’t Know” (9 or 99) as NA.
For the applicable variables listed below, recode all of these cases as NA:
dmdborn4: 77 = Refused, 99 = Don’t know -> NA
dmdmartz: 77 = Refused, 99 = Don’t know -> NA
huq010: 7 = Refused, 9 = Don’t know -> NA
smq020: 7 = Refused, 9 = Don’t know -> NA
smq040: 7 = Refused, 9 = Don’t know -> NA
However, don’t make 7, 9, or 77 for all variables!
Some variables have legitimate values for these!
For example, in ridageyr, 77 means 77 years, not Refused!
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Convert the "Refused" / "Don't know" sentinel codes to NA.
# The codes are column-specific (77/99 for dmdborn4 and dmdmartz; 7/9 for
# huq010, smq020 and smq040), so only the columns listed here are touched;
# a blanket replacement would destroy legitimate values in other columns.
#
# recode() is called as car::recode() on purpose: dplyr also exports a
# function named recode() with a different interface, and the unqualified
# name only resolves to car's version when car is attached after dplyr.
# Qualifying the call makes the result independent of library() order.
mydata$dmdborn4 <- car::recode(mydata$dmdborn4, recodes = "77=NA; 99=NA")
mydata$dmdmartz <- car::recode(mydata$dmdmartz, recodes = "77=NA; 99=NA")
mydata$huq010 <- car::recode(mydata$huq010, recodes = "7=NA; 9=NA")
mydata$smq020 <- car::recode(mydata$smq020, recodes = "7=NA; 9=NA")
mydata$smq040 <- car::recode(mydata$smq040, recodes = "7=NA; 9=NA")
Create new variable “Smoking_Status”. Assign values based on these conditions:
Smoking_Status = 1 (current smoker) if SMQ020 = 1 and (SMQ040 = 1 or 2)
Smoking_Status = 2 (former smoker) if SMQ020 = 1 and SMQ040 = 3
Smoking_Status = 3 (never smoker) if SMQ020 = 2
Smoking_Status = NA (missing) otherwise
# Derive Smoking_Status from the two smoking questions:
#   1 = current smoker (smq020 is 1 and smq040 is 1 or 2)
#   2 = former smoker  (smq020 is 1 and smq040 is 3)
#   3 = never smoker   (smq020 is 2)
#   NA for every other combination, including rows where an answer is NA.
mydata <- mutate(
  mydata,
  Smoking_Status = case_when(
    smq020 == 1 & smq040 %in% c(1, 2) ~ 1,
    smq020 == 1 & smq040 == 3 ~ 2,
    smq020 == 2 ~ 3,
    TRUE ~ NA_real_
  )
)
Create new variable “Age”. Assign values based on these conditions:
Age = 1 if RIDAGEYR >= 20 and RIDAGEYR < 65
Age = 2 if RIDAGEYR >= 65
Age = NA otherwise
This stratifies Age into 2 groups:
20-64
65+
(NA if age is 0-19 or missing)
# Band ridageyr into the two-level Age variable:
#   2 = 65 and over, 1 = 20 up to (but not including) 65,
#   NA for anyone under 20 or with a missing age.
# case_when() takes the first matching condition, so checking the upper band
# first lets the second condition omit the "< 65" bound.
mydata <- mutate(
  mydata,
  Age = case_when(
    ridageyr >= 65 ~ 2,
    ridageyr >= 20 ~ 1,
    TRUE ~ NA_real_
  )
)
Save the Cleaned data frame
Save the rds (r dataset) file
# Write the cleaned data frame to an .rds file. The file name is relative,
# so it is created in the current working directory.
saveRDS(object = mydata, file = "nhanes_assignment3_finished_v2.rds")