install.packages("table1")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
## also installing the dependency 'pillar'
install.packages("dagitty")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

Question 1: customizing read_csv

library(readr)
nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv", 
    col_types = cols(qsmk = col_factor(levels = c("0", 
         "1")), death = col_factor(levels = c("0", 
         "1")), sex = col_factor(levels = c("0", 
         "1")), race = col_factor(levels = c("0", 
         "1")), income = col_factor(levels = c("11", 
         "12", "13", "14", "15", "16", "17", 
         "18", "19", "20", "21", "22")), marital = col_factor(levels = c("1", 
         "2", "3", "4", "5", "6", "8")), education = col_factor(levels = c("1", 
         "2", "3", "4", "5")), asthma = col_factor(levels = c("0", 
         "1")), bronch = col_factor(levels = c("0", 
         "1")), tb = col_factor(levels = c("0", 
         "1")), hf = col_factor(levels = c("0", 
         "1")), hbp = col_factor(levels = c("0", 
         "1")), pepticulcer = col_factor(levels = c("0", 
         "1")), colitis = col_factor(levels = c("0", 
         "1")), hepatitis = col_factor(levels = c("0", 
         "1")), chroniccough = col_factor(levels = c("0", 
         "1")), hayfever = col_factor(levels = c("0", 
         "1")), diabetes = col_factor(levels = c("0", 
         "1")), polio = col_factor(levels = c("0", 
         "1")), tumor = col_factor(levels = c("0", 
         "1")), nervousbreak = col_factor(levels = c("0", 
         "1")), alcoholpy = col_factor(levels = c("0", 
         "1")), alcoholfreq = col_factor(levels = c("1", 
         "2", "3", "4", "5")), alcoholtype = col_factor(levels = c("1", 
         "2", "3", "4")), pica = col_factor(levels = c("0", 
         "1", "2")), headache = col_factor(levels = c("0", 
         "1")), otherpain = col_factor(levels = c("0", 
         "1")), weakheart = col_factor(levels = c("0", 
         "1")), allergies = col_factor(levels = c("0", 
         "1")), nerves = col_factor(levels = c("0", 
         "1")), lackpep = col_factor(levels = c("0", 
         "1")), hbpmed = col_factor(levels = c("0", 
         "1", "2")), boweltrouble = col_factor(levels = c("0", 
         "1", "2")), active = col_factor(levels = c("0", 
         "1","2")),exercise = col_factor(levels = c("0", 
         "1","2"))))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
nhefs2<-select(nhefs,c(active,age,education,exercise,race,sex,smokeintensity,smokeyrs,wt82_71,qsmk))

Question 2: recoding

library(dplyr)

nhefs2<-nhefs2 %>% mutate(active=case_match(active,"0"~"inactive","1"~"moderately active","2"~"very active",.ptype = factor(c("inactive","moderately active","very active"))))

nhefs2<-nhefs2 %>% mutate(exercise=case_match(exercise,"0"~"little or no exercise","1"~"moderate exercise","2"~"much exercise",.ptype = factor(c("little or no exercise","moderate exercise","much exercise"))))

nhefs2<-nhefs2 %>% mutate(sex=case_match(sex,"0"~"Male","1"~"Female",.ptype = factor(c("Male","Female"))))

nhefs2<-nhefs2 %>% mutate(education=case_match(education,"1"~"8TH GRADE OR LESS","2"~"HS DROPOUT","3"~"HS","4"~"COLLEGE DROPOUT","5"~"COLLEGE OR MORE",.ptype = factor(c("8TH GRADE OR LESS","HS DROPOUT","HS","COLLEGE DROPOUT","COLLEGE OR MORE"))))

nhefs2<-nhefs2 %>% mutate(race=case_match(race,"0"~"WHITE","1"~"BLACK OR OTHER",.ptype = factor(c("WHITE", "BLACK OR OTHER"))))

nhefs2<-nhefs2 %>% mutate(qsmk=case_match(qsmk,"0"~"NO",
           "1"~"YES",.ptype = factor(c("NO", "YES"))))
nhefs2
## # A tibble: 1,629 × 10
##    active     age education exercise race  sex   smokeintensity smokeyrs wt82_71
##    <fct>    <dbl> <fct>     <fct>    <fct> <fct>          <dbl>    <dbl>   <dbl>
##  1 inactive    42 8TH GRAD… much ex… BLAC… Male              30       29 -10.1  
##  2 inactive    36 HS DROPO… little … WHITE Male              20       24   2.60 
##  3 inactive    56 HS DROPO… much ex… BLAC… Fema…             20       26   9.41 
##  4 moderat…    68 8TH GRAD… much ex… BLAC… Male               3       53   4.99 
##  5 moderat…    40 HS DROPO… moderat… WHITE Male              20       19   4.99 
##  6 moderat…    43 HS DROPO… moderat… BLAC… Fema…             10       21   4.42 
##  7 inactive    56 HS        moderat… WHITE Fema…             20       39  -4.08 
##  8 inactive    29 HS        much ex… WHITE Fema…              2        9   0.227
##  9 very ac…    51 HS DROPO… much ex… WHITE Male              25       37  -2.72 
## 10 moderat…    43 HS DROPO… much ex… WHITE Male              20       25   9.86 
## # ℹ 1,619 more rows
## # ℹ 1 more variable: qsmk <fct>
nhefs2<-nhefs2 %>% mutate(active=relevel(active,ref="moderately active"))

nhefs2<-nhefs2 %>% mutate(education=relevel(education,ref="HS"))

nhefs2<-nhefs2 %>% mutate(exercise=relevel(exercise,ref="moderate exercise"))

nhefs2<-nhefs2 %>% mutate(race=relevel(race,ref="WHITE"))

Question 3: Table 1

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
nhefs2_table<-
  table1::table1(
    ~active+age+education+exercise+race+smokeintensity+smokeyrs+wt82_71|sex,
    data= nhefs2,
    overall="Total",
    topclass="Rtable1-times",
    caption="Table1:Epi Table-characterisitics of data")

nhefs2_table
Table1:Epi Table-characterisitics of data
Female
(N=830)
Male
(N=799)
Total
(N=1629)
active
moderately active 406 (48.9%) 332 (41.6%) 738 (45.3%)
inactive 334 (40.2%) 395 (49.4%) 729 (44.8%)
very active 90 (10.8%) 72 (9.0%) 162 (9.9%)
age
Mean (SD) 43.3 (11.9) 44.6 (12.5) 43.9 (12.2)
Median [Min, Max] 43.0 [25.0, 74.0] 45.0 [25.0, 74.0] 44.0 [25.0, 74.0]
education
HS 367 (44.2%) 292 (36.5%) 659 (40.5%)
8TH GRADE OR LESS 120 (14.5%) 191 (23.9%) 311 (19.1%)
COLLEGE DROPOUT 66 (8.0%) 60 (7.5%) 126 (7.7%)
COLLEGE OR MORE 72 (8.7%) 110 (13.8%) 182 (11.2%)
HS DROPOUT 205 (24.7%) 146 (18.3%) 351 (21.5%)
exercise
moderate exercise 349 (42.0%) 328 (41.1%) 677 (41.6%)
little or no exercise 115 (13.9%) 202 (25.3%) 317 (19.5%)
much exercise 366 (44.1%) 269 (33.7%) 635 (39.0%)
race
WHITE 709 (85.4%) 705 (88.2%) 1414 (86.8%)
BLACK OR OTHER 121 (14.6%) 94 (11.8%) 215 (13.2%)
smokeintensity
Mean (SD) 18.0 (10.6) 23.2 (12.4) 20.6 (11.8)
Median [Min, Max] 20.0 [1.00, 60.0] 20.0 [1.00, 80.0] 20.0 [1.00, 80.0]
smokeyrs
Mean (SD) 22.5 (11.1) 27.3 (12.8) 24.9 (12.2)
Median [Min, Max] 22.0 [1.00, 64.0] 27.0 [1.00, 60.0] 24.0 [1.00, 64.0]
wt82_71
Mean (SD) 2.47 (8.24) 2.82 (7.49) 2.64 (7.88)
Median [Min, Max] 2.61 [-30.5, 37.7] 2.50 [-41.3, 48.5] 2.60 [-41.3, 48.5]
Missing 26 (3.1%) 37 (4.6%) 63 (3.9%)

Question 4: Draw a DAG

library(dagitty)
dag<-dagitty('dag{
bb="0,0,1,1"
age [pos="0.319,0.169"]
education [pos="0.425,0.207"]
exercise [pos="0.486,0.263"]
qsmk [exposure,pos="0.162,0.371"]
race [pos="0.207,0.212"]
sex [pos="0.167,0.272"]
wt82_71 [outcome,pos="0.528,0.370"]
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> wt82_71
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)

research hypothesis: People who quit smoking are more likely to gain weight when compared to those who don’t quit smoking.

Ho= Weight gain among people who quit smoking is the same as those who don’t quit smoking/ there is no difference of weight gain among people who quit smoking and those who don’t.

Ha= There is a difference of weight gain among people who quit smoking and people who don’t.

Using a two-sided null hypothesis even though the research hypothesis is one sided will help us to avoid type 2 error.

Question 5: fit a regression model

model1<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise,data=nhefs2)

model1
## 
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise, 
##     data = nhefs2)
## 
## Coefficients:
##                   (Intercept)                        qsmkYES  
##                     -5.125786                       3.102922  
##                       sexMale             raceBLACK OR OTHER  
##                      0.405427                      -0.117481  
##                           age                       I(age^2)  
##                      0.501830                      -0.007371  
## exerciselittle or no exercise          exercisemuch exercise  
##                      0.101084                      -0.095006

Question 6: Standardization

Calculate the mean predicted outcomes under treatment (quitting smoking) and no treatment (not quitting smoking).

model1qsmkquit<-nhefs2 
model1qsmkquit$qsmk<-"YES"

model1qsmknoquit<-nhefs2
model1qsmknoquit$qsmk<-"NO"
model1qsmkquitmean<-mean(predict(model1,newdata = model1qsmkquit))
model1qsmkquitmean
## [1] 4.875314
model1qsmknoquitmean<-mean(predict(model1,newdata = model1qsmknoquit))
model1qsmknoquitmean
## [1] 1.772392

Calculate the difference between these to estimate the causal effect of quitting smoking on change in weight.

model1qsmkquitmean-model1qsmknoquitmean
## [1] 3.102922

Is the conditional average treatment effect that you estimated using standardization the same or different than the coefficient for qsmk in the table of regression coefficients?

Question 7: DAG and regression with interactions

library(dagitty)
dag<-dagitty('
dag {
bb="0,0,1,1"
"qsmk * smoke intensity " [pos="0.387,0.436"]
age [pos="0.371,0.164"]
education [pos="0.536,0.231"]
exercise [pos="0.601,0.309"]
qsmk [pos="0.171,0.445"]
race [pos="0.219,0.228"]
sex [pos="0.166,0.306"]
wt82_71 [pos="0.599,0.445"]
"qsmk * smoke intensity " -> wt82_71
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> "qsmk * smoke intensity "
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)

model2<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise+smokeintensity+I(smokeintensity^2)+qsmk*smokeintensity,data=nhefs2)

model2
## 
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise + 
##     smokeintensity + I(smokeintensity^2) + qsmk * smokeintensity, 
##     data = nhefs2)
## 
## Coefficients:
##                   (Intercept)                        qsmkYES  
##                     -5.442653                       2.262604  
##                       sexMale             raceBLACK OR OTHER  
##                      0.296738                      -0.001572  
##                           age                       I(age^2)  
##                      0.493352                      -0.007266  
## exerciselittle or no exercise          exercisemuch exercise  
##                      0.148055                      -0.071881  
##                smokeintensity            I(smokeintensity^2)  
##                      0.052308                      -0.001066  
##        qsmkYES:smokeintensity  
##                      0.048501

Question 8: Standardization with interactions

model2qsmkquit<-nhefs2 
model2qsmkquit$qsmk<-"YES"

model2qsmknoquit<-nhefs2
model2qsmknoquit$qsmk<-"NO"
model2qsmkquitmean<-mean(predict(model2,newdata = model2qsmkquit))
model2qsmkquitmean
## [1] 5.015699
model2qsmknoquitmean<-mean(predict(model2,newdata = model2qsmknoquit))
model2qsmknoquitmean
## [1] 1.756331
model2qsmkquitmean-model2qsmknoquitmean
## [1] 3.259369

Question 10: what did those quadratic terms do?

library(ggplot2)
ggplot(data=nhefs2, aes(x=age, y=wt82_71))+
  geom_point()+
  stat_smooth(method="lm",col="green")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 63 rows containing missing values (`geom_point()`).

ggplot(data=nhefs2, aes(x=age + I(age^2), y=wt82_71))+
  geom_point()+
  stat_smooth(method="lm",col="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 63 rows containing missing values (`geom_point()`).