install.packages("table1")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
## also installing the dependency 'pillar'
install.packages("dagitty")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
Question 1: customizing read_csv
library(readr)
nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv",
col_types = cols(qsmk = col_factor(levels = c("0",
"1")), death = col_factor(levels = c("0",
"1")), sex = col_factor(levels = c("0",
"1")), race = col_factor(levels = c("0",
"1")), income = col_factor(levels = c("11",
"12", "13", "14", "15", "16", "17",
"18", "19", "20", "21", "22")), marital = col_factor(levels = c("1",
"2", "3", "4", "5", "6", "8")), education = col_factor(levels = c("1",
"2", "3", "4", "5")), asthma = col_factor(levels = c("0",
"1")), bronch = col_factor(levels = c("0",
"1")), tb = col_factor(levels = c("0",
"1")), hf = col_factor(levels = c("0",
"1")), hbp = col_factor(levels = c("0",
"1")), pepticulcer = col_factor(levels = c("0",
"1")), colitis = col_factor(levels = c("0",
"1")), hepatitis = col_factor(levels = c("0",
"1")), chroniccough = col_factor(levels = c("0",
"1")), hayfever = col_factor(levels = c("0",
"1")), diabetes = col_factor(levels = c("0",
"1")), polio = col_factor(levels = c("0",
"1")), tumor = col_factor(levels = c("0",
"1")), nervousbreak = col_factor(levels = c("0",
"1")), alcoholpy = col_factor(levels = c("0",
"1")), alcoholfreq = col_factor(levels = c("1",
"2", "3", "4", "5")), alcoholtype = col_factor(levels = c("1",
"2", "3", "4")), pica = col_factor(levels = c("0",
"1", "2")), headache = col_factor(levels = c("0",
"1")), otherpain = col_factor(levels = c("0",
"1")), weakheart = col_factor(levels = c("0",
"1")), allergies = col_factor(levels = c("0",
"1")), nerves = col_factor(levels = c("0",
"1")), lackpep = col_factor(levels = c("0",
"1")), hbpmed = col_factor(levels = c("0",
"1", "2")), boweltrouble = col_factor(levels = c("0",
"1", "2")), active = col_factor(levels = c("0",
"1","2")),exercise = col_factor(levels = c("0",
"1","2"))))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
nhefs2<-select(nhefs,c(active,age,education,exercise,race,sex,smokeintensity,smokeyrs,wt82_71,qsmk))
Question 2: recoding
library(dplyr)
nhefs2<-nhefs2 %>% mutate(active=case_match(active,"0"~"inactive","1"~"moderately active","2"~"very active",.ptype = factor(c("inactive","moderately active","very active"))))
nhefs2<-nhefs2 %>% mutate(exercise=case_match(exercise,"0"~"little or no exercise","1"~"moderate exercise","2"~"much exercise",.ptype = factor(c("little or no exercise","moderate exercise","much exercise"))))
nhefs2<-nhefs2 %>% mutate(sex=case_match(sex,"0"~"Male","1"~"Female",.ptype = factor(c("Male","Female"))))
nhefs2<-nhefs2 %>% mutate(education=case_match(education,"1"~"8TH GRADE OR LESS","2"~"HS DROPOUT","3"~"HS","4"~"COLLEGE DROPOUT","5"~"COLLEGE OR MORE",.ptype = factor(c("8TH GRADE OR LESS","HS DROPOUT","HS","COLLEGE DROPOUT","COLLEGE OR MORE"))))
nhefs2<-nhefs2 %>% mutate(race=case_match(race,"0"~"WHITE","1"~"BLACK OR OTHER",.ptype = factor(c("WHITE", "BLACK OR OTHER"))))
nhefs2<-nhefs2 %>% mutate(qsmk=case_match(qsmk,"0"~"NO",
"1"~"YES",.ptype = factor(c("NO", "YES"))))
nhefs2
## # A tibble: 1,629 × 10
## active age education exercise race sex smokeintensity smokeyrs wt82_71
## <fct> <dbl> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 inactive 42 8TH GRAD… much ex… BLAC… Male 30 29 -10.1
## 2 inactive 36 HS DROPO… little … WHITE Male 20 24 2.60
## 3 inactive 56 HS DROPO… much ex… BLAC… Fema… 20 26 9.41
## 4 moderat… 68 8TH GRAD… much ex… BLAC… Male 3 53 4.99
## 5 moderat… 40 HS DROPO… moderat… WHITE Male 20 19 4.99
## 6 moderat… 43 HS DROPO… moderat… BLAC… Fema… 10 21 4.42
## 7 inactive 56 HS moderat… WHITE Fema… 20 39 -4.08
## 8 inactive 29 HS much ex… WHITE Fema… 2 9 0.227
## 9 very ac… 51 HS DROPO… much ex… WHITE Male 25 37 -2.72
## 10 moderat… 43 HS DROPO… much ex… WHITE Male 20 25 9.86
## # ℹ 1,619 more rows
## # ℹ 1 more variable: qsmk <fct>
nhefs2<-nhefs2 %>% mutate(active=relevel(active,ref="moderately active"))
nhefs2<-nhefs2 %>% mutate(education=relevel(education,ref="HS"))
nhefs2<-nhefs2 %>% mutate(exercise=relevel(exercise,ref="moderate exercise"))
nhefs2<-nhefs2 %>% mutate(race=relevel(race,ref="WHITE"))
Question 3: Table 1
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
nhefs2_table<-
table1::table1(
~active+age+education+exercise+race+smokeintensity+smokeyrs+wt82_71|sex,
data= nhefs2,
overall="Total",
topclass="Rtable1-times",
caption="Table1:Epi Table-characterisitics of data")
nhefs2_table
| Female (N=830) |
Male (N=799) |
Total (N=1629) |
|
|---|---|---|---|
| active | |||
| moderately active | 406 (48.9%) | 332 (41.6%) | 738 (45.3%) |
| inactive | 334 (40.2%) | 395 (49.4%) | 729 (44.8%) |
| very active | 90 (10.8%) | 72 (9.0%) | 162 (9.9%) |
| age | |||
| Mean (SD) | 43.3 (11.9) | 44.6 (12.5) | 43.9 (12.2) |
| Median [Min, Max] | 43.0 [25.0, 74.0] | 45.0 [25.0, 74.0] | 44.0 [25.0, 74.0] |
| education | |||
| HS | 367 (44.2%) | 292 (36.5%) | 659 (40.5%) |
| 8TH GRADE OR LESS | 120 (14.5%) | 191 (23.9%) | 311 (19.1%) |
| COLLEGE DROPOUT | 66 (8.0%) | 60 (7.5%) | 126 (7.7%) |
| COLLEGE OR MORE | 72 (8.7%) | 110 (13.8%) | 182 (11.2%) |
| HS DROPOUT | 205 (24.7%) | 146 (18.3%) | 351 (21.5%) |
| exercise | |||
| moderate exercise | 349 (42.0%) | 328 (41.1%) | 677 (41.6%) |
| little or no exercise | 115 (13.9%) | 202 (25.3%) | 317 (19.5%) |
| much exercise | 366 (44.1%) | 269 (33.7%) | 635 (39.0%) |
| race | |||
| WHITE | 709 (85.4%) | 705 (88.2%) | 1414 (86.8%) |
| BLACK OR OTHER | 121 (14.6%) | 94 (11.8%) | 215 (13.2%) |
| smokeintensity | |||
| Mean (SD) | 18.0 (10.6) | 23.2 (12.4) | 20.6 (11.8) |
| Median [Min, Max] | 20.0 [1.00, 60.0] | 20.0 [1.00, 80.0] | 20.0 [1.00, 80.0] |
| smokeyrs | |||
| Mean (SD) | 22.5 (11.1) | 27.3 (12.8) | 24.9 (12.2) |
| Median [Min, Max] | 22.0 [1.00, 64.0] | 27.0 [1.00, 60.0] | 24.0 [1.00, 64.0] |
| wt82_71 | |||
| Mean (SD) | 2.47 (8.24) | 2.82 (7.49) | 2.64 (7.88) |
| Median [Min, Max] | 2.61 [-30.5, 37.7] | 2.50 [-41.3, 48.5] | 2.60 [-41.3, 48.5] |
| Missing | 26 (3.1%) | 37 (4.6%) | 63 (3.9%) |
Question 4: Draw a DAG
library(dagitty)
dag<-dagitty('dag{
bb="0,0,1,1"
age [pos="0.319,0.169"]
education [pos="0.425,0.207"]
exercise [pos="0.486,0.263"]
qsmk [exposure,pos="0.162,0.371"]
race [pos="0.207,0.212"]
sex [pos="0.167,0.272"]
wt82_71 [outcome,pos="0.528,0.370"]
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> wt82_71
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)
research hypothesis: People who quit smoking are more likely to gain weight when compared to those who don’t quit smoking.
Ho= Weight gain among people who quit smoking is the same as those who don’t quit smoking/ there is no difference of weight gain among people who quit smoking and those who don’t.
Ha= There is a difference of weight gain among people who quit smoking and people who don’t.
Using a two-sided null hypothesis even though the research hypothesis is one sided will help us to avoid type 2 error.
Question 5: fit a regression model
model1<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise,data=nhefs2)
model1
##
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise,
## data = nhefs2)
##
## Coefficients:
## (Intercept) qsmkYES
## -5.125786 3.102922
## sexMale raceBLACK OR OTHER
## 0.405427 -0.117481
## age I(age^2)
## 0.501830 -0.007371
## exerciselittle or no exercise exercisemuch exercise
## 0.101084 -0.095006
Question 6: Standardization
Calculate the mean predicted outcomes under treatment (quitting smoking) and no treatment (not quitting smoking).
model1qsmkquit<-nhefs2
model1qsmkquit$qsmk<-"YES"
model1qsmknoquit<-nhefs2
model1qsmknoquit$qsmk<-"NO"
model1qsmkquitmean<-mean(predict(model1,newdata = model1qsmkquit))
model1qsmkquitmean
## [1] 4.875314
model1qsmknoquitmean<-mean(predict(model1,newdata = model1qsmknoquit))
model1qsmknoquitmean
## [1] 1.772392
Calculate the difference between these to estimate the causal effect of quitting smoking on change in weight.
model1qsmkquitmean-model1qsmknoquitmean
## [1] 3.102922
Is the conditional average treatment effect that you estimated using standardization the same or different than the coefficient for qsmk in the table of regression coefficients?
Question 7: DAG and regression with interactions
library(dagitty)
dag<-dagitty('
dag {
bb="0,0,1,1"
"qsmk * smoke intensity " [pos="0.387,0.436"]
age [pos="0.371,0.164"]
education [pos="0.536,0.231"]
exercise [pos="0.601,0.309"]
qsmk [pos="0.171,0.445"]
race [pos="0.219,0.228"]
sex [pos="0.166,0.306"]
wt82_71 [pos="0.599,0.445"]
"qsmk * smoke intensity " -> wt82_71
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> "qsmk * smoke intensity "
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)
model2<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise+smokeintensity+I(smokeintensity^2)+qsmk*smokeintensity,data=nhefs2)
model2
##
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise +
## smokeintensity + I(smokeintensity^2) + qsmk * smokeintensity,
## data = nhefs2)
##
## Coefficients:
## (Intercept) qsmkYES
## -5.442653 2.262604
## sexMale raceBLACK OR OTHER
## 0.296738 -0.001572
## age I(age^2)
## 0.493352 -0.007266
## exerciselittle or no exercise exercisemuch exercise
## 0.148055 -0.071881
## smokeintensity I(smokeintensity^2)
## 0.052308 -0.001066
## qsmkYES:smokeintensity
## 0.048501
Question 8: Standardization with interactions
model2qsmkquit<-nhefs2
model2qsmkquit$qsmk<-"YES"
model2qsmknoquit<-nhefs2
model2qsmknoquit$qsmk<-"NO"
model2qsmkquitmean<-mean(predict(model2,newdata = model2qsmkquit))
model2qsmkquitmean
## [1] 5.015699
model2qsmknoquitmean<-mean(predict(model2,newdata = model2qsmknoquit))
model2qsmknoquitmean
## [1] 1.756331
model2qsmkquitmean-model2qsmknoquitmean
## [1] 3.259369
Question 10: what did those quadratic terms do?
library(ggplot2)
ggplot(data=nhefs2, aes(x=age, y=wt82_71))+
geom_point()+
stat_smooth(method="lm",col="green")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 63 rows containing missing values (`geom_point()`).
ggplot(data=nhefs2, aes(x=age + I(age^2), y=wt82_71))+
geom_point()+
stat_smooth(method="lm",col="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 63 rows containing missing values (`geom_point()`).