Homework1

install.packages("table1")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

install.packages("dplyr")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

## also installing the dependency 'pillar'

install.packages("dagitty")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

install.packages("ggplot2")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

Question 1: customizing read_csv

library(readr)
nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv", 
    col_types = cols(qsmk = col_factor(levels = c("0", 
         "1")), death = col_factor(levels = c("0", 
         "1")), sex = col_factor(levels = c("0", 
         "1")), race = col_factor(levels = c("0", 
         "1")), income = col_factor(levels = c("11", 
         "12", "13", "14", "15", "16", "17", 
         "18", "19", "20", "21", "22")), marital = col_factor(levels = c("1", 
         "2", "3", "4", "5", "6", "8")), education = col_factor(levels = c("1", 
         "2", "3", "4", "5")), asthma = col_factor(levels = c("0", 
         "1")), bronch = col_factor(levels = c("0", 
         "1")), tb = col_factor(levels = c("0", 
         "1")), hf = col_factor(levels = c("0", 
         "1")), hbp = col_factor(levels = c("0", 
         "1")), pepticulcer = col_factor(levels = c("0", 
         "1")), colitis = col_factor(levels = c("0", 
         "1")), hepatitis = col_factor(levels = c("0", 
         "1")), chroniccough = col_factor(levels = c("0", 
         "1")), hayfever = col_factor(levels = c("0", 
         "1")), diabetes = col_factor(levels = c("0", 
         "1")), polio = col_factor(levels = c("0", 
         "1")), tumor = col_factor(levels = c("0", 
         "1")), nervousbreak = col_factor(levels = c("0", 
         "1")), alcoholpy = col_factor(levels = c("0", 
         "1")), alcoholfreq = col_factor(levels = c("1", 
         "2", "3", "4", "5")), alcoholtype = col_factor(levels = c("1", 
         "2", "3", "4")), pica = col_factor(levels = c("0", 
         "1", "2")), headache = col_factor(levels = c("0", 
         "1")), otherpain = col_factor(levels = c("0", 
         "1")), weakheart = col_factor(levels = c("0", 
         "1")), allergies = col_factor(levels = c("0", 
         "1")), nerves = col_factor(levels = c("0", 
         "1")), lackpep = col_factor(levels = c("0", 
         "1")), hbpmed = col_factor(levels = c("0", 
         "1", "2")), boweltrouble = col_factor(levels = c("0", 
         "1", "2")), active = col_factor(levels = c("0", 
         "1","2")),exercise = col_factor(levels = c("0", 
         "1","2"))))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

nhefs2<-select(nhefs,c(active,age,education,exercise,race,sex,smokeintensity,smokeyrs,wt82_71,qsmk))

Question 2: recoding

library(dplyr)

nhefs2<-nhefs2 %>% mutate(active=case_match(active,"0"~"inactive","1"~"moderately active","2"~"very active",.ptype = factor(c("inactive","moderately active","very active"))))

nhefs2<-nhefs2 %>% mutate(exercise=case_match(exercise,"0"~"little or no exercise","1"~"moderate exercise","2"~"much exercise",.ptype = factor(c("little or no exercise","moderate exercise","much exercise"))))

nhefs2<-nhefs2 %>% mutate(sex=case_match(sex,"0"~"Male","1"~"Female",.ptype = factor(c("Male","Female"))))

nhefs2<-nhefs2 %>% mutate(education=case_match(education,"1"~"8TH GRADE OR LESS","2"~"HS DROPOUT","3"~"HS","4"~"COLLEGE DROPOUT","5"~"COLLEGE OR MORE",.ptype = factor(c("8TH GRADE OR LESS","HS DROPOUT","HS","COLLEGE DROPOUT","COLLEGE OR MORE"))))

nhefs2<-nhefs2 %>% mutate(race=case_match(race,"0"~"WHITE","1"~"BLACK OR OTHER",.ptype = factor(c("WHITE", "BLACK OR OTHER"))))

nhefs2<-nhefs2 %>% mutate(qsmk=case_match(qsmk,"0"~"NO",
           "1"~"YES",.ptype = factor(c("NO", "YES"))))

nhefs2

## # A tibble: 1,629 × 10
##    active     age education exercise race  sex   smokeintensity smokeyrs wt82_71
##    <fct>    <dbl> <fct>     <fct>    <fct> <fct>          <dbl>    <dbl>   <dbl>
##  1 inactive    42 8TH GRAD… much ex… BLAC… Male              30       29 -10.1  
##  2 inactive    36 HS DROPO… little … WHITE Male              20       24   2.60 
##  3 inactive    56 HS DROPO… much ex… BLAC… Fema…             20       26   9.41 
##  4 moderat…    68 8TH GRAD… much ex… BLAC… Male               3       53   4.99 
##  5 moderat…    40 HS DROPO… moderat… WHITE Male              20       19   4.99 
##  6 moderat…    43 HS DROPO… moderat… BLAC… Fema…             10       21   4.42 
##  7 inactive    56 HS        moderat… WHITE Fema…             20       39  -4.08 
##  8 inactive    29 HS        much ex… WHITE Fema…              2        9   0.227
##  9 very ac…    51 HS DROPO… much ex… WHITE Male              25       37  -2.72 
## 10 moderat…    43 HS DROPO… much ex… WHITE Male              20       25   9.86 
## # ℹ 1,619 more rows
## # ℹ 1 more variable: qsmk <fct>

nhefs2<-nhefs2 %>% mutate(active=relevel(active,ref="moderately active"))

nhefs2<-nhefs2 %>% mutate(education=relevel(education,ref="HS"))

nhefs2<-nhefs2 %>% mutate(exercise=relevel(exercise,ref="moderate exercise"))

nhefs2<-nhefs2 %>% mutate(race=relevel(race,ref="WHITE"))

Question 3: Table 1

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

nhefs2_table<-
  table1::table1(
    ~active+age+education+exercise+race+smokeintensity+smokeyrs+wt82_71|sex,
    data= nhefs2,
    overall="Total",
    topclass="Rtable1-times",
    caption="Table1:Epi Table-characterisitics of data")

nhefs2_table

Table1:Epi Table-characterisitics of data
	Female (N=830)	Male (N=799)	Total (N=1629)
active
moderately active	406 (48.9%)	332 (41.6%)	738 (45.3%)
inactive	334 (40.2%)	395 (49.4%)	729 (44.8%)
very active	90 (10.8%)	72 (9.0%)	162 (9.9%)
age
Mean (SD)	43.3 (11.9)	44.6 (12.5)	43.9 (12.2)
Median [Min, Max]	43.0 [25.0, 74.0]	45.0 [25.0, 74.0]	44.0 [25.0, 74.0]
education
HS	367 (44.2%)	292 (36.5%)	659 (40.5%)
8TH GRADE OR LESS	120 (14.5%)	191 (23.9%)	311 (19.1%)
COLLEGE DROPOUT	66 (8.0%)	60 (7.5%)	126 (7.7%)
COLLEGE OR MORE	72 (8.7%)	110 (13.8%)	182 (11.2%)
HS DROPOUT	205 (24.7%)	146 (18.3%)	351 (21.5%)
exercise
moderate exercise	349 (42.0%)	328 (41.1%)	677 (41.6%)
little or no exercise	115 (13.9%)	202 (25.3%)	317 (19.5%)
much exercise	366 (44.1%)	269 (33.7%)	635 (39.0%)
race
WHITE	709 (85.4%)	705 (88.2%)	1414 (86.8%)
BLACK OR OTHER	121 (14.6%)	94 (11.8%)	215 (13.2%)
smokeintensity
Mean (SD)	18.0 (10.6)	23.2 (12.4)	20.6 (11.8)
Median [Min, Max]	20.0 [1.00, 60.0]	20.0 [1.00, 80.0]	20.0 [1.00, 80.0]
smokeyrs
Mean (SD)	22.5 (11.1)	27.3 (12.8)	24.9 (12.2)
Median [Min, Max]	22.0 [1.00, 64.0]	27.0 [1.00, 60.0]	24.0 [1.00, 64.0]
wt82_71
Mean (SD)	2.47 (8.24)	2.82 (7.49)	2.64 (7.88)
Median [Min, Max]	2.61 [-30.5, 37.7]	2.50 [-41.3, 48.5]	2.60 [-41.3, 48.5]
Missing	26 (3.1%)	37 (4.6%)	63 (3.9%)

Question 4: Draw a DAG

library(dagitty)
dag<-dagitty('dag{
bb="0,0,1,1"
age [pos="0.319,0.169"]
education [pos="0.425,0.207"]
exercise [pos="0.486,0.263"]
qsmk [exposure,pos="0.162,0.371"]
race [pos="0.207,0.212"]
sex [pos="0.167,0.272"]
wt82_71 [outcome,pos="0.528,0.370"]
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> wt82_71
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)

research hypothesis: People who quit smoking are more likely to gain weight when compared to those who don’t quit smoking.

Ho= Weight gain among people who quit smoking is the same as those who don’t quit smoking/ there is no difference of weight gain among people who quit smoking and those who don’t.

Ha= There is a difference of weight gain among people who quit smoking and people who don’t.

Using a two-sided null hypothesis even though the research hypothesis is one sided will help us to avoid type 2 error.

Question 5: fit a regression model

model1<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise,data=nhefs2)

model1

## 
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise, 
##     data = nhefs2)
## 
## Coefficients:
##                   (Intercept)                        qsmkYES  
##                     -5.125786                       3.102922  
##                       sexMale             raceBLACK OR OTHER  
##                      0.405427                      -0.117481  
##                           age                       I(age^2)  
##                      0.501830                      -0.007371  
## exerciselittle or no exercise          exercisemuch exercise  
##                      0.101084                      -0.095006

Question 6: Standardization

Calculate the mean predicted outcomes under treatment (quitting smoking) and no treatment (not quitting smoking).

model1qsmkquit<-nhefs2 
model1qsmkquit$qsmk<-"YES"

model1qsmknoquit<-nhefs2
model1qsmknoquit$qsmk<-"NO"

model1qsmkquitmean<-mean(predict(model1,newdata = model1qsmkquit))
model1qsmkquitmean

## [1] 4.875314

model1qsmknoquitmean<-mean(predict(model1,newdata = model1qsmknoquit))
model1qsmknoquitmean

## [1] 1.772392

Calculate the difference between these to estimate the causal effect of quitting smoking on change in weight.

model1qsmkquitmean-model1qsmknoquitmean

## [1] 3.102922

Is the conditional average treatment effect that you estimated using standardization the same or different than the coefficient for qsmk in the table of regression coefficients?

It’s the same because there are no interactions.

Question 7: DAG and regression with interactions

library(dagitty)
dag<-dagitty('
dag {
bb="0,0,1,1"
"qsmk * smoke intensity " [pos="0.387,0.436"]
age [pos="0.371,0.164"]
education [pos="0.536,0.231"]
exercise [pos="0.601,0.309"]
qsmk [pos="0.171,0.445"]
race [pos="0.219,0.228"]
sex [pos="0.166,0.306"]
wt82_71 [pos="0.599,0.445"]
"qsmk * smoke intensity " -> wt82_71
age -> qsmk
age -> wt82_71
education -> qsmk
education -> wt82_71
exercise -> qsmk
exercise -> wt82_71
qsmk -> "qsmk * smoke intensity "
race -> qsmk
race -> wt82_71
sex -> qsmk
sex -> wt82_71
}')
plot(dag)

model2<-lm(formula=wt82_71~qsmk+sex+race+age+I(age^2)+exercise+smokeintensity+I(smokeintensity^2)+qsmk*smokeintensity,data=nhefs2)

model2

## 
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + exercise + 
##     smokeintensity + I(smokeintensity^2) + qsmk * smokeintensity, 
##     data = nhefs2)
## 
## Coefficients:
##                   (Intercept)                        qsmkYES  
##                     -5.442653                       2.262604  
##                       sexMale             raceBLACK OR OTHER  
##                      0.296738                      -0.001572  
##                           age                       I(age^2)  
##                      0.493352                      -0.007266  
## exerciselittle or no exercise          exercisemuch exercise  
##                      0.148055                      -0.071881  
##                smokeintensity            I(smokeintensity^2)  
##                      0.052308                      -0.001066  
##        qsmkYES:smokeintensity  
##                      0.048501

Question 8: Standardization with interactions

model2qsmkquit<-nhefs2 
model2qsmkquit$qsmk<-"YES"

model2qsmknoquit<-nhefs2
model2qsmknoquit$qsmk<-"NO"

model2qsmkquitmean<-mean(predict(model2,newdata = model2qsmkquit))
model2qsmkquitmean

## [1] 5.015699

model2qsmknoquitmean<-mean(predict(model2,newdata = model2qsmknoquit))
model2qsmknoquitmean

## [1] 1.756331

model2qsmkquitmean-model2qsmknoquitmean

## [1] 3.259369

this estimate is different because there is an interaction.

Question 10: what did those quadratic terms do?

library(ggplot2)

ggplot(data=nhefs2, aes(x=age, y=wt82_71))+
  geom_point()+
  stat_smooth(method="lm",col="green")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 63 rows containing missing values (`geom_point()`).

ggplot(data=nhefs2, aes(x=age + I(age^2), y=wt82_71))+
  geom_point()+
  stat_smooth(method="lm",col="red")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 63 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 63 rows containing missing values (`geom_point()`).

The first graph shows the relationship between age and weight change. The second graph shows the relationship between age + ,the quadratic term, I(age)^2. Both graphs show a negative, linear trend, some outliers, moderate strength and heteroscedasticity. What is different about the second graph is that the data points are visibily more tightly clustered around the regression line for the younger ages. This might imply that the data is better fitted.

Homework1

Anika Lewis

2023-03-10