Homework 1

Load Necessary Libraries

library(tidyverse)
library(dplyr)
library(readr)
library(table1)
library(dagitty)
library(ggdag)
library(sjPlot)
library(boot)

Question 1

Importing and customizing the dataset using read_csv()

# Read data and change first 15 variables to factors
nhefs = read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv",
                 col_types = cols(
                   qsmk = col_factor(),
                   sex = col_factor(),
                   race = col_factor(),
                   education = col_factor(),
                   active = col_factor(),
                   exercise = col_factor()
                 ))

Question 2

Recode factor variables of interest into informative factors with levels

# Subset data into necessary variables and recode factors
nhefs = nhefs %>% 
  select(wt82_71, wt71, qsmk, age, smokeintensity, sex, smokeyrs, race, education, active, exercise) %>% 
  mutate(qsmk = case_match(qsmk, "0" ~ "Did Not Quit Smoking", "1" ~ "Quit Smoking",
                           .ptype = factor(levels = c("Did Not Quit Smoking", "Quit Smoking"))),
         sex = case_match(sex, "0" ~ "Male", "1" ~ "Female",
                          .ptype = factor(levels = c("Female", "Male"))),
         race = case_match(race, "0" ~ "White", "1" ~ "Black or Other",
                           .ptype = factor(levels = c("White", "Black or Other"))),
         education = case_match(education, "1" ~ "8th Grade or less", "2" ~ "HS Dropout",
                                "3" ~ "High School", "4" ~ "College Dropout", "5" ~ "College or More",
                                .ptype = factor(levels = c("8th Grade or less", "HS Dropout", 
                                                           "High School", "College Dropout", "College or More"))),
         active = case_match(active, "2" ~ "Inactive", "1" ~ "Moderately Active", "0" ~ "Very Active",
                             .ptype = factor(levels = c("Inactive", "Moderately Active", "Very Active"))),
         exercise = case_match(exercise, "2" ~ "Little or No Exercise", "1" ~ "Moderate Exercise", 
                               "0" ~ "Much Exercise",
                               .ptype = factor(levels = c("Little or No Exercise", "Moderate Exercise",
                                                          "Much Exercise")))
         )

Question 3

Use the table1 package (or an alternative of your choice) to make a “Table 1” summarizing the above variables

# Set labels for each variable
nhefs$sex = setLabel(nhefs$sex, "Sex")
nhefs$age = setLabel(nhefs$age, "Age")
nhefs$race = setLabel(nhefs$race, "Race")
nhefs$education = setLabel(nhefs$education, "Education")
nhefs$active = setLabel(nhefs$active, "Physical Activity")
nhefs$exercise = setLabel(nhefs$exercise, "Exercise")
nhefs$smokeintensity = setLabel(nhefs$smokeintensity, "Number of Cigarettes per Day")
nhefs$smokeyrs = setLabel(nhefs$smokeyrs, "Years Smoking")
nhefs$wt71 = setLabel(nhefs$wt71, "Weight")
nhefs$wt82_71 = setLabel(nhefs$wt82_71, "Weight Change")
nhefs$qsmk = setLabel(nhefs$qsmk, "Smoking Cessation")

# Units for continuous variables
units(nhefs$age) = "years"
units(nhefs$wt71) = "Kilograms"
units(nhefs$wt82_71) = "Kilograms"

# Create table 1 to summarize variables
table1(~ sex + age + race + education + active + exercise + smokeintensity + 
         smokeyrs + wt71 + wt82_71 | qsmk, 
       data = nhefs, 
       caption = "Table 1: Demographics of individuals who quit and did not quit smoking", 
       footnote = "* Individuals who quit smoking did between the first questionnaire and 1982", 
       overall = "Total",
       topclass="Rtable1-times Rtable1-shade")

Table 1: Demographics of individuals who quit and did not quit smoking
	Did Not Quit Smoking (N=1201)	Quit Smoking (N=428)	Total (N=1629)
* Individuals who quit smoking did between the first questionnaire and 1982
Sex
Female	639 (53.2%)	191 (44.6%)	830 (51.0%)
Male	562 (46.8%)	237 (55.4%)	799 (49.0%)
Age (years)
Mean (SD)	42.9 (11.9)	46.7 (12.5)	43.9 (12.2)
Median [Min, Max]	43.0 [25.0, 72.0]	47.0 [25.0, 74.0]	44.0 [25.0, 74.0]
Race
White	1024 (85.3%)	390 (91.1%)	1414 (86.8%)
Black or Other	177 (14.7%)	38 (8.9%)	215 (13.2%)
Education
8th Grade or less	218 (18.2%)	93 (21.7%)	311 (19.1%)
HS Dropout	273 (22.7%)	78 (18.2%)	351 (21.5%)
High School	495 (41.2%)	164 (38.3%)	659 (40.5%)
College Dropout	96 (8.0%)	30 (7.0%)	126 (7.7%)
College or More	119 (9.9%)	63 (14.7%)	182 (11.2%)
Physical Activity
Inactive	114 (9.5%)	48 (11.2%)	162 (9.9%)
Moderately Active	540 (45.0%)	198 (46.3%)	738 (45.3%)
Very Active	547 (45.5%)	182 (42.5%)	729 (44.8%)
Exercise
Little or No Exercise	458 (38.1%)	177 (41.4%)	635 (39.0%)
Moderate Exercise	496 (41.3%)	181 (42.3%)	677 (41.6%)
Much Exercise	247 (20.6%)	70 (16.4%)	317 (19.5%)
Number of Cigarettes per Day
Mean (SD)	21.2 (11.6)	18.8 (12.3)	20.6 (11.8)
Median [Min, Max]	20.0 [1.00, 60.0]	20.0 [1.00, 80.0]	20.0 [1.00, 80.0]
Years Smoking
Mean (SD)	24.3 (11.8)	26.6 (13.0)	24.9 (12.2)
Median [Min, Max]	23.0 [1.00, 64.0]	26.0 [1.00, 60.0]	24.0 [1.00, 64.0]
Weight (Kilograms)
Mean (SD)	70.5 (15.6)	72.6 (16.1)	71.1 (15.7)
Median [Min, Max]	68.5 [40.8, 169]	71.3 [36.2, 137]	69.4 [36.2, 169]
Weight Change (Kilograms)
Mean (SD)	1.98 (7.45)	4.53 (8.75)	2.64 (7.88)
Median [Min, Max]	2.15 [-41.3, 48.5]	3.97 [-22.2, 47.5]	2.60 [-41.3, 48.5]
Missing	38 (3.2%)	25 (5.8%)	63 (3.9%)

Question 4

Draw a DAG that can represent the causal model in this study. The research question is whether quitting smoking has an effect on weight gain or loss. State the null and alternative hypothesis. Why would we use a two sided test rather than one-sided?

# Proposed DAG of research question
quit_smoking_dag = dagify(
  qsmk ~ sex + race + education + exercise + age,
  wt82_71 ~ qsmk + sex + race + education + exercise + age,
  exposure = "qsmk",
  outcome = "wt82_71",
  coords = list(x = c(qsmk = 4, wt82_71 = 8, sex = 5, race = 6, education = 7,
                      exercise = 5, age = 6),
                y = c(qsmk = 2, wt82_71 = 2, sex = 3, race = 3, education = 3,
                      exercise = 1, age = 1)),
  labels = c(qsmk = "Smoking Cessation", wt82_71 = "Weight Change", sex = "Sex",
             race = "Race", education = "Education", exercise = "Exercise", 
             age = "Age")
)

ggdag_status(quit_smoking_dag, use_labels = "label", text = FALSE, text_size = 3) +
  guides(fill = FALSE, color = FALSE) +
  theme_dag() +
  geom_dag_edges()

The research question in this study is where quitting smoking causes a change in weight. Particularly, does quitting smoking causes weight gain? We can write the hypothesis as such: \[ H_0: There\:is\:no\:association\:between\:quitting\:smoking\:and\:weight\:change\:(\beta_1 = 0) \] \[ H_A: There\:is\:no\:association\:between\:quitting\:smoking\:and\:weight\:change\:(\beta_1 \not= 0) \] Whether this association is causal will depend on whether we meet the assumptions for causality. In this case, we want to see whether there exists an association between these variables. Based on our DAG, we believe this relationship is causal, yet we need to adjust for the confounders in our model in order to get an unbiased estimate of the causal effect. The reason we want to use a two sided (or two tailed) hypothesis test is because it will test both directions. In other words, it will test whether quitting smoking is associated with weight gain or weight lost. Assuming that our DAG is the true causal model and there are no other unmeasured covariates, we can adjust for the minimum sufficient set of confounders to close all the unblocked backdoor paths between Smoking Cessation and Weight Change giving us an unbiased estimate of the true causal effect with a direction. Therefore, having a two-sided test will give us more flexibility to test either effect. In reality, a two-sided test will be used in most studies.

Question 5

Fit the following multiple linear regression: \[ Weight\:Change = Smoking\:Cessation + Sex + Race + Age + Age^2 + Education + Exercise \]

#Linear Model
wt_chg_mod = lm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + education + exercise, data = nhefs)

# Summary of Model
tab_model(wt_chg_mod, show.se = TRUE, show.stat = TRUE, digits = 4)

	Weight Change
Predictors	Estimates	std. Error	CI	Statistic	p
(Intercept)	-5.7213	2.5963	-10.8139 – -0.6288	-2.2037	0.028
Smoking Cessation: Quit Smoking	3.1719	0.4402	2.3085 – 4.0354	7.2058	<0.001
Sex: Male	0.5155	0.3897	-0.2488 – 1.2799	1.3230	0.186
Race: Black or Other	-0.1029	0.5745	-1.2299 – 1.0240	-0.1792	0.858
Age	0.4991	0.1154	0.2727 – 0.7255	4.3244	<0.001
I(age^2)	-0.0073	0.0013	-0.0098 – -0.0048	-5.7692	<0.001
Education: HS Dropout	0.8025	0.6160	-0.4057 – 2.0107	1.3028	0.193
Education: High School	0.4890	0.5644	-0.6181 – 1.5961	0.8664	0.386
Education: College Dropout	1.2396	0.8440	-0.4159 – 2.8950	1.4687	0.142
Education: College or More	-0.5731	0.7466	-2.0375 – 0.8914	-0.7675	0.443
Exercise: Moderate Exercise	0.0878	0.4312	-0.7580 – 0.9336	0.2037	0.839
Exercise: Much Exercise	0.2174	0.5499	-0.8613 – 1.2960	0.3953	0.693
Observations	1566
R² / R² adjusted	0.105 / 0.098

Question 6

Use Standardization to estimate the effect of smoking cessation on weight change

# Treatment is quitting smoking (treatment = 1)
# Create a data frame where everyone is treated (quits smoking)
nhefs_trt = nhefs[]
nhefs_trt$qsmk = as.factor("Quit Smoking")

# Predict outcome using the linear regression model
(treatment_effect = mean(predict(wt_chg_mod, nhefs_trt)))

## [1] 4.926672

# Control is not quitting smoking (Control = 0)
# Create new data frame with all observations as controls
nhefs_control = nhefs[]
nhefs_control$qsmk = as.factor("Did Not Quit Smoking")

# Predict outcome
(control_effect = mean(predict(wt_chg_mod, nhefs_control)))

## [1] 1.754752

# Average Treatment effect
treatment_effect - control_effect

## [1] 3.17192

Based on our results from the standardization, we can see that the average causal effect of quitting smoking on weight change is 3.1719. This is the same result we got in our regression as the coefficient for \(\beta_1\). This shows that among those who quit smoking, they gained, on average, 3.17 lbs compared to those who did not quit smoking. In order words, assuming our causal DAG is correct and we have no other unmeasured confounders, the average causal effect of smoking cessation is 3.17 lbs (Quitting smoking causes, on average, a gain of 3.17 lbs).

Question 7

Now suppose we hypothesize that smoking intensity also impacts weight loss, and modifies the effect of quitting smoking on weight loss

smk_intensity_dag = dagify(
  qsmk ~ sex + race + education + exercise + age,
  wt82_71 ~ qsmk + sex + race + education + exercise + age + smokeintensity + interaction,
  interaction ~ qsmk + smokeintensity,
  exposure = "qsmk",
  outcome = "wt82_71",
  coords = list(x = c(qsmk = 4, wt82_71 = 9, sex = 5, race = 6, education = 7,
                      exercise = 5, age = 8, smokeintensity = 11, interaction = 9),
                y = c(qsmk = 10, wt82_71 = 10, sex = 20, race = 20, education = 20,
                      exercise = 1, age = 1, smokeintensity = 20, interaction = 18)),
  labels = c(qsmk = "Smoking \nCessation", wt82_71 = "Weight\nChange", sex = "Sex",
             race = "Race", education = "Education", exercise = "Exercise", 
             age = "Age", smokeintensity = "Smoke\nIntensity",
             interaction = "Cessation*\nIntensity")
)

ggdag_status(smk_intensity_dag, use_labels = "label", 
             text = FALSE, text_size = 2.5,
             node_size = 12) +
  guides(fill = FALSE, color = FALSE) +
  theme_dag() +
  geom_dag_edges()

Create a linear model with a quadratic term of smoke intensity and an interaction term between smoke intensity and smoking cessation

# Linear model with smoke intensity
smk_intensity_lm = lm(wt82_71 ~ qsmk*smokeintensity + sex + race + age + I(age^2) + 
                        education + exercise + I(smokeintensity^2), data = nhefs)

# Summary of Model
tab_model(smk_intensity_lm, show.se = TRUE, show.stat = TRUE, digits = 4)

	Weight Change
Predictors	Estimates	std. Error	CI	Statistic	p
(Intercept)	-5.9940	2.6587	-11.2091 – -0.7789	-2.2545	0.024
Smoking Cessation: Quit Smoking	2.3339	0.8238	0.7180 – 3.9499	2.8330	0.005
Number of Cigarettes per Day	0.0508	0.0525	-0.0522 – 0.1538	0.9679	0.333
Sex: Male	0.4211	0.4032	-0.3699 – 1.2120	1.0442	0.297
Race: Black or Other	-0.0103	0.5905	-1.1686 – 1.1480	-0.0174	0.986
Age	0.4934	0.1158	0.2662 – 0.7206	4.2602	<0.001
I(age^2)	-0.0072	0.0013	-0.0097 – -0.0047	-5.6899	<0.001
Education: HS Dropout	0.7494	0.6179	-0.4627 – 1.9614	1.2127	0.225
Education: High School	0.4593	0.5648	-0.6484 – 1.5671	0.8133	0.416
Education: College Dropout	1.2643	0.8479	-0.3988 – 2.9275	1.4912	0.136
Education: College or More	-0.5732	0.7469	-2.0382 – 0.8919	-0.7674	0.443
Exercise: Moderate Exercise	0.0615	0.4318	-0.7856 – 0.9085	0.1424	0.887
Exercise: Much Exercise	0.2333	0.5508	-0.8471 – 1.3138	0.4236	0.672
I(smokeintensity^2)	-0.0011	0.0010	-0.0030 – 0.0008	-1.1532	0.249
qsmkQuit Smoking:smokeintensity	0.0478	0.0358	-0.0225 – 0.1180	1.3339	0.182
Observations	1566
R² / R² adjusted	0.107 / 0.098

Question 8

use standardization to calculate the conditional average treatment effect of quitting smoking in the above sample. Is this estimate different from the model?

# Create dataset with all treated (Quit smoking)
nhefs_int_trt = nhefs[]
nhefs_int_trt$qsmk = as.factor("Quit Smoking")

# Predict outcome using the second linear model
(int_trt_effect = mean(predict(smk_intensity_lm, nhefs_int_trt)))

## [1] 5.056981

# Create dataset with all in the control group (Did not quit smoking)
nhefs_int_control = nhefs[]
nhefs_int_control$qsmk = as.factor("Did Not Quit Smoking")

# Predict outcome using the second model
(int_control_effect = mean(predict(smk_intensity_lm, nhefs_int_control)))

## [1] 1.741581

# Calculate estimate
int_trt_effect - int_control_effect

## [1] 3.3154

This estimate is different than the coefficient in our regression model from Question 7. This is because we have interaction terms. The regression outputs a coefficient of 2.3339. However, using standardization, we get an average effect of 3.3154

Question 9

Write a function that takes in the dataset and a vector of row IDs that will be used to choose a subset of the dataframe

Using Boot library

# Create a function to calculate the mean of a sample
average_effect = function(x, d){
  
  # Subset the dataset by d and assign treatment to every observation
  treatment = x[d, ]
  treatment$qsmk = as.factor("Quit Smoking")
  
  # Predict outcome using the second linear model
  int_trt_effect = mean(predict(smk_intensity_lm, treatment))
  
  # Subset the dataset by d and assign control to every observation
  control = x[d, ]
  control$qsmk = as.factor("Did Not Quit Smoking")
  
  # Predict outcome using the second model
  int_control_effect = mean(predict(smk_intensity_lm, control))
  
  # return value of interest
  return(int_trt_effect - int_control_effect)
}

# Use bootstrap to replicate the function 1000 times to estimate the point estimate and the 95% CI
bootstrap = boot(nhefs, average_effect, R = 1000)
trt_effect = mean(bootstrap$t)

# Estimate the 95% CI
conf_int = quantile(bootstrap$t, c(0.025, 0.975))

The observed value of the function when applied to the data is the same as question 8a: 3.3154. The above function calculates conditional average treatment effect through standardization. We use the boot() library to repeat this process 1,000 times. After doing this process, we get a distribution where our point estimate is the mean of the distribution. In this case, the point estimate is 3.316 and the 95% confidence interval is [3.2889, 3.345]. The 95% confidence interval shows that the effect of quitting smoking on weight change is significant since it does not contain zero. The standard error is the standard deviation of our bootstrap. In this case, our standard error is 0.014

We can also find the confidence interval using boot.ci() and use Bias Corrected Accelerated to estimate the interval and correct its limits based on the bias in our statistic. However, this will not work if the R in our bootstrap is too small. We need to increase our bootstrap in order to get a 95% confidence interval using this method.

# New bootstrap using 2000 samples
bca_bootstrap = boot(nhefs, average_effect, R = 2000)

# Estimating the 95% confidence interval
bca_conf_int = boot.ci(bca_bootstrap, type = "bca")
bca_conf_int

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bca_bootstrap, type = "bca")
## 
## Intervals : 
## Level       BCa          
## 95%   ( 3.289,  3.343 )  
## Calculations and Intervals on Original Scale

As we can see, our 95% confidence interval differ by a small amount and our results are still statistically significant.

Question 10

Use the coefficients of age and I(age)^2 to plot the conditional relationship between age and weight change for the participants in this study

# Scatterplot of weight change and age
ggplot(nhefs, aes(x = age, y = wt82_71)) +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  labs(x = "Age",
       y = "Weight Change")

In both regression models we created, \(age\) and \(age^2\) are both significant predictors of weight change. Their coefficients change slightly when adding smokeintensity. The plot shows that the relationship isn’t linear, but it rather falls off at the end showing a possible quadratic relationship. The quadratic variable, \(age^2\) has a negative coefficient and it’s a significant predictor of weight change. This shows that the relationship between age and weight change is concave (upside down parabola). This means that as an individual becomes older, the effect of smoking cessation becomes less dominant on their change in weight. The squared term fits the data better and shows a different relationship than just fitting a line.