Exercise 1

Question: Create a table1 and use GGally::ggpairs to explore your data.

# Create Tittle of Table
tittle  <- "Table 1: Demographic stratified by lifetime marijuana use"

# Change Labels Names
label(nsduh$demog_sex)       <- "Sex"
label(nsduh$alc_agefirst)    <- "Alcohol (age first used)"
label(nsduh$demog_age_cat6)  <- "Age"
label(nsduh$demog_income)    <- "Income"

table1(~ demog_sex + alc_agefirst + demog_age_cat6 + demog_income | mj_lifetime, overall = FALSE, caption=tittle, data= nsduh)
Table 1: Demographic stratified by lifetime marijuana use
No
(N=491)
Yes
(N=509)
Sex
Female 285 (58.0%) 249 (48.9%)
Male 206 (42.0%) 260 (51.1%)
Alcohol (age first used)
Mean (SD) 19.7 (5.09) 16.0 (3.08)
Median [Min, Max] 19.0 [3.00, 45.0] 16.0 [3.00, 29.0]
Missing 145 (29.5%) 12 (2.4%)
Age
18-25 53 (10.8%) 80 (15.7%)
26-34 52 (10.6%) 87 (17.1%)
35-49 129 (26.3%) 136 (26.7%)
50-64 109 (22.2%) 132 (25.9%)
65+ 148 (30.1%) 74 (14.5%)
Income
Less than $20,000 76 (15.5%) 85 (16.7%)
$20,000 - $49,999 166 (33.8%) 127 (25.0%)
$50,000 - $74,999 64 (13.0%) 81 (15.9%)
$75,000 or more 185 (37.7%) 216 (42.4%)
#GGally::ggpairs to explore the data
tittle_plot  <- "Lifetime marijuana use"
tittle = tittle_plot

ggpairs(nsduh,
        columns = 3:6,
        title = "Lifetime Marijuana use",
        aes(color = mj_lifetime, alpha = 0.5),
        upper = list(discrete = "count"),
        lower = list(discrete = "facetbar")
        )

ggpairs(
  nsduh, c(3,4,5,6),
  title = tittle_plot, 
  mapping = ggplot2::aes(color = mj_lifetime),
  upper = list(continuous = wrap("density", alpha = 0.9),
          combo = "facethist", discrete = wrap("colbar")),
  lower = list(continuous = wrap("cor"), combo = NULL),
  diag = list(continuous = wrap("densityDiag", colour = NA, alpha = 0.6), discrete = wrap("densityDiag")),
  legend = c(1,4),
  showStrips = FALSE,
   cardinality_threshold = 5
  ) + 
  labs(colour = "Smoking") + 
  theme_minimal() + 
  theme(legend.position = "bottom")

Exercise 2

Question: Use the contingency table below to calculate (a) probability of lifetime marijuana among sample subjects (independent of sex), among sample females and among sample males, (b) Odds of lifetime marijuana use among sample subjects, among sample females and among sample males. (c) What is the risk difference, the risk ratio and the odds ratio when comparing lifetime marijuana use among males and females in our sample?

# Showing absolute numbers
# the rows are the independent variables, and the dependent/outcome is the columns
nsduh |> 
  tabyl(demog_sex, mj_lifetime) |> 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_title()
##            mj_lifetime          
##  demog_sex          No Yes Total
##     Female         285 249   534
##       Male         206 260   466
##      Total         491 509  1000
# Showing percentages
nsduh |> 
  tabyl(demog_sex, mj_lifetime) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_title()
##            mj_lifetime      
##  demog_sex          No   Yes
##     Female       53.4% 46.6%
##       Male       44.2% 55.8%
# Replace the 0's below with appropriate values
tribble(
  ~` `,        ~General, ~Female, ~Male, 
  "Probability", 509 / 1000, 249/534, 260/466, 
  "Odds",        509 / 491, 249/285, 260/206
) |> gt() %>% 
  fmt_number(decimals = 3)
General Female Male
Probability 0.509 0.466 0.558
Odds 1.037 0.874 1.262
# Replace the 0's below with appropriate values
tribble(
  ~` `, ~`Males vs. Females`, 
  "Risk difference",   (260/466)-(249/534),      # males - females 
  "Risk Ratio",     (260/466) / (249/534),       #males / female
  "Odds ratio",      (260/206) / (249/285)
) |> gt() %>% 
  fmt_number(decimals = 3)
Males vs. Females
Risk difference 0.092
Risk Ratio 1.197
Odds ratio 1.445

Exercise 3

Question: You will now estimate the same measures from the previous question, but this time using two types of regressions: a logistic regression and a regular linear regression. To those measures add the 95% confidence intervals associated. Show your code and present your results in a table, just as you’ve done above. Explain your findings: what are the differences between the three approaches (using a contingency table as in the question above, using logistic regression and using linear regression)? What explains these differences (or lack thereof)? What are the advantages/drawbacks of using one approach over the other?

mdl.glm.null <- glm(
  mj_lifetime ~ 1, 
  data = nsduh, 
  family = "binomial"
  )
summary(mdl.glm.null)
## 
## Call:
## glm(formula = mj_lifetime ~ 1, family = "binomial", data = nsduh)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.03600    0.06326   0.569    0.569
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386  on 999  degrees of freedom
## Residual deviance: 1386  on 999  degrees of freedom
## AIC: 1388
## 
## Number of Fisher Scoring iterations: 3
# 0.03600 odd logs of smoking mj in the population

# probability of lifetime marijuana (everyone)
plogis(0.03600)
## [1] 0.508999
# odds of smoking for everyone 
exp(0.03600)
## [1] 1.036656
#linear model
mdl.lm.null <- lm(mj_lifetime.numeric ~ 1, data = nsduh )
summary(mdl.lm.null)
## 
## Call:
## lm(formula = mj_lifetime.numeric ~ 1, data = nsduh)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.509 -0.509  0.491  0.491  0.491 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.50900    0.01582   32.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5002 on 999 degrees of freedom
# probability of lifetime marijuana (everyone)
0.509
## [1] 0.509
mdl.glm <- glm(mj_lifetime ~ demog_sex, data = nsduh, family = "binomial")
summary(mdl.glm)
## 
## Call:
## glm(formula = mj_lifetime ~ demog_sex, family = "binomial", data = nsduh)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -0.13504    0.08675  -1.557  0.11954   
## demog_sexMale  0.36784    0.12738   2.888  0.00388 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386.0  on 999  degrees of freedom
## Residual deviance: 1377.6  on 998  degrees of freedom
## AIC: 1381.6
## 
## Number of Fisher Scoring iterations: 3
mdl.glm %>% 
  tidy(conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      0.874    0.0867     -1.56 0.120      0.737      1.04
## 2 demog_sexMale    1.44     0.127       2.89 0.00388    1.13       1.86
# probability lifetime marijuana (females)
plogis(-0.13504 )
## [1] 0.4662912
# probability lifetime marijuana (males)
plogis(-0.13504 + 0.36784 )
## [1] 0.5579386
# Odds ratio male over female
exp(0.36784)
## [1] 1.444611
mdl.lm <- lm(mj_lifetime.numeric ~ demog_sex, data = nsduh )
summary(mdl.lm)
## 
## Call:
## lm(formula = mj_lifetime.numeric ~ demog_sex, data = nsduh)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5579 -0.4663  0.4421  0.4421  0.5337 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.46629    0.02156  21.623   <2e-16 ***
## demog_sexMale  0.09165    0.03159   2.901   0.0038 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4983 on 998 degrees of freedom
## Multiple R-squared:  0.008363,   Adjusted R-squared:  0.00737 
## F-statistic: 8.417 on 1 and 998 DF,  p-value: 0.003799
# probability among female
 0.46629
## [1] 0.46629
# probability among males
 0.46629 + 0.09165  
## [1] 0.55794
 #0.55794

Exercise 4

Question: Write up your results using the template shown below.

Conclusion: Males have significantly higher odds of lifetime marijuana use than females (OR = 1.44 ; 95% CI = 1.13 , 1.86 p = 0.004). Males have 44% higher odds of lifetime marijuana use than females.

Exercise 5

Question:Using the 2019 National Survey of Drug Use and Health (NSDUH) teaching dataset, explore the association between lifetime marijuana use mj_lifetime and age at first use of alcohol alc_agefirst? Fit a logistic regression and interpret the results, using the template shown below.

# model for association between lifetime marijuana use mj_lifetime and age at first use of alcohol alc_agefirst
mdl.glm.alc <- glm(mj_lifetime ~ alc_agefirst, data = nsduh, family = "binomial")
summary(mdl.glm.alc)
## 
## Call:
## glm(formula = mj_lifetime ~ alc_agefirst, family = "binomial", 
##     data = nsduh)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    5.3407     0.4747   11.25   <2e-16 ***
## alc_agefirst  -0.2835     0.0267  -10.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1141.45  on 842  degrees of freedom
## Residual deviance:  968.44  on 841  degrees of freedom
##   (157 observations deleted due to missingness)
## AIC: 972.44
## 
## Number of Fisher Scoring iterations: 5
mdl.glm.alc %>% 
tidy(exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 2 × 7
##   term         estimate std.error statistic  p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)   209.       0.475       11.3 2.29e-29   84.7     545.   
## 2 alc_agefirst    0.753    0.0267     -10.6 2.46e-26    0.713     0.792
#baseline 
plogis(5.3407)
## [1] 0.9952303
#calculation of reduction in odds associated with starting drinking alcohol 3 years later the odds is he complement of the result of 0.753^3
0.753^3
## [1] 0.4269578

Answer: Age at first alcohol use is significantly negatively associated with lifetime marijuana use (OR = 0.753 ; 95% CI = 0.713, 0.792 ; p < 0.01). Individuals who first used alcohol at age 19 years have 25% lower odds of having ever used marijuana than those who first used alcohol at age 18 years. In contrast, the reduction in odds associated with starting drinking alcohol 3 years later is 57% lower odds.

Exercise 6

Question: What is the association between lifetime marijuana use (mj_lifetime) and age at first use of alcohol (alc_agefirst), adjusted for age (demog_age_cat6), sex (demog_sex), and income (demog_income)? Fit a model with and without an interaction term (alc_agefirst:demog_sex) and compute the AOR, its 95% CI, and the p-value that tests the significance of age at first use of alcohol. Report also their AORs of the other predictors, their 95% CIs and p-values. Since there are some categorical predictors with more than two levels, use car::Anova() function to compute p-values for multiple degrees of freedom. In your answer, fill in the following template.

# adjusted model without interaction
mdl.glm.alc.adj <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income, data = nsduh, family = "binomial")

# exponentiation of adjusted model without interaction
mdl.glm.alc.adj %>% 
tidy(exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 10 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           520.       0.591     10.6   3.85e-26  169.     1722.   
##  2 alc_agefirst            0.759    0.0276    -9.99  1.65e-23    0.718     0.800
##  3 demog_age_cat626-34     0.744    0.329     -0.901 3.67e- 1    0.387     1.41 
##  4 demog_age_cat635-49     0.447    0.297     -2.71  6.69e- 3    0.247     0.791
##  5 demog_age_cat650-64     0.502    0.299     -2.31  2.08e- 2    0.275     0.891
##  6 demog_age_cat665+       0.279    0.304     -4.19  2.80e- 5    0.152     0.502
##  7 demog_sexMale           0.941    0.162     -0.376 7.07e- 1    0.684     1.29 
##  8 demog_income$20,000…    0.588    0.266     -1.99  4.63e- 2    0.347     0.987
##  9 demog_income$50,000…    0.924    0.305     -0.260 7.95e- 1    0.507     1.68 
## 10 demog_income$75,000…    0.697    0.253     -1.43  1.54e- 1    0.421     1.14
# adjusted model with interaction
mdl.glm.alc.adj.int <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income + alc_agefirst:demog_sex, data = nsduh, family = "binomial")

# exponentiation of adjusted model with interaction
mdl.glm.alc.adj.int %>% 
tidy(exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 11 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)          1593.       0.829      8.89  5.99e-19 337.      8717.   
##  2 alc_agefirst            0.713    0.0423    -7.99  1.38e-15   0.654      0.772
##  3 demog_age_cat626-34     0.744    0.331     -0.890 3.73e- 1   0.385      1.42 
##  4 demog_age_cat635-49     0.442    0.299     -2.73  6.30e- 3   0.243      0.785
##  5 demog_age_cat650-64     0.503    0.301     -2.28  2.24e- 2   0.275      0.897
##  6 demog_age_cat665+       0.284    0.306     -4.11  3.91e- 5   0.153      0.511
##  7 demog_sexMale           0.118    0.985     -2.17  2.99e- 2   0.0167     0.800
##  8 demog_income$20,000…    0.588    0.268     -1.98  4.80e- 2   0.346      0.991
##  9 demog_income$50,000…    0.911    0.307     -0.303 7.62e- 1   0.498      1.66 
## 10 demog_income$75,000…    0.696    0.255     -1.42  1.54e- 1   0.419      1.14 
## 11 alc_agefirst:demog_…    1.13     0.0555     2.14  3.22e- 2   1.01       1.26
# comparison of the models 
modelsummary(list(mdl.glm.alc.adj, mdl.glm.alc.adj.int), estimate = "{estimate}{stars} [{conf.low}, {conf.high}]", statistic = "p={p.value}")
 (1)   (2)
(Intercept) 6.254*** [5.131, 7.451] 7.374*** [5.819, 9.073]
p=<0.001 p=<0.001
alc_agefirst −0.275*** [−0.331, −0.223] −0.338*** [−0.425, −0.259]
p=<0.001 p=<0.001
demog_age_cat626-34 −0.296 [−0.949, 0.343] −0.295 [−0.954, 0.350]
p=0.367 p=0.373
demog_age_cat635-49 −0.804** [−1.400, −0.234] −0.816** [−1.417, −0.242]
p=0.007 p=0.006
demog_age_cat650-64 −0.690* [−1.289, −0.116] −0.687* [−1.291, −0.108]
p=0.021 p=0.022
demog_age_cat665+ −1.275*** [−1.885, −0.689] −1.260*** [−1.875, −0.671]
p=<0.001 p=<0.001
demog_sexMale −0.061 [−0.379, 0.256] −2.138* [−4.093, −0.224]
p=0.707 p=0.030
demog_income$20,000 - $49,999 −0.531* [−1.059, −0.013] −0.530* [−1.062, −0.009]
p=0.046 p=0.048
demog_income$50,000 - $74,999 −0.079 [−0.679, 0.519] −0.093 [−0.697, 0.510]
p=0.795 p=0.762
demog_income$75,000 or more −0.361 [−0.864, 0.130] −0.363 [−0.869, 0.131]
p=0.154 p=0.154
alc_agefirst × demog_sexMale 0.119* [0.011, 0.229]
p=0.032
Num.Obs. 843 843
AIC 955.6 952.9
BIC 1002.9 1005.0
Log.Lik. −467.776 −465.448
F 14.804 13.139
RMSE 0.43 0.43

Answer: Interpreting the output:

The AOR for our primary predictor alc_agefirst is 0.713. This represents the OR for lifetime marijuana use comparing those with a one-year difference in age at first use of alcohol, adjusted for age, sex, and income.

The remaining AORs compare levels of categorical predictors to their reference level, adjusted for the other predictors in the model. For example, comparing individuals with the same age of first alcohol use, sex, and income, 35-49 year-olds have 55.8 % lower odds of lifetime marijuana use than 18-25 year-olds (OR = 0.442 ; 95% CI = 0.243 , 0.785 ; p = p=0.006 ). An overall, 4 df p-value for age, can be read from the Type III Test table (p = ___ ).

Conclusion: After adjusting for age, sex, and income, age at first alcohol use is significantly ___ (negatively/positively) associated with lifetime marijuana use (AOR = ___ ; 95% CI = ___ , ___ ; p < ___ ). Individuals who first used alcohol at a given age have ___ % (lower/higher) odds of having ever used marijuana than those who first used alcohol one year earlier. The association between age of first alcohol use and lifetime marijuana use differs significantly between males and females (p = ___ ). A likelihood ratio test shows that the adjusted model with the interaction is ___ (superior/not superior) to the one without the interaction (p = ___)

Exercise 7

Question: Replicate the model summary and the forest plot shown below (the forest plot using a centralized variable alc_agefirst, scaled such that its standard deviation is 0.5).

# Unadjusted model 
mdl.glm.UNADJUSTED <- glm(mj_lifetime ~ alc_agefirst, data = nsduh, family = "binomial")

# Changing the name of the variables
modelsummary(
  list("Unadjusted"= mdl.glm.UNADJUSTED,
       "Adjusted"= mdl.glm.alc.adj,
       "Interaction"= mdl.glm.alc.adj.int), 
  estimate = "{estimate} {stars}", 
  statitsics = "p={p.value}",
  coef_rename = c(
    "alc_agefirst" = "Alcohol (age first used)",
    "demog_age_cat626-34" = "Age [26-34]",
    "demog_age_cat635-49" = "Age [35-49]",
    "demog_age_cat650-64" = "Age [50-64]",
    "demog_age_cat665+" = "Age [65+]",
    "demog_sexMale" = "Sex [Male]",
    "demog_income$20,000 - $49,999" = "Income [$20,000 - $49,999]", 
    "demog_income$50,000 - $74,999" = "Income [$50,000 - $74,999]", 
    "demog_income$75,000 or more" = "Income [$75,000 or more]", 
    "alc_agefirst × demog_sexMale" = "Alcohol (age first used)* Sex [Male]"
))
Unadjusted Adjusted  Interaction
(Intercept) 5.341 *** 6.254 *** 7.374 ***
(0.475) (0.591) (0.829)
Alcohol (age first used) −0.284 *** −0.275 *** −0.338 ***
(0.027) (0.028) (0.042)
Age [26-34] −0.296 −0.295
(0.329) (0.331)
Age [35-49] −0.804 ** −0.816 **
(0.297) (0.299)
Age [50-64] −0.690 * −0.687 *
(0.299) (0.301)
Age [65+] −1.275 *** −1.260 ***
(0.304) (0.306)
Sex [Male] −0.061 −2.138 *
(0.162) (0.985)
Income [$20,000 - $49,999] −0.531 * −0.530 *
(0.266) (0.268)
Income [$50,000 - $74,999] −0.079 −0.093
(0.305) (0.307)
Income [$75,000 or more] −0.361 −0.363
(0.253) (0.255)
Alcohol (age first used):Sex [Male] 0.119 *
(0.056)
Num.Obs. 843 843 843
AIC 972.4 955.6 952.9
BIC 981.9 1002.9 1005.0
Log.Lik. −484.219 −467.776 −465.448
F 112.743 14.804 13.139
RMSE 0.44 0.43 0.43
# create a new centralized data frame
nsduh.ctr <- nsduh %>% 
  mutate_if(is.numeric, scale) %>% 
  mutate_if(is.numeric, ~ as.numeric(.x/2))

# create a new model for centralized
mdl.glm.alc.adj.int.ctr <-  glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income + alc_agefirst:demog_sex, data = nsduh.ctr, family = "binomial")

# forest plot
mdl.glm.alc.adj.int.ctr %>% 
  model_parameters() %>% 
  plot() +
  xlab("Log-Odds(standarized)") +
  theme(
    axis.line = element_blank(),
    panel.grid.major = element_line(color = "lightgray", linetype = "solid", 
                                    size = 0.1))