knitr::opts_chunk$set(echo = TRUE, warning=FALSE)

# load libraries
# tidyverse, broom, table1, GGally, car, easystats, gt, janitor, lmtest, datawizard, marginaleffects and modelsummary
library(tidyverse)
library(broom)
library(table1)
library(GGally)
library(car)
library(easystats)
library(gt)
install.packages("janitor")
library(janitor)
library(lmtest)
library(datawizard)
library(marginaleffects)
library(modelsummary)

# load dataset
nsduh <- read_rds("data/nsduh.rds")
save(data, file="nsduh.RData")

Exploratory data analysis

Exercise 1

Create a table1 and use GGally::ggpairs to explore your data.

# table1

label(nsduh$demog_sex) <- "Sex"
label(nsduh$alc_agefirst) <- "Alcohol (age first use)"
label(nsduh$demog_age_cat6)  <- "Age"
label(nsduh$demog_income) <- "Income"
label(nsduh$mj_lifetime) <- "Lifetime marijuana use"
caption_table1 <- "Table 1: Demographic stratified by lifetime marijuana use"

table1(~ demog_sex + alc_agefirst + demog_age_cat6 + demog_income | mj_lifetime, data=nsduh, overall = FALSE, caption=caption_table1)

Table 1: Demographic stratified by lifetime marijuana use
	No (N=491)	Yes (N=509)
Sex
Female	285 (58.0%)	249 (48.9%)
Male	206 (42.0%)	260 (51.1%)
Alcohol (age first use)
Mean (SD)	19.7 (5.09)	16.0 (3.08)
Median [Min, Max]	19.0 [3.00, 45.0]	16.0 [3.00, 29.0]
Missing	145 (29.5%)	12 (2.4%)
Age
18-25	53 (10.8%)	80 (15.7%)
26-34	52 (10.6%)	87 (17.1%)
35-49	129 (26.3%)	136 (26.7%)
50-64	109 (22.2%)	132 (25.9%)
65+	148 (30.1%)	74 (14.5%)
Income
Less than $20,000	76 (15.5%)	85 (16.7%)
$20,000 - $49,999	166 (33.8%)	127 (25.0%)
$50,000 - $74,999	64 (13.0%)	81 (15.9%)
$75,000 or more	185 (37.7%)	216 (42.4%)

# explore associations between variables with ggpairs

plot_ggpairs <- ggpairs(nsduh, columns = 3:6, 
                        columnLabels = c("Alcohol (age first use)", "Age", "Sex", "Income"),
                        legend = 1,
                upper = list(continuous = "density", combo = "box", discrete = "colbar"),  
                lower = list(continuous = "cor", combo = "facethist", discrete = "barDiag"),  
                diag = list(continuous = wrap("densityDiag", alpha=0.5), discrete = "densityDiag"), 
                mapping = aes(colour=nsduh$mj_lifetime))  +
  labs(fill = "Lifetime marijuana use")

plot_ggpairs + theme_minimal() + ggtitle('Demographic stratified by lifetime marijuana use') + theme(legend.position = "bottom")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculating odds ratio

Exercise 2

Use the contingency table below to calculate (a) probability of lifetime marijuana among sample subjects (independent of sex), among sample females and among sample males, (b) Odds of lifetime marijuana use among sample subjects, among sample females and among sample males. (c) What is the risk difference, the risk ratio and the odds ratio when comparing lifetime marijuana use among males and females in our sample?

# Showing absolute numbers
nsduh |> 
  tabyl(demog_sex, mj_lifetime) |> 
  adorn_title()

##            mj_lifetime    
##  demog_sex          No Yes
##     Female         285 249
##       Male         206 260

# probability in total sample
(249+260)/(285+206+249+260) #0.51

## [1] 0.509

# odds in total sample
(249+260)/(285+206) #1.04

## [1] 1.03666

# log odds in total sample
log((249+260)/(285+206)) #0.04

## [1] 0.03600389

# probability in males and females
nsduh |> 
  tabyl(demog_sex, mj_lifetime) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_title()

##            mj_lifetime      
##  demog_sex          No   Yes
##     Female       53.4% 46.6%
##       Male       44.2% 55.8%

# odds of marijuana use in males
260/206 #1.26

## [1] 1.262136

# log odds of marijuana use in males
log(260/206) #0.23

## [1] 0.2328055

# odds of marijuana use in females
249/285 #0.87

## [1] 0.8736842

# log odds of marijuana use in females
log(249/285)  #-0.14

## [1] -0.1350363

### combined in one table
tribble(
  ~Prediction, ~General, ~Female, ~Male, 
  "Probability", 0.51, 0.47, 0.56, 
  "Odds",        1.04, 0.87, 1.26, 
  "Log Odds",    0.04, -0.14, 0.23
) |> gt()

Prediction	General	Female	Male
Probability	0.51	0.47	0.56
Odds	1.04	0.87	1.26
Log Odds	0.04	-0.14	0.23

# risk difference males vs females
55.8-46.6 #9.2%

## [1] 9.2

# risk ratio males vs females
55.8/46.6 #1.20

## [1] 1.197425

# odds ratio males vs females
1.26/0.87 #1.45

## [1] 1.448276

# log odds ratio males vs females
log(1.26/0.87) #0.37

## [1] 0.3703738

### combined in one table
tribble(
  ~Contrast, ~`Males vs. Females`, 
  "Risk difference", 9.20, 
  "Risk Ratio",      1.20,
  "Odds ratio",      1.45, 
  "Log odds ratio",  0.37
) |> gt()

Contrast	Males vs. Females
Risk difference	9.20
Risk Ratio	1.20
Odds ratio	1.45
Log odds ratio	0.37

Answer: See above.

Exercise 3

You will now estimate the same measures from the previous question, but this time using two types of regressions: a logistic regression and a regular least squares linear regression. Show your code and present your results in a table adding the confidence intervals (see below). Explain your findings: what are the differences between the three approaches (using a contingency table as in the question above, using logistic regression and using linear regression)? What explains these differences (or lack thereof)? What are the advantages/drawbacks of using one approach over the other?

# add variable with values for male and female changed
nsduh$demog_sex_inv <- ifelse(nsduh$demog_sex=="Male", 0, 1)

### LOGISTIC REGRESSION ###
# specify the logistic regression models
M0_log <- glm(mj_lifetime ~ 1, data=nsduh, family = "binomial")
M1_log <- glm(mj_lifetime ~ demog_sex, data=nsduh, family = "binomial")
M1_log_inv <- glm(mj_lifetime ~ demog_sex_inv, data=nsduh, family = "binomial")

## log odds of marijuana use in total sample
tidy(M0_log, conf.int = TRUE) ## log odds [95% CI] = 0.036 [-0.088, 0.160]

## # A tibble: 1 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)   0.0360    0.0633     0.569   0.569  -0.0880     0.160

## odds of marijuana use in total sample
tidy(M0_log, exponentiate = TRUE, conf.int = TRUE) ## odds [95% CI] = 1.037 [0.916, 1.174]

## # A tibble: 1 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)     1.04    0.0633     0.569   0.569    0.916      1.17

## probability of marijuana use in total sample
tidy(M0_log, conf.int = TRUE) |> 
  select(estimate, conf.low, conf.high) |> 
  mutate_all(plogis) ## probability [95% CI] = 0.509 [0.478, 0.540]

## # A tibble: 1 × 3
##   estimate conf.low conf.high
##      <dbl>    <dbl>     <dbl>
## 1    0.509    0.478     0.540

## log odds of marijuana use in female
tidy(M1_log, conf.int = TRUE) ## log odds female [95% CI] = -0.135 [-0.305  0.035]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)     -0.135    0.0867     -1.56 0.120     -0.305    0.0348
## 2 demog_sexMale    0.368    0.127       2.89 0.00388    0.119    0.618

## log odds of marijuana use in males
tidy(M1_log_inv, conf.int = TRUE) ## log odds male [95% CI] = 0.233 [0.051, 0.416]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      0.233    0.0933      2.50 0.0126    0.0505     0.416
## 2 demog_sex_inv   -0.368    0.127      -2.89 0.00388  -0.618     -0.119

## odds of marijuana use in females
tidy(M1_log, exponentiate = TRUE, conf.int = TRUE) ## odds [95% CI] = 0.874 [0.737,1.035]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      0.874    0.0867     -1.56 0.120      0.737      1.04
## 2 demog_sexMale    1.44     0.127       2.89 0.00388    1.13       1.86

## odds of marijuana use in males
tidy(M1_log_inv, exponentiate = TRUE, conf.int = TRUE) ## odds [95% CI] = 1.262 [1.052, 1.516]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      1.26     0.0933      2.50 0.0126     1.05      1.52 
## 2 demog_sex_inv    0.692    0.127      -2.89 0.00388    0.539     0.888

## probability of marijuana use in females
tidy(M1_log, conf.int = TRUE) |> 
  select(estimate, conf.low, conf.high) |> 
  mutate_all(plogis) ## probability [95% CI] = 0.466 [0.424, 0.509]

## # A tibble: 2 × 3
##   estimate conf.low conf.high
##      <dbl>    <dbl>     <dbl>
## 1    0.466    0.424     0.509
## 2    0.591    0.530     0.650

## probability of marijuana use in males
tidy(M1_log_inv, conf.int = TRUE) |> 
  select(estimate, conf.low, conf.high) |> 
  mutate_all(plogis) ## probability [95% CI] = 0.558 [0.513, 0.603]

## # A tibble: 2 × 3
##   estimate conf.low conf.high
##      <dbl>    <dbl>     <dbl>
## 1    0.558    0.513     0.603
## 2    0.409    0.350     0.470

## odds ratio males vs females
tidy(M1_log, exponentiate = TRUE, conf.int = TRUE) ## OR [95%CI] = 1.445 [1.126, 1.855]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      0.874    0.0867     -1.56 0.120      0.737      1.04
## 2 demog_sexMale    1.44     0.127       2.89 0.00388    1.13       1.86

## log odds ratio males vs females
log(c(1.445, 1.126, 1.855)) ## log OR [95%CI] 0.368 [0.119,0.618]

## [1] 0.3681093 0.1186715 0.6178847

## risk difference males vs females
55.8-46.6 # 9.2

## [1] 9.2

avg_comparisons(
  model = M1_log, 
  variables = "demog_sex",
  comparison = "difference") ## risk difference [95%CI] 9.2 [3.0, 15.3]

## 
##       Term      Contrast Estimate Std. Error   z Pr(>|z|)   S  2.5 % 97.5 %
##  demog_sex Male - Female   0.0916     0.0315 2.9  0.00367 8.1 0.0298  0.153
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
## Type:  response

# risk ratio males vs females
0.558/0.466 # 1.197

## [1] 1.197425

avg_comparisons(
  model = M1_log, 
  variables = "demog_sex",
  comparison = "ratio") ## risk ratio [95%CI] 1.20 [1.05, 1.34]

## 
##       Term      Contrast Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
##  demog_sex Male / Female      1.2     0.0742 16.1   <0.001 192.0  1.05   1.34
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
## Type:  response

### LINEAR REGRESSION ###
# specify the linear regression models
M0_lin <- lm(mj_lifetime.numeric ~ 1, data=nsduh)
M1_lin <- lm(mj_lifetime.numeric ~ demog_sex, data=nsduh)
M1_lin_inv <- lm(mj_lifetime.numeric ~ demog_sex_inv, data=nsduh)

## probability of marijuana use in total sample
tidy(M0_lin, conf.int = TRUE) ## probability [95% CI] = 0.509 [0.478, 0.540]

## # A tibble: 1 × 7
##   term        estimate std.error statistic   p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.509    0.0158      32.2 1.75e-156    0.478     0.540

## odds of marijuana use in total sample
0.509/(1-0.509) ## odds = 1.036

## [1] 1.03666

0.478/(1-0.478) ## 95%CI LB of odds = 0.916

## [1] 0.9157088

0.540/(1-0.540) ## 95%CI UB of odds = 1.174

## [1] 1.173913

## -> odds = 1.036 [0.916, 1.174]

## log odds of marijuana use in total sample
log(0.509/(1-0.509))  ## log odds = 0.036

## [1] 0.03600389

log(0.478/(1-0.478)) ## 95%CI LB of log odds = -0.088

## [1] -0.08805686

log(0.540/(1-0.540)) ## 95%CI UB of log odds = 0.160

## [1] 0.1603427

## -> log odds = 0.036 [-0.088, 0.160]

## probability of marijuana use in females
tidy(M1_lin, conf.int = TRUE) ## probability [95% CI] = 0.466 [0.424, 0.509]

## # A tibble: 2 × 7
##   term          estimate std.error statistic  p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)     0.466     0.0216     21.6  2.40e-85   0.424      0.509
## 2 demog_sexMale   0.0916    0.0316      2.90 3.80e- 3   0.0297     0.154

## probability of marijuana use in males
tidy(M1_lin_inv, conf.int = TRUE) ## probability [95% CI] = 0.558 [0.513, 0.603]

## # A tibble: 2 × 7
##   term          estimate std.error statistic   p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)     0.558     0.0231     24.2  5.71e-102    0.513    0.603 
## 2 demog_sex_inv  -0.0916    0.0316     -2.90 3.80e-  3   -0.154   -0.0297

## odds of marijuana use in females
0.466/(1-0.466) ## odds = 0.873

## [1] 0.8726592

0.424/(1-0.424) ## 95%CI LB of odds = 0.736

## [1] 0.7361111

0.509/(1-0.509) ## 95%CI UB of odds = 1.037

## [1] 1.03666

## -> odds = 0.873 [0.736, 1.037]

## odds of marijuana use in males
0.558/(1-0.558) ## odds = 1.262

## [1] 1.262443

0.513/(1-0.513) ## 95%CI LB of odds = 1.053

## [1] 1.053388

0.603/(1-0.603) ## 95%CI UB of odds = 1.519

## [1] 1.518892

## -> ## odds = 1.262 [1.053, 1.519]

## log odds of marijuana use in females
log(0.466/(1-0.466)) ## log odds = -0.136

## [1] -0.1362102

log(0.424/(1-0.424)) ## 95%CI LB of log odds = -0.306

## [1] -0.3063742

log(0.509/(1-0.509)) ## 95%CI UB of log odds = 0.036

## [1] 0.03600389

## -> ## log odds = -0.136 [-0.306, 0.036]

## log odds of marijuana use in males
log(0.558/(1-0.558)) ## log odds = 0.233

## [1] 0.2330491

log(0.513/(1-0.513)) ## 95%CI LB of log odds = 0.052

## [1] 0.05201172

log(0.603/(1-0.603)) ## 95%CI UB of log odds = 0.418

## [1] 0.4179809

## -> ## log odds = 0.233 [0.052, 0.418]

## log odds ratio males vs females
log(1.262/0.873) ## 0.369

## [1] 0.3685175

avg_comparisons(
  model = M1_lin, 
  variables = "demog_sex",
  comparison = "lnor") ## log odds ratio [95%CI] = 0.37 [0.12, 0.62]

## 
##       Term                      Contrast Estimate Std. Error    z Pr(>|z|)   S
##  demog_sex ln(odds(Male) / odds(Female))    0.368      0.128 2.88  0.00393 8.0
##  2.5 % 97.5 %
##  0.118  0.618
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
## Type:  response

## odds ratio males vs females
1.262/0.873 ## 1.446

## [1] 1.44559

exp(c(0.368, 0.118, 0.618)) ## odds ratio [95%CI] = 1.44 [1.13, 1.86]

## [1] 1.444842 1.125244 1.855214

avg_comparisons(
  model = M1_lin, 
  variables = "demog_sex",
  comparison = "lnor",
  transform = 'exp')

## 
##       Term                      Contrast Estimate Pr(>|z|)   S 2.5 % 97.5 %
##  demog_sex ln(odds(Male) / odds(Female))     1.44  0.00393 8.0  1.13   1.85
## 
## Columns: term, contrast, estimate, p.value, s.value, conf.low, conf.high 
## Type:  response

## risk difference males vs females
55.8-46.6 # 9.2

## [1] 9.2

avg_comparisons(
  model = M1_lin, 
  variables = "demog_sex",
  comparison = "difference") ## risk difference [95%CI] = 9.2 [3.0, 15.4]

## 
##       Term      Contrast Estimate Std. Error   z Pr(>|z|)   S  2.5 % 97.5 %
##  demog_sex Male - Female   0.0916     0.0316 2.9  0.00372 8.1 0.0297  0.154
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
## Type:  response

# risk ratio males vs females
 0.558/0.466 # 1.197

## [1] 1.197425

avg_comparisons(
  model = M1_lin, 
  variables = "demog_sex",
  comparison = "ratio") ## risk ratio [95%CI] = 0.20 [1.05, 1.34]

## 
##       Term      Contrast Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
##  demog_sex Male / Female      1.2     0.0742 16.1   <0.001 191.7  1.05   1.34
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
## Type:  response

### Probability, odds and log odds based on logistic regression:
tribble(
  ~Prediction, ~General, ~Female, ~Male, 
  "Probability [95%CI]", "0.51 [0.48, 0.54]", "0.47 [0.42, 0.51]", "0.56 [0.51, 0.60]", 
  "Odds [95%CI]",        "1.04 [0.92, 1.17]", "0.87 [0.74, 1.04]", "1.26 [1.05, 1.52]", 
  "Log Odds [95%CI]",    "0.04 [-0.09, 0.16]", "-0.14 [-0.31,   0.04]", "0.23 [0.05, 0.42]"
) |> gt()

Prediction	General	Female	Male
Probability [95%CI]	0.51 [0.48, 0.54]	0.47 [0.42, 0.51]	0.56 [0.51, 0.60]
Odds [95%CI]	1.04 [0.92, 1.17]	0.87 [0.74, 1.04]	1.26 [1.05, 1.52]
Log Odds [95%CI]	0.04 [-0.09, 0.16]	-0.14 [-0.31, 0.04]	0.23 [0.05, 0.42]

### Risk difference, risk ratio, odds ratio and log odds ratio based on logistic regression
tribble(
  ~Contrast, ~`Males vs. Females`, 
  "Risk difference [95%CI]", "9.2 [3.0, 15.3]", 
  "Risk Ratio [95%CI]",      "1.20 [1.05, 1.34]",
  "Odds ratio [95%CI]",      "1.44 [1.13, 1.86]", 
  "Log odds ratio [95%CI]",  "0.37 [0.12, 0.62]"
) |> gt()

Contrast	Males vs. Females
Risk difference [95%CI]	9.2 [3.0, 15.3]
Risk Ratio [95%CI]	1.20 [1.05, 1.34]
Odds ratio [95%CI]	1.44 [1.13, 1.86]
Log odds ratio [95%CI]	0.37 [0.12, 0.62]

### Probability, odds and log odds based on linear regression:
tribble(
  ~Prediction, ~General, ~Female, ~Male, 
  "Probability", "0.51 [0.48, 0.54]", "0.47 [0.42, 0.51]", "0.56 [0.51, 0.60]", 
  "Odds",        "1.04 [0.92, 1.17]", "0.87 [0.74, 1.04]", "1.26 [1.05, 1.52]", 
  "Log Odds",    "0.04 [-0.09, 0.16]", "-0.14 [-0.31, 0.04]", "0.23 [0.05, 0.42]"
) |> gt()

Prediction	General	Female	Male
Probability	0.51 [0.48, 0.54]	0.47 [0.42, 0.51]	0.56 [0.51, 0.60]
Odds	1.04 [0.92, 1.17]	0.87 [0.74, 1.04]	1.26 [1.05, 1.52]
Log Odds	0.04 [-0.09, 0.16]	-0.14 [-0.31, 0.04]	0.23 [0.05, 0.42]

### Risk difference, risk ratio, odds ratio and log odds ratio based on linear regression
tribble(
  ~Contrast, ~`Males vs. Females`, 
  "Risk difference [95%CI]", "9.2 [3.0, 15.4]", 
  "Risk Ratio [95%CI]",      "0.20 [1.05, 1.34]",
  "Odds ratio [95%CI]",      "1.44 [1.13, 1.86]", 
  "Log odds ratio [95%CI]",  "0.37 [0.12, 0.62]"
) |> gt()

Contrast	Males vs. Females
Risk difference [95%CI]	9.2 [3.0, 15.4]
Risk Ratio [95%CI]	0.20 [1.05, 1.34]
Odds ratio [95%CI]	1.44 [1.13, 1.86]
Log odds ratio [95%CI]	0.37 [0.12, 0.62]

Answer: The linear and logistic regression models produce similar estimates for the probability, odds and logs odds. This is probably the case because there is a linear relationship the predictor and the outcome. The advantage of the logistic regression is that there are less assumptions that need to be met compared to linear regression.

Exercise 4

Write up your results using the template shown below.

## odds ratio males vs females
tidy(M1_log, exponentiate = TRUE, conf.int = TRUE) ## OR [95%CI] = 1.44 [1.13, 1.86]

## # A tibble: 2 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)      0.874    0.0867     -1.56 0.120      0.737      1.04
## 2 demog_sexMale    1.44     0.127       2.89 0.00388    1.13       1.86

Answer: Conclusion: Males have significantly higher odds of lifetime marijuana use than females (OR = 1.44; 95% CI = 1.13, 1.86; p = 0.004). Males have 44% higher odds of lifetime marijuana use than females.

For this simple case with a binary outcome and a binary predictor, the odds ratio you calculated should be the same as the odds ratio obtained via the cross-product from a 2 x 2 contingency table. But what if your predictor variable is not binary, but continuous?

Answer: For continuous variables, to be able to manually calculate odds ratios et cetera, you need to devide the continuous variable in groups or bins, and calculate the odds for all the bins. If you make these bins endlessly small, you would probably get the same results as the regression analysis. As this is not feasible of course, your results will differ sligtly from those that you find with regression analysis.

Exercise 5

Whereas in the previous questions we explored the relationship between lifetime marijuana use and sex, we will now explore the association between lifetime marijuana use (mj_lifetime) and age at first use of alcohol (alc_agefirst)? Fit a logistic regression and interpret the results, using the template shown below.

M2_log <- glm(mj_lifetime ~ alc_agefirst, data=nsduh, family = "binomial")

tidy(M2_log, exponentiate = TRUE, conf.int = TRUE) ## OR [95%CI] = 0.75 [0.71, 0.79]

## # A tibble: 2 × 7
##   term         estimate std.error statistic  p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)   209.       0.475       11.3 2.29e-29   84.7     545.   
## 2 alc_agefirst    0.753    0.0267     -10.6 2.46e-26    0.713     0.792

(0.7531-1)*100 ## 25% lower odds

## [1] -24.69

tidy(M2_log, exponentiate = FALSE, conf.int = TRUE) ## est = -0.2835

## # A tibble: 2 × 7
##   term         estimate std.error statistic  p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)     5.34     0.475       11.3 2.29e-29    4.44      6.30 
## 2 alc_agefirst   -0.284    0.0267     -10.6 2.46e-26   -0.338    -0.233

exp(-0.2835*3) ## OR (per 3 years) = 0.43

## [1] 0.4272013

(exp(-0.2835*3)-1)*100 ## 57% lower odds

## [1] -57.27987

Answer: Age at first alcohol use is significantly negatively associated with lifetime marijuana use (OR = 0.75; 95% CI = 0.71, 0.79 ; p < 0.001). Individuals who first used alcohol at age 19 years have 25% lower odds of having ever used marijuana than those who first used alcohol at age 18 years. In contrast, the reduction in odds associated with starting drinking alcohol 3 years later is 57% lower odds.

Adjusted logistic regression

The main reason for combining multiple predictors is to address the problem of confounding bias. Confounding bias occurs when a predictor affects both outcome and exposure. Adding a confounder to as a predictor to the model removes this bias.

With multiple predictors in a logistic regression model, the resulting odds ratios are called Adjusted Odds Ratio (AOR).

Exercise 6

What is the association between lifetime marijuana use (mj_lifetime) and age at first use of alcohol (alc_agefirst), adjusted for age (demog_age_cat6), sex (demog_sex), and income (demog_income)? Fit a model with and without an interaction term (alc_agefirst:demog_sex) and compute the AOR, its 95% CI, and the p-value that tests the significance of age at first use of alcohol. Report also their AORs of the other predictors, their 95% CIs and p-values. Since there are some categorical predictors with more than two levels, use car::Anova(model, type = 3) function to compute p-values for multiple degrees of freedom. In your answer, fill in the template below.

# Define models
M3_log <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income, data=nsduh, family = "binomial")
M3_log_int <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income + 
                    alc_agefirst:demog_sex, data=nsduh, family = "binomial")

# Compare models
modelsummary(
  list(
    M3_log = M3_log, 
    M3_log_int = M3_log_int),
  fmt = 2,
  estimate  = "{estimate} ({std.error}){stars}",
  statistic = NULL,
  stars = TRUE
  )

	M3_log	M3_log_int
(Intercept)	6.25 (0.59)***	7.37 (0.83)***
alc_agefirst	−0.28 (0.03)***	−0.34 (0.04)***
demog_age_cat626-34	−0.30 (0.33)	−0.30 (0.33)
demog_age_cat635-49	−0.80 (0.30)**	−0.82 (0.30)**
demog_age_cat650-64	−0.69 (0.30)*	−0.69 (0.30)*
demog_age_cat665+	−1.27 (0.30)***	−1.26 (0.31)***
demog_sexMale	−0.06 (0.16)	−2.14 (0.98)*
demog_income$20,000 - $49,999	−0.53 (0.27)*	−0.53 (0.27)*
demog_income$50,000 - $74,999	−0.08 (0.30)	−0.09 (0.31)
demog_income$75,000 or more	−0.36 (0.25)	−0.36 (0.25)
alc_agefirst × demog_sexMale		0.12 (0.06)*
Num.Obs.	843	843
AIC	955.6	952.9
BIC	1002.9	1005.0
Log.Lik.	−467.776	−465.448
F	14.804	13.139
RMSE	0.43	0.43

## --> there seems to be significant interaction between alc_agefirst and demog_sex (p-value of interaction term < 0.05); also, the AIC of the model with the interaction term is lower, so the overall performance of the model seems better

tidy(anova(M3_log, M3_log_int, test='LR'))

## # A tibble: 2 × 6
##   term                      df.residual residual.deviance    df deviance p.value
##   <chr>                           <dbl>             <dbl> <dbl>    <dbl>   <dbl>
## 1 mj_lifetime ~ alc_agefir…         833              936.    NA    NA    NA     
## 2 mj_lifetime ~ alc_agefir…         832              931.     1     4.66  0.0310

## --> the model significantly improves when the interaction term is added to the model

## Interpretation of results based on model without interaction term (thus modeling the association between age at first alcohol and lifetime marijuana risk for males and females combined)
tidy(M3_log, exponentiate = TRUE, conf.int = TRUE)

## # A tibble: 10 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           520.       0.591     10.6   3.85e-26  169.     1722.   
##  2 alc_agefirst            0.759    0.0276    -9.99  1.65e-23    0.718     0.800
##  3 demog_age_cat626-34     0.744    0.329     -0.901 3.67e- 1    0.387     1.41 
##  4 demog_age_cat635-49     0.447    0.297     -2.71  6.69e- 3    0.247     0.791
##  5 demog_age_cat650-64     0.502    0.299     -2.31  2.08e- 2    0.275     0.891
##  6 demog_age_cat665+       0.279    0.304     -4.19  2.80e- 5    0.152     0.502
##  7 demog_sexMale           0.941    0.162     -0.376 7.07e- 1    0.684     1.29 
##  8 demog_income$20,000…    0.588    0.266     -1.99  4.63e- 2    0.347     0.987
##  9 demog_income$50,000…    0.924    0.305     -0.260 7.95e- 1    0.507     1.68 
## 10 demog_income$75,000…    0.697    0.253     -1.43  1.54e- 1    0.421     1.14

## --> alc_first: AOR [95%CI] = 0.76 [0.72, 0.80]
(1-0.7592)*100 ## 24% lower odds in the total sample

## [1] 24.08

## --> 35-49 year-olds: AOR [95% CI] = 0.45 [0.25, 0.79]
(1-0.4474)*100 ## 55% lower odds in the total sample

## [1] 55.26

## Interpretation of results based on model with interaction term (which seems to be the better model)

# M3 with interaction term, but now with male = 0 and female = 1
M3_log_int_inv <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex_inv + demog_income + 
                    alc_agefirst:demog_sex_inv, data=nsduh, family = "binomial")

# in females
tidy(M3_log_int, exponentiate = TRUE, conf.int = TRUE)

## # A tibble: 11 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)          1593.       0.829      8.89  5.99e-19 337.      8717.   
##  2 alc_agefirst            0.713    0.0423    -7.99  1.38e-15   0.654      0.772
##  3 demog_age_cat626-34     0.744    0.331     -0.890 3.73e- 1   0.385      1.42 
##  4 demog_age_cat635-49     0.442    0.299     -2.73  6.30e- 3   0.243      0.785
##  5 demog_age_cat650-64     0.503    0.301     -2.28  2.24e- 2   0.275      0.897
##  6 demog_age_cat665+       0.284    0.306     -4.11  3.91e- 5   0.153      0.511
##  7 demog_sexMale           0.118    0.985     -2.17  2.99e- 2   0.0167     0.800
##  8 demog_income$20,000…    0.588    0.268     -1.98  4.80e- 2   0.346      0.991
##  9 demog_income$50,000…    0.911    0.307     -0.303 7.62e- 1   0.498      1.66 
## 10 demog_income$75,000…    0.696    0.255     -1.42  1.54e- 1   0.419      1.14 
## 11 alc_agefirst:demog_…    1.13     0.0555     2.14  3.22e- 2   1.01       1.26

## --> alc_first: AOR [95%CI] = 0.71 [0.65, 0.77], p<0.001
(1-0.7133)*100 ## 29% lower odds in females (females=reference population)

## [1] 28.67

## --> 35-49 year-olds: AOR [95% CI] = 0.44 [0.24, 0.78], p=0.006
(1-0.4422)*100 ## 56% lower odds in females (females=reference population)

## [1] 55.78

# in males
tidy(M3_log_int_inv, exponentiate = TRUE, conf.int = TRUE)

## # A tibble: 11 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           188.       0.702      7.46  8.75e-14   49.8     783.   
##  2 alc_agefirst            0.803    0.0363    -6.04  1.58e- 9    0.746     0.860
##  3 demog_age_cat626-34     0.744    0.331     -0.890 3.73e- 1    0.385     1.42 
##  4 demog_age_cat635-49     0.442    0.299     -2.73  6.30e- 3    0.243     0.785
##  5 demog_age_cat650-64     0.503    0.301     -2.28  2.24e- 2    0.275     0.897
##  6 demog_age_cat665+       0.284    0.306     -4.11  3.91e- 5    0.153     0.511
##  7 demog_sex_inv           8.48     0.985      2.17  2.99e- 2    1.25     59.9  
##  8 demog_income$20,000…    0.588    0.268     -1.98  4.80e- 2    0.346     0.991
##  9 demog_income$50,000…    0.911    0.307     -0.303 7.62e- 1    0.498     1.66 
## 10 demog_income$75,000…    0.696    0.255     -1.42  1.54e- 1    0.419     1.14 
## 11 alc_agefirst:demog_…    0.888    0.0555    -2.14  3.22e- 2    0.795     0.989

## --> alc_first: AOR [95%CI] = 0.80 [0.75, 0.86], p<0.001
(1-0.8034)*100 ## 20% lower odds in males

## [1] 19.66

## --> 35-49 year-olds: AOR [95% CI] = 0.44 [0.24, 0.78], p=0.006
(1-0.4422)*100 ## 56% lower odds in females (= same as in females, because no interaction term sex * age is not in the model)

## [1] 55.78

car::Anova(M3_log_int, type=3)

## Analysis of Deviance Table (Type III tests)
## 
## Response: mj_lifetime
##                        LR Chisq Df Pr(>Chisq)    
## alc_agefirst            108.028  1  < 2.2e-16 ***
## demog_age_cat6           23.248  4   0.000113 ***
## demog_sex                 4.797  1   0.028506 *  
## demog_income              5.296  3   0.151361    
## alc_agefirst:demog_sex    4.656  1   0.030953 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: The AOR [95%CI] for our primary predictor alc_agefirst is 0.71 [0.65, 0.77] for females, and 0.80 [0.75, 0.86]. In the total sample, the AOR for our primary predictor alc_agefirst is 0.76 [0.72, 0.80]. This represents the AOR for lifetime marijuana use comparing those with a one-year difference in age at first use of alcohol, adjusted for age, sex, and income. The remaining AORs compare levels of categorical predictors to their reference level, adjusted for the other predictors in the model. For example, comparing individuals with the same age of first alcohol use, sex, and income, 35-49 year-olds have 56% lower odds of lifetime marijuana use than 18-25 year-olds (OR = 0.44; 95% CI = 0.24, 0.78; p = 0.006). An overall 4 df p-value for age can be read from the Type III Test table. It is significant at p < 0.001. An overall, 3 df p-value for income is not significant at p = 0.15. Conclusion: After adjusting for age, sex, and income, age at first alcohol use is significantly negatively associated with lifetime marijuana use (AOR for females = 0.71, 95% CI = 0.65, 0.77, p < 0.001; AOR for males = 0.80, 95% CI = 0.75, 0.86, p < 0.001). All individuals combined who first used alcohol at a given age have 24% lower odds of having ever used marijuana than those who first used alcohol one year earlier. The association between age of first alcohol use and lifetime marijuana use differs significantly between males and females (p = 0.032). Females who first used alcohol at a given age have 29% lower odds of having ever used marijuana than those who first used alcohol one year earlier. Males who first used alcohol at a given age have 20% lower odds of having ever used marijuana than those who first used alcohol one year earlier. A likelihood ratio test shows that the adjusted model with the interaction is superior to the one without the interaction (p = 0.031).

Exercise 7

Replicate the model summary shown below, and discuss its main features. Use the likelihood ratio test to determine which model is best, and discuss the association between age of first alcohol and lifetime marijuana use, when comparing males to females.

## all models were previously defined in exercise 5 and 6

# Compare models
modelsummary(
  list("Unadjusted" = M2_log,
       "Adjusted" = M3_log,
    "Interaction" = M3_log_int),
  fmt = 3,
  estimate  = "{estimate} ({std.error}){stars}",
  statistic = NULL,
  stars = TRUE,
  coef_rename = TRUE
  )

	Unadjusted	Adjusted	Interaction
(Intercept)	5.341 (0.475)***	6.254 (0.591)***	7.374 (0.829)***
Alcohol (age first use)	−0.284 (0.027)***	−0.275 (0.028)***	−0.338 (0.042)***
Age [26-34]		−0.296 (0.329)	−0.295 (0.331)
Age [35-49]		−0.804 (0.297)**	−0.816 (0.299)**
Age [50-64]		−0.690 (0.299)*	−0.687 (0.301)*
Age [65+]		−1.275 (0.304)***	−1.260 (0.306)***
Sex [Male]		−0.061 (0.162)	−2.138 (0.985)*
Income [$20,000 - $49,999]		−0.531 (0.266)*	−0.530 (0.268)*
Income [$50,000 - $74,999]		−0.079 (0.305)	−0.093 (0.307)
Income [$75,000 or more]		−0.361 (0.253)	−0.363 (0.255)
Alcohol (age first use) × Sex [Male]			0.119 (0.056)*
Num.Obs.	843	843	843
AIC	972.4	955.6	952.9
BIC	981.9	1002.9	1005.0
Log.Lik.	−484.219	−467.776	−465.448
F	112.743	14.804	13.139
RMSE	0.44	0.43	0.43

tidy(anova(M2_log, M3_log, M3_log_int, test='LR'))

## # A tibble: 3 × 6
##   term                     df.residual residual.deviance    df deviance  p.value
##   <chr>                          <dbl>             <dbl> <dbl>    <dbl>    <dbl>
## 1 mj_lifetime ~ alc_agefi…         841              968.    NA    NA    NA      
## 2 mj_lifetime ~ alc_agefi…         833              936.     8    32.9   6.46e-5
## 3 mj_lifetime ~ alc_agefi…         832              931.     1     4.66  3.10e-2

## OR of alcohol use in females
tidy(M3_log_int, exponentiate = TRUE, conf.int = TRUE) ## OR for females [95%CI] = 0.71 [0.65, 0.77]

## # A tibble: 11 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)          1593.       0.829      8.89  5.99e-19 337.      8717.   
##  2 alc_agefirst            0.713    0.0423    -7.99  1.38e-15   0.654      0.772
##  3 demog_age_cat626-34     0.744    0.331     -0.890 3.73e- 1   0.385      1.42 
##  4 demog_age_cat635-49     0.442    0.299     -2.73  6.30e- 3   0.243      0.785
##  5 demog_age_cat650-64     0.503    0.301     -2.28  2.24e- 2   0.275      0.897
##  6 demog_age_cat665+       0.284    0.306     -4.11  3.91e- 5   0.153      0.511
##  7 demog_sexMale           0.118    0.985     -2.17  2.99e- 2   0.0167     0.800
##  8 demog_income$20,000…    0.588    0.268     -1.98  4.80e- 2   0.346      0.991
##  9 demog_income$50,000…    0.911    0.307     -0.303 7.62e- 1   0.498      1.66 
## 10 demog_income$75,000…    0.696    0.255     -1.42  1.54e- 1   0.419      1.14 
## 11 alc_agefirst:demog_…    1.13     0.0555     2.14  3.22e- 2   1.01       1.26

## OR of alcohol use in males
tidy(M3_log_int_inv, exponentiate = TRUE, conf.int = TRUE) ## OR for males [95%CI] = 0.80 [0.75, 0.86]

## # A tibble: 11 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           188.       0.702      7.46  8.75e-14   49.8     783.   
##  2 alc_agefirst            0.803    0.0363    -6.04  1.58e- 9    0.746     0.860
##  3 demog_age_cat626-34     0.744    0.331     -0.890 3.73e- 1    0.385     1.42 
##  4 demog_age_cat635-49     0.442    0.299     -2.73  6.30e- 3    0.243     0.785
##  5 demog_age_cat650-64     0.503    0.301     -2.28  2.24e- 2    0.275     0.897
##  6 demog_age_cat665+       0.284    0.306     -4.11  3.91e- 5    0.153     0.511
##  7 demog_sex_inv           8.48     0.985      2.17  2.99e- 2    1.25     59.9  
##  8 demog_income$20,000…    0.588    0.268     -1.98  4.80e- 2    0.346     0.991
##  9 demog_income$50,000…    0.911    0.307     -0.303 7.62e- 1    0.498     1.66 
## 10 demog_income$75,000…    0.696    0.255     -1.42  1.54e- 1    0.419     1.14 
## 11 alc_agefirst:demog_…    0.888    0.0555    -2.14  3.22e- 2    0.795     0.989

# show interaction graphically
## data withing cases with missing for age at first alcohol use
nsduh_nomissing <- na.omit(nsduh)

## add prediction
nsduh_nomissing$pred <- predict(M3_log_int)

## plot
ggplot(nsduh_nomissing, aes(x=alc_agefirst, y=pred, colour=demog_sex)) +
  geom_point()

Answer: The model improves by adding adjusting for confounders (adjusted model) and by adding an interaction term between alc_agefirst and demog_sex: AIC decreases. When performing a LRT, the adjusted model performs better than the unadjusted model (p<0.001) and the interaction model performs better than the adjusted model (p=0.031). Thus, we can conclude that the interaction model is the best model. Based on the interaction model, age of first alcohol is associated significantly with lifetime marijuana use. The association differs between males and females (p-value interactionterm = 0.032). In females, 1 year increase in age of first alcohol is significantly associated with 29% lower odds (AOR 0.71; 95% CI 0.65, 0.77) for lifetime marijuana use. In males, 1 year increase in age of first alcohol is significantly associated with 20% lower odds for lifetime marijuana use (AOR 0.80; 95% CI 0.75, 0.86). In other words, starting drinking at later age seems less preventive for the risk of lifetime marijuana use in males than in females.

Exercise 8

Replicate the forest plot shown below, using centralized variables scaled such that standard deviations are 0.5 (you can use the function datawizard::standardize(two_sd = TRUE). What is the advantage of this plot compared to the model summary tables shown above?

# Scaling
nsduh.ctr <- nsduh |> 
  standardize(two_sd = TRUE)

# Fitting the model
M3_log_int.ctr <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income + 
                    alc_agefirst:demog_sex, data=nsduh.ctr, family = "binomial")

# Creating a forest plot
forest_plot <- plot(model_parameters(M3_log_int.ctr))
forest_plot + theme_bw() + theme(panel.grid.minor = element_blank(), panel.border = element_blank(),
                                 legend.position = "bottom") +
  labs(x="Log-Odds (standardized)")

Answer: The advantage of this plot is that is visually shows you what factors increase and what factors decrease the risk of lifetime marijuana use, and how these factors relate to each other in a sense of the effect size.

Exercise 9

We want to continue exploring the association between age of first alcohol use and lifetime marijuana use, and how this association varies between males and females. Try to replicate the following graphs, and interpret what you see. Note also that something interesting happens at the age of 18. What is special about that age, according to the two graphs? Is the difference between the predicted outcomes of males and females significant at any age bracket? If so, what is the age bracket?

# model with interaction term and correction for confounders (previously specified)
M3_log_int <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income + 
                    alc_agefirst:demog_sex, data=nsduh, family = "binomial")

# model with interaction term without correction for confounders
M4_log_int <- glm(mj_lifetime ~ alc_agefirst + demog_sex + alc_agefirst:demog_sex, data=nsduh, family = "binomial")

# plot of predicted probability
plot <- plot_predictions(M3_log_int, 
                 condition = c("alc_agefirst", "demog_sex"))
plot <- plot + labs(x = "Alcohol (age first use)", y = "Probability of lifetime marijuana use") + theme_bw() + 
  scale_x_continuous(breaks=c(5, 10, 15, 18, 20, 25, 30, 35, 40)) +
  theme(panel.grid.minor = element_blank(),
        panel.border = element_blank(), 
        legend.position = "bottom",
        legend.title=element_blank())+
  ggtitle('Predicted probability') 
  
# plot showing the difference between the predicted probabilities for males and females along age of alcohol’s first use

plot2 <- plot_comparisons(M3_log_int, 
  variables = list(
    demog_sex = c("Male", "Female")
    ), 
  condition = "alc_agefirst")

plot2 <- plot2 + labs(x = "Alcohol (age first use)", y = "Pr(female) - Pr(male)") + theme_bw() +
  scale_x_continuous(breaks=c(5, 10, 15, 18, 20, 25, 30, 35, 40)) +
  theme(panel.grid.minor = element_blank(), 
        panel.border = element_blank())+
        ggtitle('Probability difference') +
  geom_hline(yintercept = 0, linetype=2, linewidth=0.4) +
  coord_cartesian(xlim=c(5,40))

# put plots together
require(gridExtra)
grid.arrange(plot, plot2, ncol=2)

# additional plots showing the log odds ratio, odds ratio and risk ratio for males and females 
## plot 3
plot3 <- plot_comparisons(M3_log_int, 
                          variables = list(
                            demog_sex = c("Male", "Female")
                          ), 
                          condition = "alc_agefirst",
                          comparison = 'lnor')

plot3 <- plot3 + labs(x = "Alcohol (age first use)", 
                      y = "log(odds(female)/odds(male))") + 
  theme_bw() +
  scale_x_continuous(breaks=c(5, 10, 15, 18, 20, 25, 30, 35, 40)) +
  theme(panel.grid.minor = element_blank(), 
        panel.border = element_blank()) +
  geom_hline(yintercept = 0, 
             linetype=2, 
             linewidth=0.4) +
  coord_cartesian(xlim=c(5,40)) +
  ggtitle('Log odds ratio')

plot3

## plot 4
plot4 <- plot_comparisons(M3_log_int, 
                          variables = list(
                            demog_sex = c("Male", "Female")
                          ), 
                          condition = "alc_agefirst",
                          comparison = 'lnor',
                          transform = 'exp')


plot4 <- plot4 + labs(x = "Alcohol (age first use)", 
                      y = "odds(female)/odds(male)") + theme_bw() +
  scale_x_continuous(breaks=c(5, 10, 15, 18, 20, 25, 30, 35, 40)) +
  theme(panel.grid.minor = element_blank(), 
        panel.border = element_blank()) +
  geom_hline(yintercept = 0, 
             linetype=2, 
             linewidth=0.4) +
  coord_cartesian(xlim=c(5,40)) + ggtitle('Odds ratio')

plot4

## plot 5
plot5 <- plot_comparisons(M3_log_int, 
                          variables = list(
                            demog_sex = c("Male", "Female")
                          ), 
                          condition = "alc_agefirst",
                          comparison = 'ratio')

plot5 <- plot5 + labs(x = "Risk ratio: alcohol (age first use)", 
                      y = "Pr(female) / PR(male)") + 
  theme_bw() +
  scale_x_continuous(breaks=c(5, 10, 15, 18, 20, 25, 30, 35, 40)) +
  theme(panel.grid.minor = element_blank(),
        panel.border = element_blank()) +
  geom_hline(yintercept = 1,
             linetype=2, 
             linewidth=0.4) +
  coord_cartesian(xlim=c(5,40)) +
  ggtitle('Risk ratio')

plot5

Answer: Probability of lifetime marijuana use decreases more with age at first alcohol use in males than in females up to the age of 18. After 18, the decrease in the probability of lifetime marijuana use with increasing age at first alcohol use seems to be more in females than in males. In other words, below 18, women are at higher probability of lifetime marijuana use than males with the same age of first alcohol use. After 18, men are at higher probability of lifetime marijana use than females with the same age of first alcohol use. The 95%CI of the difference between Pr(female) and Pr(male) seems to include 0 in almost all age at first alcohol brackets. The only exception might be 10-13 years, when the null seems not to be in 95%CI.

BMLR Lab 2 - Group 15

Team 15: Renee Burger, Alireza Ani, Johanna Bult, Javier Olmos Becerra, Rui Martins, Milla Gois

2023-11-27