## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'table1'
##
##
## The following objects are masked from 'package:base':
##
## units, units<-
##
##
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## # Attaching packages: easystats 0.6.0 (red = needs update)
## ✔ bayestestR 0.13.1 ✔ correlation 0.8.4
## ✔ datawizard 0.9.0 ✔ effectsize 0.8.6
## ✔ insight 0.19.6 ✔ modelbased 0.8.6
## ✖ performance 0.10.5 ✖ parameters 0.21.2
## ✔ report 0.5.7 ✖ see 0.8.0
##
## Restart the R-Session and update packages in red with `easystats::easystats_update()`.
##
##
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
##
##
## Attaching package: 'janitor'
##
##
## The following object is masked from 'package:insight':
##
## clean_names
##
##
## The following objects are masked from 'package:datawizard':
##
## remove_empty, remove_empty_rows
##
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
##
## Loading required package: zoo
##
##
## Attaching package: 'zoo'
##
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
##
## Attaching package: 'modelsummary'
##
##
## The following object is masked from 'package:parameters':
##
## supported_models
##
##
## The following object is masked from 'package:insight':
##
## supported_models
Question 1: Create a table1 and use GGally::ggpairs to explore your data.
## Rows: 1,000
## Columns: 6
## $ mj_lifetime <fct> Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes…
## $ mj_lifetime.numeric <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,…
## $ alc_agefirst <dbl> 18, 21, 12, 17, 10, 18, 17, 21, 15, 16, 15, 21, 19…
## $ demog_age_cat6 <fct> 26-34, 65+, 26-34, 35-49, 26-34, 18-25, 65+, 35-49…
## $ demog_sex <fct> Female, Female, Male, Male, Female, Female, Female…
## $ demog_income <fct> "Less than $20,000", "$20,000 - $49,999", "$50,000…
| No (N=491) |
Yes (N=509) |
|
|---|---|---|
| Sex | ||
| Female | 285 (58.0%) | 249 (48.9%) |
| Male | 206 (42.0%) | 260 (51.1%) |
| Alcohol (age first used) | ||
| Mean (SD) | 19.7 (5.09) | 16.0 (3.08) |
| Median [Min, Max] | 19.0 [3.00, 45.0] | 16.0 [3.00, 29.0] |
| Missing | 145 (29.5%) | 12 (2.4%) |
| Age | ||
| 18-25 | 53 (10.8%) | 80 (15.7%) |
| 26-34 | 52 (10.6%) | 87 (17.1%) |
| 35-49 | 129 (26.3%) | 136 (26.7%) |
| 50-64 | 109 (22.2%) | 132 (25.9%) |
| 65+ | 148 (30.1%) | 74 (14.5%) |
| Income | ||
| Less than $20,000 | 76 (15.5%) | 85 (16.7%) |
| $20,000 - $49,999 | 166 (33.8%) | 127 (25.0%) |
| $50,000 - $74,999 | 64 (13.0%) | 81 (15.9%) |
| $75,000 or more | 185 (37.7%) | 216 (42.4%) |
## function (data, mapping = NULL, columns = 1:ncol(data), title = NULL,
## upper = list(continuous = "cor", combo = "box_no_facet",
## discrete = "count", na = "na"), lower = list(continuous = "points",
## combo = "facethist", discrete = "facetbar", na = "na"),
## diag = list(continuous = "densityDiag", discrete = "barDiag",
## na = "naDiag"), params = NULL, ..., xlab = NULL, ylab = NULL,
## axisLabels = c("show", "internal", "none"), columnLabels = colnames(data[columns]),
## labeller = "label_value", switch = NULL, showStrips = NULL,
## legend = NULL, cardinality_threshold = 15, progress = NULL,
## proportions = NULL, legends = stop("deprecated"))
## {
## warn_deprecated(!missing(legends), "legends")
## warn_if_args_exist(list(...))
## stop_if_params_exist(params)
## isSharedData <- inherits(data, "SharedData")
## data_ <- fix_data(data)
## data <- fix_data_slim(data_, isSharedData)
## if (!missing(mapping) & !is.list(mapping) & missing(columns)) {
## columns <- mapping
## mapping <- NULL
## }
## stop_if_bad_mapping(mapping)
## columns <- fix_column_values(data, columns, columnLabels,
## "columns", "columnLabels")
## stop_if_high_cardinality(data, columns, cardinality_threshold)
## upper <- check_and_set_ggpairs_defaults("upper", upper, continuous = "cor",
## combo = "box_no_facet", discrete = "count", na = "na")
## lower <- check_and_set_ggpairs_defaults("lower", lower, continuous = "points",
## combo = "facethist", discrete = "facetbar", na = "na")
## diag <- check_and_set_ggpairs_defaults("diag", diag, continuous = "densityDiag",
## discrete = "barDiag", na = "naDiag", isDiag = TRUE)
## axisLabels <- fix_axis_label_choice(axisLabels, c("show",
## "internal", "none"))
## proportions <- ggmatrix_proportions(proportions, data, columns)
## dataTypes <- plot_types(data, columns, columns, allowDiag = TRUE)
## if (identical(axisLabels, "internal")) {
## dataTypes$plotType[dataTypes$posX == dataTypes$posY] <- "label"
## }
## ggpairsPlots <- lapply(seq_len(nrow(dataTypes)), function(i) {
## plotType <- dataTypes[i, "plotType"]
## posX <- dataTypes[i, "posX"]
## posY <- dataTypes[i, "posY"]
## xColName <- dataTypes[i, "xVar"]
## yColName <- dataTypes[i, "yVar"]
## if (posX > posY) {
## types <- upper
## }
## else if (posX < posY) {
## types <- lower
## }
## else {
## types <- diag
## }
## sectionAes <- add_and_overwrite_aes(add_and_overwrite_aes(aes_(x = as.name(xColName),
## y = as.name(yColName)), mapping), types$mapping)
## args <- list(types = types, sectionAes = sectionAes)
## if (plotType == "label") {
## args$label <- columnLabels[posX]
## }
## plot_fn <- ggmatrix_plot_list(plotType)
## p <- do.call(plot_fn, args)
## return(p)
## })
## plotMatrix <- ggmatrix(plots = ggpairsPlots, byrow = TRUE,
## nrow = length(columns), ncol = length(columns), xAxisLabels = (if (axisLabels ==
## "internal")
## NULL
## else columnLabels), yAxisLabels = (if (axisLabels ==
## "internal")
## NULL
## else columnLabels), labeller = labeller, switch = switch,
## showStrips = showStrips, showXAxisPlotLabels = identical(axisLabels,
## "show"), showYAxisPlotLabels = identical(axisLabels,
## "show"), title = title, xlab = xlab, ylab = ylab,
## data = data_, gg = NULL, progress = progress, legend = legend,
## xProportions = proportions, yProportions = proportions)
## plotMatrix
## }
## <bytecode: 0x55b799ff0c60>
## <environment: namespace:GGally>
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 2: Use the contingency table below to calculate (a) probability of lifetime marijuana among sample subjects (independent of sex), among sample females and among sample males, (b) Odds of lifetime marijuana use among sample subjects, among sample females and among sample males. (c) What is the risk difference, the risk ratio and the odds ratio when comparing lifetime marijuana use among males and females in our sample?
## mj_lifetime
## demog_sex No Yes Total
## Female 285 249 534
## Male 206 260 466
## Total 491 509 1000
## mj_lifetime
## demog_sex No Yes
## Female 53.4% 46.6%
## Male 44.2% 55.8%
| General | Female | Male | |
|---|---|---|---|
| Probability | 0.509 | 0.466 | 0.558 |
| Odds | 1.037 | 0.874 | 1.262 |
| Males vs. Females | |
|---|---|
| Risk difference | 0.092 |
| Risk Ratio | 1.197 |
| Odds ratio | 1.445 |
Question 3: You will now estimate the same measures from the previous question, but this time using two types of regressions: a logistic regression and a regular linear regression. To those measures add the 95% confidence intervals associated. Show your code and present your results in a table, just as you’ve done above. Explain your findings: what are the differences between the three approaches (using a contingency table as in the question above, using logistic regression and using linear regression)? What explains these differences (or lack thereof)? What are the advantages/drawbacks of using one approach over the other?
##
## Call:
## glm(formula = mj_lifetime ~ 1, family = "binomial", data = nsduh)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03600 0.06326 0.569 0.569
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386 on 999 degrees of freedom
## Residual deviance: 1386 on 999 degrees of freedom
## AIC: 1388
##
## Number of Fisher Scoring iterations: 3
## [1] 1.036656
## [1] 0.508999
##
## Call:
## lm(formula = mj_lifetime.numeric ~ 1, data = nsduh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.509 -0.509 0.491 0.491 0.491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.50900 0.01582 32.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5002 on 999 degrees of freedom
## [1] 0.509
##
## Call:
## glm(formula = mj_lifetime ~ demog_sex, family = "binomial", data = nsduh)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.13504 0.08675 -1.557 0.11954
## demog_sexMale 0.36784 0.12738 2.888 0.00388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.0 on 999 degrees of freedom
## Residual deviance: 1377.6 on 998 degrees of freedom
## AIC: 1381.6
##
## Number of Fisher Scoring iterations: 3
## [1] 0.4662912
## [1] 0.5579386
## [1] 1.444611
##
## Call:
## lm(formula = mj_lifetime.numeric ~ demog_sex, data = nsduh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5579 -0.4663 0.4421 0.4421 0.5337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.46629 0.02156 21.623 <2e-16 ***
## demog_sexMale 0.09165 0.03159 2.901 0.0038 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4983 on 998 degrees of freedom
## Multiple R-squared: 0.008363, Adjusted R-squared: 0.00737
## F-statistic: 8.417 on 1 and 998 DF, p-value: 0.003799
## [1] 0.46629
## [1] 0.55794
Answer: The differences between using these three methods is that with the contingency tables we have to ask R to calculate the specific values. So we have to “forecfully” design what the table will say, which might be tedious. But on the other hand the results are very straightforward and easy to interpret. With the regression models we are transforming the data into a regression line, whose values (intercept & slope) we can then use to eventually arrive at the same results as given by the table. In order to get these results however we need to know how to interpret the values that we were given (so the intercept and slope). The difference between the logistic and linear approach is that for the logistic, we need to conduct a transformation (take the log) and to obtain the same results then as a consequence take the transformation back (take the exponent). Although it is more logical to use a logistic model for a dichotomous outcome variable, the results of the linear regression interestingly yield the same results. An advantage of using regression models is that they are more generalization than a table in the sense that you can make out-of data prediction too if you put in different values for the Xs.
Question 4: Write up your results using the template shown below.
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.874 0.0867 -1.56 0.120 0.737 1.04
## 2 demog_sexMale 1.44 0.127 2.89 0.00388 1.13 1.86
Answer: Conclusion: Males have significantly higher odds of lifetime marijuana use than females (OR = exp(0.36784) = 1.444611 ; 95% CI = 1.13 , 1.86, p = 0.00388). Males have 1.44 = 44% higher odds of lifetime marijuana use than females.
Question 5: Using the 2019 National Survey of Drug Use and Health (NSDUH) teaching dataset, explore the association between lifetime marijuana use mj_lifetime and age at first use of alcohol alc_agefirst? Fit a logistic regression and interpret the results, using the template shown below.
##
## Call:
## glm(formula = mj_lifetime ~ alc_agefirst, family = "binomial",
## data = nsduh)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.3407 0.4747 11.25 <2e-16 ***
## alc_agefirst -0.2835 0.0267 -10.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1141.45 on 842 degrees of freedom
## Residual deviance: 968.44 on 841 degrees of freedom
## (157 observations deleted due to missingness)
## AIC: 972.44
##
## Number of Fisher Scoring iterations: 5
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 209. 0.475 11.3 2.29e-29 84.7 545.
## 2 alc_agefirst 0.753 0.0267 -10.6 2.46e-26 0.713 0.792
Answer: Age at first alcohol use is significantly negatively associated with lifetime marijuana use (OR = 0.753 ; 95% CI = 0.713 , 0.792 ; p < 0.01 ). Individuals who first used alcohol at age 19 years have 25 % lower odds of having ever used marijuana than those who first used alcohol at age 18 years. In contrast, the reduction in odds associated with starting drinking alcohol 3 years later is 0.753^3 = 0.4269578 -> 57.3% lower odds.
Question: What is the association between lifetime marijuana use (mj_lifetime) and age at first use of alcohol (alc_agefirst), adjusted for age (demog_age_cat6), sex (demog_sex), and income (demog_income)? Fit a model with and without an interaction term (alc_agefirst:demog_sex) and compute the AOR, its 95% CI, and the p-value that tests the significance of age at first use of alcohol. Report also their AORs of the other predictors, their 95% CIs and p-values. Since there are some categorical predictors with more than two levels, use car::Anova() function to compute p-values for multiple degrees of freedom. In your answer, fill in the following template.
| (1) | (2) | |
|---|---|---|
| (Intercept) | 6.254 *** [5.131, 7.451] | 7.374 *** [5.819, 9.073] |
| (0.591) | (0.829) | |
| alc_agefirst | −0.275 *** [−0.331, −0.223] | −0.338 *** [−0.425, −0.259] |
| (0.028) | (0.042) | |
| demog_age_cat626-34 | −0.296 [−0.949, 0.343] | −0.295 [−0.954, 0.350] |
| (0.329) | (0.331) | |
| demog_age_cat635-49 | −0.804 ** [−1.400, −0.234] | −0.816 ** [−1.417, −0.242] |
| (0.297) | (0.299) | |
| demog_age_cat650-64 | −0.690 * [−1.289, −0.116] | −0.687 * [−1.291, −0.108] |
| (0.299) | (0.301) | |
| demog_age_cat665+ | −1.275 *** [−1.885, −0.689] | −1.260 *** [−1.875, −0.671] |
| (0.304) | (0.306) | |
| demog_sexMale | −0.061 [−0.379, 0.256] | −2.138 * [−4.093, −0.224] |
| (0.162) | (0.985) | |
| demog_income$20,000 - $49,999 | −0.531 * [−1.059, −0.013] | −0.530 * [−1.062, −0.009] |
| (0.266) | (0.268) | |
| demog_income$50,000 - $74,999 | −0.079 [−0.679, 0.519] | −0.093 [−0.697, 0.510] |
| (0.305) | (0.307) | |
| demog_income$75,000 or more | −0.361 [−0.864, 0.130] | −0.363 [−0.869, 0.131] |
| (0.253) | (0.255) | |
| alc_agefirst × demog_sexMale | 0.119 * [0.011, 0.229] | |
| (0.056) | ||
| Num.Obs. | 843 | 843 |
| AIC | 955.6 | 952.9 |
| BIC | 1002.9 | 1005.0 |
| Log.Lik. | −467.776 | −465.448 |
| F | 14.804 | 13.139 |
| RMSE | 0.43 | 0.43 |
## Analysis of Deviance Table (Type III tests)
##
## Response: mj_lifetime
## LR Chisq Df Pr(>Chisq)
## alc_agefirst 149.610 1 < 2.2e-16 ***
## demog_age_cat6 23.964 4 8.12e-05 ***
## demog_sex 0.142 1 0.7066
## demog_income 5.510 3 0.1380
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table (Type III tests)
##
## Response: mj_lifetime
## LR Chisq Df Pr(>Chisq)
## alc_agefirst 108.028 1 < 2.2e-16 ***
## demog_age_cat6 23.248 4 0.000113 ***
## demog_sex 4.797 1 0.028506 *
## demog_income 5.296 3 0.151361
## alc_agefirst:demog_sex 4.656 1 0.030953 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Likelihood ratio test
##
## Model 1: mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income
## Model 2: mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex + demog_income +
## alc_agefirst:demog_sex
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 10 -467.78
## 2 11 -465.45 1 4.6556 0.03095 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Answer: The AOR for our primary predictor alc_agefirst is exp(−0.275) = 0.76. This represents the OR for lifetime marijuana use comparing those with a one-year difference in age at first use of alcohol, adjusted for age, sex, and income. The remaining AORs compare levels of categorical predictors to their reference level, adjusted for the other predictors in the model. For example, comparing individuals with the same age of first alcohol use, sex, and income, 35-49 year-olds have exp(−0.804) = 0.45, 55% lower odds of lifetime marijuana use than 18-25 year-olds (OR = 0.45; 95% CI = 0.25; 0.79; p < 0.01). An overall, 4 df p-value for age, can be read from the Type III Test table (p < 0.001). Conclusion: After adjusting for age, sex, and income, age at first alcohol use is significantly negatively associated with lifetime marijuana use (AOR = exp(−0.275) = 0.76; 95% CI = 0.72, 0.8 ; p < 0.001). Individuals who first used alcohol at a given age have 24% lower odds of having ever used marijuana than those who first used alcohol one year earlier. The association between age of first alcohol use and lifetime marijuana use differs significantly between males and females (p = 0.030953). A likelihood ratio test shows that the adjusted model with the interaction is superior to the one without the interaction (p = 0.03095).
Question: Replicate the model summary and the forest plot shown below (the forest plot using a centralized variable alc_agefirst, scaled such that its standard deviation is 0.5).
| Unadjusted | Adjusted | Interaction | |
|---|---|---|---|
| (Intercept) | 5.341 *** | 6.254 *** | 7.374 *** |
| (0.475) | (0.591) | (0.829) | |
| Alcohol (age first used) | −0.284 *** | −0.275 *** | −0.338 *** |
| (0.027) | (0.028) | (0.042) | |
| Age [26-34] | −0.296 | −0.295 | |
| (0.329) | (0.331) | ||
| Age [35-49] | −0.804 ** | −0.816 ** | |
| (0.297) | (0.299) | ||
| Age [50-64] | −0.690 * | −0.687 * | |
| (0.299) | (0.301) | ||
| Age [65+] | −1.275 *** | −1.260 *** | |
| (0.304) | (0.306) | ||
| Sex [Male] | −0.061 | −2.138 * | |
| (0.162) | (0.985) | ||
| Income [$20,000 - $49,999] | −0.531 * | −0.530 * | |
| (0.266) | (0.268) | ||
| Income [$50,000 - $74,999] | −0.079 | −0.093 | |
| (0.305) | (0.307) | ||
| Income [$75,000 or more] | −0.361 | −0.363 | |
| (0.253) | (0.255) | ||
| Alcohol (age first used):Sex [Male] | 0.119 * | ||
| (0.056) | |||
| Num.Obs. | 843 | 843 | 843 |
| AIC | 972.4 | 955.6 | 952.9 |
| BIC | 981.9 | 1002.9 | 1005.0 |
| Log.Lik. | −484.219 | −467.776 | −465.448 |
| F | 112.743 | 14.804 | 13.139 |
| RMSE | 0.44 | 0.43 | 0.43 |