# Core data wrangling and string handling
library(dplyr) # data manipulation (filter, select, mutate, etc.)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr) # string operations
# Table creation and reporting
library(gtsummary) # publication-style summary tables
library(forcats) # factor handling (relevel, recode)
# Statistical modeling
library(lmerTest) # linear mixed-effects models with p-values
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(MatchIt) # propensity score matching
# Visualization
library(ggplot2) # core plotting system
library(patchwork) # arrange multiple ggplots
library(gridExtra) # arrange multiple plots (grid.arrange)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Misc utilities
library(rlang) # tidy evaluation
library(rmarkdown) # knitting & document generation
library(tidyverse) # collection of core packages (incl. dplyr, ggplot2, etc.)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%() masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke() masks rlang::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ purrr::splice() masks rlang::splice()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) # fast CSV/TSV reading
library(flextable) # export gtsummary tables to Word
##
## Attaching package: 'flextable'
##
## The following object is masked from 'package:purrr':
##
## compose
##
## The following object is masked from 'package:gtsummary':
##
## continuous_summary
library(broom) # tidy model output (glm, lm, etc.)
library(broom.mixed) # tidy model output (lmer, glmer, etc.)
getwd()
## [1] "/Users/carolinhohne/Documents/charite/Fortbildungen:Kongresse:Poster/diclofenac-AD/R-diclo/NSAID-Dementia"
og_data <- read.csv("Investigator_nacc60.csv")
data <- og_data # work on a copy of the original data
# ── Dataset Overview────────────────────────────────────────────────────────
# Count how many unique participants (NACCID) are in the dataset
n_unique_ids <- length(unique(data$NACCID))
n_unique_ids # print the result
## [1] 47165
# ── Adding NSAID Columns ────────────────────────────────────────────────────────
# Identify all drug-related columns (start with "DRUG")
drug_columns <- grep("^DRUG", names(data), value = TRUE)
# Standardize drug names: lowercase for consistent matching
data[drug_columns] <- lapply(data[drug_columns], tolower)
# Flag diclofenac use: 1 if "diclofenac" appears in any DRUG column
data$DICLOFENAC <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("diclofenac", x))) > 0, 1, 0)
# Flag naproxen use
data$NAPROXEN <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("naproxen", x))) > 0, 1, 0)
# Flag etodolac use
data$ETODOLAC <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("etodolac", x))) > 0, 1, 0)
# Define “NONSAID”: 1 if none of the NSAIDs were used
data$NONSAID <- ifelse(data$DICLOFENAC == 0 & data$NAPROXEN == 0 & data$ETODOLAC == 0, 1, 0)
# ── Diagnostic conditions ─────────────────────────────────────────────────────
diagnosis_cols <- c("NACCTBI", "HXHYPER", "HXSTROKE", "DEP", "BIPOLDX", "SCHIZOP", "ANXIET", "PTSDDX", "OTHPSY", "ALCABUSE", "CANCER","DIABETES", "MYOINF", "CONGHRT", "AFIBRILL", "HYPERT", "HYPCHOL", "VB12DEF", "THYDIS", "ARTH", "SLEEPAP", "OTHCOND")
# Convert diagnosis codes (1 or 2) → binary indicator (1 = has condition, else 0)
data <- data %>%
mutate(across(all_of(diagnosis_cols), ~ ifelse(. %in% c(1, 2), 1, 0)))
# Collapse per patient (NACCID): take max across visits, count #conditions
diagnosis_summary <- data %>%
group_by(NACCID) %>%
summarise(across(all_of(diagnosis_cols), ~ max(.x, na.rm = TRUE))) %>%
mutate(DIAGNOSIS = rowSums(across(all_of(diagnosis_cols))))
# Join summary back to full data (so each row has DIAGNOSIS count)
data <- data %>%
left_join(diagnosis_summary %>% dplyr::select(NACCID, DIAGNOSIS), by = "NACCID")
# ── Collapse NSAID use per patient ────────────────────────────────────────────
data <- data %>%
group_by(NACCID) %>%
mutate(
DICLOFENAC = ifelse(any(DICLOFENAC == 1, na.rm = TRUE), 1, 0),
NAPROXEN = ifelse(any(NAPROXEN == 1, na.rm = TRUE), 1, 0),
ETODOLAC = ifelse(any(ETODOLAC == 1, na.rm = TRUE), 1, 0),
NONSAID = ifelse(DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0, 1, 0)
) %>% ungroup()
# Define mutually exclusive NSAID categories
data <- data %>%
mutate(NSAID_TYPE = case_when(
DICLOFENAC == 1 & NAPROXEN == 0 & ETODOLAC == 0 ~ "DICLOFENAC",
NAPROXEN == 1 & DICLOFENAC == 0 & ETODOLAC == 0 ~ "NAPROXEN",
ETODOLAC == 1 & DICLOFENAC == 0 & NAPROXEN == 0 ~ "ETODOLAC",
DICLOFENAC + NAPROXEN + ETODOLAC > 1 ~ "MULTIPLE",
DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0 ~ "NONSAID",
TRUE ~ NA_character_
))
# ── Clean numeric codes (set sentinel values to NA) ───────────────────────────
data <- data %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., -4))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 88))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 95))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 96))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 97))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 98))) %>%
mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 99)))
# Identify participants who have at least one valid (non-missing) MOCA score
naccid_with_moca <- unique(data$NACCID[!is.na(data$NACCMOCA)])
# Count how many participants have MOCA data available
length(naccid_with_moca)
## [1] 20176
# ── Most recent visit with the most recent valid MOCA per patient ───────────────────────────────────────
most_recent_data <- data %>%
group_by(NACCID) %>%
mutate(
# Per patient: pull MOCA from the visit with the highest NACCVNUM among valid (0–30)
latest_NACCMOCA = NACCMOCA[which.max(ifelse(NACCMOCA %in% 0:30, NACCVNUM, -Inf))]
) %>%
ungroup()
most_recent_data <- most_recent_data %>%
group_by(NACCID) %>%
filter(NACCVNUM == max(NACCVNUM)) %>% # keep only the latest visit per patient
slice_max(NACCVNUM, n = 1) %>% # (defensive) ensure a single row if ties
ungroup() %>%
# Overwrite visit MOCA with the most recent valid MOCA (NA if none)
mutate(NACCMOCA = latest_NACCMOCA)
# ── Build collapsed dataset for analysis ──────────────────────────────────────
# keep only variables needed for analysis (IDs, demographics, diagnoses, MOCA subscores, NSAID use)
collapsed_data <- most_recent_data %>%
dplyr::select(
NACCID, NACCVNUM, NACCAGE, SEX, NACCNIHR, EDUC, NACCALZD, NACCALZP,
NACCMOCA, CDRGLOB, NACCMMSE, NORMCOG, DEMENTED, NACCUDSD, CSFTAU,
NACCTBI, HXHYPER, HXSTROKE, DEP, BIPOLDX, SCHIZOP, ANXIET, PTSDDX, OTHPSY,
ALCABUSE, CANCER, DIABETES, MYOINF, CONGHRT, AFIBRILL, HYPERT, HYPCHOL,
VB12DEF, THYDIS, ARTH, SLEEPAP, OTHCOND,
DICLOFENAC, NAPROXEN, ETODOLAC, DIAGNOSIS, NONSAID,
MOCATRAI, MOCACUBE, MOCACLOC, MOCACLON, MOCACLOH, MOCANAMI,
MOCAFLUE, MOCAREPE, MOCAREGI, MOCARECN, MOCAABST, MOCADIGI,
MOCALETT, MOCASER7, MOCAORDT, MOCAORMO, MOCAORYR, MOCAORDY,
MOCAORPL, MOCAORCT, NSAID_TYPE)
# Quick summary check of selected variables
summary(collapsed_data[, c("NSAID_TYPE", "NACCAGE", "SEX", "NACCNIHR", "EDUC",
"NACCTBI", "DEP", "BIPOLDX", "SCHIZOP", "ANXIET",
"PTSDDX", "CANCER", "DIABETES", "CONGHRT", "HYPERT",
"HYPCHOL", "VB12DEF", "THYDIS", "ARTH", "SLEEPAP",
"DICLOFENAC", "NAPROXEN", "ETODOLAC", "DIAGNOSIS")])
## NSAID_TYPE NACCAGE SEX NACCNIHR
## Length:47165 Min. : 18.00 Min. :1.00 Min. :1.000
## Class :character 1st Qu.: 68.00 1st Qu.:1.00 1st Qu.:1.000
## Mode :character Median : 75.00 Median :2.00 Median :1.000
## Mean : 74.62 Mean :1.57 Mean :1.411
## 3rd Qu.: 82.00 3rd Qu.:2.00 3rd Qu.:1.000
## Max. :110.00 Max. :2.00 Max. :6.000
## NA's :824
## EDUC NACCTBI DEP BIPOLDX
## Min. : 0.00 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:12.00 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :16.00 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :15.17 Mean :0.07887 Mean :0.1763 Mean :0.00299
## 3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :31.00 Max. :1.00000 Max. :1.0000 Max. :1.00000
## NA's :363
## SCHIZOP ANXIET PTSDDX CANCER
## Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.0000000 Median :0.00000 Median :0.000000 Median :0.00000
## Mean :0.0008057 Mean :0.02837 Mean :0.002353 Mean :0.09348
## 3rd Qu.:0.0000000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.0000000 Max. :1.00000 Max. :1.000000 Max. :1.00000
##
## DIABETES CONGHRT HYPERT HYPCHOL
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.09492 Mean :0.01446 Mean :0.2561 Mean :0.2704
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## VB12DEF THYDIS ARTH SLEEPAP
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.03785 Mean :0.09844 Mean :0.2793 Mean :0.09354
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## DICLOFENAC NAPROXEN ETODOLAC DIAGNOSIS
## Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.: 1.000
## Median :0.0000 Median :0.00000 Median :0.000000 Median : 2.000
## Mean :0.0219 Mean :0.06098 Mean :0.003922 Mean : 2.692
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.: 4.000
## Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :15.000
##
# ── Convert variables ──────────────────────────────────
collapsed_data <- collapsed_data %>%
mutate(
across(c(
SEX, NACCNIHR, NACCTBI, DEP, BIPOLDX, SCHIZOP, ANXIET, PTSDDX,
CANCER, DIABETES, CONGHRT, HYPERT, HYPCHOL, VB12DEF, THYDIS, ARTH,
SLEEPAP, DICLOFENAC, NAPROXEN, ETODOLAC, NONSAID
), as.factor),
DIAGNOSIS = as.numeric(DIAGNOSIS),
CSFTAU = factor(case_when(
CSFTAU == 1 ~ "Positive",
CSFTAU == 0 ~ "Negative",
TRUE ~ NA_character_),
levels = c("Negative", "Positive")))
# ── Recode & clean variables for descriptives ─────────────────────────────────
descriptive_data <- collapsed_data %>%
mutate(
# ── Demographics ──
SEX = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
NACCNIHR = fct_recode(as.factor(NACCNIHR), # recode race categories
"White" = "1",
"Black or African American" = "2",
"American Indian or Alaska Native" = "3",
"Asian" = "5",
"Native Hawaiian or Pacific Islander" = "4",
"Multiracial" = "6",
"Unknown" = "99") |>
fct_explicit_na("Unknown"),
EDUC = ifelse(EDUC %in% 0:36, EDUC, NA), # restrict education years to 0–36
# ── Clinical diagnosis outcomes ──
NACCALZD = factor(NACCALZD, levels = c(0, 1, 8),
labels = c("No cognitive impairment", "Yes", "No (assumed assessed and found not present")),
DEMENTED = factor(DEMENTED, levels = c(0, 1), labels = c("No", "Yes")),
# ── Cognitive test scores ──
NACCMOCA = ifelse(NACCMOCA %in% 0:30, NACCMOCA, NA) # restrict to valid MOCA range
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `NACCNIHR = fct_explicit_na(...)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# ── Table 1: Baseline descriptives
summary_table1 <- descriptive_data %>%
dplyr::select(
`Age (years)` = NACCAGE,
`Sex` = SEX,
`Race` = NACCNIHR,
`Education (years)` = EDUC,
`Alzheimer’s disease diagnosis (presumptive)` = NACCALZD,
`Education-corrected MOCA score` = NACCMOCA,
`Dementia (all-cause)` = DEMENTED,
`CSF Tau positivity` = CSFTAU,
`NSAID Use` = NSAID_TYPE
) %>%
tbl_summary(
statistic = list(
all_continuous() ~ "{median} ({p25},{p75})", # medians + IQR
all_categorical() ~ "{n} ({p}%)" # counts + %
),
missing = "always" # always report missing
) %>%
modify_header(label = "**Variable**")
# Print in R and export to Word
summary_table1
| Variable | N = 47,1651 |
|---|---|
| Age (years) | 75 (68,82) |
| Unknown | 0 |
| Sex | |
| Male | 20,263 (43%) |
| Female | 26,902 (57%) |
| Unknown | 0 |
| Race | |
| White | 37,297 (79%) |
| Black or African American | 5,996 (13%) |
| American Indian or Alaska Native | 288 (0.6%) |
| Native Hawaiian or Pacific Islander | 36 (<0.1%) |
| Asian | 1,265 (2.7%) |
| Multiracial | 1,459 (3.1%) |
| Unknown | 824 (1.7%) |
| Unknown | 0 |
| Education (years) | 16 (12,18) |
| Unknown | 363 |
| Alzheimer’s disease diagnosis (presumptive) | |
| No cognitive impairment | 9,696 (21%) |
| Yes | 20,644 (44%) |
| No (assumed assessed and found not present | 16,825 (36%) |
| Unknown | 0 |
| Education-corrected MOCA score | 24 (19,27) |
| Unknown | 26,989 |
| Dementia (all-cause) | 20,053 (43%) |
| Unknown | 0 |
| CSF Tau positivity | |
| Negative | 767 (53%) |
| Positive | 667 (47%) |
| Unknown | 45,731 |
| NSAID Use | |
| DICLOFENAC | 879 (1.9%) |
| ETODOLAC | 151 (0.3%) |
| MULTIPLE | 174 (0.4%) |
| NAPROXEN | 2,715 (5.8%) |
| NONSAID | 43,246 (92%) |
| Unknown | 0 |
| 1 Median (Q1,Q3); n (%) | |
summary_table1 %>% as_flex_table() %>% flextable::save_as_docx(path = "Table1.docx")
# ── Table 2: Comparison across NSAID types ────────────────────────────────────
comparison_table <- descriptive_data %>%
dplyr::select(
NSAID_TYPE,
`Age (years)` = NACCAGE,
Sex = SEX,
Race = NACCNIHR,
`Education (years)` = EDUC,
`Alzheimer’s diagnosis` = NACCALZD,
`MOCA score` = NACCMOCA,
`Dementia (all-cause)` = DEMENTED,
`CSF Tau positivity` = CSFTAU
) %>%
tbl_summary(
by = NSAID_TYPE, # stratify by NSAID group
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})",
all_categorical() ~ "{n} ({p}%)"
),
missing = "always"
) %>%
add_p( # add group comparisons
test = list(
all_continuous() ~ "kruskal.test",
all_categorical() ~ "chisq.test"
)
) %>%
modify_header(label = "**Variable**", p.value = "**P-value**") %>%
bold_labels() %>%
bold_p(t = 0.05) # bold significant p-values
## The following warnings were returned during `bold_p()`:
## ! For variable `CSF Tau positivity` (`NSAID_TYPE`) and "statistic", "p.value",
## and "parameter" statistics: Chi-squared approximation may be incorrect
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
## "parameter" statistics: Chi-squared approximation may be incorrect
# Print in R and export to Word
comparison_table
| Variable | DICLOFENAC N = 8791 |
ETODOLAC N = 1511 |
MULTIPLE N = 1741 |
NAPROXEN N = 2,7151 |
NONSAID N = 43,2461 |
P-value2 |
|---|---|---|---|---|---|---|
| Age (years) | 76 (70, 83) | 76 (70, 83) | 78 (71, 85) | 76 (69, 82) | 75 (68, 82) | <0.001 |
| Unknown | 0 | 0 | 0 | 0 | 0 | |
| Sex | <0.001 | |||||
| Male | 277 (32%) | 69 (46%) | 57 (33%) | 1,051 (39%) | 18,809 (43%) | |
| Female | 602 (68%) | 82 (54%) | 117 (67%) | 1,664 (61%) | 24,437 (57%) | |
| Unknown | 0 | 0 | 0 | 0 | 0 | |
| Race | <0.001 | |||||
| White | 659 (75%) | 127 (84%) | 135 (78%) | 2,110 (78%) | 34,266 (79%) | |
| Black or African American | 141 (16%) | 17 (11%) | 30 (17%) | 393 (14%) | 5,415 (13%) | |
| American Indian or Alaska Native | 7 (0.8%) | 1 (0.7%) | 2 (1.1%) | 20 (0.7%) | 258 (0.6%) | |
| Native Hawaiian or Pacific Islander | 1 (0.1%) | 0 (0%) | 0 (0%) | 2 (<0.1%) | 33 (<0.1%) | |
| Asian | 14 (1.6%) | 0 (0%) | 2 (1.1%) | 39 (1.4%) | 1,210 (2.8%) | |
| Multiracial | 43 (4.9%) | 5 (3.3%) | 2 (1.1%) | 118 (4.3%) | 1,291 (3.0%) | |
| Unknown | 14 (1.6%) | 1 (0.7%) | 3 (1.7%) | 33 (1.2%) | 773 (1.8%) | |
| Unknown | 0 | 0 | 0 | 0 | 0 | |
| Education (years) | 16 (13, 18) | 16 (12, 18) | 16 (14, 18) | 16 (13, 18) | 16 (12, 18) | 0.4 |
| Unknown | 4 | 0 | 1 | 15 | 343 | |
| Alzheimer’s diagnosis | <0.001 | |||||
| No cognitive impairment | 164 (19%) | 25 (17%) | 26 (15%) | 467 (17%) | 9,014 (21%) | |
| Yes | 313 (36%) | 63 (42%) | 52 (30%) | 1,013 (37%) | 19,203 (44%) | |
| No (assumed assessed and found not present | 402 (46%) | 63 (42%) | 96 (55%) | 1,235 (45%) | 15,029 (35%) | |
| Unknown | 0 | 0 | 0 | 0 | 0 | |
| MOCA score | 24 (20, 27) | 25 (20, 27) | 24 (19, 28) | 25 (21, 28) | 24 (18, 27) | <0.001 |
| Unknown | 322 | 99 | 55 | 1,255 | 25,258 | |
| Dementia (all-cause) | 252 (29%) | 54 (36%) | 43 (25%) | 889 (33%) | 18,815 (44%) | <0.001 |
| Unknown | 0 | 0 | 0 | 0 | 0 | |
| CSF Tau positivity | 0.11 | |||||
| Negative | 16 (55%) | 0 (0%) | 7 (88%) | 53 (60%) | 691 (53%) | |
| Positive | 13 (45%) | 2 (100%) | 1 (13%) | 36 (40%) | 615 (47%) | |
| Unknown | 850 | 149 | 166 | 2,626 | 41,940 | |
| 1 Median (Q1, Q3); n (%) | ||||||
| 2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test | ||||||
comparison_table %>%
as_flex_table() %>%
flextable::save_as_docx(path = "Table2.docx")
Collapsed data collapsed data 1 collapsed data 2 Diclofenac collapsed data 3 Naproxen collapsed data 4 Etodolac
# ── Propensity Score Matching ────────────────────────────────
# We compared each NSAID (diclofenac, naproxen, etodolac) vs non-use (NONSAID).
# Steps: (1) define treatment, (2) restrict to complete covariates,
# (3) 1:1 nearest-neighbor PSM, (4) logistic regression for AD diagnosis.
# ── Data Cleaning and Outcome Recoding ────────────────────────────────
# Keep only rows where NACCALZD != 0
# Recode outcome:
# - 8 → 0
# - 1 → 1
collapsed_data <- collapsed_data %>%
filter(NACCALZD != 0) %>%
mutate(NACCALZD = case_when(
NACCALZD == 8 ~ 0,
NACCALZD == 1 ~ 1
))
set.seed(20250421)
# ── Treatment Group (Diclofenac) Definition ────────────────────────────────────────
# Create binary treatment variable:
# - DICLOFENAC → 1
# - NONSAID → 0
# - others → NA (removed later)
collapsed_data2 <- collapsed_data %>%
mutate(treatment_group = ifelse(NSAID_TYPE == "DICLOFENAC", 1,
ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>%
filter(!is.na(treatment_group))
# ── Missing Data Filtering ────────────────────────────────────────────
# Drop rows with missing values in key covariates
collapsed_data2 <- collapsed_data2 %>%
filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) &
!is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))
# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching, 1:1 ratio
psm_model_DICLO_NON <- matchit(
treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI +
DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER +
DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF +
THYDIS + ARTH + SLEEPAP,
data = collapsed_data2,
method = "nearest",
ratio = 1)
# Extract matched dataset
matched_data_DICLO_NON <- match.data(psm_model_DICLO_NON)
# ── Diagnostic: Check Unique Values ───────────────────────────────────
# See how many unique non-missing values each variable has
# (Variables with only one level provide no information)
sapply(matched_data_DICLO_NON, function(x) length(unique(na.omit(x))))
## NACCID NACCVNUM NACCAGE SEX NACCNIHR
## 1404 17 60 2 6
## EDUC NACCALZD NACCALZP NACCMOCA CDRGLOB
## 24 2 3 30 5
## NACCMMSE NORMCOG DEMENTED NACCUDSD CSFTAU
## 29 2 2 4 2
## NACCTBI HXHYPER HXSTROKE DEP BIPOLDX
## 2 2 2 2 2
## SCHIZOP ANXIET PTSDDX OTHPSY ALCABUSE
## 1 2 2 2 1
## CANCER DIABETES MYOINF CONGHRT AFIBRILL
## 2 2 2 2 2
## HYPERT HYPCHOL VB12DEF THYDIS ARTH
## 2 2 2 2 2
## SLEEPAP OTHCOND DICLOFENAC NAPROXEN ETODOLAC
## 2 2 2 1 1
## DIAGNOSIS NONSAID MOCATRAI MOCACUBE MOCACLOC
## 14 2 2 2 2
## MOCACLON MOCACLOH MOCANAMI MOCAFLUE MOCAREPE
## 2 2 4 2 3
## MOCAREGI MOCARECN MOCAABST MOCADIGI MOCALETT
## 11 6 3 3 2
## MOCASER7 MOCAORDT MOCAORMO MOCAORYR MOCAORDY
## 4 2 2 2 2
## MOCAORPL MOCAORCT NSAID_TYPE treatment_group distance
## 2 2 2 2 1060
## weights subclass
## 1 702
# ── Factor Handling ───────────────────────────────────────────────────
# Replace missing NACCNIHR with "Unknown" and ensure factor type
matched_data_DICLO_NON$NACCNIHR <- fct_explicit_na(matched_data_DICLO_NON$NACCNIHR, na_level = "Unknown")
matched_data_DICLO_NON$NACCNIHR <- as.factor(matched_data_DICLO_NON$NACCNIHR)
# ── Relabel Factors ───────────────────────────────────────────────────
# Give descriptive labels for treatment_group and NACCALZD
matched_clean_DICLO_NON <- matched_data_DICLO_NON %>%
mutate(
treatment_group = factor(treatment_group, levels = c(0, 1), labels = c("NONSAID", "DICLOFENAC")),
NACCALZD = factor(NACCALZD, levels = c(0, 1), labels = c("No", "Yes")) )
# ── Chi-square Test ───────────────────────────────────────────────────
# Test association between treatment group and NACCALZD outcome
chisq.test(table(matched_clean_DICLO_NON$treatment_group, matched_clean_DICLO_NON$NACCALZD))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(matched_clean_DICLO_NON$treatment_group, matched_clean_DICLO_NON$NACCALZD)
## X-squared = 8.0264, df = 1, p-value = 0.00461
# ── Treatment Group (Naproxen) Definition ─────────────────────────────────────────
# Create treatment group:
# - NAPROXEN → 1
# - NONSAID → 0
# - others → NA (removed later)
collapsed_data3 <- collapsed_data %>%
mutate(treatment_group = ifelse(NSAID_TYPE == "NAPROXEN", 1,
ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>%
filter(!is.na(treatment_group))
# ── Missing Data Filtering ────────────────────────────────────────────
# Keep only complete cases for important covariates
collapsed_data3 <- collapsed_data3 %>%
filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) &
!is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))
# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching (1:1 ratio)
psm_model_NAP_NON <- matchit(
treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI +
DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER +
DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF +
THYDIS + ARTH + SLEEPAP,
data = collapsed_data3,
method = "nearest",
ratio = 1)
# Extract matched dataset
matched_data_NAP_NON <- match.data(psm_model_NAP_NON)
# ── Diagnostic: Check Unique Values ───────────────────────────────────
# Count number of unique values per variable
sapply(matched_data_NAP_NON, function(x) length(unique(na.omit(x))))
## NACCID NACCVNUM NACCAGE SEX NACCNIHR
## 4420 17 75 2 6
## EDUC NACCALZD NACCALZP NACCMOCA CDRGLOB
## 29 2 4 31 5
## NACCMMSE NORMCOG DEMENTED NACCUDSD CSFTAU
## 31 2 2 4 2
## NACCTBI HXHYPER HXSTROKE DEP BIPOLDX
## 2 2 2 2 2
## SCHIZOP ANXIET PTSDDX OTHPSY ALCABUSE
## 2 2 2 2 2
## CANCER DIABETES MYOINF CONGHRT AFIBRILL
## 2 2 2 2 2
## HYPERT HYPCHOL VB12DEF THYDIS ARTH
## 2 2 2 2 2
## SLEEPAP OTHCOND DICLOFENAC NAPROXEN ETODOLAC
## 2 2 1 2 1
## DIAGNOSIS NONSAID MOCATRAI MOCACUBE MOCACLOC
## 15 2 2 2 2
## MOCACLON MOCACLOH MOCANAMI MOCAFLUE MOCAREPE
## 2 2 4 2 3
## MOCAREGI MOCARECN MOCAABST MOCADIGI MOCALETT
## 11 6 3 3 2
## MOCASER7 MOCAORDT MOCAORMO MOCAORYR MOCAORDY
## 4 2 2 2 2
## MOCAORPL MOCAORCT NSAID_TYPE treatment_group distance
## 2 2 2 2 3073
## weights subclass
## 1 2210
# ── Chi-square Test ───────────────────────────────────────────────────
# Association between treatment group and Alzheimer’s diagnosis
matched_clean_NAP_NON <- matched_data_NAP_NON %>%
mutate(
treatment_group = factor(treatment_group, levels = c(0, 1), labels = c("NONSAID", "NAPROXEN")),
NACCALZD = factor(NACCALZD, levels = c(0, 1), labels = c("No", "Yes")))
chisq.test(table(matched_clean_NAP_NON$treatment_group, matched_clean_NAP_NON$NACCALZD))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(matched_clean_NAP_NON$treatment_group, matched_clean_NAP_NON$NACCALZD)
## X-squared = 21.768, df = 1, p-value = 3.077e-06
# ── Treatment Group (Etodolac) Definition ─────────────────────────────────────────
# Create treatment group:
# - ETODOLAC → 1
# - NONSAID → 0
# - others → NA (removed later)
collapsed_data4 <- collapsed_data %>%
mutate(treatment_group = ifelse(NSAID_TYPE == "ETODOLAC", 1,
ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>%
filter(!is.na(treatment_group))
# ── Missing Data Filtering ────────────────────────────────────────────
# Keep only complete cases for important covariates
collapsed_data4 <- collapsed_data4 %>%
filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) &
!is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))
# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching (1:1 ratio)
psm_model_ETO_NON <- matchit(
treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI +
DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER +
DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF +
THYDIS + ARTH + SLEEPAP,
data = collapsed_data4,
method = "nearest",
ratio = 1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Extract matched dataset
matched_data_ETO_NON <- match.data(psm_model_ETO_NON)
# ── Diagnostic: Check Unique Values ───────────────────────────────────
# Count number of unique values per variable
sapply(matched_data_ETO_NON, function(x) length(unique(na.omit(x))))
## NACCID NACCVNUM NACCAGE SEX NACCNIHR
## 250 15 47 2 4
## EDUC NACCALZD NACCALZP NACCMOCA CDRGLOB
## 20 2 3 24 5
## NACCMMSE NORMCOG DEMENTED NACCUDSD CSFTAU
## 24 2 2 3 2
## NACCTBI HXHYPER HXSTROKE DEP BIPOLDX
## 2 2 2 2 1
## SCHIZOP ANXIET PTSDDX OTHPSY ALCABUSE
## 1 2 1 2 1
## CANCER DIABETES MYOINF CONGHRT AFIBRILL
## 2 2 2 2 2
## HYPERT HYPCHOL VB12DEF THYDIS ARTH
## 2 2 2 2 2
## SLEEPAP OTHCOND DICLOFENAC NAPROXEN ETODOLAC
## 2 2 1 1 2
## DIAGNOSIS NONSAID MOCATRAI MOCACUBE MOCACLOC
## 11 2 2 2 2
## MOCACLON MOCACLOH MOCANAMI MOCAFLUE MOCAREPE
## 2 2 4 2 3
## MOCAREGI MOCARECN MOCAABST MOCADIGI MOCALETT
## 11 6 3 3 2
## MOCASER7 MOCAORDT MOCAORMO MOCAORYR MOCAORDY
## 4 2 2 2 2
## MOCAORPL MOCAORCT NSAID_TYPE treatment_group distance
## 2 2 2 2 177
## weights subclass
## 1 125
## Supplementary tables -------------------------------------------------------
# Recode + clean for table
clean_for_table <- function(df) {
df %>%
dplyr::mutate(
SEX = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
NACCALZD = factor(NACCALZD, levels = c(0, 1, 8),
labels = c("No cognitive impairment", "Yes", "No")),
DEMENTED = factor(DEMENTED, levels = c(0, 1), labels = c("No", "Yes")),
CSFTAU = dplyr::case_when(
CSFTAU == 0 ~ "Negative",
CSFTAU == 1 ~ "Positive",
CSFTAU == "Unknown" ~ NA_character_,
TRUE ~ as.character(CSFTAU)
),
DICLOFENAC = factor(DICLOFENAC, levels = c(0, 1), labels = c("No", "Yes")),
NAPROXEN = factor(NAPROXEN, levels = c(0, 1), labels = c("No", "Yes")),
ETODOLAC = factor(ETODOLAC, levels = c(0, 1), labels = c("No", "Yes")),
NONSAID = factor(NONSAID, levels = c(0, 1), labels = c("No", "Yes")),
EDUC = ifelse(EDUC >= 0 & EDUC <= 36, EDUC, NA),
NACCMOCA = ifelse(NACCMOCA >= 0 & NACCMOCA <= 30, NACCMOCA, NA),
NACCNIHR = forcats::fct_recode(as.factor(NACCNIHR),
"White" = "1",
"Black or African American" = "2",
"American Indian or Alaska Native" = "3",
"Asian" = "5",
"Native Hawaiian or Pacific Islander" = "4",
"Multiracial" = "6",
"Unknown" = "99")
) %>%
dplyr::mutate(
NACCNIHR = forcats::fct_explicit_na(NACCNIHR, na_level = "Unknown"),
NACCNIHR = as.factor(NACCNIHR)
)
}
# Build a gtsummary table from a cleaned matched dataset
build_matched_table <- function(clean_df) {
clean_df %>%
dplyr::select(
NSAID_TYPE,
`Age (years)` = NACCAGE,
Sex = SEX,
Race = NACCNIHR,
`Education (years)` = EDUC,
`Alzheimer’s diagnosis` = NACCALZD,
`MOCA score` = NACCMOCA,
`Dementia (all-cause)` = DEMENTED,
`CSF Tau positivity` = CSFTAU
) %>%
gtsummary::tbl_summary(
by = NSAID_TYPE,
statistic = list(
gtsummary::all_continuous() ~ "{median} ({p25},{p75})",
gtsummary::all_categorical() ~ "{n} ({p}%)"
),
missing = "always"
) %>%
gtsummary::add_p(
test = list(
gtsummary::all_continuous() ~ "t.test",
gtsummary::all_categorical() ~ "chisq.test"
)
) %>%
gtsummary::modify_header(label = "**Variable**", p.value = "**P-value**") %>%
gtsummary::bold_p(t = 0.05)
}
# --- DICLOFENAC vs NONSAID ----------------------------------------------------
clean_data_DICLO_NON <- clean_for_table(matched_data_DICLO_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_DICLO_NON <- build_matched_table(clean_data_DICLO_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
## "parameter" statistics: Chi-squared approximation may be incorrect
# --- NAPROXEN vs NONSAID ------------------------------------------------------
clean_data_NAP_NON <- clean_for_table(matched_data_NAP_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_NAP_NON <- build_matched_table(clean_data_NAP_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
## "parameter" statistics: Chi-squared approximation may be incorrect
# --- ETODOLAC vs NONSAID ------------------------------------------------------
clean_data_ETO_NON <- clean_for_table(matched_data_ETO_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_ETO_NON <- build_matched_table(clean_data_ETO_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `CSF Tau positivity` (`NSAID_TYPE`) and "statistic", "p.value",
## and "parameter" statistics: Chi-squared approximation may be incorrect
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
## "parameter" statistics: Chi-squared approximation may be incorrect
# --- Export all three as ONE Word file ---------------------------------------
ft_diclo <- gtsummary::as_flex_table(table_matched_DICLO_NON)
ft_nap <- gtsummary::as_flex_table(table_matched_NAP_NON)
ft_eto <- gtsummary::as_flex_table(table_matched_ETO_NON)
flextable::save_as_docx(
"Supplementary Table S1. Diclofenac vs No NSAID (matched sample)" = ft_diclo,
"Supplementary Table S2. Naproxen vs No NSAID (matched sample)" = ft_nap,
"Supplementary Table S3. Etodolac vs No NSAID (matched sample)" = ft_eto,
path = "Supplementary_Tables.docx"
)
# ── Prevalence: Demented / NACCALZD by Treatment ───────────────────────
# This section builds % bar plots (with SE bars and chi-square p-values)
# for DEMENTED and NACCALZD across matched NSAID vs NONSAID groups.
# ── Logistic Models (Unadjusted ORs) ───────────────────────────────────
# Build simple logistic models per drug × outcome (OR/CI)
# DEMENTED models
logistic_model_DEM_DICLO_NON <- glm(
DEMENTED ~ treatment_group,
data = matched_data_DICLO_NON,
family = binomial(link = "logit"))
logistic_model_DEM_NAP_NON <- glm(
DEMENTED ~ treatment_group,
data = matched_data_NAP_NON,
family = binomial(link = "logit"))
logistic_model_DEM_ETO_NON <- glm(
DEMENTED ~ treatment_group,
data = matched_data_ETO_NON,
family = binomial(link = "logit"))
# NACCALZD models
logistic_model_NACCALZD_DICLO_NON <- glm(
NACCALZD ~ treatment_group,
data = matched_data_DICLO_NON,
family = binomial(link = "logit"))
logistic_model_NACCALZD_NAP_NON <- glm(
NACCALZD ~ treatment_group,
data = matched_data_NAP_NON,
family = binomial(link = "logit"))
logistic_model_NACCALZD_ETO_NON <- glm(
NACCALZD ~ treatment_group,
data = matched_data_ETO_NON,
family = binomial(link = "logit"))
# ── Helper: Extract OR/CI from glm for forest plotting ─────────────────
extract_or <- function(model, comparison, outcome_label) {
broom::tidy(model, conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "treatment_group") %>%
transmute(
group = comparison,
outcome = outcome_label,
OR = estimate,
CI_lower = conf.low,
CI_upper = conf.high,
p_value = p.value
)
}
# ── Collect ORs from all 6 models ─────────────────────────────────────
results_list <- list(
extract_or(logistic_model_DEM_DICLO_NON, "Diclofenac vs No NSAID", "Cognitive Status: demented"),
extract_or(logistic_model_DEM_NAP_NON, "Naproxen vs No NSAID", "Cognitive Status: demented"),
extract_or(logistic_model_DEM_ETO_NON, "Etodolac vs No NSAID", "Cognitive Status: demented"),
extract_or(logistic_model_NACCALZD_DICLO_NON, "Diclofenac vs No NSAID", "Alzheimer’s diagnosis"),
extract_or(logistic_model_NACCALZD_NAP_NON, "Naproxen vs No NSAID", "Alzheimer’s diagnosis"),
extract_or(logistic_model_NACCALZD_ETO_NON, "Etodolac vs No NSAID", "Alzheimer’s diagnosis")
)
logit_summary <- bind_rows(results_list)
# Add significance stars, factor order, and label text
logit_summary <- logit_summary %>%
mutate(
stars = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
p_value < 0.1 ~ ".",
TRUE ~ ""
),
outcome = factor(outcome, levels = c("Cognitive Status: demented", "Alzheimer’s diagnosis")),
group = factor(group, levels = c("Diclofenac vs No NSAID", "Naproxen vs No NSAID", "Etodolac vs No NSAID")),
label_text = paste0("OR = ", sprintf("%.2f", OR), "\n[", sprintf("%.2f", CI_lower), ", ", sprintf("%.2f", CI_upper), "] ", stars)
)
# ── Forest Plot helper ─────────────────────────────────────────────────
make_forest_plot <- function(df, title_text) {
ggplot(df, aes(x = OR, y = outcome)) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = CI_lower, xmax = CI_upper), height = 0.1) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_x_log10(limits = c(0.4, 1.5)) +
geom_text(aes(label = stars), hjust = -0.4, vjust = -0.8, size = 4) +
labs(
x = "Odds Ratio (log scale)",
y = NULL,
title = title_text
) +
theme_minimal(base_size = 20) +
theme(
plot.title = element_text(size = 11, face = "bold", hjust = 0.5),
axis.text.x = element_text(size = 11),
axis.text.y = element_text(size = 18),
axis.title.x = element_text(size = 11)
)
}
# ── % Plot Function (Final version, larger fonts) ──────────────────────
plot_NSAID_DEM_NACCALZD_percent <- function(data, var, group_labels, title, treat_value, control_value) {
temp_data <- data %>%
mutate(treatment_group = case_when(
NSAID_TYPE == treat_value ~ 1,
NSAID_TYPE == control_value ~ 0,
TRUE ~ NA_real_
)) %>%
filter(!is.na(treatment_group))
test <- chisq.test(table(temp_data$treatment_group, temp_data[[var]]))
pval <- round(test$p.value, 3)
p_text <- ifelse(pval < 0.001, "p < 0.001", paste0("p = ", pval))
summary_df <- temp_data %>%
group_by(treatment_group) %>%
summarise(
n = n(),
positive = sum(.data[[var]] == 1, na.rm = TRUE),
percent = 100 * positive / n,
se = sqrt((percent / 100) * (1 - percent / 100) / n) * 100,
.groups = "drop"
) %>%
mutate(treatment_group = factor(treatment_group, labels = group_labels))
y_max <- 70
ggplot(summary_df, aes(x = treatment_group, y = percent)) +
geom_col(fill = "steelblue", width = 0.5) +
geom_errorbar(aes(ymin = percent - se, ymax = percent + se), width = 0.2) +
labs(
title = title,
subtitle = p_text,
y = "Percent (%)",
x = NULL
) +
scale_y_continuous(limits = c(0, y_max), expand = expansion(mult = c(0, 0.05))) +
theme_minimal(base_size = 20) +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 18, hjust = 0.5),
axis.text = element_text(size = 18),
axis.title = element_text(size = 18)
)
}
# ── Build 6 % Plots ───────────────────────────────────────────────────
p1 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_DICLO_NON, "DEMENTED", c("No NSAID", "Diclofenac"), "", "DICLOFENAC", "NONSAID")
p2 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_DICLO_NON, "NACCALZD", c("No NSAID", "Diclofenac"), "", "DICLOFENAC", "NONSAID")
p3 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_NAP_NON, "DEMENTED", c("No NSAID", "Naproxen"), "", "NAPROXEN", "NONSAID")
p4 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_NAP_NON, "NACCALZD", c("No NSAID", "Naproxen"), "", "NAPROXEN", "NONSAID")
p5 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_ETO_NON, "DEMENTED", c("No NSAID", "Etodolac"), "", "ETODOLAC", "NONSAID")
p6 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_ETO_NON, "NACCALZD", c("No NSAID", "Etodolac"), "", "ETODOLAC", "NONSAID")
# ── Forest Panels by Comparison ───────────────────────────────────────
p7 <- make_forest_plot(filter(logit_summary, group == "Diclofenac vs No NSAID"), "Diclofenac vs No NSAID")
p8 <- make_forest_plot(filter(logit_summary, group == "Naproxen vs No NSAID"), "Naproxen vs No NSAID")
p9 <- make_forest_plot(filter(logit_summary, group == "Etodolac vs No NSAID"), "Etodolac vs No NSAID")
# ── Final Composite Layout ────────────────────────────────────────────
final_plot <- (p1 | p2 | p7) /
(p3 | p4 | p8) /
(p5 | p6 | p9) +
plot_layout(guides = "collect") +
plot_annotation(title = "", theme = theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5)))
# Show in console
final_plot
# Save to file
ggsave("Figure1.pdf", final_plot, width = 14, height = 12)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)
# ── MoCA Score per NSAID Use ──────────────────────────────────────────
# Boxplots of MOCA by treatment (overall, among DEMENTED==1, and among NACCALZD==1)
# with Wilcoxon rank-sum p-values in subtitles.
# ── Helper: Boxplot with Wilcoxon test ─────────────────────────────────
# Inputs:
# data : matched dataset for a specific comparison (has treatment_group 0/1)
# treat_label : c("Control label","Treatment label") used to relabel 0/1
# title_text : plot title
plot_moca_by_treatment <- function(data, treat_label, title_text) {
data <- data %>%
filter(!is.na(NACCMOCA)) %>% # drop missing MOCA
mutate(
treatment_group = factor(
treatment_group,
levels = c(0, 1),
labels = treat_label
)
)
# Wilcoxon rank-sum test comparing MOCA across the two groups
pval <- wilcox.test(NACCMOCA ~ treatment_group, data = data)$p.value
p_text <- ifelse(pval < 0.001, "p < 0.001", paste0("p = ", signif(pval, 3)))
# Boxplot (0–30 MOCA scale), minimal theme
ggplot(data, aes(x = treatment_group, y = NACCMOCA)) +
geom_boxplot(outlier.shape = NA, fill = "steelblue", color = "black") +
theme_minimal(base_size = 20) +
coord_cartesian(ylim = c(0, 30)) +
labs(
title = title_text,
subtitle = p_text,
x = "",
y = "MOCA Score"
) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 16)
)
}
# ── Overall Boxplots (All Participants) ─────────────────────────────────
p1 <- plot_moca_by_treatment(matched_data_DICLO_NON, c("No NSAID", "Diclofenac"), "")
p2 <- plot_moca_by_treatment(matched_data_NAP_NON, c("No NSAID", "Naproxen"), "")
p3 <- plot_moca_by_treatment(matched_data_ETO_NON, c("No NSAID", "Etodolac"), "")
# ── Boxplots Among DEMENTED == 1 ───────────────────────────────────────
p4 <- plot_moca_by_treatment(
matched_data_DICLO_NON %>% filter(DEMENTED == 1),
c("No NSAID", "Diclofenac"), ""
)
p5 <- plot_moca_by_treatment(
matched_data_NAP_NON %>% filter(DEMENTED == 1),
c("No NSAID", "Naproxen"), ""
)
p6 <- plot_moca_by_treatment(
matched_data_ETO_NON %>% filter(DEMENTED == 1),
c("No NSAID", "Etodolac"), ""
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
# ── Boxplots Among NACCALZD == 1 ───────────────────────────────────────
p7 <- plot_moca_by_treatment(
matched_data_DICLO_NON %>% filter(NACCALZD == 1),
c("No NSAID", "Diclofenac"), ""
)
p8 <- plot_moca_by_treatment(
matched_data_NAP_NON %>% filter(NACCALZD == 1),
c("No NSAID", "Naproxen"), ""
)
p9 <- plot_moca_by_treatment(
matched_data_ETO_NON %>% filter(NACCALZD == 1),
c("No NSAID", "Etodolac"), ""
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
# ── Save 3×3 Grid to PDF ──────────────────────────────────────────────
# Layout (left→right, top→bottom): DICLO overall / DEMENTED / AD; then NAP; then ETO.
pdf("Figure2.pdf", width = 12, height = 10)
gridExtra::grid.arrange(p1, p4, p7,
p2, p5, p8,
p3, p6, p9,
ncol = 3)
dev.off()
## quartz_off_screen
## 2
# ── MOCA Trajectory for NSAID ─────────────────────────────────────────
# This section fits a linear mixed-effects model of MOCA score over time
# by NSAID group, extracts slopes with CIs, and visualizes trajectories.
# ── NSAID Group Definition ────────────────────────────────────────────
data_moca <- data %>%
mutate(NSAID_TYPE = case_when(
DICLOFENAC == 1 & NAPROXEN == 0 & ETODOLAC == 0 ~ "DICLOFENAC",
NAPROXEN == 1 & DICLOFENAC == 0 & ETODOLAC == 0 ~ "NAPROXEN",
ETODOLAC == 1 & DICLOFENAC == 0 & NAPROXEN == 0 ~ "ETODOLAC",
DICLOFENAC + NAPROXEN + ETODOLAC > 1 ~ "MULTIPLE",
DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0 ~ "NONSAID",
TRUE ~ NA_character_
)) %>%
filter(NSAID_TYPE %in% c("NONSAID", "DICLOFENAC", "NAPROXEN", "ETODOLAC", "MULTIPLE")) %>%
group_by(NACCID) %>%
mutate(time_since_baseline = NACCFDYS - min(NACCFDYS)) %>%
ungroup() %>%
mutate(year = time_since_baseline / 365)
# ── Data Checks ───────────────────────────────────────────────────────
naccid_with_moca <- unique(data_moca$NACCID[!is.na(data_moca$NACCMOCA)])
length(naccid_with_moca)
## [1] 20176
# Set out-of-range MOCA values to missing
data_moca$NACCMOCA[ data_moca$NACCMOCA < 0 | data_moca$NACCMOCA > 30 ] <- NA
# Distribution of visits per participant
visits_per_subject <- data_moca %>%
count(NACCID, name = "num_visits")
visit_distribution_table <- visits_per_subject %>%
count(num_visits, name = "num_subjects") %>%
arrange(num_visits)
visit_distribution_table
## # A tibble: 18 × 2
## num_visits num_subjects
## <int> <int>
## 1 1 14784
## 2 2 7894
## 3 3 6051
## 4 4 4771
## 5 5 3332
## 6 6 2524
## 7 7 1916
## 8 8 1415
## 9 9 1114
## 10 10 905
## 11 11 757
## 12 12 537
## 13 13 459
## 14 14 306
## 15 15 219
## 16 16 131
## 17 17 47
## 18 18 3
# ── Factor Levels and Scaling ─────────────────────────────────────────
data_moca$NSAID_TYPE <- factor(data_moca$NSAID_TYPE,
levels = c("NONSAID", "DICLOFENAC", "NAPROXEN", "ETODOLAC", "MULTIPLE"))
data_moca <- data_moca %>%
mutate(NACCFDYS_scaled = scale(NACCFDYS, center = TRUE, scale = TRUE))
# Save scale info for back-transform
sd_days <- attr(data_moca$NACCFDYS_scaled, "scaled:scale")
mean_days <- attr(data_moca$NACCFDYS_scaled, "scaled:center")
# ── Mixed Model ───────────────────────────────────────────────────────
lmm <- lmer(NACCMOCA ~ NACCFDYS_scaled * NSAID_TYPE + (1 + NACCFDYS_scaled | NACCID),
data = data_moca)
# ── Extract Fixed Effects & Variance-Covariance ───────────────────────
coef <- fixef(lmm)
vcov_mat <- vcov(lmm)
# ── Helper: Compute Back-Transformed Slopes + CIs ─────────────────────
get_slope_ci <- function(base, int = 0, var_base, var_int = 0, covar = 0) {
est_scaled <- base + int
se_scaled <- sqrt(var_base + var_int + 2 * covar)
est_day <- est_scaled / sd_days
se_day <- se_scaled / sd_days
ci_day <- est_day + c(-1.96, 1.96) * se_day
est_year <- est_day * 365
se_year <- se_day * 365
ci_year <- est_year + c(-1.96, 1.96) * se_year
data.frame(
estimate_per_day = est_day,
ci_low_day = ci_day[1],
ci_high_day = ci_day[2],
estimate_per_year = est_year,
ci_low_year = ci_year[1],
ci_high_year = ci_year[2]
)
}
# ── Slopes for Each Group ─────────────────────────────────────────────
res_nonsaid <- get_slope_ci(
base = coef["NACCFDYS_scaled"],
var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"])
res_diclo <- get_slope_ci(
base = coef["NACCFDYS_scaled"],
int = coef["NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"],
var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEDICLOFENAC", "NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"],
covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"])
res_naprox <- get_slope_ci(
base = coef["NACCFDYS_scaled"],
int = coef["NACCFDYS_scaled:NSAID_TYPENAPROXEN"],
var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPENAPROXEN", "NACCFDYS_scaled:NSAID_TYPENAPROXEN"],
covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPENAPROXEN"])
res_etodolac <- get_slope_ci(
base = coef["NACCFDYS_scaled"],
int = coef["NACCFDYS_scaled:NSAID_TYPEETODOLAC"],
var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEETODOLAC", "NACCFDYS_scaled:NSAID_TYPEETODOLAC"],
covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEETODOLAC"])
res_multiple <- get_slope_ci(
base = coef["NACCFDYS_scaled"],
int = coef["NACCFDYS_scaled:NSAID_TYPEMULTIPLE"],
var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEMULTIPLE", "NACCFDYS_scaled:NSAID_TYPEMULTIPLE"],
covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEMULTIPLE"])
# ── Combine Slopes ────────────────────────────────────────────────────
slope_results <- bind_rows(
NONSAID = res_nonsaid,
DICLOFENAC = res_diclo,
NAPROXEN = res_naprox,
ETODOLAC = res_etodolac,
MULTIPLE = res_multiple,
.id = "NSAID_TYPE")
# ── P-values & Stars ──────────────────────────────────────────────────
slope_pvals <- broom.mixed::tidy(lmm, effects = "fixed") %>%
filter(str_detect(term, "NACCFDYS_scaled")) %>%
mutate(
NSAID_TYPE = dplyr::recode(term,
"NACCFDYS_scaled" = "NONSAID",
"NACCFDYS_scaled:NSAID_TYPEDICLOFENAC" = "DICLOFENAC",
"NACCFDYS_scaled:NSAID_TYPENAPROXEN" = "NAPROXEN",
"NACCFDYS_scaled:NSAID_TYPEETODOLAC" = "ETODOLAC",
"NACCFDYS_scaled:NSAID_TYPEMULTIPLE" = "MULTIPLE"),
stars = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ "",
TRUE ~ ""
)
) %>%
dplyr::select(NSAID_TYPE, p.value, stars)
slope_annotate <- slope_results %>%
left_join(slope_pvals, by = "NSAID_TYPE") %>%
mutate(label = paste0(round(estimate_per_year, 2), " /y ", stars))
# ── Predictions for Plotting ──────────────────────────────────────────
time_years <- seq(0, 15, length.out = 20)
time_days <- time_years * 365
time_scaled <- (time_days - mean_days) / sd_days
newdata <- expand.grid(
NACCFDYS_scaled = time_scaled,
NSAID_TYPE = levels(data_moca$NSAID_TYPE))
Xmat <- model.matrix(~ NACCFDYS_scaled * NSAID_TYPE, newdata)
newdata$predicted <- as.vector(Xmat %*% coef)
se <- sqrt(diag(Xmat %*% vcov_mat %*% t(Xmat)))
newdata$year <- time_days / 365
newdata$ci_low <- newdata$predicted + (-1.96) * se
newdata$ci_high <- newdata$predicted + ( 1.96) * se
# ── Legend Labels ─────────────────────────────────────────────────────
display_names <- c("No NSAID", "Diclofenac", "Naproxen", "Etodolac", "Multiple NSAIDs")
legend_labels <- slope_annotate %>%
mutate(
formatted = paste0(
str_pad(display_names, width = 18, side = "right"),
str_pad(paste0(round(estimate_per_year, 2), " /y ", stars), width = 10, side = "left"))
) %>%
pull(formatted)
# ── Plot Trajectories ─────────────────────────────────────────────────
p <- ggplot(newdata, aes(x = year, y = predicted, color = NSAID_TYPE)) +
geom_line(size = 1.2) +
geom_ribbon(aes(ymin = ci_low, ymax = ci_high, fill = NSAID_TYPE),
alpha = 0.2, color = NA) +
labs(
title = "",
x = "Years Since First Visit",
y = "Predicted MOCA Score",
color = "NSAID Type",
fill = "NSAID Type"
) +
ylim(15, 30) +
scale_color_brewer(
palette = "Set1",
name = "Group (slope per year)",
labels = legend_labels) +
scale_fill_brewer(
palette = "Set1",
name = "Group (slope per year)",
labels = legend_labels) +
theme_classic(base_size = 14)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ── Save Plot ─────────────────────────────────────────────────────────
ggsave("Fig3.pdf", plot = p, width = 10, height = 6)