Loading Libraries and Data into Environment

# Core data wrangling and string handling
library(dplyr)        # data manipulation (filter, select, mutate, etc.)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)      # string operations

# Table creation and reporting
library(gtsummary)    # publication-style summary tables
library(forcats)      # factor handling (relevel, recode)

# Statistical modeling
library(lmerTest)     # linear mixed-effects models with p-values
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(MatchIt)      # propensity score matching

# Visualization
library(ggplot2)      # core plotting system
library(patchwork)    # arrange multiple ggplots
library(gridExtra)    # arrange multiple plots (grid.arrange)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Misc utilities
library(rlang)        # tidy evaluation
library(rmarkdown)    # knitting & document generation
library(tidyverse)    # collection of core packages (incl. dplyr, ggplot2, etc.)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%()         masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ tidyr::expand()      masks Matrix::expand()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ purrr::flatten()     masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke()      masks rlang::invoke()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tidyr::pack()        masks Matrix::pack()
## ✖ purrr::splice()      masks rlang::splice()
## ✖ tidyr::unpack()      masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)        # fast CSV/TSV reading
library(flextable)    # export gtsummary tables to Word
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## The following object is masked from 'package:gtsummary':
## 
##     continuous_summary
library(broom)        # tidy model output (glm, lm, etc.)
library(broom.mixed)  # tidy model output (lmer, glmer, etc.)

getwd()
## [1] "/Users/carolinhohne/Documents/charite/Fortbildungen:Kongresse:Poster/diclofenac-AD/R-diclo/NSAID-Dementia"
og_data <- read.csv("Investigator_nacc60.csv")

── Dataset Preparation

data <- og_data   # work on a copy of the original data
# ── Dataset Overview────────────────────────────────────────────────────────

# Count how many unique participants (NACCID) are in the dataset
n_unique_ids <- length(unique(data$NACCID))
n_unique_ids   # print the result
## [1] 47165
# ── Adding NSAID Columns ────────────────────────────────────────────────────────
# Identify all drug-related columns (start with "DRUG")
drug_columns <- grep("^DRUG", names(data), value = TRUE)

# Standardize drug names: lowercase for consistent matching
data[drug_columns] <- lapply(data[drug_columns], tolower)

# Flag diclofenac use: 1 if "diclofenac" appears in any DRUG column
data$DICLOFENAC <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("diclofenac", x))) > 0, 1, 0)

# Flag naproxen use
data$NAPROXEN <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("naproxen", x))) > 0, 1, 0)

# Flag etodolac use
data$ETODOLAC <- ifelse(rowSums(sapply(data[drug_columns], function(x) grepl("etodolac", x))) > 0, 1, 0)

# Define “NONSAID”: 1 if none of the NSAIDs were used
data$NONSAID <- ifelse(data$DICLOFENAC == 0 & data$NAPROXEN == 0 & data$ETODOLAC == 0, 1, 0)


# ── Diagnostic conditions ─────────────────────────────────────────────────────
diagnosis_cols <- c("NACCTBI", "HXHYPER", "HXSTROKE", "DEP", "BIPOLDX", "SCHIZOP", "ANXIET", "PTSDDX", "OTHPSY", "ALCABUSE", "CANCER","DIABETES", "MYOINF", "CONGHRT", "AFIBRILL", "HYPERT", "HYPCHOL", "VB12DEF", "THYDIS", "ARTH", "SLEEPAP", "OTHCOND")

# Convert diagnosis codes (1 or 2) → binary indicator (1 = has condition, else 0)
data <- data %>%
  mutate(across(all_of(diagnosis_cols), ~ ifelse(. %in% c(1, 2), 1, 0)))

# Collapse per patient (NACCID): take max across visits, count #conditions
diagnosis_summary <- data %>%
  group_by(NACCID) %>%
  summarise(across(all_of(diagnosis_cols), ~ max(.x, na.rm = TRUE))) %>%
  mutate(DIAGNOSIS = rowSums(across(all_of(diagnosis_cols))))

# Join summary back to full data (so each row has DIAGNOSIS count)
data <- data %>%
  left_join(diagnosis_summary %>% dplyr::select(NACCID, DIAGNOSIS), by = "NACCID")


# ── Collapse NSAID use per patient ────────────────────────────────────────────
data <- data %>%
  group_by(NACCID) %>%
  mutate(
    DICLOFENAC = ifelse(any(DICLOFENAC == 1, na.rm = TRUE), 1, 0),
    NAPROXEN   = ifelse(any(NAPROXEN == 1, na.rm = TRUE), 1, 0),
    ETODOLAC   = ifelse(any(ETODOLAC == 1, na.rm = TRUE), 1, 0),
    NONSAID    = ifelse(DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0, 1, 0)
  ) %>% ungroup()

# Define mutually exclusive NSAID categories
data <- data %>%
  mutate(NSAID_TYPE = case_when(
    DICLOFENAC == 1 & NAPROXEN == 0 & ETODOLAC == 0 ~ "DICLOFENAC",
    NAPROXEN   == 1 & DICLOFENAC == 0 & ETODOLAC == 0 ~ "NAPROXEN",
    ETODOLAC   == 1 & DICLOFENAC == 0 & NAPROXEN == 0 ~ "ETODOLAC",
    DICLOFENAC + NAPROXEN + ETODOLAC > 1             ~ "MULTIPLE",
    DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0  ~ "NONSAID",
    TRUE ~ NA_character_
  ))


# ── Clean numeric codes (set sentinel values to NA) ───────────────────────────
data <- data %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., -4))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 88))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 95))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 96))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 97))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 98))) %>%
  mutate(across(where(is.numeric) & !all_of("NACCAGE"), ~ na_if(., 99)))

# Identify participants who have at least one valid (non-missing) MOCA score
naccid_with_moca <- unique(data$NACCID[!is.na(data$NACCMOCA)])

# Count how many participants have MOCA data available
length(naccid_with_moca)
## [1] 20176
# ── Most recent visit with the most recent valid MOCA per patient ───────────────────────────────────────

most_recent_data <- data %>%
  group_by(NACCID) %>%
  mutate(
    # Per patient: pull MOCA from the visit with the highest NACCVNUM among valid (0–30)
    latest_NACCMOCA = NACCMOCA[which.max(ifelse(NACCMOCA %in% 0:30, NACCVNUM, -Inf))]
  ) %>%
  ungroup()


most_recent_data <- most_recent_data %>%
  group_by(NACCID) %>%
  filter(NACCVNUM == max(NACCVNUM)) %>%   # keep only the latest visit per patient
  slice_max(NACCVNUM, n = 1) %>%          # (defensive) ensure a single row if ties
  ungroup() %>%
  # Overwrite visit MOCA with the most recent valid MOCA (NA if none)
  mutate(NACCMOCA = latest_NACCMOCA) 


# ── Build collapsed dataset for analysis ──────────────────────────────────────
# keep only variables needed for analysis (IDs, demographics, diagnoses, MOCA subscores, NSAID use)

collapsed_data <- most_recent_data %>%
  dplyr::select(
    NACCID, NACCVNUM, NACCAGE, SEX, NACCNIHR, EDUC, NACCALZD, NACCALZP,
    NACCMOCA, CDRGLOB, NACCMMSE, NORMCOG, DEMENTED, NACCUDSD, CSFTAU,
    NACCTBI, HXHYPER, HXSTROKE, DEP, BIPOLDX, SCHIZOP, ANXIET, PTSDDX, OTHPSY,
    ALCABUSE, CANCER, DIABETES, MYOINF, CONGHRT, AFIBRILL, HYPERT, HYPCHOL,
    VB12DEF, THYDIS, ARTH, SLEEPAP, OTHCOND,
    DICLOFENAC, NAPROXEN, ETODOLAC, DIAGNOSIS, NONSAID,
    MOCATRAI, MOCACUBE, MOCACLOC, MOCACLON, MOCACLOH, MOCANAMI,
    MOCAFLUE, MOCAREPE, MOCAREGI, MOCARECN, MOCAABST, MOCADIGI,
    MOCALETT, MOCASER7, MOCAORDT, MOCAORMO, MOCAORYR, MOCAORDY,
    MOCAORPL, MOCAORCT, NSAID_TYPE)


# Quick summary check of selected variables
summary(collapsed_data[, c("NSAID_TYPE", "NACCAGE", "SEX", "NACCNIHR", "EDUC",
                           "NACCTBI", "DEP", "BIPOLDX", "SCHIZOP", "ANXIET",
                           "PTSDDX", "CANCER", "DIABETES", "CONGHRT", "HYPERT",
                           "HYPCHOL", "VB12DEF", "THYDIS", "ARTH", "SLEEPAP",
                           "DICLOFENAC", "NAPROXEN", "ETODOLAC", "DIAGNOSIS")])
##   NSAID_TYPE           NACCAGE            SEX          NACCNIHR    
##  Length:47165       Min.   : 18.00   Min.   :1.00   Min.   :1.000  
##  Class :character   1st Qu.: 68.00   1st Qu.:1.00   1st Qu.:1.000  
##  Mode  :character   Median : 75.00   Median :2.00   Median :1.000  
##                     Mean   : 74.62   Mean   :1.57   Mean   :1.411  
##                     3rd Qu.: 82.00   3rd Qu.:2.00   3rd Qu.:1.000  
##                     Max.   :110.00   Max.   :2.00   Max.   :6.000  
##                                                     NA's   :824    
##       EDUC          NACCTBI             DEP            BIPOLDX       
##  Min.   : 0.00   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:12.00   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :16.00   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :15.17   Mean   :0.07887   Mean   :0.1763   Mean   :0.00299  
##  3rd Qu.:18.00   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :31.00   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##  NA's   :363                                                         
##     SCHIZOP              ANXIET            PTSDDX             CANCER       
##  Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.0000000   Median :0.00000   Median :0.000000   Median :0.00000  
##  Mean   :0.0008057   Mean   :0.02837   Mean   :0.002353   Mean   :0.09348  
##  3rd Qu.:0.0000000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000  
##  Max.   :1.0000000   Max.   :1.00000   Max.   :1.000000   Max.   :1.00000  
##                                                                            
##     DIABETES          CONGHRT            HYPERT          HYPCHOL      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.09492   Mean   :0.01446   Mean   :0.2561   Mean   :0.2704  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##                                                                       
##     VB12DEF            THYDIS             ARTH           SLEEPAP       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.03785   Mean   :0.09844   Mean   :0.2793   Mean   :0.09354  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##                                                                        
##    DICLOFENAC        NAPROXEN          ETODOLAC          DIAGNOSIS     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.: 1.000  
##  Median :0.0000   Median :0.00000   Median :0.000000   Median : 2.000  
##  Mean   :0.0219   Mean   :0.06098   Mean   :0.003922   Mean   : 2.692  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.: 4.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000000   Max.   :15.000  
## 
# ── Convert variables  ──────────────────────────────────
collapsed_data <- collapsed_data %>%
  mutate(
    across(c(
      SEX, NACCNIHR, NACCTBI, DEP, BIPOLDX, SCHIZOP, ANXIET, PTSDDX,
      CANCER, DIABETES, CONGHRT, HYPERT, HYPCHOL, VB12DEF, THYDIS, ARTH,
      SLEEPAP, DICLOFENAC, NAPROXEN, ETODOLAC, NONSAID
    ), as.factor),
    DIAGNOSIS = as.numeric(DIAGNOSIS),
    CSFTAU = factor(case_when(
                      CSFTAU == 1 ~ "Positive",
                      CSFTAU == 0 ~ "Negative",
                      TRUE        ~ NA_character_),
                    levels = c("Negative", "Positive")))

────── Descriptive Analysis of Study Population

# ── Recode & clean variables for descriptives ─────────────────────────────────
descriptive_data <- collapsed_data %>%
  mutate(
    # ── Demographics ──
    SEX       = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
    NACCNIHR  = fct_recode(as.factor(NACCNIHR),       # recode race categories
                           "White" = "1",
                           "Black or African American" = "2",
                           "American Indian or Alaska Native" = "3",
                           "Asian" = "5",
                           "Native Hawaiian or Pacific Islander" = "4",
                           "Multiracial" = "6",
                           "Unknown" = "99") |> 
                fct_explicit_na("Unknown"),          
    EDUC      = ifelse(EDUC %in% 0:36, EDUC, NA),    # restrict education years to 0–36

    # ── Clinical diagnosis outcomes ──
    NACCALZD  = factor(NACCALZD, levels = c(0, 1, 8),
                       labels = c("No cognitive impairment", "Yes", "No (assumed assessed and found not present")),
    DEMENTED  = factor(DEMENTED, levels = c(0, 1), labels = c("No", "Yes")),

    # ── Cognitive test scores ──
    NACCMOCA = ifelse(NACCMOCA %in% 0:30, NACCMOCA, NA)   # restrict to valid MOCA range
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `NACCNIHR = fct_explicit_na(...)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# ── Table 1: Baseline descriptives 
summary_table1 <- descriptive_data %>%
  dplyr::select(
    `Age (years)`                        = NACCAGE,
    `Sex`                                = SEX,
    `Race`                               = NACCNIHR,
    `Education (years)`                  = EDUC,
    `Alzheimer’s disease diagnosis (presumptive)` = NACCALZD,
    `Education-corrected MOCA score`     = NACCMOCA,
    `Dementia (all-cause)`               = DEMENTED,
    `CSF Tau positivity`                 = CSFTAU,
    `NSAID Use`                          = NSAID_TYPE
  ) %>%
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{median} ({p25},{p75})",   # medians + IQR
      all_categorical() ~ "{n} ({p}%)"                # counts + %
    ),
    missing = "always"                                # always report missing
  ) %>%
  modify_header(label = "**Variable**")

# Print in R and export to Word
summary_table1
Variable N = 47,1651
Age (years) 75 (68,82)
    Unknown 0
Sex
    Male 20,263 (43%)
    Female 26,902 (57%)
    Unknown 0
Race
    White 37,297 (79%)
    Black or African American 5,996 (13%)
    American Indian or Alaska Native 288 (0.6%)
    Native Hawaiian or Pacific Islander 36 (<0.1%)
    Asian 1,265 (2.7%)
    Multiracial 1,459 (3.1%)
    Unknown 824 (1.7%)
    Unknown 0
Education (years) 16 (12,18)
    Unknown 363
Alzheimer’s disease diagnosis (presumptive)
    No cognitive impairment 9,696 (21%)
    Yes 20,644 (44%)
    No (assumed assessed and found not present 16,825 (36%)
    Unknown 0
Education-corrected MOCA score 24 (19,27)
    Unknown 26,989
Dementia (all-cause) 20,053 (43%)
    Unknown 0
CSF Tau positivity
    Negative 767 (53%)
    Positive 667 (47%)
    Unknown 45,731
NSAID Use
    DICLOFENAC 879 (1.9%)
    ETODOLAC 151 (0.3%)
    MULTIPLE 174 (0.4%)
    NAPROXEN 2,715 (5.8%)
    NONSAID 43,246 (92%)
    Unknown 0
1 Median (Q1,Q3); n (%)
summary_table1 %>% as_flex_table() %>% flextable::save_as_docx(path = "Table1.docx")


# ── Table 2: Comparison across NSAID types ────────────────────────────────────
comparison_table <- descriptive_data %>%
 dplyr::select(
    NSAID_TYPE,
    `Age (years)`           = NACCAGE,
    Sex                     = SEX,
    Race                    = NACCNIHR,
    `Education (years)`     = EDUC,
    `Alzheimer’s diagnosis` = NACCALZD,
    `MOCA score`            = NACCMOCA,
    `Dementia (all-cause)`  = DEMENTED,
    `CSF Tau positivity`    = CSFTAU
  ) %>%
  tbl_summary(
    by = NSAID_TYPE,                                # stratify by NSAID group
    statistic = list(
      all_continuous()  ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "always"
  ) %>%
  add_p(                                             # add group comparisons
    test = list(
      all_continuous()  ~ "kruskal.test",
      all_categorical() ~ "chisq.test"
    )
  ) %>%
  modify_header(label = "**Variable**", p.value = "**P-value**") %>%
  bold_labels() %>%
  bold_p(t = 0.05)                                  # bold significant p-values
## The following warnings were returned during `bold_p()`:
## ! For variable `CSF Tau positivity` (`NSAID_TYPE`) and "statistic", "p.value",
##   and "parameter" statistics: Chi-squared approximation may be incorrect
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
# Print in R and export to Word
comparison_table
Variable DICLOFENAC
N = 879
1
ETODOLAC
N = 151
1
MULTIPLE
N = 174
1
NAPROXEN
N = 2,715
1
NONSAID
N = 43,246
1
P-value2
Age (years) 76 (70, 83) 76 (70, 83) 78 (71, 85) 76 (69, 82) 75 (68, 82) <0.001
    Unknown 0 0 0 0 0
Sex




<0.001
    Male 277 (32%) 69 (46%) 57 (33%) 1,051 (39%) 18,809 (43%)
    Female 602 (68%) 82 (54%) 117 (67%) 1,664 (61%) 24,437 (57%)
    Unknown 0 0 0 0 0
Race




<0.001
    White 659 (75%) 127 (84%) 135 (78%) 2,110 (78%) 34,266 (79%)
    Black or African American 141 (16%) 17 (11%) 30 (17%) 393 (14%) 5,415 (13%)
    American Indian or Alaska Native 7 (0.8%) 1 (0.7%) 2 (1.1%) 20 (0.7%) 258 (0.6%)
    Native Hawaiian or Pacific Islander 1 (0.1%) 0 (0%) 0 (0%) 2 (<0.1%) 33 (<0.1%)
    Asian 14 (1.6%) 0 (0%) 2 (1.1%) 39 (1.4%) 1,210 (2.8%)
    Multiracial 43 (4.9%) 5 (3.3%) 2 (1.1%) 118 (4.3%) 1,291 (3.0%)
    Unknown 14 (1.6%) 1 (0.7%) 3 (1.7%) 33 (1.2%) 773 (1.8%)
    Unknown 0 0 0 0 0
Education (years) 16 (13, 18) 16 (12, 18) 16 (14, 18) 16 (13, 18) 16 (12, 18) 0.4
    Unknown 4 0 1 15 343
Alzheimer’s diagnosis




<0.001
    No cognitive impairment 164 (19%) 25 (17%) 26 (15%) 467 (17%) 9,014 (21%)
    Yes 313 (36%) 63 (42%) 52 (30%) 1,013 (37%) 19,203 (44%)
    No (assumed assessed and found not present 402 (46%) 63 (42%) 96 (55%) 1,235 (45%) 15,029 (35%)
    Unknown 0 0 0 0 0
MOCA score 24 (20, 27) 25 (20, 27) 24 (19, 28) 25 (21, 28) 24 (18, 27) <0.001
    Unknown 322 99 55 1,255 25,258
Dementia (all-cause) 252 (29%) 54 (36%) 43 (25%) 889 (33%) 18,815 (44%) <0.001
    Unknown 0 0 0 0 0
CSF Tau positivity




0.11
    Negative 16 (55%) 0 (0%) 7 (88%) 53 (60%) 691 (53%)
    Positive 13 (45%) 2 (100%) 1 (13%) 36 (40%) 615 (47%)
    Unknown 850 149 166 2,626 41,940
1 Median (Q1, Q3); n (%)
2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test
comparison_table %>%
  as_flex_table() %>%
  flextable::save_as_docx(path = "Table2.docx")

── Propensity Score Matching

Collapsed data collapsed data 1 collapsed data 2 Diclofenac collapsed data 3 Naproxen collapsed data 4 Etodolac

# ── Propensity Score Matching ────────────────────────────────
# We compared each NSAID (diclofenac, naproxen, etodolac) vs non-use (NONSAID).
# Steps: (1) define treatment, (2) restrict to complete covariates, 
# (3) 1:1 nearest-neighbor PSM, (4) logistic regression for AD diagnosis.

# ── Data Cleaning and Outcome Recoding ────────────────────────────────
# Keep only rows where NACCALZD != 0
# Recode outcome:
#   - 8 → 0
#   - 1 → 1
collapsed_data <- collapsed_data %>% 
  filter(NACCALZD != 0) %>%
  mutate(NACCALZD = case_when(
    NACCALZD == 8 ~ 0,
    NACCALZD == 1 ~ 1
  ))

set.seed(20250421)

# ── Treatment Group (Diclofenac) Definition ────────────────────────────────────────
# Create binary treatment variable:
#   - DICLOFENAC → 1
#   - NONSAID → 0
#   - others → NA (removed later)
collapsed_data2 <- collapsed_data %>%
  mutate(treatment_group = ifelse(NSAID_TYPE == "DICLOFENAC", 1,
                           ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>% 
  filter(!is.na(treatment_group))

# ── Missing Data Filtering ────────────────────────────────────────────
# Drop rows with missing values in key covariates
collapsed_data2 <- collapsed_data2 %>%
  filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) & 
         !is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))

# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching, 1:1 ratio
psm_model_DICLO_NON <- matchit(
  treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI + 
                   DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER + 
                   DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF + 
                   THYDIS + ARTH + SLEEPAP,
  data = collapsed_data2,
  method = "nearest",   
  ratio = 1)

# Extract matched dataset
matched_data_DICLO_NON <- match.data(psm_model_DICLO_NON)

# ── Diagnostic: Check Unique Values ───────────────────────────────────
# See how many unique non-missing values each variable has
# (Variables with only one level provide no information)
sapply(matched_data_DICLO_NON, function(x) length(unique(na.omit(x)))) 
##          NACCID        NACCVNUM         NACCAGE             SEX        NACCNIHR 
##            1404              17              60               2               6 
##            EDUC        NACCALZD        NACCALZP        NACCMOCA         CDRGLOB 
##              24               2               3              30               5 
##        NACCMMSE         NORMCOG        DEMENTED        NACCUDSD          CSFTAU 
##              29               2               2               4               2 
##         NACCTBI         HXHYPER        HXSTROKE             DEP         BIPOLDX 
##               2               2               2               2               2 
##         SCHIZOP          ANXIET          PTSDDX          OTHPSY        ALCABUSE 
##               1               2               2               2               1 
##          CANCER        DIABETES          MYOINF         CONGHRT        AFIBRILL 
##               2               2               2               2               2 
##          HYPERT         HYPCHOL         VB12DEF          THYDIS            ARTH 
##               2               2               2               2               2 
##         SLEEPAP         OTHCOND      DICLOFENAC        NAPROXEN        ETODOLAC 
##               2               2               2               1               1 
##       DIAGNOSIS         NONSAID        MOCATRAI        MOCACUBE        MOCACLOC 
##              14               2               2               2               2 
##        MOCACLON        MOCACLOH        MOCANAMI        MOCAFLUE        MOCAREPE 
##               2               2               4               2               3 
##        MOCAREGI        MOCARECN        MOCAABST        MOCADIGI        MOCALETT 
##              11               6               3               3               2 
##        MOCASER7        MOCAORDT        MOCAORMO        MOCAORYR        MOCAORDY 
##               4               2               2               2               2 
##        MOCAORPL        MOCAORCT      NSAID_TYPE treatment_group        distance 
##               2               2               2               2            1060 
##         weights        subclass 
##               1             702
# ── Factor Handling ───────────────────────────────────────────────────
# Replace missing NACCNIHR with "Unknown" and ensure factor type
matched_data_DICLO_NON$NACCNIHR <- fct_explicit_na(matched_data_DICLO_NON$NACCNIHR, na_level = "Unknown")
matched_data_DICLO_NON$NACCNIHR <- as.factor(matched_data_DICLO_NON$NACCNIHR)

# ── Relabel Factors ───────────────────────────────────────────────────
# Give descriptive labels for treatment_group and NACCALZD
matched_clean_DICLO_NON <- matched_data_DICLO_NON %>%
  mutate(
    treatment_group = factor(treatment_group, levels = c(0, 1), labels = c("NONSAID", "DICLOFENAC")),
    NACCALZD = factor(NACCALZD, levels = c(0, 1), labels = c("No", "Yes")) )

# ── Chi-square Test ───────────────────────────────────────────────────
# Test association between treatment group and NACCALZD outcome
chisq.test(table(matched_clean_DICLO_NON$treatment_group, matched_clean_DICLO_NON$NACCALZD))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(matched_clean_DICLO_NON$treatment_group, matched_clean_DICLO_NON$NACCALZD)
## X-squared = 8.0264, df = 1, p-value = 0.00461
# ── Treatment Group (Naproxen) Definition ─────────────────────────────────────────
# Create treatment group:
#   - NAPROXEN → 1
#   - NONSAID  → 0
#   - others   → NA (removed later)
collapsed_data3 <- collapsed_data %>%
  mutate(treatment_group = ifelse(NSAID_TYPE == "NAPROXEN", 1,
                           ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>%  
  filter(!is.na(treatment_group))  

# ── Missing Data Filtering ────────────────────────────────────────────
# Keep only complete cases for important covariates
collapsed_data3 <- collapsed_data3 %>%
  filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) & 
         !is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))

# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching (1:1 ratio)
psm_model_NAP_NON <- matchit(
  treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI +
    DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER +
    DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF +
    THYDIS + ARTH + SLEEPAP,
  data = collapsed_data3,
  method = "nearest",
  ratio = 1)

# Extract matched dataset
matched_data_NAP_NON <- match.data(psm_model_NAP_NON)

# ── Diagnostic: Check Unique Values ───────────────────────────────────
# Count number of unique values per variable
sapply(matched_data_NAP_NON, function(x) length(unique(na.omit(x))))
##          NACCID        NACCVNUM         NACCAGE             SEX        NACCNIHR 
##            4420              17              75               2               6 
##            EDUC        NACCALZD        NACCALZP        NACCMOCA         CDRGLOB 
##              29               2               4              31               5 
##        NACCMMSE         NORMCOG        DEMENTED        NACCUDSD          CSFTAU 
##              31               2               2               4               2 
##         NACCTBI         HXHYPER        HXSTROKE             DEP         BIPOLDX 
##               2               2               2               2               2 
##         SCHIZOP          ANXIET          PTSDDX          OTHPSY        ALCABUSE 
##               2               2               2               2               2 
##          CANCER        DIABETES          MYOINF         CONGHRT        AFIBRILL 
##               2               2               2               2               2 
##          HYPERT         HYPCHOL         VB12DEF          THYDIS            ARTH 
##               2               2               2               2               2 
##         SLEEPAP         OTHCOND      DICLOFENAC        NAPROXEN        ETODOLAC 
##               2               2               1               2               1 
##       DIAGNOSIS         NONSAID        MOCATRAI        MOCACUBE        MOCACLOC 
##              15               2               2               2               2 
##        MOCACLON        MOCACLOH        MOCANAMI        MOCAFLUE        MOCAREPE 
##               2               2               4               2               3 
##        MOCAREGI        MOCARECN        MOCAABST        MOCADIGI        MOCALETT 
##              11               6               3               3               2 
##        MOCASER7        MOCAORDT        MOCAORMO        MOCAORYR        MOCAORDY 
##               4               2               2               2               2 
##        MOCAORPL        MOCAORCT      NSAID_TYPE treatment_group        distance 
##               2               2               2               2            3073 
##         weights        subclass 
##               1            2210
# ── Chi-square Test ───────────────────────────────────────────────────
# Association between treatment group and Alzheimer’s diagnosis
matched_clean_NAP_NON <- matched_data_NAP_NON %>%
  mutate(
    treatment_group = factor(treatment_group, levels = c(0, 1), labels = c("NONSAID", "NAPROXEN")),
    NACCALZD = factor(NACCALZD, levels = c(0, 1), labels = c("No", "Yes")))

chisq.test(table(matched_clean_NAP_NON$treatment_group, matched_clean_NAP_NON$NACCALZD))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(matched_clean_NAP_NON$treatment_group, matched_clean_NAP_NON$NACCALZD)
## X-squared = 21.768, df = 1, p-value = 3.077e-06
# ── Treatment Group (Etodolac) Definition ─────────────────────────────────────────
# Create treatment group:
#   - ETODOLAC → 1
#   - NONSAID  → 0
#   - others   → NA (removed later)
collapsed_data4 <- collapsed_data %>%
  mutate(treatment_group = ifelse(NSAID_TYPE == "ETODOLAC", 1,
                           ifelse(NSAID_TYPE == "NONSAID", 0, NA))) %>%  
  filter(!is.na(treatment_group))  

# ── Missing Data Filtering ────────────────────────────────────────────
# Keep only complete cases for important covariates
collapsed_data4 <- collapsed_data4 %>%
  filter(!is.na(EDUC) & !is.na(NACCNIHR) & !is.na(NACCAGE) & 
         !is.na(SEX) & !is.na(DIABETES) & !is.na(HYPERT))

# ── Propensity Score Matching ─────────────────────────────────────────
# Nearest-neighbor matching (1:1 ratio)
psm_model_ETO_NON <- matchit(
  treatment_group ~ NACCAGE + SEX + EDUC + NACCNIHR + NACCTBI +
    DEP + BIPOLDX + SCHIZOP + ANXIET + PTSDDX + CANCER +
    DIABETES + CONGHRT + HYPERT + HYPCHOL + VB12DEF +
    THYDIS + ARTH + SLEEPAP,
  data = collapsed_data4,
  method = "nearest",
  ratio = 1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Extract matched dataset
matched_data_ETO_NON <- match.data(psm_model_ETO_NON)

# ── Diagnostic: Check Unique Values ───────────────────────────────────
# Count number of unique values per variable
sapply(matched_data_ETO_NON, function(x) length(unique(na.omit(x))))
##          NACCID        NACCVNUM         NACCAGE             SEX        NACCNIHR 
##             250              15              47               2               4 
##            EDUC        NACCALZD        NACCALZP        NACCMOCA         CDRGLOB 
##              20               2               3              24               5 
##        NACCMMSE         NORMCOG        DEMENTED        NACCUDSD          CSFTAU 
##              24               2               2               3               2 
##         NACCTBI         HXHYPER        HXSTROKE             DEP         BIPOLDX 
##               2               2               2               2               1 
##         SCHIZOP          ANXIET          PTSDDX          OTHPSY        ALCABUSE 
##               1               2               1               2               1 
##          CANCER        DIABETES          MYOINF         CONGHRT        AFIBRILL 
##               2               2               2               2               2 
##          HYPERT         HYPCHOL         VB12DEF          THYDIS            ARTH 
##               2               2               2               2               2 
##         SLEEPAP         OTHCOND      DICLOFENAC        NAPROXEN        ETODOLAC 
##               2               2               1               1               2 
##       DIAGNOSIS         NONSAID        MOCATRAI        MOCACUBE        MOCACLOC 
##              11               2               2               2               2 
##        MOCACLON        MOCACLOH        MOCANAMI        MOCAFLUE        MOCAREPE 
##               2               2               4               2               3 
##        MOCAREGI        MOCARECN        MOCAABST        MOCADIGI        MOCALETT 
##              11               6               3               3               2 
##        MOCASER7        MOCAORDT        MOCAORMO        MOCAORYR        MOCAORDY 
##               4               2               2               2               2 
##        MOCAORPL        MOCAORCT      NSAID_TYPE treatment_group        distance 
##               2               2               2               2             177 
##         weights        subclass 
##               1             125
## Supplementary tables  -------------------------------------------------------


# Recode + clean for table 
clean_for_table <- function(df) {
  df %>%
    dplyr::mutate(
      SEX       = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
      NACCALZD  = factor(NACCALZD, levels = c(0, 1, 8),
                         labels = c("No cognitive impairment", "Yes", "No")),
      DEMENTED  = factor(DEMENTED, levels = c(0, 1), labels = c("No", "Yes")),
      CSFTAU    = dplyr::case_when(
                     CSFTAU == 0 ~ "Negative",
                     CSFTAU == 1 ~ "Positive",
                     CSFTAU == "Unknown" ~ NA_character_,
                     TRUE ~ as.character(CSFTAU)
                   ),
      DICLOFENAC = factor(DICLOFENAC, levels = c(0, 1), labels = c("No", "Yes")),
      NAPROXEN   = factor(NAPROXEN,   levels = c(0, 1), labels = c("No", "Yes")),
      ETODOLAC   = factor(ETODOLAC,   levels = c(0, 1), labels = c("No", "Yes")),
      NONSAID    = factor(NONSAID,    levels = c(0, 1), labels = c("No", "Yes")),
      EDUC       = ifelse(EDUC >= 0 & EDUC <= 36, EDUC, NA),
      NACCMOCA   = ifelse(NACCMOCA >= 0 & NACCMOCA <= 30, NACCMOCA, NA),
      NACCNIHR   = forcats::fct_recode(as.factor(NACCNIHR),
                        "White" = "1",
                        "Black or African American" = "2",
                        "American Indian or Alaska Native" = "3",
                        "Asian" = "5",
                        "Native Hawaiian or Pacific Islander" = "4",
                        "Multiracial" = "6",
                        "Unknown" = "99")
    ) %>%
    dplyr::mutate(
      NACCNIHR = forcats::fct_explicit_na(NACCNIHR, na_level = "Unknown"),
      NACCNIHR = as.factor(NACCNIHR)
    )
}

# Build a gtsummary table from a cleaned matched dataset
build_matched_table <- function(clean_df) {
  clean_df %>%
    dplyr::select(
      NSAID_TYPE,
      `Age (years)`            = NACCAGE,
      Sex                      = SEX,
      Race                     = NACCNIHR,
      `Education (years)`      = EDUC,
      `Alzheimer’s diagnosis`  = NACCALZD,
      `MOCA score`             = NACCMOCA,
      `Dementia (all-cause)`   = DEMENTED,
      `CSF Tau positivity`     = CSFTAU
    ) %>%
    gtsummary::tbl_summary(
      by = NSAID_TYPE,
      statistic = list(
        gtsummary::all_continuous()  ~ "{median} ({p25},{p75})",
        gtsummary::all_categorical() ~ "{n} ({p}%)"
      ),
      missing = "always"
    ) %>%
    gtsummary::add_p(
      test = list(
        gtsummary::all_continuous()  ~ "t.test",
        gtsummary::all_categorical() ~ "chisq.test"
      )
    ) %>%
    gtsummary::modify_header(label = "**Variable**", p.value = "**P-value**") %>%
    gtsummary::bold_p(t = 0.05)
}

# --- DICLOFENAC vs NONSAID ----------------------------------------------------

clean_data_DICLO_NON <- clean_for_table(matched_data_DICLO_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_DICLO_NON <- build_matched_table(clean_data_DICLO_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
# --- NAPROXEN vs NONSAID ------------------------------------------------------

clean_data_NAP_NON <- clean_for_table(matched_data_NAP_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_NAP_NON <- build_matched_table(clean_data_NAP_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
# --- ETODOLAC vs NONSAID ------------------------------------------------------

clean_data_ETO_NON <- clean_for_table(matched_data_ETO_NON)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `NACCNIHR = forcats::fct_recode(...)`.
## Caused by warning:
## ! Unknown levels in `f`: 99
table_matched_ETO_NON <- build_matched_table(clean_data_ETO_NON)
## The following warnings were returned during `bold_p()`:
## ! For variable `CSF Tau positivity` (`NSAID_TYPE`) and "statistic", "p.value",
##   and "parameter" statistics: Chi-squared approximation may be incorrect
## ! For variable `Race` (`NSAID_TYPE`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
# --- Export all three as ONE Word file ---------------------------------------

ft_diclo <- gtsummary::as_flex_table(table_matched_DICLO_NON)
ft_nap   <- gtsummary::as_flex_table(table_matched_NAP_NON)
ft_eto   <- gtsummary::as_flex_table(table_matched_ETO_NON)

flextable::save_as_docx(
  "Supplementary Table S1. Diclofenac vs No NSAID (matched sample)" = ft_diclo,
  "Supplementary Table S2. Naproxen vs No NSAID (matched sample)"   = ft_nap,
  "Supplementary Table S3. Etodolac vs No NSAID (matched sample)"   = ft_eto,
  path = "Supplementary_Tables.docx"
)

── Prevalence and Odds: Demented / NACCALZD by Treatment

# ── Prevalence: Demented / NACCALZD by Treatment ───────────────────────
# This section builds % bar plots (with SE bars and chi-square p-values)
# for DEMENTED and NACCALZD across matched NSAID vs NONSAID groups.

# ── Logistic Models (Unadjusted ORs) ───────────────────────────────────
# Build simple logistic models per drug × outcome (OR/CI)

# DEMENTED models
logistic_model_DEM_DICLO_NON <- glm(
  DEMENTED ~ treatment_group,
  data = matched_data_DICLO_NON,
  family = binomial(link = "logit"))

logistic_model_DEM_NAP_NON <- glm(
  DEMENTED ~ treatment_group,
  data = matched_data_NAP_NON,
  family = binomial(link = "logit"))

logistic_model_DEM_ETO_NON <- glm(
  DEMENTED ~ treatment_group,
  data = matched_data_ETO_NON,
  family = binomial(link = "logit"))

# NACCALZD models
logistic_model_NACCALZD_DICLO_NON <- glm(
  NACCALZD ~ treatment_group,
  data = matched_data_DICLO_NON,
  family = binomial(link = "logit"))

logistic_model_NACCALZD_NAP_NON <- glm(
  NACCALZD ~ treatment_group,
  data = matched_data_NAP_NON,
  family = binomial(link = "logit"))

logistic_model_NACCALZD_ETO_NON <- glm(
  NACCALZD ~ treatment_group,
  data = matched_data_ETO_NON,
  family = binomial(link = "logit"))

# ── Helper: Extract OR/CI from glm for forest plotting ─────────────────
extract_or <- function(model, comparison, outcome_label) {
  broom::tidy(model, conf.int = TRUE, exponentiate = TRUE) %>%
    filter(term == "treatment_group") %>%
    transmute(
      group = comparison,
      outcome = outcome_label,
      OR = estimate,
      CI_lower = conf.low,
      CI_upper = conf.high,
      p_value = p.value
    )
}

# ── Collect ORs from all 6 models ─────────────────────────────────────
results_list <- list(
  extract_or(logistic_model_DEM_DICLO_NON,      "Diclofenac vs No NSAID", "Cognitive Status: demented"),
  extract_or(logistic_model_DEM_NAP_NON,        "Naproxen vs No NSAID",   "Cognitive Status: demented"),
  extract_or(logistic_model_DEM_ETO_NON,        "Etodolac vs No NSAID",   "Cognitive Status: demented"),
  extract_or(logistic_model_NACCALZD_DICLO_NON, "Diclofenac vs No NSAID", "Alzheimer’s diagnosis"),
  extract_or(logistic_model_NACCALZD_NAP_NON,   "Naproxen vs No NSAID",   "Alzheimer’s diagnosis"),
  extract_or(logistic_model_NACCALZD_ETO_NON,   "Etodolac vs No NSAID",   "Alzheimer’s diagnosis")
)

logit_summary <- bind_rows(results_list)

# Add significance stars, factor order, and label text
logit_summary <- logit_summary %>%
  mutate(
    stars = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01  ~ "**",
      p_value < 0.05  ~ "*",
      p_value < 0.1   ~ ".",
      TRUE ~ ""
    ),
    outcome = factor(outcome, levels = c("Cognitive Status: demented", "Alzheimer’s diagnosis")),
    group   = factor(group,   levels = c("Diclofenac vs No NSAID", "Naproxen vs No NSAID", "Etodolac vs No NSAID")),
    label_text = paste0("OR = ", sprintf("%.2f", OR), "\n[", sprintf("%.2f", CI_lower), ", ", sprintf("%.2f", CI_upper), "] ", stars)
  )

# ── Forest Plot helper ─────────────────────────────────────────────────
make_forest_plot <- function(df, title_text) {
  ggplot(df, aes(x = OR, y = outcome)) +
    geom_point(size = 2) +
    geom_errorbarh(aes(xmin = CI_lower, xmax = CI_upper), height = 0.1) +
    geom_vline(xintercept = 1, linetype = "dashed") +
    scale_x_log10(limits = c(0.4, 1.5)) +
    geom_text(aes(label = stars), hjust = -0.4, vjust = -0.8, size = 4) +
    labs(
      x = "Odds Ratio (log scale)",
      y = NULL,
      title = title_text
    ) +
    theme_minimal(base_size = 20) +
    theme(
      plot.title  = element_text(size = 11, face = "bold", hjust = 0.5),
      axis.text.x = element_text(size = 11),
      axis.text.y = element_text(size = 18),
      axis.title.x = element_text(size = 11)
    )
}

# ── % Plot Function (Final version, larger fonts) ──────────────────────
plot_NSAID_DEM_NACCALZD_percent <- function(data, var, group_labels, title, treat_value, control_value) {
  temp_data <- data %>%
    mutate(treatment_group = case_when(
      NSAID_TYPE == treat_value ~ 1,
      NSAID_TYPE == control_value ~ 0,
      TRUE ~ NA_real_
    )) %>%
    filter(!is.na(treatment_group))

  test <- chisq.test(table(temp_data$treatment_group, temp_data[[var]]))
  pval <- round(test$p.value, 3)
  p_text <- ifelse(pval < 0.001, "p < 0.001", paste0("p = ", pval))

  summary_df <- temp_data %>%
    group_by(treatment_group) %>%
    summarise(
      n = n(),
      positive = sum(.data[[var]] == 1, na.rm = TRUE),
      percent = 100 * positive / n,
      se = sqrt((percent / 100) * (1 - percent / 100) / n) * 100,
      .groups = "drop"
    ) %>%
    mutate(treatment_group = factor(treatment_group, labels = group_labels))

  y_max <- 70

  ggplot(summary_df, aes(x = treatment_group, y = percent)) +
    geom_col(fill = "steelblue", width = 0.5) +
    geom_errorbar(aes(ymin = percent - se, ymax = percent + se), width = 0.2) +
    labs(
      title = title,
      subtitle = p_text,
      y = "Percent (%)",
      x = NULL
    ) +
    scale_y_continuous(limits = c(0, y_max), expand = expansion(mult = c(0, 0.05))) +
    theme_minimal(base_size = 20) +
    theme(
      plot.title    = element_text(size = 18, face = "bold", hjust = 0.5),
      plot.subtitle = element_text(size = 18, hjust = 0.5),
      axis.text     = element_text(size = 18),
      axis.title    = element_text(size = 18)
    )
}

# ── Build 6 % Plots ───────────────────────────────────────────────────
p1 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_DICLO_NON, "DEMENTED", c("No NSAID", "Diclofenac"), "", "DICLOFENAC", "NONSAID")
p2 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_DICLO_NON, "NACCALZD", c("No NSAID", "Diclofenac"), "", "DICLOFENAC", "NONSAID")

p3 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_NAP_NON, "DEMENTED", c("No NSAID", "Naproxen"), "", "NAPROXEN", "NONSAID")
p4 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_NAP_NON, "NACCALZD", c("No NSAID", "Naproxen"), "", "NAPROXEN", "NONSAID")

p5 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_ETO_NON, "DEMENTED", c("No NSAID", "Etodolac"), "", "ETODOLAC", "NONSAID")
p6 <- plot_NSAID_DEM_NACCALZD_percent(matched_data_ETO_NON, "NACCALZD", c("No NSAID", "Etodolac"), "", "ETODOLAC", "NONSAID")

# ── Forest Panels by Comparison ───────────────────────────────────────
p7 <- make_forest_plot(filter(logit_summary, group == "Diclofenac vs No NSAID"), "Diclofenac vs No NSAID")
p8 <- make_forest_plot(filter(logit_summary, group == "Naproxen vs No NSAID"),   "Naproxen vs No NSAID")
p9 <- make_forest_plot(filter(logit_summary, group == "Etodolac vs No NSAID"),   "Etodolac vs No NSAID")

# ── Final Composite Layout ────────────────────────────────────────────
final_plot <- (p1 | p2 | p7) /
              (p3 | p4 | p8) /
              (p5 | p6 | p9) +
  plot_layout(guides = "collect") +
  plot_annotation(title = "", theme = theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5)))

# Show in console
final_plot

# Save to file
ggsave("Figure1.pdf", final_plot, width = 14, height = 12)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## for 'Alzheimer’s diagnosis' in 'mbcsToSbcs': ' substituted for ’ (U+2019)

── MoCA Score per NSAID Use

# ── MoCA Score per NSAID Use ──────────────────────────────────────────
# Boxplots of MOCA by treatment (overall, among DEMENTED==1, and among NACCALZD==1)
# with Wilcoxon rank-sum p-values in subtitles.


# ── Helper: Boxplot with Wilcoxon test ─────────────────────────────────
# Inputs:
#   data        : matched dataset for a specific comparison (has treatment_group 0/1)
#   treat_label : c("Control label","Treatment label") used to relabel 0/1
#   title_text  : plot title
plot_moca_by_treatment <- function(data, treat_label, title_text) {
  data <- data %>%
    filter(!is.na(NACCMOCA)) %>%  # drop missing MOCA
    mutate(
      treatment_group = factor(
        treatment_group,
        levels = c(0, 1),
        labels = treat_label
      )
    )

  # Wilcoxon rank-sum test comparing MOCA across the two groups
  pval <- wilcox.test(NACCMOCA ~ treatment_group, data = data)$p.value
  p_text <- ifelse(pval < 0.001, "p < 0.001", paste0("p = ", signif(pval, 3)))

  # Boxplot (0–30 MOCA scale), minimal theme
  ggplot(data, aes(x = treatment_group, y = NACCMOCA)) +
    geom_boxplot(outlier.shape = NA, fill = "steelblue", color = "black") +
    theme_minimal(base_size = 20) +
    coord_cartesian(ylim = c(0, 30)) +
    labs(
      title = title_text,
      subtitle = p_text,
      x = "",
      y = "MOCA Score"
    ) +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5, size = 16)
    )
}

# ── Overall Boxplots (All Participants) ─────────────────────────────────
p1  <- plot_moca_by_treatment(matched_data_DICLO_NON, c("No NSAID", "Diclofenac"), "")
p2  <- plot_moca_by_treatment(matched_data_NAP_NON,   c("No NSAID", "Naproxen"),   "")
p3 <- plot_moca_by_treatment(matched_data_ETO_NON,   c("No NSAID", "Etodolac"),   "")

# ── Boxplots Among DEMENTED == 1 ───────────────────────────────────────
p4  <- plot_moca_by_treatment(
  matched_data_DICLO_NON %>% filter(DEMENTED == 1),
  c("No NSAID", "Diclofenac"), ""
)
p5  <- plot_moca_by_treatment(
  matched_data_NAP_NON %>% filter(DEMENTED == 1),
  c("No NSAID", "Naproxen"), ""
)
p6 <- plot_moca_by_treatment(
  matched_data_ETO_NON %>% filter(DEMENTED == 1),
  c("No NSAID", "Etodolac"), ""
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
# ── Boxplots Among NACCALZD == 1 ───────────────────────────────────────
p7  <- plot_moca_by_treatment(
  matched_data_DICLO_NON %>% filter(NACCALZD == 1),
  c("No NSAID", "Diclofenac"), ""
)
p8  <- plot_moca_by_treatment(
  matched_data_NAP_NON %>% filter(NACCALZD == 1),
  c("No NSAID", "Naproxen"), ""
)
p9 <- plot_moca_by_treatment(
  matched_data_ETO_NON %>% filter(NACCALZD == 1),
  c("No NSAID", "Etodolac"), ""
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
# ── Save 3×3 Grid to PDF ──────────────────────────────────────────────
# Layout (left→right, top→bottom): DICLO overall / DEMENTED / AD; then NAP; then ETO.
pdf("Figure2.pdf", width = 12, height = 10)
gridExtra::grid.arrange(p1, p4, p7,
                        p2, p5, p8,
                        p3, p6, p9,
                        ncol = 3)
dev.off()
## quartz_off_screen 
##                 2

── MOCA Trajectory for NSAID

# ── MOCA Trajectory for NSAID ─────────────────────────────────────────
# This section fits a linear mixed-effects model of MOCA score over time
# by NSAID group, extracts slopes with CIs, and visualizes trajectories.

# ── NSAID Group Definition ────────────────────────────────────────────
data_moca <- data %>%
  mutate(NSAID_TYPE = case_when(
    DICLOFENAC == 1 & NAPROXEN == 0 & ETODOLAC == 0 ~ "DICLOFENAC",
    NAPROXEN == 1   & DICLOFENAC == 0 & ETODOLAC == 0 ~ "NAPROXEN",
    ETODOLAC == 1   & DICLOFENAC == 0 & NAPROXEN == 0 ~ "ETODOLAC",
    DICLOFENAC + NAPROXEN + ETODOLAC > 1 ~ "MULTIPLE",
    DICLOFENAC == 0 & NAPROXEN == 0 & ETODOLAC == 0 ~ "NONSAID",
    TRUE ~ NA_character_
  )) %>%
  filter(NSAID_TYPE %in% c("NONSAID", "DICLOFENAC", "NAPROXEN", "ETODOLAC", "MULTIPLE")) %>%
  group_by(NACCID) %>%
  mutate(time_since_baseline = NACCFDYS - min(NACCFDYS)) %>%
  ungroup() %>%
  mutate(year = time_since_baseline / 365)

# ── Data Checks ───────────────────────────────────────────────────────
naccid_with_moca <- unique(data_moca$NACCID[!is.na(data_moca$NACCMOCA)])   
length(naccid_with_moca)                                                   
## [1] 20176
# Set out-of-range MOCA values to missing
data_moca$NACCMOCA[ data_moca$NACCMOCA < 0 | data_moca$NACCMOCA > 30 ] <- NA

# Distribution of visits per participant
visits_per_subject <- data_moca %>%                                       
  count(NACCID, name = "num_visits")                                      
visit_distribution_table <- visits_per_subject %>%                        
  count(num_visits, name = "num_subjects") %>%                           
  arrange(num_visits)                                                    

visit_distribution_table                                                   
## # A tibble: 18 × 2
##    num_visits num_subjects
##         <int>        <int>
##  1          1        14784
##  2          2         7894
##  3          3         6051
##  4          4         4771
##  5          5         3332
##  6          6         2524
##  7          7         1916
##  8          8         1415
##  9          9         1114
## 10         10          905
## 11         11          757
## 12         12          537
## 13         13          459
## 14         14          306
## 15         15          219
## 16         16          131
## 17         17           47
## 18         18            3
# ── Factor Levels and Scaling ─────────────────────────────────────────
data_moca$NSAID_TYPE <- factor(data_moca$NSAID_TYPE,
  levels = c("NONSAID", "DICLOFENAC", "NAPROXEN", "ETODOLAC", "MULTIPLE"))

data_moca <- data_moca %>%
  mutate(NACCFDYS_scaled = scale(NACCFDYS, center = TRUE, scale = TRUE))

# Save scale info for back-transform
sd_days  <- attr(data_moca$NACCFDYS_scaled, "scaled:scale")
mean_days <- attr(data_moca$NACCFDYS_scaled, "scaled:center")

# ── Mixed Model ───────────────────────────────────────────────────────
lmm <- lmer(NACCMOCA ~ NACCFDYS_scaled * NSAID_TYPE + (1 + NACCFDYS_scaled | NACCID),
            data = data_moca)

# ── Extract Fixed Effects & Variance-Covariance ───────────────────────
coef <- fixef(lmm)
vcov_mat <- vcov(lmm)

# ── Helper: Compute Back-Transformed Slopes + CIs ─────────────────────
get_slope_ci <- function(base, int = 0, var_base, var_int = 0, covar = 0) {
  est_scaled <- base + int
  se_scaled <- sqrt(var_base + var_int + 2 * covar)
  est_day <- est_scaled / sd_days
  se_day <- se_scaled / sd_days
  ci_day <- est_day + c(-1.96, 1.96) * se_day
  est_year <- est_day * 365
  se_year <- se_day * 365
  ci_year <- est_year + c(-1.96, 1.96) * se_year
  data.frame(
    estimate_per_day = est_day,
    ci_low_day = ci_day[1],
    ci_high_day = ci_day[2],
    estimate_per_year = est_year,
    ci_low_year = ci_year[1],
    ci_high_year = ci_year[2]
  )
}

# ── Slopes for Each Group ─────────────────────────────────────────────
res_nonsaid <- get_slope_ci(
  base = coef["NACCFDYS_scaled"],
  var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"])

res_diclo <- get_slope_ci(
  base = coef["NACCFDYS_scaled"],
  int = coef["NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"],
  var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
  var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEDICLOFENAC", "NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"],
  covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEDICLOFENAC"])

res_naprox <- get_slope_ci(
  base = coef["NACCFDYS_scaled"],
  int = coef["NACCFDYS_scaled:NSAID_TYPENAPROXEN"],
  var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
  var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPENAPROXEN", "NACCFDYS_scaled:NSAID_TYPENAPROXEN"],
  covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPENAPROXEN"])

res_etodolac <- get_slope_ci(
  base = coef["NACCFDYS_scaled"],
  int = coef["NACCFDYS_scaled:NSAID_TYPEETODOLAC"],
  var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
  var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEETODOLAC", "NACCFDYS_scaled:NSAID_TYPEETODOLAC"],
  covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEETODOLAC"])

res_multiple <- get_slope_ci(
  base = coef["NACCFDYS_scaled"],
  int = coef["NACCFDYS_scaled:NSAID_TYPEMULTIPLE"],
  var_base = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled"],
  var_int = vcov_mat["NACCFDYS_scaled:NSAID_TYPEMULTIPLE", "NACCFDYS_scaled:NSAID_TYPEMULTIPLE"],
  covar = vcov_mat["NACCFDYS_scaled", "NACCFDYS_scaled:NSAID_TYPEMULTIPLE"])

# ── Combine Slopes ────────────────────────────────────────────────────
slope_results <- bind_rows(
  NONSAID = res_nonsaid,
  DICLOFENAC = res_diclo,
  NAPROXEN = res_naprox,
  ETODOLAC = res_etodolac,
  MULTIPLE = res_multiple,
  .id = "NSAID_TYPE")


# ── P-values & Stars ──────────────────────────────────────────────────
slope_pvals <- broom.mixed::tidy(lmm, effects = "fixed") %>%
  filter(str_detect(term, "NACCFDYS_scaled")) %>%
  mutate(
    NSAID_TYPE = dplyr::recode(term,
      "NACCFDYS_scaled" = "NONSAID",
      "NACCFDYS_scaled:NSAID_TYPEDICLOFENAC" = "DICLOFENAC",
      "NACCFDYS_scaled:NSAID_TYPENAPROXEN" = "NAPROXEN",
      "NACCFDYS_scaled:NSAID_TYPEETODOLAC" = "ETODOLAC",
      "NACCFDYS_scaled:NSAID_TYPEMULTIPLE" = "MULTIPLE"),
    stars = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ "",
      TRUE ~ ""
    )
  ) %>%
  dplyr::select(NSAID_TYPE, p.value, stars)  

slope_annotate <- slope_results %>%
  left_join(slope_pvals, by = "NSAID_TYPE") %>%
  mutate(label = paste0(round(estimate_per_year, 2), " /y ", stars))       

# ── Predictions for Plotting ──────────────────────────────────────────
time_years <- seq(0, 15, length.out = 20)
time_days  <- time_years * 365
time_scaled <- (time_days - mean_days) / sd_days

newdata <- expand.grid(
  NACCFDYS_scaled = time_scaled,
  NSAID_TYPE = levels(data_moca$NSAID_TYPE))

Xmat <- model.matrix(~ NACCFDYS_scaled * NSAID_TYPE, newdata)
newdata$predicted <- as.vector(Xmat %*% coef)
se <- sqrt(diag(Xmat %*% vcov_mat %*% t(Xmat)))

newdata$year   <- time_days / 365
newdata$ci_low <- newdata$predicted + (-1.96) * se
newdata$ci_high <- newdata$predicted + ( 1.96) * se

# ── Legend Labels ─────────────────────────────────────────────────────
display_names <- c("No NSAID", "Diclofenac", "Naproxen", "Etodolac", "Multiple NSAIDs")
legend_labels <- slope_annotate %>%
  mutate(
    formatted = paste0(
      str_pad(display_names, width = 18, side = "right"),
      str_pad(paste0(round(estimate_per_year, 2), " /y ", stars), width = 10, side = "left"))
  ) %>%
  pull(formatted)

# ── Plot Trajectories ─────────────────────────────────────────────────
p <- ggplot(newdata, aes(x = year, y = predicted, color = NSAID_TYPE)) +
  geom_line(size = 1.2) +
  geom_ribbon(aes(ymin = ci_low, ymax = ci_high, fill = NSAID_TYPE),
              alpha = 0.2, color = NA) +
  labs(
    title = "",
    x = "Years Since First Visit",
    y = "Predicted MOCA Score",
    color = "NSAID Type",
    fill  = "NSAID Type"
  ) + 
  ylim(15, 30) +
  scale_color_brewer(
    palette = "Set1",
    name = "Group (slope per year)",
    labels = legend_labels) +
  scale_fill_brewer(
    palette = "Set1",
    name = "Group (slope per year)",
    labels = legend_labels) +  
  theme_classic(base_size = 14)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ── Save Plot ─────────────────────────────────────────────────────────
ggsave("Fig3.pdf", plot = p, width = 10, height = 6)