# 設定全局 Chunk 選項:隱藏警告與訊息,保持報告乾淨
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 10, fig.height = 6)

# === 載入全域必要套件 ===
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.5.2
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(flextable)
## Warning: package 'flextable' was built under R version 4.5.2
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:gtsummary':
## 
##     continuous_summary
## The following objects are masked from 'package:ggpubr':
## 
##     border, font, rotate
library(officer)
## Warning: package 'officer' was built under R version 4.5.2
library(ggplot2)
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)         # 💡 新增這行!
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.2
library(forcats)
## Warning: package 'forcats' was built under R version 4.5.2
# === 全域參數設定 ===
stages <- c("1A","1B","2A","2B","3A","3B","3C")
censor_years <- 5

一、資料前處理與世代建立 (Data Curation)

此區塊進行日期轉換、病歷 ID 去重複(依據側性、診斷時間等規則)、篩選 HER2 陽性族群,並動態定義最終期別(有做 NAC 看臨床分期,未就做看病理分期)。

二、存活資料設定 (Survival Setup)

精準處理 RFS 與 OS 事件,並將存活時間強制截尾於 5 年(60個月),以利後續 KM 與 Cox 分析。保留 1332 人母體,未曾 Disease-free 者(Recur_Type=70)於 RFS 設為 NA。

## 📄 原始資料筆數: 10319
## ✅ ID 數量分類:
##   一筆: 9226 
##   二筆: 539 
##   三筆以上: 5
## ✅ 整合完成!最終資料筆數: 10304
## 👉 Step 1 (HER2+ 總人數): 1757
## 👉 Step 2 (有開刀): 1652
## 👉 Step 3 (未做 NAC): 1333
## 👉 Step 6 (pT1N0M0 極早期小腫瘤總數): 366
## 👉 Step 7 (全體 HER2+ 且有明確期別的分析人數): 1332
## 🎉 資料清理完成!已產出 'final_her2_data.csv'。

三、患者基線特徵 (Table 1)

依照 4 種治療組合 (Group A~D) 呈現人口學與臨床病理特徵。

# ==========================================
# 治療設定與製表 (Table 1) - ✅ 1332人完美版
# ==========================================

library(dplyr)
library(gtsummary)
library(flextable)
library(officer)

# === 1. 讀取 1,332 人的完整資料 ===
# 請確保 final_her2_data.csv 已經是 1332 筆的版本
df_all <- read.csv("final_her2_data.csv")

# === 2. 變數轉換與整理 ===
df_analysis <- df_all %>%
  mutate(
    # --- 治療分組 ---
    group = case_when(
      CT == 0 & TT == 0 ~ "Group A",
      CT == 0 & TT == 1 ~ "Group B",
      CT == 1 & TT == 0 ~ "Group C",
      CT == 1 & TT == 1 ~ "Group D",
      TRUE ~ NA_character_
    ),
    group = factor(group, levels = c("Group A", "Group B", "Group C", "Group D")),
    
    # --- 臨床特徵轉換 ---
    age_group = ifelse(Age < 50, "< 50 y/o", "≥ 50 y/o"),
    hr_status = case_when(
      grepl("HR\\+", ihc) ~ "HR positive",
      grepl("HR\\-", ihc) ~ "HR negative",
      TRUE ~ NA_character_
    ),
    hr_status = factor(hr_status, levels = c("HR negative", "HR positive")),
    grade_f = factor(Grade, levels = c(1, 2, 3), labels = c("I", "II", "III")),
    histology = case_when(
      Hist == 8500 ~ "IDC",
      Hist == 8520 ~ "ILC",
      TRUE ~ "Others"
    ),
    histology = factor(histology, levels = c("IDC", "ILC", "Others")),
    endocrine = factor(ifelse(HT == 1, "Yes", "No"), levels = c("No", "Yes")),
    
    # --- 放射線治療 (修復 100% bug,使用正確欄位) ---
    rt_done = ifelse(!is.na(X4.2.1.1.RT.Target.Summary) & as.numeric(X4.2.1.1.RT.Target.Summary) > 0, "Done", "None"),
    rt_done = factor(rt_done, levels = c("None", "Done")),
    
    # --- 追蹤月數 (使用原始未截尾的 surv_year_os,呈現真實追蹤時間) ---
    follow_up_months = surv_year_os * 12 
  )

# === 3. 使用 gtsummary 製作 Table 1 ===
tbl1 <- df_analysis %>%
  select(
    # 這裡放進去的變數,都會出現在 Table 1 裡面!
    # final_stage 就是你要的 patient counts by stage
    group, Age, age_group, hr_status, grade_f, histology,
    endocrine, rt_done, follow_up_months, final_stage
  ) %>%
  tbl_summary(
    by = group,
    type = list(
      Age ~ "continuous2",
      follow_up_months ~ "continuous2"
    ),
    statistic = list(
      all_continuous2() ~ "{mean} ± {sd}; {median} [{min}, {max}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no", 
    label = list(
      Age ~ "Age",
      age_group ~ "Age group",
      hr_status ~ "Hormone receptor status",
      grade_f ~ "Tumor grade",
      histology ~ "Histology",
      endocrine ~ "Endocrine therapy",
      rt_done ~ "Radiotherapy",
      follow_up_months ~ "Follow-up (months)",
      final_stage ~ "Final stage" # <--- 這裡會顯示各期別與各治療組的交叉人數
    )
  ) %>%
  add_overall(last = TRUE) %>%                   
  add_p(test = everything() ~ "chisq.test") %>%  
  modify_header(
    label = "**Characteristics**",
    # gtsummary 會自動抓取 1332 人並計算各組 N 數
    all_stat_cols() ~ "**{level}**\n(n={n})" 
  ) %>%
  bold_labels()

# === 4. 輸出結果 ===
# 在 R Markdown 中顯示表格
tbl1
Characteristics Group A (n=240)1 Group B (n=30)1 Group C (n=70)1 Group D (n=992)1 Overall (n=1332)1 p-value2
Age




<0.001
    Mean ± SD; Median [Min, Max] 58 ± 12; 58 [32, 89] 64 ± 16; 68 [32, 86] 55 ± 12; 57 [30, 79] 54 ± 11; 54 [20, 87] 55 ± 12; 55 [20, 89]
Age group




0.008
    < 50 y/o 57 (24%) 7 (23%) 22 (31%) 344 (35%) 430 (32%)
    ≥ 50 y/o 183 (76%) 23 (77%) 48 (69%) 648 (65%) 902 (68%)
Hormone receptor status




<0.001
    HR negative 148 (62%) 9 (30%) 23 (33%) 428 (43%) 608 (46%)
    HR positive 92 (38%) 21 (70%) 47 (67%) 564 (57%) 724 (54%)
Tumor grade




<0.001
    I 7 (5.5%) 0 (0%) 0 (0%) 4 (0.7%) 11 (1.5%)
    II 76 (60%) 7 (44%) 27 (53%) 270 (49%) 380 (51%)
    III 44 (35%) 9 (56%) 24 (47%) 276 (50%) 353 (47%)
Histology




0.070
    IDC 224 (93%) 27 (90%) 67 (96%) 946 (95%) 1,264 (95%)
    ILC 2 (0.8%) 2 (6.7%) 0 (0%) 12 (1.2%) 16 (1.2%)
    Others 14 (5.8%) 1 (3.3%) 3 (4.3%) 34 (3.4%) 52 (3.9%)
Endocrine therapy 89 (37%) 21 (70%) 43 (61%) 536 (54%) 689 (52%) <0.001
Radiotherapy




<0.001
    None 168 (70%) 22 (73%) 49 (70%) 489 (49%) 728 (55%)
    Done 72 (30%) 8 (27%) 21 (30%) 503 (51%) 604 (45%)
Follow-up (months)




0.10
    Mean ± SD; Median [Min, Max] 66 ± 41; 60 [0, 155] 65 ± 39; 58 [9, 147] 77 ± 42; 78 [5, 156] 75 ± 39; 71 [0, 163] 73 ± 40; 67 [0, 163]
Final stage




<0.001
    1A 191 (80%) 12 (40%) 30 (43%) 251 (25%) 484 (36%)
    1B 0 (0%) 0 (0%) 0 (0%) 7 (0.7%) 7 (0.5%)
    2A 35 (15%) 5 (17%) 28 (40%) 287 (29%) 355 (27%)
    2B 6 (2.5%) 6 (20%) 3 (4.3%) 175 (18%) 190 (14%)
    3A 2 (0.8%) 2 (6.7%) 4 (5.7%) 130 (13%) 138 (10%)
    3B 4 (1.7%) 0 (0%) 0 (0%) 17 (1.7%) 21 (1.6%)
    3C 2 (0.8%) 5 (17%) 5 (7.1%) 125 (13%) 137 (10%)
1 n (%)
2 Pearson’s Chi-squared test
# 匯出成 Word 檔
tbl1 %>%
  as_flex_table() %>%
  autofit() %>%
  save_as_docx(path = "Table1_Clinicopathologic_Characteristics.docx")

cat("✅ Table 1 已成功產出並存為 Table1_Clinicopathologic_Characteristics.docx")
## ✅ Table 1 已成功產出並存為 Table1_Clinicopathologic_Characteristics.docx

四、5年存活率分析 (KM Plots)

# ==========================================
# 存活分析:KM Plot (1A~3C全期別展示) + Risk Table + 檢定結果
# ==========================================

library(survival)
library(survminer)
library(dplyr)
library(gridExtra)
library(ggpubr)       # 💡 新增:用來畫底下的統計檢定表格
library(broom)

# 確保資料格式與 Reference 組別設定正確
df_analysis <- df_analysis %>%
  filter(!is.na(group)) %>%  
  mutate(
    group = relevel(factor(group), ref = "Group A"),
    final_stage = relevel(factor(final_stage), ref = "1A")
  )

# 自訂經典四色 (Group A:紅, B:藍, C:綠, D:紫)
km_colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3")

for (stg in stages) {
  
  df_stage <- df_analysis %>% filter(final_stage == stg)
  
  if (nrow(df_stage) > 0 && n_distinct(df_stage$group) >= 2) {
    
    # ==========================================
    # [RFS] 無復發存活率
    # ==========================================
    fit_rfs <- survfit(Surv(surv_year_rfs_5y, relapse_5y) ~ group, data = df_stage)
    p_rfs <- ggsurvplot(
      fit_rfs, data = df_stage,
      pval = TRUE,                      
      risk.table = TRUE,                
      xlim = c(0, 5),                   
      break.time.by = 1,
      palette = km_colors,
      title = paste0("5-Year RFS - Stage ", stg),
      xlab = "Time (Years)", ylab = "RFS Probability",
      legend.title = "Treatment",
      ggtheme = theme_minimal(base_size = 14),
      risk.table.height = 0.25,          
      risk.table.y.text = TRUE,         # ✅ 恢復文字標註 (Group A, B, C, D)
      risk.table.y.text.col = TRUE,     # ✅ 讓文字保持對應顏色,清楚又好看
      fontsize = 4.5,                   
      tables.theme = theme_cleantable() 
    )

    # RFS 底下的檢定表格 (Cox HR vs Group A)
    cox_rfs <- tryCatch(coxph(Surv(surv_year_rfs_5y, relapse_5y) ~ group, data = df_stage), error = function(e) NULL)
    if (!is.null(cox_rfs)) {
      cox_tbl_rfs <- tidy(cox_rfs, exponentiate = TRUE, conf.int = TRUE) %>%
        mutate(
          Treatment = sub("group", "", term),
          `HR (95% CI)` = sprintf("%.2f (%.2f - %.2f)", estimate, conf.low, conf.high),
          `P-value` = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
        ) %>% select(Treatment, `HR (95% CI)`, `P-value`)
      p_rfs_table <- ggtexttable(cox_tbl_rfs, rows = NULL, theme = ttheme("minimal", base_size = 12))
    } else {
      p_rfs_table <- ggparagraph(text = "Not estimable", size = 12)
    }

    # ==========================================
    # [OS] 整體存活率
    # ==========================================
    fit_os <- survfit(Surv(surv_year_os_5y, Vita_m_5y) ~ group, data = df_stage)
    p_os <- ggsurvplot(
      fit_os, data = df_stage,
      pval = TRUE,                      
      risk.table = TRUE,
      xlim = c(0, 5),
      break.time.by = 1,
      palette = km_colors,
      title = paste0("5-Year OS - Stage ", stg),
      xlab = "Time (Years)", ylab = "OS Probability",
      legend.title = "Treatment",
      ggtheme = theme_minimal(base_size = 14),
      risk.table.height = 0.25,
      risk.table.y.text = TRUE,         # ✅ 恢復文字標註
      risk.table.y.text.col = TRUE,     
      fontsize = 4.5,
      tables.theme = theme_cleantable()
    )

    # OS 底下的檢定表格 (Cox HR vs Group A)
    cox_os <- tryCatch(coxph(Surv(surv_year_os_5y, Vita_m_5y) ~ group, data = df_stage), error = function(e) NULL)
    if (!is.null(cox_os)) {
      cox_tbl_os <- tidy(cox_os, exponentiate = TRUE, conf.int = TRUE) %>%
        mutate(
          Treatment = sub("group", "", term),
          `HR (95% CI)` = sprintf("%.2f (%.2f - %.2f)", estimate, conf.low, conf.high),
          `P-value` = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
        ) %>% select(Treatment, `HR (95% CI)`, `P-value`)
      p_os_table <- ggtexttable(cox_tbl_os, rows = NULL, theme = ttheme("minimal", base_size = 12))
    } else {
      p_os_table <- ggparagraph(text = "Not estimable", size = 12)
    }

    # ==========================================
    # 組合輸出 (KM 圖 + Risk Table + Cox 檢定表)
    # ==========================================
    grid.newpage() 
    grid.arrange(
      p_rfs$plot, p_os$plot, 
      p_rfs$table, p_os$table, 
      p_rfs_table, p_os_table,
      ncol = 2, 
      heights = c(4, 1.2, 1) # 高度比例:KM 圖最大,Risk Table 其次,檢定表在最下面
    )
  }
}

五、Cox 比例風險模型 (Cox Regression)

評估治療組別、年齡、HR狀態、腫瘤分化與期別對 RFS 與 OS 的多變項影響。

library(survival)
library(dplyr)
library(broom)
library(flextable)
library(officer)

# === 定義生存物件 (改用我們整理好的、最乾淨完整的 df_analysis) ===
# OS (使用完整的追蹤年份與事件)
surv_os <- Surv(time = df_analysis$surv_year_os, event = df_analysis$Vita_m == 1)

# RFS (使用完整的追蹤年份與事件)
surv_rfs <- Surv(time = df_analysis$surv_year_rfs, event = df_analysis$rfs_event_raw == 1)

# === 要跑的變數 ===
vars <- c("group", "Age", "hr_status", "grade_f", "final_stage")

# --- function: 單因子 + 多因子 Cox ---
run_cox <- function(surv_obj, data, vars) {
  
  # 單因子 Cox
  uni_results <- lapply(vars, function(v) {
    f <- as.formula(paste("surv_obj ~", v))
    coxph(f, data = data)
  })
  
  uni_table <- lapply(uni_results, tidy, exponentiate = TRUE, conf.int = TRUE) %>%
    bind_rows(.id = "Variable") %>%
    mutate(Variable = vars[as.numeric(Variable)]) %>%
    select(Variable, term, estimate, conf.low, conf.high, p.value)
  
  # 多因子 Cox
  multi_model <- coxph(surv_obj ~ group + Age + hr_status + grade_f + final_stage,
                       data = data)
  
  multi_table <- tidy(multi_model, exponentiate = TRUE, conf.int = TRUE) %>%
    select(term, estimate, conf.low, conf.high, p.value)
  
  # 合併單因子與多因子
  cox_table <- uni_table %>%
    rename(Uni_HR = estimate, Uni_LCL = conf.low, Uni_UCL = conf.high, Uni_p = p.value) %>%
    left_join(
      multi_table %>%
        rename(Multi_HR = estimate, Multi_LCL = conf.low, Multi_UCL = conf.high, Multi_p = p.value),
      by = c("term")
    )
  
  # ✨ 智慧攔截:處理 Inf 或因為 0 事件導致的荒謬超大數值 (>100) → 轉為 NA
  cox_table <- cox_table %>%
    mutate(across(c(Uni_HR, Uni_LCL, Uni_UCL, Multi_HR, Multi_LCL, Multi_UCL),
                  ~ ifelse(. > 100 | is.infinite(.), NA, .)))
  
  return(cox_table)
}

# === 格式化成表格(乾淨 Level + 顯示 ref group) ===
format_cox <- function(cox_table, title) {
  
  cox_table_fmt <- cox_table %>%
    mutate(
      # 乾淨的 Level 名稱
      Level = case_when(
        grepl("group", term) ~ sub("group", "", term),
        grepl("hr_status", term) ~ sub("hr_status", "", term),
        grepl("grade_f", term) ~ sub("grade_f", "", term),
        grepl("final_stage", term) ~ sub("final_stage", "", term),
        term == "Age" ~ "Age (per year increase)",
        TRUE ~ term
      ),
      # 數值格式化 (遇到 NA 就顯示 Not estimable)
      Uni = ifelse(is.na(Uni_HR), "Not estimable",
                   sprintf("%.2f (%.2f–%.2f)", Uni_HR, Uni_LCL, Uni_UCL)),
      Multi = ifelse(is.na(Multi_HR), "Not estimable",
                     sprintf("%.2f (%.2f–%.2f)", Multi_HR, Multi_LCL, Multi_UCL)),
      Uni_p = ifelse(is.na(Uni_p), "", sprintf("%.3f", Uni_p)),
      Multi_p = ifelse(is.na(Multi_p), "", sprintf("%.3f", Multi_p))
    ) %>%
    select(Variable, Level, Uni, Uni_p, Multi, Multi_p)
  
  # 各變數的 reference group
  ref_rows <- tribble(
    ~Variable,      ~Level,                ~Uni, ~Uni_p, ~Multi, ~Multi_p,
    "group",        "Group A (ref)",       "",   "",     "",     "",
    "hr_status",    "HR negative (ref)",   "",   "",     "",     "",
    "grade_f",      "Grade I (ref)",       "",   "",     "",     "",
    "final_stage",  "Stage 1A (ref)",      "",   "",     "",     ""
  )
  
  # 插入 ref 並讓它排在各變數最前面
  cox_table_fmt <- bind_rows(cox_table_fmt, ref_rows) %>%
    mutate(Level = trimws(Level)) %>%
    arrange(factor(Variable, levels = c("group","Age","hr_status","grade_f","final_stage")),
            case_when(
              grepl("ref", Level) ~ 0,  # ref 永遠排最上面
              TRUE ~ 1
            ),
            Level)
  
  flextable(cox_table_fmt) %>%
    set_header_labels(
      Variable = "Variable",
      Level = "Level",
      Uni = "Univariate HR (95% CI)",
      Uni_p = "Univariate p",
      Multi = "Multivariate HR (95% CI)",
      Multi_p = "Multivariate p"
    ) %>%
    add_header_row(values = title, colwidths = 6) %>%
    autofit() %>%
    bold(i = ~ as.numeric(Uni_p) < 0.05 | as.numeric(Multi_p) < 0.05, bold = TRUE)
}

# === 跑 OS & RFS ===
cox_os <- run_cox(surv_os, df_analysis, vars)
cox_rfs <- run_cox(surv_rfs, df_analysis, vars)

# === 轉換成表格 (直接在 RMarkdown 顯示) ===
tbl_os <- format_cox(cox_os, "Table 3a. Cox Regression for Overall Survival (OS)")
tbl_rfs <- format_cox(cox_rfs, "Table 3b. Cox Regression for Recurrence-Free Survival (RFS)")

tbl_os

Table 3a. Cox Regression for Overall Survival (OS)

Variable

Level

Univariate HR (95% CI)

Univariate p

Multivariate HR (95% CI)

Multivariate p

group

Group A (ref)

group

Group B

2.33 (0.94–5.76)

0.068

0.96 (0.35–2.58)

0.928

group

Group C

1.29 (0.59–2.81)

0.524

0.87 (0.34–2.18)

0.760

group

Group D

0.59 (0.36–0.97)

0.039

0.38 (0.20–0.73)

0.003

Age

Age (per year increase)

1.07 (1.05–1.09)

0.000

1.05 (1.03–1.07)

0.000

hr_status

HR negative (ref)

hr_status

HR positive

0.81 (0.54–1.22)

0.320

0.73 (0.47–1.16)

0.181

grade_f

Grade I (ref)

grade_f

II

0.49 (0.12–2.02)

0.322

0.18 (0.04–0.85)

0.030

grade_f

III

0.53 (0.13–2.19)

0.382

0.12 (0.02–0.58)

0.009

final_stage

Stage 1A (ref)

final_stage

1B

0.00 (0.00–NA)

0.994

0.00 (0.00–NA)

0.995

final_stage

2A

2.53 (1.26–5.09)

0.009

3.49 (1.55–7.83)

0.003

final_stage

2B

3.75 (1.81–7.78)

0.000

7.41 (3.03–18.12)

0.000

final_stage

3A

3.72 (1.70–8.16)

0.001

7.71 (3.06–19.44)

0.000

final_stage

3B

9.27 (2.99–28.75)

0.000

7.66 (2.24–26.13)

0.001

final_stage

3C

7.23 (3.62–14.46)

0.000

9.59 (3.97–23.13)

0.000

tbl_rfs

Table 3b. Cox Regression for Recurrence-Free Survival (RFS)

Variable

Level

Univariate HR (95% CI)

Univariate p

Multivariate HR (95% CI)

Multivariate p

group

Group A (ref)

group

Group B

2.09 (0.85–5.14)

0.107

0.47 (0.12–1.74)

0.255

group

Group C

1.10 (0.51–2.37)

0.817

0.48 (0.18–1.24)

0.128

group

Group D

0.66 (0.41–1.06)

0.084

0.24 (0.12–0.48)

0.000

Age

Age (per year increase)

1.03 (1.01–1.04)

0.001

1.01 (0.99–1.03)

0.464

hr_status

HR negative (ref)

hr_status

HR positive

0.72 (0.50–1.04)

0.080

0.73 (0.46–1.16)

0.180

grade_f

Grade I (ref)

grade_f

II

Not estimable

0.996

Not estimable

0.995

grade_f

III

Not estimable

0.996

Not estimable

0.995

final_stage

Stage 1A (ref)

final_stage

1B

0.00 (0.00–NA)

0.994

0.00 (0.00–NA)

0.997

final_stage

2A

2.40 (1.33–4.33)

0.004

5.18 (2.31–11.63)

0.000

final_stage

2B

1.91 (0.93–3.93)

0.079

5.22 (1.93–14.16)

0.001

final_stage

3A

2.06 (0.94–4.50)

0.070

6.11 (2.24–16.69)

0.000

final_stage

3B

7.44 (2.75–20.18)

0.000

10.98 (3.23–37.25)

0.000

final_stage

3C

7.50 (4.21–13.36)

0.000

18.57 (7.71–44.74)

0.000

六、新輔助治療 (NAC) 時代演變趨勢

觀察 2007-2022 年間,不同期別與 HR 狀態下採用 NAC 的比例變化。

summary_nac <- df_analysis %>%
  mutate(year = year(DIAG_DT), NAC = as.integer(NAC_binary == 1)) %>%
  filter(!is.na(year), !is.na(hr_status)) %>%
  group_by(year, final_stage, hr_status) %>%
  summarise(total = n(), nac_rate = sum(NAC) / total, .groups = "drop") %>%
  filter(final_stage %in% c("1A", "2A", "2B", "3A", "3B", "3C"))

ggplot(summary_nac, aes(x = year, y = nac_rate, color = hr_status)) +
  geom_line(size = 1) + geom_point(size = 2) +
  facet_wrap(~ final_stage, ncol = 3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "NAC Adoption Rate Over Time by Stage and HR Status", x = "Year of Diagnosis", y = "NAC Rate", color = "HR Status") +
  theme_minimal(base_size = 14) + theme(legend.position = "bottom")