# 設定全局 Chunk 選項：隱藏警告與訊息，保持報告乾淨
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 10, fig.height = 6)

# === 載入全域必要套件 ===
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

## Warning: package 'stringr' was built under R version 4.5.2

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(survival)
library(survminer)

## Warning: package 'survminer' was built under R version 4.5.2

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Loading required package: ggpubr

## Warning: package 'ggpubr' was built under R version 4.5.2

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(gtsummary)

## Warning: package 'gtsummary' was built under R version 4.5.2

library(flextable)

## Warning: package 'flextable' was built under R version 4.5.2

## 
## Attaching package: 'flextable'

## The following object is masked from 'package:gtsummary':
## 
##     continuous_summary

## The following objects are masked from 'package:ggpubr':
## 
##     border, font, rotate

library(officer)

## Warning: package 'officer' was built under R version 4.5.2

library(ggplot2)
library(broom)

## Warning: package 'broom' was built under R version 4.5.2

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(grid)         # 💡 新增這行！
library(patchwork)

## Warning: package 'patchwork' was built under R version 4.5.2

library(forcats)

## Warning: package 'forcats' was built under R version 4.5.2

# === 全域參數設定 ===
stages <- c("1A","1B","2A","2B","3A","3B","3C")
censor_years <- 5

一、資料前處理與世代建立 (Data Curation)

此區塊進行日期轉換、病歷 ID 去重複（依據側性、診斷時間等規則）、篩選 HER2 陽性族群，並動態定義最終期別（有做 NAC 看臨床分期，未就做看病理分期）。

二、存活資料設定 (Survival Setup)

精準處理 RFS 與 OS 事件，並將存活時間強制截尾於 5 年（60個月），以利後續 KM 與 Cox 分析。保留 1332 人母體，未曾 Disease-free 者（Recur_Type=70）於 RFS 設為 NA。

## 📄 原始資料筆數： 10319

## ✅ ID 數量分類：
##   一筆： 9226 
##   二筆： 539 
##   三筆以上： 5

## ✅ 整合完成！最終資料筆數： 10304

## 👉 Step 1 (HER2+ 總人數): 1757

## 👉 Step 2 (有開刀): 1652

## 👉 Step 3 (未做 NAC): 1333

## 👉 Step 6 (pT1N0M0 極早期小腫瘤總數): 366

## 👉 Step 7 (全體 HER2+ 且有明確期別的分析人數): 1332

## 🎉 資料清理完成！已產出 'final_her2_data.csv'。

三、患者基線特徵 (Table 1)

依照 4 種治療組合 (Group A~D) 呈現人口學與臨床病理特徵。

# ==========================================
# 治療設定與製表 (Table 1) - ✅ 1332人完美版
# ==========================================

library(dplyr)
library(gtsummary)
library(flextable)
library(officer)

# === 1. 讀取 1,332 人的完整資料 ===
# 請確保 final_her2_data.csv 已經是 1332 筆的版本
df_all <- read.csv("final_her2_data.csv")

# === 2. 變數轉換與整理 ===
df_analysis <- df_all %>%
  mutate(
    # --- 治療分組 ---
    group = case_when(
      CT == 0 & TT == 0 ~ "Group A",
      CT == 0 & TT == 1 ~ "Group B",
      CT == 1 & TT == 0 ~ "Group C",
      CT == 1 & TT == 1 ~ "Group D",
      TRUE ~ NA_character_
    ),
    group = factor(group, levels = c("Group A", "Group B", "Group C", "Group D")),
    
    # --- 臨床特徵轉換 ---
    age_group = ifelse(Age < 50, "< 50 y/o", "≥ 50 y/o"),
    hr_status = case_when(
      grepl("HR\\+", ihc) ~ "HR positive",
      grepl("HR\\-", ihc) ~ "HR negative",
      TRUE ~ NA_character_
    ),
    hr_status = factor(hr_status, levels = c("HR negative", "HR positive")),
    grade_f = factor(Grade, levels = c(1, 2, 3), labels = c("I", "II", "III")),
    histology = case_when(
      Hist == 8500 ~ "IDC",
      Hist == 8520 ~ "ILC",
      TRUE ~ "Others"
    ),
    histology = factor(histology, levels = c("IDC", "ILC", "Others")),
    endocrine = factor(ifelse(HT == 1, "Yes", "No"), levels = c("No", "Yes")),
    
    # --- 放射線治療 (修復 100% bug，使用正確欄位) ---
    rt_done = ifelse(!is.na(X4.2.1.1.RT.Target.Summary) & as.numeric(X4.2.1.1.RT.Target.Summary) > 0, "Done", "None"),
    rt_done = factor(rt_done, levels = c("None", "Done")),
    
    # --- 追蹤月數 (使用原始未截尾的 surv_year_os，呈現真實追蹤時間) ---
    follow_up_months = surv_year_os * 12 
  )

# === 3. 使用 gtsummary 製作 Table 1 ===
tbl1 <- df_analysis %>%
  select(
    # 這裡放進去的變數，都會出現在 Table 1 裡面！
    # final_stage 就是你要的 patient counts by stage
    group, Age, age_group, hr_status, grade_f, histology,
    endocrine, rt_done, follow_up_months, final_stage
  ) %>%
  tbl_summary(
    by = group,
    type = list(
      Age ~ "continuous2",
      follow_up_months ~ "continuous2"
    ),
    statistic = list(
      all_continuous2() ~ "{mean} ± {sd}; {median} [{min}, {max}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no", 
    label = list(
      Age ~ "Age",
      age_group ~ "Age group",
      hr_status ~ "Hormone receptor status",
      grade_f ~ "Tumor grade",
      histology ~ "Histology",
      endocrine ~ "Endocrine therapy",
      rt_done ~ "Radiotherapy",
      follow_up_months ~ "Follow-up (months)",
      final_stage ~ "Final stage" # <--- 這裡會顯示各期別與各治療組的交叉人數
    )
  ) %>%
  add_overall(last = TRUE) %>%                   
  add_p(test = everything() ~ "chisq.test") %>%  
  modify_header(
    label = "**Characteristics**",
    # gtsummary 會自動抓取 1332 人並計算各組 N 數
    all_stat_cols() ~ "**{level}**\n(n={n})" 
  ) %>%
  bold_labels()

# === 4. 輸出結果 ===
# 在 R Markdown 中顯示表格
tbl1

Characteristics	Group A (n=240)¹	Group B (n=30)¹	Group C (n=70)¹	Group D (n=992)¹	Overall (n=1332)¹	p-value²
Age						<0.001
Mean ± SD; Median [Min, Max]	58 ± 12; 58 [32, 89]	64 ± 16; 68 [32, 86]	55 ± 12; 57 [30, 79]	54 ± 11; 54 [20, 87]	55 ± 12; 55 [20, 89]
Age group						0.008
< 50 y/o	57 (24%)	7 (23%)	22 (31%)	344 (35%)	430 (32%)
≥ 50 y/o	183 (76%)	23 (77%)	48 (69%)	648 (65%)	902 (68%)
Hormone receptor status						<0.001
HR negative	148 (62%)	9 (30%)	23 (33%)	428 (43%)	608 (46%)
HR positive	92 (38%)	21 (70%)	47 (67%)	564 (57%)	724 (54%)
Tumor grade						<0.001
I	7 (5.5%)	0 (0%)	0 (0%)	4 (0.7%)	11 (1.5%)
II	76 (60%)	7 (44%)	27 (53%)	270 (49%)	380 (51%)
III	44 (35%)	9 (56%)	24 (47%)	276 (50%)	353 (47%)
Histology						0.070
IDC	224 (93%)	27 (90%)	67 (96%)	946 (95%)	1,264 (95%)
ILC	2 (0.8%)	2 (6.7%)	0 (0%)	12 (1.2%)	16 (1.2%)
Others	14 (5.8%)	1 (3.3%)	3 (4.3%)	34 (3.4%)	52 (3.9%)
Endocrine therapy	89 (37%)	21 (70%)	43 (61%)	536 (54%)	689 (52%)	<0.001
Radiotherapy						<0.001
None	168 (70%)	22 (73%)	49 (70%)	489 (49%)	728 (55%)
Done	72 (30%)	8 (27%)	21 (30%)	503 (51%)	604 (45%)
Follow-up (months)						0.10
Mean ± SD; Median [Min, Max]	66 ± 41; 60 [0, 155]	65 ± 39; 58 [9, 147]	77 ± 42; 78 [5, 156]	75 ± 39; 71 [0, 163]	73 ± 40; 67 [0, 163]
Final stage						<0.001
1A	191 (80%)	12 (40%)	30 (43%)	251 (25%)	484 (36%)
1B	0 (0%)	0 (0%)	0 (0%)	7 (0.7%)	7 (0.5%)
2A	35 (15%)	5 (17%)	28 (40%)	287 (29%)	355 (27%)
2B	6 (2.5%)	6 (20%)	3 (4.3%)	175 (18%)	190 (14%)
3A	2 (0.8%)	2 (6.7%)	4 (5.7%)	130 (13%)	138 (10%)
3B	4 (1.7%)	0 (0%)	0 (0%)	17 (1.7%)	21 (1.6%)
3C	2 (0.8%)	5 (17%)	5 (7.1%)	125 (13%)	137 (10%)
¹ n (%)
² Pearson’s Chi-squared test

# 匯出成 Word 檔
tbl1 %>%
  as_flex_table() %>%
  autofit() %>%
  save_as_docx(path = "Table1_Clinicopathologic_Characteristics.docx")

cat("✅ Table 1 已成功產出並存為 Table1_Clinicopathologic_Characteristics.docx")

## ✅ Table 1 已成功產出並存為 Table1_Clinicopathologic_Characteristics.docx

四、5年存活率分析 (KM Plots)

# ==========================================
# 存活分析：KM Plot (1A~3C全期別展示) + Risk Table + 檢定結果
# ==========================================

library(survival)
library(survminer)
library(dplyr)
library(gridExtra)
library(ggpubr)       # 💡 新增：用來畫底下的統計檢定表格
library(broom)

# 確保資料格式與 Reference 組別設定正確
df_analysis <- df_analysis %>%
  filter(!is.na(group)) %>%  
  mutate(
    group = relevel(factor(group), ref = "Group A"),
    final_stage = relevel(factor(final_stage), ref = "1A")
  )

# 自訂經典四色 (Group A:紅, B:藍, C:綠, D:紫)
km_colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3")

for (stg in stages) {
  
  df_stage <- df_analysis %>% filter(final_stage == stg)
  
  if (nrow(df_stage) > 0 && n_distinct(df_stage$group) >= 2) {
    
    # ==========================================
    # [RFS] 無復發存活率
    # ==========================================
    fit_rfs <- survfit(Surv(surv_year_rfs_5y, relapse_5y) ~ group, data = df_stage)
    p_rfs <- ggsurvplot(
      fit_rfs, data = df_stage,
      pval = TRUE,                      
      risk.table = TRUE,                
      xlim = c(0, 5),                   
      break.time.by = 1,
      palette = km_colors,
      title = paste0("5-Year RFS - Stage ", stg),
      xlab = "Time (Years)", ylab = "RFS Probability",
      legend.title = "Treatment",
      ggtheme = theme_minimal(base_size = 14),
      risk.table.height = 0.25,          
      risk.table.y.text = TRUE,         # ✅ 恢復文字標註 (Group A, B, C, D)
      risk.table.y.text.col = TRUE,     # ✅ 讓文字保持對應顏色，清楚又好看
      fontsize = 4.5,                   
      tables.theme = theme_cleantable() 
    )

    # RFS 底下的檢定表格 (Cox HR vs Group A)
    cox_rfs <- tryCatch(coxph(Surv(surv_year_rfs_5y, relapse_5y) ~ group, data = df_stage), error = function(e) NULL)
    if (!is.null(cox_rfs)) {
      cox_tbl_rfs <- tidy(cox_rfs, exponentiate = TRUE, conf.int = TRUE) %>%
        mutate(
          Treatment = sub("group", "", term),
          `HR (95% CI)` = sprintf("%.2f (%.2f - %.2f)", estimate, conf.low, conf.high),
          `P-value` = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
        ) %>% select(Treatment, `HR (95% CI)`, `P-value`)
      p_rfs_table <- ggtexttable(cox_tbl_rfs, rows = NULL, theme = ttheme("minimal", base_size = 12))
    } else {
      p_rfs_table <- ggparagraph(text = "Not estimable", size = 12)
    }

    # ==========================================
    # [OS] 整體存活率
    # ==========================================
    fit_os <- survfit(Surv(surv_year_os_5y, Vita_m_5y) ~ group, data = df_stage)
    p_os <- ggsurvplot(
      fit_os, data = df_stage,
      pval = TRUE,                      
      risk.table = TRUE,
      xlim = c(0, 5),
      break.time.by = 1,
      palette = km_colors,
      title = paste0("5-Year OS - Stage ", stg),
      xlab = "Time (Years)", ylab = "OS Probability",
      legend.title = "Treatment",
      ggtheme = theme_minimal(base_size = 14),
      risk.table.height = 0.25,
      risk.table.y.text = TRUE,         # ✅ 恢復文字標註
      risk.table.y.text.col = TRUE,     
      fontsize = 4.5,
      tables.theme = theme_cleantable()
    )

    # OS 底下的檢定表格 (Cox HR vs Group A)
    cox_os <- tryCatch(coxph(Surv(surv_year_os_5y, Vita_m_5y) ~ group, data = df_stage), error = function(e) NULL)
    if (!is.null(cox_os)) {
      cox_tbl_os <- tidy(cox_os, exponentiate = TRUE, conf.int = TRUE) %>%
        mutate(
          Treatment = sub("group", "", term),
          `HR (95% CI)` = sprintf("%.2f (%.2f - %.2f)", estimate, conf.low, conf.high),
          `P-value` = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
        ) %>% select(Treatment, `HR (95% CI)`, `P-value`)
      p_os_table <- ggtexttable(cox_tbl_os, rows = NULL, theme = ttheme("minimal", base_size = 12))
    } else {
      p_os_table <- ggparagraph(text = "Not estimable", size = 12)
    }

    # ==========================================
    # 組合輸出 (KM 圖 + Risk Table + Cox 檢定表)
    # ==========================================
    grid.newpage() 
    grid.arrange(
      p_rfs$plot, p_os$plot, 
      p_rfs$table, p_os$table, 
      p_rfs_table, p_os_table,
      ncol = 2, 
      heights = c(4, 1.2, 1) # 高度比例：KM 圖最大，Risk Table 其次，檢定表在最下面
    )
  }
}

五、Cox 比例風險模型 (Cox Regression)

評估治療組別、年齡、HR狀態、腫瘤分化與期別對 RFS 與 OS 的多變項影響。

library(survival)
library(dplyr)
library(broom)
library(flextable)
library(officer)

# === 定義生存物件 (改用我們整理好的、最乾淨完整的 df_analysis) ===
# OS (使用完整的追蹤年份與事件)
surv_os <- Surv(time = df_analysis$surv_year_os, event = df_analysis$Vita_m == 1)

# RFS (使用完整的追蹤年份與事件)
surv_rfs <- Surv(time = df_analysis$surv_year_rfs, event = df_analysis$rfs_event_raw == 1)

# === 要跑的變數 ===
vars <- c("group", "Age", "hr_status", "grade_f", "final_stage")

# --- function: 單因子 + 多因子 Cox ---
run_cox <- function(surv_obj, data, vars) {
  
  # 單因子 Cox
  uni_results <- lapply(vars, function(v) {
    f <- as.formula(paste("surv_obj ~", v))
    coxph(f, data = data)
  })
  
  uni_table <- lapply(uni_results, tidy, exponentiate = TRUE, conf.int = TRUE) %>%
    bind_rows(.id = "Variable") %>%
    mutate(Variable = vars[as.numeric(Variable)]) %>%
    select(Variable, term, estimate, conf.low, conf.high, p.value)
  
  # 多因子 Cox
  multi_model <- coxph(surv_obj ~ group + Age + hr_status + grade_f + final_stage,
                       data = data)
  
  multi_table <- tidy(multi_model, exponentiate = TRUE, conf.int = TRUE) %>%
    select(term, estimate, conf.low, conf.high, p.value)
  
  # 合併單因子與多因子
  cox_table <- uni_table %>%
    rename(Uni_HR = estimate, Uni_LCL = conf.low, Uni_UCL = conf.high, Uni_p = p.value) %>%
    left_join(
      multi_table %>%
        rename(Multi_HR = estimate, Multi_LCL = conf.low, Multi_UCL = conf.high, Multi_p = p.value),
      by = c("term")
    )
  
  # ✨ 智慧攔截：處理 Inf 或因為 0 事件導致的荒謬超大數值 (>100) → 轉為 NA
  cox_table <- cox_table %>%
    mutate(across(c(Uni_HR, Uni_LCL, Uni_UCL, Multi_HR, Multi_LCL, Multi_UCL),
                  ~ ifelse(. > 100 | is.infinite(.), NA, .)))
  
  return(cox_table)
}

# === 格式化成表格（乾淨 Level + 顯示 ref group） ===
format_cox <- function(cox_table, title) {
  
  cox_table_fmt <- cox_table %>%
    mutate(
      # 乾淨的 Level 名稱
      Level = case_when(
        grepl("group", term) ~ sub("group", "", term),
        grepl("hr_status", term) ~ sub("hr_status", "", term),
        grepl("grade_f", term) ~ sub("grade_f", "", term),
        grepl("final_stage", term) ~ sub("final_stage", "", term),
        term == "Age" ~ "Age (per year increase)",
        TRUE ~ term
      ),
      # 數值格式化 (遇到 NA 就顯示 Not estimable)
      Uni = ifelse(is.na(Uni_HR), "Not estimable",
                   sprintf("%.2f (%.2f–%.2f)", Uni_HR, Uni_LCL, Uni_UCL)),
      Multi = ifelse(is.na(Multi_HR), "Not estimable",
                     sprintf("%.2f (%.2f–%.2f)", Multi_HR, Multi_LCL, Multi_UCL)),
      Uni_p = ifelse(is.na(Uni_p), "", sprintf("%.3f", Uni_p)),
      Multi_p = ifelse(is.na(Multi_p), "", sprintf("%.3f", Multi_p))
    ) %>%
    select(Variable, Level, Uni, Uni_p, Multi, Multi_p)
  
  # 各變數的 reference group
  ref_rows <- tribble(
    ~Variable,      ~Level,                ~Uni, ~Uni_p, ~Multi, ~Multi_p,
    "group",        "Group A (ref)",       "",   "",     "",     "",
    "hr_status",    "HR negative (ref)",   "",   "",     "",     "",
    "grade_f",      "Grade I (ref)",       "",   "",     "",     "",
    "final_stage",  "Stage 1A (ref)",      "",   "",     "",     ""
  )
  
  # 插入 ref 並讓它排在各變數最前面
  cox_table_fmt <- bind_rows(cox_table_fmt, ref_rows) %>%
    mutate(Level = trimws(Level)) %>%
    arrange(factor(Variable, levels = c("group","Age","hr_status","grade_f","final_stage")),
            case_when(
              grepl("ref", Level) ~ 0,  # ref 永遠排最上面
              TRUE ~ 1
            ),
            Level)
  
  flextable(cox_table_fmt) %>%
    set_header_labels(
      Variable = "Variable",
      Level = "Level",
      Uni = "Univariate HR (95% CI)",
      Uni_p = "Univariate p",
      Multi = "Multivariate HR (95% CI)",
      Multi_p = "Multivariate p"
    ) %>%
    add_header_row(values = title, colwidths = 6) %>%
    autofit() %>%
    bold(i = ~ as.numeric(Uni_p) < 0.05 | as.numeric(Multi_p) < 0.05, bold = TRUE)
}

# === 跑 OS & RFS ===
cox_os <- run_cox(surv_os, df_analysis, vars)
cox_rfs <- run_cox(surv_rfs, df_analysis, vars)

# === 轉換成表格 (直接在 RMarkdown 顯示) ===
tbl_os <- format_cox(cox_os, "Table 3a. Cox Regression for Overall Survival (OS)")
tbl_rfs <- format_cox(cox_rfs, "Table 3b. Cox Regression for Recurrence-Free Survival (RFS)")

tbl_os

Table 3a. Cox Regression for Overall Survival (OS)
Variable	Level	Univariate HR (95% CI)	Univariate p	Multivariate HR (95% CI)	Multivariate p
group	Group A (ref)
group	Group B	2.33 (0.94–5.76)	0.068	0.96 (0.35–2.58)	0.928
group	Group C	1.29 (0.59–2.81)	0.524	0.87 (0.34–2.18)	0.760
group	Group D	0.59 (0.36–0.97)	0.039	0.38 (0.20–0.73)	0.003
Age	Age (per year increase)	1.07 (1.05–1.09)	0.000	1.05 (1.03–1.07)	0.000
hr_status	HR negative (ref)
hr_status	HR positive	0.81 (0.54–1.22)	0.320	0.73 (0.47–1.16)	0.181
grade_f	Grade I (ref)
grade_f	II	0.49 (0.12–2.02)	0.322	0.18 (0.04–0.85)	0.030
grade_f	III	0.53 (0.13–2.19)	0.382	0.12 (0.02–0.58)	0.009
final_stage	Stage 1A (ref)
final_stage	1B	0.00 (0.00–NA)	0.994	0.00 (0.00–NA)	0.995
final_stage	2A	2.53 (1.26–5.09)	0.009	3.49 (1.55–7.83)	0.003
final_stage	2B	3.75 (1.81–7.78)	0.000	7.41 (3.03–18.12)	0.000
final_stage	3A	3.72 (1.70–8.16)	0.001	7.71 (3.06–19.44)	0.000
final_stage	3B	9.27 (2.99–28.75)	0.000	7.66 (2.24–26.13)	0.001
final_stage	3C	7.23 (3.62–14.46)	0.000	9.59 (3.97–23.13)	0.000

tbl_rfs

Table 3b. Cox Regression for Recurrence-Free Survival (RFS)
Variable	Level	Univariate HR (95% CI)	Univariate p	Multivariate HR (95% CI)	Multivariate p
group	Group A (ref)
group	Group B	2.09 (0.85–5.14)	0.107	0.47 (0.12–1.74)	0.255
group	Group C	1.10 (0.51–2.37)	0.817	0.48 (0.18–1.24)	0.128
group	Group D	0.66 (0.41–1.06)	0.084	0.24 (0.12–0.48)	0.000
Age	Age (per year increase)	1.03 (1.01–1.04)	0.001	1.01 (0.99–1.03)	0.464
hr_status	HR negative (ref)
hr_status	HR positive	0.72 (0.50–1.04)	0.080	0.73 (0.46–1.16)	0.180
grade_f	Grade I (ref)
grade_f	II	Not estimable	0.996	Not estimable	0.995
grade_f	III	Not estimable	0.996	Not estimable	0.995
final_stage	Stage 1A (ref)
final_stage	1B	0.00 (0.00–NA)	0.994	0.00 (0.00–NA)	0.997
final_stage	2A	2.40 (1.33–4.33)	0.004	5.18 (2.31–11.63)	0.000
final_stage	2B	1.91 (0.93–3.93)	0.079	5.22 (1.93–14.16)	0.001
final_stage	3A	2.06 (0.94–4.50)	0.070	6.11 (2.24–16.69)	0.000
final_stage	3B	7.44 (2.75–20.18)	0.000	10.98 (3.23–37.25)	0.000
final_stage	3C	7.50 (4.21–13.36)	0.000	18.57 (7.71–44.74)	0.000

六、新輔助治療 (NAC) 時代演變趨勢

觀察 2007-2022 年間，不同期別與 HR 狀態下採用 NAC 的比例變化。

summary_nac <- df_analysis %>%
  mutate(year = year(DIAG_DT), NAC = as.integer(NAC_binary == 1)) %>%
  filter(!is.na(year), !is.na(hr_status)) %>%
  group_by(year, final_stage, hr_status) %>%
  summarise(total = n(), nac_rate = sum(NAC) / total, .groups = "drop") %>%
  filter(final_stage %in% c("1A", "2A", "2B", "3A", "3B", "3C"))

ggplot(summary_nac, aes(x = year, y = nac_rate, color = hr_status)) +
  geom_line(size = 1) + geom_point(size = 2) +
  facet_wrap(~ final_stage, ncol = 3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "NAC Adoption Rate Over Time by Stage and HR Status", x = "Year of Diagnosis", y = "NAC Rate", color = "HR Status") +
  theme_minimal(base_size = 14) + theme(legend.position = "bottom")

Breast cancer for graduation presentation

b10406023 Nature Huang

2026-04-30