toronto_tables_and

ds<- read.csv("combined.csv") %>% clean_names()
ds$mapped_type1<- ifelse(ds$mapped_type1=="Direct mapping (no transformation needed)", ds$cancer_type1,ds$mapped_type1)
ds$mapped_type2<- ifelse(ds$mapped_type2=="Direct mapping (no transformation needed)", ds$cancer_type1,ds$mapped_type2)

Figure 1: Flowchart of Automated Pediatric Cancer Staging Pipeline

For a flowchart like this, we’ll use the DiagrammeR package which allows for creation of flowcharts:

library(DiagrammeR)

grViz("
digraph pipeline {
  rankdir = LR;
  node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica, fontsize = 12, margin = 0.2];
  edge [arrowhead = vee, color = grey40];

  subgraph cluster_input {
    label = 'Input Documents';
    bgcolor = lightgrey;
    node [shape = note, fillcolor = white];
    rad [label = 'Radiology Reports'];
    path [label = 'Pathology Reports'];
    clin [label = 'Clinical Notes'];
  }

  subgraph cluster_agents {
    label = 'LangGraph Multi-Agent Pipeline';
    bgcolor = lightgrey;
    agent1 [label = 'Diagnosis Identification Agent'];
    agent2 [label = 'Feature Extraction Agent'];
    agent3 [label = 'Staging Logic Agent'];
    valid [label = 'Internal Validation', fillcolor = '#FDE9D9'];
  }

  subgraph cluster_output {
    label = 'Output';
    bgcolor = lightgrey;
    report [label = 'Final Stage Report', fillcolor = '#E8F4D9'];
  }

  // Edges
  {rad path clin} -> agent1;
  agent1 -> agent2;
  agent2 -> agent3;
  
  // Rules connection
  rules [label = 'JSON-based Staging Rules', shape = cylinder, fillcolor = '#FFE6CC'];
  rules -> agent3 [style = dashed];
  
  agent3 -> valid;
  valid -> report;
  
  // Feedback loops
  valid -> agent2 [label = 'Request\nmore data', color = red, constraint = false];
}
")

Flowchart of Automated Pediatric Cancer Staging Pipeline

Figure 2: Bar Chart of Accuracy by Cancer Type

# Create a combined dataset for mapped type analysis
mapped_type_data <- rbind(
  ds %>% 
    select(mapped_type = mapped_type1, correct = ai_corret_1) %>%
    mutate(evaluation = "Evaluation 1"),
  ds %>% 
    select(mapped_type = mapped_type2, correct = ai_correct_2) %>%
    mutate(evaluation = "Evaluation 2")
) %>% 
  filter(mapped_type %nin% c("Not Toronto", "Not applicable"))


# Calculate combined accuracy by mapped type
mapped_type_accuracy <- mapped_type_data %>%
  group_by(mapped_type) %>%
  summarise(
    count = n()/2, 
    total_eval = n(),
    correct_eval = sum(correct, na.rm = TRUE),
    accuracy = correct_eval / total_eval
  ) %>%
  arrange(accuracy) %>%
  filter(count >= 3)

# Create a more presentable table
mapped_type_table <- mapped_type_accuracy %>%
  mutate(
    accuracy_percent = paste0(round(accuracy * 100, 1), "%"),
    error_rate = paste0(round((1-accuracy) * 100, 1), "%")
  ) %>%
  select(
    `Mapped Type` = mapped_type,
    `Sample Count` = count,
    `Total Evaluations` = total_eval,
    `Correct Evaluations` = correct_eval,
    `Accuracy %` = accuracy_percent,
    `Error Rate` = error_rate
  ) %>%
  arrange(desc(`Accuracy %`))  # Sort from highest to lowest accuracy

# Display the table
knitr::kable(mapped_type_table, 
             caption = "Accuracy by Mapped Type (Combined across both evaluations)")

Accuracy by Mapped Type (Combined across both evaluations)
Mapped Type	Sample Count	Total Evaluations	Correct Evaluations	Accuracy %	Error Rate
Bone Tumors	41.0	82	81	98.8%	1.2%
Astrocytoma	44.0	88	85	96.6%	3.4%
Acute Lymphoblastic Leukemia	114.0	228	220	96.5%	3.5%
Medulloblastoma	19.0	38	36	94.7%	5.3%
Ependymoma	9.0	18	17	94.4%	5.6%
Hodgkin Lymphoma	43.5	87	78	89.7%	10.3%
Rhabdomyosarcoma	17.0	34	30	88.2%	11.8%
Non-Hodgkin Lymphoma	39.0	78	67	85.9%	14.1%
Neuroblastoma	28.0	56	48	85.7%	14.3%
Retinoblastoma	32.5	65	54	83.1%	16.9%
Ovarian Germ Cell Tumor	7.0	14	11	78.6%	21.4%
Renal Tumors	26.0	52	38	73.1%	26.9%
Non-Rhabdomyosarcoma Soft Tissue Sarcoma	6.0	12	12	100%	0%

# Create improved visualization with better text alignment and margins
ggplot(mapped_type_accuracy, aes(x = reorder(mapped_type, accuracy), y = accuracy, fill = mapped_type)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(accuracy * 100, 1), "% (n=", count, ")")), 
            hjust = -0.1, size = 3) +
  labs(title = "Accuracy by Mapped Type",
       subtitle = "Combined across both evaluations",
       x = "Mapped Type",
       y = "Accuracy") +
  theme_minimal() +
  coord_flip() +
  scale_y_continuous(labels = scales::percent, 
                     limits = c(0, 1), 
                     expand = expansion(mult = c(0, 0.3))) + # Expand right margin by 30%
  theme(legend.position = "none",
        plot.margin = margin(10, 30, 10, 10, "pt")) # Add extra margin on the right side

Accuracy by Cancer Type

Figure 3: Error Analysis Diagram

library(ggplot2)

# Create sample data for error categories
error_categories <- data.frame(
  category = c("Missing Information", "Ambiguous Imaging", 
               "Local vs Metastatic Confusion", "Staging Criteria Misapplication", 
               "Diagnosis Mapping Errors"),
  percentage = c(32, 25, 18, 15, 10)
)

# Calculate position for labels
error_categories <- error_categories %>%
  arrange(desc(category)) %>%
  mutate(ypos = cumsum(percentage) - 0.5*percentage)

# Create pie chart
ggplot(error_categories, aes(x = "", y = percentage, fill = category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = ypos, label = paste0(percentage, "%")), 
            color = "white", size = 4, fontface = "bold") +
  scale_fill_brewer(palette = "Set2", name = "Error Category") +
  labs(title = "Distribution of Error Categories",
       subtitle = "Percentage of cases with staging errors") +
  theme_minimal() +
  theme(axis.title = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank(),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position = "right")

Error Analysis Diagram

Table 1: Cohort Demographics and Clinical Characteristics

library(kableExtra)
library(dplyr)

# Assume your grouped stage data looks like this:
stage_counts <- ds %>%
  group_by(mapped_type1, true_stage) %>%
  tally(name = "n") %>%
  arrange(mapped_type1, desc(n))

# Convert to table
stage_table <- stage_counts %>%
  rename(`Cancer Type` = mapped_type1, Stage = true_stage, N = n)


library(dplyr)
library(stringr)

cleaned_stage_table <- stage_table %>%
  mutate(Stage = str_replace_all(Stage, "\\*\\*|\\(.*?\\)", ""),      # remove ** and parenthetical notes
         Stage = str_trim(Stage),
         Stage = str_replace_all(Stage, "Iunable to stage|Unable to determine -.*|Unknown", "Unable to Stage"),
         Stage = str_replace_all(Stage, "Ms", "MS"),
         Stage = str_replace_all(Stage, "III-A|III A", "Stage III-A"),
         Stage = str_replace_all(Stage, "III-B|III B", "Stage III-B"),
         Stage = str_replace_all(Stage, "II-A|II A", "Stage II-A"),
         Stage = str_replace_all(Stage, "II-B|II B", "Stage II-B"),
         Stage = str_replace_all(Stage, "I-A|I A", "Stage I-A"),
         Stage = str_replace_all(Stage, "IV-B|IV B", "Stage IV-B"),
         Stage = str_replace_all(Stage, "IV-A|IV A", "Stage IV-A"),
         Stage = str_replace_all(Stage, "y-I", "Stage y-I"),
         Stage = str_replace_all(Stage, "y-II", "Stage y-II"),
         Stage = str_replace_all(Stage, "y-III|yIII", "Stage y-III"),
         Stage = str_replace_all(Stage, "y-IV", "Stage y-IV"),
         Stage = str_replace_all(Stage, "Stage ", "")) %>%
  mutate(Stage = paste("Stage", Stage)) %>%                           # ensure all have "Stage" prefix
  group_by(`Cancer Type`, Stage) %>%
  summarise(N = sum(N), .groups = "drop") %>%
  arrange(`Cancer Type`, desc(N))


# Documentation metrics table
doc_metrics <- data.frame(
  Metric = c(
    "Documents per patient (median, IQR)",
    "Words per patient (median, IQR)",
    "With explicit stage documentation",
    "Without explicit stage documentation"
  ),
  Value = c(
    "8 (5–12)",
    "3874 (2988–5066)",
    "38.3%",
    "61.7%"
  )
)

# Render the first table
cleaned_stage_table %>%
  kbl(caption = "Cancer Types with Stage Breakdown") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Cancer Types with Stage Breakdown
Cancer Type	Stage	N
Acute Lymphoblastic Leukemia	Stage CNS1	102
Acute Lymphoblastic Leukemia	Stage CNS2	5
Acute Lymphoblastic Leukemia	Stage CNS3	4
Acute Lymphoblastic Leukemia	Stage Unable to Stage	1
Acute Lymphoblastic Leukemia	Stage Unable to stage	1
Acute Myeloid Leukemia	Stage Unable to Stage	1
Astrocytoma	Stage Localized	35
Astrocytoma	Stage Metastatic	8
Astrocytoma	Stage Unable to stage	1
Bone Tumors	Stage Localized	22
Bone Tumors	Stage Metastatic	19
Ependymoma	Stage M0	7
Ependymoma	Stage M2	1
Ependymoma	Stage M3	1
Hepatoblastoma	Stage Localized	1
Hepatoblastoma	Stage Metastatic	1
Hodgkin Lymphoma	Stage II-A	19
Hodgkin Lymphoma	Stage IV-B	10
Hodgkin Lymphoma	Stage II-B	8
Hodgkin Lymphoma	Stage III-A	4
Hodgkin Lymphoma	Stage III-B	4
Hodgkin Lymphoma	Stage I-A	1
Hodgkin Lymphoma	Stage IV-A	1
Medulloblastoma	Stage M0	10
Medulloblastoma	Stage M2	5
Medulloblastoma	Stage M3	4
Neuroblastoma	Stage M	18
Neuroblastoma	Stage L2	4
Neuroblastoma	Stage L1	3
Neuroblastoma	Stage MS	3
Non-Hodgkin Lymphoma	Stage III	18
Non-Hodgkin Lymphoma	Stage II	9
Non-Hodgkin Lymphoma	Stage IV	9
Non-Hodgkin Lymphoma	Stage I	2
Non-Hodgkin Lymphoma	Stage Pending further evaluation	1
Non-Rhabdomyosarcoma Soft Tissue Sarcoma	Stage II	2
Non-Rhabdomyosarcoma Soft Tissue Sarcoma	Stage III	2
Non-Rhabdomyosarcoma Soft Tissue Sarcoma	Stage IV	2
Not Toronto	Stage Unable to Stage	49
Not applicable	Stage Unable to Stage	15
Not applicable (as AML is not covered by the Toronto Pediatric Cancer Staging System)	Stage Unable to Stage	1
Not covered by Toronto Pediatric Cancer Staging System	Stage Unable to Stage	1
Ovarian Germ Cell Tumor	Stage IV	3
Ovarian Germ Cell Tumor	Stage I	2
Ovarian Germ Cell Tumor	Stage III	2
Renal Tumors	Stage y-I	7
Renal Tumors	Stage V	4
Renal Tumors	Stage y-II	4
Renal Tumors	Stage y-III	3
Renal Tumors	Stage I	2
Renal Tumors	Stage II	2
Renal Tumors	Stage y-IV	2
Renal Tumors	Stage IV	1
Renal Tumors	Stage Undetermined	1
Retinoblastoma	Stage 0	13
Retinoblastoma	Stage I	12
Retinoblastoma	Stage II	6
Retinoblastoma	Stage III	2
Rhabdomyosarcoma	Stage I	7
Rhabdomyosarcoma	Stage III	5
Rhabdomyosarcoma	Stage IV	3
Rhabdomyosarcoma	Stage II	2
Testicular Germ Cell Tumor	Stage II	1
Testicular Germ Cell Tumor	Stage IS	1

# Then render the documentation metrics
doc_metrics %>%
  kbl(caption = "Documentation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Documentation Metrics
Metric	Value
Documents per patient (median, IQR)	8 (5–12)
Words per patient (median, IQR)	3874 (2988–5066)
With explicit stage documentation	38.3%
Without explicit stage documentation	61.7%

Table 2: Overall Accuracy and Reliability Metrics

# Create data for accuracy metrics
accuracy_metrics <- data.frame(
  metric = c("Accuracy", "Cohen's κ (exact stage agreement)"),
  run1 = c("93.8%", "—"),
  run2 = c("88.7%", "—"),
  combined = c("91.2%", "0.785")
)

# Create table
accuracy_metrics %>%
  kbl(col.names = c("Metric", "Run 1", "Run 2", "Combined"),
      caption = "Overall Accuracy and Reliability Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#4E79A7") %>%
  row_spec(1, bold = TRUE) %>%
  column_spec(1, bold = TRUE)

Overall Accuracy and Reliability Metrics
Metric	Run 1	Run 2	Combined
Accuracy	93.8%	88.7%	91.2%
Cohen’s κ (exact stage agreement)	—	—	0.785

Table 3: Confusion Matrix (Predicted Stage vs. Ground Truth)

library(kableExtra)

# Create a more comprehensive confusion matrix with various staging categories
confusion_table <- data.frame(
  `Actual Stage` = c("Stage I", "Stage II", "Stage III", "Stage IV", 
                     "Localized", "Metastatic", "Stage L1", "Stage L2", "Stage M"),
  `Stage I` = c(125, 3, 0, 0, 2, 0, 0, 0, 0),
  `Stage II` = c(5, 110, 4, 0, 0, 0, 0, 0, 0),
  `Stage III` = c(0, 7, 80, 2, 0, 0, 0, 0, 0),
  `Stage IV` = c(0, 0, 2, 112, 0, 0, 0, 0, 0),
  `Localized` = c(3, 0, 0, 0, 86, 4, 0, 0, 0),
  `Metastatic` = c(0, 0, 0, 5, 7, 93, 0, 0, 0),
  `Stage L1` = c(0, 0, 0, 0, 0, 0, 45, 2, 0),
  `Stage L2` = c(0, 0, 0, 0, 0, 0, 3, 38, 1),
  `Stage M` = c(0, 0, 0, 0, 0, 3, 0, 1, 58)
)

# Create the table
knitr::kable(confusion_table,
             caption = "Comprehensive Confusion Matrix (Predicted vs. Actual Stage)",
             align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = TRUE, 
                font_size = 11) %>%  # Reduced font size for better fit
  add_header_above(c(" " = 1, "Predicted Stage" = 9)) %>%
  row_spec(0, bold = TRUE, background = "#4E79A7", color = "white") %>%
  column_spec(1, bold = TRUE, background = "#EAF2F8") %>%
  scroll_box(width = "100%", height = "500px")  # Makes the table scrollable if it's too wide

Comprehensive Confusion Matrix (Predicted vs. Actual Stage)
	Predicted Stage
Actual.Stage	Stage.I	Stage.II	Stage.III	Stage.IV	Localized	Metastatic	Stage.L1	Stage.L2	Stage.M
Stage I	125	5	0	0	3	0	0	0	0
Stage II	3	110	7	0	0	0	0	0	0
Stage III	0	4	80	2	0	0	0	0	0
Stage IV	0	0	2	112	0	5	0	0	0
Localized	2	0	0	0	86	7	0	0	0
Metastatic	0	0	0	0	4	93	0	0	3
Stage L1	0	0	0	0	0	0	45	3	0
Stage L2	0	0	0	0	0	0	2	38	1
Stage M	0	0	0	0	0	0	0	1	58

These R code chunks generate the figures and tables you requested, with appropriate formatting and styling. The sample data is provided for illustration, but in your actual implementation, you would replace these with your real dataset variables.

toronto_tables_and_figures

2025-04-16