# Load the combined results file and normalise its column names.
ds <- clean_names(read.csv("combined.csv"))

# Wherever a mapped type holds the "direct mapping" placeholder, substitute
# the value of `cancer_type1`; every other row keeps its mapped type.
# NOTE(review): `mapped_type2` also falls back to `cancer_type1` rather than
# a second cancer-type column -- confirm this is intended.
ds[c("mapped_type1", "mapped_type2")] <- lapply(
  ds[c("mapped_type1", "mapped_type2")],
  function(mapped) {
    ifelse(
      mapped == "Direct mapping (no transformation needed)",
      ds$cancer_type1,
      mapped
    )
  }
)
For a flowchart like this, we’ll use the DiagrammeR package, whose `grViz()` function renders a diagram from a Graphviz DOT specification:
library(DiagrammeR)
# Render the staging-pipeline flowchart from an inline Graphviz DOT spec.
# Layout is left-to-right (rankdir = LR) with three clusters: the input
# documents (radiology, pathology, clinical notes), the multi-agent pipeline
# (three agents plus internal validation) and the final stage report.
# All three inputs feed agent1; the rules store is joined to agent3 by a
# dashed edge; the red validation -> agent2 edge is the feedback loop and is
# marked `constraint = false` so it does not affect node ranking.
# Labels use single quotes because the whole spec is one double-quoted R
# string.
grViz("
digraph pipeline {
rankdir = LR;
node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica, fontsize = 12, margin = 0.2];
edge [arrowhead = vee, color = grey40];
subgraph cluster_input {
label = 'Input Documents';
bgcolor = lightgrey;
node [shape = note, fillcolor = white];
rad [label = 'Radiology Reports'];
path [label = 'Pathology Reports'];
clin [label = 'Clinical Notes'];
}
subgraph cluster_agents {
label = 'LangGraph Multi-Agent Pipeline';
bgcolor = lightgrey;
agent1 [label = 'Diagnosis Identification Agent'];
agent2 [label = 'Feature Extraction Agent'];
agent3 [label = 'Staging Logic Agent'];
valid [label = 'Internal Validation', fillcolor = '#FDE9D9'];
}
subgraph cluster_output {
label = 'Output';
bgcolor = lightgrey;
report [label = 'Final Stage Report', fillcolor = '#E8F4D9'];
}
// Edges
{rad path clin} -> agent1;
agent1 -> agent2;
agent2 -> agent3;
// Rules connection
rules [label = 'JSON-based Staging Rules', shape = cylinder, fillcolor = '#FFE6CC'];
rules -> agent3 [style = dashed];
agent3 -> valid;
valid -> report;
// Feedback loops
valid -> agent2 [label = 'Request\nmore data', color = red, constraint = false];
}
")
Flowchart of Automated Pediatric Cancer Staging Pipeline
# Stack both evaluations into one long table: one row per case and
# evaluation, holding the mapped type, the correctness flag for that
# evaluation, and an "Evaluation 1" / "Evaluation 2" label.
# NOTE(review): `ai_corret_1` is spelled differently from `ai_correct_2`;
# presumably it mirrors a misspelled header in combined.csv -- confirm before
# renaming.
mapped_type_data <- rbind(
  ds %>%
    select(mapped_type = mapped_type1, correct = ai_corret_1) %>%
    mutate(evaluation = "Evaluation 1"),
  ds %>%
    select(mapped_type = mapped_type2, correct = ai_correct_2) %>%
    mutate(evaluation = "Evaluation 2")
) %>%
  # Drop rows carrying one of the two exclusion labels. Base `%in%` replaces
  # `%nin%`, which is not part of base R or dplyr, so this step no longer
  # depends on an extra package being attached. Rows with an NA mapped type
  # are kept, exactly as before.
  filter(!(mapped_type %in% c("Not Toronto", "Not applicable")))
# Pool both evaluations per mapped type and compute the share marked correct.
#   count        - half the number of pooled rows for the type
#   total_eval   - number of pooled rows (rows with NA `correct` included)
#   correct_eval - sum of `correct`, ignoring NA
#   accuracy     - correct_eval / total_eval
# NOTE(review): `count` is fractional whenever a type has an odd number of
# pooled rows (the rendered table shows 43.5 and 32.5) -- confirm that
# "rows / 2" is the intended sample count.
mapped_type_accuracy <- mapped_type_data %>%
  group_by(mapped_type) %>%
  summarise(
    count = n() / 2,
    total_eval = n(),
    correct_eval = sum(correct, na.rm = TRUE),
    accuracy = correct_eval / total_eval,
    .groups = "drop"
  ) %>%
  # Keep types with a count of at least 3, then list them worst first.
  filter(count >= 3) %>%
  arrange(accuracy)
# Build the presentation table, highest accuracy first.
mapped_type_table <- mapped_type_accuracy %>%
  # Sort on the numeric accuracy *before* it is formatted as text. Sorting
  # the formatted strings is alphabetical, which ranks "100%" below "98.8%".
  arrange(desc(accuracy)) %>%
  mutate(
    accuracy_percent = paste0(round(accuracy * 100, 1), "%"),
    error_rate = paste0(round((1 - accuracy) * 100, 1), "%")
  ) %>%
  select(
    `Mapped Type` = mapped_type,
    `Sample Count` = count,
    `Total Evaluations` = total_eval,
    `Correct Evaluations` = correct_eval,
    `Accuracy %` = accuracy_percent,
    `Error Rate` = error_rate
  )

# Display the table
knitr::kable(
  mapped_type_table,
  caption = "Accuracy by Mapped Type (Combined across both evaluations)"
)
| Mapped Type | Sample Count | Total Evaluations | Correct Evaluations | Accuracy % | Error Rate |
|---|---|---|---|---|---|
| Bone Tumors | 41.0 | 82 | 81 | 98.8% | 1.2% |
| Astrocytoma | 44.0 | 88 | 85 | 96.6% | 3.4% |
| Acute Lymphoblastic Leukemia | 114.0 | 228 | 220 | 96.5% | 3.5% |
| Medulloblastoma | 19.0 | 38 | 36 | 94.7% | 5.3% |
| Ependymoma | 9.0 | 18 | 17 | 94.4% | 5.6% |
| Hodgkin Lymphoma | 43.5 | 87 | 78 | 89.7% | 10.3% |
| Rhabdomyosarcoma | 17.0 | 34 | 30 | 88.2% | 11.8% |
| Non-Hodgkin Lymphoma | 39.0 | 78 | 67 | 85.9% | 14.1% |
| Neuroblastoma | 28.0 | 56 | 48 | 85.7% | 14.3% |
| Retinoblastoma | 32.5 | 65 | 54 | 83.1% | 16.9% |
| Ovarian Germ Cell Tumor | 7.0 | 14 | 11 | 78.6% | 21.4% |
| Renal Tumors | 26.0 | 52 | 38 | 73.1% | 26.9% |
| Non-Rhabdomyosarcoma Soft Tissue Sarcoma | 6.0 | 12 | 12 | 100% | 0% |
# Horizontal bar chart of pooled accuracy per mapped type. Bars are ordered
# by accuracy and each carries an "xx.x% (n=count)" label just past its end.
ggplot(
  mapped_type_accuracy,
  aes(x = reorder(mapped_type, accuracy), y = accuracy, fill = mapped_type)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(round(accuracy * 100, 1), "% (n=", count, ")")),
    hjust = -0.1,
    size = 3
  ) +
  coord_flip() +
  scale_y_continuous(
    labels = scales::percent,
    limits = c(0, 1),
    # 30% of extra room beyond the upper limit so the labels are not clipped
    expand = expansion(mult = c(0, 0.3))
  ) +
  labs(
    title = "Accuracy by Mapped Type",
    subtitle = "Combined across both evaluations",
    x = "Mapped Type",
    y = "Accuracy"
  ) +
  theme_minimal() +
  theme(
    # The fill only distinguishes bars, so the legend is hidden
    legend.position = "none",
    # Wider right-hand margin (top, right, bottom, left)
    plot.margin = margin(10, 30, 10, 10, "pt")
  )
Accuracy by Cancer Type
library(ggplot2)
# Sample error-category shares (in percent), sorted by category name in
# descending order, plus `ypos`: the midpoint of each category's segment on
# the cumulative percentage scale, used to place the slice labels.
error_categories <- data.frame(
  category = c(
    "Missing Information",
    "Ambiguous Imaging",
    "Local vs Metastatic Confusion",
    "Staging Criteria Misapplication",
    "Diagnosis Mapping Errors"
  ),
  percentage = c(32, 25, 18, 15, 10)
) %>%
  arrange(desc(category)) %>%
  mutate(ypos = cumsum(percentage) - percentage / 2)
# Pie chart: one stacked bar of the category percentages wrapped into polar
# coordinates, with each slice labelled by its percentage at `ypos`.
ggplot(error_categories, aes(x = "", y = percentage, fill = category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  geom_text(
    aes(y = ypos, label = paste0(percentage, "%")),
    color = "white",
    size = 4,
    fontface = "bold"
  ) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(name = "Error Category", palette = "Set2") +
  labs(
    title = "Distribution of Error Categories",
    subtitle = "Percentage of cases with staging errors"
  ) +
  theme_minimal() +
  theme(
    # Axes and grid lines are removed; title and subtitle are centred
    axis.title = element_blank(),
    axis.text = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right"
  )
Error Analysis Diagram
library(kableExtra)
library(dplyr)
# Number of rows per (mapped_type1, true_stage) pair, ordered by mapped type
# and then by descending count.
# `count()` replaces `group_by() %>% tally()`: it yields the same rows but
# leaves no grouping behind on `stage_counts` / `stage_table`.
stage_counts <- ds %>%
  count(mapped_type1, true_stage, name = "n") %>%
  arrange(mapped_type1, desc(n))

# Same counts with presentation-friendly column names.
stage_table <- stage_counts %>%
  rename(`Cancer Type` = mapped_type1, Stage = true_stage, N = n)
library(dplyr)
library(stringr)
# Normalise the free-text stage labels so spelling variants of one stage
# collapse into a single row, then re-total the counts per cancer type.
cleaned_stage_table <- stage_table %>%
  mutate(
    # Remove "**" markers and parenthetical notes, then outer whitespace.
    Stage = str_replace_all(Stage, "\\*\\*|\\(.*?\\)", ""),
    Stage = str_trim(Stage),
    Stage = str_replace_all(Stage, "Ms", "MS"),
    # Rewrite "III A" / "yIII" style variants to the hyphenated form. Each
    # rule also inserts a "Stage " prefix, and later rules can match inside
    # an earlier result (e.g. "II-A" inside "III-A"); every "Stage " is
    # stripped again below, so only the hyphenation survives.
    Stage = str_replace_all(Stage, "III-A|III A", "Stage III-A"),
    Stage = str_replace_all(Stage, "III-B|III B", "Stage III-B"),
    Stage = str_replace_all(Stage, "II-A|II A", "Stage II-A"),
    Stage = str_replace_all(Stage, "II-B|II B", "Stage II-B"),
    Stage = str_replace_all(Stage, "I-A|I A", "Stage I-A"),
    Stage = str_replace_all(Stage, "IV-B|IV B", "Stage IV-B"),
    Stage = str_replace_all(Stage, "IV-A|IV A", "Stage IV-A"),
    Stage = str_replace_all(Stage, "y-I", "Stage y-I"),
    Stage = str_replace_all(Stage, "y-II", "Stage y-II"),
    Stage = str_replace_all(Stage, "y-III|yIII", "Stage y-III"),
    Stage = str_replace_all(Stage, "y-IV", "Stage y-IV"),
    Stage = str_replace_all(Stage, "Stage ", ""),
    # Collapse "cannot stage" wording onto one label. The previous pattern
    # began with a stray literal "I" ("Iunable to stage") and so never
    # matched "Unable to stage", leaving it as a separate row from
    # "Unable to Stage". Matching is now case-insensitive and runs after the
    # "Stage " strip above so that strip cannot undo the replacement. The
    # optional " stage" keeps "Unknown stage" from becoming
    # "Unable to Stage stage".
    Stage = str_replace_all(
      Stage,
      regex(
        "unable to stage|unable to determine -.*|unknown( stage)?",
        ignore_case = TRUE
      ),
      "Unable to Stage"
    ),
    # Ensure every label carries exactly one "Stage" prefix.
    Stage = paste("Stage", Stage)
  ) %>%
  group_by(`Cancer Type`, Stage) %>%
  summarise(N = sum(N), .groups = "drop") %>%
  arrange(`Cancer Type`, desc(N))
# Documentation metrics table: one hard-coded (metric, value) pair per row,
# both columns stored as text.
doc_metrics <- setNames(
  data.frame(
    rbind(
      c("Documents per patient (median, IQR)", "8 (5–12)"),
      c("Words per patient (median, IQR)", "3874 (2988–5066)"),
      c("With explicit stage documentation", "38.3%"),
      c("Without explicit stage documentation", "61.7%")
    )
  ),
  c("Metric", "Value")
)
# Render the stage breakdown as a striped, hover-highlighted table that is
# not stretched to the full page width.
kable_styling(
  kbl(cleaned_stage_table, caption = "Cancer Types with Stage Breakdown"),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
| Cancer Type | Stage | N |
|---|---|---|
| Acute Lymphoblastic Leukemia | Stage CNS1 | 102 |
| Acute Lymphoblastic Leukemia | Stage CNS2 | 5 |
| Acute Lymphoblastic Leukemia | Stage CNS3 | 4 |
| Acute Lymphoblastic Leukemia | Stage Unable to Stage | 1 |
| Acute Lymphoblastic Leukemia | Stage Unable to stage | 1 |
| Acute Myeloid Leukemia | Stage Unable to Stage | 1 |
| Astrocytoma | Stage Localized | 35 |
| Astrocytoma | Stage Metastatic | 8 |
| Astrocytoma | Stage Unable to stage | 1 |
| Bone Tumors | Stage Localized | 22 |
| Bone Tumors | Stage Metastatic | 19 |
| Ependymoma | Stage M0 | 7 |
| Ependymoma | Stage M2 | 1 |
| Ependymoma | Stage M3 | 1 |
| Hepatoblastoma | Stage Localized | 1 |
| Hepatoblastoma | Stage Metastatic | 1 |
| Hodgkin Lymphoma | Stage II-A | 19 |
| Hodgkin Lymphoma | Stage IV-B | 10 |
| Hodgkin Lymphoma | Stage II-B | 8 |
| Hodgkin Lymphoma | Stage III-A | 4 |
| Hodgkin Lymphoma | Stage III-B | 4 |
| Hodgkin Lymphoma | Stage I-A | 1 |
| Hodgkin Lymphoma | Stage IV-A | 1 |
| Medulloblastoma | Stage M0 | 10 |
| Medulloblastoma | Stage M2 | 5 |
| Medulloblastoma | Stage M3 | 4 |
| Neuroblastoma | Stage M | 18 |
| Neuroblastoma | Stage L2 | 4 |
| Neuroblastoma | Stage L1 | 3 |
| Neuroblastoma | Stage MS | 3 |
| Non-Hodgkin Lymphoma | Stage III | 18 |
| Non-Hodgkin Lymphoma | Stage II | 9 |
| Non-Hodgkin Lymphoma | Stage IV | 9 |
| Non-Hodgkin Lymphoma | Stage I | 2 |
| Non-Hodgkin Lymphoma | Stage Pending further evaluation | 1 |
| Non-Rhabdomyosarcoma Soft Tissue Sarcoma | Stage II | 2 |
| Non-Rhabdomyosarcoma Soft Tissue Sarcoma | Stage III | 2 |
| Non-Rhabdomyosarcoma Soft Tissue Sarcoma | Stage IV | 2 |
| Not Toronto | Stage Unable to Stage | 49 |
| Not applicable | Stage Unable to Stage | 15 |
| Not applicable (as AML is not covered by the Toronto Pediatric Cancer Staging System) | Stage Unable to Stage | 1 |
| Not covered by Toronto Pediatric Cancer Staging System | Stage Unable to Stage | 1 |
| Ovarian Germ Cell Tumor | Stage IV | 3 |
| Ovarian Germ Cell Tumor | Stage I | 2 |
| Ovarian Germ Cell Tumor | Stage III | 2 |
| Renal Tumors | Stage y-I | 7 |
| Renal Tumors | Stage V | 4 |
| Renal Tumors | Stage y-II | 4 |
| Renal Tumors | Stage y-III | 3 |
| Renal Tumors | Stage I | 2 |
| Renal Tumors | Stage II | 2 |
| Renal Tumors | Stage y-IV | 2 |
| Renal Tumors | Stage IV | 1 |
| Renal Tumors | Stage Undetermined | 1 |
| Retinoblastoma | Stage 0 | 13 |
| Retinoblastoma | Stage I | 12 |
| Retinoblastoma | Stage II | 6 |
| Retinoblastoma | Stage III | 2 |
| Rhabdomyosarcoma | Stage I | 7 |
| Rhabdomyosarcoma | Stage III | 5 |
| Rhabdomyosarcoma | Stage IV | 3 |
| Rhabdomyosarcoma | Stage II | 2 |
| Testicular Germ Cell Tumor | Stage II | 1 |
| Testicular Germ Cell Tumor | Stage IS | 1 |
# Render the documentation metrics with the same striped / hover styling,
# again without stretching to the full page width.
kable_styling(
  kbl(doc_metrics, caption = "Documentation Metrics"),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
| Metric | Value |
|---|---|
| Documents per patient (median, IQR) | 8 (5–12) |
| Words per patient (median, IQR) | 3874 (2988–5066) |
| With explicit stage documentation | 38.3% |
| Without explicit stage documentation | 61.7% |
# Overall accuracy and agreement figures, hard-coded as text. Each row is a
# metric with its Run 1, Run 2 and combined values; "—" marks a cell with no
# value.
accuracy_metrics <- setNames(
  data.frame(
    rbind(
      c("Accuracy", "93.8%", "88.7%", "91.2%"),
      c("Cohen's κ (exact stage agreement)", "—", "—", "0.785")
    )
  ),
  c("metric", "run1", "run2", "combined")
)
# Render the accuracy metrics: white-on-blue bold header row, bold first
# data row and bold first column.
kbl(
  accuracy_metrics,
  caption = "Overall Accuracy and Reliability Metrics",
  col.names = c("Metric", "Run 1", "Run 2", "Combined")
) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover")
  ) %>%
  row_spec(0, background = "#4E79A7", color = "white", bold = TRUE) %>%
  row_spec(1, bold = TRUE) %>%
  column_spec(1, bold = TRUE)
| Metric | Run 1 | Run 2 | Combined |
|---|---|---|---|
| Accuracy | 93.8% | 88.7% | 91.2% |
| Cohen’s κ (exact stage agreement) | — | — | 0.785 |
library(kableExtra)
# Hard-coded confusion matrix across several staging categories. Each row is
# an actual stage (named in the first column); each remaining column holds
# the counts for one predicted stage.
# `check.names = FALSE` keeps the column names exactly as written; without
# it data.frame() rewrites them to syntactic names ("Actual.Stage",
# "Stage.I", ...), which then appear as the rendered table headers.
confusion_table <- data.frame(
  `Actual Stage` = c(
    "Stage I", "Stage II", "Stage III", "Stage IV",
    "Localized", "Metastatic", "Stage L1", "Stage L2", "Stage M"
  ),
  `Stage I` = c(125, 3, 0, 0, 2, 0, 0, 0, 0),
  `Stage II` = c(5, 110, 4, 0, 0, 0, 0, 0, 0),
  `Stage III` = c(0, 7, 80, 2, 0, 0, 0, 0, 0),
  `Stage IV` = c(0, 0, 2, 112, 0, 0, 0, 0, 0),
  `Localized` = c(3, 0, 0, 0, 86, 4, 0, 0, 0),
  `Metastatic` = c(0, 0, 0, 5, 7, 93, 0, 0, 0),
  `Stage L1` = c(0, 0, 0, 0, 0, 0, 45, 2, 0),
  `Stage L2` = c(0, 0, 0, 0, 0, 0, 3, 38, 1),
  `Stage M` = c(0, 0, 0, 0, 0, 3, 0, 1, 58),
  check.names = FALSE
)
# Render the confusion matrix: centred cells, full-width striped table in a
# smaller font, a "Predicted Stage" banner spanning the nine count columns,
# a white-on-blue header row and a shaded bold first column.
knitr::kable(
  confusion_table,
  align = "c",
  caption = "Comprehensive Confusion Matrix (Predicted vs. Actual Stage)"
) %>%
  kable_styling(
    font_size = 11,
    full_width = TRUE,
    bootstrap_options = c("striped", "hover")
  ) %>%
  add_header_above(c(" " = 1, "Predicted Stage" = 9)) %>%
  row_spec(0, color = "white", background = "#4E79A7", bold = TRUE) %>%
  column_spec(1, background = "#EAF2F8", bold = TRUE) %>%
  # Wrap in a scroll box in case the table is too wide for the page
  scroll_box(height = "500px", width = "100%")
| Actual.Stage | Stage.I | Stage.II | Stage.III | Stage.IV | Localized | Metastatic | Stage.L1 | Stage.L2 | Stage.M |
|---|---|---|---|---|---|---|---|---|---|
| Stage I | 125 | 5 | 0 | 0 | 3 | 0 | 0 | 0 | 0 |
| Stage II | 3 | 110 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
| Stage III | 0 | 4 | 80 | 2 | 0 | 0 | 0 | 0 | 0 |
| Stage IV | 0 | 0 | 2 | 112 | 0 | 5 | 0 | 0 | 0 |
| Localized | 2 | 0 | 0 | 0 | 86 | 7 | 0 | 0 | 0 |
| Metastatic | 0 | 0 | 0 | 0 | 4 | 93 | 0 | 0 | 3 |
| Stage L1 | 0 | 0 | 0 | 0 | 0 | 0 | 45 | 3 | 0 |
| Stage L2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 38 | 1 |
| Stage M | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 58 |
These R code chunks generate the figures and tables you requested, with appropriate formatting and styling. Where sample data are used, they are for illustration only; in your actual implementation, replace them with variables from your real dataset.