# Load Required Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Define Path
data_dir <- "C:/Users/ravit/OneDrive/Desktop/Flatrion Project"
dx_path <- file.path(data_dir, "Patient_Diagnosis.csv")
tx_path <- file.path(data_dir, "Patient_Treatment.csv")
demo_path <- file.path(data_dir, "Patient_Demographics.csv")
# Read Input Data
dx <- read_csv(dx_path, show_col_types = FALSE)
tx <- read_csv(tx_path, show_col_types = FALSE)
demo <- read_csv(demo_path, show_col_types = FALSE)
# Initial Data Review & Validation
# Inspect structure
glimpse(dx)
## Rows: 57
## Columns: 5
## $ patient_id <dbl> 2120, 2720, 2038, 2238, 2175, 2475, 2407, 2607, 2425, 3…
## $ diagnosis_date <chr> "1/9/10", "1/9/10", "1/21/10", "1/21/10", "2/17/10", "2…
## $ diagnosis_code <dbl> 174.1, 174.1, 174.9, 174.9, 174.7, 174.7, 174.9, 174.9,…
## $ diagnosis <chr> "Breast Cancer", "Breast Cancer", "Breast Cancer", "Bre…
## $ stage_dx <chr> "IV", "II", "III", "IV", "III", "I", "I", "III", "II", …
glimpse(tx)
## Rows: 1,096
## Columns: 3
## $ patient_id <dbl> 2720, 2238, 2120, 2038, 2120, 2038, 2120, 2120, 2038, 2…
## $ treatment_date <chr> "1/20/10", "1/21/10", "1/23/10", "1/24/10", "1/24/10", …
## $ drug_code <chr> "B", "B", "B", "A", "A", "B", "A", "B", "A", "B", "A", …
glimpse(demo)
## Rows: 47
## Columns: 4
## $ patient_id <dbl> 2038, 2120, 2175, 2238, 2407, 2425, 2462, 2475, 2607, 2634,…
## $ birth_sex <chr> "F", "F", "F", "F", "M", "M", "F", "F", "F", "F", "F", "F",…
## $ birth_date <date> 1939-10-16, 1964-10-22, 1946-12-03, 1986-11-12, 1945-07-02…
## $ region <chr> "South", "Mid-west", "Northeast", "Northeast", "Mid-west", …
# Explicitly checking no missing values (per instructions)
dx %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 5
## patient_id diagnosis_date diagnosis_code diagnosis stage_dx
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
tx %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 3
## patient_id treatment_date drug_code
## <int> <int> <int>
## 1 0 0 0
demo %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 4
## patient_id birth_sex birth_date region
## <int> <int> <int> <int>
## 1 0 0 0 0
# Parse and Standardize Dates
# Diagnosis and treatment dates are mm/dd/yy
dx <- dx %>%
mutate(diagnosis_date = mdy(diagnosis_date))
tx <- tx %>%
mutate(treatment_date = mdy(treatment_date))
# Birth date is yyyy-mm-dd
demo <- demo %>%
mutate(birth_date = ymd(birth_date))
# Building Patient-Level Cancer Group Classification
dx_by_patient <- dx %>%
group_by(patient_id, diagnosis) %>%
summarise(
first_dx_for_cancer = min(diagnosis_date),
.groups = "drop"
) %>%
pivot_wider(
names_from = diagnosis,
values_from = first_dx_for_cancer
)
patient_cancer_group <- dx_by_patient %>%
mutate(
cancer_group = case_when(
!is.na(`Breast Cancer`) & !is.na(`Colon Cancer`) ~ "Both breast and colon cancer",
!is.na(`Breast Cancer`) & is.na(`Colon Cancer`) ~ "Breast cancer only",
is.na(`Breast Cancer`) & !is.na(`Colon Cancer`) ~ "Colon cancer only",
TRUE ~ "Unknown"
)
) %>%
select(patient_id, cancer_group)
# Question 1: Diagnosis Distribution
q1_results <- patient_cancer_group %>%
count(cancer_group, name = "n_patients") %>%
mutate(
proportion = n_patients / sum(n_patients)
) %>%
arrange(desc(n_patients))
q1_results
## # A tibble: 3 × 3
## cancer_group n_patients proportion
## <chr> <int> <dbl>
## 1 Breast cancer only 31 0.660
## 2 Colon cancer only 11 0.234
## 3 Both breast and colon cancer 5 0.106
# Question 2: Time from Diagnosis to Treatment
earliest_dx <- dx %>%
group_by(patient_id) %>%
summarise(
earliest_diagnosis = min(diagnosis_date),
.groups = "drop"
)
first_treatment <- tx %>%
group_by(patient_id) %>%
summarise(
first_treatment_date = min(treatment_date),
.groups = "drop"
)
q2_patient_level <- patient_cancer_group %>%
left_join(earliest_dx, by = "patient_id") %>%
left_join(first_treatment, by = "patient_id") %>%
mutate(
days_to_treatment = as.integer(first_treatment_date - earliest_diagnosis)
)
# Surface potential data issues without excluding them
q2_patient_level %>%
filter(is.na(first_treatment_date) | days_to_treatment < 0)
## # A tibble: 2 × 5
## patient_id cancer_group earliest_diagnosis first_treatment_date
## <dbl> <chr> <date> <date>
## 1 4256 Breast cancer only 2011-11-07 NA
## 2 8827 Breast cancer only 2013-07-21 2013-07-18
## # ℹ 1 more variable: days_to_treatment <int>
q2_results <- q2_patient_level %>%
group_by(cancer_group) %>%
summarise(
mean_days = mean(days_to_treatment, na.rm = TRUE),
median_days = median(days_to_treatment, na.rm = TRUE),
min_days = min(days_to_treatment, na.rm = TRUE),
max_days = max(days_to_treatment, na.rm = TRUE),
.groups = "drop"
)
q2_results
## # A tibble: 3 × 5
## cancer_group mean_days median_days min_days max_days
## <chr> <dbl> <dbl> <int> <int>
## 1 Both breast and colon cancer 8 7 7 11
## 2 Breast cancer only 5.17 5 -3 20
## 3 Colon cancer only 30.3 4 0 304
# Question 3: First-Line Treatment Regimens
tx_first_instance <- tx %>%
left_join(first_treatment, by = "patient_id") %>%
filter(treatment_date == first_treatment_date)
first_line_regimen <- tx_first_instance %>%
group_by(patient_id) %>%
summarise(
regimen = paste(sort(unique(drug_code)), collapse = "+"),
.groups = "drop"
)
q3_results <- patient_cancer_group %>%
left_join(first_line_regimen, by = "patient_id") %>%
count(cancer_group, regimen, name = "n_patients") %>%
group_by(cancer_group) %>%
mutate(proportion = n_patients / sum(n_patients)) %>%
ungroup()
q3_results
## # A tibble: 10 × 4
## cancer_group regimen n_patients proportion
## <chr> <chr> <int> <dbl>
## 1 Both breast and colon cancer A+B 1 0.2
## 2 Both breast and colon cancer C 4 0.8
## 3 Breast cancer only A 4 0.129
## 4 Breast cancer only A+B 17 0.548
## 5 Breast cancer only B 7 0.226
## 6 Breast cancer only C 2 0.0645
## 7 Breast cancer only <NA> 1 0.0323
## 8 Colon cancer only A+B 3 0.273
## 9 Colon cancer only B 4 0.364
## 10 Colon cancer only D 4 0.364
# Question 4: Patient characteristics table
# Identify whether the FIRST diagnosis was Breast or Colon cancer
first_dx_type <- dx_by_patient %>%
transmute(
patient_id,
first_dx_type = case_when(
is.na(`Colon Cancer`) ~ "Breast cancer first",
is.na(`Breast Cancer`) ~ "Colon cancer first",
`Breast Cancer` < `Colon Cancer` ~ "Breast cancer first",
`Colon Cancer` < `Breast Cancer` ~ "Colon cancer first",
TRUE ~ "Both diagnosed same day"
)
)
# Determining stage at earliest diagnosis
stage_levels <- c("I", "II", "III", "IV")
stage_at_first_dx <- dx %>%
left_join(earliest_dx, by = "patient_id") %>%
filter(diagnosis_date == earliest_diagnosis) %>%
mutate(stage_dx = factor(stage_dx, levels = stage_levels, ordered = TRUE)) %>%
group_by(patient_id) %>%
summarise(
stage_at_first_dx = as.character(max(stage_dx)),
.groups = "drop"
)
# Calculate age at earliest diagnosis
age_at_dx <- earliest_dx %>%
left_join(demo, by = "patient_id") %>%
mutate(
age_at_first_dx =
floor(time_length(interval(birth_date, earliest_diagnosis), "years"))
) %>%
select(patient_id, age_at_first_dx)
# Assemble dataset for Table 1
q4_data <- demo %>%
left_join(first_dx_type, by = "patient_id") %>%
left_join(age_at_dx, by = "patient_id") %>%
left_join(stage_at_first_dx, by = "patient_id")
# Helper formatting functions (simple and readable)
fmt_mean_sd <- function(x) {
x <- x[!is.na(x)]
sprintf("%.1f (%.1f)", mean(x), sd(x))
}
fmt_median_range <- function(x) {
x <- x[!is.na(x)]
sprintf("%.1f [%d, %d]", median(x), min(x), max(x))
}
fmt_n_pct <- function(n, total) {
sprintf("%d (%.1f%%)", n, 100 * n / total)
}
# Age summary by first diagnosis type
age_summary <- q4_data %>%
group_by(first_dx_type) %>%
summarise(
N = n(),
`Age (mean (SD))` = fmt_mean_sd(age_at_first_dx),
`Age (median [min, max])` = fmt_median_range(age_at_first_dx),
.groups = "drop"
)
age_summary
## # A tibble: 2 × 4
## first_dx_type N `Age (mean (SD))` `Age (median [min, max])`
## <chr> <int> <chr> <chr>
## 1 Breast cancer first 32 54.9 (16.5) 58.0 [18, 76]
## 2 Colon cancer first 15 59.4 (19.1) 58.0 [33, 90]
# Sex assigned at birth summary
sex_summary <- q4_data %>%
count(first_dx_type, birth_sex, name = "n") %>%
group_by(first_dx_type) %>%
mutate(
total = sum(n),
value = fmt_n_pct(n, total)
) %>%
ungroup() %>%
transmute(
Variable = "Sex assigned at birth",
Category = birth_sex,
first_dx_type,
Value = value
)
# Region summary
region_summary <- q4_data %>%
count(first_dx_type, region, name = "n") %>%
group_by(first_dx_type) %>%
mutate(
total = sum(n),
value = fmt_n_pct(n, total)
) %>%
ungroup() %>%
transmute(
Variable = "Region",
Category = region,
first_dx_type,
Value = value
)
# Stage at diagnosis summary
stage_summary <- q4_data %>%
count(first_dx_type, stage_at_first_dx, name = "n") %>%
group_by(first_dx_type) %>%
mutate(
total = sum(n),
value = fmt_n_pct(n, total)
) %>%
ungroup() %>%
transmute(
Variable = "Stage at diagnosis (earliest)",
Category = stage_at_first_dx,
first_dx_type,
Value = value
)
# Combine categorical summaries into one table
table1_long <- bind_rows(
sex_summary,
region_summary,
stage_summary
)
table1_long
## # A tibble: 20 × 4
## Variable Category first_dx_type Value
## <chr> <chr> <chr> <chr>
## 1 Sex assigned at birth F Breast cancer first 27 (84.4%)
## 2 Sex assigned at birth M Breast cancer first 5 (15.6%)
## 3 Sex assigned at birth F Colon cancer first 10 (66.7%)
## 4 Sex assigned at birth M Colon cancer first 5 (33.3%)
## 5 Region Mid-west Breast cancer first 12 (37.5%)
## 6 Region Northeast Breast cancer first 8 (25.0%)
## 7 Region South Breast cancer first 7 (21.9%)
## 8 Region West Breast cancer first 5 (15.6%)
## 9 Region Mid-west Colon cancer first 1 (6.7%)
## 10 Region Northeast Colon cancer first 5 (33.3%)
## 11 Region South Colon cancer first 6 (40.0%)
## 12 Region West Colon cancer first 3 (20.0%)
## 13 Stage at diagnosis (earliest) I Breast cancer first 7 (21.9%)
## 14 Stage at diagnosis (earliest) II Breast cancer first 12 (37.5%)
## 15 Stage at diagnosis (earliest) III Breast cancer first 9 (28.1%)
## 16 Stage at diagnosis (earliest) IV Breast cancer first 4 (12.5%)
## 17 Stage at diagnosis (earliest) I Colon cancer first 2 (13.3%)
## 18 Stage at diagnosis (earliest) II Colon cancer first 4 (26.7%)
## 19 Stage at diagnosis (earliest) III Colon cancer first 2 (13.3%)
## 20 Stage at diagnosis (earliest) IV Colon cancer first 7 (46.7%)
# Optional: Convert to wide format for cleaner presentation
table1_wide <- table1_long %>%
pivot_wider(
names_from = first_dx_type,
values_from = Value
) %>%
arrange(Variable, Category)
table1_wide
## # A tibble: 10 × 4
## Variable Category `Breast cancer first` `Colon cancer first`
## <chr> <chr> <chr> <chr>
## 1 Region Mid-west 12 (37.5%) 1 (6.7%)
## 2 Region Northea… 8 (25.0%) 5 (33.3%)
## 3 Region South 7 (21.9%) 6 (40.0%)
## 4 Region West 5 (15.6%) 3 (20.0%)
## 5 Sex assigned at birth F 27 (84.4%) 10 (66.7%)
## 6 Sex assigned at birth M 5 (15.6%) 5 (33.3%)
## 7 Stage at diagnosis (earl… I 7 (21.9%) 2 (13.3%)
## 8 Stage at diagnosis (earl… II 12 (37.5%) 4 (26.7%)
## 9 Stage at diagnosis (earl… III 9 (28.1%) 2 (13.3%)
## 10 Stage at diagnosis (earl… IV 4 (12.5%) 7 (46.7%)