# Load Required Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Define Path

data_dir <- "C:/Users/ravit/OneDrive/Desktop/Flatrion Project"

dx_path   <- file.path(data_dir, "Patient_Diagnosis.csv")
tx_path   <- file.path(data_dir, "Patient_Treatment.csv")
demo_path <- file.path(data_dir, "Patient_Demographics.csv")
# Read Input Data
dx   <- read_csv(dx_path, show_col_types = FALSE)
tx   <- read_csv(tx_path, show_col_types = FALSE)
demo <- read_csv(demo_path, show_col_types = FALSE)
# Initial Data Review & Validation

# Inspect structure
glimpse(dx)
## Rows: 57
## Columns: 5
## $ patient_id     <dbl> 2120, 2720, 2038, 2238, 2175, 2475, 2407, 2607, 2425, 3…
## $ diagnosis_date <chr> "1/9/10", "1/9/10", "1/21/10", "1/21/10", "2/17/10", "2…
## $ diagnosis_code <dbl> 174.1, 174.1, 174.9, 174.9, 174.7, 174.7, 174.9, 174.9,…
## $ diagnosis      <chr> "Breast Cancer", "Breast Cancer", "Breast Cancer", "Bre…
## $ stage_dx       <chr> "IV", "II", "III", "IV", "III", "I", "I", "III", "II", …
glimpse(tx)
## Rows: 1,096
## Columns: 3
## $ patient_id     <dbl> 2720, 2238, 2120, 2038, 2120, 2038, 2120, 2120, 2038, 2…
## $ treatment_date <chr> "1/20/10", "1/21/10", "1/23/10", "1/24/10", "1/24/10", …
## $ drug_code      <chr> "B", "B", "B", "A", "A", "B", "A", "B", "A", "B", "A", …
glimpse(demo)
## Rows: 47
## Columns: 4
## $ patient_id <dbl> 2038, 2120, 2175, 2238, 2407, 2425, 2462, 2475, 2607, 2634,…
## $ birth_sex  <chr> "F", "F", "F", "F", "M", "M", "F", "F", "F", "F", "F", "F",…
## $ birth_date <date> 1939-10-16, 1964-10-22, 1946-12-03, 1986-11-12, 1945-07-02…
## $ region     <chr> "South", "Mid-west", "Northeast", "Northeast", "Mid-west", …
# Explicitly checking no missing values (per instructions)
dx   %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 5
##   patient_id diagnosis_date diagnosis_code diagnosis stage_dx
##        <int>          <int>          <int>     <int>    <int>
## 1          0              0              0         0        0
tx   %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 3
##   patient_id treatment_date drug_code
##        <int>          <int>     <int>
## 1          0              0         0
demo %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 4
##   patient_id birth_sex birth_date region
##        <int>     <int>      <int>  <int>
## 1          0         0          0      0
# Parse and Standardize Dates
# Diagnosis and treatment dates are mm/dd/yy
dx <- dx %>%
  mutate(diagnosis_date = mdy(diagnosis_date))

tx <- tx %>%
  mutate(treatment_date = mdy(treatment_date))

# Birth date is yyyy-mm-dd
demo <- demo %>%
  mutate(birth_date = ymd(birth_date))
# Building Patient-Level Cancer Group Classification

dx_by_patient <- dx %>%
  group_by(patient_id, diagnosis) %>%
  summarise(
    first_dx_for_cancer = min(diagnosis_date),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from  = diagnosis,
    values_from = first_dx_for_cancer
  )

patient_cancer_group <- dx_by_patient %>%
  mutate(
    cancer_group = case_when(
      !is.na(`Breast Cancer`) & !is.na(`Colon Cancer`) ~ "Both breast and colon cancer",
      !is.na(`Breast Cancer`) &  is.na(`Colon Cancer`) ~ "Breast cancer only",
       is.na(`Breast Cancer`) & !is.na(`Colon Cancer`) ~ "Colon cancer only",
      TRUE ~ "Unknown"
    )
  ) %>%
  select(patient_id, cancer_group)
# Question 1: Diagnosis Distribution

q1_results <- patient_cancer_group %>%
  count(cancer_group, name = "n_patients") %>%
  mutate(
    proportion = n_patients / sum(n_patients)
  ) %>%
  arrange(desc(n_patients))

q1_results
## # A tibble: 3 × 3
##   cancer_group                 n_patients proportion
##   <chr>                             <int>      <dbl>
## 1 Breast cancer only                   31      0.660
## 2 Colon cancer only                    11      0.234
## 3 Both breast and colon cancer          5      0.106
# Question 2: Time from Diagnosis to Treatment
earliest_dx <- dx %>%
  group_by(patient_id) %>%
  summarise(
    earliest_diagnosis = min(diagnosis_date),
    .groups = "drop"
  )

first_treatment <- tx %>%
  group_by(patient_id) %>%
  summarise(
    first_treatment_date = min(treatment_date),
    .groups = "drop"
  )

q2_patient_level <- patient_cancer_group %>%
  left_join(earliest_dx, by = "patient_id") %>%
  left_join(first_treatment, by = "patient_id") %>%
  mutate(
    days_to_treatment = as.integer(first_treatment_date - earliest_diagnosis)
  )

# Surface potential data issues without excluding them
q2_patient_level %>%
  filter(is.na(first_treatment_date) | days_to_treatment < 0)
## # A tibble: 2 × 5
##   patient_id cancer_group       earliest_diagnosis first_treatment_date
##        <dbl> <chr>              <date>             <date>              
## 1       4256 Breast cancer only 2011-11-07         NA                  
## 2       8827 Breast cancer only 2013-07-21         2013-07-18          
## # ℹ 1 more variable: days_to_treatment <int>
q2_results <- q2_patient_level %>%
  group_by(cancer_group) %>%
  summarise(
    mean_days   = mean(days_to_treatment, na.rm = TRUE),
    median_days = median(days_to_treatment, na.rm = TRUE),
    min_days    = min(days_to_treatment, na.rm = TRUE),
    max_days    = max(days_to_treatment, na.rm = TRUE),
    .groups = "drop"
  )

q2_results
## # A tibble: 3 × 5
##   cancer_group                 mean_days median_days min_days max_days
##   <chr>                            <dbl>       <dbl>    <int>    <int>
## 1 Both breast and colon cancer      8              7        7       11
## 2 Breast cancer only                5.17           5       -3       20
## 3 Colon cancer only                30.3            4        0      304
# Question 3: First-Line Treatment Regimens

tx_first_instance <- tx %>%
  left_join(first_treatment, by = "patient_id") %>%
  filter(treatment_date == first_treatment_date)

first_line_regimen <- tx_first_instance %>%
  group_by(patient_id) %>%
  summarise(
    regimen = paste(sort(unique(drug_code)), collapse = "+"),
    .groups = "drop"
  )

q3_results <- patient_cancer_group %>%
  left_join(first_line_regimen, by = "patient_id") %>%
  count(cancer_group, regimen, name = "n_patients") %>%
  group_by(cancer_group) %>%
  mutate(proportion = n_patients / sum(n_patients)) %>%
  ungroup()

q3_results
## # A tibble: 10 × 4
##    cancer_group                 regimen n_patients proportion
##    <chr>                        <chr>        <int>      <dbl>
##  1 Both breast and colon cancer A+B              1     0.2   
##  2 Both breast and colon cancer C                4     0.8   
##  3 Breast cancer only           A                4     0.129 
##  4 Breast cancer only           A+B             17     0.548 
##  5 Breast cancer only           B                7     0.226 
##  6 Breast cancer only           C                2     0.0645
##  7 Breast cancer only           <NA>             1     0.0323
##  8 Colon cancer only            A+B              3     0.273 
##  9 Colon cancer only            B                4     0.364 
## 10 Colon cancer only            D                4     0.364
# Question 4: Patient characteristics table

# Identify whether the FIRST diagnosis was Breast or Colon cancer

first_dx_type <- dx_by_patient %>%
  transmute(
    patient_id,
    first_dx_type = case_when(
      is.na(`Colon Cancer`) ~ "Breast cancer first",
      is.na(`Breast Cancer`) ~ "Colon cancer first",
      `Breast Cancer` < `Colon Cancer` ~ "Breast cancer first",
      `Colon Cancer` < `Breast Cancer` ~ "Colon cancer first",
      TRUE ~ "Both diagnosed same day"
    )
  )



# Determining stage at earliest diagnosis
stage_levels <- c("I", "II", "III", "IV")

stage_at_first_dx <- dx %>%
  left_join(earliest_dx, by = "patient_id") %>%
  filter(diagnosis_date == earliest_diagnosis) %>%
  mutate(stage_dx = factor(stage_dx, levels = stage_levels, ordered = TRUE)) %>%
  group_by(patient_id) %>%
  summarise(
    stage_at_first_dx = as.character(max(stage_dx)),
    .groups = "drop"
  )



# Calculate age at earliest diagnosis


age_at_dx <- earliest_dx %>%
  left_join(demo, by = "patient_id") %>%
  mutate(
    age_at_first_dx =
      floor(time_length(interval(birth_date, earliest_diagnosis), "years"))
  ) %>%
  select(patient_id, age_at_first_dx)


# Assemble dataset for Table 1

q4_data <- demo %>%
  left_join(first_dx_type, by = "patient_id") %>%
  left_join(age_at_dx, by = "patient_id") %>%
  left_join(stage_at_first_dx, by = "patient_id")



# Helper formatting functions (simple and readable)

fmt_mean_sd <- function(x) {
  x <- x[!is.na(x)]
  sprintf("%.1f (%.1f)", mean(x), sd(x))
}

fmt_median_range <- function(x) {
  x <- x[!is.na(x)]
  sprintf("%.1f [%d, %d]", median(x), min(x), max(x))
}

fmt_n_pct <- function(n, total) {
  sprintf("%d (%.1f%%)", n, 100 * n / total)
}


# Age summary by first diagnosis type

age_summary <- q4_data %>%
  group_by(first_dx_type) %>%
  summarise(
    N = n(),
    `Age (mean (SD))` = fmt_mean_sd(age_at_first_dx),
    `Age (median [min, max])` = fmt_median_range(age_at_first_dx),
    .groups = "drop"
  )

age_summary
## # A tibble: 2 × 4
##   first_dx_type           N `Age (mean (SD))` `Age (median [min, max])`
##   <chr>               <int> <chr>             <chr>                    
## 1 Breast cancer first    32 54.9 (16.5)       58.0 [18, 76]            
## 2 Colon cancer first     15 59.4 (19.1)       58.0 [33, 90]
# Sex assigned at birth summary

sex_summary <- q4_data %>%
  count(first_dx_type, birth_sex, name = "n") %>%
  group_by(first_dx_type) %>%
  mutate(
    total = sum(n),
    value = fmt_n_pct(n, total)
  ) %>%
  ungroup() %>%
  transmute(
    Variable = "Sex assigned at birth",
    Category = birth_sex,
    first_dx_type,
    Value = value
  )


# Region summary

region_summary <- q4_data %>%
  count(first_dx_type, region, name = "n") %>%
  group_by(first_dx_type) %>%
  mutate(
    total = sum(n),
    value = fmt_n_pct(n, total)
  ) %>%
  ungroup() %>%
  transmute(
    Variable = "Region",
    Category = region,
    first_dx_type,
    Value = value
  )



# Stage at diagnosis summary

stage_summary <- q4_data %>%
  count(first_dx_type, stage_at_first_dx, name = "n") %>%
  group_by(first_dx_type) %>%
  mutate(
    total = sum(n),
    value = fmt_n_pct(n, total)
  ) %>%
  ungroup() %>%
  transmute(
    Variable = "Stage at diagnosis (earliest)",
    Category = stage_at_first_dx,
    first_dx_type,
    Value = value
  )



# Combine categorical summaries into one table


table1_long <- bind_rows(
  sex_summary,
  region_summary,
  stage_summary
)

table1_long
## # A tibble: 20 × 4
##    Variable                      Category  first_dx_type       Value     
##    <chr>                         <chr>     <chr>               <chr>     
##  1 Sex assigned at birth         F         Breast cancer first 27 (84.4%)
##  2 Sex assigned at birth         M         Breast cancer first 5 (15.6%) 
##  3 Sex assigned at birth         F         Colon cancer first  10 (66.7%)
##  4 Sex assigned at birth         M         Colon cancer first  5 (33.3%) 
##  5 Region                        Mid-west  Breast cancer first 12 (37.5%)
##  6 Region                        Northeast Breast cancer first 8 (25.0%) 
##  7 Region                        South     Breast cancer first 7 (21.9%) 
##  8 Region                        West      Breast cancer first 5 (15.6%) 
##  9 Region                        Mid-west  Colon cancer first  1 (6.7%)  
## 10 Region                        Northeast Colon cancer first  5 (33.3%) 
## 11 Region                        South     Colon cancer first  6 (40.0%) 
## 12 Region                        West      Colon cancer first  3 (20.0%) 
## 13 Stage at diagnosis (earliest) I         Breast cancer first 7 (21.9%) 
## 14 Stage at diagnosis (earliest) II        Breast cancer first 12 (37.5%)
## 15 Stage at diagnosis (earliest) III       Breast cancer first 9 (28.1%) 
## 16 Stage at diagnosis (earliest) IV        Breast cancer first 4 (12.5%) 
## 17 Stage at diagnosis (earliest) I         Colon cancer first  2 (13.3%) 
## 18 Stage at diagnosis (earliest) II        Colon cancer first  4 (26.7%) 
## 19 Stage at diagnosis (earliest) III       Colon cancer first  2 (13.3%) 
## 20 Stage at diagnosis (earliest) IV        Colon cancer first  7 (46.7%)
# Optional: Convert to wide format for cleaner presentation


table1_wide <- table1_long %>%
  pivot_wider(
    names_from  = first_dx_type,
    values_from = Value
  ) %>%
  arrange(Variable, Category)

table1_wide
## # A tibble: 10 × 4
##    Variable                  Category `Breast cancer first` `Colon cancer first`
##    <chr>                     <chr>    <chr>                 <chr>               
##  1 Region                    Mid-west 12 (37.5%)            1 (6.7%)            
##  2 Region                    Northea… 8 (25.0%)             5 (33.3%)           
##  3 Region                    South    7 (21.9%)             6 (40.0%)           
##  4 Region                    West     5 (15.6%)             3 (20.0%)           
##  5 Sex assigned at birth     F        27 (84.4%)            10 (66.7%)          
##  6 Sex assigned at birth     M        5 (15.6%)             5 (33.3%)           
##  7 Stage at diagnosis (earl… I        7 (21.9%)             2 (13.3%)           
##  8 Stage at diagnosis (earl… II       12 (37.5%)            4 (26.7%)           
##  9 Stage at diagnosis (earl… III      9 (28.1%)             2 (13.3%)           
## 10 Stage at diagnosis (earl… IV       4 (12.5%)             7 (46.7%)