Loading libraries/packages & paths

library(openxlsx) # package to open xlsx files
library(psych) # package for psychometric research (e.g., describe)
library(plyr) # tools for splitting, applying, and combining data (use for ddply)
library(lubridate) # to format dates and times
library(stringr) # for string operations
library(dplyr) # for data manipulation 
library(fst) # lightning fast serialization of data frames for R (saves data in smaller file sizes; loads more quickly)
library(tidyr) # for pivot_wider to reshape data

raw_data_path <- "/Users/emurbanw/Dropbox (University of Michigan)/MED-ARSIs-Depression-ProstateCancer/data/raw/"
clean_data_path <- "/Users/emurbanw/Dropbox (University of Michigan)/MED-ARSIs-Depression-ProstateCancer/data/clean/"

# What are the sheet names in the xlsx datafile?
orch_sheets <- getSheetNames(paste0(raw_data_path, "Orch_Feb1.xlsx"))
adt_sheets <- getSheetNames(paste0(raw_data_path, "ADTs_Feb1.xlsx"))
print(orch_sheets)
print(adt_sheets)

Note

# This Rmd file cleans date from the demographics, encounters, diagnoses, and cancer registry views of Michigan Medicine's DataDirect. 
# There are two xlsx files that contain all the data: 
# One is for patients pulled from the orchiectomy cohort- those with prostate cancer who received an orchiectomy procedure.
# The other is for patients with prostate cancer who also received at least one of five ADT drugs (leuprolide, goserelin, triptorelin, degarelix, or relugolix). 
# 
# In this Rmd file, for each output view, we first append the sheets of interest between the Orch and ADT cohorts. 
# Then we clean the variables (e.g., make usable dates, harmonize discrepant dates, merge encounters/diagnoeses, date prostate cancer diagnosis, etc.).
# We save several DFs throughout the process. 

Demographics

Loading & appending raw data

# Opening separate data frames and indicating which df it came from
demos_orch <- read.xlsx(paste0(raw_data_path, "Orch_Feb1.xlsx"), sheet = "DemographicInfo")
demos_orch$adt_orch <- 1
demos_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DemographicInfo")
demos_adt$adt_orch <- 0

# Appending the dataframes
merged <- rbind(demos_adt, demos_orch)

# Determining unique cases and overlap cases
length(unique(merged$PatientID)) # 5442 unique IDs but 5452 cases, meaning 10 IDs are in both datasets

orch_IDs <- unique(demos_orch$PatientID)
adt_IDs <- unique(demos_adt$PatientID)
overlap_IDs <- intersect(adt_IDs, orch_IDs)

# Assigning a "1" to individuals who have overlap between both dataframes
merged$overlap <- ifelse(merged$PatientID %in% overlap_IDs, 1, 0)

# Taking a look at the overlap cases; all repeated rows have the exact same values and are total duplicates
overlap <- merged[ which(merged$overlap==1),]

# I am deleting the duplicated rows (specifically, I will keep those that are flagged as orch cases because there's likely a reason they were flagged for orchiectomy)
merged <- subset(merged, !(adt_orch == 0 & overlap == 1))

# Ordering rows by PatientID
merged <- merged[order(merged$PatientID), ]

Harmonizing deceased date

# There are two variables that note date of death for some patients. In some cases, the reported date of death differs between the two variables. This section harmonizes the date of death and selects the earliest one as the date we'll use when there are discrepancies between the two variables. 

# classifying as POSIX variable and dropping the time portion
merged$DeceasedDate <- as.POSIXct(merged$DeceasedDate, format = "%m/%d/%Y")
merged$DeathIndexDeceasedDate <- as.POSIXct(merged$DeathIndexDeceasedDate, format = "%m/%d/%Y")

# computing difference between variables
merged$DOD_diff <- merged$DeathIndexDeceasedDate - merged$DeceasedDate
merged$DOD_diff <- as.numeric(merged$DOD_diff, "days")

# investigating differences
describe(merged$DOD_diff) # difference in dates ranging from -92 to 243 days
summary(merged$DeceasedDate) # 4406 NAs
summary(merged$DeathIndexDeceasedDate) # 3824 NAs
length(merged$DeceasedDate[!is.na(merged$DeathIndexDeceasedDate) & is.na(merged$DeceasedDate)]) # 645 have death index and not UM
length(merged$DeceasedDate[is.na(merged$DeathIndexDeceasedDate) & !is.na(merged$DeceasedDate)]) # 63 have UM variable and not death index
length(merged$DeceasedDate[!is.na(merged$DeathIndexDeceasedDate) & !is.na(merged$DeceasedDate)]) # 973 have both

# Making separate DFs based on which date varaibles each row has
merged$DOD_date_var[!is.na(merged$DeathIndexDeceasedDate) & is.na(merged$DeceasedDate)] <- "DeathIndexDeceasedDate"
merged$DOD_date_var[is.na(merged$DeathIndexDeceasedDate) & !is.na(merged$DeceasedDate)] <- "DeceasedDate"
merged$DOD_date_var[!is.na(merged$DeathIndexDeceasedDate) & !is.na(merged$DeceasedDate)] <- "earlist death date"
merged$DOD_date_var[is.na(merged$DeathIndexDeceasedDate) & is.na(merged$DeceasedDate)] <- NA
table(merged$DOD_date_var)
a <- merged[ which(merged$DOD_date_var == "DeathIndexDeceasedDate"),]
b <- merged[ which(merged$DOD_date_var == "DeceasedDate"),]
c <- merged[ which(merged$DOD_date_var == "earlist death date"),]
d <- merged[ which(is.na(merged$DOD_date_var)),]

# making a deceased date based on other two date variables
# For people with only one DeceasedDate, we will use that date
# For people with both, we will use the earlier of the two dates
a$DOD_date <- a$DeathIndexDeceasedDate
b$DOD_date <- b$DeceasedDate
c$DOD_date <- pmin(c$DeathIndexDeceasedDate, c$DeceasedDate, na.rm = TRUE)
d$DOD_date <- NA

demos <- rbind(a, b, c, d)

# Re-ordering columns
demos <- demos[ ,c("PatientID", "adt_orch", "overlap", "GenderCode",  "RaceName", "RaceNameMultiple", "EthnicityName", "DOD_date", "DeathIndexCauseOfDeath", "DeathIndexUnderlyingCOD_ICD10",  "DOD_diff", "DOD_date_var", "DeceasedDate", "DeathIndexDeceasedDate")]

summary(demos$DOD_date) # earliest date of death on record: 10/25/2000; latest: 01/23/2024

# Sorting the data
demos <- demos[order(demos$PatientID), ]

Saving data

write.fst(demos, paste0(clean_data_path,"Demographics_merged.fst"))

# rm(a, b, c, d, demos_adt, demos_orch, merged, overlap)

Encounters

Loading & appending raw data

print(orch_sheets)
print(adt_sheets)

# Opening separate data frames
encounters_orch <- read.xlsx(paste0(raw_data_path, "Orch_Feb1.xlsx"), sheet = "EncounterAll")
encounters1_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "EncounterAll")
encounters2_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "EncounterAll (part 2)")

# Appending the dataframes
encounters <- rbind(encounters1_adt, encounters2_adt)
encounters <- rbind(encounters, encounters_orch)

# removing duplicated rows
print(nrow(encounters)) # 1,848,187
encounters <- encounters[!duplicated(encounters), ]
print(nrow(encounters)) # 1,848,068

# reformatting date
# AdmitDate is the date of the encounter. The date returned will be based on an algorithm that chooses the first non-null value available from the following list (in that order): HospitalAdmissionDate, CheckinTime, AppointmentTime, EncounterInstantDate, EncounterContactDate.
encounters$encounter_date <- as.POSIXct(encounters$AdmitDate, format = "%m/%d/%Y")

# Re-ordering columns (GETTING RID OF INSURANCE COLUMNS FOR NOW, COULD ADD BACK IN IF DESIRED: "Ins_A_PlanName", "Ins_A_PlanType", "Ins_B_PlanName", "Ins_B_PlanType", "Ins_C_PlanName", "Ins_C_PlanType", "Ins_D_PlanName", "Ins_D_PlanType", "Ins_E_PlanName", "Ins_E_PlanType")
encounters <- encounters[ ,c("PatientID", "EncounterID", "encounter_date", "AgeInYears", "HasMeds", "HasOrders", "HasProcedures", "EncounterTypeName", "PatientClassName", "DRGCode", "DRGName")]

# Sorting the data
encounters <- encounters[order(encounters$PatientID, encounters$EncounterID), ]

Saving data

write.fst(encounters, paste0(clean_data_path,"Encounters_merged.fst"))

# rm(encounters_orch, encounters1_adt, encounters2_adt)

Diagnoses

Loading & appending raw data

print(orch_sheets)
print(adt_sheets)

# Opening separate data frames and indicating which df it came from
diagnoses_orch <- read.xlsx(paste0(raw_data_path, "Orch_Feb1.xlsx"), sheet = "DiagnosesEverything")
diagnoses1_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything")
diagnoses2_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 2)")
diagnoses3_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 3)")
diagnoses4_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 4)")
diagnoses5_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 5)")
diagnoses6_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 6)")
diagnoses7_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosesEverything (part 7)")

# Appending the dataframes
diagnoses <- rbind(diagnoses_orch, diagnoses1_adt)
diagnoses <- rbind(diagnoses, diagnoses2_adt)
diagnoses <- rbind(diagnoses, diagnoses3_adt)
diagnoses <- rbind(diagnoses, diagnoses4_adt)
diagnoses <- rbind(diagnoses, diagnoses5_adt)
diagnoses <- rbind(diagnoses, diagnoses6_adt)
diagnoses <- rbind(diagnoses, diagnoses7_adt)

length(unique(diagnoses$PatientID)) # 5442

# removing duplicated rows
print(nrow(diagnoses)) # 7,004,156
diagnoses <- diagnoses[!duplicated(diagnoses), ]
print(nrow(diagnoses)) # 6,769,468

diagnoses <- diagnoses[order(diagnoses$PatientID, diagnoses$EncounterID), ]

Saving data

write.fst(diagnoses, paste0(clean_data_path,"Diagnoses_merged.fst"))

rm(diagnoses_orch, diagnoses1_adt, diagnoses2_adt, diagnoses3_adt, diagnoses4_adt, diagnoses5_adt, diagnoses6_adt, diagnoses7_adt)

DiagnosisPSL (do we need it? No. It duplicates all the information we already have/need from the diagnoses df)

# # Opening separate data frames
# PSL_orch <- read.xlsx(paste0(raw_data_path, "Orch_Feb1.xlsx"), sheet = "DiagnosisPSL")
# PSL_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "DiagnosisPSL")
# 
# # Appending the dataframes
# PSL <- rbind(PSL_adt, PSL_orch)
# 
# # Indicating that the diagnosis came from PSL
# PSL$PSL <- 1
# 
# length(unique(PSL$PatientID)) # 5169 (not all patients are in this df)
# 
# # Renaming PSL vars to match DiagnosesEverything
# PSL$TermCodeMapped <- PSL$DiagnosisICDCodeSource
# PSL$TermNameMapped <- PSL$DiagnosisICDNameMapped
# PSL$TermDescription <- PSL$DiagnosisICDNameSource
# 
# # Keeping only variables that should overlap with the DiagnosisEverything frame
# PSL <- PSL[ ,c("PatientID", "EncounterID", "TermCodeMapped", "TermNameMapped", "TermDescription", "PSL")]
# 
# # Adding the PSL var to Diagnoses
# diagnoses$PSL <- 0
# 
# # binding diagnoses with PSL
# check <- rbind(diagnoses, PSL)
# 
# ###############################################################################
# # Checking overlap between encounterIDs
# PSL_IDs <- unique(PSL$EncounterID)
# diagnoses_IDs <- unique(diagnoses$EncounterID)
# overlap_IDs <- intersect(PSL_IDs, diagnoses_IDs)
# IDs_unique_to_PSL <- setdiff(PSL_IDs, diagnoses_IDs) # This is empty, meaning ALL encounters in the PSL df are also in the diagnoses df. We don't need the PSL df. 

Cancer registry

# Opening separate data frames
cancer_orch <- read.xlsx(paste0(raw_data_path, "Orch_Feb1.xlsx"), sheet = "CancerRegistry")
cancer_adt <- read.xlsx(paste0(raw_data_path, "ADTs_Feb1.xlsx"), sheet = "CancerRegistry")

cancer <- rbind(cancer_adt, cancer_orch)
length(unique(cancer$PatientID)) # 4550

# Determining unique cases and repeated cases
length(unique(cancer$PatientID)) # 4521 unique IDs but 5135 cases, meaning many are duplicated
table(cancer$SiteCode)

# Only keeping prostate cancer cases
cancer <- cancer[ which(cancer$SiteCode=="C61.9"),]
length(unique(cancer$PatientID)) # 4314 unique IDs, but 4332 cases, meaning 18 are duplicated
cancer$indexing <- 1

# removing duplicated rows
print(nrow(cancer)) # 4332
cancer <- cancer[!duplicated(cancer), ]
print(nrow(cancer)) # 4322
length(unique(cancer$PatientID)) # 4314 unique IDs, but 4322 cases, meaning 8 are duplicated

# Re-formatting dates
cancer$registry_dx_date <- as.POSIXct(cancer$OrigDxDate, format = "%m/%d/%Y")
cancer$UM_contact_date <- as.POSIXct(cancer$FirstContactDate, format = "%m/%d/%Y")

# Finding the first diagnosis date as defined by the cancer registry data
cancer_registry <- ddply(cancer, "PatientID", summarize,
                         earliest_registry_dx_date = min(registry_dx_date, na.rm = TRUE),
                         earliest_UM_contact_date = min(UM_contact_date, na.rm = TRUE),
                         count = sum(indexing, na.rm = TRUE))
cancer_registry$earliest_registry_dx_date[cancer_registry$earliest_registry_dx_date==Inf] <- NA

# Joining earliest date info back with main cancer df
cancer <- left_join(cancer, cancer_registry, by = c("PatientID"))

cancer <- cancer[order(cancer$PatientID), ]

Duplicate registry entries

# Isolating duplicate cases
duplicates <- cancer[ which(cancer$count > 1),]

# Ordering by ID and date
duplicates <- duplicates[order(duplicates$PatientID, duplicates$registry_dx_date, duplicates$UM_contact_date),]

# Indexing registry entries
duplicates <- duplicates %>% 
  arrange(PatientID, registry_dx_date) %>%
  group_by(PatientID) %>% 
  mutate(entry_num = row_number())

# Reshaping to wide using pivot_wider from tidyr package (reshape package wasn't working)
duplicates_wide <- pivot_wider(data = duplicates,
                               id_cols = c("PatientID", "SiteGroup", "SiteCode", "SiteDescription", "earliest_registry_dx_date", "earliest_UM_contact_date", "count"),
                               names_from = "entry_num",
                               values_from = c("registry_dx_date", "UM_contact_date", "HistoCode", "HistoDescription", "CaseType", "StageClinical", "StagePathologic", "SummaryStage_SEER2000"),
                               names_sep = ".")

# Renaming all the variables with a .1 so they will be the primary variables we look at
duplicates_wide$registry_dx_date <- duplicates_wide$registry_dx_date.1
duplicates_wide$UM_contact_date <- duplicates_wide$UM_contact_date.1
duplicates_wide$HistoCode <- duplicates_wide$HistoCode.1
duplicates_wide$HistoDescription <- duplicates_wide$HistoDescription.1
duplicates_wide$CaseType <- duplicates_wide$CaseType.1
duplicates_wide$StageClinical <- duplicates_wide$StageClinical.1
duplicates_wide$StagePathologic <- duplicates_wide$StagePathologic.1
duplicates_wide$SummaryStage_SEER2000 <- duplicates_wide$SummaryStage_SEER2000.1

# Keeping only the variables to match with cancer df
duplicates_wide <- duplicates_wide[ ,c("PatientID", "SiteGroup", "SiteCode", "SiteDescription", "earliest_registry_dx_date", 
"earliest_UM_contact_date", "count", "registry_dx_date", "UM_contact_date", "HistoCode", "HistoDescription", "CaseType", "StageClinical", "StagePathologic", "SummaryStage_SEER2000", "registry_dx_date.2", "UM_contact_date.2", "HistoCode.2", "HistoDescription.2", "CaseType.2", "StageClinical.2", "StagePathologic.2", "SummaryStage_SEER2000.2")]

#############################################################################
# Now, revising the cancer df

# First removing the duplicate rows that we will replace
cancer <- cancer[ which(cancer$count==1),]

# Next making NA variables for all the 2nd registry entries that a majority of patients don't have
cancer$registry_dx_date.2 <- NA
cancer$UM_contact_date.2 <- NA 
cancer$HistoCode.2 <- NA
cancer$HistoDescription.2 <- NA
cancer$CaseType.2 <- NA
cancer$StageClinical.2 <- NA
cancer$StagePathologic.2 <- NA 
cancer$SummaryStage_SEER2000.2 <- NA

# Now removing extra variables we don't need
cancer <- cancer[ ,c("PatientID", "earliest_registry_dx_date", "earliest_UM_contact_date", "SiteGroup", "SiteCode", "SiteDescription", "registry_dx_date", "UM_contact_date",  "HistoCode", "HistoDescription", "CaseType", "StageClinical", "StagePathologic", "SummaryStage_SEER2000", "count", "registry_dx_date.2", "UM_contact_date.2", "HistoCode.2", "HistoDescription.2", "CaseType.2", "StageClinical.2", "StagePathologic.2", "SummaryStage_SEER2000.2")]

#############################################################################
# Now binding the duplicates with the non-duplicates

cancer <- rbind(cancer, duplicates_wide)

length(unique(cancer$PatientID)) # 4314

cancer <- cancer[order(cancer$PatientID), ]

Saving data

write.fst(cancer, paste0(clean_data_path,"CancerRegistry_merged.fst"))

# rm(cancer_adt, cancer_orch, cancer_registry, duplicates, duplicates_wide)

Merging dfs

# clean_data_path <- "/Users/emurbanw/Dropbox (University of Michigan)/MED-ARSIs-Depression-ProstateCancer/data/clean/"

encounters <- read.fst(paste0(clean_data_path,"Encounters_merged.fst"))
diagnoses <- read.fst(paste0(clean_data_path,"Diagnoses_merged.fst"))
cancer <- read.fst(paste0(clean_data_path,"CancerRegistry_merged.fst"))
demos <- read.fst(paste0(clean_data_path,"Demographics_merged.fst"))

# Joining
combined <- full_join(diagnoses, encounters, by = c("PatientID", "EncounterID"))
combined <- full_join(combined, demos, by = c("PatientID"))
combined <- full_join(combined, cancer, by = c("PatientID"))

length(unique(combined$PatientID)) # 5442

# sorting the data
combined <- combined[order(combined$PatientID, combined$EncounterID), ]

Dating first prostate cancer diagnosis

# First, only keeping rows with PC diagnosis
PC <- combined[ which(combined$TermCodeMapped == "185" | combined$TermCodeMapped == "C61"), ]

# dropping rows that don't have a date
PC <- PC[ which(!is.na(PC$encounter_date)), ]

# Determining earliest PC dx date
PC_date <- ddply(PC, "PatientID", summarize,
                 earliest_PC_encounter_date = min(encounter_date, na.rm = TRUE),
                 earliest_registry_dx_date = min(earliest_registry_dx_date, na.rm = TRUE))
PC_date$earliest_PC_encounter_date[PC_date$earliest_PC_encounter_date == Inf] <- NA
PC_date$earliest_registry_dx_date[PC_date$earliest_registry_dx_date == Inf] <- NA

##############################################################################
# Comparing PC_dates

# finding difference between earliest_PC_encounter_date and earliest_registry_dx_date
PC_date$dx_date_diff <- PC_date$earliest_PC_encounter_date - PC_date$earliest_registry_dx_date
PC_date$dx_date_diff <- round(as.numeric(PC_date$dx_date_diff, "days"))
describe(PC_date$dx_date_diff) # difference in dates ranging from -5017 to 14622 days
summary(PC_date$earliest_PC_encounter_date) # 0 NAs
summary(PC_date$earliest_registry_dx_date) # 1184 NAs
length(PC_date$earliest_PC_encounter_date[!is.na(PC_date$earliest_registry_dx_date) & is.na(PC_date$earliest_PC_encounter_date)]) # 0
length(PC_date$earliest_PC_encounter_date[is.na(PC_date$earliest_registry_dx_date) & !is.na(PC_date$earliest_PC_encounter_date)]) # 1184
length(PC_date$earliest_PC_encounter_date[!is.na(PC_date$earliest_registry_dx_date) & !is.na(PC_date$earliest_PC_encounter_date)]) # 4141 have both
#table(PC_date$dx_date_diff)
# hist(PC_date$dx_date_diff) # Most are positive, which means that the registry date is often earlier than the encounter dx date (they joined the registry first, probably were diagnosed elsewhere)

# Making separate DFs based on which date variables each row has
PC_date$dx_date_var[!is.na(PC_date$earliest_registry_dx_date) & is.na(PC_date$earliest_PC_encounter_date)] <- "earliest_registry_dx_date"
PC_date$dx_date_var[is.na(PC_date$earliest_registry_dx_date) & !is.na(PC_date$earliest_PC_encounter_date)] <- "earliest_PC_encounter_date"
PC_date$dx_date_var[!is.na(PC_date$earliest_registry_dx_date) & !is.na(PC_date$earliest_PC_encounter_date)] <- "earlist dx date"
table(PC_date$dx_date_var)
a <- PC_date[ which(PC_date$dx_date_var == "earliest_registry_dx_date"),]
b <- PC_date[ which(PC_date$dx_date_var == "earliest_PC_encounter_date"),]
c <- PC_date[ which(PC_date$dx_date_var == "earlist dx date"),]

# making a diagnosis date based on other two date variables
# For people with no earliest_PC_encounter_date, we will use earliest_registry_dx_date
# For people  with no earliest_registry_dx_date, we will use earliest_PC_encounter_date
# For people with both, we will use the earlier of the two dates

a$PC_dx_date <- a$earliest_registry_dx_date
b$PC_dx_date <- b$earliest_PC_encounter_date
c$PC_dx_date <- pmin(c$earliest_registry_dx_date, c$earliest_PC_encounter_date, na.rm = TRUE)

abc <- rbind(a, b)
abc <- rbind(abc, c)

# sorting the data
abc <- abc[order(abc$PatientID), ]

# Rejoining with main df
data <- left_join(combined, abc, by = "PatientID")

# In the process of dating the registry dx date was duplicated. Bringing it back and then dropping the .x and .y variables
data$earliest_registry_dx_date <- data$earliest_registry_dx_date.x

# Re-ordering columns
data <- data[ ,c("PatientID", "adt_orch", "PC_dx_date", "GenderCode", "RaceName", "RaceNameMultiple", "EthnicityName", "SiteGroup", "SiteCode", "SiteDescription", "HistoCode", "HistoDescription", "CaseType", "StageClinical", "StagePathologic", "SummaryStage_SEER2000", "DOD_date", "DeathIndexCauseOfDeath", "DeathIndexUnderlyingCOD_ICD10", "EncounterID", "TermCodeMapped", "TermNameMapped", "TermDescription", "encounter_date", "AgeInYears", "HasMeds", "HasOrders", "HasProcedures", "EncounterTypeName", "PatientClassName", "DRGCode", "DRGName", "DOD_diff", "DOD_date_var", "DeceasedDate", "DeathIndexDeceasedDate", "registry_dx_date", "UM_contact_date", "registry_dx_date.2", "UM_contact_date.2", "HistoCode.2", "HistoDescription.2", "CaseType.2", "StageClinical.2", "StagePathologic.2", "SummaryStage_SEER2000.2", "earliest_registry_dx_date", "earliest_UM_contact_date", "earliest_PC_encounter_date", "dx_date_diff", "dx_date_var", "overlap", "count")]

# sorting the data
data <- data[order(data$PatientID, data$encounter_date), ]

Saving merged data

write.fst(data, paste0(clean_data_path,"DemosEncountersDiagnosesCancer_dx_dated.fst"))