# Opening the packages we need
library(openxlsx) # package to open xlsx files
library(readxl)
library(fst) # lightning fast serialization of data frames for R (saves data in smaller file sizes; loads more quickly)
library(psych)
library(dplyr)
library(tidyr)
# Creating data paths
setwd("/Users/emurbanw/University of Michigan Dropbox/MED-EFDC-DDC-internal/Incubator_2023-2024/Melissa_Dejonckheere/")
diabetes_sheets <- excel_sheets("Dejonckheere_DataResults_HUM00246707_2025-03-04.xlsx")
print(diabetes_sheets)
patient <- read_excel("Dejonckheere_DataResults_HUM00246707_2025-03-04.xlsx", sheet = "Patient")
# 1883 patients
length(unique(patient$MRN))
## [1] 1883
# Renaming variables so there are no spaces in the names
patient <- patient %>%
rename(DOB = `Birth Date`,
DiabetesTypeRegistry = `Diabetes Type Registry`,
DiabetesTypeMiCart = `Diabetes Type Micart`,
DiabetesDiagnosisDateMiCart = `Diabetes Diagnosis Date MiCart`,
DiabetesTypeDxCodes = `Diabetes Type Dx Codes`,
FirstDiabetesTypeDxCodesDate = `First Diabetes Type Dx Codes Date`,
LastDiabetesTypeDxCodesDate = `Last Diabetes Type Dx Codes Date`,
DiabetesTypeRegistryFlg = `Diabetes Type Registry Flg`,
DiabetesTypeMiCartFlg = `Diabetes Type Micart Flg`,
DiabetesTypeDxCodesFlg = `Diabetes Type Dx Codes Flg`)
# Keeping only currently relevant variables
patient <- patient[ ,c("MRN", "DOB", "DiabetesDiagnosisDateMiCart", "FirstDiabetesTypeDxCodesDate", "LastDiabetesTypeDxCodesDate", "DiabetesTypeRegistry", "DiabetesTypeMiCart", "DiabetesTypeDxCodes", "DiabetesTypeRegistryFlg", "DiabetesTypeMiCartFlg", "DiabetesTypeDxCodesFlg")]
# Chang all Null to NA
patient[patient == "NULL"] <- NA
# Changing dates to posixct
patient$DOB <- as.POSIXct(patient$DOB, format = "%Y-%m-%d")
patient$DiabetesDiagnosisDateMiCart <- as.POSIXct(patient$DiabetesDiagnosisDateMiCart, format = "%Y-%m-%d")
patient$FirstDiabetesTypeDxCodesDate <- as.POSIXct(patient$FirstDiabetesTypeDxCodesDate, format = "%Y-%m-%d")
patient$LastDiabetesTypeDxCodesDate <- as.POSIXct(patient$LastDiabetesTypeDxCodesDate, format = "%Y-%m-%d")
# Checking that FirstDiabetes Date is always after the DiagnosisDate in MiCart
patient$date_diff <- round(as.numeric(difftime(patient$FirstDiabetesTypeDxCodesDate, patient$DiabetesDiagnosisDateMiCart, units = "days")))
describe(patient$date_diff)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1833 946.28 1303.99 206 693.57 305.42 -749 6541 7290 1.43 1.26
## se
## X1 30.46
# 40 patients have FirstDiabetesTypeDxCodesDate before DiabetesDiagnosisDateMiCart (not sure why that would be or which is the most accurate date to use for first T1D diagnosis date)
check <- patient[ which(patient$date_diff < 0),]
# Checking different diabetes type variables
table(patient$DiabetesTypeRegistry) # 1736 "Type 1 Diabetes"
##
## Other Diabetes Type 1 Diabetes Type 2 Diabetes
## 9 1736 45
table(patient$DiabetesTypeMiCart) # 1800 "T1D"
##
## CFRD MODY Other Prediabetes Steroid Induced
## 3 7 3 5 4
## T1D T2D
## 1800 12
table(patient$DiabetesTypeDxCodes) # 1868 "Type 1 Diabetes"
##
## Cystic Fibrosis Related Diabetes Other Diabetes
## 2 4
## Type 1 Diabetes Type 2 Diabetes
## 1868 8
# Checking different diabetes flags; This sums to ALL types of diabetes, not just T1D
table(patient$DiabetesTypeRegistryFlg) # 1790
##
## 0 1
## 93 1790
table(patient$DiabetesTypeMiCartFlg) # 1834
##
## 0 1
## 49 1834
table(patient$DiabetesTypeDxCodesFlg) # 1882
##
## 0 1
## 1 1882
# Creating age at diagnosis
patient$Age_at_diagnosis <- round(as.numeric(difftime(patient$DiabetesDiagnosisDateMiCart, patient$DOB, units = "days")) / 365.25, 2)
PSL <- read_excel("Dejonckheere_DataResults_HUM00246707_2025-03-04.xlsx", sheet = "Problem List")
# 1815 patients
length(unique(PSL$MRN))
## [1] 1815
PSL <- PSL %>%
rename(DXID = "DX ID",
DXName = "DX Name")
# List of 131 ICD codes for T1D
# And another 9 ICD codes for T1D in pregnancy (all 024.0* codes-- this only adds 2 people)
T1D_codes <- c("E10.1", "E10.10", "E10.11", "E10.2", "E10.21", "E10.22", "E10.29", "E10.3", "E10.31", "E10.311", "E10.319", "E10.32", "E10.321", "E10.3211", "E10.3212", "E10.3213", "E10.3219", "E10.329", "E10.3291", "E10.3292", "E10.3293", "E10.3299", "E10.33", "E10.331", "E10.3311", "E10.3312", "E10.3313", "E10.3319", "E10.339", "E10.3391", "E10.3392", "E10.3393", "E10.3399", "E10.34", "E10.341", "E10.3411", "E10.3412", "E10.3413", "E10.3419", "E10.349", "E10.3491", "E10.3492", "E10.3493", "E10.3499", "E10.35", "E10.351", "E10.3511", "E10.3512", "E10.3513", "E10.3519", "E10.352", "E10.3521", "E10.3522", "E10.3523", "E10.3529", "E10.353", "E10.3531", "E10.3532", "E10.3533", "E10.3539", "E10.354", "E10.3541", "E10.3542", "E10.3543", "E10.3549", "E10.355", "E10.3551", "E10.3552", "E10.3553", "E10.3559", "E10.359", "E10.3591", "E10.3592", "E10.3593", "E10.3599", "E10.36", "E10.37", "E10.37X1", "E10.37X2", "E10.37X3", "E10.37X9", "E10.39", "E10.4", "E10.40", "E10.41", "E10.42", "E10.43", "E10.44", "E10.49", "E10.5", "E10.51", "E10.52", "E10.59", "E10.61", "E10.610", "E10.618", "E10.62", "E10.620", "E10.621", "E10.622", "E10.628", "E10.63", "E10.630", "E10.638", "E10.64", "E10.641", "E10.649", "E10.65", "E10.69", "E10.8", "E10.9", "250.01", "250.03", "250.11", "250.13", "250.21", "250.23", "250.31", "250.33", "250.41", "250.43", "250.51", "250.53", "250.61", "250.63", "250.71", "250.73", "250.81", "250.83", "250.91", "250.93", "O24.013", "O24.013", "O24.012", "O24.011", "O24.02", "O24.03", "O24.019", "O24.0", "O24.01")
# Create the T1D variable in PSL
PSL$T1D <- as.integer(grepl(paste(T1D_codes, collapse = "|"), PSL$ICD10))
# Counting number of T1D diagnoses within Patient and also making a variable of whether the patient had any T1D diagnoses (or not)
PSL <- PSL %>%
group_by(MRN) %>%
mutate(num_T1D_dx = sum(T1D),
any_T1D_dx = ifelse(num_T1D_dx > 0, 1, 0))
table(PSL$num_T1D_dx)
##
## 0 1 2 3 4 5 6
## 1332 6333 1952 621 259 130 21
table(PSL$any_T1D_dx)
##
## 0 1
## 1332 9316
# 1542 patients have any T1D dx using ICD codes
length(unique(PSL$MRN[PSL$any_T1D_dx==1]))
## [1] 1542
# List of 223 DXID codes for Eating Disorders
ED_dxid <- c("3268", "1593098", "493349", "490554", "3250", "90696", "145348", "335018", "557402", "1211378", "1357632", "1496644", "1496647", "1651764", "76239", "490555", "1053857", "1053858", "1053859", "1053860", "1053861", "1054625", "1054626", "1056617", "1056675", "1056706", "1056801", "1056836", "1056930", "1056989", "1057050", "1057051", "1300231", "1300232", "1300235", "1300236", "1300237", "1300238", "1300246", "1300255", "1598616", "1599275", "331523", "335452", "355043", "363057", "543291", "1053853", "1053854", "1053855", "1053856", "1054624", "1055125", "1055126", "1056596", "1056645", "1056771", "1056809", "1056862", "1056885", "1056914", "1057096", "1057097", "1300248", "1300249", "1300321", "1300324", "1300327", "1300329", "1300331", "1300334", "1599444", "1600953", "3270", "76241", "76242", "92027", "142604", "154628", "324555", "420230", "493337", "1053873", "1053874", "1053875", "1053876", "1053877", "1054637", "1054638", "1054639", "1055131", "1055132", "1055133", "1055439", "1056610", "1056687", "1056704", "1056740", "1056827", "1056868", "1056889", "1056894", "1056898", "1056943", "1056945", "1056966", "1212985", "1234686", "1234764", "1234868", "1635652", "1635653", "1650363", "496115", "1053864", "1053865", "1053866", "1053867", "1053868", "1053869", "1054631", "1054632", "1054633", "1054634", "1055127", "1055437", "1056620", "1056649", "1056701", "1056755", "1056770", "1056773", "1056841", "1056857", "1255457", "1311423", "1311424", "1311425", "1311426", "1452192", "967746", "1260545", "1260549", "1467625", "1529714", "1540523", "3271", "3274", "122186", "122187", "122189", "122190", "176419", "201660", "233896", "233897", "263589", "304827", "304828", "324860", "335850", "336827", "966216", "966440", "1260548", "1452303", "1455176", "1610801", "1610804", "1615325", "1623877", "3269", "178161", "200035", "200038", "200040", "258075", "258076", "321925", "350774", "480942", "493152", "925805", "1062104", "1258404", "1258419", "1615590", "327165", "3272", "76244", "122188", "490556", "1611706", "1657300", "200034", "423275", "443728", "486683", "551237", "1352496", "1474417", "1610810", "337492", "543871", "1611705", "428542", "1235807", "420982", "114854", "114855", "357251", "1496785", "485721", "200039", "251857", "1312189", "1312190", "1312194", "1312195", "1312196", "1312197", "1603862", "447359")
# Create the ED based on DXID variable in PSL
PSL$ED_dxid <- as.integer(grepl(paste(ED_dxid, collapse = "|"), PSL$DXID))
# Counting number of ED_dxid diagnoses within Patient and also making a variable of whether the patient had any ED_dxid diagnoses (or not)
PSL <- PSL %>%
group_by(MRN) %>%
mutate(num_ED_dxid_dx = sum(ED_dxid),
any_ED_dxid_dx = ifelse(num_ED_dxid_dx > 0, 1, 0))
table(PSL$num_ED_dxid_dx)
##
## 0 1 2
## 10094 500 54
table(PSL$any_ED_dxid_dx)
##
## 0 1
## 10094 554
# 36 patients have any ED_dxid dx using ICD codes
length(unique(PSL$MRN[PSL$any_ED_dxid_dx==1]))
## [1] 36
# 13 ICD-9; 23 ICD-10 (total 36)
ED_ICD <- c("307.1", "307.5", "307.50", "307.51", "307.52", "307.53", "307.59", "646.83", "779.31", "783.3", "783.6", "784.59", "787.03", "E63.9", "F50", "F50.0", "F50.00", "F50.01", "F50.02", "F50.2", "F50.8", "F50.81", "F50.82", "F50.89", "F50.9", "F63.89", "F98.21", "F98.29", "F98.3", "O99.340", "P92.8", "R11.10", "R46.89", "R47.89", "R63.30", "R63.39")
# Create the ED_ICD variable in PSL
PSL$ED_ICD <- as.integer(grepl(paste(ED_ICD, collapse = "|"), PSL$ICD10))
# Counting number of ED_ICD diagnoses within Patient and also making a variable of whether the patient had any ED_ICD diagnoses (or not)
PSL <- PSL %>%
group_by(MRN) %>%
mutate(num_ED_ICD_dx = sum(ED_ICD),
any_ED_ICD_dx = ifelse(num_ED_ICD_dx > 0, 1, 0))
table(PSL$num_ED_ICD_dx)
##
## 0 1 2 4
## 9833 712 70 33
table(PSL$any_ED_ICD_dx)
##
## 0 1
## 9833 815
# 57 patients have any ED_ICD dx using ICD codes
length(unique(PSL$MRN[PSL$any_ED_ICD_dx==1]))
## [1] 57
PSL <- PSL %>%
mutate(ED_dx = ifelse(ED_dxid == 1 | ED_ICD == 1, 1, 0))
# Counting number of ED_ICD diagnoses within Patient and also making a variable of whether the patient had any ED_ICD diagnoses (or not)
PSL <- PSL %>%
group_by(MRN) %>%
mutate(num_ED_dx = sum(ED_dx),
any_ED_dx = ifelse(num_ED_dx > 0, 1, 0))
table(PSL$num_ED_dx)
##
## 0 1 2 4
## 9618 927 70 33
table(PSL$any_ED_dx)
##
## 0 1
## 9618 1030
# 69 patients have any ED dx using DXID or ICD codes
length(unique(PSL$MRN[PSL$any_ED_dx==1]))
## [1] 69
sample <- PSL[ which(PSL$any_T1D_dx==1),]
# 1542 patients with a T1D diagnosis
length(unique(sample$MRN))
## [1] 1542
# 59 have a T1D and an ED diagnosis (3.8% of the sample)
length(unique(sample$MRN[sample$any_ED_dx==1]))
## [1] 59
# 1483 have a T1D and zero ED diagnoses
length(unique(sample$MRN[sample$any_ED_dx==0]))
## [1] 1483