library(ggplot2)
library(data.table)
Only running this once to create aggregate dataset. Can be ignored. Just included for reference.
library(data.table)
setwd("~/Downloads/CGM/dexcom_20210118/")
cgm_files = grep("dexcom_clean_record_id_", list.files("."), value = T)
cgm_data = rbindlist(lapply(cgm_files, fread))
cgm_data = cgm_data[, .(record_id, time_edit = ts, glucose = bg, dexcom_date = as.Date(ts))]
# For each patient-day and patient-week, generate aggregate CGM metrics
cgm_data[, `:=`(
inrange = ifelse(glucose >= 70 & glucose <= 180, 1, 0),
hyper = ifelse(glucose > 180, 1, 0),
hypo1 = ifelse(glucose <= 70, 1, 0),
hypo2 = ifelse(glucose <= 54, 1, 0),
dexcom_date = as.Date(dexcom_date),
dexcom_week = cut(as.Date(dexcom_date), 'week')
)]
patient_weeks = cgm_data[, .(
TIR = mean(inrange),
hyper = mean(hyper),
hypo1 = mean(hypo1),
hypo2 = mean(hypo2)
), by=c('record_id', 'dexcom_week')]
# Filter to patients who have at least 52 weeks of data
patient_weeks[,dexcom_week := as.character(dexcom_week)]
ptokeep = patient_weeks[,.(n_weeks=.N),by='record_id'][n_weeks>=52,record_id]
f_patient_weeks = patient_weeks[record_id %in% ptokeep]
# Replace dexcom_week with week_id
f_patient_weeks[,week_id := frank(dexcom_week), by='record_id']
f_patient_weeks[,dexcom_week := NULL]
# Replace record_id with a different permutation for deidentification
f_patient_weeks[,pat_id := frank(record_id, ties.method="dense")]
f_patient_weeks[,pat_id := sample(unique(f_patient_weeks$pat_id))[pat_id]]
# Create multiple copies of patients with more than 52 weeks of data,
# each one starting at a different offset such that there are still 52 weeks of data for the new fake patient.
max_pat_id = f_patient_weeks[,max(pat_id)]
fake_patients = rbindlist(lapply(f_patient_weeks[,unique(pat_id)], function(p){
n_weeks = f_patient_weeks[pat_id==p, max(week_id)]
fake_data = lapply(1:(n_weeks-52), function(offset) {
print(max_pat_id)
new_data = copy(f_patient_weeks[pat_id==p])
new_data[,week_id := week_id - offset]
max_pat_id <<- max_pat_id + 1
new_data[,pat_id := max_pat_id]
return(new_data[week_id >= 1 & week_id <= 52])
})
return(rbindlist(fake_data))
}))
# Combine real and fake data and permute patient ids again
combo_data = rbind(f_patient_weeks[week_id<=52], fake_patients)
combo_data[,pat_id := sample(unique(combo_data$pat_id))[pat_id]]
combo_data[,record_id := NULL]
# Checks
combo_data[,.(n=uniqueN(pat_id), max(week_id), min(week_id))]
# Save
fwrite(combo_data, "~/Downloads/CGM/deid_pat_weeks.csv")
Load data
combo_data = fread("~/Downloads/CGM/deid_pat_weeks.csv")
# Number of patients
combo_data[,uniqueN(pat_id)]
## [1] 2640
Sample to 600 patients
set.seed(1814)
N_PATIENTS = 600
sampled_pat_ids = sample(combo_data[,unique(pat_id)], N_PATIENTS, FALSE)
sampled_data = combo_data[pat_id %in% sampled_pat_ids]
# Number of patients
sampled_data[,uniqueN(pat_id)]
## [1] 600
Now, simulate contacts with a given capacity each week, contacting patients based on current clinic ranking
sim_contacts = function(dt, K, review_outside_targets_consecutive_weeks=FALSE) {
out_dt = copy(dt)
# Count violated targets
out_dt[, violated_targets := (TIR<0.65) + (hypo1 > 0.05) + (hypo2 > 0.01)]
# Order data by week, patient
setorder(out_dt, week_id, pat_id)
# Get last weeks violated targets
out_dt[, lag_violated_targets := shift(violated_targets), by='pat_id']
# Iterate over weeks and create indicators for whether a patient was reviewed
for(wk in out_dt[,unique(week_id)]) {
# No review as baseline
out_dt[week_id == wk, review := 0]
# Rank patients and review top K
setorder(out_dt, week_id, -violated_targets, TIR)
out_dt[week_id == wk, rnk := seq_len(.N)]
out_dt[week_id == wk & rnk <= K, review := 1]
# Optionally also review anyone who wasn't contacted last week but was outside of targets last week and this week
if(review_outside_targets_consecutive_weeks) {
out_dt[, lag_reviewed := shift(review), by = 'pat_id']
out_dt[week_id == wk & lag_violated_targets>0 & violated_targets>0 & lag_reviewed==0, review := 1]
}
}
return(out_dt)
}
test_dt = sim_contacts(sampled_data, 100)
# Get distribution of reviews by patient
test_dt_reviews_by_pat = test_dt[,.(n_reviews = sum(review)), by = 'pat_id']
library(ggpubr)
gghistogram(test_dt_reviews_by_pat, x = "n_reviews",
add = "mean", rug = TRUE,
add_density = TRUE) + ggtitle("Distribution of annual reviews per patient (K=100)")
Run a bunch of K values and get a distribution for each one.
Not reviewing patients outside of targets in consecutive weeks here but you could.
Ks = seq(from=50, to=600, by=50)
all_sims_dt = rbindlist(lapply(Ks, function(k){
sim_dt = sim_contacts(sampled_data, k, review_outside_targets_consecutive_weeks = FALSE)
out_dt = sim_dt[,.(n_reviews = sum(review)), by = 'pat_id']
out_dt[, K := k]
return(out_dt)
}))
gghistogram(all_sims_dt, x = "n_reviews",
add = "mean", rug = FALSE, facet.by="K",
add_density = TRUE) + ggtitle("Distribution of annual reviews per patient (different Ks)") + facet_wrap(~K, nrow=2)