library(ggplot2)
library(data.table)

Preprocessing: Load raw CGM data and create weekly deidentified metrics

This was only run once to create the aggregate dataset; it can be skipped and is included here for reference.

Code used to generate the deidentified dataset for the simulations:
library(data.table)
setwd("~/Downloads/CGM/dexcom_20210118/")
cgm_files = grep("dexcom_clean_record_id_", list.files("."), value = T)
cgm_data = rbindlist(lapply(cgm_files, fread))
cgm_data = cgm_data[, .(record_id, time_edit = ts, glucose = bg, dexcom_date = as.Date(ts))]

# For each patient-week, generate aggregate CGM metrics (fraction of readings in each category)

cgm_data[, `:=`(
  inrange = ifelse(glucose >= 70 & glucose <= 180, 1, 0),  # in range: 70-180 mg/dL
  hyper = ifelse(glucose > 180, 1, 0),                     # above 180 mg/dL
  hypo1 = ifelse(glucose <= 70, 1, 0),                     # at or below 70 mg/dL
  hypo2 = ifelse(glucose <= 54, 1, 0),                     # at or below 54 mg/dL
  dexcom_week = cut(dexcom_date, 'week')                   # calendar week (dexcom_date is already a Date)
)]
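
# Side note (toy illustration, not part of the pipeline): cut(..., 'week') maps each
# date to the Monday starting its calendar week, which is how dexcom_week is defined above
cut(as.Date(c("2021-01-18", "2021-01-20", "2021-01-25")), 'week')
# -> the first two dates map to "2021-01-18", the third to "2021-01-25"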

patient_weeks = cgm_data[, .(
  TIR = mean(inrange),
  hyper = mean(hyper),
  hypo1 = mean(hypo1),
  hypo2 = mean(hypo2)
), by=c('record_id', 'dexcom_week')]

# Filter to patients who have at least 52 weeks of data
patient_weeks[,dexcom_week := as.character(dexcom_week)]
ptokeep = patient_weeks[,.(n_weeks=.N),by='record_id'][n_weeks>=52,record_id]
f_patient_weeks = patient_weeks[record_id %in% ptokeep]

# Replace dexcom_week with week_id
f_patient_weeks[,week_id := frank(dexcom_week), by='record_id']
f_patient_weeks[,dexcom_week := NULL]

# Replace record_id with a different permutation for deidentification
f_patient_weeks[,pat_id := frank(record_id, ties.method="dense")]
f_patient_weeks[,pat_id := sample(unique(f_patient_weeks$pat_id))[pat_id]]
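
# Side note (toy example, not part of the pipeline): sample(unique(x))[x] uses a shuffled
# vector as a lookup table, so every row with the same id gets the same new id
toy_ids = c(1, 1, 2, 3, 3)
toy_perm = sample(unique(toy_ids))   # a random permutation of 1, 2, 3
toy_perm[toy_ids]                    # each original id maps consistently to one new id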

# Create multiple copies of patients with more than 52 weeks of data,
# each one starting at a different offset such that there are still 52 weeks of data for the new fake patient.
max_pat_id = f_patient_weeks[,max(pat_id)]
fake_patients = rbindlist(lapply(f_patient_weeks[,unique(pat_id)], function(p){

  n_weeks = f_patient_weeks[pat_id==p, max(week_id)]
  # seq_len() gives an empty sequence when n_weeks == 52, so patients with exactly
  # 52 weeks contribute no fake copies
  fake_data = lapply(seq_len(n_weeks - 52), function(offset) {

    new_data = copy(f_patient_weeks[pat_id==p])
    new_data[,week_id := week_id - offset]
    max_pat_id <<- max_pat_id + 1
    new_data[,pat_id := max_pat_id]
    return(new_data[week_id >= 1 & week_id <= 52])
  })

  return(rbindlist(fake_data))
}))

# Combine real and fake data and permute patient ids again
combo_data = rbind(f_patient_weeks[week_id<=52], fake_patients)
combo_data[,pat_id := sample(unique(combo_data$pat_id))[pat_id]]
combo_data[,record_id := NULL]

# Checks
combo_data[,.(n=uniqueN(pat_id), max(week_id), min(week_id))]
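# Additional check (optional, but worth verifying): every patient, real or fake,
# should end up with exactly 52 weekly rows
combo_data[, .N, by = pat_id][, all(N == 52)]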

# Save
fwrite(combo_data, "~/Downloads/CGM/deid_pat_weeks.csv")

Actual Simulation Code

Load data

combo_data = fread("~/Downloads/CGM/deid_pat_weeks.csv")
# Number of patients
combo_data[,uniqueN(pat_id)]
## [1] 2640

Sample to 600 patients

set.seed(1814)
N_PATIENTS = 600
sampled_pat_ids = sample(combo_data[,unique(pat_id)], N_PATIENTS, FALSE)
sampled_data = combo_data[pat_id %in% sampled_pat_ids]

# Number of patients
sampled_data[,uniqueN(pat_id)]
## [1] 600

Now, simulate contacts with a given capacity K each week, contacting patients based on the current clinic ranking.

sim_contacts = function(dt, K, review_outside_targets_consecutive_weeks=FALSE) {
  out_dt = copy(dt)
  
  # Count violated targets: TIR below 65%, more than 5% of readings at/below 70,
  # more than 1% of readings at/below 54
  out_dt[, violated_targets := (TIR<0.65) + (hypo1 > 0.05) + (hypo2 > 0.01)]
  
  # Order data by week, patient
  setorder(out_dt, week_id, pat_id)
  
  # Get last week's violated targets
  out_dt[, lag_violated_targets := shift(violated_targets), by='pat_id']
  
  # Iterate over weeks and create indicators for whether a patient was reviewed
  for(wk in out_dt[,unique(week_id)]) {
    
    # No review as baseline
    out_dt[week_id == wk, review := 0]
    
    # Rank patients and review top K
    setorder(out_dt, week_id, -violated_targets, TIR)
    out_dt[week_id == wk, rnk := seq_len(.N)]
    out_dt[week_id == wk & rnk <= K, review := 1]
    
    # Optionally also review anyone who wasn't contacted last week but was outside of targets last week and this week
    if(review_outside_targets_consecutive_weeks) {
      out_dt[, lag_reviewed := shift(review), by = 'pat_id']
      out_dt[week_id == wk & lag_violated_targets>0 & violated_targets>0 & lag_reviewed==0, review := 1] 
    }
  }
  
  return(out_dt)
}


test_dt = sim_contacts(sampled_data, 100)
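
# Quick sanity check (not in the original output): with the consecutive-weeks option off,
# exactly K = 100 patients should be flagged for review in every week
test_dt[, .(n_reviewed = sum(review)), by = week_id][, unique(n_reviewed)]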

# Get distribution of reviews by patient
test_dt_reviews_by_pat = test_dt[,.(n_reviews = sum(review)), by = 'pat_id']
library(ggpubr)

gghistogram(test_dt_reviews_by_pat, x = "n_reviews",
   add = "mean", rug = TRUE,
   add_density = TRUE) + ggtitle("Distribution of annual reviews per patient (K=100)")

Run a bunch of K values and get a distribution for each one.

The option to also review patients who are outside of targets in consecutive weeks is left off here, but you could enable it.
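
If you wanted that behaviour, it's just the review_outside_targets_consecutive_weeks flag on sim_contacts; a quick sketch (not run for the results below):

sim_dt_consec = sim_contacts(sampled_data, 100, review_outside_targets_consecutive_weeks = TRUE)
# With the flag on, the number of reviews in a given week can exceed K
sim_dt_consec[, .(n_reviewed = sum(review)), by = week_id][, summary(n_reviewed)]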

Ks = seq(from=50, to=600, by=50)

all_sims_dt = rbindlist(lapply(Ks, function(k){
  sim_dt = sim_contacts(sampled_data, k, review_outside_targets_consecutive_weeks = FALSE)
  out_dt = sim_dt[,.(n_reviews = sum(review)), by = 'pat_id']
  out_dt[, K := k]
  return(out_dt)
}))

gghistogram(all_sims_dt, x = "n_reviews",
   add = "mean", rug = FALSE, facet.by="K",
   add_density = TRUE) + ggtitle("Distribution of annual reviews per patient (different Ks)") + facet_wrap(~K, nrow=2)
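
A numeric companion to the histograms (not in the original output): mean and median annual reviews per patient at each capacity K.

# Summarise annual reviews per patient for each weekly capacity K
all_sims_dt[, .(mean_reviews = mean(n_reviews),
                median_reviews = as.numeric(median(n_reviews))), by = K]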