predict_patient_trajectories.utf8

library(ggplot2)
library(data.table)
library(caret)

setwd("~/Downloads/CGM/")
patient_week_contacts = fread('~/Downloads/CGM/patient_week_contacts.csv')
str(patient_week_contacts)

## Classes 'data.table' and 'data.frame':   2949 obs. of  22 variables:
##  $ record_id               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dexcom_week             : chr  "2018-10-01" "2018-10-08" "2018-10-15" "2018-10-22" ...
##  $ TIR                     : num  0.2 0.809 0.946 0.92 0.968 ...
##  $ hyper                   : num  0.7996 0.18044 0.03375 0.03747 0.00505 ...
##  $ hypo1                   : num  0 0.0127 0.0218 0.0477 0.0288 ...
##  $ hypo2                   : num  0 0 0.002978 0.0077 0.000505 ...
##  $ state                   : chr  "red" "green" "green" "red" ...
##  $ date_contact            : chr  "" "2018-10-11" "" "" ...
##  $ type                    : chr  "" "remote" "" "" ...
##  $ type_detail             : chr  "" "remote_patient_adj" "" "" ...
##  $ num_date                : int  NA 17812 NA NA NA NA NA NA NA NA ...
##  $ num_last_contact_date   : int  NA 17812 17812 17812 17812 17812 17812 17812 17812 17812 ...
##  $ num_next_contact_date   : int  17812 17812 NA NA NA NA NA NA NA NA ...
##  $ days_since_last_contact : int  NA 0 7 14 21 28 35 42 49 56 ...
##  $ days_to_next_contact    : int  7 0 NA NA NA NA NA NA NA NA ...
##  $ contact                 : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ cum_contacts            : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ cum_days                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ p_contact               : num  0 0.5 0.333 0.25 0.2 ...
##  $ diab_onset              : chr  "2018-10-02" "2018-10-02" "2018-10-02" "2018-10-02" ...
##  $ months_since_onset_trunc: int  0 0 0 1 1 1 1 2 2 2 ...
##  $ next_state              : chr  "green" "green" "red" "green" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Fit logistic regression model for all states Feats: Current state, time since contact, and cumulative p_contact.

train_dt = patient_week_contacts[dexcom_week >= "2019-04-15" & dexcom_week <= "2020-07-06" & !is.na(days_since_last_contact)]
train_dt[,days_since_last_contact_trunc := pmin(days_since_last_contact, 21)]
#train_dt[is.na(days_since_last_contact_trunc), days_since_last_contact_trunc := 21]

states = c("green", "yellow", "red")
full_models = sapply(states, function(s) {
  dt = copy(train_dt)[, outcome := ifelse(next_state == s, 1, 0)]
  return(
   # glm(outcome ~ as.factor(days_since_last_contact_trunc):state - 1 + p_contact:state + I(months_since_onset_trunc<6):state, 
   #          family = binomial(), data = dt[dexcom_week < "2020-04-06"]))
   glm(outcome ~ I(days_since_last_contact_trunc==0):state - 1 + p_contact:state + I(months_since_onset_trunc<6):state,
            family = binomial(), data = dt[dexcom_week < "2020-04-06"]))
  }, simplify = F, USE.NAMES = T)

null_models = sapply(states, function(s) {
  dt = copy(train_dt)[, outcome := ifelse(next_state == s, 1, 0)]
  return(
   glm(outcome ~ - 1 + p_contact:state + I(months_since_onset_trunc<6):state, 
            family = binomial(), data = dt))
  }, simplify = F, USE.NAMES = T)

summary(full_models[["green"]])

## 
## Call:
## glm(formula = outcome ~ I(days_since_last_contact_trunc == 0):state - 
##     1 + p_contact:state + I(months_since_onset_trunc < 6):state, 
##     family = binomial(), data = dt[dexcom_week < "2020-04-06"])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0151  -0.8072  -0.4181   0.7602   2.4176  
## 
## Coefficients:
##                                                        Estimate Std. Error
## I(days_since_last_contact_trunc == 0)FALSE:stategreen   1.62639    0.18716
## I(days_since_last_contact_trunc == 0)TRUE:stategreen    1.61492    0.33760
## I(days_since_last_contact_trunc == 0)FALSE:statered    -1.38716    0.25019
## I(days_since_last_contact_trunc == 0)TRUE:statered     -1.09518    0.33228
## I(days_since_last_contact_trunc == 0)FALSE:stateyellow -0.01965    0.28621
## I(days_since_last_contact_trunc == 0)TRUE:stateyellow   0.11568    0.39811
## stategreen:p_contact                                   -4.42795    0.69384
## statered:p_contact                                     -2.89385    0.67969
## stateyellow:p_contact                                  -2.35927    0.86514
## stategreen:I(months_since_onset_trunc < 6)TRUE          0.48453    0.19460
## statered:I(months_since_onset_trunc < 6)TRUE            1.12199    0.26061
## stateyellow:I(months_since_onset_trunc < 6)TRUE        -0.07830    0.26252
##                                                        z value Pr(>|z|)    
## I(days_since_last_contact_trunc == 0)FALSE:stategreen    8.690  < 2e-16 ***
## I(days_since_last_contact_trunc == 0)TRUE:stategreen     4.784 1.72e-06 ***
## I(days_since_last_contact_trunc == 0)FALSE:statered     -5.544 2.95e-08 ***
## I(days_since_last_contact_trunc == 0)TRUE:statered      -3.296 0.000981 ***
## I(days_since_last_contact_trunc == 0)FALSE:stateyellow  -0.069 0.945254    
## I(days_since_last_contact_trunc == 0)TRUE:stateyellow    0.291 0.771372    
## stategreen:p_contact                                    -6.382 1.75e-10 ***
## statered:p_contact                                      -4.258 2.07e-05 ***
## stateyellow:p_contact                                   -2.727 0.006391 ** 
## stategreen:I(months_since_onset_trunc < 6)TRUE           2.490 0.012779 *  
## statered:I(months_since_onset_trunc < 6)TRUE             4.305 1.67e-05 ***
## stateyellow:I(months_since_onset_trunc < 6)TRUE         -0.298 0.765503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2118.3  on 1528  degrees of freedom
## Residual deviance: 1611.7  on 1516  degrees of freedom
## AIC: 1635.7
## 
## Number of Fisher Scoring iterations: 4

Simulate patient trajectories starting with states at the start time and observed contacts during the period.

Create sim dataset

START_DATE = "2020-04-06"

sim_dt = train_dt[dexcom_week >= START_DATE, 
                  .(record_id, dexcom_week, p_contact, months_since_onset_trunc, days_since_last_contact_trunc)]

# filter to patients with data in every week between start date and "2020-07-06"
sim_dt[, n_records := .N, by='record_id']
sim_dt = sim_dt[n_records == max(n_records)]
sim_dt[, n_records := NULL]

# add start state
sim_dt = merge(sim_dt, train_dt[dexcom_week == START_DATE, .(dexcom_week, record_id, state)], 
               by = c('dexcom_week', 'record_id'), all.x=T)

# add true states
true_states = train_dt[, .(dexcom_week, record_id, true_state = state)]
sim_dt = merge(sim_dt, true_states, by = c('dexcom_week', 'record_id'))

sim_dt[, table(state)]

## state
##  green    red yellow 
##     24     18     10

Simulate trajectories by looping over dates and patients, and simulating next state from logistic model probabilities

setorder(sim_dt, record_id, dexcom_week)
sim_dt[, week_id := seq_len(.N), by='record_id']

weeks_to_sim = sim_dt[dexcom_week > START_DATE, unique(week_id)]
patients_to_sim = sim_dt[, unique(record_id)]

N_SIMS = 50

library(parallel)

sim_accuracy = lapply(seq(1,N_SIMS), function(x) {
  set.seed(x)
  for(w in weeks_to_sim) {
    #print(w)
    pred_dt = sim_dt[week_id == w-1]
    preds = cbind(
      predict(full_models[["green"]], newdata=pred_dt, type="response"),
      predict(full_models[["yellow"]], newdata=pred_dt, type="response"),
      predict(full_models[["red"]], newdata=pred_dt, type="response"))
    next_states = apply(preds, 1, function(x) states[sample.int(3,size=1,prob=x)])
    sim_dt[week_id == w, state := next_states]
  }
  return(sim_dt[week_id>1, mean(state == true_state)])
  })#, mc.cores = 8)
sim_accuracy = unlist(sim_accuracy)

summary(sim_accuracy)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3772  0.4190  0.4408  0.4354  0.4512  0.4822

mean(sim_accuracy)

## [1] 0.4353846

sd(sim_accuracy)/sqrt(length(sim_accuracy))

## [1] 0.003212295

confusionMatrix(as.factor(sim_dt[week_id>1]$state), as.factor(sim_dt[week_id>1]$true_state))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction green red yellow
##     green    140  65     57
##     red       94 121     65
##     yellow    42  53     39
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4438          
##                  95% CI : (0.4059, 0.4822)
##     No Information Rate : 0.4083          
##     P-Value [Acc > NIR] : 0.03334         
##                                           
##                   Kappa : 0.1418          
##                                           
##  Mcnemar's Test P-Value : 0.03233         
## 
## Statistics by Class:
## 
##                      Class: green Class: red Class: yellow
## Sensitivity                0.5072     0.5063       0.24224
## Specificity                0.6950     0.6362       0.81553
## Pos Pred Value             0.5344     0.4321       0.29104
## Neg Pred Value             0.6715     0.7020       0.77491
## Prevalence                 0.4083     0.3536       0.23817
## Detection Rate             0.2071     0.1790       0.05769
## Detection Prevalence       0.3876     0.4142       0.19822
## Balanced Accuracy          0.6011     0.5712       0.52889

sim_dt[week_id>1, mean(true_state == "green")]

## [1] 0.408284

Sim with model without contacts

sim_null_accuracy = mclapply(seq(1,N_SIMS), function(x) {
  set.seed(x)
  for(w in weeks_to_sim) {
    #print(w)
    pred_dt = sim_dt[week_id == w-1]
    preds = cbind(
      predict(null_models[["green"]], newdata=pred_dt, type="response"),
      predict(null_models[["yellow"]], newdata=pred_dt, type="response"),
      predict(null_models[["red"]], newdata=pred_dt, type="response"))
    next_states = apply(preds, 1, function(x) states[sample.int(3,size=1,prob=x)])
    sim_dt[week_id == w, state := next_states]
  }
  return(sim_dt[week_id>1, mean(state == true_state)])
  }, mc.cores = 8)
sim_null_accuracy = unlist(sim_null_accuracy)

summary(sim_null_accuracy)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3373  0.3695  0.3883  0.3860  0.4024  0.4231

mean(sim_null_accuracy)

## [1] 0.3860355

sd(sim_null_accuracy)/sqrt(length(sim_null_accuracy))

## [1] 0.002845929

Sim just predicting same state as before every time

sim_constant_accuracy = mclapply(seq(1,N_SIMS), function(x) {
  set.seed(x)
  for(w in weeks_to_sim) {
    next_states = sim_dt[week_id == w-1, state]
    sim_dt[week_id == w, state := next_states]
  }
  return(sim_dt[week_id>1, mean(state == true_state)])
  }, mc.cores = 8)
sim_constant_accuracy = unlist(sim_constant_accuracy)

summary(sim_constant_accuracy)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.605   0.605   0.605   0.605   0.605   0.605

mean(sim_constant_accuracy)

## [1] 0.6050296

sd(sim_constant_accuracy)/sqrt(length(sim_constant_accuracy))

## [1] 0

Predicting the same state as before for each patient is better than using either of the logistic regression models. I think this either means the models are terrible and/or that there is more information at the patient-level that predicts all their future states than in the contacts and other features at the week level.