library(ggplot2)
library(data.table)
library(caret)
setwd("~/Downloads/CGM/")
patient_week_contacts = fread('~/Downloads/CGM/patient_week_contacts.csv')
str(patient_week_contacts)
## Classes 'data.table' and 'data.frame': 2949 obs. of 22 variables:
## $ record_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dexcom_week : chr "2018-10-01" "2018-10-08" "2018-10-15" "2018-10-22" ...
## $ TIR : num 0.2 0.809 0.946 0.92 0.968 ...
## $ hyper : num 0.7996 0.18044 0.03375 0.03747 0.00505 ...
## $ hypo1 : num 0 0.0127 0.0218 0.0477 0.0288 ...
## $ hypo2 : num 0 0 0.002978 0.0077 0.000505 ...
## $ state : chr "red" "green" "green" "red" ...
## $ date_contact : chr "" "2018-10-11" "" "" ...
## $ type : chr "" "remote" "" "" ...
## $ type_detail : chr "" "remote_patient_adj" "" "" ...
## $ num_date : int NA 17812 NA NA NA NA NA NA NA NA ...
## $ num_last_contact_date : int NA 17812 17812 17812 17812 17812 17812 17812 17812 17812 ...
## $ num_next_contact_date : int 17812 17812 NA NA NA NA NA NA NA NA ...
## $ days_since_last_contact : int NA 0 7 14 21 28 35 42 49 56 ...
## $ days_to_next_contact : int 7 0 NA NA NA NA NA NA NA NA ...
## $ contact : int 0 1 0 0 0 0 0 0 0 0 ...
## $ cum_contacts : int 0 1 1 1 1 1 1 1 1 1 ...
## $ cum_days : int 1 2 3 4 5 6 7 8 9 10 ...
## $ p_contact : num 0 0.5 0.333 0.25 0.2 ...
## $ diab_onset : chr "2018-10-02" "2018-10-02" "2018-10-02" "2018-10-02" ...
## $ months_since_onset_trunc: int 0 0 0 1 1 1 1 2 2 2 ...
## $ next_state : chr "green" "green" "red" "green" ...
## - attr(*, ".internal.selfref")=<externalptr>
Fit logistic regression model for all states Feats: Current state, time since contact, and cumulative p_contact.
train_dt = patient_week_contacts[dexcom_week >= "2019-04-15" & dexcom_week <= "2020-07-06" & !is.na(days_since_last_contact)]
train_dt[,days_since_last_contact_trunc := pmin(days_since_last_contact, 21)]
#train_dt[is.na(days_since_last_contact_trunc), days_since_last_contact_trunc := 21]
states = c("green", "yellow", "red")
full_models = sapply(states, function(s) {
dt = copy(train_dt)[, outcome := ifelse(next_state == s, 1, 0)]
return(
# glm(outcome ~ as.factor(days_since_last_contact_trunc):state - 1 + p_contact:state + I(months_since_onset_trunc<6):state,
# family = binomial(), data = dt[dexcom_week < "2020-04-06"]))
glm(outcome ~ I(days_since_last_contact_trunc==0):state - 1 + p_contact:state + I(months_since_onset_trunc<6):state,
family = binomial(), data = dt[dexcom_week < "2020-04-06"]))
}, simplify = F, USE.NAMES = T)
null_models = sapply(states, function(s) {
dt = copy(train_dt)[, outcome := ifelse(next_state == s, 1, 0)]
return(
glm(outcome ~ - 1 + p_contact:state + I(months_since_onset_trunc<6):state,
family = binomial(), data = dt))
}, simplify = F, USE.NAMES = T)
summary(full_models[["green"]])
##
## Call:
## glm(formula = outcome ~ I(days_since_last_contact_trunc == 0):state -
## 1 + p_contact:state + I(months_since_onset_trunc < 6):state,
## family = binomial(), data = dt[dexcom_week < "2020-04-06"])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0151 -0.8072 -0.4181 0.7602 2.4176
##
## Coefficients:
## Estimate Std. Error
## I(days_since_last_contact_trunc == 0)FALSE:stategreen 1.62639 0.18716
## I(days_since_last_contact_trunc == 0)TRUE:stategreen 1.61492 0.33760
## I(days_since_last_contact_trunc == 0)FALSE:statered -1.38716 0.25019
## I(days_since_last_contact_trunc == 0)TRUE:statered -1.09518 0.33228
## I(days_since_last_contact_trunc == 0)FALSE:stateyellow -0.01965 0.28621
## I(days_since_last_contact_trunc == 0)TRUE:stateyellow 0.11568 0.39811
## stategreen:p_contact -4.42795 0.69384
## statered:p_contact -2.89385 0.67969
## stateyellow:p_contact -2.35927 0.86514
## stategreen:I(months_since_onset_trunc < 6)TRUE 0.48453 0.19460
## statered:I(months_since_onset_trunc < 6)TRUE 1.12199 0.26061
## stateyellow:I(months_since_onset_trunc < 6)TRUE -0.07830 0.26252
## z value Pr(>|z|)
## I(days_since_last_contact_trunc == 0)FALSE:stategreen 8.690 < 2e-16 ***
## I(days_since_last_contact_trunc == 0)TRUE:stategreen 4.784 1.72e-06 ***
## I(days_since_last_contact_trunc == 0)FALSE:statered -5.544 2.95e-08 ***
## I(days_since_last_contact_trunc == 0)TRUE:statered -3.296 0.000981 ***
## I(days_since_last_contact_trunc == 0)FALSE:stateyellow -0.069 0.945254
## I(days_since_last_contact_trunc == 0)TRUE:stateyellow 0.291 0.771372
## stategreen:p_contact -6.382 1.75e-10 ***
## statered:p_contact -4.258 2.07e-05 ***
## stateyellow:p_contact -2.727 0.006391 **
## stategreen:I(months_since_onset_trunc < 6)TRUE 2.490 0.012779 *
## statered:I(months_since_onset_trunc < 6)TRUE 4.305 1.67e-05 ***
## stateyellow:I(months_since_onset_trunc < 6)TRUE -0.298 0.765503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2118.3 on 1528 degrees of freedom
## Residual deviance: 1611.7 on 1516 degrees of freedom
## AIC: 1635.7
##
## Number of Fisher Scoring iterations: 4
Simulate patient trajectories starting with states at the start time and observed contacts during the period.
Create sim dataset
START_DATE = "2020-04-06"
sim_dt = train_dt[dexcom_week >= START_DATE,
.(record_id, dexcom_week, p_contact, months_since_onset_trunc, days_since_last_contact_trunc)]
# filter to patients with data in every week between start date and "2020-07-06"
sim_dt[, n_records := .N, by='record_id']
sim_dt = sim_dt[n_records == max(n_records)]
sim_dt[, n_records := NULL]
# add start state
sim_dt = merge(sim_dt, train_dt[dexcom_week == START_DATE, .(dexcom_week, record_id, state)],
by = c('dexcom_week', 'record_id'), all.x=T)
# add true states
true_states = train_dt[, .(dexcom_week, record_id, true_state = state)]
sim_dt = merge(sim_dt, true_states, by = c('dexcom_week', 'record_id'))
sim_dt[, table(state)]
## state
## green red yellow
## 24 18 10
Simulate trajectories by looping over dates and patients, and simulating next state from logistic model probabilities
setorder(sim_dt, record_id, dexcom_week)
sim_dt[, week_id := seq_len(.N), by='record_id']
weeks_to_sim = sim_dt[dexcom_week > START_DATE, unique(week_id)]
patients_to_sim = sim_dt[, unique(record_id)]
N_SIMS = 50
library(parallel)
sim_accuracy = lapply(seq(1,N_SIMS), function(x) {
set.seed(x)
for(w in weeks_to_sim) {
#print(w)
pred_dt = sim_dt[week_id == w-1]
preds = cbind(
predict(full_models[["green"]], newdata=pred_dt, type="response"),
predict(full_models[["yellow"]], newdata=pred_dt, type="response"),
predict(full_models[["red"]], newdata=pred_dt, type="response"))
next_states = apply(preds, 1, function(x) states[sample.int(3,size=1,prob=x)])
sim_dt[week_id == w, state := next_states]
}
return(sim_dt[week_id>1, mean(state == true_state)])
})#, mc.cores = 8)
sim_accuracy = unlist(sim_accuracy)
summary(sim_accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3772 0.4190 0.4408 0.4354 0.4512 0.4822
mean(sim_accuracy)
## [1] 0.4353846
sd(sim_accuracy)/sqrt(length(sim_accuracy))
## [1] 0.003212295
confusionMatrix(as.factor(sim_dt[week_id>1]$state), as.factor(sim_dt[week_id>1]$true_state))
## Confusion Matrix and Statistics
##
## Reference
## Prediction green red yellow
## green 140 65 57
## red 94 121 65
## yellow 42 53 39
##
## Overall Statistics
##
## Accuracy : 0.4438
## 95% CI : (0.4059, 0.4822)
## No Information Rate : 0.4083
## P-Value [Acc > NIR] : 0.03334
##
## Kappa : 0.1418
##
## Mcnemar's Test P-Value : 0.03233
##
## Statistics by Class:
##
## Class: green Class: red Class: yellow
## Sensitivity 0.5072 0.5063 0.24224
## Specificity 0.6950 0.6362 0.81553
## Pos Pred Value 0.5344 0.4321 0.29104
## Neg Pred Value 0.6715 0.7020 0.77491
## Prevalence 0.4083 0.3536 0.23817
## Detection Rate 0.2071 0.1790 0.05769
## Detection Prevalence 0.3876 0.4142 0.19822
## Balanced Accuracy 0.6011 0.5712 0.52889
sim_dt[week_id>1, mean(true_state == "green")]
## [1] 0.408284
Sim with model without contacts
sim_null_accuracy = mclapply(seq(1,N_SIMS), function(x) {
set.seed(x)
for(w in weeks_to_sim) {
#print(w)
pred_dt = sim_dt[week_id == w-1]
preds = cbind(
predict(null_models[["green"]], newdata=pred_dt, type="response"),
predict(null_models[["yellow"]], newdata=pred_dt, type="response"),
predict(null_models[["red"]], newdata=pred_dt, type="response"))
next_states = apply(preds, 1, function(x) states[sample.int(3,size=1,prob=x)])
sim_dt[week_id == w, state := next_states]
}
return(sim_dt[week_id>1, mean(state == true_state)])
}, mc.cores = 8)
sim_null_accuracy = unlist(sim_null_accuracy)
summary(sim_null_accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3373 0.3695 0.3883 0.3860 0.4024 0.4231
mean(sim_null_accuracy)
## [1] 0.3860355
sd(sim_null_accuracy)/sqrt(length(sim_null_accuracy))
## [1] 0.002845929
Sim just predicting same state as before every time
sim_constant_accuracy = mclapply(seq(1,N_SIMS), function(x) {
set.seed(x)
for(w in weeks_to_sim) {
next_states = sim_dt[week_id == w-1, state]
sim_dt[week_id == w, state := next_states]
}
return(sim_dt[week_id>1, mean(state == true_state)])
}, mc.cores = 8)
sim_constant_accuracy = unlist(sim_constant_accuracy)
summary(sim_constant_accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.605 0.605 0.605 0.605 0.605 0.605
mean(sim_constant_accuracy)
## [1] 0.6050296
sd(sim_constant_accuracy)/sqrt(length(sim_constant_accuracy))
## [1] 0
Predicting the same state as before for each patient is better than using either of the logistic regression models. I think this either means the models are terrible and/or that there is more information at the patient-level that predicts all their future states than in the contacts and other features at the week level.