mle_test.utf8

Load data

library(data.table)
library(ggplot2)
library(ggpubr)
cgm_ts = fread("~/Downloads/CGM/deid_hourly_timeseries.csv")
cgm_ts[, pat_id := paste0(pop,"-",pat_id)]
str(cgm_ts)

## Classes 'data.table' and 'data.frame':   1503146 obs. of  13 variables:
##  $ day_id           : int  636 636 636 636 636 636 636 636 636 636 ...
##  $ hour             : int  12 13 14 15 16 17 18 19 20 21 ...
##  $ mean_glucose     : num  107.8 90.8 117.5 112.9 154.5 ...
##  $ TIR              : num  1 1 1 1 0.75 ...
##  $ in_person_visits : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ telehealth_visits: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ remote_contacts  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pat_id           : chr  "4T-35" "4T-35" "4T-35" "4T-35" ...
##  $ weekday_id       : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ pop              : chr  "4T" "4T" "4T" "4T" ...
##  $ age              : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ sexF             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ public_insurance : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

all_pats = cgm_ts[,.(
  TIR = mean(TIR),
  remote_contacts = max(remote_contacts),
  in_person_visits = max(in_person_visits),
  telehealth_visits = max(telehealth_visits)
  ), by = c('pat_id','pop','day_id')]

#all_pats[,day_id := day_id - min(day_id), by='pat_id']
all_pats[,day_id := seq_len(.N), by='pat_id']

all_pats[day_id < 180, remote_contacts := 0]
all_pats[TIR > 0.7, remote_contacts := 0]

# Only keep patients with >0 remote contacts
all_pats = all_pats[pat_id %in% all_pats[remote_contacts==1, unique(pat_id)]]

# Get TIR of last contact
setorder(all_pats, pat_id, day_id)
all_pats[, last_contact_TIR := remote_contacts*TIR]
all_pats[last_contact_TIR == 0, last_contact_TIR := NA]
all_pats[, last_contact_TIR := nafill(last_contact_TIR, "locf"), by='pat_id']

#all_pats[pat_id == "4T-22", last_contact_TIR][1:300]

Basic MLE testing to make sure I’m doing it right

one_pat = copy(all_pats[pat_id=="Pilot-14"])
one_pat[,.(
  .N,
  mean(TIR),
  sd(TIR),
  sum(remote_contacts)
)]

##      N        V2        V3 V4
## 1: 582 0.6497213 0.1923579 12

First just try to learn mean and variance

mean_var_nloglik = function(theta) -sum(dnorm(one_pat$TIR, theta[1], max(1E-9,theta[2]), log=TRUE))

# MLE
fit0 <- optim(c(0,1), mean_var_nloglik, method = "BFGS", control=list(maxit=10000))

fit0

## $par
## [1] 0.6497191 0.1921935
## 
## $value
## [1] -134.0455
## 
## $counts
## function gradient 
##       72       11 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Now try learning time trend as well

pred_tir = function(mu_i, d_i, t_i) {
  return(mu_i+d_i*t_i)
}

mean_var_nloglik = function(theta) -sum(dnorm(one_pat$TIR, 
                                              pred_tir(theta["mu_i"], theta["d_i"], one_pat$day_id), max(1E-9,theta["sigma_i"]), 
                                              log=TRUE))

# MLE
init_params = unlist(list(
  "mu_i" = mean(one_pat$TIR),
  "d_i" = 0.001,
  "sigma_i" = sd(one_pat$TIR)
))
fit0 <- optim(init_params, mean_var_nloglik, method = "BFGS", control=list(maxit=10000))

fit0

## $par
##          mu_i           d_i       sigma_i 
##  0.7339981570 -0.0002891144  0.1859575139 
## 
## $value
## [1] -153.2533
## 
## $counts
## function gradient 
##       91       14 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Fit model with cumulative effect of contacts and exponential decay

pred_tir = function(mu_i, d_i, t_i, g_i, h_i) {
  one_pat$discounted_contacts = one_pat$remote_contacts
  one_pat$discounted_contacts = stats::filter(one_pat$discounted_contacts, 2^(-1/max(1E-9,h_i)), method = "recursive")
  # print(sum(one_pat$discounted_contacts))
  contact_effect = one_pat$discounted_contacts*g_i
  mu_i = fcoalesce(one_pat$last_contact_TIR, mu_i)
  return(mu_i + d_i*t_i + contact_effect)
}


mean_var_nloglik = function(theta) -sum(dnorm(one_pat$TIR, 
  pred_tir(theta["mu_i"], theta["d_i"], one_pat$day_id, theta["g_i"], theta["h_i"]), 
  max(1E-9,theta["sigma_i"]), log=TRUE))

# MLE
init_params = unlist(list(
  "mu_i" = mean(one_pat$TIR),
  "d_i" = 0.001,
  "sigma_i" = sd(one_pat$TIR),
  "g_i" = 0.01,
  "h_i" = 14
))
fit0 <- optim(init_params, mean_var_nloglik, control=list(maxit=10000), method = "BFGS")
fit0

## $par
##          mu_i           d_i       sigma_i           g_i           h_i 
##  7.111705e-01 -3.654684e-06  1.863615e-01  3.782521e-02  1.400500e+01 
## 
## $value
## [1] -151.9871
## 
## $counts
## function gradient 
##       99       23 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Let’s try doing it for every patient

pred_tir = function(dt, mu_i, d_i, t_i, g_i, h_i) {
  dt$discounted_contacts = dt$remote_contacts
  dt$discounted_contacts = stats::filter(dt$discounted_contacts, 2^(-1/max(1E-9,h_i)), method = "recursive")
  contact_effect = dt$discounted_contacts*g_i
  mu_i = fcoalesce(dt$last_contact_TIR, mu_i)
  return(mu_i + d_i*t_i + contact_effect)
}

get_params = function(pid) {
  dt = copy(all_pats[pat_id == pid & day_id>=180])
  dt[, day_id := day_id - min(day_id), by='pat_id']
  
  mean_var_nloglik = function(theta) -sum(dnorm(dt$TIR, 
    pred_tir(dt, theta["mu_i"], theta["d_i"], dt$day_id, theta["g_i"], theta["h_i"]), 
    max(1E-9,theta["sigma_i"]), log=TRUE))
  
  init_params = unlist(list(
  "mu_i" = mean(dt$TIR),
  "d_i" = -0.001,
  "sigma_i" = sd(dt$TIR),
  "g_i" = 0.02,
  "h_i" = 24))
  
  fit0 <- optim(init_params, mean_var_nloglik, control=list(maxit=10000), method = "BFGS")
  out_dt = as.data.table(t(fit0$par))
  out_dt[, pat_id := pid]
  out_dt[, pop := dt$pop[1]]
  out_dt[, n_contacts := sum(dt$remote_contacts)]
  return(out_dt)
}

testfit = get_params("Pilot-1")

fits = rbindlist(lapply(all_pats[,unique(pat_id)], get_params))
#head(fits[n_contacts>=4 & g_i>0])

# ggdensity(melt(fits, id.vars = c('pat_id','pop')), x = "value",
#    add = "mean", rug = TRUE,
#    color = "pop", facet.by = "variable", scales="free")

Winsorize

fits_long = melt(fits, id.vars = c('pat_id','pop'))
fits_long[, wins_min := quantile(value, 0.05), by='variable']
fits_long[, wins_max := quantile(value, 0.95), by='variable']
fits_long[, value_wins := pmax(wins_min, pmin(wins_max, value))]
#unique(fits_long[,.(variable,wins_min,wins_max)])

ggdensity(fits_long[variable!='n_contacts'], x = "value_wins",
   add = "median", rug = TRUE,
   color = "pop", facet.by = "variable", scales="free")

Try doing just one patient at different time steps and see how parameters vary over time

get_params = function(ts) {
  dt = one_pat[day_id <= ts]
  
  mean_var_nloglik = function(theta) -sum(dnorm(dt$TIR, 
    pred_tir(dt, theta["mu_i"], theta["d_i"], dt$day_id, theta["g_i"], theta["h_i"]), 
    max(1E-9,theta["sigma_i"]), log=TRUE))
  
  init_params = unlist(list(
    "mu_i" = mean(dt$TIR),
    "d_i" = -0.001,
    "sigma_i" = sd(dt$TIR),
    "g_i" = 0.02,
    "h_i" = 24))
  
  fit0 <- optim(init_params, mean_var_nloglik, control=list(maxit=10000), method = "BFGS")
  out_dt = as.data.table(t(fit0$par))
  out_dt[, pat_id := dt$pat_id[1]]
  out_dt[, contact := dt[day_id==ts, remote_contacts][1]]
  out_dt[, pop := dt$pop[1]]
  out_dt[, ts := ts]
  out_dt[, TIR := dt[day_id>=ts-7 & day_id <= ts-1, mean(TIR)]]
  return(out_dt)
}

ts_set = one_pat[day_id >= one_pat[remote_contacts==1, min(day_id)], unique(day_id)]

testfit = get_params(max(ts_set))

ts_fits = rbindlist(lapply(ts_set, get_params))
#head(ts_fits)

ts_fits_long = melt(ts_fits, id.vars = c('pat_id','pop','contact','ts'))
ggplot(ts_fits_long, aes(x=ts, y=value)) + geom_vline(xintercept = ts_fits_long[contact==1, unique(ts)], linetype="dashed") + facet_wrap(~variable, scales="free", ncol = 1) + geom_line() + theme_bw() + coord_cartesian(xlim=c(min(ts_fits$ts),400)) + ggtitle("Single patient MLE parameters over time")