Packages

Installation

# install.packages("arrow")
# install.packages("ggplot2")
# install.packages("dplyr")
# install.packages("frailtypack")

Loading

## 
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Import data

bcr_data <- read_parquet("kdata/bcr.parquet")
tfs_data <- read_parquet("kdata/tfs.parquet")

Some processed data

# Patients with BCR before treatment (!!!)
bcr_before_tx <- subset(bcr_data, relapse_date < treatment_date)
bcr_data_v2 <- subset(bcr_data, !(pseudoid %in%  bcr_before_tx$pseudoid))

Scatter plots

All data

# Create a scatter plot with regression line
ggplot(bcr_data_v2, aes(x = months_sampling_to_treatment, y = months_treatment_to_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~B] All BCR patients",
       x = "Months Sampling to Treatment",
       y = "Months Treatment to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6043 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 6043 rows containing missing values (`geom_point()`).

ggplot(bcr_data_v2, aes(x = months_sampling_to_treatment, y = months_treatment_to_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(~ grade_group) + 
  labs(title = "[A~B] BCR patients by Gleason Grade Group",
       x = "Months Sampling to Treatment",
       y = "Months Treatment to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6043 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 6043 rows containing missing values (`geom_point()`).

ggplot(bcr_data_v2, aes(x = months_sampling_to_treatment, y = months_sampling_to_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~C] All BCR patients",
       x = "Months Sampling to Treatment",
       y = "Months Sampling to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6043 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 6043 rows containing missing values (`geom_point()`).

ggplot(bcr_data_v2, aes(x = months_sampling_to_treatment, y = months_sampling_to_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(~ grade_group) +
  labs(title = "[A~C] BCR patients by Gleason grade group",
       x = "Months Sampling to Treatment",
       y = "Months Sampling to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6043 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 6043 rows containing missing values (`geom_point()`).

Grouped

[A~B.mean] by Time to treatment [A]

# Group by A and calculate mean of B
grouped_data <- bcr_data_v2 %>%
  group_by(months_sampling_to_treatment) %>%
  summarize(mean_months_relapse = mean(months_treatment_to_relapse, na.rm = TRUE))

# Create a scatter plot with regression line and facet by 'grade_group'
ggplot(grouped_data, aes(x = months_sampling_to_treatment, y = mean_months_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~B.mean] Mean Months Treatment to relapse by Months Sampling to Treatment",
       x = "Months Sampling to Treatment",
       y = "Mean Months Treatment to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 37 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values (`geom_point()`).

[A~B] by Months Sampling to treatment [A] and Grade group

# Group by A and calculate mean of B
grouped_data <- bcr_data_v2 %>%
  group_by(months_sampling_to_treatment, grade_group) %>%
  summarize(mean_months_relapse = mean(months_treatment_to_relapse, na.rm = TRUE))
## `summarise()` has grouped output by 'months_sampling_to_treatment'. You can
## override using the `.groups` argument.
# Create a scatter plot with regression line and facet by 'grade_group'
ggplot(grouped_data, aes(x = months_sampling_to_treatment, y = mean_months_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~B.mean] Mean Months Treatment to relapse",
       x = "Months Sampling to Treatment",
       y = "Mean Months Treatment to Relapse")+
  facet_grid(~ grade_group)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 173 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 173 rows containing missing values (`geom_point()`).

[A~C.mean] by Months Sampling to relpse [C]

# Group by [A] 'months_sampling_to_treatment' and calculate mean
grouped_data <- bcr_data_v2 %>%
  group_by(months_sampling_to_treatment) %>%
  summarize(mean_months_relapse = mean(months_sampling_to_relapse, na.rm = TRUE))

# Create a scatter plot with regression line 
ggplot(grouped_data, aes(x = months_sampling_to_treatment, y = mean_months_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~C.mean] Mean Months Sampling to Relapse by Months Sampling to Relapse",
       x = "Months Sampling to Treatment",
       y = "Mean Months Sampling to Relapse")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 37 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values (`geom_point()`).

[A~C.mean] by Months Sampling to relpse [C] and Grade group

# Group by [A] 'months_sampling_to_treatment' and calculate mean
grouped_data <- bcr_data_v2 %>%
  group_by(months_sampling_to_treatment, grade_group) %>%
  summarize(mean_months_relapse = mean(months_sampling_to_relapse, na.rm = TRUE))
## `summarise()` has grouped output by 'months_sampling_to_treatment'. You can
## override using the `.groups` argument.
# Create a scatter plot with regression line and facet by 'grade_group'
ggplot(grouped_data, aes(x = months_sampling_to_treatment, y = mean_months_relapse)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "[A~C.mean] Mean Months Sampling to Relapse by Months Sampling to Relapse",
       x = "Months Sampling to Treatment",
       y = "Mean Months Sampling to Relapse")+
  facet_grid(~ grade_group)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 173 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 173 rows containing missing values (`geom_point()`).

Modelling

library(INLA)
## Loading required package: Matrix
## Loading required package: sp
## This is INLA_24.01.29 built 2024-01-28 20:44:17 UTC.
##  - See www.r-inla.org/contact-us for how to get help.
##  - List available models/likelihoods/etc with inla.list.models()
##  - Use inla.doc(<NAME>) to access documentation
library(INLAjoint)
## Package 'INLAjoint' version 24.2.5
## Type 'citation("INLAjoint")' for citing this R package in publications.

Simple modelling: Time to Treatment

Time sampling to treatment survival curve

# time to treatment , is treated, time to recurrence, is relapsed
# 2 surv modals 

# v3: only patients with Tx , we look at the risk over time
tfs_data_v3 <- tfs_data[ tfs_data$tfs_time >0,]
M1 <- joint(formSurv = list(inla.surv(time=tfs_time, event=tfs_status) ~ -1), 
            basRisk = c("weibullsurv"), 
            dataSurv = list(tfs_data_v3))
summary(M1)
## 
## Survival outcome
##                   mean     sd 0.025quant 0.5quant 0.975quant
## Weibull (shape) 0.5518 0.0045     0.5431   0.5518     0.5607
## Weibull (scale) 0.1682 0.0026     0.1633   0.1681     0.1732
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                              -37835.3                              -37835.3 
## 
## Deviance Information Criterion:  75651.92
## Widely applicable Bayesian information criterion:  75652.02
## Computation time: 4.66 seconds

Note: mean is ….

M1rw2 <- joint(formSurv = list(inla.surv(time=tfs_time, event=tfs_status) ~ -1), 
               basRisk = c("rw2"), 
               dataSurv = list(tfs_data_v3))
plot(M1rw2)
## $Baseline

## 
## attr(,"class")
## [1] "plot.INLAjoint" "list"

Note: this cureve shows …..

Simple modelling: Time to Relapse

# v3: only patients with BCR , we look at the risk over time
# !!!!! WRONG, this should include both rfs 1 and 0 ????
bcr_data_v3 <- bcr_data_v2[which(!is.na(bcr_data_v2$months_sampling_to_relapse)),]
M2 <- joint(formSurv=list(inla.surv(time= drfs_time, event=drfs_status) ~ -1), 
            basRisk=c("weibullsurv"), 
            dataSurv = list(bcr_data_v3))
summary(M2)
## 
## Survival outcome
##                   mean     sd 0.025quant 0.5quant 0.975quant
## Weibull (shape) 1.4135 0.0210     1.3726   1.4133     1.4555
## Weibull (scale) 0.0031 0.0002     0.0027   0.0031     0.0034
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -13199.48                             -13199.48 
## 
## Deviance Information Criterion:  26387.24
## Widely applicable Bayesian information criterion:  26387.19
## Computation time: 0.99 seconds

Note: the resutls are ….

M2rw2 <- joint(formSurv = list(inla.surv(time=drfs_time, event=drfs_status) ~ -1), 
               basRisk = c("rw2"), 
               dataSurv = list(bcr_data_v3))
plot(M2rw2)
## $Baseline

## 
## attr(,"class")
## [1] "plot.INLAjoint" "list"

Note: This curve shows ….

Joint models

Test code

# 12 recurrent-terminal
library(frailtypack)
## Loading required package: survival
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: survC1
## Loading required package: doBy
## 
## Attaching package: 'doBy'
## The following object is masked from 'package:dplyr':
## 
##     order_by
## 
## Attaching package: 'frailtypack'
## The following object is masked from 'package:survival':
## 
##     cluster
data(readmission)
terminalData <- readmission[readmission$event==0,]

#data$id <- 1:nrow(data)

M1 <- joint(formSurv=list(inla.surv(time = t.stop, event = death) ~ -1),
             basRisk=c("rw2"),  dataSurv = list(terminalData))
summary(M1)
## 
## Survival outcome
##                            mean    sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance) 0.0162 0.037      1e-04   0.0044       0.11
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -581.1012                             -581.1012 
## 
## Deviance Information Criterion:  1118.513
## Widely applicable Bayesian information criterion:  1118.468
## Computation time: 0.97 seconds
plot(M1)
## $Baseline

## 
## attr(,"class")
## [1] "plot.INLAjoint" "list"
M2 <- joint(formSurv=list(inla.surv(time = t.stop, event = event) ~ -1),
            basRisk=c("rw2"),  dataSurv = list(terminalData))
summary(M2)
## 
## Survival outcome
##                            mean     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance) 0.0283 0.0754      1e-04   0.0059     0.2062
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -27.04279                             -27.04279 
## 
## Deviance Information Criterion:  5.486813
## Widely applicable Bayesian information criterion:  3.93188
## Computation time: 0.97 seconds
plot(M2)$Baseline+scale_y_log10()

M12 <- joint(formSurv=list(inla.surv(time = t.stop, event = event) ~ -1+(1|id),
                           inla.surv(time = t.stop, event = death) ~ -1), id="id",
             basRisk=c("rw2", "rw2"), assocSurv=TRUE,
             dataSurv = list(readmission,terminalData))
summary(M12)
## 
## Survival outcome (S1)
##                               mean     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance)_S1 0.0291 0.0173     0.0081    0.025     0.0745
## 
## Frailty term variance (S1)
##                  mean     sd 0.025quant 0.5quant 0.975quant
## IDIntercept_S1 0.9113 0.3461     0.4192    0.849     1.7667
## 
## Survival outcome (S2)
##                               mean     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance)_S2 0.0127 0.0058     0.0049   0.0116     0.0272
## 
## Association survival - survival
##                    mean     sd 0.025quant 0.5quant 0.975quant
## IDIntercept_S1_S2 1.938 0.3073     1.3342   1.9377     2.5441
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -2491.607                             -2491.607 
## 
## Deviance Information Criterion:  4756.345
## Widely applicable Bayesian information criterion:  4746.463
## Computation time: 2.2 seconds

My code

# First I have to combine (back) the tfs and rfs data in one dataframe
surv_data <- merge(bcr_data_v3[, c('pseudoid', 'drfs_status', 'drfs_time')],
                   tfs_data[, c('pseudoid', 'tfs_status', 'tfs_time')], on='pseudoid', all.x=TRUE)
# New ID
surv_data$id <- 1:nrow(surv_data)
# Re-Order
surv_data <- surv_data[, c(6, 5, 4, 3, 2)]
# Remove where both time to treatment/bcr = 0 and event = 0 (this should be update when converting time to days)
all_null_tfs <- surv_data[surv_data$tfs_time==0 & surv_data$tfs_status==0, ]
surv_data <- surv_data[!(surv_data$id %in% all_null_tfs$id), ]
all_null_drfs <- surv_data[surv_data$drfs_time==0 & surv_data$drfs_status==0, ]
surv_data <- surv_data[!(surv_data$id %in% all_null_drfs$id), ]
# Now if no BCR data then rfs status = 0 and rfs time = tfs time [to fill out the NULL cells]
surv_data$drfs_status <- ifelse(is.na(surv_data$drfs_status), 0, 0)
surv_data$drfs_time <- ifelse(is.na(surv_data$drfs_time), surv_data$tfs_time, surv_data$drfs_time)

# Remove when time to tx/bcr = 0
all_null_tfs_drfs <- surv_data[surv_data$tfs_time==0 & surv_data$drfs_time==0, ]
surv_data <- surv_data[!(surv_data$id %in% all_null_tfs_drfs$id), ]
M1 <- joint(formSurv =list(inla.surv(time = tfs_time, event = tfs_status) ~ -1),
            basRisk = c("rw2"),  
            dataSurv = list(surv_data))
summary(M1)
## 
## Survival outcome
##                            mean     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance) 0.6014 0.2514     0.2529   0.5536     1.2299
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -5874.327                             -5874.327 
## 
## Deviance Information Criterion:  11683.18
## Widely applicable Bayesian information criterion:  11683.43
## Computation time: 1.09 seconds
plot(M1)
## $Baseline

## 
## attr(,"class")
## [1] "plot.INLAjoint" "list"
M2 <- joint(formSurv = list(inla.surv(time = drfs_time, event = drfs_status) ~ -1),
            basRisk = c("rw2"),
            dataSurv = list(surv_data))
summary(M2)
## 
## Survival outcome
##                            mean     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance) 0.0284 0.0768          0   0.0058     0.2082
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -23.81622                             -23.81622 
## 
## Deviance Information Criterion:  4.900577
## Widely applicable Bayesian information criterion:  3.413754
## Computation time: 1.31 seconds
plot(M2)$Baseline+scale_y_log10()

M12 <- joint(formSurv=list(inla.surv(time = tfs_time, event = tfs_status) ~ -1+(1|id),
                           inla.surv(time = drfs_time, event = drfs_status) ~ -1), id="id",
             basRisk=c("rw2", "rw2"), assocSurv=TRUE, NbasRisk = 30,
             
             dataSurv = list(surv_data))
summary(M12, hr=TRUE)
## 
## Survival outcome (S1)
##                             exp(mean)     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance)_S1    0.5965 0.2421     0.2565    0.552     1.1979
## 
## Frailty term variance (S1)
##                  mean     sd 0.025quant 0.5quant 0.975quant
## IDIntercept_S1 0.0177 0.0156     0.0027   0.0131     0.0607
## 
## Survival outcome (S2)
##                             exp(mean)     sd 0.025quant 0.5quant 0.975quant
## Baseline risk (variance)_S2     0.026 0.0669      1e-04   0.0058     0.1869
## 
## Association survival - survival
##                   exp(mean)    sd 0.025quant 0.5quant 0.975quant
## IDIntercept_S1_S2    1.6871 1.972     0.1447   1.0413     7.2437
## 
## log marginal-likelihood (integration)    log marginal-likelihood (Gaussian) 
##                             -6943.721                             -6943.721 
## 
## Deviance Information Criterion:  13579.76
## Widely applicable Bayesian information criterion:  13576.21
## Computation time: 3.73 seconds
plot(M12)$Baseline+scale_y_log10()