Intro

This RMD is a summary of the work done between 04-02-2024 and 11-02-2024. The aim is to create a joint-model that includes time to treatment, time to recurrence, and longitudinal PSA data. This file includes the needed packages (mainly INLA and INLAJoint), the implementation functions and the part of the interpretation. The modeling is in Part2 ## Packages

Installation

# No packages to install, so far

Loading

library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Import data

abx <- read.csv("datafolder/bx.csv", row.names = c('X'))
apsa <- read.csv("datafolder/psa.csv", row.names = c('X'))
abcr <- read.csv("datafolder/bcr.csv", row.names = c('X'))
atfs <- read.csv("datafolder/tfs.csv", row.names = c('X'))

Preliminary cleaning

# Biopsy-related data
abx$grade_group <- as.factor(abx$grade_group)
abx$mri <- as.factor(abx$mri)
abx$bx_date <- as.Date(abx$bx_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$mri_date <- as.Date(abx$mri_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$last_data_date <- as.Date(abx$last_data_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$bx_age <- cut(abx$age_at_bx, 
                  breaks = c(-Inf, 60, 70, Inf), 
                  labels = c("<60", "60-70", ">70"),
                  include.lowest = TRUE)

# Recurrence-related data
abcr$grade_group <- as.factor(abcr$grade_group)
abcr$mri <- as.factor(abcr$mri)
#abcr$after_tx <- as.factor(abcr$after_tx)
abcr$tx_date <- as.Date(abcr$tx_date, format = "%Y-%m-%d", na.rm = FALSE)
abcr$bcr_date <- as.Date(abcr$bcr_date, format = "%Y-%m-%d", na.rm = FALSE)

# Treatment-related data
atfs$grade_group <- as.factor(atfs$grade_group)
atfs$mri <- as.factor(atfs$mri)
atfs$type_of_treatment <- ifelse(atfs$type_of_treatment=='', 'tfs-unk', atfs$type_of_treatment)
atfs$type_of_treatment <- as.factor(atfs$type_of_treatment)
atfs$treatment_date <- as.Date(atfs$treatment_date, format = "%Y-%m-%d", na.rm = FALSE)
names(atfs)[names(atfs) == "treatment_date"] <- "tx_date"
names(atfs)[names(atfs) == "type_of_treatment"] <- "tx1_type"

# PSA-related data
apsa$psa_date <- as.Date(apsa$psa_date, format = "%Y-%m-%d", na.rm = FALSE)

Building the data

Survival

##################################### 1- SURVIVAL DATA
# -> TFS and RFS
surv_ <- merge(abcr[, c('id', 'drfs_status', 'drfs_time', 'after_tx')],
                   abx[, c('id', 'grade_group', 'mri','bx_age')], on='id', all=TRUE)

surv_ <- merge(surv_,
                   atfs[, c('id', 'tx1_type','tx_date', 'tfs_status', 'tfs_time')], on='id', all=TRUE)

# Re-Order
surv_ <- surv_[, c(1, 7, 6, 5, 8, 9, 11,10,4, 3, 2)]

# Remove where both time to treatment/bcr = 0 and event = 0 (this should be update when converting time to days)
all_null_tfs <- surv_[surv_$tfs_time==0 & surv_$tfs_status==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_tfs$id), ]
all_null_drfs <- surv_[surv_$drfs_time==0 & surv_$drfs_status==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_drfs$id), ]

# Now if no BCR data then rfs status = 0 and rfs_time = tfs_time [to fill out the NA cells]
surv_$drfs_status <- ifelse(is.na(surv_$drfs_status), 0, surv_$drfs_status)
surv_$drfs_time <- ifelse(is.na(surv_$drfs_time), surv_$tfs_time, surv_$drfs_time)

# Remove remaining patients with time to tx+bcr = 0 (follow up time = 0)
all_null_tfs_drfs <- surv_[surv_$tfs_time==0 & surv_$drfs_time==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_tfs_drfs$id), ]

# Filling out the NA treatments of the BCR (the BCRs that we don't know the CT)
surv_$after_tx <- ifelse(is.na(surv_$after_tx), 'bcr-unk', surv_$after_tx)
surv_$after_tx <- as.factor(surv_$after_tx)

# Make sure the status are factors
surv_$tfs_status <- as.factor(surv_$tfs_status)
surv_$drfs_status <- as.factor(surv_$drfs_status)

Longitudinal

##################################### 2- LONGITUDINAL DATA
# Add log PSA
long_ <- apsa

long_ <- long_[with(long_, order(id, time_to_psa)), ]

# ------ Make sure we have for each patient, PSA at reference time t=o
# Step 1: Define a function to calculate the average PSA value at time=0 for each patient
calculate_psa_at_t0 <- function(patient_data) {
  # Make sure it's ordered
  patient_data <- patient_data[with(patient_data, order( time_to_psa)), ]
  
  psas_before_t0 <- patient_data$psa_results[patient_data$time_to_psa < 0]
  psas_after_t0 <- patient_data$psa_results[patient_data$time_to_psa > 0]

  # if he has psa only before 0
  if(length(psas_before_t0) > 0 && length(psas_after_t0) == 0){
    return(psas_before_t0[length(psas_before_t0)])
  }
  # if he has psa only after 0
  if(length(psas_after_t0) > 0 && length(psas_before_t0) == 0){
    return(psas_after_t0[1])
  }
  
  # if he has psa both before abd after
  if(length(psas_after_t0) > 0 && length(psas_before_t0) > 0){
    return((psas_before_t0[length(psas_before_t0)] + psas_after_t0[1])/2)
  }
}

# Step 2: Calculate average PSA value at time=0 for each patient
unique_ids <- unique(long_$id)
for (pid in unique_ids) {
  # Subset data for the current patient
  patient_data <- long_[long_$id == pid,]
  #cat(patient_data$id)
  psa_at_t0 <- patient_data$psa_results[patient_data$time_to_psa == 0]
  if(length(psa_at_t0) == 0){# No data at 0
    # Calculate PSA value at time=0 for this patient
    psa_t0 <- calculate_psa_at_t0(patient_data)
    #cat("\nPID", pid)
    new_row = c(id = pid, psa_results = psa_t0, psa_date=NA, time_to_psa = 0)
    #cat("\nNew data:")
    #cat(new_row)
    long_ <- rbind(long_, new_row)
  }
  
}
# Make sure it is sorted
long_ <- long_[with(long_, order(id, time_to_psa)), ]
# ------ End

# Add log PSA
long_$log_psa <- log(long_$psa_results + 1)
# Merge to update IDs
surv_long_ <- merge(surv_, long_, by='id', all.x=T)

# Reassigning ID
# -> Calculate run-length encoding of the original id column
id_rle <- rle(surv_long_$id)

# -> Generate a sequence of IDs based on the lengths of runs in the original id column
new_id <- rep(seq_along(id_rle$lengths), id_rle$lengths)

# -> Reassign the new IDs to the id column in surv_long_data
surv_long_$id <- new_id

# THE Surv data to be used in the latter join model 
surv <- surv_long_[!duplicated(surv_long_$id), names(surv_)]
# Make sure it is sorted
surv <- surv[with(surv, order(id)), ]

# THE Long data to be used in the latter join model 
long <- surv_long_[, names(long_) ]
long <- long[with(long, order(id, time_to_psa)), ]

Descriptive stat

summary(surv)
##        id          bx_age     mri       grade_group    tx1_type   
##  Min.   :    1   <60  :2086   0:10012   1:3533      chemo  :   6  
##  1st Qu.: 2944   60-70:5177   1: 1761   2:3197      horm   :2711  
##  Median : 5887   >70  :4510             3:2426      radio  :4029  
##  Mean   : 5887                          4:1140      rp     :2685  
##  3rd Qu.: 8830                          5:1477      tfs-unk:2342  
##  Max.   :11773                                                    
##                                                                   
##     tx_date              tfs_time      tfs_status    after_tx   
##  Min.   :1999-06-07   Min.   :  0.00   0:2342     bcr-unk:2884  
##  1st Qu.:2010-10-14   1st Qu.:  2.00   1:9431     rp     :2700  
##  Median :2014-02-11   Median :  4.00              rt     :6189  
##  Mean   :2013-07-31   Mean   : 25.72                            
##  3rd Qu.:2016-12-15   3rd Qu.: 28.00                            
##  Max.   :2021-04-29   Max.   :259.00                            
##  NA's   :2342                                                   
##    drfs_time      drfs_status
##  Min.   :  1.00   0:8932     
##  1st Qu.: 30.00   1:2841     
##  Median : 57.00              
##  Mean   : 68.05              
##  3rd Qu.: 97.00              
##  Max.   :284.00              
## 
summary(long)
##        id         psa_results          psa_date           time_to_psa     
##  Min.   :    1   Min.   :    0.00   Min.   :1998-05-26   Min.   :-244.00  
##  1st Qu.: 2925   1st Qu.:    0.08   1st Qu.:2011-01-04   1st Qu.:   1.00  
##  Median : 5876   Median :    1.20   Median :2015-01-23   Median :  24.00  
##  Mean   : 5865   Mean   :   13.54   Mean   :2014-04-05   Mean   :  30.05  
##  3rd Qu.: 8771   3rd Qu.:    6.00   3rd Qu.:2018-03-13   3rd Qu.:  58.00  
##  Max.   :11773   Max.   :76800.00   Max.   :2021-04-08   Max.   : 284.00  
##                  NA's   :26         NA's   :7534         NA's   :26       
##     log_psa        
##  Min.   : 0.00000  
##  1st Qu.: 0.07696  
##  Median : 0.78846  
##  Mean   : 1.13457  
##  3rd Qu.: 1.94591  
##  Max.   :11.24897  
##  NA's   :26
barplot(table(surv$grade_group), main = "Distribution of grade_group", xlab = "Grade Group", ylab = "Count")

barplot(table(surv$bx_age), main = " Age at Biopsy", xlab = "Age at Bx", ylab = "Count")

hist(as.numeric(long$log_psa)[long$time_to_psa==0], main = "Dist LOG PSA at baseline (ref)", xlab = "log(PSA+1)", ylab = "Frequency", breaks = 30)

hist(as.numeric(long$log_psa), main = "Dist LOG PSA", xlab = "log(PSA+1)", ylab = "Frequency", breaks = 30)

# Calculate the average log_psa_result
average_log_psa <- mean(long$log_psa[long$time_to_psa == 0])
# Center the log_psa_result around its average
centered_log_psa <- long$log_psa - average_log_psa
# Plot the histogram
#hist(centered_log_psa, main = "Centered Dist LOG PSA at baseline", xlab = "log(PSA+1) - avg", ylab = "Frequency", breaks = 30)
hist(long$time_to_psa, main = "Distribution of Time to PSA", xlab = "Time to PSA", ylab = "Frequency")

#### Spaghetti Plots

# ggplot2 has a nicer plots
pos_apsa <- long[long$time_to_psa>0,]
meta_apsa <- merge(pos_apsa,
                   surv[, c('id', 'grade_group', 'mri','bx_age')], on='id', all.y=TRUE)
p <- ggplot(data = meta_apsa, aes(x = time_to_psa, y = log_psa, group = id, colour=grade_group))

# all
p + 
  geom_line() + 
  stat_smooth(aes(group = 1), color='red') + 
  stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3)
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 45 rows containing non-finite values (`stat_summary()`).
## Warning: Removed 45 rows containing missing values (`geom_line()`).

# By GG
p + 
  geom_line() + 
  stat_smooth(aes(group = 1), color='red') + 
  #stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3, color='yellow')+ 
  facet_grid(. ~ grade_group)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Removed 45 rows containing missing values (`geom_line()`).

p + 
  geom_line() + 
  stat_smooth(aes(group = 1), color='red') + 
  #stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3, color='yellow')+ 
  facet_grid(. ~ mri)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Removed 45 rows containing missing values (`geom_line()`).

Save Data (Modeling is in Part2)

write.csv(surv, "processed/surv.csv", row.names = FALSE)
write.csv(long, "processed/long.csv", row.names = FALSE)