This RMD is a summary of the work done between 04-02-2024 and 11-02-2024. The aim is to create a joint-model that includes time to treatment, time to recurrence, and longitudinal PSA data. This file includes the needed packages (mainly INLA and INLAJoint), the implementation functions and the part of the interpretation. The modeling is in Part2 ## Packages
# No packages to install, so far
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
abx <- read.csv("datafolder/bx.csv", row.names = c('X'))
apsa <- read.csv("datafolder/psa.csv", row.names = c('X'))
abcr <- read.csv("datafolder/bcr.csv", row.names = c('X'))
atfs <- read.csv("datafolder/tfs.csv", row.names = c('X'))
# Biopsy-related data
abx$grade_group <- as.factor(abx$grade_group)
abx$mri <- as.factor(abx$mri)
abx$bx_date <- as.Date(abx$bx_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$mri_date <- as.Date(abx$mri_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$last_data_date <- as.Date(abx$last_data_date, format = "%Y-%m-%d", na.rm = FALSE)
abx$bx_age <- cut(abx$age_at_bx,
breaks = c(-Inf, 60, 70, Inf),
labels = c("<60", "60-70", ">70"),
include.lowest = TRUE)
# Recurrence-related data
abcr$grade_group <- as.factor(abcr$grade_group)
abcr$mri <- as.factor(abcr$mri)
#abcr$after_tx <- as.factor(abcr$after_tx)
abcr$tx_date <- as.Date(abcr$tx_date, format = "%Y-%m-%d", na.rm = FALSE)
abcr$bcr_date <- as.Date(abcr$bcr_date, format = "%Y-%m-%d", na.rm = FALSE)
# Treatment-related data
atfs$grade_group <- as.factor(atfs$grade_group)
atfs$mri <- as.factor(atfs$mri)
atfs$type_of_treatment <- ifelse(atfs$type_of_treatment=='', 'tfs-unk', atfs$type_of_treatment)
atfs$type_of_treatment <- as.factor(atfs$type_of_treatment)
atfs$treatment_date <- as.Date(atfs$treatment_date, format = "%Y-%m-%d", na.rm = FALSE)
names(atfs)[names(atfs) == "treatment_date"] <- "tx_date"
names(atfs)[names(atfs) == "type_of_treatment"] <- "tx1_type"
# PSA-related data
apsa$psa_date <- as.Date(apsa$psa_date, format = "%Y-%m-%d", na.rm = FALSE)
##################################### 1- SURVIVAL DATA
# -> TFS and RFS
surv_ <- merge(abcr[, c('id', 'drfs_status', 'drfs_time', 'after_tx')],
abx[, c('id', 'grade_group', 'mri','bx_age')], on='id', all=TRUE)
surv_ <- merge(surv_,
atfs[, c('id', 'tx1_type','tx_date', 'tfs_status', 'tfs_time')], on='id', all=TRUE)
# Re-Order
surv_ <- surv_[, c(1, 7, 6, 5, 8, 9, 11,10,4, 3, 2)]
# Remove where both time to treatment/bcr = 0 and event = 0 (this should be update when converting time to days)
all_null_tfs <- surv_[surv_$tfs_time==0 & surv_$tfs_status==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_tfs$id), ]
all_null_drfs <- surv_[surv_$drfs_time==0 & surv_$drfs_status==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_drfs$id), ]
# Now if no BCR data then rfs status = 0 and rfs_time = tfs_time [to fill out the NA cells]
surv_$drfs_status <- ifelse(is.na(surv_$drfs_status), 0, surv_$drfs_status)
surv_$drfs_time <- ifelse(is.na(surv_$drfs_time), surv_$tfs_time, surv_$drfs_time)
# Remove remaining patients with time to tx+bcr = 0 (follow up time = 0)
all_null_tfs_drfs <- surv_[surv_$tfs_time==0 & surv_$drfs_time==0, ]
surv_ <- surv_[!(surv_$id %in% all_null_tfs_drfs$id), ]
# Filling out the NA treatments of the BCR (the BCRs that we don't know the CT)
surv_$after_tx <- ifelse(is.na(surv_$after_tx), 'bcr-unk', surv_$after_tx)
surv_$after_tx <- as.factor(surv_$after_tx)
# Make sure the status are factors
surv_$tfs_status <- as.factor(surv_$tfs_status)
surv_$drfs_status <- as.factor(surv_$drfs_status)
##################################### 2- LONGITUDINAL DATA
# Add log PSA
long_ <- apsa
long_ <- long_[with(long_, order(id, time_to_psa)), ]
# ------ Make sure we have for each patient, PSA at reference time t=o
# Step 1: Define a function to calculate the average PSA value at time=0 for each patient
calculate_psa_at_t0 <- function(patient_data) {
# Make sure it's ordered
patient_data <- patient_data[with(patient_data, order( time_to_psa)), ]
psas_before_t0 <- patient_data$psa_results[patient_data$time_to_psa < 0]
psas_after_t0 <- patient_data$psa_results[patient_data$time_to_psa > 0]
# if he has psa only before 0
if(length(psas_before_t0) > 0 && length(psas_after_t0) == 0){
return(psas_before_t0[length(psas_before_t0)])
}
# if he has psa only after 0
if(length(psas_after_t0) > 0 && length(psas_before_t0) == 0){
return(psas_after_t0[1])
}
# if he has psa both before abd after
if(length(psas_after_t0) > 0 && length(psas_before_t0) > 0){
return((psas_before_t0[length(psas_before_t0)] + psas_after_t0[1])/2)
}
}
# Step 2: Calculate average PSA value at time=0 for each patient
unique_ids <- unique(long_$id)
for (pid in unique_ids) {
# Subset data for the current patient
patient_data <- long_[long_$id == pid,]
#cat(patient_data$id)
psa_at_t0 <- patient_data$psa_results[patient_data$time_to_psa == 0]
if(length(psa_at_t0) == 0){# No data at 0
# Calculate PSA value at time=0 for this patient
psa_t0 <- calculate_psa_at_t0(patient_data)
#cat("\nPID", pid)
new_row = c(id = pid, psa_results = psa_t0, psa_date=NA, time_to_psa = 0)
#cat("\nNew data:")
#cat(new_row)
long_ <- rbind(long_, new_row)
}
}
# Make sure it is sorted
long_ <- long_[with(long_, order(id, time_to_psa)), ]
# ------ End
# Add log PSA
long_$log_psa <- log(long_$psa_results + 1)
# Merge to update IDs
surv_long_ <- merge(surv_, long_, by='id', all.x=T)
# Reassigning ID
# -> Calculate run-length encoding of the original id column
id_rle <- rle(surv_long_$id)
# -> Generate a sequence of IDs based on the lengths of runs in the original id column
new_id <- rep(seq_along(id_rle$lengths), id_rle$lengths)
# -> Reassign the new IDs to the id column in surv_long_data
surv_long_$id <- new_id
# THE Surv data to be used in the latter join model
surv <- surv_long_[!duplicated(surv_long_$id), names(surv_)]
# Make sure it is sorted
surv <- surv[with(surv, order(id)), ]
# THE Long data to be used in the latter join model
long <- surv_long_[, names(long_) ]
long <- long[with(long, order(id, time_to_psa)), ]
summary(surv)
## id bx_age mri grade_group tx1_type
## Min. : 1 <60 :2086 0:10012 1:3533 chemo : 6
## 1st Qu.: 2944 60-70:5177 1: 1761 2:3197 horm :2711
## Median : 5887 >70 :4510 3:2426 radio :4029
## Mean : 5887 4:1140 rp :2685
## 3rd Qu.: 8830 5:1477 tfs-unk:2342
## Max. :11773
##
## tx_date tfs_time tfs_status after_tx
## Min. :1999-06-07 Min. : 0.00 0:2342 bcr-unk:2884
## 1st Qu.:2010-10-14 1st Qu.: 2.00 1:9431 rp :2700
## Median :2014-02-11 Median : 4.00 rt :6189
## Mean :2013-07-31 Mean : 25.72
## 3rd Qu.:2016-12-15 3rd Qu.: 28.00
## Max. :2021-04-29 Max. :259.00
## NA's :2342
## drfs_time drfs_status
## Min. : 1.00 0:8932
## 1st Qu.: 30.00 1:2841
## Median : 57.00
## Mean : 68.05
## 3rd Qu.: 97.00
## Max. :284.00
##
summary(long)
## id psa_results psa_date time_to_psa
## Min. : 1 Min. : 0.00 Min. :1998-05-26 Min. :-244.00
## 1st Qu.: 2925 1st Qu.: 0.08 1st Qu.:2011-01-04 1st Qu.: 1.00
## Median : 5876 Median : 1.20 Median :2015-01-23 Median : 24.00
## Mean : 5865 Mean : 13.54 Mean :2014-04-05 Mean : 30.05
## 3rd Qu.: 8771 3rd Qu.: 6.00 3rd Qu.:2018-03-13 3rd Qu.: 58.00
## Max. :11773 Max. :76800.00 Max. :2021-04-08 Max. : 284.00
## NA's :26 NA's :7534 NA's :26
## log_psa
## Min. : 0.00000
## 1st Qu.: 0.07696
## Median : 0.78846
## Mean : 1.13457
## 3rd Qu.: 1.94591
## Max. :11.24897
## NA's :26
barplot(table(surv$grade_group), main = "Distribution of grade_group", xlab = "Grade Group", ylab = "Count")
barplot(table(surv$bx_age), main = " Age at Biopsy", xlab = "Age at Bx", ylab = "Count")
hist(as.numeric(long$log_psa)[long$time_to_psa==0], main = "Dist LOG PSA at baseline (ref)", xlab = "log(PSA+1)", ylab = "Frequency", breaks = 30)
hist(as.numeric(long$log_psa), main = "Dist LOG PSA", xlab = "log(PSA+1)", ylab = "Frequency", breaks = 30)
# Calculate the average log_psa_result
average_log_psa <- mean(long$log_psa[long$time_to_psa == 0])
# Center the log_psa_result around its average
centered_log_psa <- long$log_psa - average_log_psa
# Plot the histogram
#hist(centered_log_psa, main = "Centered Dist LOG PSA at baseline", xlab = "log(PSA+1) - avg", ylab = "Frequency", breaks = 30)
hist(long$time_to_psa, main = "Distribution of Time to PSA", xlab = "Time to PSA", ylab = "Frequency")
#### Spaghetti Plots
# ggplot2 has a nicer plots
pos_apsa <- long[long$time_to_psa>0,]
meta_apsa <- merge(pos_apsa,
surv[, c('id', 'grade_group', 'mri','bx_age')], on='id', all.y=TRUE)
p <- ggplot(data = meta_apsa, aes(x = time_to_psa, y = log_psa, group = id, colour=grade_group))
# all
p +
geom_line() +
stat_smooth(aes(group = 1), color='red') +
stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3)
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 45 rows containing non-finite values (`stat_summary()`).
## Warning: Removed 45 rows containing missing values (`geom_line()`).
# By GG
p +
geom_line() +
stat_smooth(aes(group = 1), color='red') +
#stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3, color='yellow')+
facet_grid(. ~ grade_group)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Removed 45 rows containing missing values (`geom_line()`).
p +
geom_line() +
stat_smooth(aes(group = 1), color='red') +
#stat_summary(aes(group = 1), geom = "point", fun.y = mean, shape = 17, size = 3, color='yellow')+
facet_grid(. ~ mri)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 45 rows containing non-finite values (`stat_smooth()`).
## Removed 45 rows containing missing values (`geom_line()`).
write.csv(surv, "processed/surv.csv", row.names = FALSE)
write.csv(long, "processed/long.csv", row.names = FALSE)