WGS-TL and DTA

Libraries

library(readr)
library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(fuzzyjoin)
library(epitools)
Warning: package 'epitools' was built under R version 4.5.2
library(ggplot2)
library(ggpubr)
library(moments)
Warning: package 'moments' was built under R version 4.5.2
library(gtsummary)
library(survival)

Attaching package: 'survival'

The following object is masked from 'package:epitools':

    ratetable
library(survminer) 

Attaching package: 'survminer'

The following object is masked from 'package:survival':

    myeloma

Files

load("G:/GroupAccess/Pulm-Research/John Kim/PFF Precisions/Telomere Length/Datasets/topmed.nomogram.Rdata")

DTA <- read_csv("G:/GroupAccess/Pulm-Research/Sam Konkol/PFF data/PFFR_qct_delivery-20230214.csv", 
    col_types = cols(study_date = col_date(format = "%m/%d/%Y")))

PFF <- read_csv("G:/GroupAccess/Pulm-Research/Sam Konkol/PFF data/Precisions Dataset January 2024.csv", 
    col_types = cols(Consent_Date = col_date(format = "%m/%d/%Y")))

SSID_SID <- read_csv("G:/GroupAccess/Pulm-Research/Sam Konkol/PFF data/SID Lookup Table February 2023.csv")
Rows: 1391 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): SSID
dbl (1): SID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
telomere <- read_csv("G:/GroupAccess/Pulm-Research/Sam Konkol/PFF data/PRECIS_telomere_v1.csv")
Rows: 1387 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): sid
dbl (3): tellen, telfracdup, telfracmap

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CT_pattern <- read_csv("G:/GroupAccess/Pulm-Research/Sam Konkol/PC37 Outcomes/Data Files/pffr_reads_consolidated.20230214-113447.csv")
Rows: 1340 Columns: 36
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): record_id, subject_id, visit, sex, technically_adequate, inadequat...
dbl (27): scan_date, inadequate_reasons___resolution, inadequate_reasons___i...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mil <- read_csv("G:/GroupAccess/Pulm-Research/John Kim/PFF Registry Data/HRCT/pff-pr_mil-uip_01Mar2024.csv", 
    col_types = cols(ct_date = col_date(format = "%m/%d/%Y")))

Clean

# Add clean Diagnosis columns (IPF, CTD-ILD, FHP, IIP, and other)
PFF <- PFF %>%
  mutate(
    Dx = case_when(
      DiagCateg_Current == "Collagen Vascular Disease/Autoimmune Diseases" ~ "CTD-ILD",
      DiagCateg_Current == "Hypersensitivity pneumonitis"                  ~ "FHP",
      DiagCateg_Current %in% c("Other specific ILD", "Occupational exposure") ~ "other",
      Primary_Diag_Current == "IIP: Idiopathic pulmonary fibrosis"          ~ "IPF",
      TRUE                                                                  ~ "IIP"
    )
  )

# Link PFF to telomere and Automated CT detection variables
colnames(telomere)[1] <- "SID"
Telomere_linkage <- merge(telomere, SSID_SID, by = "SID")
PFF_telomere <- merge(PFF, Telomere_linkage, by = "SSID")
colnames(DTA)[2] <- "SSID"

complete <- PFF_telomere %>%
  left_join(DTA, by = "SSID") %>%
  left_join(mil, by = "SSID")

# Replace NA with Zero in Cigarette Years Column
complete <- complete %>%
  mutate(CigYears = replace_na(CigYears, 0))

# rename MIL-UIP for ease
colnames(complete)[159] <- "mil"

# Generate above/below 10th percentile age adjusted telomere length column
complete <- complete %>%
  mutate(
    age_nom = floor(AgeConsent),                 # map to integer age
    age_nom = pmin(pmax(age_nom, 18), 95)        # clamp to 18–95
  ) %>%
  left_join(
    df.mesa.copd.normogram %>%
      mutate(age_nom = as.integer(age)) %>%      # <-- change "Age" if your nomogram age column has a different name
      select(age_nom, p10 = `adj.telomere.10th`),             # <-- change `10th` if the column name differs
    by = "age_nom"
  ) %>%
  mutate(
    Below10th_age_adj = if_else(
      !is.na(tellen) & !is.na(p10) & tellen <= p10,
      TRUE, FALSE,
      missing = NA
    )
  ) %>%
  select(-age_nom, -p10)

# Generate above/below 50th percentile age adjusted telomere length column
complete <- complete %>%
  mutate(
    age_nom = floor(AgeConsent),                 # map to integer age
    age_nom = pmin(pmax(age_nom, 18), 95)        # clamp to 18–95
  ) %>%
  left_join(
    df.mesa.copd.normogram %>%
      mutate(age_nom = as.integer(age)) %>%      # <-- change "Age" if your nomogram age column has a different name
      select(age_nom, p50 = `adj.telomere.50th`),             # <-- change `10th` if the column name differs
    by = "age_nom"
  ) %>%
  mutate(
    Below50th_age_adj = if_else(
      !is.na(tellen) & !is.na(p50) & tellen <= p50,
      TRUE, FALSE,
      missing = NA
    )
  ) %>%
  select(-age_nom)

# Generate age adjusted telomere length
complete$AgeAdjTL <- complete$tellen - complete$p50 

# Filter to those with automated CT values within 1 year of consent
complete_1yr <- complete %>%
  mutate(
    gap_dta = abs(as.numeric(difftime(Consent_Date, study_date, units = "days"))),
    gap_mil    = abs(as.numeric(difftime(Consent_Date, ct_date,    units = "days"))),
    gap_sum   = gap_dta + gap_mil   # or use pmax(gap_study, gap_ct)
  ) %>%
  filter(
    !is.na(gap_dta), !is.na(gap_mil),
    gap_dta <= 365,
    gap_mil    <= 365
  ) %>%
  group_by(SSID) %>%
  slice_min(order_by = gap_sum, n = 1, with_ties = FALSE) %>%
  ungroup() 

Testing normality

### Continuous Dependent variables ###
# MIL_UIP
hist(complete_1yr$mil, breaks = 30, main = "Histogram", xlab = "MIL-UIP")

qqnorm(complete_1yr$mil)
qqline(complete_1yr$mil, col = "red")

shapiro.test(complete_1yr$mil)

    Shapiro-Wilk normality test

data:  complete_1yr$mil
W = 0.81406, p-value < 2.2e-16
skewness(complete_1yr$mil)
[1] -1.167784
# DTA
hist(complete_1yr$deepdta_fibrosis_score, breaks = 30, main = "Histogram", xlab = "DTA Fibrosis Score")

qqnorm(complete_1yr$deepdta_fibrosis_score)
qqline(complete_1yr$deepdta_fibrosis_score, col = "red")

shapiro.test(complete_1yr$deepdta_fibrosis_score)

    Shapiro-Wilk normality test

data:  complete_1yr$deepdta_fibrosis_score
W = 0.97257, p-value = 1.603e-07
skewness(complete_1yr$deepdta_fibrosis_score)
[1] 0.5231103
### Indenpendent variables ###
# WGS_TL
hist(complete_1yr$tellen, breaks = 30, main = "Histogram", xlab = "Raw TL")

qqnorm(complete_1yr$tellen)
qqline(complete_1yr$tellen, col = "red")

shapiro.test(complete_1yr$tellen)

    Shapiro-Wilk normality test

data:  complete_1yr$tellen
W = 0.87245, p-value < 2.2e-16
skewness(complete_1yr$tellen)
[1] 1.590429
# age
hist(complete_1yr$AgeConsent, breaks = 30, main = "Histogram", xlab = "Age at Consent")

qqnorm(complete_1yr$AgeConsent)
qqline(complete_1yr$AgeConsent, col = "red")

shapiro.test(complete_1yr$AgeConsent)

    Shapiro-Wilk normality test

data:  complete_1yr$AgeConsent
W = 0.92615, p-value = 3.685e-14
skewness(complete_1yr$AgeConsent)
[1] -1.222365
# fvc
hist(complete_1yr$pctFVC, breaks = 30, main = "Histogram", xlab = "FVC pp")

qqnorm(complete_1yr$pctFVC)
qqline(complete_1yr$pctFVC, col = "red")

shapiro.test(complete_1yr$pctFVC)

    Shapiro-Wilk normality test

data:  complete_1yr$pctFVC
W = 0.98701, p-value = 0.0006448
skewness(complete_1yr$pctFVC)
[1] NA
# pack_years
hist(complete_1yr$CigYears, breaks = 30, main = "Histogram", xlab = "DTA Fibrosis Score")

qqnorm(complete_1yr$CigYears)
qqline(complete_1yr$CigYears, col = "red")

shapiro.test(complete_1yr$CigYears)

    Shapiro-Wilk normality test

data:  complete_1yr$CigYears
W = 0.82721, p-value < 2.2e-16
skewness(complete_1yr$CigYears)
[1] 0.8601929

Continuous Correlations of DTA

ggscatter(
  complete_1yr,
  x = "tellen",
  y = "deepdta_fibrosis_score",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman",
  xlab = "Raw Telomere Length",
  ylab = "Deep DTA fibrosis score"
)
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
ℹ The deprecated feature was likely used in the ggpubr package.
  Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
ℹ The deprecated feature was likely used in the ggpubr package.
  Please report the issue at <https://github.com/kassambara/ggpubr/issues>.

ggscatter(
  complete_1yr,
  x = "AgeAdjTL",
  y = "deepdta_fibrosis_score",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman",
  xlab = "Raw Telomere Length",
  ylab = "Deep DTA fibrosis score"
)

Continuous Correlation - MIL

ggscatter(
  complete_1yr,
  x = "tellen",
  y = "mil",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman",
  xlab = "Raw Telomere Length",
  ylab = "MIL-UIP"
)

ggscatter(
  complete_1yr,
  x = "AgeAdjTL",
  y = "mil",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman",
  xlab = "Age Adjusted Telomere Length",
  ylab = "MIL-UIP"
)

Clininical/Radiologic Features Above/Below 10th

# Merge with CT pattern
colnames(CT_pattern)[2] <- "SSID"
table_1df <- merge(complete_1yr, CT_pattern, by = "SSID") %>%
  merge(mil, by = "SSID")
table_1df$CT_pattern <- ifelse(
  table_1df$parenchymal_pattern %in% c("definite", "probable"),
  "UIP",
  "non-UIP"
)

table_1df$Male <- ifelse(table_1df$Sex == "Male", 1, 0)
table_1df$White <- ifelse(table_1df$Race == "White", 1, 0)
table_1df$IPF <- ifelse(table_1df$Dx == "IPF", "IPF", "non-IPF")


tbl_summary(
  table_1df,
  by = "Below10th_age_adj",
  include = c("AgeConsent",
              "Male",
              "White",
              "SmokingHistory",
              "CigYears",
              "IPF",
              "CT_pattern",
              "deepdta_fibrosis_score",
              "mil"),
  statistic = list(
  all_categorical() ~ "{n} ({p}%)",
  all_continuous()  ~ "{mean} ({sd})"
),
  percent = "column",
  missing = "no"
) %>%
  add_p()
Characteristic FALSE
N = 3321
TRUE
N = 1181
p-value2
AgeConsent 68 (10) 66 (10) 0.021
Male 220 (66%) 79 (67%) 0.9
White 298 (92%) 107 (94%) 0.5
SmokingHistory 202 (61%) 85 (72%) 0.030
CigYears 12 (15) 15 (15) 0.019
IPF

0.011
    IPF 214 (64%) 91 (77%)
    non-IPF 118 (36%) 27 (23%)
CT_pattern

0.039
    non-UIP 166 (50%) 46 (39%)
    UIP 166 (50%) 72 (61%)
deepdta_fibrosis_score 30 (18) 30 (16) 0.8
mil 0.66 (0.28) 0.73 (0.23) 0.003
1 Mean (SD); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

Linear Regression DTA

fit_dta_unadj <- glm(deepdta_fibrosis_score ~ AgeAdjTL, data = complete_1yr)

fit_dta_adj <- glm(deepdta_fibrosis_score ~ AgeAdjTL + AgeConsent + Sex + SmokingHistory + CigYears + pctFVC, data = complete_1yr)

summary(fit_dta_unadj)

Call:
glm(formula = deepdta_fibrosis_score ~ AgeAdjTL, data = complete_1yr)

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 30.21201    0.92686  32.596   <2e-16 ***
AgeAdjTL    -0.07573    1.04437  -0.073    0.942    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 304.0409)

    Null deviance: 137428  on 453  degrees of freedom
Residual deviance: 137426  on 452  degrees of freedom
AIC: 3888

Number of Fisher Scoring iterations: 2
summary(fit_dta_adj)

Call:
glm(formula = deepdta_fibrosis_score ~ AgeAdjTL + AgeConsent + 
    Sex + SmokingHistory + CigYears + pctFVC, data = complete_1yr)

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       33.70476    5.25687   6.412 3.82e-10 ***
AgeAdjTL           0.51822    0.90561   0.572   0.5675    
AgeConsent         0.35816    0.07251   4.940 1.13e-06 ***
SexMale            0.62640    1.56356   0.401   0.6889    
SmokingHistoryYes  2.62772    1.98784   1.322   0.1869    
CigYears           0.12609    0.06538   1.929   0.0545 .  
pctFVC            -0.45999    0.04010 -11.472  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 220.7883)

    Null deviance: 129742  on 433  degrees of freedom
Residual deviance:  94277  on 427  degrees of freedom
  (20 observations deleted due to missingness)
AIC: 3583

Number of Fisher Scoring iterations: 2

Linear Regression MIL-UIP

fit_mil_unadj <- glm(mil ~ AgeAdjTL, data = complete_1yr)
fit_mil_adj <- glm(mil ~ AgeAdjTL+ AgeConsent + Sex + SmokingHistory + CigYears + pctFVC, data = complete_1yr)

summary(fit_mil_unadj)

Call:
glm(formula = mil ~ AgeAdjTL, data = complete_1yr)

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.65636    0.01445  45.408  < 2e-16 ***
AgeAdjTL    -0.04516    0.01629  -2.773  0.00579 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.07394644)

    Null deviance: 33.992  on 453  degrees of freedom
Residual deviance: 33.424  on 452  degrees of freedom
AIC: 109.99

Number of Fisher Scoring iterations: 2
summary(fit_mil_adj)

Call:
glm(formula = mil ~ AgeAdjTL + AgeConsent + Sex + SmokingHistory + 
    CigYears + pctFVC, data = complete_1yr)

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.1372646  0.0868683   1.580   0.1148    
AgeAdjTL          -0.0366632  0.0149649  -2.450   0.0147 *  
AgeConsent         0.0098364  0.0011982   8.209 2.66e-15 ***
SexMale            0.0631982  0.0258375   2.446   0.0148 *  
SmokingHistoryYes  0.0272815  0.0328485   0.831   0.4067    
CigYears           0.0012515  0.0010804   1.158   0.2474    
pctFVC            -0.0032253  0.0006626  -4.868 1.59e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.06028988)

    Null deviance: 32.723  on 433  degrees of freedom
Residual deviance: 25.744  on 427  degrees of freedom
  (20 observations deleted due to missingness)
AIC: 21.653

Number of Fisher Scoring iterations: 2