Prognostic PRS Visualisation and Analysis

Visualisation

#Reading in the files
prognostic_everything <- read.table("prognostic_everything_2.profile", header = T)
prognostic_0.05 <- read.table("prognostic_0.05.profile", header = T)
prognostic_10_5 <- read.table("prognostic_10_5.profile", header = T)
prognostic_10_8 <- read.table("prognostic_10_8.profile", header = T)

# Read in principal components capturing population stratificaton
pcs <- read.table("merged_pca_Shellfish.evecs")

# Accompanying .fam file
fam <- read.table("merged_pca.fam")

pcs <- t(pcs) %>% 
  as.data.frame() %>% 
  mutate(IID = fam$V2) %>% 
  relocate(IID)


colnames(pcs)[2:11] <- paste0("PC",seq(1,10))

# Exponentiate score to transform back to an OR scale, add disease status
prognostic_everything <- prognostic_everything %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)
## Joining, by = "IID"
table(prognostic_everything$status) ## 54 cases, 998 controls
## 
##   0   1 
## 998  54
prognostic_0.05 <- prognostic_0.05 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)
## Joining, by = "IID"
prognostic_10_5 <- prognostic_10_5 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)
## Joining, by = "IID"
prognostic_10_8 <- prognostic_10_8 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)
## Joining, by = "IID"
# PRS Prognostic Distribution

p.prog_everything <- ggplot(prognostic_everything, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD Prognostic PRS", y = "Density")


p.prog_0.05 <- ggplot(prognostic_0.05, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD Prognostic PRS", y = "Density")

p.prog_10_5 <- ggplot(prognostic_10_5, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD Prognostic PRS", y = "Density")


p.prog_10_8 <- ggplot(prognostic_10_8, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD Prognostic PRS", y = "Density")


ggarrange(p.prog_everything, p.prog_0.05, p.prog_10_5, p.prog_10_8)

p.prognostic.box_everything <- ggplot(prognostic_everything, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD Prognostic PRS", x = "Disease status")


p.prognostic.box_0.05 <- ggplot(prognostic_0.05, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD Prognostic PRS", x = "Disease status")


p.prognostic.box_10_5 <- ggplot(prognostic_10_5, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD Prognostic PRS", x = "Disease status")


p.prognostic.box_10_8 <- ggplot(prognostic_10_8, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD Prognostic PRS", x = "Disease status")


ggarrange(p.prognostic.box_everything, p.prognostic.box_0.05, p.prognostic.box_10_5, p.prognostic.box_10_8)

Statistics

T Test

## ==== T-test (is the mean PRS different between groups) ==== ##
prog.t_everything <- prognostic_everything %>% t_test(data = prognostic_everything, formula = exp_SCORE ~ status)
prog.t_everything ## p = 0.565
## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df     p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>
## 1 exp_SCORE 0      1        998    54    -0.578  59.3 0.565
prog.t_0.05 <- prognostic_0.05 %>% t_test(data = prognostic_0.05, formula = exp_SCORE ~ status)
prog.t_0.05 ## p = 0.453
## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df     p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>
## 1 exp_SCORE 0      1        998    54     0.755  59.9 0.453
prog.t_10_5 <- prognostic_10_5 %>% t_test(data = prognostic_10_5, formula = exp_SCORE ~ status)
prog.t_10_5 ## p = 0.549
## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df     p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>
## 1 exp_SCORE 0      1        998    54     0.602  61.4 0.549
prog.t_10_8 <- prognostic_10_8 %>% t_test(data = prognostic_10_8, formula = exp_SCORE ~ status)
prog.t_10_8 ## p = 0.663
## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df     p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>
## 1 exp_SCORE 0      1        998    54     0.437  60.7 0.663

Logistic Regression

## ==== Logistic regression (estimates how disease status varies based on PRS) ==== ##
stat.prog_everything <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = prognostic_everything)
summary(stat.prog_everything)
## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = prognostic_everything)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3704  -0.3321  -0.3226  -0.3145   2.4983  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -854.6     1513.6  -0.565    0.572
## exp_SCORE      851.5     1513.1   0.563    0.574
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 425.57  on 1050  degrees of freedom
## AIC: 429.57
## 
## Number of Fisher Scoring iterations: 5
coef(summary(stat.prog_everything))[,'Pr(>|z|)']
## (Intercept)   exp_SCORE 
##   0.5723085   0.5736190
#p = 0.574

stat.prog_0.05 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = prognostic_0.05)
summary(stat.prog_0.05)
## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = prognostic_0.05)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3717  -0.3342  -0.3238  -0.3114   2.5230  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    127.8      186.3   0.686    0.493
## exp_SCORE     -132.5      188.7  -0.702    0.483
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 425.38  on 1050  degrees of freedom
## AIC: 429.38
## 
## Number of Fisher Scoring iterations: 5
coef(summary(stat.prog_0.05))[,'Pr(>|z|)']
## (Intercept)   exp_SCORE 
##   0.4925264   0.4827215
#p = 0.483

stat.prog_10_5 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = prognostic_10_5)
summary(stat.prog_10_5)
## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = prognostic_10_5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3330  -0.3317  -0.3306  -0.3126   2.4890  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -2.005      1.772  -1.132    0.258
## exp_SCORE     -1.333      2.589  -0.515    0.607
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 425.61  on 1050  degrees of freedom
## AIC: 429.61
## 
## Number of Fisher Scoring iterations: 5
coef(summary(stat.prog_10_5))[,'Pr(>|z|)']
## (Intercept)   exp_SCORE 
##   0.2577509   0.6065992
#p = 0.607


stat.prog_10_8 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = prognostic_10_8)
summary(stat.prog_10_8)
## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = prognostic_10_8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3291  -0.3291  -0.3291  -0.3105   2.4724  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.4767     1.1386  -2.175   0.0296 *
## exp_SCORE    -0.6857     1.7661  -0.388   0.6978  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 425.73  on 1050  degrees of freedom
## AIC: 429.73
## 
## Number of Fisher Scoring iterations: 5
coef(summary(stat.prog_10_8))[,'Pr(>|z|)']
## (Intercept)   exp_SCORE 
##  0.02961642  0.69783285
#p = 0.6978

#  Logistic Regression with PC inclusion
p.pcs <- ggplot(prognostic_everything, aes(x = PC1, y = PC2)) +
  geom_point(aes(fill = status, shape = status), alpha = 0.7, size = 4) +
  scale_shape_manual(name = "Disease status", values = c(21,24)) +
  scale_fill_manual(name = "Disease status", values = c("grey90","grey20")) +
  theme_bw()

p.PC1 <- ggplot(prognostic_everything, aes(y = PC1, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 3, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PC1", x = "Disease status")


gridExtra::grid.arrange(grobs = list(p.pcs,p.PC1), nrow = 1, widths = c(0.7,0.3))

stat.PCs.prog_everything <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_everything)

summary(stat.PCs.prog_everything)
## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = prognostic_everything)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6661  -0.3392  -0.2697  -0.2093   2.8420  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -712.52    1613.86  -0.442 0.658850    
## exp_SCORE     728.14    1612.98   0.451 0.651682    
## PC1          1488.15     416.01   3.577 0.000347 ***
## PC2           -65.44     342.87  -0.191 0.848635    
## PC3           104.77     185.61   0.564 0.572439    
## PC4            90.33      72.67   1.243 0.213867    
## PC5            54.72      59.07   0.926 0.354311    
## PC6           -77.18      39.54  -1.952 0.050919 .  
## PC7            17.68      53.48   0.331 0.740886    
## PC8           -63.96      42.39  -1.509 0.131303    
## PC9            35.70      36.27   0.984 0.324935    
## PC10           91.98      31.84   2.888 0.003871 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 381.27  on 1040  degrees of freedom
## AIC: 405.27
## 
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.prog_everything))[,'Pr(>|z|)']
##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 0.6588495593 0.6516817100 0.0003473392 0.8486345878 0.5724387608 0.2138674181 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 0.3543105323 0.0509191383 0.7408861863 0.1313031230 0.3249345492 0.0038714124
# p = 0.652

stat.PCs.prog_0.05 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_0.05)

summary(stat.PCs.prog_0.05)
## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = prognostic_0.05)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6680  -0.3432  -0.2671  -0.2105   2.7864  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   118.99     196.96   0.604 0.545748    
## exp_SCORE    -104.29     199.41  -0.523 0.600983    
## PC1          1506.57     413.38   3.645 0.000268 ***
## PC2           -64.92     343.37  -0.189 0.850032    
## PC3           109.68     185.76   0.590 0.554878    
## PC4            92.24      72.78   1.267 0.205025    
## PC5            48.89      58.99   0.829 0.407216    
## PC6           -76.75      39.41  -1.948 0.051445 .  
## PC7            17.55      53.45   0.328 0.742702    
## PC8           -64.07      42.31  -1.514 0.129913    
## PC9            33.03      36.40   0.907 0.364234    
## PC10           89.78      31.75   2.827 0.004693 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 381.20  on 1040  degrees of freedom
## AIC: 405.2
## 
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.prog_0.05))[,'Pr(>|z|)']
##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 0.5457483160 0.6009826805 0.0002679013 0.8500323707 0.5548778486 0.2050249637 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 0.4072157779 0.0514451441 0.7427016407 0.1299127101 0.3642341529 0.0046930511
# p = 0.601


stat.PCs.prog_10_5 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_10_5)

summary(stat.PCs.prog_10_5)
## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = prognostic_10_5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6703  -0.3425  -0.2672  -0.2088   2.7979  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   16.832      5.198   3.238 0.001203 ** 
## exp_SCORE     -1.218      2.739  -0.445 0.656479    
## PC1         1500.748    412.586   3.637 0.000275 ***
## PC2          -64.131    343.356  -0.187 0.851835    
## PC3          109.920    185.343   0.593 0.553138    
## PC4           91.817     72.760   1.262 0.206980    
## PC5           51.231     58.779   0.872 0.383435    
## PC6          -77.625     39.463  -1.967 0.049178 *  
## PC7           18.956     53.553   0.354 0.723363    
## PC8          -63.071     42.367  -1.489 0.136572    
## PC9           34.351     36.203   0.949 0.342696    
## PC10          90.954     31.730   2.867 0.004150 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 381.27  on 1040  degrees of freedom
## AIC: 405.27
## 
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.prog_10_5))[,'Pr(>|z|)']
##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 0.0012029670 0.6564789898 0.0002753827 0.8518347636 0.5531380525 0.2069797293 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 0.3834348122 0.0491780898 0.7233628700 0.1365719075 0.3426959532 0.0041500055
# p = 0.656


stat.PCs.prog_10_8 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_10_8)

summary(stat.PCs.prog_10_8)
## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = prognostic_10_8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6738  -0.3427  -0.2676  -0.2085   2.8020  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   16.5593     5.0109   3.305 0.000951 ***
## exp_SCORE     -0.8395     1.8751  -0.448 0.654342    
## PC1         1502.4832   412.2607   3.644 0.000268 ***
## PC2          -65.0298   343.4095  -0.189 0.849807    
## PC3          108.8145   185.3544   0.587 0.557162    
## PC4           92.3153    72.7888   1.268 0.204704    
## PC5           51.5643    58.7907   0.877 0.380442    
## PC6          -77.8312    39.4883  -1.971 0.048725 *  
## PC7           18.8945    53.5391   0.353 0.724156    
## PC8          -63.3290    42.3540  -1.495 0.134854    
## PC9           34.4987    36.2145   0.953 0.340782    
## PC10          91.0421    31.7219   2.870 0.004105 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 381.26  on 1040  degrees of freedom
## AIC: 405.26
## 
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.prog_10_8))[,'Pr(>|z|)']
##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 0.0009509936 0.6543423897 0.0002679148 0.8498065200 0.5571620865 0.2047040948 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 0.3804417306 0.0487245032 0.7241562128 0.1348542724 0.3407817557 0.0041046105
# p = 0.654

Area Under Receiver Operator Curve

## Using ROC curves to calculate PRS sensitivity and specificity for IBD diagnosis

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## GLM (as above)

stat.PCs.prog_everything <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_everything)

## Predict disease status from model
status.pred.prog_everything <- predict(stat.PCs.prog_everything, newdata = prognostic_everything, type = "response")
status.pred.prog_everything <- ifelse(status.pred.prog_everything > 0.20, 1, 0) ## IBD prediction >20% 

pred.res <- data.frame(IID = prognostic_everything$IID,
                       status = prognostic_everything$status, 
                       status_pred_everything = status.pred.prog_everything)

## Compare results
table(prognostic_everything$status, status.pred.prog_everything)
##    status.pred.prog_everything
##       0   1
##   0 989   9
##   1  47   7
## Calculate AUC (area under the curve, ie. prediction accuracy)
par(mfrow=c(2,2))
roc.prog_everything <- roc(prognostic_everything$status, status.pred.prog_everything)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.prog_everything #AUC 0.560
## 
## Call:
## roc.default(response = prognostic_everything$status, predictor = status.pred.prog_everything)
## 
## Data: status.pred.prog_everything in 998 controls (prognostic_everything$status 0) < 54 cases (prognostic_everything$status 1).
## Area under the curve: 0.5603
plot.roc(roc.prog_everything, col = "blue", print.auc = TRUE)

#p<0.05
stat.PCs.prog_0.05 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_0.05)
status.pred.prog_0.05 <- predict(stat.PCs.prog_0.05, newdata = prognostic_0.05, type = "response")
status.pred.prog_0.05 <- ifelse(status.pred.prog_0.05 > 0.20, 1, 0) ## IBD prediction >20% 



roc.prog_0.05 <- roc(prognostic_0.05$status, status.pred.prog_0.05)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.prog_0.05 #AUC 0.561
## 
## Call:
## roc.default(response = prognostic_0.05$status, predictor = status.pred.prog_0.05)
## 
## Data: status.pred.prog_0.05 in 998 controls (prognostic_0.05$status 0) < 54 cases (prognostic_0.05$status 1).
## Area under the curve: 0.5613
plot.roc(roc.prog_0.05, col = "blue", print.auc = TRUE)

#p<1*10^5
stat.PCs.prog_10_5 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_10_5)
status.pred.prog_10_5 <- predict(stat.PCs.prog_10_5, newdata = prognostic_10_5, type = "response")
status.pred.prog_10_5 <- ifelse(status.pred.prog_10_5 > 0.20, 1, 0) ## IBD prediction >20% 


roc.prog_10_5 <- roc(prognostic_10_5$status, status.pred.prog_10_5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.prog_10_5 #AUC 0.561
## 
## Call:
## roc.default(response = prognostic_10_5$status, predictor = status.pred.prog_10_5)
## 
## Data: status.pred.prog_10_5 in 998 controls (prognostic_10_5$status 0) < 54 cases (prognostic_10_5$status 1).
## Area under the curve: 0.5608
plot.roc(roc.prog_10_5, col = "blue", print.auc = TRUE)

#p<1*10^8
stat.PCs.prog_10_8 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = prognostic_10_8)
status.pred.prog_10_8 <- predict(stat.PCs.prog_10_8, newdata = prognostic_10_8, type = "response")
status.pred.prog_10_8 <- ifelse(status.pred.prog_10_8 > 0.20, 1, 0) ## IBD prediction >20% 

table(prognostic_10_8$status, status.pred.prog_10_8)
##    status.pred.prog_10_8
##       0   1
##   0 990   8
##   1  47   7
roc.prog_10_8 <- roc(prognostic_10_8$status, status.pred.prog_10_8)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.prog_10_8 #AUC 0.561
## 
## Call:
## roc.default(response = prognostic_10_8$status, predictor = status.pred.prog_10_8)
## 
## Data: status.pred.prog_10_8 in 998 controls (prognostic_10_8$status 0) < 54 cases (prognostic_10_8$status 1).
## Area under the curve: 0.5608
plot.roc(roc.prog_10_8, col = "blue", print.auc = TRUE)

Clinical Analysis

clinical_data_prognostic <- read.csv("updated_clinical_data.csv")
merged_clinical_prognostic_initial <- merge(prognostic_everything, clinical_data_prognostic, by.x = "IID", by.y = "Patient_ID", all.x = TRUE, all.y = FALSE)
merged_clinical_prognostic <- subset(merged_clinical_prognostic_initial, grepl("TRIAD", merged_clinical_prognostic_initial$IID))
clinical_prognostic <- subset(merged_clinical_prognostic, !is.na(merged_clinical_prognostic$DOB))

Treatment Escalations

prognostic_escalation <- subset(clinical_prognostic, !is.na(clinical_prognostic$Total_escalations))
escalataions_year <- prognostic_escalation$Total_escalations/(prognostic_escalation$FollowUp_Days/365)
escalations_df <- data.frame(prognostic_escalation$exp_SCORE, prognostic_escalation$Total_escalations, escalataions_year)
colnames(escalations_df) <- c("exp_SCORE","Escalations", "Escalation_rate")

Escalations Vs No Escalations

escalation_binary <- ifelse(grepl("0", prognostic_escalation$Total_escalations), "No_Escalation", "Escalation")
escalation_binary_table <- data.frame(escalation_binary, prognostic_escalation$exp_SCORE)

box_escalation_prognostic_binary <- ggplot(escalation_binary_table, aes(x = escalation_binary, y = prognostic_escalation.exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Escalations vs No escalations", values = c("grey70","grey20")) +  
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "Prognostic PRS", x = "Escalations vs No Escalations") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")


density_prognostic_escalation <- ggplot(escalation_binary_table, aes(x = prognostic_escalation.exp_SCORE, group = escalation_binary)) +
  geom_density(aes(fill = escalation_binary), alpha = 0.7) +
  scale_fill_manual(name = "Escalation Status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "Prognostic PRS", y = "Density")

ggarrange(box_escalation_prognostic_binary, density_prognostic_escalation, ncol = 1, nrow = 2, labels = c("A", "B"))

Escalation Rate

prognostic_escalation_rate <- ggplot(escalations_df, aes(x=Escalation_rate , y=exp_SCORE)) + geom_point() + annotate("text", x = 2.1, y = 1.00032, label="p-value = 0.26") + annotate("text", x = 2.1, y = 1.0003, label="R^2 = 0.04172") + labs(y = "Prognostic PRS", x = "Escalation rate (Escalation per year of Follow Up)") + geom_smooth(method=lm,  linetype="dashed", color="darkred", fill="blue") 
prognostic_escalation_rate
## `geom_smooth()` using formula = 'y ~ x'

Treatment Choice on PRS

Surgeries

prognostic_surgeries <- subset(clinical_prognostic, !is.na(clinical_prognostic$Total_IBD_surgeries))

box_surgeries <- ggplot(prognostic_surgeries, aes(x = as.character(Total_IBD_surgeries), y = exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "Prognostic PRS Score", x = "Surgery vs No Surgery") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

density_surgeries <- ggplot(prognostic_surgeries, aes(x = exp_SCORE, group = as.character(Total_IBD_surgeries))) +
  geom_density(aes(fill = as.character(Total_IBD_surgeries)), alpha = 0.7) +
  scale_fill_manual(name = "Surgery Status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "Prognostic PRS", y = "Density")

ggarrange(box_surgeries, density_surgeries, ncol = 1, nrow = 2, labels = c("A", "B"))

Initial Treatment Class

clinical_prognostic_initial_class <- clinical_prognostic %>%
  mutate(Initial_treatment_Class = recode(Initial_treatment_Class, "Aninosalicylate" = "Aminosalicylate"))

ggplot(clinical_prognostic_initial_class, aes(x=Initial_treatment_Class, y=exp_SCORE)) + geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") + geom_boxplot(alpha = 0.7, outlier.shape = NA) + ggtitle("Effect of Initial Treatment on Score") + labs(y = "PRS", x = "Initial Treatment")

filtered_initial_treatment <- clinical_prognostic_initial_class %>%
  filter(!is.na(Initial_treatment_Class), !is.na(exp_SCORE))


ggplot(filtered_initial_treatment, aes(x = factor(ifelse(Initial_treatment_Class == "None", "None", "Other")), y = exp_SCORE)) +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  geom_boxplot(fill = alpha("white", 0.5)) +
  ggtitle("Initial treatment vs no treatment on Score") +
  labs(y = "PRS", x = "Initial Treatment") +
  scale_x_discrete(labels = c("None", "Treatment")) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

amino_steroid <- clinical_prognostic_initial_class %>%
  filter(Initial_treatment_Class %in% c("Aminosalicylate", "Steroid"), !is.na(exp_SCORE))

ggplot(amino_steroid, aes(x = Initial_treatment_Class, y = exp_SCORE)) +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  geom_boxplot(fill = alpha("white", 0.5)) +
  ggtitle("Effect of Initial Treatment on Score") +
  labs(y = "PRS", x = "Initial Treatment") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

1st Treatment Escalation Class

ggplot(clinical_prognostic_initial_class, aes(x=Escalation1_Treatment_Class, y=exp_SCORE)) + geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") + geom_boxplot(alpha = 0.7, outlier.shape = NA) + ggtitle("Effect of 1st Escalation Type on Score") + labs(y = "Prognostic PRS", x = "1st Escalation Treatment")

ggplot(clinical_prognostic_initial_class, aes(x = ifelse(is.na(Escalation1_Treatment_Class), "NA", "Other"), y = exp_SCORE)) +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  ggtitle("Effect of 1st Escalation Type on Score") +
  labs(y = "Prognostic PRS", x = "1st Escalation Treatment") +
  scale_x_discrete(labels = c("Other", "NA")) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

Endoscopic Findings

prognostic_endoscopy <- subset(clinical_prognostic, !is.na(clinical_prognostic$Endoscopy))
prognostic_endoscopy$Endoscopy[prognostic_endoscopy$Endoscopy == "Severe (CT)"] <- "Severe"
anova_prognostic_endoscopy <- aov(exp_SCORE ~ Endoscopy, data = prognostic_endoscopy)
summary(anova_prognostic_endoscopy)
##             Df    Sum Sq   Mean Sq F value Pr(>F)
## Endoscopy    2 2.017e-08 1.008e-08    1.54  0.235
## Residuals   24 1.572e-07 6.548e-09
box_prognostic_endoscopy <- ggplot(prognostic_endoscopy, aes(x=Endoscopy, y=exp_SCORE)) + geom_boxplot() + labs(y = "Prognostic PRS", x = "Endoscopy Findings")
box_prognostic_endoscopy

Disease Extent

prognostic_clinical_extent <- subset(clinical_prognostic, !is.na(clinical_prognostic$Extent))
box_update_extent <- ggplot(prognostic_clinical_extent, aes(x=Extent, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Disease Extent on Prognostic Score") + labs(y = "Prognostic PRS", x = "Disease Extent")
box_update_extent

anova_extent_update <- aov(exp_SCORE ~ Extent, data = prognostic_clinical_extent)
summary(anova_extent_update) # p < 0.32
##             Df    Sum Sq   Mean Sq F value Pr(>F)
## Extent       4 3.266e-08 8.164e-09   1.234  0.323
## Residuals   24 1.588e-07 6.615e-09
prognostic_extensive_binary <- ifelse(grepl("Extensive", prognostic_clinical_extent$Extent), "Extensive", "Not_Extensive")
extensive_or_not_update <- data.frame(prognostic_extensive_binary, prognostic_clinical_extent$exp_SCORE)

ggplot(extensive_or_not_update, aes(x=prognostic_extensive_binary, y=prognostic_clinical_extent$exp_SCORE)) + geom_boxplot() + ggtitle("Extensive vs not extensive on Score") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +   theme_bw() +
  theme(legend.position = "none") +
  labs(y = "Prognostic PRS", x = "Extensive vs Not Extensive") + stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 1.5, pch = 21, stroke = 0.7, fill = "white")