1 Polygenic Risk Score (PRS) Visualisation
2 Principal Component Analysis
- 2.1 PCA with ethnic controls from 1000G study
- 2.2 PCA with just cases and controls
3 Testing PRS association with trait (disease status)
4 Clinical Data incorporated with the Polygenic Risk Score
5 Updated Clinical Data
6 Clinical Outcomes in the ROC group
- 6.0.1 Total number of escalations
7 Predictive power of PRS for IBD prognosis

1 Polygenic Risk Score (PRS) Visualisation

setwd("/home/the28/imputed_data/scores/")
#Reading in the files
PRS_everything <- read.table("PRS_everything_trial2_2.profile", header = T)
PRS_0.05 <- read.table("PRS_0.05.profile", header = T)
PRS_10_5 <- read.table("PRS_10_5.profile", header = T)
PRS_10_8 <- read.table("PRS_10_8.profile", header = T)

## Read in principal components capturing population stratificaton
pcs <- read.table("merged_pca_Shellfish.evecs")

## Accompanying .fam file
fam <- read.table("merged_pca.fam")

pcs <- t(pcs) %>% 
  as.data.frame() %>% 
  mutate(IID = fam$V2) %>% 
  relocate(IID)

colnames(pcs)[2:11] <- paste0("PC",seq(1,10))


# Exponentiate score to transform back to an OR scale, add disease status
PRS_everything <- PRS_everything %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)

## Joining, by = "IID"

table(PRS_everything$status) ## 54 cases, 998 controls

## 
##   0   1 
## 998  54

PRS_0.05 <- PRS_0.05 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)

## Joining, by = "IID"

PRS_10_5 <- PRS_10_5 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)

## Joining, by = "IID"

PRS_10_8 <- PRS_10_8 %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>% 
  left_join(pcs)

## Joining, by = "IID"

1.1 Density Plots

p.dist_everything <- ggplot(PRS_everything, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density") + ggtitle("p<1")

p.dist_0.05 <- ggplot(PRS_0.05, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density") + ggtitle("p<0.05")

p.dist_10_5 <- ggplot(PRS_10_5, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density") + ggtitle("p<10^(-5)")

p.dist_10_8 <- ggplot(PRS_10_8, aes(x = exp_SCORE, group = status)) +
  geom_density(aes(fill = status), alpha = 0.7) +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density") + ggtitle("p<10^(-8)")

ggarrange(p.dist_everything, p.dist_0.05, p.dist_10_5, p.dist_10_8)

1.2 Box Plots

p.box_everything <- ggplot(PRS_everything, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD PRS", x = "Disease status")

p.box_0.05 <- ggplot(PRS_0.05, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD PRS", x = "Disease status")

p.box_10_5 <- ggplot(PRS_10_5, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD PRS", x = "Disease status")

p.box_10_8 <- ggplot(PRS_10_8, aes(y = exp_SCORE, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "IBD PRS", x = "Disease status")

ggarrange(p.box_everything, p.box_0.05, p.box_10_5, p.box_10_8)

1.3 Final Visualisation

p_1_visual <- ggarrange(p.dist_everything,p.box_everything)
p_0.05_visual <- ggarrange(p.dist_0.05,p.box_0.05)
p_10_5_visual <- ggarrange(p.dist_10_5,p.box_10_5)
p_10_8_visual <- ggarrange(p.dist_10_8,p.box_10_8)

ggarrange(p_1_visual, p_0.05_visual, p_10_5_visual, p_10_8_visual, labels = c("A","B","C","D"))

2 Principal Component Analysis

2.1 PCA with ethnic controls from 1000G study

setwd("/home/the28/pca_analysis/")

library(tidyverse)
library(ggplot2)
library(hrbrthemes)

meta <- read.csv(file = "1000G_HDGenotypes_SampleInfo.csv")
evec <- read.delim(file = "merged_pca_Shellfish.evecs",sep = " ", header = F)
fam <- read.delim(file = "merged_pca.fam", header = F, sep = " ")
evec <- as.data.frame(t(evec))
evec = evec[-c(nrow(evec)), ]
colnames(evec) <- paste0("PC",seq(1,10))


rownames(evec) <- fam$V2


#ID_name column
ID_name <- fam$V2
evec <- add_column(evec, ID_name)


## Merge evecs and metadata files and IDs
evecs.full <- evec %>% rownames_to_column("Sample") %>% 
  left_join(meta)

## Joining, by = "Sample"

#Generating the case_controls
evecs.full <- evecs.full %>% 
  mutate(case_control = ifelse(!is.na(evecs.full$Population), "2",
                       ifelse(grepl("TRIAD", evecs.full$ID_name), '1', '0')))


df <- evecs.full[c('PC1', 'PC2','Population', 'case_control')]


# basic scatterplot
colour_blind <- c("#ff9045", "#7370fc", "#4aad19", "#cf67ef", "#007c0e", "#ff56cb", "#00a35b", "#e7005e", "#68dc96","#832a92","#b6d167","#0161cb", "#be8800", "#fbaafc","#3b5b23","#ff6ba7", "#60dac3", "#f03c3c","#008d88","#9e2400","#723f7d","#b36500","#a21146","#ecbe8b","#895f00","#ffa190")
  

ggplot(df, aes(x=PC1, y=PC2, group=case_control, fill=Population, color=Population)) + 
  geom_point(aes(shape=case_control,color=Population,size=case_control)) +
  scale_color_manual(values=colour_blind, na.value = "yellow") +
  scale_fill_manual(values=colour_blind, na.value = "black") +
  scale_shape_manual(values = c(21,24,22), labels = c("Controls", "Cases", "1000G Data"), name = "Data Type") +
  scale_size_manual(values = c(4,5,1)) +
  geom_vline(aes(xintercept = 0.002), linetype=3) +
  geom_hline(aes(yintercept = -0.01), linetype=3) +
  geom_hline(aes(yintercept = -0.033), linetype=3) +
  geom_vline(aes(xintercept = 0.025), linetype=3)

##Non-European exclusion trial

european_pops <- c("CEU", "FIN", "GBR", "IBS", "TSI")
european_samples <- filter(evecs.full, Population %in% european_pops)
pc1_mean <- mean(european_samples$PC1)
pc2_mean <- mean(european_samples$PC2)
pc1_sd <- sd(european_samples$PC1)
pc2_sd <- sd(european_samples$PC2)

european_samples$pc1_dist <- abs(european_samples$PC1 - pc1_mean)
european_samples$pc2_dist <- abs(european_samples$PC2 - pc2_mean)

outliers <- subset(european_samples, pc1_dist > 3 * pc1_sd | pc2_dist > 3 * pc2_sd)

2.2 PCA with just cases and controls

setwd("/home/the28/imputed_data/scores/")

p.pcs <- ggplot(PRS_everything, aes(x = PC1, y = PC2)) +
  geom_point(aes(fill = status, shape = status), alpha = 0.7, size = 4) +
  scale_shape_manual(name = "Disease status", values = c(21,24), labels = c("Controls", "Cases")) +
  scale_fill_manual(name = "Disease status", values = c("grey90","grey20"), labels = c("Controls", "Cases")) +
  theme_bw()

p.pcs

p.pcafull_trial <- data.frame(PRS_everything$PC1, PRS_everything$PC2, PRS_everything$status)
colnames(p.pcafull_trial) <- c("PC1","PC2","status")
evecs.named <- evec %>% rownames_to_column("Sample")
merged_evecs <- merge(meta, evecs.named, by = "Sample")
control_genome_dataset <- merged_evecs[c("Sample","PC1","PC2","Population")]
control_genome_dataset$status <- 2
p.pcafull_trial$Population <- NA
control_genome_subset <- control_genome_dataset %>% 
  select(PC1, PC2, status, Population)
merged_PCA <- merge(control_genome_subset, p.pcafull_trial, by = c("PC1", "PC2", "status", "Population"), all.x = TRUE, all.y = TRUE)


# basic scatterplot
colour_blind <- c("#ff9045", "#7370fc", "#4aad19", "#cf67ef", "#007c0e", "#ff56cb", "#00a35b", "#e7005e", "#68dc96","#832a92","#b6d167","#0161cb", "#be8800", "#fbaafc","#3b5b23","#ff6ba7", "#60dac3", "#f03c3c","#008d88","#9e2400","#723f7d","#b36500","#a21146","#ecbe8b","#895f00","#ffa190")

ggplot(merged_PCA, aes(x=PC1, y=PC2, group=status, fill=Population, color=Population)) + 
  geom_point(aes(shape=status,color=Population,size=status)) +
  scale_color_manual(values=colour_blind, na.value = "yellow") +
  scale_fill_manual(values=colour_blind, na.value = "black") +
  scale_shape_manual(values = c(21,24,22), labels = c("Controls", "Cases", "1000G Data"), name = "Data Type") +
  scale_size_manual(values = c(4,5,1)) +
  geom_vline(aes(xintercept = 0.002), linetype=3) +
  geom_hline(aes(yintercept = -0.01), linetype=3) +
  geom_hline(aes(yintercept = -0.033), linetype=3) +
  geom_vline(aes(xintercept = 0.025), linetype=3)

ggplot(control_genome_subset, aes(x=PC1, y=PC2, fill=Population, color=Population)) + 
  geom_point(aes(color=Population)) +
  scale_color_manual(values=colour_blind) +
  scale_fill_manual(values=colour_blind)

3 Testing PRS association with trait (disease status)

3.1 T-test

Looks at whether the means PRS is different between groups

stat.t_everything <- PRS_everything %>% t_test(data = PRS_everything, formula = exp_SCORE ~ status)
stat.t_everything ## p = 0.00000521

## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df          p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl>      <dbl>
## 1 exp_SCORE 0      1        998    54     -5.02  58.7 0.00000521

stat.t_0.05 <- PRS_0.05 %>% t_test(data = PRS_0.05, formula = exp_SCORE ~ status)
stat.t_0.05 ## p = 0.00000544

## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df          p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl>      <dbl>
## 1 exp_SCORE 0      1        998    54     -5.00  58.6 0.00000544

stat.t_10_5 <- PRS_10_5 %>% t_test(data = PRS_10_5, formula = exp_SCORE ~ status)
stat.t_10_5 ## p = 0.000019

## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df        p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>
## 1 exp_SCORE 0      1        998    54     -4.65  59.3 0.000019

stat.t_10_8 <- PRS_10_8 %>% t_test(data = PRS_10_8, formula = exp_SCORE ~ status)
stat.t_10_8 ## p = 0.00197

## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df       p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>
## 1 exp_SCORE 0      1        998    54     -3.24  59.2 0.00197

p_values <- c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)")
t_values <- c(stat.t_everything$p, stat.t_0.05$p, stat.t_10_5$p, stat.t_10_8$p)
log_t_values <- as.numeric(format(round(abs(log10(t_values)), 2), nsmall = 2))
t_table <- data.frame(t_values, p_values, log_t_values)


t_graph <- ggplot(data=t_table, aes(x=reorder(p_values, t_values), y=t_values)) +
  geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "T-test p values") 

t_graph

log_t_graph <- ggplot(data=t_table, aes(x=reorder(p_values, -log_t_values), y=log_t_values)) +
  geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "-log10(p) from T-test") +
  geom_text(aes(label=log_t_values), vjust=1.6, color="white", size=3.5) +
  theme_minimal()
                      
log_t_graph

3.2 Logistic Regression

Estimates how disease status varies based on PRS

3.2.1 Initial calculation of p values from logistical regression

stat.reg_everything <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_everything)
summary(stat.reg_everything)

## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = PRS_everything)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7489  -0.3575  -0.2747  -0.2158   2.9775  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -10925       2203  -4.958 7.11e-07 ***
## exp_SCORE      10918       2202   4.957 7.15e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 399.84  on 1050  degrees of freedom
## AIC: 403.84
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.reg_everything))[,'Pr(>|z|)']

## (Intercept)   exp_SCORE 
## 7.10608e-07 7.15096e-07

#p = 7.15e-07

stat.reg_0.05 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_0.05)
summary(stat.reg_0.05)

## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = PRS_0.05)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7692  -0.3563  -0.2796  -0.2141   3.0845  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3428.3      690.9  -4.962 6.96e-07 ***
## exp_SCORE     3423.4      690.4   4.959 7.10e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 399.81  on 1050  degrees of freedom
## AIC: 403.81
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.reg_0.05))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE 
## 6.961371e-07 7.102635e-07

#p = 7.10e-07

stat.reg_10_5 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_10_5)
summary(stat.reg_10_5)

## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = PRS_10_5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8773  -0.3544  -0.2874  -0.2340   2.8278  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -223.62      50.11  -4.462 8.11e-06 ***
## exp_SCORE     221.41      50.24   4.407 1.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 406.32  on 1050  degrees of freedom
## AIC: 410.32
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.reg_10_5))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE 
## 8.107217e-06 1.046227e-05

#p = 1.05e-05

stat.reg_10_8 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_10_8)
summary(stat.reg_10_8)

## 
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), 
##     data = PRS_10_8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7016  -0.3516  -0.3073  -0.2632   2.7381  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -92.10      28.45  -3.237  0.00121 **
## exp_SCORE      90.30      28.78   3.137  0.00171 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 416.27  on 1050  degrees of freedom
## AIC: 420.27
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.reg_10_8))[,'Pr(>|z|)']

## (Intercept)   exp_SCORE 
## 0.001208313 0.001705098

#p = 0.00171

3.2.2 Initial comparison of p-threshold values on logistical regression p values

p_values_regression <- as.numeric(c("7.15e-07","7.10e-07","1.05e-05","0.00171"))
log_p_regression <- as.numeric(format(round((abs(log10(p_values_regression))) , 2), nsmall = 2))
       
                         
                             
regression_table <- data.frame(p_values, p_values_regression, log_p_regression)

regression_graph_unedited <- ggplot(data=regression_table, aes(x=p_values, y=p_values_regression)) +
  geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "Logistical Regression p values") 

regression_graph_unedited

log_regression_graph_unedited <- ggplot(data=regression_table, aes(x=reorder(p_values, -log_p_regression), y=log_p_regression)) +
  geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "-log10(p values) from logistical regression") + geom_text(aes(label=format(round(log_p_regression, 3), nsmall = 3)),vjust=1.6, color="white", size=3.5)


log_regression_graph_unedited

3.2.3 Is there a statistical difference in PC1 values between cases and controls

p.PC1 <- ggplot(PRS_everything, aes(y = PC1, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 3, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PC1", x = "Disease status") +
  scale_x_discrete(labels = c("0" = "Healthy Controls", "1" = "Cases"))


p.PC1

p.PC2 <- ggplot(PRS_everything, aes(y = PC2, x = status)) +
  geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 3, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PC2", x = "Disease status") +
  scale_x_discrete(labels = c("0" = "Healthy Controls", "1" = "Cases"))


p.PC2

wilcox.test(PC2 ~ status, data = PRS_everything)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  PC2 by status
## W = 33480, p-value = 0.002663
## alternative hypothesis: true location shift is not equal to 0

wilcox.test(PC3 ~ status, data = PRS_everything)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  PC3 by status
## W = 25164, p-value = 0.4125
## alternative hypothesis: true location shift is not equal to 0

p.pcs <- ggplot(PRS_everything, aes(x = PC1, y = PC2)) +
  geom_point(aes(fill = status, shape = status), alpha = 0.7, size = 4) +
  scale_shape_manual(name = "Disease status", values = c(21,24), labels = c("Healthy Controls", "Cases")) +
  scale_fill_manual(name = "Disease status", values = c("grey90","grey20"), labels = c("Healthy Controls", "Cases")) +
  theme_bw()

ggarrange(p.pcs, p.PC1, p.PC2, labels = c("A","B", "C"), nrow = 1)

###Logistic regression with PC correction

## ==== Logistic regression with PC correction ==== ##
stat.PCs.reg_everything <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = PRS_everything)

summary(stat.PCs.reg_everything)

## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = PRS_everything)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5280  -0.3286  -0.2273  -0.1629   3.1827  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -11146.01    2363.00  -4.717 2.39e-06 ***
## exp_SCORE    11157.61    2362.24   4.723 2.32e-06 ***
## PC1           1583.40     433.65   3.651 0.000261 ***
## PC2            -15.83     345.59  -0.046 0.963472    
## PC3             96.39     186.47   0.517 0.605219    
## PC4            104.66      73.16   1.431 0.152516    
## PC5             28.83      61.50   0.469 0.639228    
## PC6            -66.87      40.89  -1.635 0.102004    
## PC7             16.67      54.11   0.308 0.758055    
## PC8            -72.38      44.19  -1.638 0.101441    
## PC9             38.04      37.67   1.010 0.312612    
## PC10            76.60      32.64   2.347 0.018936 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 357.33  on 1040  degrees of freedom
## AIC: 381.33
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.PCs.reg_everything))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 2.394677e-06 2.320200e-06 2.608502e-04 9.634717e-01 6.052193e-01 1.525160e-01 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 6.392278e-01 1.020043e-01 7.580546e-01 1.014410e-01 3.126120e-01 1.893577e-02

#p = 2.32e-06

stat.PCs.reg_0.05 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = PRS_0.05)

summary(stat.PCs.reg_0.05)

## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = PRS_0.05)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5206  -0.3291  -0.2314  -0.1655   3.2597  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3379.37     725.27  -4.659 3.17e-06 ***
## exp_SCORE    3393.88     725.05   4.681 2.86e-06 ***
## PC1          1582.03     429.68   3.682 0.000231 ***
## PC2           -36.03     342.98  -0.105 0.916334    
## PC3            98.42     186.23   0.528 0.597171    
## PC4           106.25      73.49   1.446 0.148232    
## PC5            30.47      61.31   0.497 0.619237    
## PC6           -71.35      40.81  -1.748 0.080394 .  
## PC7            20.09      54.30   0.370 0.711427    
## PC8           -66.99      44.21  -1.515 0.129734    
## PC9            33.47      37.24   0.899 0.368784    
## PC10           75.43      32.76   2.302 0.021323 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 357.78  on 1040  degrees of freedom
## AIC: 381.78
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.PCs.reg_0.05))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 3.170562e-06 2.856442e-06 2.314931e-04 9.163339e-01 5.971708e-01 1.482323e-01 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 6.192367e-01 8.039444e-02 7.114266e-01 1.297336e-01 3.687843e-01 2.132294e-02

#p = 2.86e-06

stat.PCs.reg_10_5 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = PRS_10_5)

summary(stat.PCs.reg_10_5)

## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = PRS_10_5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6501  -0.3319  -0.2482  -0.1777   3.0001  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -197.64      52.37  -3.774 0.000161 ***
## exp_SCORE     215.05      52.57   4.091 4.29e-05 ***
## PC1          1509.84     430.81   3.505 0.000457 ***
## PC2          -128.35     351.58  -0.365 0.715062    
## PC3           123.94     187.59   0.661 0.508808    
## PC4            96.80      74.20   1.305 0.192062    
## PC5            43.31      59.70   0.726 0.468126    
## PC6           -74.95      40.40  -1.855 0.063586 .  
## PC7            24.47      53.52   0.457 0.647539    
## PC8           -66.42      43.68  -1.520 0.128392    
## PC9            28.93      37.33   0.775 0.438285    
## PC10           84.26      32.34   2.605 0.009175 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 364.26  on 1040  degrees of freedom
## AIC: 388.26
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.PCs.reg_10_5))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 1.605232e-04 4.294042e-05 4.572746e-04 7.150617e-01 5.088082e-01 1.920621e-01 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 4.681259e-01 6.358647e-02 6.475389e-01 1.283924e-01 4.382855e-01 9.174659e-03

#p = 4.29e-05

stat.PCs.reg_10_8 <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = PRS_10_8)

summary(stat.PCs.reg_10_8)

## 
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"), 
##     data = PRS_10_8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6995  -0.3348  -0.2577  -0.1924   2.9927  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -69.21      29.66  -2.333  0.01963 *  
## exp_SCORE      86.98      29.93   2.906  0.00366 ** 
## PC1          1518.35     426.41   3.561  0.00037 ***
## PC2          -115.09     348.01  -0.331  0.74086    
## PC3           119.95     186.99   0.641  0.52121    
## PC4            94.21      73.59   1.280  0.20050    
## PC5            45.09      59.04   0.764  0.44501    
## PC6           -75.85      40.05  -1.894  0.05822 .  
## PC7            19.26      53.43   0.361  0.71847    
## PC8           -68.74      43.09  -1.595  0.11064    
## PC9            29.63      36.82   0.805  0.42105    
## PC10           86.40      31.97   2.703  0.00688 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.88  on 1051  degrees of freedom
## Residual deviance: 373.09  on 1040  degrees of freedom
## AIC: 397.09
## 
## Number of Fisher Scoring iterations: 6

coef(summary(stat.PCs.reg_10_8))[,'Pr(>|z|)']

##  (Intercept)    exp_SCORE          PC1          PC2          PC3          PC4 
## 0.0196296236 0.0036630415 0.0003698046 0.7408583514 0.5212126251 0.2004982348 
##          PC5          PC6          PC7          PC8          PC9         PC10 
## 0.4450090313 0.0582237356 0.7184700441 0.1106427260 0.4210510997 0.0068767880

#p = 0.00366

3.2.4 Comparison of p-threshold levels on logistical regression p value with and without PC correction

p_value_regression_pca <- as.numeric(c("2.32e-06","2.86e-06","4.29e-05","0.00366"))
log_p_regression_pca <- as.numeric(format(round((abs(log10(p_value_regression_pca))), 2), nsmall = 2))
merged_log <- c(log_p_regression, log_p_regression_pca)
merged_p_values <- c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)","p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)")
merged_p_values <- factor(merged_p_values, levels=c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)"))

which_p_value <- c("Initial", "Initial", "Initial", "Initial", "PCA", "PCA", "PCA", "PCA")

p_value_combined <- data.frame(merged_log, merged_p_values, which_p_value)

ggplot(data=p_value_combined, aes(x=merged_p_values, y=merged_log, fill=which_p_value)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=format(round(merged_log, 2), nsmall = 2)), vjust=1.6, color="white",
            position = position_dodge(0.9), size=5.5)+
  scale_fill_brewer(palette="Paired") + labs(x = "p threshold values", y = "-log10(p values)", fill="PC correction")

p_value_PC_inclusion <- c(7.15e-7, 7.10e-7, 1.05e-5, 1.71e-3, 2.32e-6, 2.86e-6, 4.29e-5, 3.66e-3)

p_value_table_logistic <- data.frame(p_value_PC_inclusion, which_p_value)

wilcox.test(p_value_PC_inclusion ~ which_p_value, data = p_value_table_logistic)

## 
##  Wilcoxon rank sum test
## 
## data:  p_value_PC_inclusion by which_p_value
## W = 5, p-value = 0.4857
## alternative hypothesis: true location shift is not equal to 0

3.3 Pseudo-R^2

3.3.1 Initial calculation of Pseudo-R^2 without PCs

# Pseudo R^2 can be calculated as an indication of variance in disease status explained by PRS
# Use rms package (score needs to be scaled)

stat.reg2_everything <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_everything)
#print(stat.reg2_everything) #R2 = 0.073

stat.reg2_0.05 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_0.05)
#print(stat.reg2_0.05) #R2 = 0.074

stat.reg2_10_5 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_10_5)
#print(stat.reg2_10_5) #R2 = 0.055

stat.reg2_10_8 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_10_8)
#print(stat.reg2_10_8) #R2 = 0.027

pseudo_R2_initial_values <- as.numeric(c("0.073","0.074","0.055","0.027"))
pseudo_R2_initial_table <- data.frame(pseudo_R2_initial_values, p_values)
pseudo_R2_initial_graph <- ggplot(data=pseudo_R2_initial_table, aes(x=p_values, y=pseudo_R2_initial_values)) +
  geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "Pseudo R^2 Values") + geom_text(aes(label=pseudo_R2_initial_values), vjust=1.6, color="white", size=3.5)

pseudo_R2_initial_graph

3.3.2 Calculation of Pseudo-R^2 with PCs

# Calculate pseudo R^2
stat.PCs.reg2_everything <- rms::lrm(
  formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_everything)

#R2=0.189

stat.PCs.reg2_0.05 <- rms::lrm(
  formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_0.05)

#R2=0.188

stat.PCs.reg2_10_5 <- rms::lrm(
  formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_10_5)

#R2=0.171

stat.PCs.reg2_10_8 <- rms::lrm(
  formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_10_8)

#R2=0.147

3.3.3 Final Pseudo-R^2 graph

## Generate bar graph with x = P-value threshold for PRS calculation, y = pseudo R2
merged_pseudo_R2_values <- as.numeric(c(pseudo_R2_initial_values, "0.189", "0.188", "0.171", "0.147"))
merged_pseudo_R2_table <- data.frame(merged_p_values, which_p_value, merged_pseudo_R2_values)

merged_pseudo_R2_table$merged_p_values <- factor(merged_pseudo_R2_table$merged_p_values, levels=c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)"))


ggplot(data=merged_pseudo_R2_table, aes(x=merged_p_values, y=merged_pseudo_R2_values, fill=which_p_value)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=format(round(merged_pseudo_R2_values, 3), nsmall = 3)), vjust=1.6, color="white",
            position = position_dodge(0.9), size=4.75)+
  scale_fill_brewer(palette="Paired") + labs(x = "p threshold values", y = "Pseudo R^2 values", fill="PC correction")

pseudo_comparison_values <- c(0.189, 0.188, 0.171, 0.147, 0.073, 0.074, 0.055, 0.027)
pseudo_comparison_names <- c("with","with","with","with", "no", "no","no","no")
pseudo_comparison_table <- data.frame(pseudo_comparison_values, pseudo_comparison_names)
wilcox.test(pseudo_comparison_values ~ pseudo_comparison_names, data = pseudo_comparison_table)

## 
##  Wilcoxon rank sum test
## 
## data:  pseudo_comparison_values by pseudo_comparison_names
## W = 0, p-value = 0.02857
## alternative hypothesis: true location shift is not equal to 0

3.4 ROC Curves

Used to calculate PRS sensitivity and specificity for IBD diagnosis

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## GLM (as above)

stat.PCs.reg_everything <- glm(
  formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
  family = binomial(link = "logit"), data = PRS_everything)

## Predict disease status from model
status.pred_everything <- predict(stat.PCs.reg_everything, newdata = PRS_everything, type = "response")
status.pred_everything <- ifelse(status.pred_everything > 0.20, 1, 0) ## IBD prediction >20% 

pred.res <- data.frame(IID = PRS_everything$IID,
                       status = PRS_everything$status, 
                       status_pred_everything = status.pred_everything)

## Compare results
table(PRS_everything$status, status.pred_everything)

##    status.pred_everything
##       0   1
##   0 982  16
##   1  41  13

## Calculate AUC (area under the curve, ie. prediction accuracy)


par(mfrow=c(2,2))


roc.score_everything <- roc(PRS_everything$status, status.pred_everything)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

roc.score_everything

## 
## Call:
## roc.default(response = PRS_everything$status, predictor = status.pred_everything)
## 
## Data: status.pred_everything in 998 controls (PRS_everything$status 0) < 54 cases (PRS_everything$status 1).
## Area under the curve: 0.6124

plot.roc(roc.score_everything, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "A", cex=1.5, font=2)

#p<0.05
status.pred_0.05 <- predict(stat.PCs.reg_0.05, newdata = PRS_0.05, type = "response")
status.pred_0.05 <- ifelse(status.pred_0.05 > 0.20, 1, 0) 
roc.score_0.05 <- roc(PRS_0.05$status, status.pred_0.05)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(roc.score_0.05, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "B", cex=1.5, font=2)

#p < 1*10^(5)
status.pred_10_5 <- predict(stat.PCs.reg_10_5, newdata = PRS_10_5, type = "response")
status.pred_10_5 <- ifelse(status.pred_10_5 > 0.20, 1, 0) 
roc.score_10_5 <- roc(PRS_10_5$status, status.pred_10_5)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(roc.score_10_5, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "C", cex=1.5, font=2)

#p < 1*10^(8)
status.pred_10_8 <- predict(stat.PCs.reg_10_8, newdata = PRS_10_8, type = "response")
status.pred_10_8 <- ifelse(status.pred_10_8 > 0.20, 1, 0) 
roc.score_10_8 <- roc(PRS_10_8$status, status.pred_10_8)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(roc.score_10_8, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "D", cex=1.5, font=2)

3.4.1 AUC Trial update

library(pROC)


## Calculate AUC (area under the curve, ie. prediction accuracy)

roc.score_everything_trial <- roc(PRS_everything$status, status.pred_everything)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

roc.score_everything_trial

## 
## Call:
## roc.default(response = PRS_everything$status, predictor = status.pred_everything)
## 
## Data: status.pred_everything in 998 controls (PRS_everything$status 0) < 54 cases (PRS_everything$status 1).
## Area under the curve: 0.6124

## Calculate 1-specificity values
spec_values <- roc.score_everything$specificities
one_spec_values <- 1 - spec_values

## Plot ROC curve with 1-specificity on the x-axis
plot(one_spec_values, roc.score_everything$sensitivities, 
     col = "blue", type = "l", lwd = 2, 
     xlab = "1 - Specificity", ylab = "Sensitivity",
     xlim = c(0, 1), ylim = c(0, 1))
text(0.1, 0.9, paste("AUC =", round(roc.score_everything$auc, 2)), cex=1.5, font=2)

3.4.2 AUC values according to p-threshold

AUC_values <- as.numeric(c("0.612", "0.593", "0.577", "0.560"))
p_values_factor <- c(1, 0.05, 1e-5, 1e-8)
AUC_table <- data.frame(AUC_values, p_values, p_values_factor)

AUC_graph <- ggplot(data=AUC_table, aes(x=reorder(p_values, -AUC_values), y=AUC_values)) +
  geom_bar(stat="identity", fill="steelblue", width = 0.5) + labs(x = "p threshold values", y = "AUC Values") + geom_text(aes(label=AUC_values), vjust=1.6, color="white", size=3.5)

AUC_graph

one.WAY_AUC <- aov(AUC_values ~ p_values_factor, data = AUC_table)
summary(one.WAY_AUC)

##                 Df    Sum Sq   Mean Sq F value Pr(>F)
## p_values_factor  1 0.0009937 0.0009937   4.078  0.181
## Residuals        2 0.0004873 0.0002437

kruskal.test(AUC_values ~ p_values_factor, data = AUC_table)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  AUC_values by p_values_factor
## Kruskal-Wallis chi-squared = 3, df = 3, p-value = 0.3916

4 Clinical Data incorporated with the Polygenic Risk Score

setwd("/home/the28/imputed_data/scores/clinical_data/")
clinical_data <- read.csv("IBD_ClinicalMetadata_forTE.csv")
clinical_data <- clinical_data[-c(1)]
merged_clinical_initial <- merge(PRS_everything, clinical_data, by.x = "IID", by.y = "Patient_ID", all.x = TRUE, all.y = FALSE)
merged_clinical <- subset(merged_clinical_initial, grepl("TRIAD", merged_clinical_initial$IID))
merged_clinical <- merged_clinical %>% mutate(exp_SCORE = exp(SCORE))
merged_clinical_final <- subset(merged_clinical, !is.na(merged_clinical$Birth_Year))

4.1 Effect of CD vs UC on PRS

##CD vs UC on the Score##
box_diagnosis <- ggplot(merged_clinical_final, aes(x=Diagnosis, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Crohn's Disease vs Ulcerative Colitis on Polygenic Risk Score")
box_diagnosis + stat_compare_means(method = "t.test")

4.2 Gender on PRS

#Gender on Score
box_gender <- ggplot(merged_clinical_final, aes(x=Gender, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Gender on Score")
box_gender + stat_compare_means(method = "t.test")

4.3 CRP on PRS

#CRP Levels affect on Score
merged_clinical_CRP <- subset(merged_clinical_final, !is.na(merged_clinical_final$CRP))
scatter_CRP <- ggplot(merged_clinical_CRP, aes(x=CRP, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "CRP") + geom_smooth(method=lm,  linetype="dashed",
                                                                                             color="darkred", fill="blue") 
scatter_CRP

## `geom_smooth()` using formula = 'y ~ x'

lm_CRP <- lm(exp_SCORE~CRP, data=merged_clinical_CRP)
summary(lm_CRP)

## 
## Call:
## lm(formula = exp_SCORE ~ CRP, data = merged_clinical_CRP)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.162e-04 -3.680e-05 -7.172e-06  4.296e-05  1.120e-04 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 1.000e+00  1.613e-05 62033.823   <2e-16 ***
## CRP         5.139e-07  2.605e-07     1.973   0.0618 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.067e-05 on 21 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1162 
## F-statistic: 3.892 on 1 and 21 DF,  p-value: 0.06182

cor.test(merged_clinical_CRP$CRP, merged_clinical_CRP$exp_SCORE, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  merged_clinical_CRP$CRP and merged_clinical_CRP$exp_SCORE
## t = 1.9728, df = 21, p-value = 0.06182
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02004913  0.69443483
## sample estimates:
##       cor 
## 0.3954209

4.4 Extent of Disease on PRS

#Extent of Disease on Score
merged_clinical_extent <- subset(merged_clinical_final, !is.na(merged_clinical_final$Extent))
box_extent <- ggplot(merged_clinical_extent, aes(x=Extent, y=exp_SCORE)) + geom_boxplot() + ggtitle("Disease Extent on PRS") + labs(y="PRS", x="Disease Extent") + annotate("text", x = 'Extensive', y = 1.0003, label="p-value: 0.59")
res.aov <- aov(exp_SCORE ~ Extent, data = merged_clinical_extent)
summary(res.aov)

##             Df    Sum Sq   Mean Sq F value Pr(>F)
## Extent       3 1.046e-08 3.488e-09   0.655  0.589
## Residuals   21 1.119e-07 5.327e-09

## Extensive vs not extensive
merged_clinical_extensive <- ifelse(grepl("Extensive", merged_clinical_extent$Extent), "Extensive", "Not_Extensive")
extensive_or_not <- data.frame(merged_clinical_extensive, merged_clinical_extent$exp_SCORE)

binary_extensive <- ggplot(extensive_or_not, aes(x=merged_clinical_extensive, y=merged_clinical_extent$exp_SCORE)) + geom_boxplot() + ggtitle("Extensive vs not extensive on Score") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +   theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS", x = "Extensive vs Not Extensive") + stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 1.5, pch = 21, stroke = 0.7, fill = "white")

ggarrange(box_extent, binary_extensive, labels = c("A","B"))

4.5 IBDhigh vs IBDlow on PRS

#IBDhi vs IBDlo
merged_clinical_hilo <- subset(merged_clinical_final, !is.na(merged_clinical_final$PAX))
box_hilo <- ggplot(merged_clinical_hilo, aes(x=PAX, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of IBDhigh vs IBDlow on Score")
box_hilo + stat_compare_means(method = "t.test")

4.6 Effect of different escalations on PRS

4.6.1 Escalations vs. No Escalations on PRS

#Escalations vs. no escalations 
effect_escalations <- data.frame(merged_clinical_final$IID, merged_clinical_final$Total_escalations, merged_clinical_final$exp_SCORE)
effect_escalations <- subset(effect_escalations, !is.na(effect_escalations$merged_clinical_final.Total_escalations))
effect_escalations_binary <- ifelse(grepl("0", effect_escalations$merged_clinical_final.Total_escalations), "No_Escalation", "Escalation")
box_binary_table <- data.frame(effect_escalations, effect_escalations_binary)

box_escalation_binary <- ggplot(box_binary_table, aes(x = effect_escalations_binary, y = merged_clinical_final.exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Escalations vs No escalations", values = c("grey70","grey20")) +  
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS Score", x = "Escalations vs No Escalations") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

box_escalation_binary

4.6.2 Total Escalations on PRS

#Total Escalations on Polygenic Risk Score
merged_clinical_escalations <- subset(merged_clinical_final, !is.na(merged_clinical_final$Total_escalations))
Total_Escalations <- as.character(merged_clinical_escalations$Total_escalations)
box_escalation <- ggplot(merged_clinical_escalations, aes(x=Total_Escalations, y=exp_SCORE)) + geom_boxplot()
box_escalation

4.6.3 Initial Treatment Class on PRS

#Initial Treatment Class on Polygenic Risk Score
merged_clinical_treatmentclass <- subset(merged_clinical_final, !is.na(merged_clinical_final$Initial_treatment_Class))
box_treatment_class <- ggplot(merged_clinical_treatmentclass, aes(x=Initial_treatment_Class, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Initial Treatment Class on Score")
box_treatment_class

4.6.4 Escalation 1 on Score

#Escalation 1 on Score
trial <-subset(merged_clinical_final, c("Immunosuppressant", "Biologic",    "Surgery") %in% merged_clinical_final$Escalation1_Treatment_Class)
trial2 <- subset(trial, !is.na(trial$Escalation1_Treatment_Class))
test <-  trial2$IID == "TRIAD1481"
trial3 <- subset(trial2, !test)
test2 <-  trial3$IID == "TRIAD1487"
merged_clinical_escalation_1 <- subset(trial3, !test2)
box_escalation1 <- ggplot(merged_clinical_escalation_1, aes(x=Escalation1_Treatment_Class, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of First Escalation on Score")
box_escalation1

5 Updated Clinical Data

setwd("/home/the28/imputed_data/scores/clinical_data/updated_clinical_data/")
updated_clinical_data <- read.csv("updated_clinical_data.csv")
PRS_everything <- read.table("PRS_everything_trial2_2.profile", header = T)
PRS_everything <- PRS_everything %>% 
  mutate(exp_SCORE = exp(SCORE),
         status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0)))

merged_update_clinical_initial <- merge(PRS_everything, updated_clinical_data, by.x = "IID", by.y = "Patient_ID", all.x = TRUE, all.y = FALSE)
merged_update_clinical <- subset(merged_update_clinical_initial, grepl("TRIAD", merged_update_clinical_initial$IID))
merged_update_clinical_final <- subset(merged_update_clinical, !is.na(merged_update_clinical$DOB))

5.1 Escalations vs No Escalations

updated_clinical_escalations <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Total_escalations))
escalataions_year <- updated_clinical_escalations$Total_escalations/(updated_clinical_escalations$FollowUp_Days/365)
escalations_df <- data.frame(updated_clinical_escalations$exp_SCORE, updated_clinical_escalations$Total_escalations, escalataions_year)
colnames(escalations_df) <- c("exp_SCORE","Escalations", "Escalation_rate")

escalation_binary <- ifelse(grepl("0", updated_clinical_escalations$Total_escalations), "No_Escalation", "Escalation")
escalation_binary_table <- data.frame(escalation_binary, updated_clinical_escalations$exp_SCORE)

box_escalation_updated_binary <- ggplot(escalation_binary_table, aes(x = escalation_binary, y = updated_clinical_escalations.exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  scale_fill_manual(name = "Escalations vs No escalations", values = c("grey70","grey20")) +  
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS Score", x = "Escalations vs No Escalations") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

box_escalation_updated_binary

density_escalation_updated_binary <- ggplot(escalation_binary_table, aes(x = updated_clinical_escalations.exp_SCORE, group = escalation_binary)) +
  geom_density(aes(fill = escalation_binary), alpha = 0.7) +
  scale_fill_manual(name = "Escalation Status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density")

density_escalation_updated_binary

ggarrange(box_escalation_updated_binary, density_escalation_updated_binary, ncol = 1, nrow = 2, labels = c("A", "B"))

5.2 Total number of escalation

lm_escalation_rate <- lm(exp_SCORE~Escalation_rate, data=escalations_df)
summary(lm_escalation_rate)

## 
## Call:
## lm(formula = exp_SCORE ~ Escalation_rate, data = escalations_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.445e-04 -4.736e-05  5.957e-06  6.520e-05  1.246e-04 
## 
## Coefficients:
##                  Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)     1.000e+00  1.527e-05 65509.959   <2e-16 ***
## Escalation_rate 2.492e-05  2.181e-05     1.143    0.262    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.42e-05 on 30 degrees of freedom
## Multiple R-squared:  0.04172,    Adjusted R-squared:  0.009773 
## F-statistic: 1.306 on 1 and 30 DF,  p-value: 0.2622

#0.04172

update_scatter_escalation_rate <- ggplot(escalations_df, aes(x=Escalation_rate , y=exp_SCORE)) + geom_point() + annotate("text", x = 2.1, y = 1.00032, label="p-value = 0.26") + annotate("text", x = 2.1, y = 1.0003, label="R^2 = 0.04172") + labs(y = "PRS", x = "Escalation rate (Escalation per year of Follow Up)") + geom_smooth(method=lm,  linetype="dashed", color="darkred", fill="blue") 
update_scatter_escalation_rate

## `geom_smooth()` using formula = 'y ~ x'

## Effect of Surgeries

updated_surgeries <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Total_IBD_surgeries))

box_surgeries <- ggplot(updated_surgeries, aes(x = as.character(Total_IBD_surgeries), y = exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS Score", x = "Surgery vs No Surgery") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")


box_surgeries

density_surgeries <- ggplot(updated_surgeries, aes(x = exp_SCORE, group = as.character(Total_IBD_surgeries))) +
  geom_density(aes(fill = as.character(Total_IBD_surgeries)), alpha = 0.7) +
  scale_fill_manual(name = "Surgery Status", values = c("grey70","grey20")) +
  theme_bw() +
  labs(x = "IBD PRS", y = "Density")

density_surgeries

ggarrange(box_surgeries, density_surgeries, ncol = 1, nrow = 2, labels = c("A", "B"))

5.3 Disease Extent

setwd("/home/the28/imputed_data/scores/clinical_data/updated_clinical_data/")
updated_clinical_extent <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Extent))
box_update_extent <- ggplot(updated_clinical_extent, aes(x=Extent, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Disease Extent on Score") + labs(y = "PRS", x = "Disease Extent")
box_update_extent

anova_extent_update <- aov(exp_SCORE ~ Extent, data = updated_clinical_extent)
summary(anova_extent_update)

##             Df    Sum Sq   Mean Sq F value Pr(>F)
## Extent       4 1.132e-08 2.831e-09   0.427  0.788
## Residuals   24 1.592e-07 6.635e-09

# Extensive vs not extensive
updated_extensive_binary <- ifelse(grepl("Extensive", updated_clinical_extent$Extent), "Extensive", "Not_Extensive")
extensive_or_not_update <- data.frame(updated_extensive_binary, updated_clinical_extent$exp_SCORE)

ggplot(extensive_or_not_update, aes(x=updated_extensive_binary, y=updated_clinical_extent$exp_SCORE)) + geom_boxplot() + ggtitle("Extensive vs not extensive on Score") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +   theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS", x = "Extensive vs Not Extensive") + stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 1.5, pch = 21, stroke = 0.7, fill = "white")

#Endoscopy effect
updated_endoscopy <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Endoscopy))
updated_endoscopy$Endoscopy[updated_endoscopy$Endoscopy == "Severe (CT)"] <- "Severe"
anova_endoscopy <- aov(exp_SCORE ~ Endoscopy, data = updated_endoscopy)
summary(anova_endoscopy)

##             Df    Sum Sq   Mean Sq F value Pr(>F)
## Endoscopy    2 6.710e-09 3.355e-09   0.588  0.563
## Residuals   24 1.369e-07 5.704e-09

box_update_endoscopy <- ggplot(updated_endoscopy, aes(x=Endoscopy, y=exp_SCORE)) + geom_boxplot() + labs(y = "PRS", x = "Endoscopy Findings") + annotate("text", x = "Mild", y = 1.0003, label="p-value = 0.56")
box_update_endoscopy

updated_endoscopy_moderate <- subset(updated_endoscopy, !grepl("Mild", updated_endoscopy$Endoscopy))

boxplot_endoscopy_moderate <- ggplot(updated_endoscopy_moderate, aes(x = Endoscopy, y = exp_SCORE)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "PRS Score", x = "Endoscopy Findings") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

boxplot_endoscopy_moderate

5.4 Endoscopy finding

ggarrange(box_update_endoscopy, boxplot_endoscopy_moderate, nrow = 2, ncol = 1, labels = c("A","B"))

5.5 CRP

updated_CRP <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$CRP))
lm_CRP <- lm(exp_SCORE~CRP, data=updated_CRP)
summary(lm_CRP)

## 
## Call:
## lm(formula = exp_SCORE ~ CRP, data = updated_CRP)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.190e-04 -4.070e-05  1.139e-05  5.128e-05  1.095e-04 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 1.000e+00  1.532e-05 65298.762   <2e-16 ***
## CRP         5.197e-07  2.624e-07     1.981   0.0592 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.284e-05 on 24 degrees of freedom
## Multiple R-squared:  0.1405, Adjusted R-squared:  0.1047 
## F-statistic: 3.923 on 1 and 24 DF,  p-value: 0.0592

cor.test(updated_CRP$CRP, updated_CRP$exp_SCORE, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  updated_CRP$CRP and updated_CRP$exp_SCORE
## t = 1.9807, df = 24, p-value = 0.0592
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01465053  0.66554903
## sample estimates:
##       cor 
## 0.3748285

scatter_CRP <- ggplot(updated_CRP, aes(x=CRP, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "CRP") + annotate("text", x = 150, y = 1.000365, label="p-value = 0.059") + annotate("text", x = 150, y = 1.000345, label="R^2 = 0.141") + geom_smooth(method=lm,  linetype="dashed",                                                                                                                            color="darkred", fill="blue") + ggtitle("CRP on PRS")
scatter_CRP

## `geom_smooth()` using formula = 'y ~ x'

5.6 Hb

updated_Hb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Hb))
lm_Hb <- lm(exp_SCORE~Hb, data=updated_Hb)
summary(lm_Hb)

## 
## Call:
## lm(formula = exp_SCORE ~ Hb, data = updated_Hb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.328e-04 -6.111e-05  2.057e-05  5.084e-05  1.118e-04 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  1.001e+00  1.026e-04 9749.773   <2e-16 ***
## Hb          -8.698e-06  8.030e-06   -1.083    0.289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.214e-05 on 25 degrees of freedom
## Multiple R-squared:  0.04483,    Adjusted R-squared:  0.006628 
## F-statistic: 1.173 on 1 and 25 DF,  p-value: 0.289

cor.test(updated_Hb$Hb, updated_Hb$exp_SCORE, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  updated_Hb$Hb and updated_Hb$exp_SCORE
## t = -1.0833, df = 25, p-value = 0.289
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5476856  0.1829979
## sample estimates:
##        cor 
## -0.2117409

scatter_Hb <- ggplot(updated_Hb, aes(x=Hb, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "Hb") + annotate("text", x = 9, y = 1.00033, label="p-value = 0.29") + annotate("text", x = 9, y = 1.00031, label="R^2 = 0.0448") + geom_smooth(method=lm,  linetype="dashed",                                                                                                                            color="darkred", fill="blue") + ggtitle("Haemoglobin on PRS")

5.7 Alb

updated_Alb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Alb))
lm_Alb <- lm(exp_SCORE~Alb, data=updated_Alb)
summary(lm_Alb)

## 
## Call:
## lm(formula = exp_SCORE ~ Alb, data = updated_Alb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.236e-04 -6.387e-05  2.515e-05  5.195e-05  1.178e-04 
## 
## Coefficients:
##               Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)  1.001e+00  9.373e-05 10674.475   <2e-16 ***
## Alb         -3.293e-06  2.667e-06    -1.234    0.229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.272e-05 on 24 degrees of freedom
## Multiple R-squared:  0.0597, Adjusted R-squared:  0.02052 
## F-statistic: 1.524 on 1 and 24 DF,  p-value: 0.229

cor.test(updated_Alb$Alb, updated_Alb$exp_SCORE, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  updated_Alb$Alb and updated_Alb$exp_SCORE
## t = -1.2344, df = 24, p-value = 0.229
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5770765  0.1579594
## sample estimates:
##        cor 
## -0.2443429

scatter_Alb <- ggplot(updated_Alb, aes(x=Alb, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "Alb") + annotate("text", x=25, y=1.00033, label="p-value = 0.23") + annotate("text", x=25, y=1.00031, label="R^2 = 0.0597") + geom_smooth(method=lm,  linetype="dashed",                                                                                                                            color="darkred", fill="blue") + ggtitle("Albumin on PRS")

ggarrange(scatter_Hb, scatter_Alb, labels = c("A","B"))

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

6 Clinical Outcomes in the ROC group

6.0.1 Total number of escalations

status.pred_everything_trial <- ifelse(status.pred_everything > 0.3, 1, 0) ## IBD prediction >30% 

pred.res_trial <- data.frame(IID = PRS_everything$IID,
                       status_pred_everything = status.pred_everything_trial)

pred.res_merge <- merge(pred.res_trial, merged_update_clinical_final, by = "IID")
pred.res_merge$prediction <- ifelse(pred.res_merge$status_pred_everything == 1, "predicted", "not_predicted")

#Comparison of CRP levels of predicted vs not predicted
merged_clinical_predicted_CRP <- subset(pred.res_merge, !is.na(pred.res_merge$CRP))
ggplot(merged_clinical_predicted_CRP, aes(x = prediction, y = CRP)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +  
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "CRP", x = "IBD Prediction") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

#Comparison of total number of escalation of predicted vs not predicted
ggplot(pred.res_merge, aes(x = prediction, y = Total_IBD_surgeries)) + 
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  stat_boxplot(geom = "errorbar") +
  geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +  
  theme_bw() +
  theme(legend.position = "none") +
  labs(y = "Total Surgeries", x = "IBD Prediction") +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

#pred_escalations_box <- ggplot(pred_escalations_table, aes(y = Total_Escalations, x = Predicted_IBD)) + geom_boxplot(aes(fill = Predicted_IBD), alpha = 0.7, outlier.shape = NA) + stat_boxplot(geom = "errorbar") + geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") + scale_fill_manual(values = c("grey70","grey20")) + theme_bw() + theme(legend.position = "none") + labs(y = "Total Number of Escalations", x = "Patients predicted with IBD in ROC curve") +
  

pred_escalation_binary <- ifelse(grepl("0", pred.res_merge$Total_escalations), "No_Escalation", "Escalation")
pred_escalation_binary_table <- table(pred.res_merge$prediction, pred_escalation_binary) ; pred_escalation_binary_table

##                pred_escalation_binary
##                 Escalation No_Escalation
##   not_predicted         11            11
##   predicted              6             4

#mcnemar.test(pred_escalation_binary_table)
#p-value = 0.1

pred_escalation_binary_bar_table <- data.frame(as.numeric(c("7","4","11","5")), c("Escalation","Escalation","No_Escalation","No_Escalation"), c("not_predicted",'predicted',"not_predicted",'predicted'))
colnames(pred_escalation_binary_bar_table) <- c("Freq","Escalation","Prediction")

                                               
pred_escalation_binary_bar_box <- ggplot(pred_escalation_binary_bar_table, aes(x=Prediction, y=Freq, fill=Escalation)) + 
  geom_bar(stat="identity", position=position_dodge()) + scale_fill_brewer(palette="Paired") + labs(x = "Prediction of IBD from ROC Curve", y = "Frequency") +  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")

  
  
pred_escalation_binary_bar_box

7 Predictive power of PRS for IBD prognosis

library(pROC)
## GLM (as above)

IBD_prognostic_glm <- glm(
  formula = escalation_binary ~ updated_clinical_escalations.exp_SCORE, 
  family = binomial(link = "logit"), data = escalation_binary_table)

summary(IBD_prognostic_glm)

## 
## Call:
## glm(formula = escalation_binary ~ updated_clinical_escalations.exp_SCORE, 
##     family = binomial(link = "logit"), data = escalation_binary_table)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6109  -0.8583  -0.6101   1.0177   1.9251  
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                               13373       5948   2.248   0.0246 *
## updated_clinical_escalations.exp_SCORE   -13367       5946  -2.248   0.0246 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.236  on 31  degrees of freedom
## Residual deviance: 38.017  on 30  degrees of freedom
## AIC: 42.017
## 
## Number of Fisher Scoring iterations: 4

coef(summary(IBD_prognostic_glm))[,'Pr(>|z|)'] #p=0.0246

##                            (Intercept) updated_clinical_escalations.exp_SCORE 
##                             0.02456445                             0.02456309

## Predict disease status from model
IBD_prognostic_pred <- predict(IBD_prognostic_glm, newdata = escalation_binary_table, type = "response")
status.pred_IBD_prognostic <- ifelse(IBD_prognostic_pred > 0.3, 1, 0) ## Escalation > 30% 

pred.res_prognostic <- data.frame(status = escalation_binary_table$escalation_binary, status.pred_IBD_prognostic = status.pred_IBD_prognostic)
                           
                       

## Compare results
table(escalation_binary_table$escalation_binary, status.pred_IBD_prognostic)

##                status.pred_IBD_prognostic
##                  0  1
##   Escalation     9  8
##   No_Escalation  2 13

roc.score_prognostic <- roc(escalation_binary_table$escalation_binary, status.pred_IBD_prognostic)

## Setting levels: control = Escalation, case = No_Escalation

## Setting direction: controls < cases

plot.roc(roc.score_prognostic, col = "blue", print.auc = TRUE, legacy.axes = TRUE)

summary(lm_Alb)

## 
## Call:
## lm(formula = exp_SCORE ~ Alb, data = updated_Alb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.236e-04 -6.387e-05  2.515e-05  5.195e-05  1.178e-04 
## 
## Coefficients:
##               Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)  1.001e+00  9.373e-05 10674.475   <2e-16 ***
## Alb         -3.293e-06  2.667e-06    -1.234    0.229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.272e-05 on 24 degrees of freedom
## Multiple R-squared:  0.0597, Adjusted R-squared:  0.02052 
## F-statistic: 1.524 on 1 and 24 DF,  p-value: 0.229

cor.test(updated_Alb$Alb, updated_Alb$exp_SCORE, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  updated_Alb$Alb and updated_Alb$exp_SCORE
## t = -1.2344, df = 24, p-value = 0.229
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5770765  0.1579594
## sample estimates:
##        cor 
## -0.2443429

updated_Alb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Alb))
scatter_Alb <- ggplot(updated_Alb, aes(x=Alb, y=exp_SCORE)) + geom_point() + labs(y = "Alb", x = "Hb") + annotate("text", x=42, y=1.00033, label="p-value: 0.23") + geom_smooth(method=lm,  linetype="dashed",                                                                                                                            color="darkred", fill="blue") 
scatter_Alb

## `geom_smooth()` using formula = 'y ~ x'

Investigating a genetic basis for clinical outcome for Inflammatory Bowel Disease

Tejas Easwar