setwd("/home/the28/imputed_data/scores/")
#Reading in the files
PRS_everything <- read.table("PRS_everything_trial2_2.profile", header = T)
PRS_0.05 <- read.table("PRS_0.05.profile", header = T)
PRS_10_5 <- read.table("PRS_10_5.profile", header = T)
PRS_10_8 <- read.table("PRS_10_8.profile", header = T)
## Read in principal components capturing population stratificaton
pcs <- read.table("merged_pca_Shellfish.evecs")
## Accompanying .fam file
fam <- read.table("merged_pca.fam")
pcs <- t(pcs) %>%
as.data.frame() %>%
mutate(IID = fam$V2) %>%
relocate(IID)
colnames(pcs)[2:11] <- paste0("PC",seq(1,10))
# Exponentiate score to transform back to an OR scale, add disease status
PRS_everything <- PRS_everything %>%
mutate(exp_SCORE = exp(SCORE),
status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>%
left_join(pcs)
## Joining, by = "IID"
table(PRS_everything$status) ## 54 cases, 998 controls
##
## 0 1
## 998 54
PRS_0.05 <- PRS_0.05 %>%
mutate(exp_SCORE = exp(SCORE),
status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>%
left_join(pcs)
## Joining, by = "IID"
PRS_10_5 <- PRS_10_5 %>%
mutate(exp_SCORE = exp(SCORE),
status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>%
left_join(pcs)
## Joining, by = "IID"
PRS_10_8 <- PRS_10_8 %>%
mutate(exp_SCORE = exp(SCORE),
status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0))) %>%
left_join(pcs)
## Joining, by = "IID"
p.dist_everything <- ggplot(PRS_everything, aes(x = exp_SCORE, group = status)) +
geom_density(aes(fill = status), alpha = 0.7) +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density") + ggtitle("p<1")
p.dist_0.05 <- ggplot(PRS_0.05, aes(x = exp_SCORE, group = status)) +
geom_density(aes(fill = status), alpha = 0.7) +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density") + ggtitle("p<0.05")
p.dist_10_5 <- ggplot(PRS_10_5, aes(x = exp_SCORE, group = status)) +
geom_density(aes(fill = status), alpha = 0.7) +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density") + ggtitle("p<10^(-5)")
p.dist_10_8 <- ggplot(PRS_10_8, aes(x = exp_SCORE, group = status)) +
geom_density(aes(fill = status), alpha = 0.7) +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density") + ggtitle("p<10^(-8)")
ggarrange(p.dist_everything, p.dist_0.05, p.dist_10_5, p.dist_10_8)
p.box_everything <- ggplot(PRS_everything, aes(y = exp_SCORE, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "IBD PRS", x = "Disease status")
p.box_0.05 <- ggplot(PRS_0.05, aes(y = exp_SCORE, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "IBD PRS", x = "Disease status")
p.box_10_5 <- ggplot(PRS_10_5, aes(y = exp_SCORE, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "IBD PRS", x = "Disease status")
p.box_10_8 <- ggplot(PRS_10_8, aes(y = exp_SCORE, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "IBD PRS", x = "Disease status")
ggarrange(p.box_everything, p.box_0.05, p.box_10_5, p.box_10_8)
p_1_visual <- ggarrange(p.dist_everything,p.box_everything)
p_0.05_visual <- ggarrange(p.dist_0.05,p.box_0.05)
p_10_5_visual <- ggarrange(p.dist_10_5,p.box_10_5)
p_10_8_visual <- ggarrange(p.dist_10_8,p.box_10_8)
ggarrange(p_1_visual, p_0.05_visual, p_10_5_visual, p_10_8_visual, labels = c("A","B","C","D"))
setwd("/home/the28/pca_analysis/")
library(tidyverse)
library(ggplot2)
library(hrbrthemes)
meta <- read.csv(file = "1000G_HDGenotypes_SampleInfo.csv")
evec <- read.delim(file = "merged_pca_Shellfish.evecs",sep = " ", header = F)
fam <- read.delim(file = "merged_pca.fam", header = F, sep = " ")
evec <- as.data.frame(t(evec))
evec = evec[-c(nrow(evec)), ]
colnames(evec) <- paste0("PC",seq(1,10))
rownames(evec) <- fam$V2
#ID_name column
ID_name <- fam$V2
evec <- add_column(evec, ID_name)
## Merge evecs and metadata files and IDs
evecs.full <- evec %>% rownames_to_column("Sample") %>%
left_join(meta)
## Joining, by = "Sample"
#Generating the case_controls
evecs.full <- evecs.full %>%
mutate(case_control = ifelse(!is.na(evecs.full$Population), "2",
ifelse(grepl("TRIAD", evecs.full$ID_name), '1', '0')))
df <- evecs.full[c('PC1', 'PC2','Population', 'case_control')]
# basic scatterplot
colour_blind <- c("#ff9045", "#7370fc", "#4aad19", "#cf67ef", "#007c0e", "#ff56cb", "#00a35b", "#e7005e", "#68dc96","#832a92","#b6d167","#0161cb", "#be8800", "#fbaafc","#3b5b23","#ff6ba7", "#60dac3", "#f03c3c","#008d88","#9e2400","#723f7d","#b36500","#a21146","#ecbe8b","#895f00","#ffa190")
ggplot(df, aes(x=PC1, y=PC2, group=case_control, fill=Population, color=Population)) +
geom_point(aes(shape=case_control,color=Population,size=case_control)) +
scale_color_manual(values=colour_blind, na.value = "yellow") +
scale_fill_manual(values=colour_blind, na.value = "black") +
scale_shape_manual(values = c(21,24,22), labels = c("Controls", "Cases", "1000G Data"), name = "Data Type") +
scale_size_manual(values = c(4,5,1)) +
geom_vline(aes(xintercept = 0.002), linetype=3) +
geom_hline(aes(yintercept = -0.01), linetype=3) +
geom_hline(aes(yintercept = -0.033), linetype=3) +
geom_vline(aes(xintercept = 0.025), linetype=3)
##Non-European exclusion trial
european_pops <- c("CEU", "FIN", "GBR", "IBS", "TSI")
european_samples <- filter(evecs.full, Population %in% european_pops)
pc1_mean <- mean(european_samples$PC1)
pc2_mean <- mean(european_samples$PC2)
pc1_sd <- sd(european_samples$PC1)
pc2_sd <- sd(european_samples$PC2)
european_samples$pc1_dist <- abs(european_samples$PC1 - pc1_mean)
european_samples$pc2_dist <- abs(european_samples$PC2 - pc2_mean)
outliers <- subset(european_samples, pc1_dist > 3 * pc1_sd | pc2_dist > 3 * pc2_sd)
setwd("/home/the28/imputed_data/scores/")
p.pcs <- ggplot(PRS_everything, aes(x = PC1, y = PC2)) +
geom_point(aes(fill = status, shape = status), alpha = 0.7, size = 4) +
scale_shape_manual(name = "Disease status", values = c(21,24), labels = c("Controls", "Cases")) +
scale_fill_manual(name = "Disease status", values = c("grey90","grey20"), labels = c("Controls", "Cases")) +
theme_bw()
p.pcs
p.pcafull_trial <- data.frame(PRS_everything$PC1, PRS_everything$PC2, PRS_everything$status)
colnames(p.pcafull_trial) <- c("PC1","PC2","status")
evecs.named <- evec %>% rownames_to_column("Sample")
merged_evecs <- merge(meta, evecs.named, by = "Sample")
control_genome_dataset <- merged_evecs[c("Sample","PC1","PC2","Population")]
control_genome_dataset$status <- 2
p.pcafull_trial$Population <- NA
control_genome_subset <- control_genome_dataset %>%
select(PC1, PC2, status, Population)
merged_PCA <- merge(control_genome_subset, p.pcafull_trial, by = c("PC1", "PC2", "status", "Population"), all.x = TRUE, all.y = TRUE)
# basic scatterplot
colour_blind <- c("#ff9045", "#7370fc", "#4aad19", "#cf67ef", "#007c0e", "#ff56cb", "#00a35b", "#e7005e", "#68dc96","#832a92","#b6d167","#0161cb", "#be8800", "#fbaafc","#3b5b23","#ff6ba7", "#60dac3", "#f03c3c","#008d88","#9e2400","#723f7d","#b36500","#a21146","#ecbe8b","#895f00","#ffa190")
ggplot(merged_PCA, aes(x=PC1, y=PC2, group=status, fill=Population, color=Population)) +
geom_point(aes(shape=status,color=Population,size=status)) +
scale_color_manual(values=colour_blind, na.value = "yellow") +
scale_fill_manual(values=colour_blind, na.value = "black") +
scale_shape_manual(values = c(21,24,22), labels = c("Controls", "Cases", "1000G Data"), name = "Data Type") +
scale_size_manual(values = c(4,5,1)) +
geom_vline(aes(xintercept = 0.002), linetype=3) +
geom_hline(aes(yintercept = -0.01), linetype=3) +
geom_hline(aes(yintercept = -0.033), linetype=3) +
geom_vline(aes(xintercept = 0.025), linetype=3)
ggplot(control_genome_subset, aes(x=PC1, y=PC2, fill=Population, color=Population)) +
geom_point(aes(color=Population)) +
scale_color_manual(values=colour_blind) +
scale_fill_manual(values=colour_blind)
Looks at whether the means PRS is different between groups
stat.t_everything <- PRS_everything %>% t_test(data = PRS_everything, formula = exp_SCORE ~ status)
stat.t_everything ## p = 0.00000521
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 exp_SCORE 0 1 998 54 -5.02 58.7 0.00000521
stat.t_0.05 <- PRS_0.05 %>% t_test(data = PRS_0.05, formula = exp_SCORE ~ status)
stat.t_0.05 ## p = 0.00000544
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 exp_SCORE 0 1 998 54 -5.00 58.6 0.00000544
stat.t_10_5 <- PRS_10_5 %>% t_test(data = PRS_10_5, formula = exp_SCORE ~ status)
stat.t_10_5 ## p = 0.000019
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 exp_SCORE 0 1 998 54 -4.65 59.3 0.000019
stat.t_10_8 <- PRS_10_8 %>% t_test(data = PRS_10_8, formula = exp_SCORE ~ status)
stat.t_10_8 ## p = 0.00197
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 exp_SCORE 0 1 998 54 -3.24 59.2 0.00197
p_values <- c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)")
t_values <- c(stat.t_everything$p, stat.t_0.05$p, stat.t_10_5$p, stat.t_10_8$p)
log_t_values <- as.numeric(format(round(abs(log10(t_values)), 2), nsmall = 2))
t_table <- data.frame(t_values, p_values, log_t_values)
t_graph <- ggplot(data=t_table, aes(x=reorder(p_values, t_values), y=t_values)) +
geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "T-test p values")
t_graph
log_t_graph <- ggplot(data=t_table, aes(x=reorder(p_values, -log_t_values), y=log_t_values)) +
geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "-log10(p) from T-test") +
geom_text(aes(label=log_t_values), vjust=1.6, color="white", size=3.5) +
theme_minimal()
log_t_graph
Estimates how disease status varies based on PRS
stat.reg_everything <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_everything)
summary(stat.reg_everything)
##
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"),
## data = PRS_everything)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7489 -0.3575 -0.2747 -0.2158 2.9775
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10925 2203 -4.958 7.11e-07 ***
## exp_SCORE 10918 2202 4.957 7.15e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 399.84 on 1050 degrees of freedom
## AIC: 403.84
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.reg_everything))[,'Pr(>|z|)']
## (Intercept) exp_SCORE
## 7.10608e-07 7.15096e-07
#p = 7.15e-07
stat.reg_0.05 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_0.05)
summary(stat.reg_0.05)
##
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"),
## data = PRS_0.05)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7692 -0.3563 -0.2796 -0.2141 3.0845
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3428.3 690.9 -4.962 6.96e-07 ***
## exp_SCORE 3423.4 690.4 4.959 7.10e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 399.81 on 1050 degrees of freedom
## AIC: 403.81
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.reg_0.05))[,'Pr(>|z|)']
## (Intercept) exp_SCORE
## 6.961371e-07 7.102635e-07
#p = 7.10e-07
stat.reg_10_5 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_10_5)
summary(stat.reg_10_5)
##
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"),
## data = PRS_10_5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8773 -0.3544 -0.2874 -0.2340 2.8278
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -223.62 50.11 -4.462 8.11e-06 ***
## exp_SCORE 221.41 50.24 4.407 1.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 406.32 on 1050 degrees of freedom
## AIC: 410.32
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.reg_10_5))[,'Pr(>|z|)']
## (Intercept) exp_SCORE
## 8.107217e-06 1.046227e-05
#p = 1.05e-05
stat.reg_10_8 <- glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"), data = PRS_10_8)
summary(stat.reg_10_8)
##
## Call:
## glm(formula = status ~ exp_SCORE, family = binomial(link = "logit"),
## data = PRS_10_8)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7016 -0.3516 -0.3073 -0.2632 2.7381
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -92.10 28.45 -3.237 0.00121 **
## exp_SCORE 90.30 28.78 3.137 0.00171 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 416.27 on 1050 degrees of freedom
## AIC: 420.27
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.reg_10_8))[,'Pr(>|z|)']
## (Intercept) exp_SCORE
## 0.001208313 0.001705098
#p = 0.00171
p_values_regression <- as.numeric(c("7.15e-07","7.10e-07","1.05e-05","0.00171"))
log_p_regression <- as.numeric(format(round((abs(log10(p_values_regression))) , 2), nsmall = 2))
regression_table <- data.frame(p_values, p_values_regression, log_p_regression)
regression_graph_unedited <- ggplot(data=regression_table, aes(x=p_values, y=p_values_regression)) +
geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "Logistical Regression p values")
regression_graph_unedited
log_regression_graph_unedited <- ggplot(data=regression_table, aes(x=reorder(p_values, -log_p_regression), y=log_p_regression)) +
geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "-log10(p values) from logistical regression") + geom_text(aes(label=format(round(log_p_regression, 3), nsmall = 3)),vjust=1.6, color="white", size=3.5)
log_regression_graph_unedited
p.PC1 <- ggplot(PRS_everything, aes(y = PC1, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 3, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PC1", x = "Disease status") +
scale_x_discrete(labels = c("0" = "Healthy Controls", "1" = "Cases"))
p.PC1
p.PC2 <- ggplot(PRS_everything, aes(y = PC2, x = status)) +
geom_boxplot(aes(fill = status), alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 3, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Disease status", values = c("grey70","grey20")) +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PC2", x = "Disease status") +
scale_x_discrete(labels = c("0" = "Healthy Controls", "1" = "Cases"))
p.PC2
wilcox.test(PC2 ~ status, data = PRS_everything)
##
## Wilcoxon rank sum test with continuity correction
##
## data: PC2 by status
## W = 33480, p-value = 0.002663
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(PC3 ~ status, data = PRS_everything)
##
## Wilcoxon rank sum test with continuity correction
##
## data: PC3 by status
## W = 25164, p-value = 0.4125
## alternative hypothesis: true location shift is not equal to 0
p.pcs <- ggplot(PRS_everything, aes(x = PC1, y = PC2)) +
geom_point(aes(fill = status, shape = status), alpha = 0.7, size = 4) +
scale_shape_manual(name = "Disease status", values = c(21,24), labels = c("Healthy Controls", "Cases")) +
scale_fill_manual(name = "Disease status", values = c("grey90","grey20"), labels = c("Healthy Controls", "Cases")) +
theme_bw()
ggarrange(p.pcs, p.PC1, p.PC2, labels = c("A","B", "C"), nrow = 1)
###Logistic regression with PC correction
## ==== Logistic regression with PC correction ==== ##
stat.PCs.reg_everything <- glm(
formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
family = binomial(link = "logit"), data = PRS_everything)
summary(stat.PCs.reg_everything)
##
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 +
## PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"),
## data = PRS_everything)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5280 -0.3286 -0.2273 -0.1629 3.1827
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11146.01 2363.00 -4.717 2.39e-06 ***
## exp_SCORE 11157.61 2362.24 4.723 2.32e-06 ***
## PC1 1583.40 433.65 3.651 0.000261 ***
## PC2 -15.83 345.59 -0.046 0.963472
## PC3 96.39 186.47 0.517 0.605219
## PC4 104.66 73.16 1.431 0.152516
## PC5 28.83 61.50 0.469 0.639228
## PC6 -66.87 40.89 -1.635 0.102004
## PC7 16.67 54.11 0.308 0.758055
## PC8 -72.38 44.19 -1.638 0.101441
## PC9 38.04 37.67 1.010 0.312612
## PC10 76.60 32.64 2.347 0.018936 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 357.33 on 1040 degrees of freedom
## AIC: 381.33
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.reg_everything))[,'Pr(>|z|)']
## (Intercept) exp_SCORE PC1 PC2 PC3 PC4
## 2.394677e-06 2.320200e-06 2.608502e-04 9.634717e-01 6.052193e-01 1.525160e-01
## PC5 PC6 PC7 PC8 PC9 PC10
## 6.392278e-01 1.020043e-01 7.580546e-01 1.014410e-01 3.126120e-01 1.893577e-02
#p = 2.32e-06
stat.PCs.reg_0.05 <- glm(
formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
family = binomial(link = "logit"), data = PRS_0.05)
summary(stat.PCs.reg_0.05)
##
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 +
## PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"),
## data = PRS_0.05)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5206 -0.3291 -0.2314 -0.1655 3.2597
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3379.37 725.27 -4.659 3.17e-06 ***
## exp_SCORE 3393.88 725.05 4.681 2.86e-06 ***
## PC1 1582.03 429.68 3.682 0.000231 ***
## PC2 -36.03 342.98 -0.105 0.916334
## PC3 98.42 186.23 0.528 0.597171
## PC4 106.25 73.49 1.446 0.148232
## PC5 30.47 61.31 0.497 0.619237
## PC6 -71.35 40.81 -1.748 0.080394 .
## PC7 20.09 54.30 0.370 0.711427
## PC8 -66.99 44.21 -1.515 0.129734
## PC9 33.47 37.24 0.899 0.368784
## PC10 75.43 32.76 2.302 0.021323 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 357.78 on 1040 degrees of freedom
## AIC: 381.78
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.reg_0.05))[,'Pr(>|z|)']
## (Intercept) exp_SCORE PC1 PC2 PC3 PC4
## 3.170562e-06 2.856442e-06 2.314931e-04 9.163339e-01 5.971708e-01 1.482323e-01
## PC5 PC6 PC7 PC8 PC9 PC10
## 6.192367e-01 8.039444e-02 7.114266e-01 1.297336e-01 3.687843e-01 2.132294e-02
#p = 2.86e-06
stat.PCs.reg_10_5 <- glm(
formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
family = binomial(link = "logit"), data = PRS_10_5)
summary(stat.PCs.reg_10_5)
##
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 +
## PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"),
## data = PRS_10_5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6501 -0.3319 -0.2482 -0.1777 3.0001
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -197.64 52.37 -3.774 0.000161 ***
## exp_SCORE 215.05 52.57 4.091 4.29e-05 ***
## PC1 1509.84 430.81 3.505 0.000457 ***
## PC2 -128.35 351.58 -0.365 0.715062
## PC3 123.94 187.59 0.661 0.508808
## PC4 96.80 74.20 1.305 0.192062
## PC5 43.31 59.70 0.726 0.468126
## PC6 -74.95 40.40 -1.855 0.063586 .
## PC7 24.47 53.52 0.457 0.647539
## PC8 -66.42 43.68 -1.520 0.128392
## PC9 28.93 37.33 0.775 0.438285
## PC10 84.26 32.34 2.605 0.009175 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 364.26 on 1040 degrees of freedom
## AIC: 388.26
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.reg_10_5))[,'Pr(>|z|)']
## (Intercept) exp_SCORE PC1 PC2 PC3 PC4
## 1.605232e-04 4.294042e-05 4.572746e-04 7.150617e-01 5.088082e-01 1.920621e-01
## PC5 PC6 PC7 PC8 PC9 PC10
## 4.681259e-01 6.358647e-02 6.475389e-01 1.283924e-01 4.382855e-01 9.174659e-03
#p = 4.29e-05
stat.PCs.reg_10_8 <- glm(
formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
family = binomial(link = "logit"), data = PRS_10_8)
summary(stat.PCs.reg_10_8)
##
## Call:
## glm(formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 +
## PC6 + PC7 + PC8 + PC9 + PC10, family = binomial(link = "logit"),
## data = PRS_10_8)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6995 -0.3348 -0.2577 -0.1924 2.9927
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -69.21 29.66 -2.333 0.01963 *
## exp_SCORE 86.98 29.93 2.906 0.00366 **
## PC1 1518.35 426.41 3.561 0.00037 ***
## PC2 -115.09 348.01 -0.331 0.74086
## PC3 119.95 186.99 0.641 0.52121
## PC4 94.21 73.59 1.280 0.20050
## PC5 45.09 59.04 0.764 0.44501
## PC6 -75.85 40.05 -1.894 0.05822 .
## PC7 19.26 53.43 0.361 0.71847
## PC8 -68.74 43.09 -1.595 0.11064
## PC9 29.63 36.82 0.805 0.42105
## PC10 86.40 31.97 2.703 0.00688 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.88 on 1051 degrees of freedom
## Residual deviance: 373.09 on 1040 degrees of freedom
## AIC: 397.09
##
## Number of Fisher Scoring iterations: 6
coef(summary(stat.PCs.reg_10_8))[,'Pr(>|z|)']
## (Intercept) exp_SCORE PC1 PC2 PC3 PC4
## 0.0196296236 0.0036630415 0.0003698046 0.7408583514 0.5212126251 0.2004982348
## PC5 PC6 PC7 PC8 PC9 PC10
## 0.4450090313 0.0582237356 0.7184700441 0.1106427260 0.4210510997 0.0068767880
#p = 0.00366
p_value_regression_pca <- as.numeric(c("2.32e-06","2.86e-06","4.29e-05","0.00366"))
log_p_regression_pca <- as.numeric(format(round((abs(log10(p_value_regression_pca))), 2), nsmall = 2))
merged_log <- c(log_p_regression, log_p_regression_pca)
merged_p_values <- c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)","p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)")
merged_p_values <- factor(merged_p_values, levels=c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)"))
which_p_value <- c("Initial", "Initial", "Initial", "Initial", "PCA", "PCA", "PCA", "PCA")
p_value_combined <- data.frame(merged_log, merged_p_values, which_p_value)
ggplot(data=p_value_combined, aes(x=merged_p_values, y=merged_log, fill=which_p_value)) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label=format(round(merged_log, 2), nsmall = 2)), vjust=1.6, color="white",
position = position_dodge(0.9), size=5.5)+
scale_fill_brewer(palette="Paired") + labs(x = "p threshold values", y = "-log10(p values)", fill="PC correction")
p_value_PC_inclusion <- c(7.15e-7, 7.10e-7, 1.05e-5, 1.71e-3, 2.32e-6, 2.86e-6, 4.29e-5, 3.66e-3)
p_value_table_logistic <- data.frame(p_value_PC_inclusion, which_p_value)
wilcox.test(p_value_PC_inclusion ~ which_p_value, data = p_value_table_logistic)
##
## Wilcoxon rank sum test
##
## data: p_value_PC_inclusion by which_p_value
## W = 5, p-value = 0.4857
## alternative hypothesis: true location shift is not equal to 0
# Pseudo R^2 can be calculated as an indication of variance in disease status explained by PRS
# Use rms package (score needs to be scaled)
stat.reg2_everything <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_everything)
#print(stat.reg2_everything) #R2 = 0.073
stat.reg2_0.05 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_0.05)
#print(stat.reg2_0.05) #R2 = 0.074
stat.reg2_10_5 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_10_5)
#print(stat.reg2_10_5) #R2 = 0.055
stat.reg2_10_8 <- rms::lrm(formula = status ~ scale(exp_SCORE) , data = PRS_10_8)
#print(stat.reg2_10_8) #R2 = 0.027
pseudo_R2_initial_values <- as.numeric(c("0.073","0.074","0.055","0.027"))
pseudo_R2_initial_table <- data.frame(pseudo_R2_initial_values, p_values)
pseudo_R2_initial_graph <- ggplot(data=pseudo_R2_initial_table, aes(x=p_values, y=pseudo_R2_initial_values)) +
geom_bar(stat="identity", fill="steelblue") + labs(x = "p threshold values", y = "Pseudo R^2 Values") + geom_text(aes(label=pseudo_R2_initial_values), vjust=1.6, color="white", size=3.5)
pseudo_R2_initial_graph
# Calculate pseudo R^2
stat.PCs.reg2_everything <- rms::lrm(
formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_everything)
#R2=0.189
stat.PCs.reg2_0.05 <- rms::lrm(
formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_0.05)
#R2=0.188
stat.PCs.reg2_10_5 <- rms::lrm(
formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_10_5)
#R2=0.171
stat.PCs.reg2_10_8 <- rms::lrm(
formula = status ~ scale(exp_SCORE) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = PRS_10_8)
#R2=0.147
## Generate bar graph with x = P-value threshold for PRS calculation, y = pseudo R2
merged_pseudo_R2_values <- as.numeric(c(pseudo_R2_initial_values, "0.189", "0.188", "0.171", "0.147"))
merged_pseudo_R2_table <- data.frame(merged_p_values, which_p_value, merged_pseudo_R2_values)
merged_pseudo_R2_table$merged_p_values <- factor(merged_pseudo_R2_table$merged_p_values, levels=c("p < 1", "p < 0.05", "p < 1*10^(5)", "p < 1*10^(8)"))
ggplot(data=merged_pseudo_R2_table, aes(x=merged_p_values, y=merged_pseudo_R2_values, fill=which_p_value)) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label=format(round(merged_pseudo_R2_values, 3), nsmall = 3)), vjust=1.6, color="white",
position = position_dodge(0.9), size=4.75)+
scale_fill_brewer(palette="Paired") + labs(x = "p threshold values", y = "Pseudo R^2 values", fill="PC correction")
pseudo_comparison_values <- c(0.189, 0.188, 0.171, 0.147, 0.073, 0.074, 0.055, 0.027)
pseudo_comparison_names <- c("with","with","with","with", "no", "no","no","no")
pseudo_comparison_table <- data.frame(pseudo_comparison_values, pseudo_comparison_names)
wilcox.test(pseudo_comparison_values ~ pseudo_comparison_names, data = pseudo_comparison_table)
##
## Wilcoxon rank sum test
##
## data: pseudo_comparison_values by pseudo_comparison_names
## W = 0, p-value = 0.02857
## alternative hypothesis: true location shift is not equal to 0
Used to calculate PRS sensitivity and specificity for IBD diagnosis
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## GLM (as above)
stat.PCs.reg_everything <- glm(
formula = status ~ exp_SCORE + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
family = binomial(link = "logit"), data = PRS_everything)
## Predict disease status from model
status.pred_everything <- predict(stat.PCs.reg_everything, newdata = PRS_everything, type = "response")
status.pred_everything <- ifelse(status.pred_everything > 0.20, 1, 0) ## IBD prediction >20%
pred.res <- data.frame(IID = PRS_everything$IID,
status = PRS_everything$status,
status_pred_everything = status.pred_everything)
## Compare results
table(PRS_everything$status, status.pred_everything)
## status.pred_everything
## 0 1
## 0 982 16
## 1 41 13
## Calculate AUC (area under the curve, ie. prediction accuracy)
par(mfrow=c(2,2))
roc.score_everything <- roc(PRS_everything$status, status.pred_everything)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.score_everything
##
## Call:
## roc.default(response = PRS_everything$status, predictor = status.pred_everything)
##
## Data: status.pred_everything in 998 controls (PRS_everything$status 0) < 54 cases (PRS_everything$status 1).
## Area under the curve: 0.6124
plot.roc(roc.score_everything, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "A", cex=1.5, font=2)
#p<0.05
status.pred_0.05 <- predict(stat.PCs.reg_0.05, newdata = PRS_0.05, type = "response")
status.pred_0.05 <- ifelse(status.pred_0.05 > 0.20, 1, 0)
roc.score_0.05 <- roc(PRS_0.05$status, status.pred_0.05)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(roc.score_0.05, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "B", cex=1.5, font=2)
#p < 1*10^(5)
status.pred_10_5 <- predict(stat.PCs.reg_10_5, newdata = PRS_10_5, type = "response")
status.pred_10_5 <- ifelse(status.pred_10_5 > 0.20, 1, 0)
roc.score_10_5 <- roc(PRS_10_5$status, status.pred_10_5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(roc.score_10_5, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "C", cex=1.5, font=2)
#p < 1*10^(8)
status.pred_10_8 <- predict(stat.PCs.reg_10_8, newdata = PRS_10_8, type = "response")
status.pred_10_8 <- ifelse(status.pred_10_8 > 0.20, 1, 0)
roc.score_10_8 <- roc(PRS_10_8$status, status.pred_10_8)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(roc.score_10_8, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
text(1.75, 0.9, "D", cex=1.5, font=2)
library(pROC)
## Calculate AUC (area under the curve, ie. prediction accuracy)
roc.score_everything_trial <- roc(PRS_everything$status, status.pred_everything)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.score_everything_trial
##
## Call:
## roc.default(response = PRS_everything$status, predictor = status.pred_everything)
##
## Data: status.pred_everything in 998 controls (PRS_everything$status 0) < 54 cases (PRS_everything$status 1).
## Area under the curve: 0.6124
## Calculate 1-specificity values
spec_values <- roc.score_everything$specificities
one_spec_values <- 1 - spec_values
## Plot ROC curve with 1-specificity on the x-axis
plot(one_spec_values, roc.score_everything$sensitivities,
col = "blue", type = "l", lwd = 2,
xlab = "1 - Specificity", ylab = "Sensitivity",
xlim = c(0, 1), ylim = c(0, 1))
text(0.1, 0.9, paste("AUC =", round(roc.score_everything$auc, 2)), cex=1.5, font=2)
AUC_values <- as.numeric(c("0.612", "0.593", "0.577", "0.560"))
p_values_factor <- c(1, 0.05, 1e-5, 1e-8)
AUC_table <- data.frame(AUC_values, p_values, p_values_factor)
AUC_graph <- ggplot(data=AUC_table, aes(x=reorder(p_values, -AUC_values), y=AUC_values)) +
geom_bar(stat="identity", fill="steelblue", width = 0.5) + labs(x = "p threshold values", y = "AUC Values") + geom_text(aes(label=AUC_values), vjust=1.6, color="white", size=3.5)
AUC_graph
one.WAY_AUC <- aov(AUC_values ~ p_values_factor, data = AUC_table)
summary(one.WAY_AUC)
## Df Sum Sq Mean Sq F value Pr(>F)
## p_values_factor 1 0.0009937 0.0009937 4.078 0.181
## Residuals 2 0.0004873 0.0002437
kruskal.test(AUC_values ~ p_values_factor, data = AUC_table)
##
## Kruskal-Wallis rank sum test
##
## data: AUC_values by p_values_factor
## Kruskal-Wallis chi-squared = 3, df = 3, p-value = 0.3916
setwd("/home/the28/imputed_data/scores/clinical_data/")
clinical_data <- read.csv("IBD_ClinicalMetadata_forTE.csv")
clinical_data <- clinical_data[-c(1)]
merged_clinical_initial <- merge(PRS_everything, clinical_data, by.x = "IID", by.y = "Patient_ID", all.x = TRUE, all.y = FALSE)
merged_clinical <- subset(merged_clinical_initial, grepl("TRIAD", merged_clinical_initial$IID))
merged_clinical <- merged_clinical %>% mutate(exp_SCORE = exp(SCORE))
merged_clinical_final <- subset(merged_clinical, !is.na(merged_clinical$Birth_Year))
##CD vs UC on the Score##
box_diagnosis <- ggplot(merged_clinical_final, aes(x=Diagnosis, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Crohn's Disease vs Ulcerative Colitis on Polygenic Risk Score")
box_diagnosis + stat_compare_means(method = "t.test")
#Gender on Score
box_gender <- ggplot(merged_clinical_final, aes(x=Gender, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Gender on Score")
box_gender + stat_compare_means(method = "t.test")
#CRP Levels affect on Score
merged_clinical_CRP <- subset(merged_clinical_final, !is.na(merged_clinical_final$CRP))
scatter_CRP <- ggplot(merged_clinical_CRP, aes(x=CRP, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "CRP") + geom_smooth(method=lm, linetype="dashed",
color="darkred", fill="blue")
scatter_CRP
## `geom_smooth()` using formula = 'y ~ x'
lm_CRP <- lm(exp_SCORE~CRP, data=merged_clinical_CRP)
summary(lm_CRP)
##
## Call:
## lm(formula = exp_SCORE ~ CRP, data = merged_clinical_CRP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.162e-04 -3.680e-05 -7.172e-06 4.296e-05 1.120e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 1.613e-05 62033.823 <2e-16 ***
## CRP 5.139e-07 2.605e-07 1.973 0.0618 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.067e-05 on 21 degrees of freedom
## Multiple R-squared: 0.1564, Adjusted R-squared: 0.1162
## F-statistic: 3.892 on 1 and 21 DF, p-value: 0.06182
cor.test(merged_clinical_CRP$CRP, merged_clinical_CRP$exp_SCORE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: merged_clinical_CRP$CRP and merged_clinical_CRP$exp_SCORE
## t = 1.9728, df = 21, p-value = 0.06182
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02004913 0.69443483
## sample estimates:
## cor
## 0.3954209
#Extent of Disease on Score
merged_clinical_extent <- subset(merged_clinical_final, !is.na(merged_clinical_final$Extent))
box_extent <- ggplot(merged_clinical_extent, aes(x=Extent, y=exp_SCORE)) + geom_boxplot() + ggtitle("Disease Extent on PRS") + labs(y="PRS", x="Disease Extent") + annotate("text", x = 'Extensive', y = 1.0003, label="p-value: 0.59")
res.aov <- aov(exp_SCORE ~ Extent, data = merged_clinical_extent)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Extent 3 1.046e-08 3.488e-09 0.655 0.589
## Residuals 21 1.119e-07 5.327e-09
## Extensive vs not extensive
merged_clinical_extensive <- ifelse(grepl("Extensive", merged_clinical_extent$Extent), "Extensive", "Not_Extensive")
extensive_or_not <- data.frame(merged_clinical_extensive, merged_clinical_extent$exp_SCORE)
binary_extensive <- ggplot(extensive_or_not, aes(x=merged_clinical_extensive, y=merged_clinical_extent$exp_SCORE)) + geom_boxplot() + ggtitle("Extensive vs not extensive on Score") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") + theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS", x = "Extensive vs Not Extensive") + stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 1.5, pch = 21, stroke = 0.7, fill = "white")
ggarrange(box_extent, binary_extensive, labels = c("A","B"))
#IBDhi vs IBDlo
merged_clinical_hilo <- subset(merged_clinical_final, !is.na(merged_clinical_final$PAX))
box_hilo <- ggplot(merged_clinical_hilo, aes(x=PAX, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of IBDhigh vs IBDlow on Score")
box_hilo + stat_compare_means(method = "t.test")
#Escalations vs. no escalations
effect_escalations <- data.frame(merged_clinical_final$IID, merged_clinical_final$Total_escalations, merged_clinical_final$exp_SCORE)
effect_escalations <- subset(effect_escalations, !is.na(effect_escalations$merged_clinical_final.Total_escalations))
effect_escalations_binary <- ifelse(grepl("0", effect_escalations$merged_clinical_final.Total_escalations), "No_Escalation", "Escalation")
box_binary_table <- data.frame(effect_escalations, effect_escalations_binary)
box_escalation_binary <- ggplot(box_binary_table, aes(x = effect_escalations_binary, y = merged_clinical_final.exp_SCORE)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Escalations vs No escalations", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS Score", x = "Escalations vs No Escalations") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
box_escalation_binary
#Total Escalations on Polygenic Risk Score
merged_clinical_escalations <- subset(merged_clinical_final, !is.na(merged_clinical_final$Total_escalations))
Total_Escalations <- as.character(merged_clinical_escalations$Total_escalations)
box_escalation <- ggplot(merged_clinical_escalations, aes(x=Total_Escalations, y=exp_SCORE)) + geom_boxplot()
box_escalation
#Initial Treatment Class on Polygenic Risk Score
merged_clinical_treatmentclass <- subset(merged_clinical_final, !is.na(merged_clinical_final$Initial_treatment_Class))
box_treatment_class <- ggplot(merged_clinical_treatmentclass, aes(x=Initial_treatment_Class, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Initial Treatment Class on Score")
box_treatment_class
#Escalation 1 on Score
trial <-subset(merged_clinical_final, c("Immunosuppressant", "Biologic", "Surgery") %in% merged_clinical_final$Escalation1_Treatment_Class)
trial2 <- subset(trial, !is.na(trial$Escalation1_Treatment_Class))
test <- trial2$IID == "TRIAD1481"
trial3 <- subset(trial2, !test)
test2 <- trial3$IID == "TRIAD1487"
merged_clinical_escalation_1 <- subset(trial3, !test2)
box_escalation1 <- ggplot(merged_clinical_escalation_1, aes(x=Escalation1_Treatment_Class, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of First Escalation on Score")
box_escalation1
setwd("/home/the28/imputed_data/scores/clinical_data/updated_clinical_data/")
updated_clinical_data <- read.csv("updated_clinical_data.csv")
PRS_everything <- read.table("PRS_everything_trial2_2.profile", header = T)
PRS_everything <- PRS_everything %>%
mutate(exp_SCORE = exp(SCORE),
status = as.factor(ifelse(grepl("TRIAD", IID), 1, 0)))
merged_update_clinical_initial <- merge(PRS_everything, updated_clinical_data, by.x = "IID", by.y = "Patient_ID", all.x = TRUE, all.y = FALSE)
merged_update_clinical <- subset(merged_update_clinical_initial, grepl("TRIAD", merged_update_clinical_initial$IID))
merged_update_clinical_final <- subset(merged_update_clinical, !is.na(merged_update_clinical$DOB))
updated_clinical_escalations <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Total_escalations))
escalataions_year <- updated_clinical_escalations$Total_escalations/(updated_clinical_escalations$FollowUp_Days/365)
escalations_df <- data.frame(updated_clinical_escalations$exp_SCORE, updated_clinical_escalations$Total_escalations, escalataions_year)
colnames(escalations_df) <- c("exp_SCORE","Escalations", "Escalation_rate")
escalation_binary <- ifelse(grepl("0", updated_clinical_escalations$Total_escalations), "No_Escalation", "Escalation")
escalation_binary_table <- data.frame(escalation_binary, updated_clinical_escalations$exp_SCORE)
box_escalation_updated_binary <- ggplot(escalation_binary_table, aes(x = escalation_binary, y = updated_clinical_escalations.exp_SCORE)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
scale_fill_manual(name = "Escalations vs No escalations", values = c("grey70","grey20")) +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS Score", x = "Escalations vs No Escalations") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
box_escalation_updated_binary
density_escalation_updated_binary <- ggplot(escalation_binary_table, aes(x = updated_clinical_escalations.exp_SCORE, group = escalation_binary)) +
geom_density(aes(fill = escalation_binary), alpha = 0.7) +
scale_fill_manual(name = "Escalation Status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density")
density_escalation_updated_binary
ggarrange(box_escalation_updated_binary, density_escalation_updated_binary, ncol = 1, nrow = 2, labels = c("A", "B"))
lm_escalation_rate <- lm(exp_SCORE~Escalation_rate, data=escalations_df)
summary(lm_escalation_rate)
##
## Call:
## lm(formula = exp_SCORE ~ Escalation_rate, data = escalations_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.445e-04 -4.736e-05 5.957e-06 6.520e-05 1.246e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 1.527e-05 65509.959 <2e-16 ***
## Escalation_rate 2.492e-05 2.181e-05 1.143 0.262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.42e-05 on 30 degrees of freedom
## Multiple R-squared: 0.04172, Adjusted R-squared: 0.009773
## F-statistic: 1.306 on 1 and 30 DF, p-value: 0.2622
#0.04172
update_scatter_escalation_rate <- ggplot(escalations_df, aes(x=Escalation_rate , y=exp_SCORE)) + geom_point() + annotate("text", x = 2.1, y = 1.00032, label="p-value = 0.26") + annotate("text", x = 2.1, y = 1.0003, label="R^2 = 0.04172") + labs(y = "PRS", x = "Escalation rate (Escalation per year of Follow Up)") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue")
update_scatter_escalation_rate
## `geom_smooth()` using formula = 'y ~ x'
## Effect of Surgeries
updated_surgeries <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Total_IBD_surgeries))
box_surgeries <- ggplot(updated_surgeries, aes(x = as.character(Total_IBD_surgeries), y = exp_SCORE)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS Score", x = "Surgery vs No Surgery") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
box_surgeries
density_surgeries <- ggplot(updated_surgeries, aes(x = exp_SCORE, group = as.character(Total_IBD_surgeries))) +
geom_density(aes(fill = as.character(Total_IBD_surgeries)), alpha = 0.7) +
scale_fill_manual(name = "Surgery Status", values = c("grey70","grey20")) +
theme_bw() +
labs(x = "IBD PRS", y = "Density")
density_surgeries
ggarrange(box_surgeries, density_surgeries, ncol = 1, nrow = 2, labels = c("A", "B"))
setwd("/home/the28/imputed_data/scores/clinical_data/updated_clinical_data/")
updated_clinical_extent <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Extent))
box_update_extent <- ggplot(updated_clinical_extent, aes(x=Extent, y=exp_SCORE)) + geom_boxplot() + ggtitle("Effect of Disease Extent on Score") + labs(y = "PRS", x = "Disease Extent")
box_update_extent
anova_extent_update <- aov(exp_SCORE ~ Extent, data = updated_clinical_extent)
summary(anova_extent_update)
## Df Sum Sq Mean Sq F value Pr(>F)
## Extent 4 1.132e-08 2.831e-09 0.427 0.788
## Residuals 24 1.592e-07 6.635e-09
# Extensive vs not extensive
updated_extensive_binary <- ifelse(grepl("Extensive", updated_clinical_extent$Extent), "Extensive", "Not_Extensive")
extensive_or_not_update <- data.frame(updated_extensive_binary, updated_clinical_extent$exp_SCORE)
ggplot(extensive_or_not_update, aes(x=updated_extensive_binary, y=updated_clinical_extent$exp_SCORE)) + geom_boxplot() + ggtitle("Extensive vs not extensive on Score") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format") + theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS", x = "Extensive vs Not Extensive") + stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 1.5, pch = 21, stroke = 0.7, fill = "white")
#Endoscopy effect
updated_endoscopy <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Endoscopy))
updated_endoscopy$Endoscopy[updated_endoscopy$Endoscopy == "Severe (CT)"] <- "Severe"
anova_endoscopy <- aov(exp_SCORE ~ Endoscopy, data = updated_endoscopy)
summary(anova_endoscopy)
## Df Sum Sq Mean Sq F value Pr(>F)
## Endoscopy 2 6.710e-09 3.355e-09 0.588 0.563
## Residuals 24 1.369e-07 5.704e-09
box_update_endoscopy <- ggplot(updated_endoscopy, aes(x=Endoscopy, y=exp_SCORE)) + geom_boxplot() + labs(y = "PRS", x = "Endoscopy Findings") + annotate("text", x = "Mild", y = 1.0003, label="p-value = 0.56")
box_update_endoscopy
updated_endoscopy_moderate <- subset(updated_endoscopy, !grepl("Mild", updated_endoscopy$Endoscopy))
boxplot_endoscopy_moderate <- ggplot(updated_endoscopy_moderate, aes(x = Endoscopy, y = exp_SCORE)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "PRS Score", x = "Endoscopy Findings") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
boxplot_endoscopy_moderate
ggarrange(box_update_endoscopy, boxplot_endoscopy_moderate, nrow = 2, ncol = 1, labels = c("A","B"))
updated_CRP <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$CRP))
lm_CRP <- lm(exp_SCORE~CRP, data=updated_CRP)
summary(lm_CRP)
##
## Call:
## lm(formula = exp_SCORE ~ CRP, data = updated_CRP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.190e-04 -4.070e-05 1.139e-05 5.128e-05 1.095e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 1.532e-05 65298.762 <2e-16 ***
## CRP 5.197e-07 2.624e-07 1.981 0.0592 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.284e-05 on 24 degrees of freedom
## Multiple R-squared: 0.1405, Adjusted R-squared: 0.1047
## F-statistic: 3.923 on 1 and 24 DF, p-value: 0.0592
cor.test(updated_CRP$CRP, updated_CRP$exp_SCORE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: updated_CRP$CRP and updated_CRP$exp_SCORE
## t = 1.9807, df = 24, p-value = 0.0592
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01465053 0.66554903
## sample estimates:
## cor
## 0.3748285
scatter_CRP <- ggplot(updated_CRP, aes(x=CRP, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "CRP") + annotate("text", x = 150, y = 1.000365, label="p-value = 0.059") + annotate("text", x = 150, y = 1.000345, label="R^2 = 0.141") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") + ggtitle("CRP on PRS")
scatter_CRP
## `geom_smooth()` using formula = 'y ~ x'
updated_Hb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Hb))
lm_Hb <- lm(exp_SCORE~Hb, data=updated_Hb)
summary(lm_Hb)
##
## Call:
## lm(formula = exp_SCORE ~ Hb, data = updated_Hb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.328e-04 -6.111e-05 2.057e-05 5.084e-05 1.118e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.001e+00 1.026e-04 9749.773 <2e-16 ***
## Hb -8.698e-06 8.030e-06 -1.083 0.289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.214e-05 on 25 degrees of freedom
## Multiple R-squared: 0.04483, Adjusted R-squared: 0.006628
## F-statistic: 1.173 on 1 and 25 DF, p-value: 0.289
cor.test(updated_Hb$Hb, updated_Hb$exp_SCORE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: updated_Hb$Hb and updated_Hb$exp_SCORE
## t = -1.0833, df = 25, p-value = 0.289
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5476856 0.1829979
## sample estimates:
## cor
## -0.2117409
scatter_Hb <- ggplot(updated_Hb, aes(x=Hb, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "Hb") + annotate("text", x = 9, y = 1.00033, label="p-value = 0.29") + annotate("text", x = 9, y = 1.00031, label="R^2 = 0.0448") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") + ggtitle("Haemoglobin on PRS")
updated_Alb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Alb))
lm_Alb <- lm(exp_SCORE~Alb, data=updated_Alb)
summary(lm_Alb)
##
## Call:
## lm(formula = exp_SCORE ~ Alb, data = updated_Alb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.236e-04 -6.387e-05 2.515e-05 5.195e-05 1.178e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.001e+00 9.373e-05 10674.475 <2e-16 ***
## Alb -3.293e-06 2.667e-06 -1.234 0.229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.272e-05 on 24 degrees of freedom
## Multiple R-squared: 0.0597, Adjusted R-squared: 0.02052
## F-statistic: 1.524 on 1 and 24 DF, p-value: 0.229
cor.test(updated_Alb$Alb, updated_Alb$exp_SCORE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: updated_Alb$Alb and updated_Alb$exp_SCORE
## t = -1.2344, df = 24, p-value = 0.229
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5770765 0.1579594
## sample estimates:
## cor
## -0.2443429
scatter_Alb <- ggplot(updated_Alb, aes(x=Alb, y=exp_SCORE)) + geom_point() + labs(y = "PRS", x = "Alb") + annotate("text", x=25, y=1.00033, label="p-value = 0.23") + annotate("text", x=25, y=1.00031, label="R^2 = 0.0597") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") + ggtitle("Albumin on PRS")
ggarrange(scatter_Hb, scatter_Alb, labels = c("A","B"))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
status.pred_everything_trial <- ifelse(status.pred_everything > 0.3, 1, 0) ## IBD prediction >30%
pred.res_trial <- data.frame(IID = PRS_everything$IID,
status_pred_everything = status.pred_everything_trial)
pred.res_merge <- merge(pred.res_trial, merged_update_clinical_final, by = "IID")
pred.res_merge$prediction <- ifelse(pred.res_merge$status_pred_everything == 1, "predicted", "not_predicted")
#Comparison of CRP levels of predicted vs not predicted
merged_clinical_predicted_CRP <- subset(pred.res_merge, !is.na(pred.res_merge$CRP))
ggplot(merged_clinical_predicted_CRP, aes(x = prediction, y = CRP)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "CRP", x = "IBD Prediction") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
#Comparison of total number of escalation of predicted vs not predicted
ggplot(pred.res_merge, aes(x = prediction, y = Total_IBD_surgeries)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
stat_boxplot(geom = "errorbar") +
geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") +
theme_bw() +
theme(legend.position = "none") +
labs(y = "Total Surgeries", x = "IBD Prediction") +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
#pred_escalations_box <- ggplot(pred_escalations_table, aes(y = Total_Escalations, x = Predicted_IBD)) + geom_boxplot(aes(fill = Predicted_IBD), alpha = 0.7, outlier.shape = NA) + stat_boxplot(geom = "errorbar") + geom_jitter(width = 0.2, size = 2, pch = 21, stroke = 0.7, fill = "white") + scale_fill_manual(values = c("grey70","grey20")) + theme_bw() + theme(legend.position = "none") + labs(y = "Total Number of Escalations", x = "Patients predicted with IBD in ROC curve") +
pred_escalation_binary <- ifelse(grepl("0", pred.res_merge$Total_escalations), "No_Escalation", "Escalation")
pred_escalation_binary_table <- table(pred.res_merge$prediction, pred_escalation_binary) ; pred_escalation_binary_table
## pred_escalation_binary
## Escalation No_Escalation
## not_predicted 11 11
## predicted 6 4
#mcnemar.test(pred_escalation_binary_table)
#p-value = 0.1
pred_escalation_binary_bar_table <- data.frame(as.numeric(c("7","4","11","5")), c("Escalation","Escalation","No_Escalation","No_Escalation"), c("not_predicted",'predicted',"not_predicted",'predicted'))
colnames(pred_escalation_binary_bar_table) <- c("Freq","Escalation","Prediction")
pred_escalation_binary_bar_box <- ggplot(pred_escalation_binary_bar_table, aes(x=Prediction, y=Freq, fill=Escalation)) +
geom_bar(stat="identity", position=position_dodge()) + scale_fill_brewer(palette="Paired") + labs(x = "Prediction of IBD from ROC Curve", y = "Frequency") + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.format")
pred_escalation_binary_bar_box
library(pROC)
## GLM (as above)
IBD_prognostic_glm <- glm(
formula = escalation_binary ~ updated_clinical_escalations.exp_SCORE,
family = binomial(link = "logit"), data = escalation_binary_table)
summary(IBD_prognostic_glm)
##
## Call:
## glm(formula = escalation_binary ~ updated_clinical_escalations.exp_SCORE,
## family = binomial(link = "logit"), data = escalation_binary_table)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6109 -0.8583 -0.6101 1.0177 1.9251
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13373 5948 2.248 0.0246 *
## updated_clinical_escalations.exp_SCORE -13367 5946 -2.248 0.0246 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.236 on 31 degrees of freedom
## Residual deviance: 38.017 on 30 degrees of freedom
## AIC: 42.017
##
## Number of Fisher Scoring iterations: 4
coef(summary(IBD_prognostic_glm))[,'Pr(>|z|)'] #p=0.0246
## (Intercept) updated_clinical_escalations.exp_SCORE
## 0.02456445 0.02456309
## Predict disease status from model
IBD_prognostic_pred <- predict(IBD_prognostic_glm, newdata = escalation_binary_table, type = "response")
status.pred_IBD_prognostic <- ifelse(IBD_prognostic_pred > 0.3, 1, 0) ## Escalation > 30%
pred.res_prognostic <- data.frame(status = escalation_binary_table$escalation_binary, status.pred_IBD_prognostic = status.pred_IBD_prognostic)
## Compare results
table(escalation_binary_table$escalation_binary, status.pred_IBD_prognostic)
## status.pred_IBD_prognostic
## 0 1
## Escalation 9 8
## No_Escalation 2 13
roc.score_prognostic <- roc(escalation_binary_table$escalation_binary, status.pred_IBD_prognostic)
## Setting levels: control = Escalation, case = No_Escalation
## Setting direction: controls < cases
plot.roc(roc.score_prognostic, col = "blue", print.auc = TRUE, legacy.axes = TRUE)
summary(lm_Alb)
##
## Call:
## lm(formula = exp_SCORE ~ Alb, data = updated_Alb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.236e-04 -6.387e-05 2.515e-05 5.195e-05 1.178e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.001e+00 9.373e-05 10674.475 <2e-16 ***
## Alb -3.293e-06 2.667e-06 -1.234 0.229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.272e-05 on 24 degrees of freedom
## Multiple R-squared: 0.0597, Adjusted R-squared: 0.02052
## F-statistic: 1.524 on 1 and 24 DF, p-value: 0.229
cor.test(updated_Alb$Alb, updated_Alb$exp_SCORE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: updated_Alb$Alb and updated_Alb$exp_SCORE
## t = -1.2344, df = 24, p-value = 0.229
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5770765 0.1579594
## sample estimates:
## cor
## -0.2443429
updated_Alb <- subset(merged_update_clinical_final, !is.na(merged_update_clinical_final$Alb))
scatter_Alb <- ggplot(updated_Alb, aes(x=Alb, y=exp_SCORE)) + geom_point() + labs(y = "Alb", x = "Hb") + annotate("text", x=42, y=1.00033, label="p-value: 0.23") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue")
scatter_Alb
## `geom_smooth()` using formula = 'y ~ x'