PGS computation

Use the 142 SNPs effect size data and reverse the effect for SNCA and TRIP12

snps_af_142 <- read_xlsx("PGS SNCA TRIP12 adjustment 26_4_22.xlsx", sheet = 1, range = "A1:N143") %>%
  dplyr::select("snp", "gene", "log2_afc") %>% 
  dplyr::rename(rsID=snp) %>% 
  mutate(log2_afc = ifelse(gene %in% c("SNCA", "TRIP12"), -log2_afc, log2_afc)) # reverse the effect for the 2 genes

genotypes <- read_tsv(file = "/Users/rsultana/work/amppd/vcfs/amppd_149SNPs_genotypes.txt") %>%
  mutate(
    rsID = if_else(rsID == ".", "rs28691231", rsID),
    GT = gsub("|", "/", GT, fixed = T) 
  ) %>% 
  mutate(GENOTYPE = if_else((GT == "0/0") |
                      (rsID == "rs112957100" & GT == "0/1") |
                      (rsID == "rs75214905" & GT == "0/2"), 
                      0, # all homozygous ref alleles and the the hets that have not been designed against contribute 0 to the score
                      if_else(GT == '1/1', 
                              2, # the homozygous alt alleles contribute 2
                              1) # the heterozygous alt alleles contribute 1
                     )
  )
  
scores_all_cohorts <- genotypes %>%
  inner_join(snps_af_142, by = "rsID") %>%
  filter(!is.na(GENOTYPE)) %>%
  mutate(term = GENOTYPE * log2_afc) %>%
  drop_na(term) %>%
  group_by(participant_id) %>%
  summarize(score = sum(term))

Read metadata from AMP-PD/MGRB joint call

amppd_metadata <- read_tsv("/Users/rsultana/work/AMP-PD_MGRB_joint_call/amp-pd-nogenetic_for_MGRBJoint_calling.txt")
amppd_metadata %>% mutate(prefix=substr(PID_MRGB, 1, 2)) %>% count(prefix)

amppd_metadata %>% filter(is.na(Dataset)) %>% mutate(prefix=substr(PID_MRGB, 1, 2)) %>% count(prefix)

amppd_metadata <- amppd_metadata %>% 
  rename(participant_id = PID_MRGB, phenotype = Phenotype) %>%
  mutate(phenotype = if_else(!is.na(phenotype), phenotype, "Other")) %>%
  mutate(prefix = substr(participant_id, 1, 2)) %>% 
  mutate(Dataset = case_when(!is.na(Dataset) ~ Dataset, 
                             prefix=="HB" ~ "HBS", 
                             prefix=="SU" ~ "SURE-PD3", 
                             prefix=="SY" ~ "STEADY-PD3")
         ) %>% 
  select(-prefix)
amppd_metadata %>% count(Dataset)

BioFind

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter(Dataset == "BioFind")

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(2)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("BioFind: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

HBS

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter((Dataset == 'HBS') |(!is.na(PID_HBS)))

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(3)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("HBS: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

PDBP

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter((Dataset == 'PDBP'))

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(2)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("PDBP: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

Q-Q plots for Cases and Controls, showing normal distributions

ggplot(scores1, aes(sample = score, color = phenotype)) +
  facet_grid(cols = vars(phenotype)) + 
  stat_qq(show.legend = FALSE) + 
  geom_qq_line(color="black") +
  theme_bw()

density plots for cases and controls with gaussian curves with same mean and std

ggplot(scores1, aes(x = score, color = phenotype)) +
  facet_grid(cols = vars(phenotype)) + 
  geom_density(show.legend = FALSE) +
  stat_function(
    fun = dnorm, 
    args = with(scores1, c(mean = mean(score), sd = sd(score))), 
    color = "black"
  ) +
  scale_x_continuous("polygenic score")

PPMI

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter(Dataset == 'PPMI')

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(2)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("PPMI: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

STEADY-PD3

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter((Dataset == 'STEADY-PD3') |(!is.na(PID_STEADYPD3)))

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(3)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("STEADY-PD3: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

SURE-PD3

scores1 <- scores_all_cohorts %>% 
  inner_join(amppd_metadata, by = "participant_id") %>% 
  filter((Dataset == 'SURE-PD3') |(!is.na(PID_SUREPD3)))

scores1 %>% group_by(phenotype) %>% summarize(median_score = median(score))

n <- dim(scores1)[1]

mycols <- pal_nejm("default", alpha = 1)(2)
my_grob_test <- grobTree(textGrob("test = wilcox.test",
  x = 0.8,
  y = 0.95, hjust = 0, gp = gpar(col = "black", fontsize = 9, fontface = "italic")
))
ggplot(scores1, aes(x = phenotype, y = score, fill = phenotype)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_jitter(color = "black", width = 0.2, size = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_signif(comparisons = list(c("Control", "Case")), map_signif_level = TRUE, test = wilcox.test) +
  annotation_custom(my_grob_test) +
  ggtitle("SURE-PD3: Predicted PGS for GBA expression from 142 SNPs",
    subtitle = paste("(", n , "participants)")
  )

Calculate PGS for the 48 samples in APM-CT1

genotypes <- read_tsv("48samples_149SNPs_genotypes_by_rsID.txt") %>%
  mutate(
    rsID = if_else(rsID == ".", "rs28691231", rsID),
    GT = gsub("|", "/", GT, fixed = T)
  ) %>%
  mutate(GENOTYPE = if_else((GT == "0/0" |
    (rsID == "rs112957100" & GT == "0/1") |
    (rsID == "rs75214905" & GT == "0/2")),
  0,
  if_else(GT == "1/1",
    2,
    1
  )
  ))

dict <- read_tsv("48samples_dict.tsv")
reversed_effect_snps <- snps_af_142 %>% filter(gene %in% c("SNCA", "TRIP12")) %>% select(rsID) %>% unlist()
snps_af_96 <- read_tsv(file = "~/work/parkons_disease_a-cooper/100_SNPs/R/96Snps.log2Afc.tsv") %>% 
  rename(rsID = snp_id)

scores1 <- genotypes %>%
  inner_join(snps_af_96, by = "rsID") %>%
  filter(!is.na(GENOTYPE)) %>%
  mutate(term = GENOTYPE * log2_afc) %>%
  drop_na(term) %>%
  group_by(sample) %>%
  summarize(score = sum(term)) %>%
  inner_join(dict, by = "sample")


scores2 <- genotypes %>%
  inner_join(snps_af_142, by = "rsID") %>%
  filter(!is.na(GENOTYPE)) %>%
  mutate(term = GENOTYPE * log2_afc) %>%
  drop_na(term) %>%
  group_by(sample) %>%
  summarize(score = sum(term)) %>%
  inner_join(dict, by = "sample")

all_scores <- (scores1 %>% mutate(dataset = "96SNPs")) %>%
  rbind(scores2 %>% mutate(dataset = "142SNPs"))

mycols <- pal_nejm("default", alpha = 1)(2)
ggplot(all_scores, aes(dataset, score, fill = dataset)) +
  geom_boxplot(fill = mycols, alpha = 0.3) +
  geom_label(aes(label = letter), position = "jitter", alpha = 0.75) +
  #  geom_jitter(color="black", width = 0.2, size=0.7, alpha=0.9, show.legend = FALSE) +
  ggtitle("Predicted PGS for GBA expression from 142 SNPs",
    subtitle = "Scores for weighted additive model with 142 and 96 markers"
  )

Comparing PGS scores for 48 samples on the two SNP datasets

Table

scores12 <- scores1 %>% 
  inner_join(scores2, by=c("sample", "letter")) %>% 
  rename(score96 = score.x, score142 = score.y) %>%
  relocate("sample", "letter") 
write_csv(scores12, file="score96_vs_score142.csv")
scores12

Download this table: score96_vs_score142.csv

Scatter plot

gg <- scores12 %>% 
  ggplot( aes(score96, score142, text=letter, key=sample, shape=as.factor(3))) + 
  xlim(-20, 20) + 
  ylim(-20, 20) + 
  geom_point(colour=mycols[1], alpha=0.5, size=4) +
  theme(legend.position='none') 
ggplotly(gg, tooltip=c('sample', 'letter', 'score96', 'score142'))

PGS computation

Razvan Sultana

2022-05-04

Use the 142 SNPs effect size data and reverse the effect for SNCA and TRIP12

Read metadata from AMP-PD/MGRB joint call

BioFind

HBS

PDBP

PPMI

STEADY-PD3

SURE-PD3

Calculate PGS for the 48 samples in APM-CT1

Comparing PGS scores for 48 samples on the two SNP datasets

Table

Scatter plot