library(dplyr)
library(ggplot2)
library(glue)
To compute the PCA loadings, a similar approach as detailed in the UKBB genotyping QC report guide was used. It’s reproduced here for convenience:
plink and its indep-pairwise function with a moving window of size 1000 bp).C/G and A/T SNPs to avoid unresolvable strand mismatches.[1] A.L. Price et al. Long-range LD can confound genome scans in admixed populations. The American Journal of Human Genetics, 83(1):132-135, 2008.
We also computed the PCA loadings specifically on the individuals self-reported as British, to capture population structure only on this subset.
data_dir <- "~/data/PhD/UKBB/"
GWAS_DIR <- "/home/rodrigo/GWAS/output/coma"
gwas_paths <- list(
"qqplot" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__{as.character(input$z)}__{input$suffix}__QQ-plot.png"),
"manhattan" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__{as.character(input$z)}__{input$suffix}__manhattan.png"),
"pooled_qqplot" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__all__QQ-plot.png")
)
Only British individuals were kept, based on self-reported ethnicity (data field 21000, value 1001).
gbr_ids_file <- glue("{data_dir}/subject_ids/british_ids.txt")
cmr_gbr_ids_file <- glue("{data_dir}/subject_ids/cmr_british_ids.txt")
bfile_pattern <- "~/GWAS/data/genotypes/calls/ukb_cal_chr{chromosome}_v2_31803_indiv"
bim_file_pattern <- paste0(bfile_pattern, ".bim")
extract_variants_in_range <- function(chromosome, start, end) {
bim_file <- glue(bim_file_pattern)
snp_df <- read.table(bim_file, header=FALSE)
colnames(snp_df) <- c("chromosome", "rsid", "pos_cm", "position", "allele_1", "allele_2")
snp_df <- snp_df %>% filter(position > start & position < end)
snp_df$rsid
}
extract_variants_ambiguous_strand <- function(chromosome) {
bim_file <- glue(bim_file_pattern)
snp_df <- read.table(bim_file, header=FALSE)
colnames(snp_df) <- c("chromosome", "rsid", "pos_cm", "position", "allele_1", "allele_2")
snp_df <- snp_df %>% filter(allele_1 == "C" & allele_2 == "G" | allele_1 == "G" & allele_2 == "C" | allele_1 == "A" & allele_2 == "T" | allele_1 == "T" & allele_2 == "A")
as.character(snp_df$rsid)
}
long_range_ld_df <- read.table("~/GWAS/data/long_range_LD_regions.bed", sep = "\t", header=TRUE)
long_range_ld.lst <- long_range_ld_df %>% select(1:3) %>% split(., seq(nrow(.))) # dataframe to list by rows
snps_in_ld <- lapply(long_range_ld.lst, function(x) {x <- as.integer(x); extract_variants_in_range(x[1], x[2], x[3])})
snps_in_ld <- unlist(snps_in_ld)
ambiguous_strand_snps <- lapply(1:22, extract_variants_ambiguous_strand)
ambiguous_strand_snps <- unlist(ambiguous_strand_snps)
out_bfile_pattern <- "~/GWAS/data/genotypes/calls/ukb_cal_chr{chromosome}_v2_GBR_indiv"
for(chromosome in 1:22) {
bfile <- glue(bfile_pattern)
out_bfile <- glue(out_bfile_pattern)
plink_command_maf <- glue("plink --keep {cmr_gbr_ids_file} --freq --bfile {bfile} --out {out_bfile}")
system(plink_command_maf)
plink_command_missing <- glue("plink --keep {cmr_gbr_ids_file} --missing --bfile {bfile} --out {out_bfile}")
system(plink_command_missing)
plink_command_hwe <- glue("plink --keep {cmr_gbr_ids_file} --hardy --bfile {bfile} --out {out_bfile}")
system(plink_command_hwe)
}
get_snps_below_maf_thr <- function(chromosome, maf_threshold=0.025) {
maf_file_pattern <- paste0(out_bfile_pattern, ".frq")
maf_file <- glue(maf_file_pattern)
maf_df <- read.table(maf_file, header=TRUE)
maf_df <- maf_df %>% filter(MAF < maf_threshold)
as.character(maf_df$SNP)
}
get_snps_above_miss_thr <- function(chromosome, missingness_threshold=0.015) {
missingness_file_pattern <- paste0(out_bfile_pattern, ".lmiss")
missingness_file <- glue(missingness_file_pattern)
missingness_df <- read.table(missingness_file, header=TRUE)
missingness_df <- missingness_df %>% filter(F_MISS > missingness_threshold)
as.character(missingness_df$SNP)
}
get_snps_below_hwe_p_thr <- function(chromosome, hwe_p_threshold=1e-5) {
hwe_file_pattern <- paste0(out_bfile_pattern, ".hwe")
hwe_file <- glue(hwe_file_pattern)
hwe_df <- read.table(hwe_file, header=TRUE)
hwe_df <- hwe_df %>% filter(P < hwe_p_threshold)
as.character(hwe_df$SNP)
}
snps_below_maf_thr <- unlist(lapply(1:22, get_snps_below_maf_thr))
snps_above_miss_thr <- unlist(lapply(1:22, get_snps_above_miss_thr))
snps_below_hwe_p_thr <- unlist(lapply(1:22, get_snps_below_hwe_p_thr))
60267 with C/G or A/T allele pairs.
270481 with MAF < 0.025
114746 with missingness > 0.015
17629 with HWE p-value < \(10^{-5}\).
snps_to_exclude <- unique(
c(ambiguous_strand_snps,
snps_below_maf_thr,
snps_above_miss_thr,
snps_below_hwe_p_thr)
)
368547 SNPs to exclude given the above criterion.
snps_exclude_df <- as.data.frame(snps_to_exclude)
snps_to_exclude_file <- "~/GWAS/data/genotypes/calls/exclude_snps_for_pca.txt"
write.csv(snps_exclude_df, snps_to_exclude_file, quote = FALSE, row.names = FALSE)
for(chromosome in 1:22) {
bfile <- glue(bfile_pattern)
plink_ld_prune_command <- "plink --exclude {snps_to_exclude_file} --indep-pairwise 100 10 0.1 -bfile {bfile} --out {bfile}"
plink_ld_prune_command <- glue(plink_ld_prune_command)
system(plink_ld_prune_command)
}
prunein_file_pattern <- paste0(bfile_pattern, ".prune.in")
prunein_snps <- unlist(lapply(1:22, function(chromosome) read.table(glue(prunein_file_pattern))[,1]))
prunein_snps_df <- as.data.frame(prunein_snps)
prunein_snps_file <- "~/GWAS/data/genotypes/calls/snps_for_pca_prune.in"
write.csv(prunein_snps_df, prunein_snps_file, quote = FALSE, row.names = FALSE)
106396 SNPs after LD pruning.
files_to_merge <- "~/GWAS/data/genotypes/calls/PCA_on_GBR/files_to_merge.txt"
if (!file.exists(files_to_merge)) {
for(chromosome in 1:22) {
system(glue("echo ~/GWAS/data/genotypes/calls/PCA_on_GBR/ukb_cal_chr{chromosome}_v2_29102_GBR_indiv_snps_for_PCA >> {files_to_merge}"))
}
}
geno_file_for_pca_chr1 <- "~/GWAS/data/genotypes/calls/PCA_on_GBR/ukb_cal_chr1_v2_29102_GBR_indiv_snps_for_PCA"
geno_file_for_pca <- "~/GWAS/data/genotypes/calls/PCA_on_GBR/ukb_cal_all_chrs_v2_29102_GBR_indiv_snps_for_PCA"
if (!file.exists(geno_file_for_pca)) {
system(paste("plink",
"--bfile", geno_file_for_pca_chr1,
"--merge-list", files_to_merge,
"--make-bed", "--out", geno_file_for_pca))
}
extract_id <- function(x) strsplit(x, ":")[[1]][1]
bfile_filtered <- "/home/rodrigo/GWAS/data/genotypes/calls/PCA_on_GBR/ukb_cal_all_chrs_v2_29102_GBR_indiv_snps_for_PCA"
genomic_pca_file <- "~/data/PhD/UKBB/genomicPCs_unrelated_GBR.tsv"
if (!file.exists(genomic_pca_file)) {
genomic_pca <- flashpcaR::flashpca(bfile_filtered, ndim=10)
pca_proj <- genomic_pca$projection
pca_proj_df <- as.data.frame(pca_proj)
colnames(pca_proj_df) <- paste0("PC", 1:ncol(pca_proj_df))
pca_proj_df$ID <- rownames(pca_proj_df)
pca_proj_df <- cbind(pca_proj_df %>% select(ID), pca_proj_df %>% select(-ID)) # reorder columns
write.table(pca_proj_df, file=genomic_pca_file , quote=FALSE, row.names=FALSE, sep = "\t")
} else {
pca_proj_df <- read.table(genomic_pca_file, header=TRUE, sep = "\t", stringsAsFactors = FALSE)
}
pca_proj_df$ID <- unname(sapply(pca_proj_df$ID, extract_id))
rownames(pca_proj_df) <- pca_proj_df$ID
change_z_sign <- function(z_df, sign) {
for (z in names(sign)) {
sgn <- sign[[z]]
if (sgn == "-")
z_df[,z] <- -1 * z_df[,z]
}
z_df
}
fetch_latent_repr <- function(run_id, mapping=NULL, sign=NULL) {
root_dir <- "/home/rodrigo/pytorch_coma/"
z_df <- read.csv(glue::glue(file.path(root_dir, "output/{run_id}/latent_space.csv")))
rownames(z_df) <- z_df$ID
if(!is.null(mapping)) {
z_df <- data.table::setnames(z_df, old=unname(unlist(mapping)), new=names(mapping))
z_df <- z_df[,names(mapping)]
}
if(!is.null(sign))
z_df <- change_z_sign(z_df, sign)
z_df
}
run_params <- yaml::read_yaml("/home/rodrigo/pytorch_coma/analysis/paper_z_mapping_parameters.yaml")
experiments <- c("2020-09-11_02-13-41", "2020-09-30_12-36-48")
N_PC <- 10
for(run_id in experiments) {
mapping <- run_params[[run_id]]$mapping
sign <- run_params[[run_id]]$sign
label <- run_params[[run_id]]$label
z_df <- fetch_latent_repr(run_id, mapping, sign)
pca_proj_ <- pca_proj_df[intersect(rownames(z_df), rownames(pca_proj_df)),] %>% select(-ID)
z_df <- z_df[rownames(pca_proj_),]
corr_mat <- pval_mat <- matrix(nrow=N_PC, ncol=8)
colnames(corr_mat) <- colnames(pval_mat) <- paste0("z", 1:8)
rownames(corr_mat) <- rownames(pval_mat) <- colnames(pca_proj_)[1:N_PC]
for (zcol in paste0("z", 1:8)) {
for (pc in colnames(pca_proj_)[1:N_PC]) {
corr <- cor.test(pca_proj_[,pc], z_df[,zcol], method = "spearman")
corr_mat[pc, zcol] <- corr$estimate
pval_mat[pc, zcol] <- corr$p.value
}
}
# corrmat <- cor(pca_proj, z_df %>% select(starts_with("z")), method="spearman", use="complete.obs")
print(knitr::kable(corr_mat))
cat("\n")
#print(knitr::kable(pval_mat))
colnames(corr_mat) <- label
colnames(pval_mat) <- label
#corrplot::corrplot(corr_mat)
# corrplot::corrplot(pval_mat, p.mat = TRUE)
corrplot::corrplot(-log10(pval_mat), is.corr = FALSE)
}
| z1 | z2 | z3 | z4 | z5 | z6 | z7 | z8 | |
|---|---|---|---|---|---|---|---|---|
| PC1 | -0.0060511 | -0.0093328 | -0.0100627 | -0.0049685 | 0.0111006 | -0.0065770 | 0.0055051 | -0.0028247 |
| PC2 | 0.0143461 | -0.0132489 | 0.0113112 | 0.0004961 | 0.0070112 | -0.0079429 | 0.0000574 | -0.0027432 |
| PC3 | -0.0056838 | -0.0032027 | -0.0042216 | 0.0051550 | 0.0024892 | -0.0019109 | -0.0066021 | -0.0134289 |
| PC4 | -0.0033674 | 0.0093081 | -0.0021616 | -0.0030504 | 0.0143604 | -0.0037097 | 0.0165878 | -0.0022658 |
| PC5 | -0.0126088 | -0.0130336 | 0.0031143 | -0.0018043 | 0.0028480 | 0.0008641 | -0.0018994 | -0.0050211 |
| PC6 | -0.0278630 | -0.0058944 | -0.0255264 | -0.0083842 | -0.0009339 | 0.0119753 | -0.0002249 | -0.0104760 |
| PC7 | -0.0036423 | 0.0037955 | 0.0128557 | 0.0018069 | -0.0024859 | -0.0002936 | 0.0008076 | 0.0149532 |
| PC8 | 0.0128945 | -0.0025641 | 0.0019409 | 0.0010164 | 0.0048679 | -0.0035015 | 0.0048825 | 0.0030741 |
| PC9 | -0.0029489 | -0.0003111 | -0.0086427 | 0.0055054 | 0.0109544 | 0.0035906 | 0.0062229 | 0.0100403 |
| PC10 | 0.0126557 | 0.0015657 | -0.0066312 | 0.0060868 | -0.0109851 | 0.0043145 | -0.0055423 | 0.0002915 |
| z1 | z2 | z3 | z4 | z5 | z6 | z7 | z8 | |
|---|---|---|---|---|---|---|---|---|
| PC1 | 0.0210821 | -0.0188998 | -0.0002352 | -0.0112263 | -0.0091167 | -0.0028445 | -0.0030632 | 0.0012038 |
| PC2 | 0.0147659 | -0.0097434 | -0.0130406 | -0.0159214 | 0.0052375 | 0.0078096 | 0.0025153 | -0.0008332 |
| PC3 | 0.0080426 | -0.0097472 | 0.0061558 | -0.0049385 | 0.0089634 | 0.0038677 | -0.0043481 | 0.0078931 |
| PC4 | 0.0117267 | -0.0065401 | 0.0093961 | 0.0038609 | -0.0056222 | -0.0057883 | -0.0064735 | -0.0131098 |
| PC5 | 0.0112790 | -0.0211647 | -0.0036463 | 0.0001584 | -0.0037373 | 0.0038666 | -0.0016846 | 0.0068063 |
| PC6 | 0.0018556 | -0.0204457 | 0.0172565 | 0.0010715 | -0.0216611 | -0.0054037 | -0.0146860 | 0.0171825 |
| PC7 | -0.0190301 | 0.0112484 | -0.0077473 | 0.0149391 | 0.0039388 | -0.0066601 | 0.0087385 | -0.0034567 |
| PC8 | 0.0045447 | 0.0015508 | -0.0056079 | -0.0078998 | 0.0038796 | 0.0040008 | 0.0023681 | -0.0077592 |
| PC9 | 0.0098652 | -0.0073610 | -0.0096615 | 0.0026235 | -0.0111453 | -0.0064398 | -0.0066015 | -0.0020671 |
| PC10 | 0.0049926 | 0.0086623 | -0.0038186 | -0.0072781 | 0.0059011 | 0.0080428 | 0.0043803 | 0.0020185 |