library(dplyr)
library(ggplot2)
library(glue)

Genetic principal components

To compute the PCA loadings, a similar approach as detailed in the UKBB genotyping QC report guide was used. It’s reproduced here for convenience:

[1] A.L. Price et al. Long-range LD can confound genome scans in admixed populations. The American Journal of Human Genetics, 83(1):132-135, 2008.

We also computed the PCA loadings specifically on the individuals self-reported as British, to capture population structure only on this subset.

Paths

data_dir <- "~/data/PhD/UKBB/"
GWAS_DIR <- "/home/rodrigo/GWAS/output/coma"

gwas_paths <- list(
  "qqplot" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__{as.character(input$z)}__{input$suffix}__QQ-plot.png"),
  "manhattan" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__{as.character(input$z)}__{input$suffix}__manhattan.png"),
  "pooled_qqplot" = file.path(GWAS_DIR, "{input$experiment}/{input$suffix}/figures/GWAS__all__QQ-plot.png")
)

British individuals

Only British individuals were kept, based on self-reported ethnicity (data field 21000, value 1001).

gbr_ids_file <- glue("{data_dir}/subject_ids/british_ids.txt")
cmr_gbr_ids_file <- glue("{data_dir}/subject_ids/cmr_british_ids.txt")
bfile_pattern <- "~/GWAS/data/genotypes/calls/ukb_cal_chr{chromosome}_v2_31803_indiv"
bim_file_pattern <- paste0(bfile_pattern, ".bim")

extract_variants_in_range <- function(chromosome, start, end) {
  bim_file <- glue(bim_file_pattern)
  snp_df <- read.table(bim_file, header=FALSE)
  colnames(snp_df) <- c("chromosome", "rsid", "pos_cm", "position", "allele_1", "allele_2")
  snp_df <- snp_df %>% filter(position > start & position < end)
  snp_df$rsid
}

extract_variants_ambiguous_strand <- function(chromosome) {
  bim_file <- glue(bim_file_pattern)
  snp_df <- read.table(bim_file, header=FALSE)
  colnames(snp_df) <- c("chromosome", "rsid", "pos_cm", "position", "allele_1", "allele_2")
  snp_df <- snp_df %>% filter(allele_1 == "C" & allele_2 == "G" | allele_1 == "G" & allele_2 == "C" | allele_1 == "A" & allele_2 == "T" | allele_1 == "T" & allele_2 == "A")
  as.character(snp_df$rsid)
}

SNPs in regions with long-range LD

long_range_ld_df <- read.table("~/GWAS/data/long_range_LD_regions.bed", sep = "\t", header=TRUE)
long_range_ld.lst <- long_range_ld_df %>% select(1:3) %>% split(., seq(nrow(.))) # dataframe to list by rows
snps_in_ld <- lapply(long_range_ld.lst, function(x) {x <- as.integer(x); extract_variants_in_range(x[1], x[2], x[3])})
snps_in_ld <- unlist(snps_in_ld)

SNPs with A/T and C/G alleles

ambiguous_strand_snps <- lapply(1:22, extract_variants_ambiguous_strand)
ambiguous_strand_snps <- unlist(ambiguous_strand_snps)

MAF ≥ 2.5% and missingness ≤ 1.5%

out_bfile_pattern <- "~/GWAS/data/genotypes/calls/ukb_cal_chr{chromosome}_v2_GBR_indiv"

for(chromosome in 1:22) {
  bfile <- glue(bfile_pattern)
  out_bfile <- glue(out_bfile_pattern)
  plink_command_maf <- glue("plink --keep {cmr_gbr_ids_file} --freq --bfile {bfile} --out {out_bfile}")
  system(plink_command_maf)
  plink_command_missing <- glue("plink --keep {cmr_gbr_ids_file} --missing --bfile {bfile} --out {out_bfile}")
  system(plink_command_missing)
  plink_command_hwe <- glue("plink --keep {cmr_gbr_ids_file} --hardy --bfile {bfile} --out {out_bfile}")
  system(plink_command_hwe)
}

get_snps_below_maf_thr <- function(chromosome, maf_threshold=0.025) {
  maf_file_pattern <- paste0(out_bfile_pattern, ".frq")
  maf_file <- glue(maf_file_pattern)
  maf_df <- read.table(maf_file, header=TRUE)
  maf_df <- maf_df %>% filter(MAF < maf_threshold)
  as.character(maf_df$SNP)
}

get_snps_above_miss_thr <- function(chromosome, missingness_threshold=0.015) {
  missingness_file_pattern <- paste0(out_bfile_pattern, ".lmiss")
  missingness_file <- glue(missingness_file_pattern)
  missingness_df <- read.table(missingness_file, header=TRUE)
  missingness_df <- missingness_df %>% filter(F_MISS > missingness_threshold)
  as.character(missingness_df$SNP)
}

get_snps_below_hwe_p_thr <- function(chromosome, hwe_p_threshold=1e-5) {
  hwe_file_pattern <- paste0(out_bfile_pattern, ".hwe")
  hwe_file <- glue(hwe_file_pattern)
  hwe_df <- read.table(hwe_file, header=TRUE)
  hwe_df <- hwe_df %>% filter(P < hwe_p_threshold)
  as.character(hwe_df$SNP)
}
  
snps_below_maf_thr <- unlist(lapply(1:22, get_snps_below_maf_thr))
snps_above_miss_thr <- unlist(lapply(1:22, get_snps_above_miss_thr))
snps_below_hwe_p_thr <- unlist(lapply(1:22, get_snps_below_hwe_p_thr))

Counts

60267 with C/G or A/T allele pairs.

270481 with MAF < 0.025

114746 with missingness > 0.015

17629 with HWE p-value < \(10^{-5}\).

snps_to_exclude <- unique(
  c(ambiguous_strand_snps, 
  snps_below_maf_thr, 
  snps_above_miss_thr, 
  snps_below_hwe_p_thr)
)

368547 SNPs to exclude given the above criterion.

snps_exclude_df <- as.data.frame(snps_to_exclude)
snps_to_exclude_file <- "~/GWAS/data/genotypes/calls/exclude_snps_for_pca.txt"
write.csv(snps_exclude_df, snps_to_exclude_file, quote = FALSE, row.names = FALSE)