Dataset

The dataset comes from an association study around the APOE locus by the Japanese Genetic Study Consortium for Alzheimer Disease (PMID: 19442637). We simulated the data at neutral sites that were omitted from the publication. 547 patients with LOAD and 715 controls were genotyped at SNPs in a 200 kb region including the APOE locus. 171 SNPs passed quality control.

The study was conducted in a homogeneous Japanese population. This avoids spurious associations due to population stratification, i.e., mixing populations that are heterogeneous with respect to allele frequencies and disease prevalence.

Each SNP site has 2 alleles in the sample gene pool. The allele in lowercase is the minor allele, i.e., the allele that is less frequent in the population.

library(tidyverse)
geno <- read_csv("https://raw.githubusercontent.com/mss-genomics/first-meeting/main/APOE_LOAD.csv")
geno
## # A tibble: 171 × 9
##    SNP_ID    Allele   MAF LOAD_MM LOAD_Mm LOAD_mm Control_MM Control_Mm Contro…¹
##    <chr>     <chr>  <dbl>   <dbl>   <dbl>   <dbl>      <dbl>      <dbl>    <dbl>
##  1 rs419010  A/g    0.493     112     278     150        206        348      150
##  2 rs394221  C/t    0.487     149     269      91        153        325      180
##  3 rs4803766 G/a    0.434     136     273     130        265        328      107
##  4 rs395908  G/a    0.28      228     240      70        414        261       26
##  5 rs519113  C/g    0.273     237     226      73        423        258       23
##  6 rs412776  G/a    0.257     245     230      65        440        237       19
##  7 rs3865427 C/a    0.209     299     198      43        481        206       14
##  8 rs3852860 T/c    0.321     191     268      77        359        292       36
##  9 rs3852861 T/g    0.318     196     261      80        375        294       39
## 10 rs6857    C/t    0.18      316     184      26        504        183       11
## # … with 161 more rows, and abbreviated variable name ¹​Control_mm

Statistics

First, calculate the allele counts from the genotype counts.

geno1 <- geno %>% rowwise() %>%
  mutate(LOAD_M=2*LOAD_MM+LOAD_Mm) %>% 
  mutate(LOAD_m=2*LOAD_mm+LOAD_Mm) %>% 
  mutate(Control_M=2*Control_MM+Control_Mm) %>%
  mutate(Control_m=2*Control_mm+Control_Mm)
geno1[,c(1:3,10:13)]
## # A tibble: 171 × 7
## # Rowwise: 
##    SNP_ID    Allele   MAF LOAD_M LOAD_m Control_M Control_m
##    <chr>     <chr>  <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
##  1 rs419010  A/g    0.493    502    578       760       648
##  2 rs394221  C/t    0.487    567    451       631       685
##  3 rs4803766 G/a    0.434    545    533       858       542
##  4 rs395908  G/a    0.28     696    380      1089       313
##  5 rs519113  C/g    0.273    700    372      1104       304
##  6 rs412776  G/a    0.257    720    360      1117       275
##  7 rs3865427 C/a    0.209    796    284      1168       234
##  8 rs3852860 T/c    0.321    650    422      1010       364
##  9 rs3852861 T/g    0.318    653    421      1044       372
## 10 rs6857    C/t    0.18     816    236      1191       205
## # … with 161 more rows

Association (i.e. lack of independence) between the genotypes at a site and the disease can be tested by a chi-square test with 2 d.f. for the 3x2 contingency table of genotype counts.

Here is an example for the E4 allele of APOE:

E4 <- "rs429358"
E4_genotype_table <- rbind(c(geno1[geno1$SNP_ID==E4,]$Control_MM, geno1[geno1$SNP_ID==E4,]$LOAD_MM), c(geno1[geno1$SNP_ID==E4,]$Control_Mm,geno1[geno1$SNP_ID==E4,]$LOAD_Mm), c(geno1[geno1$SNP_ID==E4,]$Control_mm,geno1[geno1$SNP_ID==E4,]$LOAD_mm))
chisq.test(E4_genotype_table)$p.value
## [1] 7.02883e-31

Calculate the odds ratios and their 95% confidence interval for the E4/E4 and E3/E4 genotypes compared to the E3/E3 genotype.

library(epitools)
oddsratio(E4_genotype_table)$measure
##           odds ratio with 95% C.I.
## Predictor  estimate     lower      upper
##   Exposed1  1.00000        NA         NA
##   Exposed2  3.22048  2.478019    4.20017
##   Exposed3 81.26147 17.910623 1899.72514

The Cochran-Armitage trend test requires a specific genetic model. For example, test whether the disease risk increases linearly with the number of risk alleles, i.e., the additive model. Also test for a recessive model or a dominant model.

trend_add <- prop.trend.test(E4_genotype_table[,1], E4_genotype_table[,1]+E4_genotype_table[,2], c(0,0.5,1))
trend_dom <- prop.trend.test(E4_genotype_table[,1], E4_genotype_table[,1]+E4_genotype_table[,2], c(0,1,1))
trend_rec <- prop.trend.test(E4_genotype_table[,1], E4_genotype_table[,1]+E4_genotype_table[,2], c(0,0,1))
c(trend_add$p.value, trend_dom$p.value, trend_rec$p.value)
## [1] 6.648448e-32 3.117625e-27 7.303970e-15

A commonly used significance test compares the allele frequencies between cases and controls using a chi-square test with 1 d.f. for the 2x2 contingency table of allele counts.

E4_allele_table <- rbind(c(geno1[geno1$SNP_ID==E4,]$Control_M, geno1[geno1$SNP_ID==E4,]$LOAD_M),c(geno1[geno1$SNP_ID==E4,]$Control_m,geno1[geno1$SNP_ID==E4,]$LOAD_m))
oddsratio(E4_allele_table)
## $data
##           Outcome
## Predictor  Disease1 Disease2 Total
##   Exposed1     1304      798  2102
##   Exposed2      126      296   422
##   Total        1430     1094  2524
## 
## $measure
##           odds ratio with 95% C.I.
## Predictor  estimate    lower    upper
##   Exposed1 1.000000       NA       NA
##   Exposed2 3.834175 3.064376 4.821686
## 
## $p.value
##           two-sided
## Predictor  midp.exact fisher.exact   chi.square
##   Exposed1         NA           NA           NA
##   Exposed2          0 5.211636e-34 4.319038e-34
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"

In what follows, calculate the allele-based chi-square test and odds ratio with 95% confidence intervals for each SNP.

geno1 <- geno1 %>% rowwise() %>%
  mutate(stats = paste(unlist(oddsratio(rbind(c(Control_M,LOAD_M),c(Control_m,LOAD_m))))[c("p.value6","measure2","measure4","measure6")], collapse=" ")) %>%
  separate(stats, c("p_value", "OR", "95_CI_inf", "95_CI_sup"), sep = " ", convert=TRUE)
geno1[,c(1:2,10:17)]
## # A tibble: 171 × 10
##    SNP_ID    Allele LOAD_M LOAD_m Contr…¹ Contr…²  p_value    OR 95_CI…³ 95_CI…⁴
##    <chr>     <chr>   <dbl>  <dbl>   <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
##  1 rs419010  A/g       502    578     760     648 2.10e- 4 1.35    1.15    1.58 
##  2 rs394221  C/t       567    451     631     685 2.04e- 4 0.733   0.622   0.864
##  3 rs4803766 G/a       545    533     858     542 9.16e- 8 1.55    1.32    1.82 
##  4 rs395908  G/a       696    380    1089     313 9.24e-13 1.90    1.59    2.27 
##  5 rs519113  C/g       700    372    1104     304 3.77e-13 1.93    1.61    2.31 
##  6 rs412776  G/a       720    360    1117     275 1.81e-14 2.03    1.69    2.44 
##  7 rs3865427 C/a       796    284    1168     234 5.27e- 9 1.78    1.47    2.16 
##  8 rs3852860 T/c       650    422    1010     364 1.33e-11 1.80    1.52    2.14 
##  9 rs3852861 T/g       653    421    1044     372 6.99e-12 1.81    1.53    2.15 
## 10 rs6857    C/t       816    236    1191     205 7.88e- 7 1.68    1.37    2.07 
## # … with 161 more rows, and abbreviated variable names ¹​Control_M, ²​Control_m,
## #   ³​`95_CI_inf`, ⁴​`95_CI_sup`

In order to adjust for multiple testing, apply the Bonferonni correction. Divide the desired genome-wide significance level by the number of tests performed: 0.05/171=0.0003. Consider an SNP to be statistically significant if its p-value<0.0003. In a GWAS involving 100K to 1M SNPs, the significance level is typically set to 5x10^-8.

Ensembl

Each rsID denotes a unique SNP site. Query the Ensembl database to obtain more information about an SNP site, such as its position on the chromosome and its genetic context.

library(biomaRt)
marts <- listEnsembl()
ensembl <- useEnsembl(biomart = "snps")
datasets <- searchDatasets(mart = ensembl, pattern = "Human")
ensembl <- useDataset(dataset = "hsapiens_snp", mart = ensembl)
filters <- listFilters(ensembl)
attributes <- listAttributes(ensembl)
rsid <- getBM(attributes = c('refsnp_id', 'chr_name', 'chrom_start', 'ensembl_gene_name', "cds_start"),
  filters = 'snp_filter',
  values = geno1$SNP_ID, 
  mart = ensembl
)
ensembl2 <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
symbols <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name'),
  filters = 'ensembl_gene_id',
  values = unique(rsid$ensembl_gene_name),
  mart = ensembl2
)
geno2 <- rsid %>% as_tibble() %>%
  left_join(symbols, by = join_by(ensembl_gene_name == ensembl_gene_id)) %>%
  left_join(geno1, by = join_by(refsnp_id == SNP_ID))
geno2[,1:8]
## # A tibble: 171 × 8
##    refsnp_id chr_name chrom_start ensembl_gene_name cds_s…¹ exter…² Allele   MAF
##    <chr>        <int>       <int> <chr>               <int> <chr>   <chr>  <dbl>
##  1 rs5120          19    44948363 ENSG00000234906        NA APOC2   A/t    0.37 
##  2 rs5157          19    44943904 ENSG00000267467        NA APOC4   C/t    0.32 
##  3 rs5158          19    44943921 ENSG00000267467        NA APOC4   C/t    0.07 
##  4 rs5167          19    44945208 ENSG00000267467       287 APOC4   T/g    0.48 
##  5 rs6857          19    44888997 ENSG00000130202        NA NECTIN2 C/t    0.18 
##  6 rs6859          19    44878777 ENSG00000130202        NA NECTIN2 G/a    0.4  
##  7 rs7026          19    44821259 ENSG00000187244        NA BCAM    C/t    0.36 
##  8 rs7412          19    44908822 ENSG00000130203       526 APOE    C/t    0.04 
##  9 rs9193          19    44993045 ENSG00000104853        NA CLPTM1  C/t    0.05 
## 10 rs10119         19    44903416 ENSG00000130204        NA TOMM40  C/t    0.229
## # … with 161 more rows, and abbreviated variable names ¹​cds_start,
## #   ²​external_gene_name

Plot

Plot the Q-Q plot, which compares the theoretical distribution of p-values under the null hypothesis to the observed p-values from the sample.

expected <- c(1:nrow(geno1))/(nrow(geno1)+1)
lexp <- -log10(expected)
qq <- geno1 %>% dplyr::select(p_value, MAF) %>% 
  dplyr::arrange(p_value) %>% 
  mutate(p_value=-log10(p_value)) %>%
  add_column(lexp)
ggplot(qq, aes(x=lexp, y=p_value, colour=MAF)) + 
  geom_point(alpha=0.4, size=3.5) +
  scale_colour_gradient(low = "red", high = "yellow") +
  scale_x_continuous(limits=c(0,5), expand = c(0, 0)) +
  scale_y_continuous(limits=c(0, 35), expand = c(0, 0)) +
  geom_abline(intercept=0, slope=1, color="red") +
  xlab("-log10(expected p-value)") + 
  ylab("-log10(observed p-value)") +
  theme_bw()

Plot the odds ratios and p-values in a volcano plot. Notice that common alleles cluster towards the pit: common polymorphisms tend to be neutral. Notice that rare alleles barely reach statistical significance. GWAS of unrelated individuals do not have the power to detect very rare risk alleles. However, very rare alleles with large effect may collectively explain an important part of the Alzheimer ‘missing heritability’.

ggplot(geno1, aes(x=log2(OR), y=-log10(p_value), colour=MAF)) + 
  geom_point(alpha=0.4, size=3.5) +
  scale_colour_gradient(low = "red", high = "yellow") +
  scale_x_continuous(limits=c(-2,2)) +
  scale_y_continuous(limits=c(0, 40), expand = c(0, 0)) +
  geom_hline(yintercept=-log10(0.0003), linetype="dashed") +
  theme_bw()

Export a table of p-values ordered by chromosomal position.

zoom <- geno2 %>% dplyr::select(chr_name,chrom_start,p_value) %>% dplyr::arrange(chrom_start)
write_tsv(zoom, "zoom.txt")
zoom
## # A tibble: 171 × 3
##    chr_name chrom_start p_value
##       <int>       <int>   <dbl>
##  1       19    44792828  0.847 
##  2       19    44806629  0.0332
##  3       19    44808266  0.648 
##  4       19    44808779  0.337 
##  5       19    44808857  0.481 
##  6       19    44810458  0.182 
##  7       19    44811510  0.290 
##  8       19    44813151  0.755 
##  9       19    44813547  0.441 
## 10       19    44813933  0.223 
## # … with 161 more rows

By uploading the table to LocusZoom, create a regional Manhattan plot. Note that the statistically significant associations around the E4 allele fall between two recombination hotspots.