# Load in any libraries
library(magrittr) # allows piping: %>%
# Load in the genotype data
simped <- read.table('GWAS4.ped',header=F,stringsAsFactors = F)
# Load in the .map file
sinmap <- read.table('GWAS4.map',header=T,sep='\t')
# Load in the phenotype data; Aff==1 is unaffected
simpheno <- read.table('GWAS4pheno.txt',header=T,sep='\t')

Question 1

There are slightly more than twice as many columns as SNPs because after accounting for the control probes, there are two bases per SNP because DNA is double-stranded.

Question 2

n.chr1 <- sum(sinmap[,1]==1)

There are 500 SNPs located on chromosome 1 of the MAPFILE.

Question 3

Aff.table <- simpheno$Aff %>% table
Aff.table
## .
##   1   2 
## 162  91

There are 91 people that are affected.

Question 4

# Load in the plink data
res1 <- read.table('GWAS1.assoc',header=T,stringsAsFactors = F)
# Get the number of p-values less than or equal to 0.05
n.P <- sum(res1$P<=0.05)
# Get the number that would occur by chance randomly
n.R <- round(nrow(res1)*0.05)

There are 376 SNPs which have a p-value of less than 0.05.

Question 5

If there data were completely random we would expect 5% of the tests to return p-values less than or equal to 5%. Since we ran 6388 tests, we would expect 319 results with p-values less than 5% even if the data were totally random, which is only 57 less than we found.

Question 6

# Get Bonferroni-adjusted p-value at the 5% level
bonfer <- 0.05/nrow(res1)
# Get the SNPs significant at the Bonferroni correction
res1.bonfer <- res1[res1$P<=bonfer,]

We see that there are 3 SNPs that have p-values less than 5% at the Bonferroni adjusted level, and they are: rs10008252, rs4571722, rs2881297.

Question 7

# Load in qqman package
library(qqman)
# Create the manhattan plot
manhat.dat <- res1[,c('SNP','CHR','BP','P')]
# Find which are greater than -log10>=5
g5 <- which(-log10(manhat.dat$P)>=5)
manhat.sig <- manhat.dat[g5,]
# Plot it
manhattan(manhat.dat,main='Q7: Manhattan Plot',col=c('blue4','orange3'),
          highlight=snpsOfInterest)

As we see in the Manhattan plot above, there are three statstically significant SNPs, the same ones we identified in Question 6, and they are on Chromosomes 4, 8, 12, as the table below shows.

manhat.sig
##             SNP CHR        BP         P
## 1803 rs10008252   4 179853616 1.893e-13
## 3141  rs4571722   8  60326734 2.617e-24
## 4334  rs2881297  12  27298922 1.679e-06

Question 8

g8 <- which(-log10(manhat.dat$P)>=8)
manhat.sig2 <- manhat.dat[g8,]
qq(res1$P,main='Q8: Q-Q plot')

As we can see in the figure above, there are 2 SNPs that have a -log(p-value)>8.