MESA Quality Control Document

This is an R Markdown document to genetate tables and plots after QC for MESA dataset

Set input parameters

    initial.G=0;
    maf_cutoff=0.01; 
    SNPQC_flag=0; 
    getG.stratified_flag=0;
    freqNA.indiv_flag=0; 
    freqNA.indiv_cutoff=0.05; 
    freqNA.SNP_cutoff=0.05; 
    G_diag_threshold=c(0.85,1.15); 
    G_offdiag_threshold=0.05; 
    getG.bicluster_flag=0; 
    getG.acrosscluster_flag=0; 

race = c("Whi","Bla","Mex","Chi")
race.cluster = list(c("Whi","Bla"),c("Whi","Mex"),c("Whi","Chi"),c("Bla","Mex"),c("Bla","Chi"),c("Mex","Chi"))

Set directory and load libraries

########
#commondir="/Users/Sudha/Documents/Projects/WGR_with_Heterogeneous_Pops/MESA/data/"
commondir="/Users/Sudha/Documents/Projects/scratch/MESA/data/"
lib.loc= "/Users/Sudha/Documents/RLib"
folder="simulations/"

library(knitr, lib.loc=lib.loc)
library(tools, lib.loc=lib.loc)
library(ggplot2, lib.loc=lib.loc)
library(labeling, lib.loc=lib.loc)
library(gdata, lib.loc=lib.loc)

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size

package = c("bit","ff","BGData")
lapply(package,library,character.only=TRUE,lib.loc=lib.loc)

## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## 
## The following object is masked from 'package:base':
## 
##     xor
## 
## Attaching package ff
## - getOption("fftempdir")=="/var/folders/tw/1zdm6ys15jxffl3h2p8838080000gn/T//Rtmpifiuhi"
## 
## - getOption("ffextension")=="ff"
## 
## - getOption("ffdrop")==TRUE
## 
## - getOption("fffinonexit")==TRUE
## 
## - getOption("ffpagesize")==65536
## 
## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes
## 
## - getOption("ffbatchbytes")==16777216 -- consider a different value for tuning your system
## 
## - getOption("ffmaxbytes")==536870912 -- consider a different value for tuning your system
## 
## 
## Attaching package: 'ff'
## 
## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list
## 
## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2
## 
## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered

Load pedigree, phenotypes and genotypes

##### Load pedigree
ped = read.table(paste(commondir,"raw/pedigree/phs000209.v11.pht001109.v1.p3.MESA_Pedigree.MULTI.txt",sep=""),sep="\t",header=T)

##### Load phenotypes 
load(paste(commondir,"raw/phenotypes/Pheno_MESA.rda",sep=""))

##### Load genotypes 

setwd(paste(commondir,'raw/plink/BGData_SHARE_MESA_c1.raw/',sep=""))
load('BGData.RData')
head(BGData@pheno)

##       FID   IID PAT MAT SEX PHENOTYPE
## 10036   0 10036   0   0   2        -9
## 12987   0 12987   0   0   1        -9
## 10442   0 10442   0   0   2        -9
## 13434   0 13434   0   0   2        -9
## 12599   0 12599   0   0   2        -9
## 12425   0 12425   0   0   2        -9

dim(BGData@geno)

## [1]   7377 909622

X = BGData@geno

Plot eigen vectors of G for full data

if(initial.G==1){
G = getG(X, scaleCol=T)
EVD = eigen(G)
save(EVD, file=paste(commondir,"raw/bin/EVD_full.rda",sep=""))
}
load(paste(commondir,"raw/bin/EVD_full.rda",sep=""))

race.1 = sub(1,"Whi",pheno$race1c)
race.2 = sub(2,"Chi",race.1)
race.3 = sub(3,"Bla",race.2)
race.char = sub(4,"Hisp",race.3)
race.char[which(pheno$mexican1==1)] = "Mex"
race.char[which(pheno$dominic1==1)] = "DomRep"
race.char[which(pheno$puert1==1)] = "PR"
race.char[which(pheno$cuban1==1)] = "Cuba"
race.char[which(pheno$othhisp1==1)] = "OthHisp"

Remove non-target individuals from dataset

##### Remove related individuals from pedigree

pheno$race.char = race.char
table(pheno$race.char)

## 
##     Bla     Chi    Cuba  DomRep    Hisp     Mex OthHisp      PR     Whi 
##    1991     574      38     120     716     608     152     133    2238

common.id1 = pheno[which(pheno$dbGaP.SubjID %in% ped$dbGaP.SubjID),2]
common.id2 = ped[which(ped$dbGaP.SubjID %in% pheno$dbGaP.SubjID),1]
t1 = match(common.id1,common.id2)
t2 = match(common.id2,common.id1)
id1=common.id1[t2]
id2=common.id2[t1]
stopifnot(all.equal(id1,common.id2))
stopifnot(all.equal(id2,common.id1))
ped.rem = ped[which(ped$dbGaP.SubjID %in% common.id2),]

famid = names(table(ped.rem$FAMID))
keep.dbgapid = matrix(,length(famid),1)
for(i in 1:length(keep.dbgapid))
keep.dbgapid[i,1] = ped.rem[which(as.character(ped.rem$FAMID) == famid[i])[1],1]
dbgapid.ped.remove = ped.rem[-which(ped.rem$dbGaP.SubjID %in% keep.dbgapid),1]
pheno.noped = pheno[-which(pheno$dbGaP.SubjID %in% dbgapid.ped.remove),]

####### Remove Non-Mexican Hispanics

pheno.noped.mex = pheno.noped[which(pheno.noped$race.char %in% race),]
table(pheno.noped.mex$race.char)

## 
##  Bla  Chi  Mex  Whi 
## 1384  527  527 2205

Perform SNP QC

######## Remove SNPs with less than 1% allele frequency and >5% missing values

SNPQC.fn<-function(x,racegrp,n_submatrix,minMAF,maxfreqNA,verbose=TRUE){
  list.files = list.files(paste(commondir,"clean",sep=""))
  lookup = paste("failedQC.SNPs_",racegrp,".dat",sep="")
  if (length(which(grepl(lookup,list.files)))>0)
  unlink(paste(commondir,"clean/",lookup,sep=""))
  
  pheno = pheno.noped.mex[which(pheno.noped.mex$race.char == racegrp),]
  rownames_x =  rownames(x)
   sub = which(rownames_x %in% pheno$sidno)
   stopifnot(all.equal(as.numeric(rownames_x[sub]),pheno$sidno))
  to_column=0;
  from_column<-0
  ncol = 0; ncol_f = 0;
  p=ncol(x)
  delta=ceiling(p/n_submatrix);
  
 fileName <- paste(commondir,"clean/",lookup,sep="")
 names= cbind("SNP","MAF","freqNA")
 write.table(names,row.names=F,col.names=F,file=fileName,sep=' ',append=TRUE)
  for(k in 1:n_submatrix){
    from_column=to_column+1;
    to_column=min(p,from_column+delta-1)
    if(verbose){
     cat("  =>num subjects: ",length(sub)," ",racegrp,"\n")
      cat("Submatrix: ",k," (",from_column,":",to_column,")\n");
      cat("  =>Acquiring genotypes...\n")
    }
    X=x[sub,from_column:to_column];
    ncol = ncol+ncol(X)
    cat("  =>ncol:",ncol,"\n")

 tmp<-colMeans(X,na.rm=TRUE)/2
 maf<-ifelse(tmp>0.5,tmp,tmp)
 VAR<-apply(X=X,FUN=var,MARGIN=2,na.rm=TRUE)
 freqNA <- apply(X, 2, function(x) sum(is.na(x))) / nrow(X) 
 which.snp.remove<-which((maf<minMAF)|(VAR<1e-8)|(freqNA>maxfreqNA))
 snp.remove = colnames(X)[which.snp.remove]
 snp.keep = colnames(X)[-which.snp.remove]
 maf.remove = maf[which.snp.remove]
 freqNA.remove = freqNA[which.snp.remove]
 stopifnot(all.equal(length(snp.remove),length(which.snp.remove)))
writefile = cbind(maf.remove,freqNA.remove)
write.table(writefile,file=fileName,append=TRUE,sep=' ',row.names=T,col.names=F)

      ncol_f = ncol_f+length(snp.keep)
      cat("  =>ncol_final:",ncol_f,"\n")
    
  }
}

if(SNPQC_flag==1){
SNPQC.fn(X,race[race.code],200,maf_cutoff,freqNA.SNP_cutoff,TRUE)
}

######## Identify SNPs segregating in at least one subpopulation

whi = read.table(paste(commondir,"clean/failedQC.SNPs_Whi.dat",sep=""),header=T,sep=' ')
bla = read.table(paste(commondir,"clean/failedQC.SNPs_Bla.dat",sep=""),header=T,sep=' ')
mex = read.table(paste(commondir,"clean/failedQC.SNPs_Mex.dat",sep=""),header=T,sep=' ')
chi = read.table(paste(commondir,"clean/failedQC.SNPs_Chi.dat",sep=""),header=T,sep=' ')
remove.snp.union = union(union(whi[,1],mex[,1]),union(bla[,1],chi[,1]))
remove.snp.intersect = intersect(intersect(whi[,1],mex[,1]),intersect(bla[,1],chi[,1]))
str(whi)

## 'data.frame':    147330 obs. of  3 variables:
##  $ SNP   : Factor w/ 147330 levels "AFFX-SNP_10160179_G",..: 68290 68291 69503 69504 69506 69509 80274 79789 116803 118124 ...
##  $ MAF   : num  0.17487 0.00113 0 0.12638 0.32126 ...
##  $ freqNA: num  0.068934 0.000454 0.000454 0.135147 0.148753 ...

str(bla)

## 'data.frame':    23563 obs. of  3 variables:
##  $ SNP   : Factor w/ 23563 levels "AFFX-SNP_10015773_A",..: 11927 11932 13218 13475 14079 18964 19174 19389 19564 20049 ...
##  $ MAF   : num  0 0 0.16574 0.00109 0.12185 ...
##  $ freqNA: num  0 0 0.08887 0.00506 0.08382 ...

str(mex)

## 'data.frame':    99412 obs. of  3 variables:
##  $ SNP   : Factor w/ 99412 levels "AFFX-SNP_10105880_A",..: 43137 43138 43928 43929 43931 43933 43934 43936 50020 51555 ...
##  $ MAF   : num  0.14693 0.00285 0 0.13263 0.29933 ...
##  $ freqNA: num  0.1025 0 0 0.0987 0.1537 ...

str(chi)

## 'data.frame':    213503 obs. of  3 variables:
##  $ SNP   : Factor w/ 213503 levels "AFFX-SNP_10037362_C",..: 102563 102566 102567 104424 104426 104431 119706 118989 170308 172171 ...
##  $ MAF   : num  0.572 0.222 0 0 0.246 ...
##  $ freqNA: num  0.1063 0.1082 0 0 0.0683 ...

str(remove.snp.union)

##  chr [1:249988] "SNP_A-8282315_C" "SNP_A-8282317_A" ...

 remove.snp.intersect.bypair = vector('list', length(race.cluster))
 remove.snp.intersect.bypair[[1]] = intersect(whi[,1],bla[,1])
 remove.snp.intersect.bypair[[2]] = intersect(whi[,1],mex[,1])
 remove.snp.intersect.bypair[[3]] = intersect(whi[,1],chi[,1])
 remove.snp.intersect.bypair[[4]] = intersect(bla[,1],mex[,1])
 remove.snp.intersect.bypair[[5]] = intersect(bla[,1],chi[,1])
 remove.snp.intersect.bypair[[6]] = intersect(mex[,1],chi[,1])
 names(remove.snp.intersect.bypair) = race.cluster

Removed SNPs : final

head(remove.snp.intersect)

## [1] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A"
## [5] "SNP_A-8554219_T" "SNP_A-8561949_T"

## for across-cluster analyses
length(remove.snp.intersect)

## [1] 4073

## for bi-cluster analyses
str(remove.snp.intersect.bypair)

## List of 6
##  $ c("Whi", "Bla"): chr [1:13224] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A" ...
##  $ c("Whi", "Mex"): chr [1:94551] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289949_C" ...
##  $ c("Whi", "Chi"): chr [1:118073] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289951_A" ...
##  $ c("Bla", "Mex"): chr [1:9219] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8338251_G" "SNP_A-8348835_G" ...
##  $ c("Bla", "Chi"): chr [1:9071] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A" ...
##  $ c("Mex", "Chi"): chr [1:88547] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289951_A" ...

Identify individuals with missing values

############ missing values in individuals

if(freqNA.indiv_flag==1){
freqNA = matrix(,nrow(X),1)
for(i in 1:nrow(X)){
x = as.vector(X[i,])
freqNA[i,1] <- sum(is.na(x))/ ncol(X) 
cat(i, "\n")
}
writeFile = cbind(pheno$sidno, freqNA, pheno$race1c)
fileName <- paste(commondir,"clean/freqNA_individuals.dat",sep="")
write.table(writeFile, file=fileName, row.names=F, col.names=T,sep=" ")
}

Compute G (for stratified analyses)

pheno.sim = pheno[which(pheno.indicatorQC[,2]+pheno.indicatorQC[,3]+pheno.indicatorQC[,4]+pheno.indicatorQC[,6]==0),]

getG.fn <-function(x,racegrp,snps,n_submatrix,scaleCol,verbose){
  pheno.fn = pheno.sim[which(pheno.sim$race.char == racegrp),]
   sub = which(rownames(x) %in% pheno.fn$sidno)
   stopifnot(all.equal(as.numeric(rownames(x)[sub]),pheno.fn$sidno))
  
  K<-0
  to_column<-0;
  from_column<-0
  ncol = 0; ncol_f = 0;
  p=ncol(x);
  n = length(sub)
  delta=ceiling(p/n_submatrix);
  G=matrix(0,nrow=n,ncol=n)
  rownames(G)<-rownames(x)[sub]
  colnames(G)<-rownames(x)[sub]

   for(k in 1:n_submatrix){
    from_column=to_column+1;
    to_column=min(p,from_column+delta-1)
    if(verbose){
     cat("  =>num subjects: ",length(sub)," ",racegrp,"\n")
      cat("Submatrix: ",k," (",from_column,":",to_column,")\n");
      cat("  =>Acquiring genotypes...\n")
    }
    X=x[sub,from_column:to_column];
    ncol = ncol+ncol(X)
    
    tmp = which(colnames(X) %in% snps)
    if(length(tmp)>0) X = X[,-tmp]
    
    cat("  =>ncol:",ncol,"\n")
   if(ncol(X)>0){
      cat("  =>Computing...\n")
      X<-scale(X,center=TRUE,scale=scaleCol)
      VAR<-apply(X=X,FUN=var,MARGIN=2,na.rm=TRUE)
      X<-ifelse(is.na(X),0,X)
      K<-K+ifelse(scaleCol,ncol(X)*(n-1)/n,sum(VAR))
      G<-G+tcrossprod(X)
      ncol_f = ncol_f+ncol(X)
      cat("  =>ncol_final:",ncol_f,"\n")
    }
  }
  G<-G/K
  return(G)
}
if(getG.stratified_flag==1){
unlink(paste(commondir,"clean/stratified/MESA_G_",race[race.code],".rda",sep=""))
G.tmp = getG.fn(X,race[race.code],remove.snp.intersect,200,scaleCol=TRUE,verbose=TRUE)
dir.create(paste(commondir,"clean/stratified",sep=""))
save(G.tmp,file=paste(commondir,"clean/stratified/MESA_G_",race[race.code],".rda",sep=""))
}

Remove nominally related individuals

G = vector("list",length(race))
for(i in 1:length(race)){
load(paste(commondir,"clean/stratified/MESA_G_",race[i],".rda",sep=""))
G[[i]] = G.tmp
}

editG_offD.fn <- function(G, threshold_offD=G_offdiag_threshold)
{
tmp <- G
diag(tmp) <- NA
t <- tmp>threshold_offD
z <- vector(,nrow(tmp))
for(i in 1: nrow(tmp))
z[i] <- ifelse("TRUE" %in% t[i,],i,NA)
z1 <- z[!is.na(z)]
G.ed=G[-z1,-z1]
return(G.ed)
}
editG_D.fn <- function(G, threshold_D=G_diag_threshold)
{
tmp <- G
diag = diag(tmp)
t1 <- which(diag<threshold_D[1])
t2 <- which(diag>threshold_D[2])
z1 <- c(t1,t2)
if(length(z1)>0) {G.ed=tmp[-z1,-z1]
} else G.ed = tmp
return(G.ed)
}
names = vector("list",length(race))
for(i in 1:length(race)) {
G[[i]] = editG_offD.fn(G[[i]])
G[[i]] = editG_D.fn(G[[i]])
names[[i]] = colnames(G[[i]])
}
idnames = as.numeric(c(names[[1]],names[[2]],names[[3]],names[[4]]))
pheno_MESA.postQC = pheno.noped.mex[which(pheno.noped.mex$sidno %in% idnames),]
dir.create(paste(commondir,"clean/stratified",sep=""))

## Warning in dir.create(paste(commondir, "clean/stratified", sep = "")): '/
## Users/Sudha/Documents/Projects/scratch/MESA/data/clean/stratified' already
## exists

names(G) = race
if(race.code==1)
save(G,pheno_MESA.postQC,file=paste(commondir,"clean/stratified/MESA_editedObjects.rda",sep=""))

Diagonals of edited G

load(paste(commondir,"clean/stratified/MESA_editedObjects.rda",sep=""))
par(mfrow=c(2,2))
for(i in 1:length(race))
plot(diag(G[[i]]), main=names(G)[i],ylab="diag(G)")

Off-diagonals of edited G

par(mfrow=c(2,2))
for(i in 1:length(race))
plot(lowerTriangle(G[[i]]), main=names(G)[i],ylab="LowerTri(G)",pch=".")

rm(G)
rm(pheno_MESA.postQC)

pheno.indicatorQC[,5] = ifelse(pheno$sidno %in% idnames,0,1) 
if(race.code==1)
write.table(pheno.indicatorQC, file=paste(commondir,"clean/individuals_postQCIndicators.dat",sep=""),col.names=T,row.names=F,sep='\t')
rm(idnames)
rm(pheno.sim)

head(pheno.indicatorQC)

##   Individual Pedigree Non-Mexican_Hispanics Missing_Race Nominally_related
## 1      10036        0                     0            0                 1
## 2      12987        0                     0            0                 1
## 3      10442        0                     0            0                 0
## 4      13434        0                     0            0                 0
## 5      12599        0                     0            0                 0
## 6      12425        0                     0            0                 0
##   freqNA>5%
## 1         0
## 2         0
## 3         0
## 4         0
## 5         0
## 6         0

Removed individuals: final

apply(pheno.indicatorQC[,2:ncol(pheno.indicatorQC)], 2, sum)

##              Pedigree Non-Mexican_Hispanics          Missing_Race 
##                  1350                  1157                   807 
##     Nominally_related             freqNA>5% 
##                  3911                     0

pheno.indicatorQC = read.table(paste(commondir,"clean/individuals_postQCIndicators.dat",sep=""),header=T, sep="\t")
pheno.sim = pheno[which(pheno.indicatorQC[,2]+pheno.indicatorQC[,3]+pheno.indicatorQC[,4]+pheno.indicatorQC[,5]+pheno.indicatorQC[,6]==0),]

head(pheno.sim)

##               File dbGaP.SubjID sidno age4c agecat4c race1c mexican1
## 401  Exam4Main.HMB       252618 10442    51        1      1        0
## 3130 Exam4Main.HMB       252148 13434    52        1      1        0
## 2367 Exam4Main.HMB       252669 12599    64        2      1        0
## 2207 Exam4Main.HMB       251459 12425    85        5      1        0
## 3030 Exam4Main.HMB       246051 13319    77        4      1        0
## 3870 Exam4Main.HMB       247656 14245    55        2      1        0
##      dominic1 puert1 cuban1 othhisp1 lang1 gender1 site4c month4 season4
## 401         0      0      0        0     1       0      7     12       4
## 3130        0      0      0        0     1       0      8     12       4
## 2367        0      0      0        0     1       0      4     12       4
## 2207        0      0      0        0     1       0      7     12       4
## 3030        0      0      0        0     1       1      5      6       2
## 3870        0      0      0        0     1       1      6      1       1
##      htcm4 wtlb4    bmi4c bmicat4c ldl4 hdl4 trig4 chol4 race.char
## 401  170.6 145.0 22.59868        1  154  102    50   266       Whi
## 3130 169.8 140.8 22.15136        1  136   65   203   242       Whi
## 2367 164.9 167.0 27.85789        2   91   51    71   156       Whi
## 2207 149.0 122.0 24.92644        1   64  105   118   193       Whi
## 3030 167.4 186.5 30.18846        3   76   63   114   162       Whi
## 3870 181.4 205.5 28.32763        2  135   48   133   210       Whi

table(pheno.sim$race.char)

## 
##  Bla  Chi  Mex  Whi 
## 1028  183  369 1886

length(which(!pheno$sidno %in% pheno.sim$sidno))

## [1] 3911

Compute G (for across-cluster analyses)

if(getG.acrosscluster_flag==1){
dir.create(paste(commondir,"clean/across_cluster",sep=""))
G = getG.fn3(X,remove.snp.intersect,400,scaleCol=TRUE,verbose=TRUE)
pheno_MESA.postQC=pheno.noped.mex[which(pheno.noped.mex$sidno %in% colnames(G)),]
save(G,pheno_MESA.postQC,file=paste(commondir,"clean/across_cluster/MESA_editedObjects.rda",sep=""))
}

MESA Quality Control Document

Yogasudha Veturi

02 Aug 2013