This is an R Markdown document to genetate tables and plots after QC for MESA dataset
Set input parameters
initial.G=0;
maf_cutoff=0.01;
SNPQC_flag=0;
getG.stratified_flag=0;
freqNA.indiv_flag=0;
freqNA.indiv_cutoff=0.05;
freqNA.SNP_cutoff=0.05;
G_diag_threshold=c(0.85,1.15);
G_offdiag_threshold=0.05;
getG.bicluster_flag=0;
getG.acrosscluster_flag=0;
race = c("Whi","Bla","Mex","Chi")
race.cluster = list(c("Whi","Bla"),c("Whi","Mex"),c("Whi","Chi"),c("Bla","Mex"),c("Bla","Chi"),c("Mex","Chi"))
Set directory and load libraries
########
#commondir="/Users/Sudha/Documents/Projects/WGR_with_Heterogeneous_Pops/MESA/data/"
commondir="/Users/Sudha/Documents/Projects/scratch/MESA/data/"
lib.loc= "/Users/Sudha/Documents/RLib"
folder="simulations/"
library(knitr, lib.loc=lib.loc)
library(tools, lib.loc=lib.loc)
library(ggplot2, lib.loc=lib.loc)
library(labeling, lib.loc=lib.loc)
library(gdata, lib.loc=lib.loc)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
package = c("bit","ff","BGData")
lapply(package,library,character.only=TRUE,lib.loc=lib.loc)
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
##
## Attaching package: 'bit'
##
## The following object is masked from 'package:base':
##
## xor
##
## Attaching package ff
## - getOption("fftempdir")=="/var/folders/tw/1zdm6ys15jxffl3h2p8838080000gn/T//Rtmpifiuhi"
##
## - getOption("ffextension")=="ff"
##
## - getOption("ffdrop")==TRUE
##
## - getOption("fffinonexit")==TRUE
##
## - getOption("ffpagesize")==65536
##
## - getOption("ffcaching")=="mmnoflush" -- consider "ffeachflush" if your system stalls on large writes
##
## - getOption("ffbatchbytes")==16777216 -- consider a different value for tuning your system
##
## - getOption("ffmaxbytes")==536870912 -- consider a different value for tuning your system
##
##
## Attaching package: 'ff'
##
## The following objects are masked from 'package:bit':
##
## clone, clone.default, clone.list
##
## The following objects are masked from 'package:utils':
##
## write.csv, write.csv2
##
## The following objects are masked from 'package:base':
##
## is.factor, is.ordered
Load pedigree, phenotypes and genotypes
##### Load pedigree
ped = read.table(paste(commondir,"raw/pedigree/phs000209.v11.pht001109.v1.p3.MESA_Pedigree.MULTI.txt",sep=""),sep="\t",header=T)
##### Load phenotypes
load(paste(commondir,"raw/phenotypes/Pheno_MESA.rda",sep=""))
##### Load genotypes
setwd(paste(commondir,'raw/plink/BGData_SHARE_MESA_c1.raw/',sep=""))
load('BGData.RData')
head(BGData@pheno)
## FID IID PAT MAT SEX PHENOTYPE
## 10036 0 10036 0 0 2 -9
## 12987 0 12987 0 0 1 -9
## 10442 0 10442 0 0 2 -9
## 13434 0 13434 0 0 2 -9
## 12599 0 12599 0 0 2 -9
## 12425 0 12425 0 0 2 -9
dim(BGData@geno)
## [1] 7377 909622
X = BGData@geno
Plot eigen vectors of G for full data
if(initial.G==1){
G = getG(X, scaleCol=T)
EVD = eigen(G)
save(EVD, file=paste(commondir,"raw/bin/EVD_full.rda",sep=""))
}
load(paste(commondir,"raw/bin/EVD_full.rda",sep=""))
race.1 = sub(1,"Whi",pheno$race1c)
race.2 = sub(2,"Chi",race.1)
race.3 = sub(3,"Bla",race.2)
race.char = sub(4,"Hisp",race.3)
race.char[which(pheno$mexican1==1)] = "Mex"
race.char[which(pheno$dominic1==1)] = "DomRep"
race.char[which(pheno$puert1==1)] = "PR"
race.char[which(pheno$cuban1==1)] = "Cuba"
race.char[which(pheno$othhisp1==1)] = "OthHisp"
Remove non-target individuals from dataset
##### Remove related individuals from pedigree
pheno$race.char = race.char
table(pheno$race.char)
##
## Bla Chi Cuba DomRep Hisp Mex OthHisp PR Whi
## 1991 574 38 120 716 608 152 133 2238
common.id1 = pheno[which(pheno$dbGaP.SubjID %in% ped$dbGaP.SubjID),2]
common.id2 = ped[which(ped$dbGaP.SubjID %in% pheno$dbGaP.SubjID),1]
t1 = match(common.id1,common.id2)
t2 = match(common.id2,common.id1)
id1=common.id1[t2]
id2=common.id2[t1]
stopifnot(all.equal(id1,common.id2))
stopifnot(all.equal(id2,common.id1))
ped.rem = ped[which(ped$dbGaP.SubjID %in% common.id2),]
famid = names(table(ped.rem$FAMID))
keep.dbgapid = matrix(,length(famid),1)
for(i in 1:length(keep.dbgapid))
keep.dbgapid[i,1] = ped.rem[which(as.character(ped.rem$FAMID) == famid[i])[1],1]
dbgapid.ped.remove = ped.rem[-which(ped.rem$dbGaP.SubjID %in% keep.dbgapid),1]
pheno.noped = pheno[-which(pheno$dbGaP.SubjID %in% dbgapid.ped.remove),]
####### Remove Non-Mexican Hispanics
pheno.noped.mex = pheno.noped[which(pheno.noped$race.char %in% race),]
table(pheno.noped.mex$race.char)
##
## Bla Chi Mex Whi
## 1384 527 527 2205
Perform SNP QC
######## Remove SNPs with less than 1% allele frequency and >5% missing values
SNPQC.fn<-function(x,racegrp,n_submatrix,minMAF,maxfreqNA,verbose=TRUE){
list.files = list.files(paste(commondir,"clean",sep=""))
lookup = paste("failedQC.SNPs_",racegrp,".dat",sep="")
if (length(which(grepl(lookup,list.files)))>0)
unlink(paste(commondir,"clean/",lookup,sep=""))
pheno = pheno.noped.mex[which(pheno.noped.mex$race.char == racegrp),]
rownames_x = rownames(x)
sub = which(rownames_x %in% pheno$sidno)
stopifnot(all.equal(as.numeric(rownames_x[sub]),pheno$sidno))
to_column=0;
from_column<-0
ncol = 0; ncol_f = 0;
p=ncol(x)
delta=ceiling(p/n_submatrix);
fileName <- paste(commondir,"clean/",lookup,sep="")
names= cbind("SNP","MAF","freqNA")
write.table(names,row.names=F,col.names=F,file=fileName,sep=' ',append=TRUE)
for(k in 1:n_submatrix){
from_column=to_column+1;
to_column=min(p,from_column+delta-1)
if(verbose){
cat(" =>num subjects: ",length(sub)," ",racegrp,"\n")
cat("Submatrix: ",k," (",from_column,":",to_column,")\n");
cat(" =>Acquiring genotypes...\n")
}
X=x[sub,from_column:to_column];
ncol = ncol+ncol(X)
cat(" =>ncol:",ncol,"\n")
tmp<-colMeans(X,na.rm=TRUE)/2
maf<-ifelse(tmp>0.5,tmp,tmp)
VAR<-apply(X=X,FUN=var,MARGIN=2,na.rm=TRUE)
freqNA <- apply(X, 2, function(x) sum(is.na(x))) / nrow(X)
which.snp.remove<-which((maf<minMAF)|(VAR<1e-8)|(freqNA>maxfreqNA))
snp.remove = colnames(X)[which.snp.remove]
snp.keep = colnames(X)[-which.snp.remove]
maf.remove = maf[which.snp.remove]
freqNA.remove = freqNA[which.snp.remove]
stopifnot(all.equal(length(snp.remove),length(which.snp.remove)))
writefile = cbind(maf.remove,freqNA.remove)
write.table(writefile,file=fileName,append=TRUE,sep=' ',row.names=T,col.names=F)
ncol_f = ncol_f+length(snp.keep)
cat(" =>ncol_final:",ncol_f,"\n")
}
}
if(SNPQC_flag==1){
SNPQC.fn(X,race[race.code],200,maf_cutoff,freqNA.SNP_cutoff,TRUE)
}
######## Identify SNPs segregating in at least one subpopulation
whi = read.table(paste(commondir,"clean/failedQC.SNPs_Whi.dat",sep=""),header=T,sep=' ')
bla = read.table(paste(commondir,"clean/failedQC.SNPs_Bla.dat",sep=""),header=T,sep=' ')
mex = read.table(paste(commondir,"clean/failedQC.SNPs_Mex.dat",sep=""),header=T,sep=' ')
chi = read.table(paste(commondir,"clean/failedQC.SNPs_Chi.dat",sep=""),header=T,sep=' ')
remove.snp.union = union(union(whi[,1],mex[,1]),union(bla[,1],chi[,1]))
remove.snp.intersect = intersect(intersect(whi[,1],mex[,1]),intersect(bla[,1],chi[,1]))
str(whi)
## 'data.frame': 147330 obs. of 3 variables:
## $ SNP : Factor w/ 147330 levels "AFFX-SNP_10160179_G",..: 68290 68291 69503 69504 69506 69509 80274 79789 116803 118124 ...
## $ MAF : num 0.17487 0.00113 0 0.12638 0.32126 ...
## $ freqNA: num 0.068934 0.000454 0.000454 0.135147 0.148753 ...
str(bla)
## 'data.frame': 23563 obs. of 3 variables:
## $ SNP : Factor w/ 23563 levels "AFFX-SNP_10015773_A",..: 11927 11932 13218 13475 14079 18964 19174 19389 19564 20049 ...
## $ MAF : num 0 0 0.16574 0.00109 0.12185 ...
## $ freqNA: num 0 0 0.08887 0.00506 0.08382 ...
str(mex)
## 'data.frame': 99412 obs. of 3 variables:
## $ SNP : Factor w/ 99412 levels "AFFX-SNP_10105880_A",..: 43137 43138 43928 43929 43931 43933 43934 43936 50020 51555 ...
## $ MAF : num 0.14693 0.00285 0 0.13263 0.29933 ...
## $ freqNA: num 0.1025 0 0 0.0987 0.1537 ...
str(chi)
## 'data.frame': 213503 obs. of 3 variables:
## $ SNP : Factor w/ 213503 levels "AFFX-SNP_10037362_C",..: 102563 102566 102567 104424 104426 104431 119706 118989 170308 172171 ...
## $ MAF : num 0.572 0.222 0 0 0.246 ...
## $ freqNA: num 0.1063 0.1082 0 0 0.0683 ...
str(remove.snp.union)
## chr [1:249988] "SNP_A-8282315_C" "SNP_A-8282317_A" ...
remove.snp.intersect.bypair = vector('list', length(race.cluster))
remove.snp.intersect.bypair[[1]] = intersect(whi[,1],bla[,1])
remove.snp.intersect.bypair[[2]] = intersect(whi[,1],mex[,1])
remove.snp.intersect.bypair[[3]] = intersect(whi[,1],chi[,1])
remove.snp.intersect.bypair[[4]] = intersect(bla[,1],mex[,1])
remove.snp.intersect.bypair[[5]] = intersect(bla[,1],chi[,1])
remove.snp.intersect.bypair[[6]] = intersect(mex[,1],chi[,1])
names(remove.snp.intersect.bypair) = race.cluster
Removed SNPs : final
head(remove.snp.intersect)
## [1] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A"
## [5] "SNP_A-8554219_T" "SNP_A-8561949_T"
## for across-cluster analyses
length(remove.snp.intersect)
## [1] 4073
## for bi-cluster analyses
str(remove.snp.intersect.bypair)
## List of 6
## $ c("Whi", "Bla"): chr [1:13224] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A" ...
## $ c("Whi", "Mex"): chr [1:94551] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289949_C" ...
## $ c("Whi", "Chi"): chr [1:118073] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289951_A" ...
## $ c("Bla", "Mex"): chr [1:9219] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8338251_G" "SNP_A-8348835_G" ...
## $ c("Bla", "Chi"): chr [1:9071] "SNP_A-8289947_0" "SNP_A-8289978_T" "SNP_A-8348835_G" "SNP_A-8547170_A" ...
## $ c("Mex", "Chi"): chr [1:88547] "SNP_A-8282315_C" "SNP_A-8282317_A" "SNP_A-8289947_0" "SNP_A-8289951_A" ...
Identify individuals with missing values
############ missing values in individuals
if(freqNA.indiv_flag==1){
freqNA = matrix(,nrow(X),1)
for(i in 1:nrow(X)){
x = as.vector(X[i,])
freqNA[i,1] <- sum(is.na(x))/ ncol(X)
cat(i, "\n")
}
writeFile = cbind(pheno$sidno, freqNA, pheno$race1c)
fileName <- paste(commondir,"clean/freqNA_individuals.dat",sep="")
write.table(writeFile, file=fileName, row.names=F, col.names=T,sep=" ")
}
Compute G (for stratified analyses)
pheno.sim = pheno[which(pheno.indicatorQC[,2]+pheno.indicatorQC[,3]+pheno.indicatorQC[,4]+pheno.indicatorQC[,6]==0),]
getG.fn <-function(x,racegrp,snps,n_submatrix,scaleCol,verbose){
pheno.fn = pheno.sim[which(pheno.sim$race.char == racegrp),]
sub = which(rownames(x) %in% pheno.fn$sidno)
stopifnot(all.equal(as.numeric(rownames(x)[sub]),pheno.fn$sidno))
K<-0
to_column<-0;
from_column<-0
ncol = 0; ncol_f = 0;
p=ncol(x);
n = length(sub)
delta=ceiling(p/n_submatrix);
G=matrix(0,nrow=n,ncol=n)
rownames(G)<-rownames(x)[sub]
colnames(G)<-rownames(x)[sub]
for(k in 1:n_submatrix){
from_column=to_column+1;
to_column=min(p,from_column+delta-1)
if(verbose){
cat(" =>num subjects: ",length(sub)," ",racegrp,"\n")
cat("Submatrix: ",k," (",from_column,":",to_column,")\n");
cat(" =>Acquiring genotypes...\n")
}
X=x[sub,from_column:to_column];
ncol = ncol+ncol(X)
tmp = which(colnames(X) %in% snps)
if(length(tmp)>0) X = X[,-tmp]
cat(" =>ncol:",ncol,"\n")
if(ncol(X)>0){
cat(" =>Computing...\n")
X<-scale(X,center=TRUE,scale=scaleCol)
VAR<-apply(X=X,FUN=var,MARGIN=2,na.rm=TRUE)
X<-ifelse(is.na(X),0,X)
K<-K+ifelse(scaleCol,ncol(X)*(n-1)/n,sum(VAR))
G<-G+tcrossprod(X)
ncol_f = ncol_f+ncol(X)
cat(" =>ncol_final:",ncol_f,"\n")
}
}
G<-G/K
return(G)
}
if(getG.stratified_flag==1){
unlink(paste(commondir,"clean/stratified/MESA_G_",race[race.code],".rda",sep=""))
G.tmp = getG.fn(X,race[race.code],remove.snp.intersect,200,scaleCol=TRUE,verbose=TRUE)
dir.create(paste(commondir,"clean/stratified",sep=""))
save(G.tmp,file=paste(commondir,"clean/stratified/MESA_G_",race[race.code],".rda",sep=""))
}
Remove nominally related individuals
G = vector("list",length(race))
for(i in 1:length(race)){
load(paste(commondir,"clean/stratified/MESA_G_",race[i],".rda",sep=""))
G[[i]] = G.tmp
}
editG_offD.fn <- function(G, threshold_offD=G_offdiag_threshold)
{
tmp <- G
diag(tmp) <- NA
t <- tmp>threshold_offD
z <- vector(,nrow(tmp))
for(i in 1: nrow(tmp))
z[i] <- ifelse("TRUE" %in% t[i,],i,NA)
z1 <- z[!is.na(z)]
G.ed=G[-z1,-z1]
return(G.ed)
}
editG_D.fn <- function(G, threshold_D=G_diag_threshold)
{
tmp <- G
diag = diag(tmp)
t1 <- which(diag<threshold_D[1])
t2 <- which(diag>threshold_D[2])
z1 <- c(t1,t2)
if(length(z1)>0) {G.ed=tmp[-z1,-z1]
} else G.ed = tmp
return(G.ed)
}
names = vector("list",length(race))
for(i in 1:length(race)) {
G[[i]] = editG_offD.fn(G[[i]])
G[[i]] = editG_D.fn(G[[i]])
names[[i]] = colnames(G[[i]])
}
idnames = as.numeric(c(names[[1]],names[[2]],names[[3]],names[[4]]))
pheno_MESA.postQC = pheno.noped.mex[which(pheno.noped.mex$sidno %in% idnames),]
dir.create(paste(commondir,"clean/stratified",sep=""))
## Warning in dir.create(paste(commondir, "clean/stratified", sep = "")): '/
## Users/Sudha/Documents/Projects/scratch/MESA/data/clean/stratified' already
## exists
names(G) = race
if(race.code==1)
save(G,pheno_MESA.postQC,file=paste(commondir,"clean/stratified/MESA_editedObjects.rda",sep=""))
Diagonals of edited G
load(paste(commondir,"clean/stratified/MESA_editedObjects.rda",sep=""))
par(mfrow=c(2,2))
for(i in 1:length(race))
plot(diag(G[[i]]), main=names(G)[i],ylab="diag(G)")
Off-diagonals of edited G
par(mfrow=c(2,2))
for(i in 1:length(race))
plot(lowerTriangle(G[[i]]), main=names(G)[i],ylab="LowerTri(G)",pch=".")
rm(G)
rm(pheno_MESA.postQC)
pheno.indicatorQC[,5] = ifelse(pheno$sidno %in% idnames,0,1)
if(race.code==1)
write.table(pheno.indicatorQC, file=paste(commondir,"clean/individuals_postQCIndicators.dat",sep=""),col.names=T,row.names=F,sep='\t')
rm(idnames)
rm(pheno.sim)
head(pheno.indicatorQC)
## Individual Pedigree Non-Mexican_Hispanics Missing_Race Nominally_related
## 1 10036 0 0 0 1
## 2 12987 0 0 0 1
## 3 10442 0 0 0 0
## 4 13434 0 0 0 0
## 5 12599 0 0 0 0
## 6 12425 0 0 0 0
## freqNA>5%
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Removed individuals: final
apply(pheno.indicatorQC[,2:ncol(pheno.indicatorQC)], 2, sum)
## Pedigree Non-Mexican_Hispanics Missing_Race
## 1350 1157 807
## Nominally_related freqNA>5%
## 3911 0
pheno.indicatorQC = read.table(paste(commondir,"clean/individuals_postQCIndicators.dat",sep=""),header=T, sep="\t")
pheno.sim = pheno[which(pheno.indicatorQC[,2]+pheno.indicatorQC[,3]+pheno.indicatorQC[,4]+pheno.indicatorQC[,5]+pheno.indicatorQC[,6]==0),]
head(pheno.sim)
## File dbGaP.SubjID sidno age4c agecat4c race1c mexican1
## 401 Exam4Main.HMB 252618 10442 51 1 1 0
## 3130 Exam4Main.HMB 252148 13434 52 1 1 0
## 2367 Exam4Main.HMB 252669 12599 64 2 1 0
## 2207 Exam4Main.HMB 251459 12425 85 5 1 0
## 3030 Exam4Main.HMB 246051 13319 77 4 1 0
## 3870 Exam4Main.HMB 247656 14245 55 2 1 0
## dominic1 puert1 cuban1 othhisp1 lang1 gender1 site4c month4 season4
## 401 0 0 0 0 1 0 7 12 4
## 3130 0 0 0 0 1 0 8 12 4
## 2367 0 0 0 0 1 0 4 12 4
## 2207 0 0 0 0 1 0 7 12 4
## 3030 0 0 0 0 1 1 5 6 2
## 3870 0 0 0 0 1 1 6 1 1
## htcm4 wtlb4 bmi4c bmicat4c ldl4 hdl4 trig4 chol4 race.char
## 401 170.6 145.0 22.59868 1 154 102 50 266 Whi
## 3130 169.8 140.8 22.15136 1 136 65 203 242 Whi
## 2367 164.9 167.0 27.85789 2 91 51 71 156 Whi
## 2207 149.0 122.0 24.92644 1 64 105 118 193 Whi
## 3030 167.4 186.5 30.18846 3 76 63 114 162 Whi
## 3870 181.4 205.5 28.32763 2 135 48 133 210 Whi
table(pheno.sim$race.char)
##
## Bla Chi Mex Whi
## 1028 183 369 1886
length(which(!pheno$sidno %in% pheno.sim$sidno))
## [1] 3911
Compute G (for across-cluster analyses)
if(getG.acrosscluster_flag==1){
dir.create(paste(commondir,"clean/across_cluster",sep=""))
G = getG.fn3(X,remove.snp.intersect,400,scaleCol=TRUE,verbose=TRUE)
pheno_MESA.postQC=pheno.noped.mex[which(pheno.noped.mex$sidno %in% colnames(G)),]
save(G,pheno_MESA.postQC,file=paste(commondir,"clean/across_cluster/MESA_editedObjects.rda",sep=""))
}