mQTL Analysis

Data set-up
Creating .fam
Creating .map files
Creating the Covariates file
Dosage code in Plink2
Results
List of Results
Next steps?

Data set-up

# Create three file folders for organizing the PRACTICAL 224 me-QTL data

# /panfs/panasas01/sscm/ca16591/practical_224/data
# /panfs/panasas01/sscm/ca16591/practical_224/results
# /panfs/panasas01/sscm/ca16591/practical_224/scripts

#' Move the 224 oncoarray genetic and phenotypic data into /panfs/panasas01/sscm/ca16591/practical_224/data
# Unzipped genetic data located originally at: 
# Z:\data\practical\genetic\variants\arrays\oncoarray\applications\224\released\2016-07-12\data\imputed
# Phenotype data originally locataed at: 
# Z:\data\practical\phenotypic\oncoarray\applications\224\released\2016-07-12\data

#' Save the phenotype data and a sample file in Z:/data/protect/_devs/PROTECT_Clinical/data (a drive I can access with R)

# Read the phenotype and sample file for chr01 into R

setwd("Z:/data/protect/_devs/PROTECT_Clinical/data")
oncoarray <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data.tsv", header=TRUE)
sample <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data_chr01.sample_order.txt", header=FALSE)
sample$Onc_ID <- sample$V1

#' Merge oncoarray and sample files to find the set of people in the phenotype (oncoarray) file for which we have genetic data and then re-order the oncoarray file in the same order as the genetic data (sample) -- this is necessary because the dosage files use the order of IDs in the sample file--it's PRACTICAL's matching mechanism. Use the "merge" and "match" commands

mrg <- merge(x = oncoarray, y = sample, by = c("Onc_ID"), all.y = TRUE)
ordered_pheno <- mrg[match(sample$Onc_ID, mrg$Onc_ID),]
dim(ordered_pheno)

## [1] 79658   117

table(ordered_pheno$CaCo)

## 
##     0     1 
## 31661 47995

#oncoarray[which(oncoarray$Onc_ID=="21433"),]
#ordered_pheno[which(ordered_pheno$Onc_ID=="21433"),]

#' Plink uses the 1/2 convention for control/case designation                                                           <- change 0/1 to 1/2 in the CaCa (case control variable)
ordered_pheno$PCa <- ifelse(ordered_pheno$CaCo==1,2,1)
table(ordered_pheno$PCa)

## 
##     1     2 
## 31661 47995

ordered_pheno$CaCo <- ordered_pheno$PCa

# Save ordered_pheno
write.csv(ordered_pheno, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")

Creating .fam

#' Open the ordered_pheno dataset in Excel and create the .fam file with these guidelines from Plink website:

# Family ID ('FID') <- Onc_ID
# Within-family ID ('IID'; cannot be '0') <- Onc_ID
# Within-family ID of father ('0' if father isn't in dataset) <- 0
# Within-family ID of mother ('0' if mother isn't in dataset) <-  0
# Sex code ('1' = male, '2' = female, '0' = unknown) <- 1
# Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)

ordered_pheno2 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")
attach(ordered_pheno2)
ordered_pheno2$FID <- ordered_pheno2$Onc_ID
ordered_pheno2$IID <- ordered_pheno2$Onc_ID
ordered_pheno2$f <- 0
ordered_pheno2$m <- 0
ordered_pheno2$sex <- 1
write.csv(ordered_pheno2, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")

myvars <- c("FID","IID","f","m","sex","CaCo")
fam <- ordered_pheno2[myvars]
# head(fam)
# table(fam$CaCo)
# table(ordered_pheno2$CaCo)

# Remove the header (optional as have to open Excel to remove extra column shortly...)
# names(fam) <- NULL

# Save and open in Excel 

write.csv(fam, "Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")

#' Open in Excel and remove the extra first column generated by R and the blank first row,                             then save. 

fam <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")
dim(fam)

## [1] 79658     7

#' Use WinSCP to move fam from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data

Creating .map files

# Make map.sh in notepadd++ with the following code:

#!/bin/bash
#PBS -l nodes=1:ppn=1,walltime=1:00:00
#PBS -N 224_maps
#cd "/panfs/panasas01/sscm/ca16591/practical_224"

awk '{print 1,$1,0,$3}' ~/practical_224/data/data_chr01.dosages.txt > ~/practical_224/data/chr01.map

awk '{print 2,$1,0,$3}' ~/practical_224/data/data_chr02.dosages.txt > ~/practical_224/data/chr02.map

awk '{print 3,$1,0,$3}' ~/practical_224/data/data_chr03.dosages.txt > ~/practical_224/data/chr03.map

awk '{print 4,$1,0,$3}' ~/practical_224/data/data_chr04.dosages.txt > ~/practical_224/data/chr04.map

awk '{print 5,$1,0,$3}' ~/practical_224/data/data_chr05.dosages.txt > ~/practical_224/data/chr05.map

awk '{print 6,$1,0,$3}' ~/practical_224/data/data_chr06.dosages.txt > ~/practical_224/data/chr06.map

awk '{print 7,$1,0,$3}' ~/practical_224/data/data_chr07.dosages.txt > ~/practical_224/data/chr07.map

awk '{print 8,$1,0,$3}' ~/practical_224/data/data_chr08.dosages.txt > ~/practical_224/data/chr08.map

awk '{print 9,$1,0,$3}' ~/practical_224/data/data_chr09.dosages.txt > ~/practical_224/data/chr09.map

awk '{print 10,$1,0,$3}' ~/practical_224/data/data_chr10.dosages.txt > ~/practical_224/data/chr10.map

awk '{print 11,$1,0,$3}' ~/practical_224/data/data_chr11.dosages.txt > ~/practical_224/data/chr11.map

awk '{print 12,$1,0,$3}' ~/practical_224/data/data_chr12.dosages.txt > ~/practical_224/data/chr12.map

awk '{print 13,$1,0,$3}' ~/practical_224/data/data_chr13.dosages.txt > ~/practical_224/data/chr13.map

awk '{print 14,$1,0,$3}' ~/practical_224/data/data_chr14.dosages.txt > ~/practical_224/data/chr14.map

awk '{print 15,$1,0,$3}' ~/practical_224/data/data_chr15.dosages.txt > ~/practical_224/data/chr15.map

awk '{print 16,$1,0,$3}' ~/practical_224/data/data_chr16.dosages.txt > ~/practical_224/data/chr16.map

awk '{print 17,$1,0,$3}' ~/practical_224/data/data_chr17.dosages.txt > ~/practical_224/data/chr17.map

awk '{print 18,$1,0,$3}' ~/practical_224/data/data_chr18.dosages.txt > ~/practical_224/data/chr18.map

awk '{print 19,$1,0,$3}' ~/practical_224/data/data_chr19.dosages.txt > ~/practical_224/data/chr19.map

awk '{print 20,$1,0,$3}' ~/practical_224/data/data_chr20.dosages.txt > ~/practical_224/data/chr20.map

awk '{print 21,$1,0,$3}' ~/practical_224/data/data_chr21.dosages.txt > ~/practical_224/data/chr21.map

awk '{print 22,$1,0,$3}' ~/practical_224/data/data_chr22.dosages.txt > ~/practical_224/data/chr22.map

# Use the qsub command to run map.sh: qsub map.sh

Creating the Covariates file

ordered_pheno3 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")
attach(ordered_pheno3)
tail(ordered_pheno3)
myvars <- c("FID","IID","CaCo","pc1_euro","pc2_euro","pc3_euro","pc4_euro", "pc5_euro", "pc6_euro", "pc7_euro")
pca_covars <- ordered_pheno3[myvars]
head(pca_covars)

# Save as .txt and open in Excel to remove the additional column added by R, then resave.

write.csv(pca_covars, "Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv")

# Read back in here to look at it (optional)

# pca_covars <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv", header=FALSE)

#' Use WinSCP to move pca_covars (saved in Excel without extra column) from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data

Dosage code in Plink2

#' Create script to loop through the associations for all the chromosomes                                              and output the data into "results" folder

#!/bin/bash
 
#PBS -N pca_GWAS_224
#PBS -o pca_GWAS_224-output
#PBS -e pca_GWAS_224-error
#PBS -t 1-22
#PBS -l walltime=12:00:00
#PBS -l nodes=1:ppn=2
#PBS -S /bin/bash
 
set -e
 
echo "Running on ${HOSTNAME}"

if [ -n "${1}" ]; then
  echo "${1}"
  PBS_ARRAYID=${1}
fi
 
i=${PBS_ARRAYID}
ii=`printf "%02d" ${i}`

module load apps/plink2

#' The dosage command doesn't permit any missing data and didn't take categorical data, so I only adjusted for PCAs in the corvariates file. 

plink --fam ~/practical_224/data/fam_final.txt 
--map ~/practical_224/data/chr${ii}.map 
--covar ~/practical_224/data/pca_covars.txt 
--covar-number 2-8 
--dosage ~/practical_224/data/data_chr${ii}.dosages.txt noheader skip1=2 format=1 
--out ~/practical_224/results2/pca.full${ii}.txt

#' Use WinSCP to move pca_trial_224.sh (saved in Excel without extra column)                                            from Z:/data/protect/_devs/PROTECT_Clinical                                                                             to /panfs/panasas01/sscm/ca16591/practical_224/scripts

#' Within /panfs/panasas01/sscm/ca16591/practical_224/scripts, run qsub command with pca_trial_244.sh: qsub pca_trial_224.sh 

# Use qstat to check the job: qstat -u ca16591

Results

List of Results

#' Subset headers so that they print nicely and print out                                                               a sorted list of results that are matched on base position                                                                (i.e., are within both our results and the me-QTL list)

myvars <- c("CHR", "SNP", "BP", "OR", "SE", "P")
results <- BP_start[myvars]
head(results, n=50)

##         CHR                     SNP       BP     OR     SE          P
## 1245081  10      chr10_51549496_C_T 51549496 0.7878 0.0105 8.405e-115
## 1864183  17 rs10908278:36099952:T:A 36099952 1.2456 0.0104  7.488e-99
## 1864194  17              rs11263763 36103565 1.2442 0.0104  1.828e-98
## 1864190  17              rs11651052 36102381 1.2420 0.0104  7.378e-97
## 1864182  17              rs11651755 36099840 1.2412 0.0104  8.892e-97
## 1864186  17               rs8064454 36101586 1.2419 0.0104  9.211e-97
## 1864180  17  rs4430796:36098040:G:A 36098040 1.2371 0.0104  3.103e-92
## 1864179  17 rs11263761:36097775:G:A 36097775 1.2374 0.0105  7.121e-92
## 1245055  10  rs7911198:51537475:G:A 51537475 0.8154 0.0103  2.424e-87
## 1245018  10  rs3101227:51520203:C:A 51520203 0.8150 0.0103  3.437e-87
## 1245024  10  rs7077830:51522276:C:G 51522276 1.2266 0.0103  5.172e-87
## 1244992  10 rs10825587:51509258:C:T 51509258 1.2260 0.0103  7.013e-87
## 1244994  10 rs61847060:51510203:G:A 51510203 1.2256 0.0103  1.096e-86
## 1245023  10  rs7088225:51522120:T:C 51522120 1.2256 0.0103  1.179e-86
## 1244971  10  rs7071471:51503335:C:T 51503335 1.2256 0.0103  1.180e-86
## 1244978  10  rs2337711:51504604:C:T 51504604 1.2255 0.0103  1.523e-86
## 1245026  10  rs2843554:51523861:G:T 51523861 0.8161 0.0103  1.638e-86
## 1244993  10 rs10763256:51509408:C:A 51509408 1.2253 0.0103  1.651e-86
## 1245019  10      chr10_51521247_A_C 51521247 0.8161 0.0103  1.858e-86
## 1245022  10  rs2843551:51521945:C:A 51521945 0.8161 0.0103  1.861e-86
## 1245020  10      chr10_51521452_C_T 51521452 0.8161 0.0103  1.922e-86
## 1244996  10      chr10_51510761_A_G 51510761 1.2250 0.0103  2.037e-86
## 1244997  10      chr10_51510813_A_C 51510813 1.2249 0.0103  2.468e-86
## 1245067  10 rs11006274:51540291:T:C 51540291 0.8164 0.0103  3.003e-86
## 1245003  10      chr10_51515457_A_G 51515457 0.8164 0.0103  3.264e-86
## 1245070  10  rs4512771:51540906:C:A 51540906 0.8161 0.0103  3.273e-86
## 1244963  10 rs67289834:51501304:C:T 51501304 1.2250 0.0103  3.754e-86
## 1245031  10  rs7081532:51526093:A:G 51526093 0.8165 0.0103  3.940e-86
## 1245029  10               rs3123078 51524971 0.8165 0.0103  4.139e-86
## 1245057  10      chr10_51538169_A_C 51538169 1.2246 0.0103  4.623e-86
## 1245030  10      chr10_51525699_C_G 51525699 0.8166 0.0103  5.437e-86
## 1245028  10      chr10_51524889_A_G 51524889 0.8166 0.0103  5.847e-86
## 1245016  10  rs2843547:51519358:A:G 51519358 0.8162 0.0103  6.389e-86
## 1245048  10 rs10763538:51535857:A:G 51535857 0.8167 0.0103  6.563e-86
## 1245032  10  rs7915602:51528656:A:G 51528656 0.8168 0.0103  7.848e-86
## 1245034  10  rs7896437:51529593:A:C 51529593 0.8168 0.0103  8.760e-86
## 1245040  10  rs4486572:51531805:A:G 51531805 0.8168 0.0103  1.035e-85
## 1245047  10 rs10763536:51535801:G:A 51535801 0.8170 0.0103  1.128e-85
## 1245041  10  rs4581397:51532367:A:G 51532367 0.8170 0.0103  1.404e-85
## 1245078  10 rs11006462:51545813:G:A 51545813 0.8178 0.0103  5.665e-85
## 1245009  10  rs2611508:51518047:T:A 51518047 0.8171 0.0103  7.204e-85
## 1245074  10  rs7075009:51544143:T:G 51544143 0.8181 0.0103  8.105e-85
## 1245075  10      chr10_51544475_C_T 51544475 0.8182 0.0103  9.281e-85
## 1245008  10  rs2926494:51517356:T:C 51517356 0.8175 0.0103  9.950e-85
## 1245021  10      chr10_51521684_G_T 51521684 0.8183 0.0103  1.536e-84
## 1245014  10  rs2611504:51518910:G:A 51518910 0.8184 0.0103  2.466e-84
## 1245027  10  rs2843555:51524274:C:T 51524274 0.8185 0.0103  2.611e-84
## 1245025  10      chr10_51523213_A_T 51523213 0.8185 0.0103  2.626e-84
## 1245011  10  rs2843546:51518688:A:C 51518688 0.8184 0.0103  2.661e-84
## 1245069  10  rs4630243:51540867:T:C 51540867 0.8184 0.0103  2.908e-84

# write.csv(BP_start, "Z:/data/protect/_devs/PROTECT_Clinical/data/BP_start_08-02-2017.csv")
# write.csv(results,"Z:/data/protect/_devs/PROTECT_Clinical/data/08-02-2017_pca_risk_results.csv" )

Next steps?

#' Subset ordered_pheno2 by study center and create 50 new .fam and 50 new covariate files, one each for each study
# Run the analysis for each study
# Meta-analyze to identify our list of me-QTLs associated with prostate cancer risk
# Then, perform similar analyses to identify me-QTLs associated with stage and grade
#' In separate datasets, perform EWAS for PCa (risk and progression) and EWAS for PCa risk factors, then perform MR with the PRACITCAL results to see if methylation variation is causual.

mQTL Analysis

Charleen D. Adams

13 February 2017

Data set-up

Creating .fam

Creating .map files

Creating the Covariates file

Dosage code in Plink2

Results

List of Results

Next steps?