Data set-up
# Create three file folders for organizing the PRACTICAL 224 me-QTL data
# /panfs/panasas01/sscm/ca16591/practical_224/data
# /panfs/panasas01/sscm/ca16591/practical_224/results
# /panfs/panasas01/sscm/ca16591/practical_224/scripts
#' Move the 224 oncoarray genetic and phenotypic data into /panfs/panasas01/sscm/ca16591/practical_224/data
# Unzipped genetic data located originally at:
# Z:\data\practical\genetic\variants\arrays\oncoarray\applications\224\released\2016-07-12\data\imputed
# Phenotype data originally locataed at:
# Z:\data\practical\phenotypic\oncoarray\applications\224\released\2016-07-12\data
#' Save the phenotype data and a sample file in Z:/data/protect/_devs/PROTECT_Clinical/data (a drive I can access with R)
# Read the phenotype and sample file for chr01 into R
setwd("Z:/data/protect/_devs/PROTECT_Clinical/data")
oncoarray <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data.tsv", header=TRUE)
sample <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data_chr01.sample_order.txt", header=FALSE)
sample$Onc_ID <- sample$V1
#' Merge oncoarray and sample files to find the set of people in the phenotype (oncoarray) file for which we have genetic data and then re-order the oncoarray file in the same order as the genetic data (sample) -- this is necessary because the dosage files use the order of IDs in the sample file--it's PRACTICAL's matching mechanism. Use the "merge" and "match" commands
mrg <- merge(x = oncoarray, y = sample, by = c("Onc_ID"), all.y = TRUE)
ordered_pheno <- mrg[match(sample$Onc_ID, mrg$Onc_ID),]
dim(ordered_pheno)
## [1] 79658 117
table(ordered_pheno$CaCo)
##
## 0 1
## 31661 47995
#oncoarray[which(oncoarray$Onc_ID=="21433"),]
#ordered_pheno[which(ordered_pheno$Onc_ID=="21433"),]
#' Plink uses the 1/2 convention for control/case designation <- change 0/1 to 1/2 in the CaCa (case control variable)
ordered_pheno$PCa <- ifelse(ordered_pheno$CaCo==1,2,1)
table(ordered_pheno$PCa)
##
## 1 2
## 31661 47995
ordered_pheno$CaCo <- ordered_pheno$PCa
# Save ordered_pheno
write.csv(ordered_pheno, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")
Creating .fam
#' Open the ordered_pheno dataset in Excel and create the .fam file with these guidelines from Plink website:
# Family ID ('FID') <- Onc_ID
# Within-family ID ('IID'; cannot be '0') <- Onc_ID
# Within-family ID of father ('0' if father isn't in dataset) <- 0
# Within-family ID of mother ('0' if mother isn't in dataset) <- 0
# Sex code ('1' = male, '2' = female, '0' = unknown) <- 1
# Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
ordered_pheno2 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")
attach(ordered_pheno2)
ordered_pheno2$FID <- ordered_pheno2$Onc_ID
ordered_pheno2$IID <- ordered_pheno2$Onc_ID
ordered_pheno2$f <- 0
ordered_pheno2$m <- 0
ordered_pheno2$sex <- 1
write.csv(ordered_pheno2, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")
myvars <- c("FID","IID","f","m","sex","CaCo")
fam <- ordered_pheno2[myvars]
# head(fam)
# table(fam$CaCo)
# table(ordered_pheno2$CaCo)
# Remove the header (optional as have to open Excel to remove extra column shortly...)
# names(fam) <- NULL
# Save and open in Excel
write.csv(fam, "Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")
#' Open in Excel and remove the extra first column generated by R and the blank first row, then save.
fam <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")
dim(fam)
## [1] 79658 7
#' Use WinSCP to move fam from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data
Creating .map files
# Make map.sh in notepadd++ with the following code:
#!/bin/bash
#PBS -l nodes=1:ppn=1,walltime=1:00:00
#PBS -N 224_maps
#cd "/panfs/panasas01/sscm/ca16591/practical_224"
awk '{print 1,$1,0,$3}' ~/practical_224/data/data_chr01.dosages.txt > ~/practical_224/data/chr01.map
awk '{print 2,$1,0,$3}' ~/practical_224/data/data_chr02.dosages.txt > ~/practical_224/data/chr02.map
awk '{print 3,$1,0,$3}' ~/practical_224/data/data_chr03.dosages.txt > ~/practical_224/data/chr03.map
awk '{print 4,$1,0,$3}' ~/practical_224/data/data_chr04.dosages.txt > ~/practical_224/data/chr04.map
awk '{print 5,$1,0,$3}' ~/practical_224/data/data_chr05.dosages.txt > ~/practical_224/data/chr05.map
awk '{print 6,$1,0,$3}' ~/practical_224/data/data_chr06.dosages.txt > ~/practical_224/data/chr06.map
awk '{print 7,$1,0,$3}' ~/practical_224/data/data_chr07.dosages.txt > ~/practical_224/data/chr07.map
awk '{print 8,$1,0,$3}' ~/practical_224/data/data_chr08.dosages.txt > ~/practical_224/data/chr08.map
awk '{print 9,$1,0,$3}' ~/practical_224/data/data_chr09.dosages.txt > ~/practical_224/data/chr09.map
awk '{print 10,$1,0,$3}' ~/practical_224/data/data_chr10.dosages.txt > ~/practical_224/data/chr10.map
awk '{print 11,$1,0,$3}' ~/practical_224/data/data_chr11.dosages.txt > ~/practical_224/data/chr11.map
awk '{print 12,$1,0,$3}' ~/practical_224/data/data_chr12.dosages.txt > ~/practical_224/data/chr12.map
awk '{print 13,$1,0,$3}' ~/practical_224/data/data_chr13.dosages.txt > ~/practical_224/data/chr13.map
awk '{print 14,$1,0,$3}' ~/practical_224/data/data_chr14.dosages.txt > ~/practical_224/data/chr14.map
awk '{print 15,$1,0,$3}' ~/practical_224/data/data_chr15.dosages.txt > ~/practical_224/data/chr15.map
awk '{print 16,$1,0,$3}' ~/practical_224/data/data_chr16.dosages.txt > ~/practical_224/data/chr16.map
awk '{print 17,$1,0,$3}' ~/practical_224/data/data_chr17.dosages.txt > ~/practical_224/data/chr17.map
awk '{print 18,$1,0,$3}' ~/practical_224/data/data_chr18.dosages.txt > ~/practical_224/data/chr18.map
awk '{print 19,$1,0,$3}' ~/practical_224/data/data_chr19.dosages.txt > ~/practical_224/data/chr19.map
awk '{print 20,$1,0,$3}' ~/practical_224/data/data_chr20.dosages.txt > ~/practical_224/data/chr20.map
awk '{print 21,$1,0,$3}' ~/practical_224/data/data_chr21.dosages.txt > ~/practical_224/data/chr21.map
awk '{print 22,$1,0,$3}' ~/practical_224/data/data_chr22.dosages.txt > ~/practical_224/data/chr22.map
# Use the qsub command to run map.sh: qsub map.sh
Creating the Covariates file
ordered_pheno3 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")
attach(ordered_pheno3)
tail(ordered_pheno3)
myvars <- c("FID","IID","CaCo","pc1_euro","pc2_euro","pc3_euro","pc4_euro", "pc5_euro", "pc6_euro", "pc7_euro")
pca_covars <- ordered_pheno3[myvars]
head(pca_covars)
# Save as .txt and open in Excel to remove the additional column added by R, then resave.
write.csv(pca_covars, "Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv")
# Read back in here to look at it (optional)
# pca_covars <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv", header=FALSE)
#' Use WinSCP to move pca_covars (saved in Excel without extra column) from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data
Dosage code in Plink2
#' Create script to loop through the associations for all the chromosomes and output the data into "results" folder
#!/bin/bash
#PBS -N pca_GWAS_224
#PBS -o pca_GWAS_224-output
#PBS -e pca_GWAS_224-error
#PBS -t 1-22
#PBS -l walltime=12:00:00
#PBS -l nodes=1:ppn=2
#PBS -S /bin/bash
set -e
echo "Running on ${HOSTNAME}"
if [ -n "${1}" ]; then
echo "${1}"
PBS_ARRAYID=${1}
fi
i=${PBS_ARRAYID}
ii=`printf "%02d" ${i}`
module load apps/plink2
#' The dosage command doesn't permit any missing data and didn't take categorical data, so I only adjusted for PCAs in the corvariates file.
plink --fam ~/practical_224/data/fam_final.txt
--map ~/practical_224/data/chr${ii}.map
--covar ~/practical_224/data/pca_covars.txt
--covar-number 2-8
--dosage ~/practical_224/data/data_chr${ii}.dosages.txt noheader skip1=2 format=1
--out ~/practical_224/results2/pca.full${ii}.txt
#' Use WinSCP to move pca_trial_224.sh (saved in Excel without extra column) from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/scripts
#' Within /panfs/panasas01/sscm/ca16591/practical_224/scripts, run qsub command with pca_trial_244.sh: qsub pca_trial_224.sh
# Use qstat to check the job: qstat -u ca16591
Results
List of Results
#' Subset headers so that they print nicely and print out a sorted list of results that are matched on base position (i.e., are within both our results and the me-QTL list)
myvars <- c("CHR", "SNP", "BP", "OR", "SE", "P")
results <- BP_start[myvars]
head(results, n=50)
## CHR SNP BP OR SE P
## 1245081 10 chr10_51549496_C_T 51549496 0.7878 0.0105 8.405e-115
## 1864183 17 rs10908278:36099952:T:A 36099952 1.2456 0.0104 7.488e-99
## 1864194 17 rs11263763 36103565 1.2442 0.0104 1.828e-98
## 1864190 17 rs11651052 36102381 1.2420 0.0104 7.378e-97
## 1864182 17 rs11651755 36099840 1.2412 0.0104 8.892e-97
## 1864186 17 rs8064454 36101586 1.2419 0.0104 9.211e-97
## 1864180 17 rs4430796:36098040:G:A 36098040 1.2371 0.0104 3.103e-92
## 1864179 17 rs11263761:36097775:G:A 36097775 1.2374 0.0105 7.121e-92
## 1245055 10 rs7911198:51537475:G:A 51537475 0.8154 0.0103 2.424e-87
## 1245018 10 rs3101227:51520203:C:A 51520203 0.8150 0.0103 3.437e-87
## 1245024 10 rs7077830:51522276:C:G 51522276 1.2266 0.0103 5.172e-87
## 1244992 10 rs10825587:51509258:C:T 51509258 1.2260 0.0103 7.013e-87
## 1244994 10 rs61847060:51510203:G:A 51510203 1.2256 0.0103 1.096e-86
## 1245023 10 rs7088225:51522120:T:C 51522120 1.2256 0.0103 1.179e-86
## 1244971 10 rs7071471:51503335:C:T 51503335 1.2256 0.0103 1.180e-86
## 1244978 10 rs2337711:51504604:C:T 51504604 1.2255 0.0103 1.523e-86
## 1245026 10 rs2843554:51523861:G:T 51523861 0.8161 0.0103 1.638e-86
## 1244993 10 rs10763256:51509408:C:A 51509408 1.2253 0.0103 1.651e-86
## 1245019 10 chr10_51521247_A_C 51521247 0.8161 0.0103 1.858e-86
## 1245022 10 rs2843551:51521945:C:A 51521945 0.8161 0.0103 1.861e-86
## 1245020 10 chr10_51521452_C_T 51521452 0.8161 0.0103 1.922e-86
## 1244996 10 chr10_51510761_A_G 51510761 1.2250 0.0103 2.037e-86
## 1244997 10 chr10_51510813_A_C 51510813 1.2249 0.0103 2.468e-86
## 1245067 10 rs11006274:51540291:T:C 51540291 0.8164 0.0103 3.003e-86
## 1245003 10 chr10_51515457_A_G 51515457 0.8164 0.0103 3.264e-86
## 1245070 10 rs4512771:51540906:C:A 51540906 0.8161 0.0103 3.273e-86
## 1244963 10 rs67289834:51501304:C:T 51501304 1.2250 0.0103 3.754e-86
## 1245031 10 rs7081532:51526093:A:G 51526093 0.8165 0.0103 3.940e-86
## 1245029 10 rs3123078 51524971 0.8165 0.0103 4.139e-86
## 1245057 10 chr10_51538169_A_C 51538169 1.2246 0.0103 4.623e-86
## 1245030 10 chr10_51525699_C_G 51525699 0.8166 0.0103 5.437e-86
## 1245028 10 chr10_51524889_A_G 51524889 0.8166 0.0103 5.847e-86
## 1245016 10 rs2843547:51519358:A:G 51519358 0.8162 0.0103 6.389e-86
## 1245048 10 rs10763538:51535857:A:G 51535857 0.8167 0.0103 6.563e-86
## 1245032 10 rs7915602:51528656:A:G 51528656 0.8168 0.0103 7.848e-86
## 1245034 10 rs7896437:51529593:A:C 51529593 0.8168 0.0103 8.760e-86
## 1245040 10 rs4486572:51531805:A:G 51531805 0.8168 0.0103 1.035e-85
## 1245047 10 rs10763536:51535801:G:A 51535801 0.8170 0.0103 1.128e-85
## 1245041 10 rs4581397:51532367:A:G 51532367 0.8170 0.0103 1.404e-85
## 1245078 10 rs11006462:51545813:G:A 51545813 0.8178 0.0103 5.665e-85
## 1245009 10 rs2611508:51518047:T:A 51518047 0.8171 0.0103 7.204e-85
## 1245074 10 rs7075009:51544143:T:G 51544143 0.8181 0.0103 8.105e-85
## 1245075 10 chr10_51544475_C_T 51544475 0.8182 0.0103 9.281e-85
## 1245008 10 rs2926494:51517356:T:C 51517356 0.8175 0.0103 9.950e-85
## 1245021 10 chr10_51521684_G_T 51521684 0.8183 0.0103 1.536e-84
## 1245014 10 rs2611504:51518910:G:A 51518910 0.8184 0.0103 2.466e-84
## 1245027 10 rs2843555:51524274:C:T 51524274 0.8185 0.0103 2.611e-84
## 1245025 10 chr10_51523213_A_T 51523213 0.8185 0.0103 2.626e-84
## 1245011 10 rs2843546:51518688:A:C 51518688 0.8184 0.0103 2.661e-84
## 1245069 10 rs4630243:51540867:T:C 51540867 0.8184 0.0103 2.908e-84
# write.csv(BP_start, "Z:/data/protect/_devs/PROTECT_Clinical/data/BP_start_08-02-2017.csv")
# write.csv(results,"Z:/data/protect/_devs/PROTECT_Clinical/data/08-02-2017_pca_risk_results.csv" )
Next steps?
#' Subset ordered_pheno2 by study center and create 50 new .fam and 50 new covariate files, one each for each study
# Run the analysis for each study
# Meta-analyze to identify our list of me-QTLs associated with prostate cancer risk
# Then, perform similar analyses to identify me-QTLs associated with stage and grade
#' In separate datasets, perform EWAS for PCa (risk and progression) and EWAS for PCa risk factors, then perform MR with the PRACITCAL results to see if methylation variation is causual.