Data set-up

# Create three file folders for organizing the PRACTICAL 224 me-QTL data

# /panfs/panasas01/sscm/ca16591/practical_224/data
# /panfs/panasas01/sscm/ca16591/practical_224/results
# /panfs/panasas01/sscm/ca16591/practical_224/scripts

#' Move the 224 oncoarray genetic and phenotypic data into /panfs/panasas01/sscm/ca16591/practical_224/data
# Unzipped genetic data located originally at: 
# Z:\data\practical\genetic\variants\arrays\oncoarray\applications\224\released\2016-07-12\data\imputed
# Phenotype data originally locataed at: 
# Z:\data\practical\phenotypic\oncoarray\applications\224\released\2016-07-12\data

#' Save the phenotype data and a sample file in Z:/data/protect/_devs/PROTECT_Clinical/data (a drive I can access with R)

# Read the phenotype and sample file for chr01 into R

setwd("Z:/data/protect/_devs/PROTECT_Clinical/data")
oncoarray <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data.tsv", header=TRUE)
sample <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/data_chr01.sample_order.txt", header=FALSE)
sample$Onc_ID <- sample$V1

#' Merge oncoarray and sample files to find the set of people in the phenotype (oncoarray) file for which we have genetic data and then re-order the oncoarray file in the same order as the genetic data (sample) -- this is necessary because the dosage files use the order of IDs in the sample file--it's PRACTICAL's matching mechanism. Use the "merge" and "match" commands

mrg <- merge(x = oncoarray, y = sample, by = c("Onc_ID"), all.y = TRUE)
ordered_pheno <- mrg[match(sample$Onc_ID, mrg$Onc_ID),]
dim(ordered_pheno)
## [1] 79658   117
table(ordered_pheno$CaCo)
## 
##     0     1 
## 31661 47995
#oncoarray[which(oncoarray$Onc_ID=="21433"),]
#ordered_pheno[which(ordered_pheno$Onc_ID=="21433"),]

#' Plink uses the 1/2 convention for control/case designation                                                           <- change 0/1 to 1/2 in the CaCa (case control variable)
ordered_pheno$PCa <- ifelse(ordered_pheno$CaCo==1,2,1)
table(ordered_pheno$PCa)
## 
##     1     2 
## 31661 47995
ordered_pheno$CaCo <- ordered_pheno$PCa

# Save ordered_pheno
write.csv(ordered_pheno, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")

Creating .fam

#' Open the ordered_pheno dataset in Excel and create the .fam file with these guidelines from Plink website:

# Family ID ('FID') <- Onc_ID
# Within-family ID ('IID'; cannot be '0') <- Onc_ID
# Within-family ID of father ('0' if father isn't in dataset) <- 0
# Within-family ID of mother ('0' if mother isn't in dataset) <-  0
# Sex code ('1' = male, '2' = female, '0' = unknown) <- 1
# Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)

ordered_pheno2 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno2.txt")
attach(ordered_pheno2)
ordered_pheno2$FID <- ordered_pheno2$Onc_ID
ordered_pheno2$IID <- ordered_pheno2$Onc_ID
ordered_pheno2$f <- 0
ordered_pheno2$m <- 0
ordered_pheno2$sex <- 1
write.csv(ordered_pheno2, "Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")

myvars <- c("FID","IID","f","m","sex","CaCo")
fam <- ordered_pheno2[myvars]
# head(fam)
# table(fam$CaCo)
# table(ordered_pheno2$CaCo)

# Remove the header (optional as have to open Excel to remove extra column shortly...)
# names(fam) <- NULL

# Save and open in Excel 

write.csv(fam, "Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")

#' Open in Excel and remove the extra first column generated by R and the blank first row,                             then save. 

fam <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/fam.csv")
dim(fam)
## [1] 79658     7
#' Use WinSCP to move fam from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data

Creating .map files

# Make map.sh in notepadd++ with the following code:

#!/bin/bash
#PBS -l nodes=1:ppn=1,walltime=1:00:00
#PBS -N 224_maps
#cd "/panfs/panasas01/sscm/ca16591/practical_224"

awk '{print 1,$1,0,$3}' ~/practical_224/data/data_chr01.dosages.txt > ~/practical_224/data/chr01.map

awk '{print 2,$1,0,$3}' ~/practical_224/data/data_chr02.dosages.txt > ~/practical_224/data/chr02.map

awk '{print 3,$1,0,$3}' ~/practical_224/data/data_chr03.dosages.txt > ~/practical_224/data/chr03.map

awk '{print 4,$1,0,$3}' ~/practical_224/data/data_chr04.dosages.txt > ~/practical_224/data/chr04.map

awk '{print 5,$1,0,$3}' ~/practical_224/data/data_chr05.dosages.txt > ~/practical_224/data/chr05.map

awk '{print 6,$1,0,$3}' ~/practical_224/data/data_chr06.dosages.txt > ~/practical_224/data/chr06.map

awk '{print 7,$1,0,$3}' ~/practical_224/data/data_chr07.dosages.txt > ~/practical_224/data/chr07.map

awk '{print 8,$1,0,$3}' ~/practical_224/data/data_chr08.dosages.txt > ~/practical_224/data/chr08.map

awk '{print 9,$1,0,$3}' ~/practical_224/data/data_chr09.dosages.txt > ~/practical_224/data/chr09.map

awk '{print 10,$1,0,$3}' ~/practical_224/data/data_chr10.dosages.txt > ~/practical_224/data/chr10.map

awk '{print 11,$1,0,$3}' ~/practical_224/data/data_chr11.dosages.txt > ~/practical_224/data/chr11.map

awk '{print 12,$1,0,$3}' ~/practical_224/data/data_chr12.dosages.txt > ~/practical_224/data/chr12.map

awk '{print 13,$1,0,$3}' ~/practical_224/data/data_chr13.dosages.txt > ~/practical_224/data/chr13.map

awk '{print 14,$1,0,$3}' ~/practical_224/data/data_chr14.dosages.txt > ~/practical_224/data/chr14.map

awk '{print 15,$1,0,$3}' ~/practical_224/data/data_chr15.dosages.txt > ~/practical_224/data/chr15.map

awk '{print 16,$1,0,$3}' ~/practical_224/data/data_chr16.dosages.txt > ~/practical_224/data/chr16.map

awk '{print 17,$1,0,$3}' ~/practical_224/data/data_chr17.dosages.txt > ~/practical_224/data/chr17.map

awk '{print 18,$1,0,$3}' ~/practical_224/data/data_chr18.dosages.txt > ~/practical_224/data/chr18.map

awk '{print 19,$1,0,$3}' ~/practical_224/data/data_chr19.dosages.txt > ~/practical_224/data/chr19.map

awk '{print 20,$1,0,$3}' ~/practical_224/data/data_chr20.dosages.txt > ~/practical_224/data/chr20.map

awk '{print 21,$1,0,$3}' ~/practical_224/data/data_chr21.dosages.txt > ~/practical_224/data/chr21.map

awk '{print 22,$1,0,$3}' ~/practical_224/data/data_chr22.dosages.txt > ~/practical_224/data/chr22.map

# Use the qsub command to run map.sh: qsub map.sh

Creating the Covariates file

ordered_pheno3 <- read.csv("Z:/data/protect/_devs/PROTECT_Clinical/data/ordered_pheno3.txt")
attach(ordered_pheno3)
tail(ordered_pheno3)
myvars <- c("FID","IID","CaCo","pc1_euro","pc2_euro","pc3_euro","pc4_euro", "pc5_euro", "pc6_euro", "pc7_euro")
pca_covars <- ordered_pheno3[myvars]
head(pca_covars)

# Save as .txt and open in Excel to remove the additional column added by R, then resave.

write.csv(pca_covars, "Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv")

# Read back in here to look at it (optional)

# pca_covars <- read.delim("Z:/data/protect/_devs/PROTECT_Clinical/data/pca_covars.csv", header=FALSE)

#' Use WinSCP to move pca_covars (saved in Excel without extra column) from Z:/data/protect/_devs/PROTECT_Clinical to /panfs/panasas01/sscm/ca16591/practical_224/data

Dosage code in Plink2

#' Create script to loop through the associations for all the chromosomes                                              and output the data into "results" folder

#!/bin/bash
 
#PBS -N pca_GWAS_224
#PBS -o pca_GWAS_224-output
#PBS -e pca_GWAS_224-error
#PBS -t 1-22
#PBS -l walltime=12:00:00
#PBS -l nodes=1:ppn=2
#PBS -S /bin/bash
 
set -e
 
echo "Running on ${HOSTNAME}"

if [ -n "${1}" ]; then
  echo "${1}"
  PBS_ARRAYID=${1}
fi
 
i=${PBS_ARRAYID}
ii=`printf "%02d" ${i}`

module load apps/plink2

#' The dosage command doesn't permit any missing data and didn't take categorical data, so I only adjusted for PCAs in the corvariates file. 

plink --fam ~/practical_224/data/fam_final.txt 
--map ~/practical_224/data/chr${ii}.map 
--covar ~/practical_224/data/pca_covars.txt 
--covar-number 2-8 
--dosage ~/practical_224/data/data_chr${ii}.dosages.txt noheader skip1=2 format=1 
--out ~/practical_224/results2/pca.full${ii}.txt

#' Use WinSCP to move pca_trial_224.sh (saved in Excel without extra column)                                            from Z:/data/protect/_devs/PROTECT_Clinical                                                                             to /panfs/panasas01/sscm/ca16591/practical_224/scripts

#' Within /panfs/panasas01/sscm/ca16591/practical_224/scripts, run qsub command with pca_trial_244.sh: qsub pca_trial_224.sh 

# Use qstat to check the job: qstat -u ca16591

Results

List of Results

#' Subset headers so that they print nicely and print out                                                               a sorted list of results that are matched on base position                                                                (i.e., are within both our results and the me-QTL list)

myvars <- c("CHR", "SNP", "BP", "OR", "SE", "P")
results <- BP_start[myvars]
head(results, n=50)
##         CHR                     SNP       BP     OR     SE          P
## 1245081  10      chr10_51549496_C_T 51549496 0.7878 0.0105 8.405e-115
## 1864183  17 rs10908278:36099952:T:A 36099952 1.2456 0.0104  7.488e-99
## 1864194  17              rs11263763 36103565 1.2442 0.0104  1.828e-98
## 1864190  17              rs11651052 36102381 1.2420 0.0104  7.378e-97
## 1864182  17              rs11651755 36099840 1.2412 0.0104  8.892e-97
## 1864186  17               rs8064454 36101586 1.2419 0.0104  9.211e-97
## 1864180  17  rs4430796:36098040:G:A 36098040 1.2371 0.0104  3.103e-92
## 1864179  17 rs11263761:36097775:G:A 36097775 1.2374 0.0105  7.121e-92
## 1245055  10  rs7911198:51537475:G:A 51537475 0.8154 0.0103  2.424e-87
## 1245018  10  rs3101227:51520203:C:A 51520203 0.8150 0.0103  3.437e-87
## 1245024  10  rs7077830:51522276:C:G 51522276 1.2266 0.0103  5.172e-87
## 1244992  10 rs10825587:51509258:C:T 51509258 1.2260 0.0103  7.013e-87
## 1244994  10 rs61847060:51510203:G:A 51510203 1.2256 0.0103  1.096e-86
## 1245023  10  rs7088225:51522120:T:C 51522120 1.2256 0.0103  1.179e-86
## 1244971  10  rs7071471:51503335:C:T 51503335 1.2256 0.0103  1.180e-86
## 1244978  10  rs2337711:51504604:C:T 51504604 1.2255 0.0103  1.523e-86
## 1245026  10  rs2843554:51523861:G:T 51523861 0.8161 0.0103  1.638e-86
## 1244993  10 rs10763256:51509408:C:A 51509408 1.2253 0.0103  1.651e-86
## 1245019  10      chr10_51521247_A_C 51521247 0.8161 0.0103  1.858e-86
## 1245022  10  rs2843551:51521945:C:A 51521945 0.8161 0.0103  1.861e-86
## 1245020  10      chr10_51521452_C_T 51521452 0.8161 0.0103  1.922e-86
## 1244996  10      chr10_51510761_A_G 51510761 1.2250 0.0103  2.037e-86
## 1244997  10      chr10_51510813_A_C 51510813 1.2249 0.0103  2.468e-86
## 1245067  10 rs11006274:51540291:T:C 51540291 0.8164 0.0103  3.003e-86
## 1245003  10      chr10_51515457_A_G 51515457 0.8164 0.0103  3.264e-86
## 1245070  10  rs4512771:51540906:C:A 51540906 0.8161 0.0103  3.273e-86
## 1244963  10 rs67289834:51501304:C:T 51501304 1.2250 0.0103  3.754e-86
## 1245031  10  rs7081532:51526093:A:G 51526093 0.8165 0.0103  3.940e-86
## 1245029  10               rs3123078 51524971 0.8165 0.0103  4.139e-86
## 1245057  10      chr10_51538169_A_C 51538169 1.2246 0.0103  4.623e-86
## 1245030  10      chr10_51525699_C_G 51525699 0.8166 0.0103  5.437e-86
## 1245028  10      chr10_51524889_A_G 51524889 0.8166 0.0103  5.847e-86
## 1245016  10  rs2843547:51519358:A:G 51519358 0.8162 0.0103  6.389e-86
## 1245048  10 rs10763538:51535857:A:G 51535857 0.8167 0.0103  6.563e-86
## 1245032  10  rs7915602:51528656:A:G 51528656 0.8168 0.0103  7.848e-86
## 1245034  10  rs7896437:51529593:A:C 51529593 0.8168 0.0103  8.760e-86
## 1245040  10  rs4486572:51531805:A:G 51531805 0.8168 0.0103  1.035e-85
## 1245047  10 rs10763536:51535801:G:A 51535801 0.8170 0.0103  1.128e-85
## 1245041  10  rs4581397:51532367:A:G 51532367 0.8170 0.0103  1.404e-85
## 1245078  10 rs11006462:51545813:G:A 51545813 0.8178 0.0103  5.665e-85
## 1245009  10  rs2611508:51518047:T:A 51518047 0.8171 0.0103  7.204e-85
## 1245074  10  rs7075009:51544143:T:G 51544143 0.8181 0.0103  8.105e-85
## 1245075  10      chr10_51544475_C_T 51544475 0.8182 0.0103  9.281e-85
## 1245008  10  rs2926494:51517356:T:C 51517356 0.8175 0.0103  9.950e-85
## 1245021  10      chr10_51521684_G_T 51521684 0.8183 0.0103  1.536e-84
## 1245014  10  rs2611504:51518910:G:A 51518910 0.8184 0.0103  2.466e-84
## 1245027  10  rs2843555:51524274:C:T 51524274 0.8185 0.0103  2.611e-84
## 1245025  10      chr10_51523213_A_T 51523213 0.8185 0.0103  2.626e-84
## 1245011  10  rs2843546:51518688:A:C 51518688 0.8184 0.0103  2.661e-84
## 1245069  10  rs4630243:51540867:T:C 51540867 0.8184 0.0103  2.908e-84
# write.csv(BP_start, "Z:/data/protect/_devs/PROTECT_Clinical/data/BP_start_08-02-2017.csv")
# write.csv(results,"Z:/data/protect/_devs/PROTECT_Clinical/data/08-02-2017_pca_risk_results.csv" )

Next steps?

#' Subset ordered_pheno2 by study center and create 50 new .fam and 50 new covariate files, one each for each study
# Run the analysis for each study
# Meta-analyze to identify our list of me-QTLs associated with prostate cancer risk
# Then, perform similar analyses to identify me-QTLs associated with stage and grade
#' In separate datasets, perform EWAS for PCa (risk and progression) and EWAS for PCa risk factors, then perform MR with the PRACITCAL results to see if methylation variation is causual.