# Matrix eQTL by Andrey A. Shabalin
# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
#
# Be sure to use an up to date version of R and Matrix eQTL.
# source("Matrix_eQTL_R/Matrix_eQTL_engine.r");
rm(list = ls())
setwd("/Volumes/Transcend/Thesis_project/2.1.liver.eqtl.analysis")
library(MatrixEQTL)
## Location of the package with the data files.
base.dir = "/Volumes/Transcend/Thesis_project/2.1.liver.eqtl.analysis";
## Settings
# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
# Genotype file name
SNP_file_name = paste(base.dir, "/2015-12-07 BXD.geno.SNP.eqtl.for.liver.txt", sep="");
snps_location_file_name = paste(base.dir, "/2015-12-07 BXD.geno.loc.eqtl.for.liver.txt", sep="");
# Gene expression file name
expression_file_name = paste(base.dir, "/2015-12-07 mouse.liver.expression.eqtl.txt", sep="");
gene_location_file_name = paste(base.dir, "/2015-12-07 liver.gene.loc.txt", sep="");
# Covariates file name
# Set to character() for no covariates
# covariates_file_name = paste(base.dir, "/Covariates.txt", sep="");

# Output file name
output_file_name_cis = tempfile();
output_file_name_tra = tempfile();

# Only associations significant at this level will be saved
pvOutputThreshold_cis = 1;
pvOutputThreshold_tra = 0.000000000000005;

# Error covariance matrix
# Set to numeric() for identity.
errorCovariance = numeric();
# errorCovariance = read.table("Sample_Data/errorCovariance.txt");
# Distance for local gene-SNP pairs
cisDist = 1e6;


## Load genotype data
snps = SlicedData$new();
snps$fileDelimiter = "\t";      # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1;
snps$fileSkipColumns = 1;
snps$fileSliceSize = 2000;
snps$LoadFile(SNP_file_name);
## Rows read:  2,000 
## Rows read:  3811  done.
## Load gene expression data
gene = SlicedData$new();
gene$fileDelimiter = "\t";
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1;
gene$fileSkipColumns = 1;
gene$fileSliceSize = 2000;
gene$LoadFile(expression_file_name);
## Rows read:  2,000 
## Rows read:  4,000 
## Rows read:  6,000 
## Rows read:  8,000 
## Rows read:  10,000 
## Rows read:  12,000 
## Rows read:  14,000 
## Rows read:  16,000 
## Rows read:  18,000 
## Rows read:  20,000 
## Rows read:  20855  done.
## Load covariates
#cvrt = SlicedData$new();
#cvrt$fileDelimiter = "\t";      # the TAB character
#cvrt$fileOmitCharacters = "NA"; # denote missing values;
#cvrt$fileSkipRows = 1;          # one row of column labels
#cvrt$fileSkipColumns = 1;       # one column of row labels
#if(length(covariates_file_name)>0) {
# cvrt$LoadFile(covariates_file_name);
#}

## Run the analysis

snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
head(genepos)
##       probe_id Chromosome start_location end_location
## 1   1415670_at          6       87887971     87890759
## 2   1415671_at          8      105524469    105566040
## 3   1415672_at          8       23241325     23257080
## 4   1415673_at          5      129765557    129787253
## 5 1415674_a_at          9       44403758     44407548
## 6   1415675_at          2       32570857     32573571
me = Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = numeric(),
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = TRUE,
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE);
## Matching data files and location files 
## 20855 of 20855  genes matched
## 3811 of 3811  SNPs matched
## Task finished in  0.025  seconds
## Reordering genes
##  
## Task finished in  0.231  seconds
## Processing covariates 
## Task finished in  0.002  seconds
## Processing gene expression data (imputation, residualization, etc.) 
## Task finished in  0.04  seconds
## Creating output file(s) 
## Task finished in  0.015  seconds
## Performing eQTL analysis 
##  4.54% done, 7,118 cis-eQTLs, 67 trans-eQTLs
##  9.09% done, 13,788 cis-eQTLs, 79 trans-eQTLs
## 13.63% done, 21,752 cis-eQTLs, 137 trans-eQTLs
## 18.18% done, 27,806 cis-eQTLs, 154 trans-eQTLs
## 22.72% done, 34,278 cis-eQTLs, 272 trans-eQTLs
## 27.27% done, 35,915 cis-eQTLs, 272 trans-eQTLs
## 31.81% done, 272 trans-eQTLs
## 36.36% done, 272 trans-eQTLs
## 40.90% done, 272 trans-eQTLs
## 45.45% done, 272 trans-eQTLs
## 50.00% done, 272 trans-eQTLs
## 54.54% done, 272 trans-eQTLs
## 59.09% done, 272 trans-eQTLs
## 63.63% done, 272 trans-eQTLs
## 68.18% done, 272 trans-eQTLs
## 72.72% done, 282 trans-eQTLs
## 77.27% done, 40,035 cis-eQTLs, 283 trans-eQTLs
## 81.81% done, 47,465 cis-eQTLs, 318 trans-eQTLs
## 86.36% done, 53,735 cis-eQTLs, 348 trans-eQTLs
## 90.90% done, 60,292 cis-eQTLs, 355 trans-eQTLs
## 95.45% done, 67,748 cis-eQTLs, 395 trans-eQTLs
## 100.00% done, 69,174 cis-eQTLs, 395 trans-eQTLs
## Task finished in  10.289  seconds
## 
unlink(output_file_name_cis);
## Results:
cat('Analysis done in:', me$time.in.sec, ' seconds', '\n')
## Analysis done in: 9.876  seconds
cat('Detected local eQTLs:','\n')
## Detected local eQTLs:
cis.eqtls<-me$cis$eqtls
head(cis.eqtls)
##        snps       gene statistic       pvalue          FDR      beta
## 1 rs4163042 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
## 2  116Mit88 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
## 3 rs4163058 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
## 4 rs4163391 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
## 5 rs4151923 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
## 6 rs3090019 1452705_at -76.34025 4.891311e-34 4.229394e-30 -4.190552
dim(cis.eqtls)
## [1] 69174     6
cis.eqtls$beta_se <-cis.eqtls$beta/cis.eqtls$statistic
write.table(cis.eqtls,file="2015-12-07 mouseliver.cis.1M.eqtls.txt", sep="\t", row.names=FALSE, quote=FALSE)