# Matrix eQTL by Andrey A. Shabalin
# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
#
# Be sure to use an up to date version of R and Matrix eQTL.
# source("Matrix_eQTL_R/Matrix_eQTL_engine.r");

rm(list = ls())
setwd("/Volumes/Transcend/Thesis_project/2.2.lung.eqtl.analysis")

# sorted.GE.test<-read.table("sortedGE.txt", header = TRUE, stringsAsFactors = FALSE);

# sorted.GE.test<-sorted.GE.test[1:4,]

# sorted.GE.test

# write.table(sorted.GE.test,file="sorted.GE.test.txt", sep="\t", row.names=FALSE, quote=FALSE)


library(MatrixEQTL)

## Location of the package with the data files.
base.dir = "/Volumes/Transcend/Thesis_project/2.2.lung.eqtl.analysis";
## Settings
# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
# Genotype file name
SNP_file_name = paste(base.dir, "/2015-12-07 BXD.geno.SNP.eqtl.for.lung.txt", sep="");
snps_location_file_name = paste(base.dir, "/2015-12-07 BXD.geno.loc.eqtl.for.lung.txt", sep="");
# Gene expression file name
expression_file_name = paste(base.dir, "/2015-12-07 mouse.lung.expression.eqtl.txt", sep="");
gene_location_file_name = paste(base.dir, "/2015-12-07 lung.gene.loc.txt", sep="");
# Covariates file name
# Set to character() for no covariates
# covariates_file_name = paste(base.dir, "/Covariates.txt", sep="");

# Output file name
output_file_name_cis = tempfile();
output_file_name_tra = tempfile();

# Only associations significant at this level will be saved
pvOutputThreshold_cis = 1;
pvOutputThreshold_tra = 0.000000000000005;

# Error covariance matrix
# Set to numeric() for identity.
errorCovariance = numeric();
# errorCovariance = read.table("Sample_Data/errorCovariance.txt");
# Distance for local gene-SNP pairs
cisDist = 1e6;


## Load genotype data
snps = SlicedData$new();
snps$fileDelimiter = "\t";      # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1;
snps$fileSkipColumns = 1;
snps$fileSliceSize = 2000;
snps$LoadFile(SNP_file_name);
## Rows read:  2,000 
## Rows read:  3811  done.
## Load gene expression data
gene = SlicedData$new();
gene$fileDelimiter = "\t";
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1;
gene$fileSkipColumns = 1;
gene$fileSliceSize = 2000;
gene$LoadFile(expression_file_name);
## Rows read:  2,000 
## Rows read:  4,000 
## Rows read:  6,000 
## Rows read:  8,000 
## Rows read:  10,000 
## Rows read:  12,000 
## Rows read:  14,000 
## Rows read:  16,000 
## Rows read:  18,000 
## Rows read:  20,000 
## Rows read:  22,000 
## Rows read:  24,000 
## Rows read:  26,000 
## Rows read:  28,000 
## Rows read:  30,000 
## Rows read:  32,000 
## Rows read:  34,000 
## Rows read:  34470  done.
## Load covariates
#cvrt = SlicedData$new();
#cvrt$fileDelimiter = "\t";      # the TAB character
#cvrt$fileOmitCharacters = "NA"; # denote missing values;
#cvrt$fileSkipRows = 1;          # one row of column labels
#cvrt$fileSkipColumns = 1;       # one column of row labels
#if(length(covariates_file_name)>0) {
# cvrt$LoadFile(covariates_file_name);
#}

## Run the analysis

snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
head(genepos)
##       ProbeSet Chr Blat.Mb.Start Blat.Mb.End
## 1   1415670_at   6      87859681    87859682
## 2   1415671_at   8     108048521   108048893
## 3   1415672_at   8      24351869    24352391
## 4   1415673_at   5     130271465   130272427
## 5 1415674_a_at   9      44212489    44213429
## 6   1415675_at   2      32428524    32428891
me = Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = numeric(),
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = TRUE,
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE);
## Matching data files and location files 
## 34470 of 34470  genes matched
## 3811 of 3811  SNPs matched
## Task finished in  0.065  seconds
## Reordering genes
##  
## Task finished in  0.611  seconds
## Processing covariates 
## Task finished in  0.004  seconds
## Processing gene expression data (imputation, residualization, etc.) 
## Task finished in  0.228  seconds
## Creating output file(s) 
## Task finished in  0.021  seconds
## Performing eQTL analysis 
##  2.77% done, 7,963 cis-eQTLs, 169 trans-eQTLs
##  5.55% done, 14,855 cis-eQTLs, 357 trans-eQTLs
##  8.33% done, 21,896 cis-eQTLs, 548 trans-eQTLs
## 11.11% done, 27,963 cis-eQTLs, 701 trans-eQTLs
## 13.88% done, 36,530 cis-eQTLs, 762 trans-eQTLs
## 16.66% done, 42,280 cis-eQTLs, 813 trans-eQTLs
## 19.44% done, 49,292 cis-eQTLs, 851 trans-eQTLs
## 22.22% done, 54,841 cis-eQTLs, 1,008 trans-eQTLs
## 25.00% done, 58,664 cis-eQTLs, 1,083 trans-eQTLs
## 27.77% done, 1,083 trans-eQTLs
## 30.55% done, 1,083 trans-eQTLs
## 33.33% done, 1,083 trans-eQTLs
## 36.11% done, 1,083 trans-eQTLs
## 38.88% done, 1,083 trans-eQTLs
## 41.66% done, 1,083 trans-eQTLs
## 44.44% done, 1,083 trans-eQTLs
## 47.22% done, 1,083 trans-eQTLs
## 50.00% done, 1,083 trans-eQTLs
## 52.77% done, 1,083 trans-eQTLs
## 55.55% done, 1,083 trans-eQTLs
## 58.33% done, 1,092 trans-eQTLs
## 61.11% done, 1,099 trans-eQTLs
## 63.88% done, 1,099 trans-eQTLs
## 66.66% done, 1,106 trans-eQTLs
## 69.44% done, 1,106 trans-eQTLs
## 72.22% done, 1,106 trans-eQTLs
## 75.00% done, 59,989 cis-eQTLs, 1,135 trans-eQTLs
## 77.77% done, 65,774 cis-eQTLs, 1,174 trans-eQTLs
## 80.55% done, 72,273 cis-eQTLs, 1,316 trans-eQTLs
## 83.33% done, 79,875 cis-eQTLs, 1,392 trans-eQTLs
## 86.11% done, 85,785 cis-eQTLs, 1,462 trans-eQTLs
## 88.88% done, 92,077 cis-eQTLs, 1,561 trans-eQTLs
## 91.66% done, 98,470 cis-eQTLs, 1,659 trans-eQTLs
## 94.44% done, 105,124 cis-eQTLs, 1,711 trans-eQTLs
## 97.22% done, 112,224 cis-eQTLs, 1,772 trans-eQTLs
## 100.00% done, 112,835 cis-eQTLs, 1,774 trans-eQTLs
## Task finished in  20.858  seconds
## 
unlink(output_file_name_cis);

## Results:
cat('Analysis done in:', me$time.in.sec, ' seconds', '\n')
## Analysis done in: 20.507  seconds
cat('Detected local eQTLs:','\n')
## Detected local eQTLs:
cis.eqtls<-me$cis$eqtls
head(cis.eqtls)
##         snps       gene statistic       pvalue          FDR      beta
## 1  rs3681670 1421144_at -75.53535 2.361797e-47 1.332467e-42 -3.093200
## 2  rs3666583 1421144_at -75.53535 2.361797e-47 1.332467e-42 -3.093200
## 3  rs3713848 1437904_at  58.54734 1.217939e-42 3.435655e-38  3.170574
## 4  rs3669217 1437904_at  58.54734 1.217939e-42 3.435655e-38  3.170574
## 5 rs13481364 1438758_at -50.66421 5.603003e-40 6.322148e-36 -2.167143
## 6  rs3708704 1438758_at -50.66421 5.603003e-40 6.322148e-36 -2.167143
dim(cis.eqtls)
## [1] 112835      6
cis.eqtls$beta_se <-cis.eqtls$beta/cis.eqtls$statistic

write.table(cis.eqtls,file="2015-12-04 mouselung.cis.1M.eqtls.txt", sep="\t", row.names=FALSE, quote=FALSE)