dd2014_finalqtlscript

dd2014_finalqtlscript_draft1.R

Lovell — Jul 30, 2014, 5:58 PM

####################################################
### Title: Complete analysis for drydown manuscript
### Author: JTLovell
### Date Created: 29-July 2014

####################################################
###Outline:###
####################################################
# A1: set up the environment

# Part 1 (QTL analysis w/out covariates): 
#   1.1: Load permutations
#   1.2: Load Cross
#   1.3: Plot map, information and other diagnostics
#   1.4: Load list of phenotypes w/ qtls 
#   1.5: Plot correlations and nj tree of phenotypes
#   1.6: Calculate Penalties (and concatenate from different files)
#   1.7: Run stepwiseQTL, output results and save results
#   1.8: Make lod profile plots
#   1.9: Make qtl position plots
# Part 2 QTL fine mapping w/ gene expression
#   2.1: Load gene expression statistics
#     2.1a: plotting of genomic distribution of expression
#     2.1b: determination of GxE genes
#   2.2: Load other files needed for analysis
#   2.3: Calculate intervals (2q, 4p, 4q, 5q)
#   2.4: Plot intervals (and gene-cM correlations)
#   2.5: Extract genes w/ significant expression differences (from ANOVA)
#   2.6: Run covariate scans w/ significant genes
#   2.7: Plot covariate scans
#   2.8: Summarize covariate analysis
#   2.9: Extract go terms from significant genes
#   2.10: Summarize candidate genes
# Part 3: Effect of cytoplasm
#   3.1: T.tests and varcomp analysis of cytoplasm
#   3.2: Plots and summary of cytoplasm stats
#   3.3: Bioinformatics of DNA sequence polymorphism in cytoplasm
#   3.4: Determination of candidate genes in cytoplasm
#   3.5: Cytoplasm main effect in QTLs (oneways)
#   3.6: Cytoplasm main effect in multi-QTL models
#   3.7: Cytoplasm interactive effects in QTLs (oneway)- compare w/ 3.3
#   3.8: Interaction plots of QTL*cyto and cyto main effect at proline
#   3.8: Bioinformatics of potential QTL-by-cytoplasm interactions
# Part 4: Epistasis
#   4.1: Extract epistasis components from stepwiseqtl
#   4.2: Interaction plots of epistasis
#   4.3: Scantwo of epistasis
#   4.4: fitqtl of epistasis
# Part 5: QTL*environment interactions
#   5.1: work on later...



####################################################
###Libraries:###
####################################################



##############################################
#A1 Set up the environment
rm(list=ls())
options(warn=-1)
pkg <- c("biomaRt","qtl","snow","rlecuyer","plyr","ggplot2","reshape","reshape2","qvalue","lme4","nlme","car","doBy","scales")
invisible(lapply(pkg, function(x) {cat(x,"..."); library(x, character.only=T, verbose=F, warn.conflicts=F,quietly=T)} ))

biomaRt ...qtl ...snow ...rlecuyer ...plyr ...ggplot2 ...reshape ...reshape2 ...qvalue ...lme4 ...


Attaching package: 'Matrix'

The following object is masked from 'package:reshape':

    expand

nlme ...car ...doBy ...

Loading required package: splines

Attaching package: 'MASS'

The following object is masked from 'package:biomaRt':

    select

scales ...

sessionInfo()

R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] splines   stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scales_0.2.4    doBy_4.5-10     MASS_7.3-33     survival_2.37-7
 [5] car_2.0-20      nlme_3.1-117    lme4_1.1-7      Rcpp_0.11.2    
 [9] Matrix_1.1-4    qvalue_1.38.0   reshape2_1.4    reshape_0.8.5  
[13] ggplot2_1.0.0   plyr_1.8.1      rlecuyer_0.3-3  snow_0.3-13    
[17] qtl_1.32-10     biomaRt_2.20.0  knitr_1.6      

loaded via a namespace (and not attached):
 [1] AnnotationDbi_1.26.0 Biobase_2.24.0       BiocGenerics_0.10.0 
 [4] colorspace_1.2-4     DBI_0.2-7            digest_0.6.4        
 [7] evaluate_0.5.5       formatR_0.10         GenomeInfoDb_1.0.2  
[10] grid_3.1.0           gtable_0.1.2         IRanges_1.22.10     
[13] lattice_0.20-29      minqa_1.2.3          munsell_0.4.2       
[16] nloptr_1.0.0         nnet_7.3-8           parallel_3.1.0      
[19] proto_0.3-10         RCurl_1.95-4.1       RSQLite_0.11.4      
[22] stats4_3.1.0         stringr_0.6.2        tcltk_3.1.0         
[25] tools_3.1.0          XML_3.98-1.1

setwd("~/Desktop/Drydown2014")
##############################################
##############################################
#Part 1 (QTL analysis w/out covariates): 
##############################################
##############################################


##############################################
#1.1: Load permutations
load("dd2014_allqtldata.RData")
##############################################
#1.2: Load Cross
cross<-read.cross("csvs",dir="", genotypes=c("a","b"),
                  genfile="TKrils_map55.csv", phefile= "TKrils_full_phe.csv",
                  na.strings=c("NA","-", "#VALUE!","#NUM!"))

 --Read the following data:
     341  individuals
     450  markers
     38  phenotypes
 --Cross type: bc

cross.2008<-read.cross("csvs",dir="", genotypes=c("A","H"),
                  genfile="McKay_revised_map_1_27_12.csv",phefile= "TKrils_full_phe.csv",
                  na.strings=c("NA","-", "#VALUE!","#NUM!"))

 --Read the following data:
     341  individuals
     168  markers
     38  phenotypes
 --Cross type: bc

cross.2008<-convert2riself(cross.2008)
cross<-convert2riself(cross)
cross<-calc.genoprob(cross, step=1, stepwidth="max",error.prob=0.01, map.function="kosambi")

##############################################
#1.3: Plot map, information and other diagnostics
par(mfrow=c(1,1))
plot.map(cross.2008, main="2008 map")

plot of chunk unnamed-chunk-1

plot.map(cross, main="new map")

plot of chunk unnamed-chunk-1

plotInfo(cross.2008, main="missing information, 2008 map",col="red",lty=2)

plot of chunk unnamed-chunk-1

plotInfo(cross, main="missing information, new map")

plot of chunk unnamed-chunk-1

summary(cross)

    RI strains via selfing

    No. individuals:    341 

    No. phenotypes:     38 
    Percent phenotyped: 78.7 

    No. chromosomes:    5 
        Autosomes:      1 2 3 4 5 

    Total markers:      450 
    No. markers:        103 80 82 74 111 
    Percent genotyped:  70.4 
    Genotypes (%):      AA:51.2  BB:48.8

##############################################
#1.4: Load list of phenotypes w/ qtls 
qtlphes<-c("ABAlyoph.dry", "C13.dry","C13.wet","FT.June","GR.dry","RMR.dry",
           "Rel.GR.wet","Rolled.dry","Wilted.dry","g0","proline.dry")
##############################################
#1.5: Plot correlations and nj tree of phenotypes
pheno.matrix<-as.data.frame(cross$phe)
maineffect.phes<-list()
for (i in colnames(pheno.matrix)) maineffect.phes[[i]]<-ifelse(strsplit(i,"[.]")[[1]][2]=="diff", "diff","main")
maineffect.phes<-data.frame(unlist(maineffect.phes));maineffect.phes$id<-rownames(maineffect.phes)
maineffect.phes<-maineffect.phes[complete.cases(maineffect.phes),]
maineffect.phes<-maineffect.phes$id[maineffect.phes$unlist.maineffect.phes.=="main"]
pheno.matrix.main<-pheno.matrix[,maineffect.phes]
test<-as.matrix(pheno.matrix.main[,-1])
heatmap(cor(pheno.matrix.main[,-1], use="pairwise"), na.rm=T) #can make this look better

plot of chunk unnamed-chunk-1

#print(cor(pheno.matrix.main[,-1], use="pairwise"))

##############################################
#1.6: Calculate Penalties (and concatenate from different files)
pens<-calc.penalties(perms.raw, alpha=0.05)
Wilted.dry<-calc.penalties(perms.wilted, alpha=0.05)
pens<-rbind(pens, Wilted.dry)
##############################################
#1.7: Run stepwiseQTL, output results and save results
stats_out<-data.frame(phenotype=character(),chromosome=numeric(),position=numeric(),
                      df=numeric(),type3SS=numeric(),LOD=numeric(),perc.var=numeric(),Fstat=numeric(),P.chi2=numeric(),P.F=numeric(),
                      effect.estimate=numeric(),effect.SE=numeric(),effect.t=numeric(),
                      lowCImarker=character(), hiCImarker=character(),
                      lowCIpos=character(), hiCIpos=character())
stats.all_out<-data.frame(phenotype=character(),df=numeric(),type3SS=numeric(),LOD=numeric(),perc.var=numeric(),Fstat=numeric(),P.chi2=numeric(),P.F=numeric(),
                      effect.estimate=numeric(),effect.SE=numeric(),effect.t=numeric())
statcols<-names(stats_out);statcolsall<-names(stats.all_out);par(mfrow=c(1,1))
model.out<-list()
par(mfrow=c(4,1),mar = c(2,4,2,1))
for (i in qtlphes){
  stepout<-stepwiseqtl(grid,pheno.col=i, penalties=pens[i,], max.qtl=6,method="hk",
                       refine.locations=T,additive.only=F, keeplodprofile=T,keeptrace=T,verbose=F)
  model.out[[i]]<-stepout
  plotLodProfile(stepout,main=paste(i,"formula: ", formula(stepout)))
  fit<-fitqtl(grid,pheno.col=i,qtl=stepout,formula=formula(stepout),get.ests=T,dropone=T)
  nqtls<-nqtl(stepout)
  nterms<-sum(countqtlterms(formula(stepout))[c(1,4)])
  if(nqtls>1){
    ests_out<-summary(fit)$ests[2:(nqtls+1),1:3]
    cis<-data.frame()
    for (j in 1:nqtls){
      ciout<-bayesint(stepout,qtl.index=j, expandtomarkers=T, prob=.9)
      lowmarker<-rownames(ciout)[1];  highmarker<-rownames(ciout)[3]
      lowposition<-ciout[1,2];  highposition<-ciout[3,2]
      cis<-rbind(cis,cbind(lowmarker,highmarker,lowposition,highposition))
    }
    stats<-data.frame(rep(i,nqtls),stepout$chr,stepout$pos,fit$result.drop[1:nqtls,],ests_out,cis)
    stats.all<-data.frame(rep(i,nterms),fit$result.drop,summary(fit)$ests[-1,])
    colnames(stats)<-statcols
    colnames(stats.all)<-statcolsall
  }
  else{
    cis<-data.frame()
    for (j in 1:nqtls){
      ciout<-bayesint(stepout,qtl.index=j, expandtomarkers=T, prob=.9)
      lowmarker<-rownames(ciout)[1]; highmarker<-rownames(ciout)[3]
      lowposition<-ciout[1,2]; highposition<-ciout[3,2]
      cis<-rbind(cis,cbind(lowmarker,highmarker,lowposition,highposition))
    }
    ests_out<-summary(fit)$ests[2:(nqtls+1),1:3]
    stats<-data.frame(c(i,stepout$chr[1],stepout$pos[1],as.numeric(fit$result.full[1,c(1,2,4,5,3,6,7)]),as.numeric(ests_out),cis[1,]))
    colnames(stats)<-statcols; rownames(stats)<-stepout$name
    stats.all<-stats[c(1,4:13)]
  }
  stats_out<-rbind(stats_out,stats)
  stats.all_out<-rbind(stats.all_out,stats.all)
}

plot of chunk unnamed-chunk-1

par(mfrow=c(4,1),mar = c(2,4,2,1))

plot of chunk unnamed-chunk-1

write.csv(stats_out, file="dd2014_allqtlstatistics_nocovar.csv")
write.csv(stats.all_out, file="dd2014_allqtlstatistics_nocovar_wepistasis.csv")
##############################################
#1.8: Make lod profile plots
test<-model.out[["FT.June"]]
##############################################
#1.9: Make qtl position plots
statsout<-read.csv("dd2014_allqtlstatistics_nocovar.csv",header=T) 
#function to plot pvals
reverselog_trans <- function(base = exp(1)) {
  trans <- function(y) -log(y, base); inv <- function(y) base^(-y)
  trans_new(paste0("reverselog-", format(base)), trans, inv, 
            log_breaks(base = base), domain = c(1e-100, Inf))}

ggplot(statsout,aes(x=position,y=as.numeric(effect.t)))+
  geom_point() +   scale_y_continuous("t statistic for QTL point estimate")+
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  facet_grid(phenotype~chromosome, scales="free_y", margins=F)+
  theme_bw()+  ggtitle("distribution of QTL positions and effect sizes")

ggplot(statsout,aes(x=position,y=P.F))+
  geom_point() +
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  facet_grid(.~chromosome, scales="free_y", margins=F)+
  scale_y_continuous(trans=reverselog_trans(10),"p.value of F statistic")+
  theme_bw()+  ggtitle("distribution of QTL positions and effect sizes")

plot of chunk unnamed-chunk-1

ggplot(statsout,aes(x=position))+
  geom_histogram(binwidth=5) + 
  facet_grid(.~chromosome, scales="free_y", margins=F)+
  theme_bw()+  ggtitle("distribution and density of QTL point estimates")

plot of chunk unnamed-chunk-1

##############################################
##############################################
#Part 2 QTL fine mapping w/ gene expression
##############################################
##############################################

##############################################
#2.1: Load gene expression statistics, generate qvalue statistics and process
out<-read.csv("dd2014_allgenes_geneexpressionstats.csv", header=T)
all.anova<-out
qvals<-data.frame(rownames(all.anova))
par(mfrow=c(5,1),mar = c(2,4,2,1))
for (i in c("p.geno","p.trt","p.gxe","dry_pval","wet_pval")){
  q.out<-as.numeric(qvalue(all.anova[,i])$qvalue)
  hist(q.out, main=paste("qvalue statistics calculated from", i))
  qvals<-cbind(qvals,as.data.frame(q.out))
}

plot of chunk unnamed-chunk-1

qvals<-qvals[,-1];colnames(qvals)<-c("sig.geno","sig.trt","sig.gxe","dry_qval","wet_qval")
all.anova<-cbind(all.anova,qvals)
summary(all.anova)

      gene.id            marker.id       chromosome       pos.bp        
 AT1G01010:    1   C3_3623963 :  311   Min.   :1.00   Min.   :    1251  
 AT1G01020:    1   C3_4648688 :  225   1st Qu.:1.00   1st Qu.: 5822841  
 AT1G01030:    1   C3_2497708 :  217   Median :3.00   Median :11920794  
 AT1G01040:    1   C3_19161491:  205   Mean   :2.94   Mean   :12591720  
 AT1G01050:    1   2-15986145 :  193   3rd Qu.:4.00   3rd Qu.:18724714  
 AT1G01060:    1   C4_9316828 :  189   Max.   :5.00   Max.   :30426839  
 (Other)  :25618   (Other)    :24284                                    
     pos.cm          f.geno          f.trt            f.gxe      
 Min.   :  0.0   Min.   :  0.0   Min.   :   0.0   Min.   : 0.00  
 1st Qu.: 22.1   1st Qu.:  0.2   1st Qu.:   0.3   1st Qu.: 0.09  
 Median : 51.5   Median :  0.8   Median :   1.9   Median : 0.42  
 Mean   : 48.7   Mean   :  3.5   Mean   :  14.6   Mean   : 1.01  
 3rd Qu.: 71.7   3rd Qu.:  2.7   3rd Qu.:  10.0   3rd Qu.: 1.26  
 Max.   :111.1   Max.   :882.7   Max.   :2003.5   Max.   :64.02  

     p.geno          p.trt            p.gxe           AA.dry     
 Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   : 6.68  
 1st Qu.:0.104   1st Qu.:0.0018   1st Qu.:0.264   1st Qu.: 6.99  
 Median :0.366   Median :0.1713   Median :0.519   Median : 7.14  
 Mean   :0.403   Mean   :0.2989   Mean   :0.511   Mean   : 7.26  
 3rd Qu.:0.677   3rd Qu.:0.5598   3rd Qu.:0.759   3rd Qu.: 7.39  
 Max.   :1.000   Max.   :0.9999   Max.   :1.000   Max.   :12.75  

     BB.dry          AA.wet          BB.wet        dry_estAA    
 Min.   : 6.65   Min.   : 6.68   Min.   : 6.67   Min.   : 6.68  
 1st Qu.: 6.99   1st Qu.: 6.99   1st Qu.: 6.99   1st Qu.: 6.99  
 Median : 7.15   Median : 7.14   Median : 7.14   Median : 7.14  
 Mean   : 7.26   Mean   : 7.26   Mean   : 7.26   Mean   : 7.26  
 3rd Qu.: 7.39   3rd Qu.: 7.39   3rd Qu.: 7.39   3rd Qu.: 7.39  
 Max.   :12.53   Max.   :12.36   Max.   :12.36   Max.   :12.75  

   dry_estBB        dry_pval       wet_estAA       wet_estBB    
 Min.   : 6.65   Min.   :0.000   Min.   : 6.68   Min.   : 6.67  
 1st Qu.: 6.99   1st Qu.:0.158   1st Qu.: 6.99   1st Qu.: 6.99  
 Median : 7.15   Median :0.420   Median : 7.14   Median : 7.14  
 Mean   : 7.26   Mean   :0.440   Mean   : 7.26   Mean   : 7.26  
 3rd Qu.: 7.39   3rd Qu.:0.705   3rd Qu.: 7.39   3rd Qu.: 7.39  
 Max.   :12.53   Max.   :1.000   Max.   :12.36   Max.   :12.36  

    wet_pval        sig.geno        sig.trt         sig.gxe     
 Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
 1st Qu.:0.162   1st Qu.:0.322   1st Qu.:0.004   1st Qu.:1.000  
 Median :0.422   Median :0.565   Median :0.190   Median :1.000  
 Mean   :0.440   Mean   :0.489   Mean   :0.219   Mean   :0.991  
 3rd Qu.:0.704   3rd Qu.:0.697   3rd Qu.:0.414   3rd Qu.:1.000  
 Max.   :1.000   Max.   :0.772   Max.   :0.555   Max.   :1.000  

    dry_qval        wet_qval    
 Min.   :0.000   Min.   :0.000  
 1st Qu.:0.529   1st Qu.:0.546  
 Median :0.705   Median :0.712  
 Mean   :0.623   Mean   :0.630  
 3rd Qu.:0.788   3rd Qu.:0.791  
 Max.   :0.839   Max.   :0.843

##############################################
#2.1a: make plots of genomic distribution of pvalues
#set significance (based on qvalues) as a threshold variable
sig.geno.fac<- transform(all.anova, cut=cut(sig.geno, breaks=c(0,.05,1)))$cut
sig.trt.fac<- transform(all.anova, cut=cut(sig.trt, breaks=c(0,.05,1)))$cut
sig.gxe.fac<- transform(all.anova, cut=cut(sig.gxe, breaks=c(0,.05,1)))$cut
dry_pval.fac<- transform(all.anova, cut=cut(dry_pval, breaks=c(0,.05,1)))$cut
wet_pval.fac<- transform(all.anova, cut=cut(wet_pval, breaks=c(0,.05,1)))$cut
all.anova.toplot<-cbind(all.anova,sig.geno.fac,sig.trt.fac,sig.gxe.fac,dry_pval.fac,wet_pval.fac)
facstoplot<-list(c("sig.geno.fac","p.geno","Pvalues of genotype effect (typeIII)"),
                 c("sig.trt.fac","p.trt","Pvalues of environment effect (typeIII)"),
                 c("dry_pval.fac","dry_pval","Pvalues of genotype effect in dry (t.test)"),
                 c("wet_pval.fac","wet_pval","Pvalues of genotype effect in wet (t.test)"))
#threshold is at alpha=0.05 (via Q-values), colored in plots this way
for(i in facstoplot){
  print(ggplot(all.anova.toplot,aes(y=get(i[2]),x=pos.bp,color=get(i[1])))+
    geom_point()+theme_bw()+ggtitle(i[3])+
    scale_y_continuous(trans=reverselog_trans(10))+ facet_grid(chromosome~.))}

plot of chunk unnamed-chunk-1

##############################################
#2.1b Identify genes w/ significant G*E, output for later
head(all.anova)

    gene.id marker.id chromosome pos.bp pos.cm f.geno   f.trt   f.gxe
1 AT1G01010 C1_103654          1   3631  0.186 3.6251 6.63512 7.83503
2 AT1G01020 C1_103654          1   5928  0.186 4.2390 1.87075 0.34134
3 AT1G01030 C1_103654          1  11649  0.186 2.1099 5.92443 1.01925
4 AT1G01040 C1_103654          1  23146  0.186 0.0727 1.53420 0.01366
5 AT1G01050 C1_103654          1  31170  0.186 0.3614 0.03060 0.03141
6 AT1G01060 C1_103654          1  33666  0.186 4.9961 0.06019 0.60216
   p.geno   p.trt    p.gxe AA.dry BB.dry AA.wet BB.wet dry_estAA dry_estBB
1 0.05836 0.01073 0.005631  7.240  7.219  7.245  7.354     7.240     7.219
2 0.04081 0.17294 0.559722  7.198  7.139  7.217  7.184     7.198     7.139
3 0.14793 0.01582 0.313930  7.072  7.119  7.134  7.142     7.072     7.119
4 0.78772 0.21695 0.907088  7.624  7.619  7.606  7.604     7.624     7.619
5 0.54844 0.86131 0.859505  7.469  7.497  7.480  7.495     7.469     7.497
6 0.02652 0.80645 0.438682  7.438  7.551  7.407  7.641     7.438     7.551
  dry_pval wet_estAA wet_estBB wet_pval sig.geno sig.trt sig.gxe dry_qval
1  0.49431     7.245     7.354 0.003923   0.2334 0.01946  0.6680   0.7342
2  0.04312     7.217     7.184 0.352691   0.1902 0.19130  0.9997   0.3092
3  0.11033     7.134     7.142 0.733678   0.3789 0.02728  0.9997   0.4637
4  0.79055     7.606     7.604 0.908446   0.7274 0.22526  0.9997   0.8065
5  0.55787     7.480     7.495 0.763103   0.6489 0.51776  0.9997   0.7545
6  0.27156     7.407     7.641 0.054034   0.1481 0.50217  0.9997   0.6282
  wet_qval
1  0.08107
2  0.67949
3  0.79762
4  0.83166
5  0.80225
6  0.35175

#effect of g*e
ggplot(all.anova.toplot,aes(y=p.gxe,x=pos.bp,color=sig.gxe.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of GxE effect (typeIII)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~.)

plot of chunk unnamed-chunk-1

gxe.genes<-all.anova[all.anova$sig.gxe<0.05,]
as.character(gxe.genes$gene.id)

 [1] "AT1G05200" "AT1G23940" "AT1G27030" "AT1G48760" "AT1G56510"
 [6] "AT1G62290" "AT1G62810" "AT1G63420" "AT1G67365" "AT1G72800"
[11] "AT1G73010" "AT1G76240" "AT2G15480" "AT2G16990" "AT2G23110"
[16] "AT2G35075" "AT3G47580" "AT4G01360" "AT4G15910" "AT4G19530"
[21] "AT4G21410" "AT4G22990" "AT5G09230" "AT5G09240" "AT5G12230"
[26] "AT5G25980" "AT5G41760" "AT5G42825" "AT5G44310"

##############################################
#2.2: Load other files needed for analysis
cmgenpos<-read.csv("gene_locations_new_McKay_map.csv",header=T, na.strings="NA") #physical position of all genes
geneexp<-read.csv("May_SNP_5_4_12.csv",header=T) #gene expression of all AT genes in drydown
idnames<-read.csv("dd2014_markernametoid.csv",header=T)
colnames(idnames)[2]<-"line"
geneexp<-merge(idnames,geneexp, by="line")
physpos.gene<-cmgenpos[,c(1,4:5)]
annotations<-read.table("Functional_annotation_list.txt",sep="\t",header=F)
colnames(annotations)<-c("gene","annotation")
##############################################
#2.3: Look at the distribution of QTL and effects to figure out where intervals should be
#look at the lod profiles and scanones if needed
#n <- names(model.out);lapply(model.out, function(x) plotLodProfile(x, main=n[substitute(x)[[3]]])
#par(mfrow=c(2,2));for (i in qtlphes) plot(scanone(cross, pheno.col=i,ch=5), main=i)
#make a dataframe with effects to help decide how to structure the regions
cross<-sim.geno(cross, n.draws=64, step=1, stepwidth="max",error.prob=0.01, map.function="kosambi")
effectscan.out<-data.frame()
for (i in qtlphes){
  temp<-data.frame(effectscan(cross, pheno.col=i, draw=F),i)
  temp$st.effect<-scale(temp$a,center=T,scale=T);effectscan.out<-rbind(effectscan.out,temp)
}
#general plots of qtl effects
names(effectscan.out)[3:4]<-c("effect","phenotype")
ggplot(effectscan.out, aes(x=pos, y=st.effect, color=phenotype))+ geom_line()+
  theme_bw()+ggtitle("effect scans (of TSU allele) of all phenotypes")+  facet_wrap(~chr)

plot of chunk unnamed-chunk-1

es.phys<-effectscan.out[effectscan.out$phenotype %in% c("C13.wet","C13.dry","ABAlyoph.dry","proline.dry","g0"),]
ggplot(es.phys, aes(x=pos, y=st.effect, color=phenotype))+geom_line()+theme_bw()+
  facet_wrap(~chr)+ggtitle("effect scans (of TSU allele) of physiological phenotypes")

plot of chunk unnamed-chunk-1

es.growth<-effectscan.out[effectscan.out$phenotype %in% c("GR.dry","RMR.dry","Rel.GR.wet","Rolled.dry","Wilted.dry"),]
ggplot(es.growth, aes(x=pos, y=st.effect, color=phenotype))+
  geom_line()+theme_bw()+  facet_wrap(~chr)+ggtitle("effect scans (of TSU allele) of growth phenotypes")

plot of chunk unnamed-chunk-1

##############################################
#2.4: Check intervals around four regions...
#check out the qtl effects in a region... here is a function
#region needs to be a vector of length 3 (chromosome, start pos, end pos)
#phes needs to be a vector of exactly 2 phenotypes
ploteff<-function(region,phes){
  par(mfrow=c(3,1)); plot(scanone(cross, pheno.col=phes[1], chr=region[1]), main=paste("scanones on ch",region[1], "for phenotypes:", phes[1],phes[2]))
  plot(scanone(cross, pheno.col=phes[2], chr=region[1]), add=T, lty=3, col="red")
  rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
  effectscan(cross, pheno.col=phes[1], draw=T, chr=region[1],main=paste("effect scan, ch=",region[1],"; phenotype=", phes[1]))
  rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
  effectscan(cross, pheno.col=phes[2], draw=T, chr=region[1], main=paste("effect scan, ch=",region[1],"; phenotype=", phes[2]))
  rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
}

##############################################
#2.4.1 (reg2.1) --- aba-2.1 - Initially too wide- entire shoulder of peak
#look at wilted too:
reg2.1<-statsout[statsout$chromosome==2 & statsout$position <20,c(3,17,18)]
phes2.1<-c("ABAlyoph.dry","Wilted.dry")
ploteff(region=reg2.1,phes=phes2.1)

plot of chunk unnamed-chunk-1

newint<-data.frame(bayesint(model.out["ABAlyoph.dry"][[1]], qtl.index=1, prob=.75)[1:3,1:3])
reg.test<-reg2.1; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes2.1)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1)); plotLodProfile(model.out["ABAlyoph.dry"][[1]], main="Lod Profile of ABA, red-90% CI, purple-75% CI") 
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg2.1[2], xright=reg2.1[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)

plot of chunk unnamed-chunk-1

#the narrower region captures the peak much better (without the shoulders)
reg2.1<-reg.test

##############################################
#2.4.2 (reg2.2) ---proline-2.2 
reg2.2<-statsout[statsout$chromosome==2 & statsout$position >60 & statsout$phenotype=="proline.dry",c(3,17,18)]
phes2.2<-c("proline.dry","Rolled.dry")
ploteff(region=reg2.2,phes=phes2.2)

plot of chunk unnamed-chunk-1

newint<-data.frame(bayesint(model.out["proline.dry"][[1]], qtl.index=1, prob=.99)[1:3,1:3])
reg.test<-reg2.2; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes2.2)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1)); plotLodProfile(model.out["proline.dry"][[1]], main="Lod Profile of proline, purple-90% CI, blue-99% CI",ch=2) 
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg2.2[2], xright=reg2.2[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)

plot of chunk unnamed-chunk-1

#the wider region captures the peak much better (without the shoulders)
reg2.2<-reg.test

##############################################
#2.4.3 (reg4.1) ---d13C-4.1 
reg4.1<-statsout[statsout$chromosome==4 & statsout$position <10 & statsout$phenotype=="C13.wet",c(3,17,18),]
phes4.1<-c("C13.wet","proline.dry")
ploteff(region=reg4.1,phes=phes4.1)

plot of chunk unnamed-chunk-1

##############################################
#2.4.4(reg5.2) ---d13C-5.2 -initially too wide (incorporates 2 peaks)
reg5.2<-statsout[statsout$chromosome==5 & statsout$position >60 & statsout$phenotype=="C13.wet",c(3,17,18),]
phes5.2<-c("C13.wet","C13.dry")
ploteff(region=reg5.2,phes=phes5.2)

plot of chunk unnamed-chunk-1

newint<-data.frame(bayesint(model.out["C13.wet"][[1]], qtl.index=6, prob=.8)[1:3,1:3])
reg.test<-reg5.2; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes5.2)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1)); plotLodProfile(model.out["C13.wet"][[1]],ch=5, main="Lod Profile of d13C.wet, red-90% CI, purple-80% CI") 
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg5.2[2], xright=reg5.2[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)

plot of chunk unnamed-chunk-1

#this looks much better- only single peak in coverages
reg5.2<-reg.test

##############################################
#2.5 list out and plot the regions and focal phenotypes
all.regions<-list(reg2.1,reg2.2,reg4.1,reg5.2)
all.focalphes<-c("ABAlyoph.dry","proline.dry","C13.wet","C13.wet")
all.region.names<-c("reg2.1","reg2.2","reg4.1","reg5.2")
#function to subset the data
findgenes<-function(region){
  reg.in<-as.numeric(region)
  culled.bycm<-all.anova[all.anova$chromosome==reg.in[1] & all.anova$pos.cm>reg.in[2] & all.anova$pos.cm<reg.in[3],]
  temp<-all.anova[
    all.anova$chromosome==reg.in[1] & all.anova$pos.cm>reg.in[2] & all.anova$pos.cm<reg.in[3]  |
      all.anova$chromosome==reg.in[1] & all.anova$pos.bp>min(culled.bycm$pos.bp) & all.anova$pos.bp<max(culled.bycm$pos.bp),]
  return(as.character(temp$gene.id))
}
exp.stats<-all.anova
exp.stats$gene.cat<-as.character(exp.stats$gene.id)
for (i in 1:4){
  temp<-findgenes(all.regions[[i]])
  exp.stats$gene.cat[exp.stats$gene.cat %in% temp]<-all.region.names[i]
}
exp.stats$gene.cat[!exp.stats$gene.cat %in% all.region.names]<-"other"
ggplot(exp.stats, aes(y=pos.cm, x=pos.bp, color=gene.cat))+
  scale_color_manual(values=c("black",rainbow(4)))+
  geom_point()+theme_bw()+ggtitle("physical gene position to cM marker position")+
  facet_wrap(~chromosome)

plot of chunk unnamed-chunk-1

##############################################
#2.6: Extract genes w/ significant expression differences (from ANOVA)
#2.7: Make dataframe with those genes only for covariate analysis
exp.covars.list<-list()
sig.genes.out<-list()
for (i in 1:4){
  temp<-exp.stats[exp.stats$gene.cat==all.region.names[i],]
  #extract 10 most signifiant genes
  siggenes<-temp[temp$sig.geno<as.numeric(quantile(temp$sig.geno,(length(temp[,1])-(length(temp[,1])-10))/length(temp[,1]))),]
  print(all.region.names[i])
  sig.genes<-print(as.character(siggenes$gene.id))
  sig.genes.out[[i]]<-sig.genes
  geneexp.culled<-geneexp[,c("treatment","line",sig.genes)]
  geneexp1<-reshape(geneexp.culled, idvar="line", timevar="treatment",direction="wide")
  names(idnames)[2]<-"line"
  exp.covars<-merge(idnames,geneexp1, by="line", all=T)
  exp.covars.list[[i]]<-exp.covars
}

[1] "reg2.1"
 [1] "AT2G03140" "AT2G03820" "AT2G04170" "AT2G04378" "AT2G04380"
 [6] "AT2G05810" "AT2G05915" "AT2G07215" "AT2G11140" "AT2G11240"
[1] "reg2.2"
 [1] "AT2G38530" "AT2G39110" "AT2G40960" "AT2G41440" "AT2G42170"
 [6] "AT2G42490" "AT2G43210" "AT2G43530" "AT2G43680" "AT2G43980"
[1] "reg4.1"
 [1] "AT4G00355" "AT4G00430" "AT4G00440" "AT4G00525" "AT4G00620"
 [6] "AT4G00650" "AT4G00720" "AT4G00755" "AT4G00900" "AT4G00975"
[1] "reg5.2"
 [1] "AT5G53550" "AT5G53700" "AT5G53930" "AT5G53970" "AT5G54710"
 [6] "AT5G54720" "AT5G54730" "AT5G54770" "AT5G55180" "AT5G55560"

#2.8 Run scanone for w/ and w/o covars
outall<-data.frame(chr=numeric(), pos=numeric(),lod=numeric(),
                   phe=character(),covar=character(),
                   gene=character(),expression_env=character())
for (i in all.focalphes){
  temp<-cbind(as.data.frame(scanone(cross,pheno.col=i)),i,"nocovar","nocovar","nocovar")
  colnames(temp)[4:7]<-c("phe","covar","gene","expression_env")
  outall<-rbind(outall,temp)
}
for (j in 1:4){
  for (i in colnames(exp.covars.list[j][[1]])[3:length(colnames(exp.covars.list[j][[1]]))]){
    s1.out<-scanone(cross,pheno.col=all.focalphes[j],addcovar=exp.covars.list[[j]][,i])
    s1.out<-cbind(as.data.frame(s1.out),as.character(all.focalphes[j]),as.character(i),
                  unlist(strsplit(i,"[.]"))[1],unlist(strsplit(i,"[.]"))[2])
    colnames(s1.out)[4:7]<-c("phe","covar","gene","expression_env")
    outall<-rbind(outall,s1.out)
  }
}

##############################################
#2.9: Plot covariate scans
#for all phenotypes, just the focal ch
#inputs- 
# toplot: the full scanout-out object, 
#region: the region and phenotype: the phenotype
#maxcm- maximum cm to plot (mincm same way)
plotcovscan<-function(toplot, region, phenotype, maxcm, mincm, genes)
{
  region.in<-as.numeric(region)
  toplot.window<-toplot[toplot$chr==region.in[1] &
                          toplot$pos<maxcm & toplot$pos>mincm &
                          toplot$gene %in%  genes|
                          toplot$chr==region.in[1] &
                          toplot$pos<maxcm & toplot$pos>mincm &
                          toplot$gene =="nocovar" &
                          toplot$phe==phenotype,]
  palette1<-rainbow(length(genes))
  line.cols <- c("lightgrey",rep(palette1,2))
  line.width<-c(2,rep(.5,length(genes)*2))
  line.shapes<-c(rep(1,length(genes)+1),rep(3,length(genes)))
  ggplot(toplot.window,aes(x=pos,y=lod))+
    geom_line(aes(col=covar,size=covar,linetype=covar))+
    scale_color_manual(values=line.cols)+
    scale_linetype_manual(values=line.shapes)+
    scale_size_manual(values=line.width)+
    geom_vline(xintercept=c(region.in[2:3]), lty=4)+
    ggtitle(paste("Scanones for",phenotype,"(QTL region-- CH:",region.in[1], 
                  ", POS:",round(region.in[2],2), "-",round(region.in[3],2),")", "\n",
                  "LODs w/ and w/o gene expression covariates are plotted", sep=""))+
    theme_bw()
}

#plot all regions
max.cm<-c(45,100,45,100)
min.cm<-c(0,45,0,45)
for (i in 1:4){
  print(
  plotcovscan(outall,region=all.regions[[i]],phenotype=all.focalphes[i],
              genes=sig.genes.out[[i]],maxcm=max.cm[i], mincm=min.cm[i])
  )
}

plot of chunk unnamed-chunk-1

##############################################
#2.10: Summarize covariate analysis
for (i in 1:4) cat(all.focalphes[i],"-",all.region.names[i],":  ", sig.genes.out[[i]], sep=" ", "\n")

ABAlyoph.dry - reg2.1 :   AT2G03140 AT2G03820 AT2G04170 AT2G04378 AT2G04380 AT2G05810 AT2G05915 AT2G07215 AT2G11140 AT2G11240 
proline.dry - reg2.2 :   AT2G38530 AT2G39110 AT2G40960 AT2G41440 AT2G42170 AT2G42490 AT2G43210 AT2G43530 AT2G43680 AT2G43980 
C13.wet - reg4.1 :   AT4G00355 AT4G00430 AT4G00440 AT4G00525 AT4G00620 AT4G00650 AT4G00720 AT4G00755 AT4G00900 AT4G00975 
C13.wet - reg5.2 :   AT5G53550 AT5G53700 AT5G53930 AT5G53970 AT5G54710 AT5G54720 AT5G54730 AT5G54770 AT5G55180 AT5G55560

# ##############################################
# #2.9: Extract go terms from significant genes
# library(biomaRt)
# mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# results <- getBM(attributes = c("refseq_dna", "go_molecular_function_id",
#                                 "go_molecular_function__dm_name_1006"), filters = "refseq_dna",
#                  values = c("NM_030621"), mart = mart)
# ##############################################
# #2.10: Summarize candidate genes
# 
# ##############################################
# ##############################################
# #Part 3: Effect of cytoplasm
# ##############################################
# ###############################################
# 
# ##############################################
# #3.1: T.tests and varcomp analysis of cytoplasm
# 
# ##############################################
# #3.2: Plots and summary of cytoplasm stats
# cytocovar<-read.csv("dd2014_cytocovar.csv", header=T)
# covs<-read.csv("dd2014_cytocovar.csv", header=T)
# addcov<-NULL; intcov<-NULL #model w/out covariates
# cyto<-as.data.frame(covs$cyto.num)
# par(mfrow=c(2,1))
# for (i in qtlphes){
#   nocovar<-scanone(grid,pheno.col=i, method="hk")
#   addcov<-scanone(grid,pheno.col=i, method="hk",addcovar=cyto)
#   intcov<-scanone(grid,pheno.col=i, method="hk",addcovar=cyto, intcovar=cyto)
#   ylim<-c(0,max(max(nocovar$lod),max(addcov$lod),max(intcov$lod)))
#   ylim2<-c(min(min(intcov$lod-addcov$lod),min(intcov$lod-nocovar$lod),min(addcov$lod-nocovar$lod)),
#            max(max(intcov$lod-addcov$lod),max(intcov$lod-nocovar$lod),max(addcov$lod-nocovar$lod)))
#   plot(nocovar,main=i,ylim=ylim);plot(addcov, add=T, col="red",lty=3);plot(intcov, add=T, col="blue",lty=2)
#   plot(intcov-nocovar,main=paste(i, ylim=ylim2); abline(h=2.5,lty=3)
#   plot(intcov-addcov,add=T, col="red",lty=2);plot(addcov-nocovar,add=T, col="blue",lty=3)
# }
# 
# 
# ##############################################
# #3.3: Bioinformatics of DNA sequence polymorphism in cytoplasm
# 
# ##############################################
# #3.4: Determination of candidate genes in cytoplasm
# 
# ##############################################
# #3.5: Cytoplasm main effect in QTLs (oneways)
# 
# ##############################################
# #3.6: Cytoplasm main effect in multi-QTL models
# 
# ##############################################
# #3.7: Cytoplasm interactive effects in QTLs (oneway)- compare w/ 3.3
# 
# ##############################################
# #3.8: Interaction plots of QTL*cyto and cyto main effect at proline
# 
# ##############################################
# #3.8: Bioinformatics of potential QTL-by-cytoplasm interactions
# 
# ##############################################
# ##############################################
# #Part 4: Epistasis
# ##############################################
# ##############################################
# 
# ##############################################
# #4.1: Extract epistasis components from stepwiseqtl
# 
# ##############################################
# #4.2: Interaction plots of epistasis
# 
# ##############################################
# #4.3: Scantwo of epistasis
# 
# ##############################################
# #4.4: fitqtl of epistasis
# 
# ##############################################
# ##############################################
# #Part 5: QTL*environment interactions
# ##############################################
# ##############################################
# 
# ##############################################
# #5.1: work on later...