dd2014_qtlgeneexpression

dd2014_qtlgeneexpression_draft2.R

Lovell — Jul 15, 2014, 2:18 PM

#############################
### Title: Re-analysis of Drydown QTLs
### Author: JTLovell
### Date Created: 12-July 2014
### Modifications (Date):...


#############################
###Libraries:###
library(qtl)
library(snow)
library(rlecuyer)
library(plyr)
library(ggplot2)
library(reshape)


Attaching package: 'reshape'

The following objects are masked from 'package:plyr':

    rename, round_any

The following object is masked from 'package:qtl':

    condense

library(qvalue)


Attaching package: 'qvalue'

The following object is masked from 'package:ggplot2':

    qplot

options(warn=-1)
sessionInfo()

R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] qvalue_1.38.0  reshape_0.8.5  ggplot2_1.0.0  plyr_1.8.1    
[5] rlecuyer_0.3-3 snow_0.3-13    qtl_1.32-10    knitr_1.6     

loaded via a namespace (and not attached):
 [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_0.10    
 [5] grid_3.1.0       gtable_0.1.2     MASS_7.3-33      munsell_0.4.2   
 [9] parallel_3.1.0   proto_0.3-10     Rcpp_0.11.2      reshape2_1.4    
[13] scales_0.2.4     stringr_0.6.2    tcltk_3.1.0      tools_3.1.0

#######################
##2.1 set up the environment
rm(list=ls())
setwd("~/Desktop/Drydown2014")
cross<-read.cross("csvs",dir="",
                  genotypes=c("A","H"),
                  genfile="McKay_revised_map_1_27_12.csv",
                  phefile= "TKrils_full_phe.csv",
                  na.strings=c("NA","-", "#VALUE!","#NUM!"))

 --Read the following data:
     341  individuals
     168  markers
     38  phenotypes
 --Cross type: bc

class(cross)[1] <-"riself"
summary(cross)

    RI strains via selfing

    No. individuals:    341 

    No. phenotypes:     38 
    Percent phenotyped: 78.7 

    No. chromosomes:    5 
        Autosomes:      1 2 3 4 5 

    Total markers:      168 
    No. markers:        41 33 26 32 36 
    Percent genotyped:  91.1 
    Genotypes (%):      AA:50.8  BB:49.2

statsout<-read.csv("dd2014_allqtlstatistics.csv",header=T)
cmgenpos<-read.csv("gene_locations_new_McKay_map.csv",header=T,
                   na.strings="NA") #physical position of all genes
geneexp<-read.csv("May_SNP_5_4_12.csv",header=T) #gene expression of all AT genes in drydown
idnames<-read.csv("dd2014_markernametoid.csv",header=T)
annotations<-read.table("Functional_annotation_list.txt",sep="\t",header=F)
colnames(annotations)<-c("gene","annotation")
#make a super dense grid to extract pseudomarker probs
densegrid<-calc.genoprob(cross, step=.1, stepwidth="fixed")

#######################
##2.2 determine the cM positions of all genes in region
# the following steps must be conducted manually for each region of interest
#here are some potential regions of interest
qtls5.2<-statsout[statsout$chromosome == 5 & statsout$position > 60,]
qtls2.1<-statsout[statsout$chromosome == 2 & statsout$position < 50,]
qtls2.2<-statsout[statsout$chromosome == 2 & statsout$position > 50,]
ggplot(qtls5.2,aes(x=position,y=as.numeric(effect.t)))+
  geom_point() + 
  scale_y_continuous("t statistic for QTL point estimate")+
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  facet_grid(phenotype~., scales="free_y", margins=F)+
  ggtitle("95% bayesian CI and point estimates for QTL in region 5.2 ")+
  theme_bw()

plot of chunk unnamed-chunk-1

print(qtl5.2_ci<-statsout[19,17:18])

   lowCIpos hiCIpos
19    68.48   75.98

ggplot(qtls2.1,aes(x=position,y=as.numeric(effect.t)))+
  geom_point() + 
  scale_y_continuous("t statistic for QTL point estimate")+
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  facet_grid(phenotype~., scales="free_y", margins=F)+
  ggtitle("95% bayesian CI and point estimates for QTL in region 2.1 ")+
  theme_bw()

plot of chunk unnamed-chunk-1

print(qtl2.1_ci<-statsout[1,17:18])

  lowCIpos hiCIpos
1    7.378   40.83

ggplot(qtls2.2,aes(x=position,y=as.numeric(effect.t)))+
  geom_point() + 
  scale_y_continuous("t statistic for QTL point estimate")+
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  facet_grid(phenotype~., scales="free_y", margins=F)+
  ggtitle("95% bayesian CI and point estimates for QTL in region 2.2 ")+
  theme_bw()

plot of chunk unnamed-chunk-1

print(qtl2.2_ci<-c(qtls2.2[2,17],qtls2.2[1,18]))

[1] 64.81 72.19

#######################
##2.3 Set parameters for each manually
#######################
#######################
#here change the region of interest
#i am opperating on the premise that the region should include all cm of the shortest confidence
#interval that overlaps all point estimates; see plots and summaries above 
region<-qtl5.2_ci #defined above, change to desired interval
info<-qtls5.2 #defined above, change to desired interval
#  cull the gene list to this region
genes<-cmgenpos[cmgenpos$Chr == info$chromosome[1] & 
                     cmgenpos$cM > region[1,1] &  cmgenpos$cM < region[1,2],]
#######################
#######################

##2.4 This is the engine that calculates differential expression for each gene
#this outputs a list of statistics, including the t test means and p-values
genexpstats<-data.frame(gene=character(),chr=numeric(),pos.bp=numeric(),pos.cm=numeric(),
                         dry_estAA=numeric(),dry_estBB=numeric(),dry_pval=numeric(),
                         wet_estAA=numeric(),wet_estBB=numeric(),wet_pval=numeric())
gen_woexp<-data.frame(gene=character)
count<-1
#pb <- txtProgressBar(min = 0, max = length(as.character(genes$Gene)), style = 3)
for (i in as.character(genes$Gene)){
  #setTxtProgressBar(pb, count) #sets a progress bar (not run)
  genein<-genes[genes$Gene==i,] #subsets data by genes in region
  chpos<-paste(as.numeric(genein[4]),"@",round(as.numeric(genein[6]),1),sep="")
  outgeno<-as.numeric(formMarkerCovar(densegrid,chpos,method="prob"))
  #get pseduomarker genotype probabilities for the nearest pseduomarker
  outgeno[outgeno>.05 & outgeno <.95]<-NA
  #this is important--- set genotype probabilities of >95% to the alleles, set the rest to missing
  outgeno<-cbind(idnames,as.data.frame(ifelse(outgeno>=.95,"BB","AA")))
  colnames(outgeno)[2:3]<-c("line","geno")
  #below, calculate the statistics as long as we have gene expression data
  if (i %in% colnames(geneexp)){
    outexp<-geneexp[,c("cyto","treatment","line",i)]
    allin<-merge(outgeno,outexp,by="line")
    colnames(allin)[6]<-"exp"
    wet<-allin[allin$treatment=="wet",]
    dry<-allin[allin$treatment=="dry",]
    tdry<-t.test(dry$exp~dry$geno)
    twet<-t.test(wet$exp~wet$geno)
    ttest_out<-data.frame(i,as.numeric(genein[4]),as.numeric(genein[5]),as.numeric(genein[6]),
                        tdry$estimate[1],tdry$estimate[2],tdry$p.value,
                        twet$estimate[1],twet$estimate[2],twet$p.value)
    colnames(ttest_out)<-c("gene","chr","pos.bp","pos.cm","dry_estAA","dry_estBB","dry_pval"
                         ,"wet_estAA","wet_estBB","wet_pval")
    rownames(ttest_out)<-count
    genexpstats<-rbind(genexpstats,ttest_out)
    count<-count+1
  }
  else{gen_woexp<-rbind(gen_woexp,i)}
}

#######################
##2.5 Correct pvalue statistics for number and distribution of comparisons using "qvalues"
genexpstats$qval_dry<-qvalue(genexpstats$dry_pval)$qvalues
genexpstats$qval_wet<-qvalue(genexpstats$wet_pval)$qvalues
sig_exp<-genexpstats[genexpstats$qval_dry<0.05 | genexpstats$qval_wet<0.05,]
cat("total number of significant genes = ", length(sig_exp[,1]), "out of", count)

total number of significant genes =  30 out of 657

#######################
##2.6 plot summaries of data
toplot_pvals<-melt(genexpstats[,c(1:4,7,10:12)], id.vars=colnames(genexpstats)[1:4])
ggplot(toplot_pvals,aes(x=value))+
  geom_histogram(binwidth=.1)+
  scale_x_log10()+
  ggtitle("Distn of P (top 2 panels) and Q (bottom 2) values")+
  facet_grid(variable~.,scales="free_x")

plot of chunk unnamed-chunk-1

#######################
##2.7 look at QTL effect in region
par(mfrow=c(length(info[,1]),1))
sim.cross<-sim.geno(cross, step=1,n.draws=256)
for (i in as.character(info[,2])){
  effectscan(sim.cross,pheno.col=i,info$chromosome[1], ylab=i,add.legend=F,
             main=paste("effect of the BB allele across chr",info$chromosome[1]))
  rect(region[1],-2,region[2],+2 ,col= '#FF003322')
}

plot of chunk unnamed-chunk-1

#######################
##2.8 output list of genes that have expression following pattern
# change effect here depending on the QTL effects
# for 5.2, there is an effect in both wet and dry environments on d13C 
# so, cull data by an FDR of 0.05 for both environments
sigexp_matcheffect<-genexpstats[genexpstats$qval_dry<0.05 & genexpstats$qval_wet<0.05,]
as.character(sigexp_matcheffect[,1])

[1] "AT5G55180" "AT5G57830" "AT5G58575" "AT5G61410"

#these are the significantly expressed genes that follow the pattern expected from the QTL analysis

#######################
## 2.9 plot the location of these genes
ggplot(info,aes(x=position,y=as.numeric(effect.t)))+
  geom_point() + 
  scale_y_continuous("t statistic for QTL point estimate")+
  geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
  annotate("text",x=as.numeric(info$position),y=as.numeric(info$effect.t)+.1,
           label=info$phenotype, hjust=0)+
  geom_vline(xintercept=as.numeric(sigexp_matcheffect$pos.cm), lty=3)+
  annotate("text",x=as.numeric(sigexp_matcheffect$pos.cm)-2,y=c(10,9.5,9,8.5),
           label=paste(as.character(sigexp_matcheffect[,1]),
                       "Wet=", round(as.numeric(sigexp_matcheffect$qval_wet),6),
                       "Dry=", round(as.numeric(sigexp_matcheffect$qval_dry),6)),hjust=0)+
  ggtitle("Names (locations are dotted lines) w/ q-values of sign. genes in region")+
  theme_bw()

plot of chunk unnamed-chunk-1

#annotation of these significant genes
#annotations[annotations$gene %in% sigexp_matcheffect$gene,]
#org.At.tair[["AT5G55180"]]
### not complete do online for now.