dd2014_completegeneexpression.R

Lovell — Jul 25, 2014, 7:59 AM

#gene expression analysis
#############################
###Libraries:###
#############################
pkgs <- c("qtl","snow","rlecuyer","plyr","ggplot2","reshape","reshape2","qvalue","lme4","nlme","car","doBy","scales")
lapply(pkgs, require, character.only=T)
Loading required package: qtl
Loading required package: snow
Loading required package: rlecuyer
Loading required package: plyr
Loading required package: ggplot2
Loading required package: reshape

Attaching package: 'reshape'

The following objects are masked from 'package:plyr':

    rename, round_any

The following object is masked from 'package:qtl':

    condense

Loading required package: reshape2

Attaching package: 'reshape2'

The following objects are masked from 'package:reshape':

    colsplit, melt, recast

Loading required package: qvalue

Attaching package: 'qvalue'

The following object is masked from 'package:ggplot2':

    qplot

Loading required package: lme4
Warning: package 'lme4' was built under R version 3.1.1
Loading required package: Matrix

Attaching package: 'Matrix'

The following object is masked from 'package:reshape':

    expand

Loading required package: Rcpp
Loading required package: nlme

Attaching package: 'nlme'

The following object is masked from 'package:lme4':

    lmList

Loading required package: car
Loading required package: doBy
Loading required package: survival
Loading required package: splines
Loading required package: MASS
Loading required package: scales
[[1]]
[1] TRUE

[[2]]
[1] TRUE

[[3]]
[1] TRUE

[[4]]
[1] TRUE

[[5]]
[1] TRUE

[[6]]
[1] TRUE

[[7]]
[1] TRUE

[[8]]
[1] TRUE

[[9]]
[1] TRUE

[[10]]
[1] TRUE

[[11]]
[1] TRUE

[[12]]
[1] TRUE

[[13]]
[1] TRUE
options(warn=-1)
sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] splines   stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scales_0.2.4    doBy_4.5-10     MASS_7.3-33     survival_2.37-7
 [5] car_2.0-20      nlme_3.1-117    lme4_1.1-7      Rcpp_0.11.2    
 [9] Matrix_1.1-4    qvalue_1.38.0   reshape2_1.4    reshape_0.8.5  
[13] ggplot2_1.0.0   plyr_1.8.1      rlecuyer_0.3-3  snow_0.3-13    
[17] qtl_1.32-10     knitr_1.6      

loaded via a namespace (and not attached):
 [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_0.10    
 [5] grid_3.1.0       gtable_0.1.2     lattice_0.20-29  minqa_1.2.3     
 [9] munsell_0.4.2    nloptr_1.0.0     nnet_7.3-8       parallel_3.1.0  
[13] proto_0.3-10     stringr_0.6.2    tcltk_3.1.0      tools_3.1.0     
#############################
###Part I: Get all the data in and organized
#############################

#######################
#1.1 The qtl cross object
rm(list=ls())
setwd("~/Desktop/Drydown2014")
cross<-read.cross("csvs",dir="",
                  genotypes=c("a","b"),
                  genfile="TKrils_map55.csv",
                  phefile= "TKrils_full_phe.csv",
                  na.strings=c("NA","-", "#VALUE!","#NUM!"))
 --Read the following data:
     341  individuals
     450  markers
     38  phenotypes
 --Cross type: bc 
cross<-convert2riself(cross)
cross<-calc.genoprob(cross, step=1, 
                     stepwidth="max",error.prob=0.01, map.function="kosambi")

##1.2 read in the other files
statsout<-read.csv("dd2014_allqtlstatistics.csv",header=T) #qtl stats from QTL mapping
cmgenpos<-read.csv("gene_locations_new_McKay_map.csv",header=T, na.strings="NA") #physical position of all genes
geneexp<-read.csv("May_SNP_5_4_12.csv",header=T) #gene expression of all AT genes in drydown
idnames<-read.csv("dd2014_markernametoid.csv",header=T)
colnames(idnames)[2]<-"line"
geneexp<-merge(idnames,geneexp, by="line")
physpos.gene<-cmgenpos[,c(1,4:5)]

#1.3 Process the physical positions of markers- make sure this looks good
physpos.marker<-read.csv("TKrils_Marker_PhysPos.csv",header=T)
physpos.marker<-physpos.marker[,1:3]
physpos.marker<-physpos.marker[physpos.marker$Marker %in% markernames(cross),]
markernames(cross)[!markernames(cross) %in% physpos.marker$Marker] #all markers have positions!
character(0)
#make a dataframe with physical position, name, chr, and cMpos
cross.markers<-as.data.frame(pull.map(cross,as.table=T))
cross.markers$Marker<-rownames(cross.markers)
marker.info<-merge(physpos.marker,cross.markers, by="Marker")[,-4]

#1.4 Do gene expression stats for every gene-
# make a dataframe with one row for each gene
#for each gene- calculate effect of genotype, environment and g*e
out<-data.frame()
for (i in 1:length(colnames(geneexp))){
  exp.in<-geneexp[,c(1:5,(i+5))]
  gene.name<-colnames(geneexp[(i+5)])
  gene.bp<-physpos.gene[physpos.gene$Gene==gene.name,]
  if(is.finite(gene.bp$Loc)){
    marker.chr<-physpos.marker[physpos.marker$Chr==gene.bp$Chr,]
    marker.name<-data.frame(marker.chr[which(abs(marker.chr$Col.Phys.Pos-gene.bp$Loc)==
                                               min(abs(marker.chr$Col.Phys.Pos-gene.bp$Loc))),])
    #add allele data of nearest marker to gene expression
    geno<-pull.genoprob(cross, marker.chr$Chr[1])
    geno<-data.frame(geno[,paste(as.character(marker.name[1,1]),"AA",sep=":")])
    colnames(geno)<-as.character(marker.name[1,1])
    geno[geno>.1 & geno <.90]<-NA
    geno<-cbind(idnames,as.data.frame(ifelse(geno>=.90,"AA","BB")))
    colnames(geno)[2:3]<-c("line","geno")
    exp.geno<-merge(exp.in,geno,by="line"); colnames(exp.geno)[6]<-"exp"
    #fit the model
    test1<-aov(exp ~ geno + treatment + geno * treatment, data=exp.geno)
    test2<-model.tables(test1, type="means")
    fvals<-summary(test1)[[1]][["F value"]][1:3]
    pvals<-summary(test1)[[1]][["Pr(>F)"]][1:3]
    means<-test2$tables$"geno:treatment"[1:4]
    #fit t tests w/in each environment
    wet<-exp.geno[exp.geno$treatment=="wet",]
    dry<-exp.geno[exp.geno$treatment=="dry",]
    tdry<-t.test(exp ~ geno, data=dry)
    twet<-t.test(exp ~ geno, data=wet)    
    out.aov<-data.frame(gene.name,as.character(marker.name[1,1]),
                        as.numeric(gene.bp[1,2]),as.numeric(gene.bp[1,3]),
                        find.markerpos(cross,as.character(marker.name[1,1]))$pos,
                        fvals[1],fvals[2],fvals[3],pvals[1],pvals[2],pvals[3],means[1],means[2],means[3],means[4],
                        tdry$estimate[1],tdry$estimate[2],tdry$p.value,
                        twet$estimate[1],twet$estimate[2],twet$p.value)
    names(out.aov)<-c("gene.id","marker.id","chromosome","pos.bp","pos.cm","f.geno","f.trt","f.gxe",
                      "p.geno","p.trt","p.gxe","AA.dry","BB.dry","AA.wet","BB.wet",
                      "dry_estAA","dry_estBB","dry_pval","wet_estAA","wet_estBB","wet_pval")
    rownames(out.aov)<-i-5
    out<-rbind(out,out.aov)
    #cat(i,".")
  }
}
Error: undefined columns selected
#1.5 calculate qvalues- determine significance of effect and add to main stats dataframe
all.anova<-out
qvals<-data.frame(rownames(all.anova))
for (i in c("p.geno","p.trt","p.gxe","dry_pval","wet_pval")){
  q.out<-as.numeric(qvalue(all.anova[,i])$qvalue)
  hist(q.out, main=i)
  qvals<-cbind(qvals,as.data.frame(q.out))
}

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

qvals<-qvals[,-1];colnames(qvals)<-c("sig.geno","sig.trt","sig.gxe","dry_pval","wet_pval")
all.anova<-cbind(all.anova,qvals)
write.csv(out, file="dd2014_allgenes_geneexpressionstats.csv",row.names=F)

#1.6 Plot the genomic distribution of the gene expressions stats
hist(all.anova$p.geno)

plot of chunk unnamed-chunk-1

# a function to do reverse log scale- needed for pvalue plotting
reverselog_trans <- function(base = exp(1)) {
  trans <- function(y) -log(y, base)
  inv <- function(y) base^(-y)
  trans_new(paste0("reverselog-", format(base)), trans, inv, 
            log_breaks(base = base), 
            domain = c(1e-100, Inf))
}

sig.geno.fac<- transform(all.anova, cut=cut(sig.geno, breaks=c(0,.05,1)))$cut
sig.trt.fac<- transform(all.anova, cut=cut(sig.trt, breaks=c(0,.05,1)))$cut
sig.gxe.fac<- transform(all.anova, cut=cut(sig.gxe, breaks=c(0,.05,1)))$cut
dry_pval.fac<- transform(all.anova, cut=cut(dry_pval, breaks=c(0,.05,1)))$cut
wet_pval.fac<- transform(all.anova, cut=cut(wet_pval, breaks=c(0,.05,1)))$cut

all.anova<-cbind(all.anova,sig.geno.fac,sig.trt.fac,sig.gxe.fac,dry_pval.fac,wet_pval.fac)
#effect of allele
#threshold is at alpha=0.05 (via Q-values)
ggplot(all.anova,aes(y=p.geno,x=pos.bp,color=sig.geno.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect (typeIII)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~., scales="free")

plot of chunk unnamed-chunk-1

#effect of env
ggplot(all.anova,aes(y=p.trt,x=pos.bp,color=sig.trt.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of environment effect (typeIII)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~., scales="free")

plot of chunk unnamed-chunk-1

#effect of g*e
ggplot(all.anova,aes(y=p.gxe,x=pos.bp,color=sig.gxe.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of GxE effect (typeIII)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~., scales="free")

plot of chunk unnamed-chunk-1

ggplot(all.anova,aes(y=dry_pval,x=pos.bp,color=dry_pval.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect in dry (t.test)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~., scales="free")

plot of chunk unnamed-chunk-1

ggplot(all.anova,aes(y=wet_pval,x=pos.bp,color=wet_pval.fac))+
  geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect in wet (t.test)")+
  scale_y_continuous(trans=reverselog_trans(10))+
  facet_grid(chromosome~., scales="free")

plot of chunk unnamed-chunk-1

head(all.anova)
     gene.id marker.id chromosome pos.bp pos.cm f.geno   f.trt   f.gxe
-4 AT1G01010 C1_103654          1   3631  0.186 3.6251 6.63512 7.83503
-3 AT1G01020 C1_103654          1   5928  0.186 4.2390 1.87075 0.34134
-2 AT1G01030 C1_103654          1  11649  0.186 2.1099 5.92443 1.01925
-1 AT1G01040 C1_103654          1  23146  0.186 0.0727 1.53420 0.01366
0  AT1G01050 C1_103654          1  31170  0.186 0.3614 0.03060 0.03141
1  AT1G01060 C1_103654          1  33666  0.186 4.9961 0.06019 0.60216
    p.geno   p.trt    p.gxe AA.dry BB.dry AA.wet BB.wet dry_estAA
-4 0.05836 0.01073 0.005631  7.240  7.219  7.245  7.354     7.240
-3 0.04081 0.17294 0.559722  7.198  7.139  7.217  7.184     7.198
-2 0.14793 0.01582 0.313930  7.072  7.119  7.134  7.142     7.072
-1 0.78772 0.21695 0.907088  7.624  7.619  7.606  7.604     7.624
0  0.54844 0.86131 0.859505  7.469  7.497  7.480  7.495     7.469
1  0.02652 0.80645 0.438682  7.438  7.551  7.407  7.641     7.438
   dry_estBB dry_pval wet_estAA wet_estBB wet_pval sig.geno sig.trt
-4     7.219  0.49431     7.245     7.354 0.003923   0.2334 0.01946
-3     7.139  0.04312     7.217     7.184 0.352691   0.1902 0.19130
-2     7.119  0.11033     7.134     7.142 0.733678   0.3789 0.02728
-1     7.619  0.79055     7.606     7.604 0.908446   0.7274 0.22526
0      7.497  0.55787     7.480     7.495 0.763103   0.6489 0.51776
1      7.551  0.27156     7.407     7.641 0.054034   0.1481 0.50217
   sig.gxe dry_pval wet_pval sig.geno.fac sig.trt.fac sig.gxe.fac
-4  0.6680   0.7342  0.08107     (0.05,1]    (0,0.05]    (0.05,1]
-3  0.9997   0.3092  0.67949     (0.05,1]    (0.05,1]    (0.05,1]
-2  0.9997   0.4637  0.79762     (0.05,1]    (0,0.05]    (0.05,1]
-1  0.9997   0.8065  0.83166     (0.05,1]    (0.05,1]    (0.05,1]
0   0.9997   0.7545  0.80225     (0.05,1]    (0.05,1]    (0.05,1]
1   0.9997   0.6282  0.35175     (0.05,1]    (0.05,1]    (0.05,1]
   dry_pval.fac wet_pval.fac
-4     (0.05,1]     (0,0.05]
-3     (0,0.05]     (0.05,1]
-2     (0.05,1]     (0.05,1]
-1     (0.05,1]     (0.05,1]
0      (0.05,1]     (0.05,1]
1      (0.05,1]     (0.05,1]
#this plots the relationship between the physical position of each gene
# with the marker that has been selected to use for local allele imputation.
ggplot(all.anova, aes(y=pos.bp, x=pos.cm))+
  geom_point()+theme_bw()+ggtitle("physical gene position to cM marker position")+
  facet_grid(chromosome~.)

plot of chunk unnamed-chunk-1

ggplot(cmgenpos, aes(y=Loc, x=cM))+
  geom_point()+theme_bw()+
  facet_grid(Chr~.)

plot of chunk unnamed-chunk-1