Lovell — Jul 25, 2014, 7:59 AM
#gene expression analysis
#############################
###Libraries:###
#############################
pkgs <- c("qtl","snow","rlecuyer","plyr","ggplot2","reshape","reshape2","qvalue","lme4","nlme","car","doBy","scales")
lapply(pkgs, require, character.only=T)
Loading required package: qtl
Loading required package: snow
Loading required package: rlecuyer
Loading required package: plyr
Loading required package: ggplot2
Loading required package: reshape
Attaching package: 'reshape'
The following objects are masked from 'package:plyr':
rename, round_any
The following object is masked from 'package:qtl':
condense
Loading required package: reshape2
Attaching package: 'reshape2'
The following objects are masked from 'package:reshape':
colsplit, melt, recast
Loading required package: qvalue
Attaching package: 'qvalue'
The following object is masked from 'package:ggplot2':
qplot
Loading required package: lme4
Warning: package 'lme4' was built under R version 3.1.1
Loading required package: Matrix
Attaching package: 'Matrix'
The following object is masked from 'package:reshape':
expand
Loading required package: Rcpp
Loading required package: nlme
Attaching package: 'nlme'
The following object is masked from 'package:lme4':
lmList
Loading required package: car
Loading required package: doBy
Loading required package: survival
Loading required package: splines
Loading required package: MASS
Loading required package: scales
[[1]]
[1] TRUE
[[2]]
[1] TRUE
[[3]]
[1] TRUE
[[4]]
[1] TRUE
[[5]]
[1] TRUE
[[6]]
[1] TRUE
[[7]]
[1] TRUE
[[8]]
[1] TRUE
[[9]]
[1] TRUE
[[10]]
[1] TRUE
[[11]]
[1] TRUE
[[12]]
[1] TRUE
[[13]]
[1] TRUE
options(warn=-1)
sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] splines stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] scales_0.2.4 doBy_4.5-10 MASS_7.3-33 survival_2.37-7
[5] car_2.0-20 nlme_3.1-117 lme4_1.1-7 Rcpp_0.11.2
[9] Matrix_1.1-4 qvalue_1.38.0 reshape2_1.4 reshape_0.8.5
[13] ggplot2_1.0.0 plyr_1.8.1 rlecuyer_0.3-3 snow_0.3-13
[17] qtl_1.32-10 knitr_1.6
loaded via a namespace (and not attached):
[1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_0.10
[5] grid_3.1.0 gtable_0.1.2 lattice_0.20-29 minqa_1.2.3
[9] munsell_0.4.2 nloptr_1.0.0 nnet_7.3-8 parallel_3.1.0
[13] proto_0.3-10 stringr_0.6.2 tcltk_3.1.0 tools_3.1.0
#############################
###Part I: Get all the data in and organized
#############################
#######################
#1.1 The qtl cross object
rm(list=ls())
setwd("~/Desktop/Drydown2014")
cross<-read.cross("csvs",dir="",
genotypes=c("a","b"),
genfile="TKrils_map55.csv",
phefile= "TKrils_full_phe.csv",
na.strings=c("NA","-", "#VALUE!","#NUM!"))
--Read the following data:
341 individuals
450 markers
38 phenotypes
--Cross type: bc
cross<-convert2riself(cross)
cross<-calc.genoprob(cross, step=1,
stepwidth="max",error.prob=0.01, map.function="kosambi")
##1.2 read in the other files
statsout<-read.csv("dd2014_allqtlstatistics.csv",header=T) #qtl stats from QTL mapping
cmgenpos<-read.csv("gene_locations_new_McKay_map.csv",header=T, na.strings="NA") #physical position of all genes
geneexp<-read.csv("May_SNP_5_4_12.csv",header=T) #gene expression of all AT genes in drydown
idnames<-read.csv("dd2014_markernametoid.csv",header=T)
colnames(idnames)[2]<-"line"
geneexp<-merge(idnames,geneexp, by="line")
physpos.gene<-cmgenpos[,c(1,4:5)]
#1.3 Process the physical positions of markers- make sure this looks good
physpos.marker<-read.csv("TKrils_Marker_PhysPos.csv",header=T)
physpos.marker<-physpos.marker[,1:3]
physpos.marker<-physpos.marker[physpos.marker$Marker %in% markernames(cross),]
markernames(cross)[!markernames(cross) %in% physpos.marker$Marker] #all markers have positions!
character(0)
#make a dataframe with physical position, name, chr, and cMpos
cross.markers<-as.data.frame(pull.map(cross,as.table=T))
cross.markers$Marker<-rownames(cross.markers)
marker.info<-merge(physpos.marker,cross.markers, by="Marker")[,-4]
#1.4 Do gene expression stats for every gene-
# make a dataframe with one row for each gene
#for each gene- calculate effect of genotype, environment and g*e
out<-data.frame()
for (i in 1:length(colnames(geneexp))){
exp.in<-geneexp[,c(1:5,(i+5))]
gene.name<-colnames(geneexp[(i+5)])
gene.bp<-physpos.gene[physpos.gene$Gene==gene.name,]
if(is.finite(gene.bp$Loc)){
marker.chr<-physpos.marker[physpos.marker$Chr==gene.bp$Chr,]
marker.name<-data.frame(marker.chr[which(abs(marker.chr$Col.Phys.Pos-gene.bp$Loc)==
min(abs(marker.chr$Col.Phys.Pos-gene.bp$Loc))),])
#add allele data of nearest marker to gene expression
geno<-pull.genoprob(cross, marker.chr$Chr[1])
geno<-data.frame(geno[,paste(as.character(marker.name[1,1]),"AA",sep=":")])
colnames(geno)<-as.character(marker.name[1,1])
geno[geno>.1 & geno <.90]<-NA
geno<-cbind(idnames,as.data.frame(ifelse(geno>=.90,"AA","BB")))
colnames(geno)[2:3]<-c("line","geno")
exp.geno<-merge(exp.in,geno,by="line"); colnames(exp.geno)[6]<-"exp"
#fit the model
test1<-aov(exp ~ geno + treatment + geno * treatment, data=exp.geno)
test2<-model.tables(test1, type="means")
fvals<-summary(test1)[[1]][["F value"]][1:3]
pvals<-summary(test1)[[1]][["Pr(>F)"]][1:3]
means<-test2$tables$"geno:treatment"[1:4]
#fit t tests w/in each environment
wet<-exp.geno[exp.geno$treatment=="wet",]
dry<-exp.geno[exp.geno$treatment=="dry",]
tdry<-t.test(exp ~ geno, data=dry)
twet<-t.test(exp ~ geno, data=wet)
out.aov<-data.frame(gene.name,as.character(marker.name[1,1]),
as.numeric(gene.bp[1,2]),as.numeric(gene.bp[1,3]),
find.markerpos(cross,as.character(marker.name[1,1]))$pos,
fvals[1],fvals[2],fvals[3],pvals[1],pvals[2],pvals[3],means[1],means[2],means[3],means[4],
tdry$estimate[1],tdry$estimate[2],tdry$p.value,
twet$estimate[1],twet$estimate[2],twet$p.value)
names(out.aov)<-c("gene.id","marker.id","chromosome","pos.bp","pos.cm","f.geno","f.trt","f.gxe",
"p.geno","p.trt","p.gxe","AA.dry","BB.dry","AA.wet","BB.wet",
"dry_estAA","dry_estBB","dry_pval","wet_estAA","wet_estBB","wet_pval")
rownames(out.aov)<-i-5
out<-rbind(out,out.aov)
#cat(i,".")
}
}
Error: undefined columns selected
#1.5 calculate qvalues- determine significance of effect and add to main stats dataframe
all.anova<-out
qvals<-data.frame(rownames(all.anova))
for (i in c("p.geno","p.trt","p.gxe","dry_pval","wet_pval")){
q.out<-as.numeric(qvalue(all.anova[,i])$qvalue)
hist(q.out, main=i)
qvals<-cbind(qvals,as.data.frame(q.out))
}
qvals<-qvals[,-1];colnames(qvals)<-c("sig.geno","sig.trt","sig.gxe","dry_pval","wet_pval")
all.anova<-cbind(all.anova,qvals)
write.csv(out, file="dd2014_allgenes_geneexpressionstats.csv",row.names=F)
#1.6 Plot the genomic distribution of the gene expressions stats
hist(all.anova$p.geno)
# a function to do reverse log scale- needed for pvalue plotting
reverselog_trans <- function(base = exp(1)) {
trans <- function(y) -log(y, base)
inv <- function(y) base^(-y)
trans_new(paste0("reverselog-", format(base)), trans, inv,
log_breaks(base = base),
domain = c(1e-100, Inf))
}
sig.geno.fac<- transform(all.anova, cut=cut(sig.geno, breaks=c(0,.05,1)))$cut
sig.trt.fac<- transform(all.anova, cut=cut(sig.trt, breaks=c(0,.05,1)))$cut
sig.gxe.fac<- transform(all.anova, cut=cut(sig.gxe, breaks=c(0,.05,1)))$cut
dry_pval.fac<- transform(all.anova, cut=cut(dry_pval, breaks=c(0,.05,1)))$cut
wet_pval.fac<- transform(all.anova, cut=cut(wet_pval, breaks=c(0,.05,1)))$cut
all.anova<-cbind(all.anova,sig.geno.fac,sig.trt.fac,sig.gxe.fac,dry_pval.fac,wet_pval.fac)
#effect of allele
#threshold is at alpha=0.05 (via Q-values)
ggplot(all.anova,aes(y=p.geno,x=pos.bp,color=sig.geno.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect (typeIII)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~., scales="free")
#effect of env
ggplot(all.anova,aes(y=p.trt,x=pos.bp,color=sig.trt.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of environment effect (typeIII)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~., scales="free")
#effect of g*e
ggplot(all.anova,aes(y=p.gxe,x=pos.bp,color=sig.gxe.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of GxE effect (typeIII)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~., scales="free")
ggplot(all.anova,aes(y=dry_pval,x=pos.bp,color=dry_pval.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect in dry (t.test)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~., scales="free")
ggplot(all.anova,aes(y=wet_pval,x=pos.bp,color=wet_pval.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of genotype effect in wet (t.test)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~., scales="free")
head(all.anova)
gene.id marker.id chromosome pos.bp pos.cm f.geno f.trt f.gxe
-4 AT1G01010 C1_103654 1 3631 0.186 3.6251 6.63512 7.83503
-3 AT1G01020 C1_103654 1 5928 0.186 4.2390 1.87075 0.34134
-2 AT1G01030 C1_103654 1 11649 0.186 2.1099 5.92443 1.01925
-1 AT1G01040 C1_103654 1 23146 0.186 0.0727 1.53420 0.01366
0 AT1G01050 C1_103654 1 31170 0.186 0.3614 0.03060 0.03141
1 AT1G01060 C1_103654 1 33666 0.186 4.9961 0.06019 0.60216
p.geno p.trt p.gxe AA.dry BB.dry AA.wet BB.wet dry_estAA
-4 0.05836 0.01073 0.005631 7.240 7.219 7.245 7.354 7.240
-3 0.04081 0.17294 0.559722 7.198 7.139 7.217 7.184 7.198
-2 0.14793 0.01582 0.313930 7.072 7.119 7.134 7.142 7.072
-1 0.78772 0.21695 0.907088 7.624 7.619 7.606 7.604 7.624
0 0.54844 0.86131 0.859505 7.469 7.497 7.480 7.495 7.469
1 0.02652 0.80645 0.438682 7.438 7.551 7.407 7.641 7.438
dry_estBB dry_pval wet_estAA wet_estBB wet_pval sig.geno sig.trt
-4 7.219 0.49431 7.245 7.354 0.003923 0.2334 0.01946
-3 7.139 0.04312 7.217 7.184 0.352691 0.1902 0.19130
-2 7.119 0.11033 7.134 7.142 0.733678 0.3789 0.02728
-1 7.619 0.79055 7.606 7.604 0.908446 0.7274 0.22526
0 7.497 0.55787 7.480 7.495 0.763103 0.6489 0.51776
1 7.551 0.27156 7.407 7.641 0.054034 0.1481 0.50217
sig.gxe dry_pval wet_pval sig.geno.fac sig.trt.fac sig.gxe.fac
-4 0.6680 0.7342 0.08107 (0.05,1] (0,0.05] (0.05,1]
-3 0.9997 0.3092 0.67949 (0.05,1] (0.05,1] (0.05,1]
-2 0.9997 0.4637 0.79762 (0.05,1] (0,0.05] (0.05,1]
-1 0.9997 0.8065 0.83166 (0.05,1] (0.05,1] (0.05,1]
0 0.9997 0.7545 0.80225 (0.05,1] (0.05,1] (0.05,1]
1 0.9997 0.6282 0.35175 (0.05,1] (0.05,1] (0.05,1]
dry_pval.fac wet_pval.fac
-4 (0.05,1] (0,0.05]
-3 (0,0.05] (0.05,1]
-2 (0.05,1] (0.05,1]
-1 (0.05,1] (0.05,1]
0 (0.05,1] (0.05,1]
1 (0.05,1] (0.05,1]
#this plots the relationship between the physical position of each gene
# with the marker that has been selected to use for local allele imputation.
ggplot(all.anova, aes(y=pos.bp, x=pos.cm))+
geom_point()+theme_bw()+ggtitle("physical gene position to cM marker position")+
facet_grid(chromosome~.)
ggplot(cmgenpos, aes(y=Loc, x=cM))+
geom_point()+theme_bw()+
facet_grid(Chr~.)