Lovell — Jul 30, 2014, 5:58 PM
####################################################
### Title: Complete analysis for drydown manuscript
### Author: JTLovell
### Date Created: 29-July 2014
####################################################
###Outline:###
####################################################
# A1: set up the environment
# Part 1 (QTL analysis w/out covariates):
# 1.1: Load permutations
# 1.2: Load Cross
# 1.3: Plot map, information and other diagnostics
# 1.4: Load list of phenotypes w/ qtls
# 1.5: Plot correlations and nj tree of phenotypes
# 1.6: Calculate Penalties (and concatenate from different files)
# 1.7: Run stepwiseQTL, output results and save results
# 1.8: Make lod profile plots
# 1.9: Make qtl position plots
# Part 2 QTL fine mapping w/ gene expression
# 2.1: Load gene expression statistics
# 2.1a: plotting of genomic distribution of expression
# 2.1b: determination of GxE genes
# 2.2: Load other files needed for analysis
# 2.3: Calculate intervals (2q, 4p, 4q, 5q)
# 2.4: Plot intervals (and gene-cM correlations)
# 2.5: Extract genes w/ significant expression differences (from ANOVA)
# 2.6: Run covariate scans w/ significant genes
# 2.7: Plot covariate scans
# 2.8: Summarize covariate analysis
# 2.9: Extract go terms from significant genes
# 2.10: Summarize candidate genes
# Part 3: Effect of cytoplasm
# 3.1: T.tests and varcomp analysis of cytoplasm
# 3.2: Plots and summary of cytoplasm stats
# 3.3: Bioinformatics of DNA sequence polymorphism in cytoplasm
# 3.4: Determination of candidate genes in cytoplasm
# 3.5: Cytoplasm main effect in QTLs (oneways)
# 3.6: Cytoplasm main effect in multi-QTL models
# 3.7: Cytoplasm interactive effects in QTLs (oneway)- compare w/ 3.3
# 3.8: Interaction plots of QTL*cyto and cyto main effect at proline
# 3.8: Bioinformatics of potential QTL-by-cytoplasm interactions
# Part 4: Epistasis
# 4.1: Extract epistasis components from stepwiseqtl
# 4.2: Interaction plots of epistasis
# 4.3: Scantwo of epistasis
# 4.4: fitqtl of epistasis
# Part 5: QTL*environment interactions
# 5.1: work on later...
####################################################
###Libraries:###
####################################################
##############################################
#A1 Set up the environment
rm(list=ls())
options(warn=-1)
pkg <- c("biomaRt","qtl","snow","rlecuyer","plyr","ggplot2","reshape","reshape2","qvalue","lme4","nlme","car","doBy","scales")
invisible(lapply(pkg, function(x) {cat(x,"..."); library(x, character.only=T, verbose=F, warn.conflicts=F,quietly=T)} ))
biomaRt ...qtl ...snow ...rlecuyer ...plyr ...ggplot2 ...reshape ...reshape2 ...qvalue ...lme4 ...
Attaching package: 'Matrix'
The following object is masked from 'package:reshape':
expand
nlme ...car ...doBy ...
Loading required package: splines
Attaching package: 'MASS'
The following object is masked from 'package:biomaRt':
select
scales ...
sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] splines stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] scales_0.2.4 doBy_4.5-10 MASS_7.3-33 survival_2.37-7
[5] car_2.0-20 nlme_3.1-117 lme4_1.1-7 Rcpp_0.11.2
[9] Matrix_1.1-4 qvalue_1.38.0 reshape2_1.4 reshape_0.8.5
[13] ggplot2_1.0.0 plyr_1.8.1 rlecuyer_0.3-3 snow_0.3-13
[17] qtl_1.32-10 biomaRt_2.20.0 knitr_1.6
loaded via a namespace (and not attached):
[1] AnnotationDbi_1.26.0 Biobase_2.24.0 BiocGenerics_0.10.0
[4] colorspace_1.2-4 DBI_0.2-7 digest_0.6.4
[7] evaluate_0.5.5 formatR_0.10 GenomeInfoDb_1.0.2
[10] grid_3.1.0 gtable_0.1.2 IRanges_1.22.10
[13] lattice_0.20-29 minqa_1.2.3 munsell_0.4.2
[16] nloptr_1.0.0 nnet_7.3-8 parallel_3.1.0
[19] proto_0.3-10 RCurl_1.95-4.1 RSQLite_0.11.4
[22] stats4_3.1.0 stringr_0.6.2 tcltk_3.1.0
[25] tools_3.1.0 XML_3.98-1.1
setwd("~/Desktop/Drydown2014")
##############################################
##############################################
#Part 1 (QTL analysis w/out covariates):
##############################################
##############################################
##############################################
#1.1: Load permutations
load("dd2014_allqtldata.RData")
##############################################
#1.2: Load Cross
cross<-read.cross("csvs",dir="", genotypes=c("a","b"),
genfile="TKrils_map55.csv", phefile= "TKrils_full_phe.csv",
na.strings=c("NA","-", "#VALUE!","#NUM!"))
--Read the following data:
341 individuals
450 markers
38 phenotypes
--Cross type: bc
cross.2008<-read.cross("csvs",dir="", genotypes=c("A","H"),
genfile="McKay_revised_map_1_27_12.csv",phefile= "TKrils_full_phe.csv",
na.strings=c("NA","-", "#VALUE!","#NUM!"))
--Read the following data:
341 individuals
168 markers
38 phenotypes
--Cross type: bc
cross.2008<-convert2riself(cross.2008)
cross<-convert2riself(cross)
cross<-calc.genoprob(cross, step=1, stepwidth="max",error.prob=0.01, map.function="kosambi")
##############################################
#1.3: Plot map, information and other diagnostics
par(mfrow=c(1,1))
plot.map(cross.2008, main="2008 map")
plot.map(cross, main="new map")
plotInfo(cross.2008, main="missing information, 2008 map",col="red",lty=2)
plotInfo(cross, main="missing information, new map")
summary(cross)
RI strains via selfing
No. individuals: 341
No. phenotypes: 38
Percent phenotyped: 78.7
No. chromosomes: 5
Autosomes: 1 2 3 4 5
Total markers: 450
No. markers: 103 80 82 74 111
Percent genotyped: 70.4
Genotypes (%): AA:51.2 BB:48.8
##############################################
#1.4: Load list of phenotypes w/ qtls
qtlphes<-c("ABAlyoph.dry", "C13.dry","C13.wet","FT.June","GR.dry","RMR.dry",
"Rel.GR.wet","Rolled.dry","Wilted.dry","g0","proline.dry")
##############################################
#1.5: Plot correlations and nj tree of phenotypes
pheno.matrix<-as.data.frame(cross$phe)
maineffect.phes<-list()
for (i in colnames(pheno.matrix)) maineffect.phes[[i]]<-ifelse(strsplit(i,"[.]")[[1]][2]=="diff", "diff","main")
maineffect.phes<-data.frame(unlist(maineffect.phes));maineffect.phes$id<-rownames(maineffect.phes)
maineffect.phes<-maineffect.phes[complete.cases(maineffect.phes),]
maineffect.phes<-maineffect.phes$id[maineffect.phes$unlist.maineffect.phes.=="main"]
pheno.matrix.main<-pheno.matrix[,maineffect.phes]
test<-as.matrix(pheno.matrix.main[,-1])
heatmap(cor(pheno.matrix.main[,-1], use="pairwise"), na.rm=T) #can make this look better
#print(cor(pheno.matrix.main[,-1], use="pairwise"))
##############################################
#1.6: Calculate Penalties (and concatenate from different files)
pens<-calc.penalties(perms.raw, alpha=0.05)
Wilted.dry<-calc.penalties(perms.wilted, alpha=0.05)
pens<-rbind(pens, Wilted.dry)
##############################################
#1.7: Run stepwiseQTL, output results and save results
stats_out<-data.frame(phenotype=character(),chromosome=numeric(),position=numeric(),
df=numeric(),type3SS=numeric(),LOD=numeric(),perc.var=numeric(),Fstat=numeric(),P.chi2=numeric(),P.F=numeric(),
effect.estimate=numeric(),effect.SE=numeric(),effect.t=numeric(),
lowCImarker=character(), hiCImarker=character(),
lowCIpos=character(), hiCIpos=character())
stats.all_out<-data.frame(phenotype=character(),df=numeric(),type3SS=numeric(),LOD=numeric(),perc.var=numeric(),Fstat=numeric(),P.chi2=numeric(),P.F=numeric(),
effect.estimate=numeric(),effect.SE=numeric(),effect.t=numeric())
statcols<-names(stats_out);statcolsall<-names(stats.all_out);par(mfrow=c(1,1))
model.out<-list()
par(mfrow=c(4,1),mar = c(2,4,2,1))
for (i in qtlphes){
stepout<-stepwiseqtl(grid,pheno.col=i, penalties=pens[i,], max.qtl=6,method="hk",
refine.locations=T,additive.only=F, keeplodprofile=T,keeptrace=T,verbose=F)
model.out[[i]]<-stepout
plotLodProfile(stepout,main=paste(i,"formula: ", formula(stepout)))
fit<-fitqtl(grid,pheno.col=i,qtl=stepout,formula=formula(stepout),get.ests=T,dropone=T)
nqtls<-nqtl(stepout)
nterms<-sum(countqtlterms(formula(stepout))[c(1,4)])
if(nqtls>1){
ests_out<-summary(fit)$ests[2:(nqtls+1),1:3]
cis<-data.frame()
for (j in 1:nqtls){
ciout<-bayesint(stepout,qtl.index=j, expandtomarkers=T, prob=.9)
lowmarker<-rownames(ciout)[1]; highmarker<-rownames(ciout)[3]
lowposition<-ciout[1,2]; highposition<-ciout[3,2]
cis<-rbind(cis,cbind(lowmarker,highmarker,lowposition,highposition))
}
stats<-data.frame(rep(i,nqtls),stepout$chr,stepout$pos,fit$result.drop[1:nqtls,],ests_out,cis)
stats.all<-data.frame(rep(i,nterms),fit$result.drop,summary(fit)$ests[-1,])
colnames(stats)<-statcols
colnames(stats.all)<-statcolsall
}
else{
cis<-data.frame()
for (j in 1:nqtls){
ciout<-bayesint(stepout,qtl.index=j, expandtomarkers=T, prob=.9)
lowmarker<-rownames(ciout)[1]; highmarker<-rownames(ciout)[3]
lowposition<-ciout[1,2]; highposition<-ciout[3,2]
cis<-rbind(cis,cbind(lowmarker,highmarker,lowposition,highposition))
}
ests_out<-summary(fit)$ests[2:(nqtls+1),1:3]
stats<-data.frame(c(i,stepout$chr[1],stepout$pos[1],as.numeric(fit$result.full[1,c(1,2,4,5,3,6,7)]),as.numeric(ests_out),cis[1,]))
colnames(stats)<-statcols; rownames(stats)<-stepout$name
stats.all<-stats[c(1,4:13)]
}
stats_out<-rbind(stats_out,stats)
stats.all_out<-rbind(stats.all_out,stats.all)
}
par(mfrow=c(4,1),mar = c(2,4,2,1))
write.csv(stats_out, file="dd2014_allqtlstatistics_nocovar.csv")
write.csv(stats.all_out, file="dd2014_allqtlstatistics_nocovar_wepistasis.csv")
##############################################
#1.8: Make lod profile plots
test<-model.out[["FT.June"]]
##############################################
#1.9: Make qtl position plots
statsout<-read.csv("dd2014_allqtlstatistics_nocovar.csv",header=T)
#function to plot pvals
reverselog_trans <- function(base = exp(1)) {
trans <- function(y) -log(y, base); inv <- function(y) base^(-y)
trans_new(paste0("reverselog-", format(base)), trans, inv,
log_breaks(base = base), domain = c(1e-100, Inf))}
ggplot(statsout,aes(x=position,y=as.numeric(effect.t)))+
geom_point() + scale_y_continuous("t statistic for QTL point estimate")+
geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
facet_grid(phenotype~chromosome, scales="free_y", margins=F)+
theme_bw()+ ggtitle("distribution of QTL positions and effect sizes")
ggplot(statsout,aes(x=position,y=P.F))+
geom_point() +
geom_errorbarh(aes(xmax = hiCIpos, xmin = lowCIpos, height = 0))+
facet_grid(.~chromosome, scales="free_y", margins=F)+
scale_y_continuous(trans=reverselog_trans(10),"p.value of F statistic")+
theme_bw()+ ggtitle("distribution of QTL positions and effect sizes")
ggplot(statsout,aes(x=position))+
geom_histogram(binwidth=5) +
facet_grid(.~chromosome, scales="free_y", margins=F)+
theme_bw()+ ggtitle("distribution and density of QTL point estimates")
##############################################
##############################################
#Part 2 QTL fine mapping w/ gene expression
##############################################
##############################################
##############################################
#2.1: Load gene expression statistics, generate qvalue statistics and process
out<-read.csv("dd2014_allgenes_geneexpressionstats.csv", header=T)
all.anova<-out
qvals<-data.frame(rownames(all.anova))
par(mfrow=c(5,1),mar = c(2,4,2,1))
for (i in c("p.geno","p.trt","p.gxe","dry_pval","wet_pval")){
q.out<-as.numeric(qvalue(all.anova[,i])$qvalue)
hist(q.out, main=paste("qvalue statistics calculated from", i))
qvals<-cbind(qvals,as.data.frame(q.out))
}
qvals<-qvals[,-1];colnames(qvals)<-c("sig.geno","sig.trt","sig.gxe","dry_qval","wet_qval")
all.anova<-cbind(all.anova,qvals)
summary(all.anova)
gene.id marker.id chromosome pos.bp
AT1G01010: 1 C3_3623963 : 311 Min. :1.00 Min. : 1251
AT1G01020: 1 C3_4648688 : 225 1st Qu.:1.00 1st Qu.: 5822841
AT1G01030: 1 C3_2497708 : 217 Median :3.00 Median :11920794
AT1G01040: 1 C3_19161491: 205 Mean :2.94 Mean :12591720
AT1G01050: 1 2-15986145 : 193 3rd Qu.:4.00 3rd Qu.:18724714
AT1G01060: 1 C4_9316828 : 189 Max. :5.00 Max. :30426839
(Other) :25618 (Other) :24284
pos.cm f.geno f.trt f.gxe
Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
1st Qu.: 22.1 1st Qu.: 0.2 1st Qu.: 0.3 1st Qu.: 0.09
Median : 51.5 Median : 0.8 Median : 1.9 Median : 0.42
Mean : 48.7 Mean : 3.5 Mean : 14.6 Mean : 1.01
3rd Qu.: 71.7 3rd Qu.: 2.7 3rd Qu.: 10.0 3rd Qu.: 1.26
Max. :111.1 Max. :882.7 Max. :2003.5 Max. :64.02
p.geno p.trt p.gxe AA.dry
Min. :0.000 Min. :0.0000 Min. :0.000 Min. : 6.68
1st Qu.:0.104 1st Qu.:0.0018 1st Qu.:0.264 1st Qu.: 6.99
Median :0.366 Median :0.1713 Median :0.519 Median : 7.14
Mean :0.403 Mean :0.2989 Mean :0.511 Mean : 7.26
3rd Qu.:0.677 3rd Qu.:0.5598 3rd Qu.:0.759 3rd Qu.: 7.39
Max. :1.000 Max. :0.9999 Max. :1.000 Max. :12.75
BB.dry AA.wet BB.wet dry_estAA
Min. : 6.65 Min. : 6.68 Min. : 6.67 Min. : 6.68
1st Qu.: 6.99 1st Qu.: 6.99 1st Qu.: 6.99 1st Qu.: 6.99
Median : 7.15 Median : 7.14 Median : 7.14 Median : 7.14
Mean : 7.26 Mean : 7.26 Mean : 7.26 Mean : 7.26
3rd Qu.: 7.39 3rd Qu.: 7.39 3rd Qu.: 7.39 3rd Qu.: 7.39
Max. :12.53 Max. :12.36 Max. :12.36 Max. :12.75
dry_estBB dry_pval wet_estAA wet_estBB
Min. : 6.65 Min. :0.000 Min. : 6.68 Min. : 6.67
1st Qu.: 6.99 1st Qu.:0.158 1st Qu.: 6.99 1st Qu.: 6.99
Median : 7.15 Median :0.420 Median : 7.14 Median : 7.14
Mean : 7.26 Mean :0.440 Mean : 7.26 Mean : 7.26
3rd Qu.: 7.39 3rd Qu.:0.705 3rd Qu.: 7.39 3rd Qu.: 7.39
Max. :12.53 Max. :1.000 Max. :12.36 Max. :12.36
wet_pval sig.geno sig.trt sig.gxe
Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.:0.162 1st Qu.:0.322 1st Qu.:0.004 1st Qu.:1.000
Median :0.422 Median :0.565 Median :0.190 Median :1.000
Mean :0.440 Mean :0.489 Mean :0.219 Mean :0.991
3rd Qu.:0.704 3rd Qu.:0.697 3rd Qu.:0.414 3rd Qu.:1.000
Max. :1.000 Max. :0.772 Max. :0.555 Max. :1.000
dry_qval wet_qval
Min. :0.000 Min. :0.000
1st Qu.:0.529 1st Qu.:0.546
Median :0.705 Median :0.712
Mean :0.623 Mean :0.630
3rd Qu.:0.788 3rd Qu.:0.791
Max. :0.839 Max. :0.843
##############################################
#2.1a: make plots of genomic distribution of pvalues
#set significance (based on qvalues) as a threshold variable
sig.geno.fac<- transform(all.anova, cut=cut(sig.geno, breaks=c(0,.05,1)))$cut
sig.trt.fac<- transform(all.anova, cut=cut(sig.trt, breaks=c(0,.05,1)))$cut
sig.gxe.fac<- transform(all.anova, cut=cut(sig.gxe, breaks=c(0,.05,1)))$cut
dry_pval.fac<- transform(all.anova, cut=cut(dry_pval, breaks=c(0,.05,1)))$cut
wet_pval.fac<- transform(all.anova, cut=cut(wet_pval, breaks=c(0,.05,1)))$cut
all.anova.toplot<-cbind(all.anova,sig.geno.fac,sig.trt.fac,sig.gxe.fac,dry_pval.fac,wet_pval.fac)
facstoplot<-list(c("sig.geno.fac","p.geno","Pvalues of genotype effect (typeIII)"),
c("sig.trt.fac","p.trt","Pvalues of environment effect (typeIII)"),
c("dry_pval.fac","dry_pval","Pvalues of genotype effect in dry (t.test)"),
c("wet_pval.fac","wet_pval","Pvalues of genotype effect in wet (t.test)"))
#threshold is at alpha=0.05 (via Q-values), colored in plots this way
for(i in facstoplot){
print(ggplot(all.anova.toplot,aes(y=get(i[2]),x=pos.bp,color=get(i[1])))+
geom_point()+theme_bw()+ggtitle(i[3])+
scale_y_continuous(trans=reverselog_trans(10))+ facet_grid(chromosome~.))}
##############################################
#2.1b Identify genes w/ significant G*E, output for later
head(all.anova)
gene.id marker.id chromosome pos.bp pos.cm f.geno f.trt f.gxe
1 AT1G01010 C1_103654 1 3631 0.186 3.6251 6.63512 7.83503
2 AT1G01020 C1_103654 1 5928 0.186 4.2390 1.87075 0.34134
3 AT1G01030 C1_103654 1 11649 0.186 2.1099 5.92443 1.01925
4 AT1G01040 C1_103654 1 23146 0.186 0.0727 1.53420 0.01366
5 AT1G01050 C1_103654 1 31170 0.186 0.3614 0.03060 0.03141
6 AT1G01060 C1_103654 1 33666 0.186 4.9961 0.06019 0.60216
p.geno p.trt p.gxe AA.dry BB.dry AA.wet BB.wet dry_estAA dry_estBB
1 0.05836 0.01073 0.005631 7.240 7.219 7.245 7.354 7.240 7.219
2 0.04081 0.17294 0.559722 7.198 7.139 7.217 7.184 7.198 7.139
3 0.14793 0.01582 0.313930 7.072 7.119 7.134 7.142 7.072 7.119
4 0.78772 0.21695 0.907088 7.624 7.619 7.606 7.604 7.624 7.619
5 0.54844 0.86131 0.859505 7.469 7.497 7.480 7.495 7.469 7.497
6 0.02652 0.80645 0.438682 7.438 7.551 7.407 7.641 7.438 7.551
dry_pval wet_estAA wet_estBB wet_pval sig.geno sig.trt sig.gxe dry_qval
1 0.49431 7.245 7.354 0.003923 0.2334 0.01946 0.6680 0.7342
2 0.04312 7.217 7.184 0.352691 0.1902 0.19130 0.9997 0.3092
3 0.11033 7.134 7.142 0.733678 0.3789 0.02728 0.9997 0.4637
4 0.79055 7.606 7.604 0.908446 0.7274 0.22526 0.9997 0.8065
5 0.55787 7.480 7.495 0.763103 0.6489 0.51776 0.9997 0.7545
6 0.27156 7.407 7.641 0.054034 0.1481 0.50217 0.9997 0.6282
wet_qval
1 0.08107
2 0.67949
3 0.79762
4 0.83166
5 0.80225
6 0.35175
#effect of g*e
ggplot(all.anova.toplot,aes(y=p.gxe,x=pos.bp,color=sig.gxe.fac))+
geom_point()+theme_bw()+ggtitle("Pvalues of GxE effect (typeIII)")+
scale_y_continuous(trans=reverselog_trans(10))+
facet_grid(chromosome~.)
gxe.genes<-all.anova[all.anova$sig.gxe<0.05,]
as.character(gxe.genes$gene.id)
[1] "AT1G05200" "AT1G23940" "AT1G27030" "AT1G48760" "AT1G56510"
[6] "AT1G62290" "AT1G62810" "AT1G63420" "AT1G67365" "AT1G72800"
[11] "AT1G73010" "AT1G76240" "AT2G15480" "AT2G16990" "AT2G23110"
[16] "AT2G35075" "AT3G47580" "AT4G01360" "AT4G15910" "AT4G19530"
[21] "AT4G21410" "AT4G22990" "AT5G09230" "AT5G09240" "AT5G12230"
[26] "AT5G25980" "AT5G41760" "AT5G42825" "AT5G44310"
##############################################
#2.2: Load other files needed for analysis
cmgenpos<-read.csv("gene_locations_new_McKay_map.csv",header=T, na.strings="NA") #physical position of all genes
geneexp<-read.csv("May_SNP_5_4_12.csv",header=T) #gene expression of all AT genes in drydown
idnames<-read.csv("dd2014_markernametoid.csv",header=T)
colnames(idnames)[2]<-"line"
geneexp<-merge(idnames,geneexp, by="line")
physpos.gene<-cmgenpos[,c(1,4:5)]
annotations<-read.table("Functional_annotation_list.txt",sep="\t",header=F)
colnames(annotations)<-c("gene","annotation")
##############################################
#2.3: Look at the distribution of QTL and effects to figure out where intervals should be
#look at the lod profiles and scanones if needed
#n <- names(model.out);lapply(model.out, function(x) plotLodProfile(x, main=n[substitute(x)[[3]]])
#par(mfrow=c(2,2));for (i in qtlphes) plot(scanone(cross, pheno.col=i,ch=5), main=i)
#make a dataframe with effects to help decide how to structure the regions
cross<-sim.geno(cross, n.draws=64, step=1, stepwidth="max",error.prob=0.01, map.function="kosambi")
effectscan.out<-data.frame()
for (i in qtlphes){
temp<-data.frame(effectscan(cross, pheno.col=i, draw=F),i)
temp$st.effect<-scale(temp$a,center=T,scale=T);effectscan.out<-rbind(effectscan.out,temp)
}
#general plots of qtl effects
names(effectscan.out)[3:4]<-c("effect","phenotype")
ggplot(effectscan.out, aes(x=pos, y=st.effect, color=phenotype))+ geom_line()+
theme_bw()+ggtitle("effect scans (of TSU allele) of all phenotypes")+ facet_wrap(~chr)
es.phys<-effectscan.out[effectscan.out$phenotype %in% c("C13.wet","C13.dry","ABAlyoph.dry","proline.dry","g0"),]
ggplot(es.phys, aes(x=pos, y=st.effect, color=phenotype))+geom_line()+theme_bw()+
facet_wrap(~chr)+ggtitle("effect scans (of TSU allele) of physiological phenotypes")
es.growth<-effectscan.out[effectscan.out$phenotype %in% c("GR.dry","RMR.dry","Rel.GR.wet","Rolled.dry","Wilted.dry"),]
ggplot(es.growth, aes(x=pos, y=st.effect, color=phenotype))+
geom_line()+theme_bw()+ facet_wrap(~chr)+ggtitle("effect scans (of TSU allele) of growth phenotypes")
##############################################
#2.4: Check intervals around four regions...
#check out the qtl effects in a region... here is a function
#region needs to be a vector of length 3 (chromosome, start pos, end pos)
#phes needs to be a vector of exactly 2 phenotypes
ploteff<-function(region,phes){
par(mfrow=c(3,1)); plot(scanone(cross, pheno.col=phes[1], chr=region[1]), main=paste("scanones on ch",region[1], "for phenotypes:", phes[1],phes[2]))
plot(scanone(cross, pheno.col=phes[2], chr=region[1]), add=T, lty=3, col="red")
rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
effectscan(cross, pheno.col=phes[1], draw=T, chr=region[1],main=paste("effect scan, ch=",region[1],"; phenotype=", phes[1]))
rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
effectscan(cross, pheno.col=phes[2], draw=T, chr=region[1], main=paste("effect scan, ch=",region[1],"; phenotype=", phes[2]))
rect(xleft=region[2], xright=region[3],ytop=100,ybottom=-100, col=rgb(0, 1, 0, alpha=0.2), border=F)
}
##############################################
#2.4.1 (reg2.1) --- aba-2.1 - Initially too wide- entire shoulder of peak
#look at wilted too:
reg2.1<-statsout[statsout$chromosome==2 & statsout$position <20,c(3,17,18)]
phes2.1<-c("ABAlyoph.dry","Wilted.dry")
ploteff(region=reg2.1,phes=phes2.1)
newint<-data.frame(bayesint(model.out["ABAlyoph.dry"][[1]], qtl.index=1, prob=.75)[1:3,1:3])
reg.test<-reg2.1; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes2.1)
par(mfrow=c(1,1)); plotLodProfile(model.out["ABAlyoph.dry"][[1]], main="Lod Profile of ABA, red-90% CI, purple-75% CI")
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg2.1[2], xright=reg2.1[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)
#the narrower region captures the peak much better (without the shoulders)
reg2.1<-reg.test
##############################################
#2.4.2 (reg2.2) ---proline-2.2
reg2.2<-statsout[statsout$chromosome==2 & statsout$position >60 & statsout$phenotype=="proline.dry",c(3,17,18)]
phes2.2<-c("proline.dry","Rolled.dry")
ploteff(region=reg2.2,phes=phes2.2)
newint<-data.frame(bayesint(model.out["proline.dry"][[1]], qtl.index=1, prob=.99)[1:3,1:3])
reg.test<-reg2.2; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes2.2)
par(mfrow=c(1,1)); plotLodProfile(model.out["proline.dry"][[1]], main="Lod Profile of proline, purple-90% CI, blue-99% CI",ch=2)
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg2.2[2], xright=reg2.2[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)
#the wider region captures the peak much better (without the shoulders)
reg2.2<-reg.test
##############################################
#2.4.3 (reg4.1) ---d13C-4.1
reg4.1<-statsout[statsout$chromosome==4 & statsout$position <10 & statsout$phenotype=="C13.wet",c(3,17,18),]
phes4.1<-c("C13.wet","proline.dry")
ploteff(region=reg4.1,phes=phes4.1)
##############################################
#2.4.4(reg5.2) ---d13C-5.2 -initially too wide (incorporates 2 peaks)
reg5.2<-statsout[statsout$chromosome==5 & statsout$position >60 & statsout$phenotype=="C13.wet",c(3,17,18),]
phes5.2<-c("C13.wet","C13.dry")
ploteff(region=reg5.2,phes=phes5.2)
newint<-data.frame(bayesint(model.out["C13.wet"][[1]], qtl.index=6, prob=.8)[1:3,1:3])
reg.test<-reg5.2; reg.test[2]<-newint[1,2];reg.test[3]<-newint[3,2]
ploteff(region=reg.test,phes=phes5.2)
par(mfrow=c(1,1)); plotLodProfile(model.out["C13.wet"][[1]],ch=5, main="Lod Profile of d13C.wet, red-90% CI, purple-80% CI")
rect(xleft=reg.test[2], xright=reg.test[3],ytop=100,ybottom=-100, col=rgb(0, 0, 1, alpha=0.2), border=F)
rect(xleft=reg5.2[2], xright=reg5.2[3],ytop=100,ybottom=-100, col=rgb(1, 0, 0, alpha=0.2), border=F)
#this looks much better- only single peak in coverages
reg5.2<-reg.test
##############################################
#2.5 list out and plot the regions and focal phenotypes
all.regions<-list(reg2.1,reg2.2,reg4.1,reg5.2)
all.focalphes<-c("ABAlyoph.dry","proline.dry","C13.wet","C13.wet")
all.region.names<-c("reg2.1","reg2.2","reg4.1","reg5.2")
#function to subset the data
findgenes<-function(region){
reg.in<-as.numeric(region)
culled.bycm<-all.anova[all.anova$chromosome==reg.in[1] & all.anova$pos.cm>reg.in[2] & all.anova$pos.cm<reg.in[3],]
temp<-all.anova[
all.anova$chromosome==reg.in[1] & all.anova$pos.cm>reg.in[2] & all.anova$pos.cm<reg.in[3] |
all.anova$chromosome==reg.in[1] & all.anova$pos.bp>min(culled.bycm$pos.bp) & all.anova$pos.bp<max(culled.bycm$pos.bp),]
return(as.character(temp$gene.id))
}
exp.stats<-all.anova
exp.stats$gene.cat<-as.character(exp.stats$gene.id)
for (i in 1:4){
temp<-findgenes(all.regions[[i]])
exp.stats$gene.cat[exp.stats$gene.cat %in% temp]<-all.region.names[i]
}
exp.stats$gene.cat[!exp.stats$gene.cat %in% all.region.names]<-"other"
ggplot(exp.stats, aes(y=pos.cm, x=pos.bp, color=gene.cat))+
scale_color_manual(values=c("black",rainbow(4)))+
geom_point()+theme_bw()+ggtitle("physical gene position to cM marker position")+
facet_wrap(~chromosome)
##############################################
#2.6: Extract genes w/ significant expression differences (from ANOVA)
#2.7: Make dataframe with those genes only for covariate analysis
exp.covars.list<-list()
sig.genes.out<-list()
for (i in 1:4){
temp<-exp.stats[exp.stats$gene.cat==all.region.names[i],]
#extract 10 most signifiant genes
siggenes<-temp[temp$sig.geno<as.numeric(quantile(temp$sig.geno,(length(temp[,1])-(length(temp[,1])-10))/length(temp[,1]))),]
print(all.region.names[i])
sig.genes<-print(as.character(siggenes$gene.id))
sig.genes.out[[i]]<-sig.genes
geneexp.culled<-geneexp[,c("treatment","line",sig.genes)]
geneexp1<-reshape(geneexp.culled, idvar="line", timevar="treatment",direction="wide")
names(idnames)[2]<-"line"
exp.covars<-merge(idnames,geneexp1, by="line", all=T)
exp.covars.list[[i]]<-exp.covars
}
[1] "reg2.1"
[1] "AT2G03140" "AT2G03820" "AT2G04170" "AT2G04378" "AT2G04380"
[6] "AT2G05810" "AT2G05915" "AT2G07215" "AT2G11140" "AT2G11240"
[1] "reg2.2"
[1] "AT2G38530" "AT2G39110" "AT2G40960" "AT2G41440" "AT2G42170"
[6] "AT2G42490" "AT2G43210" "AT2G43530" "AT2G43680" "AT2G43980"
[1] "reg4.1"
[1] "AT4G00355" "AT4G00430" "AT4G00440" "AT4G00525" "AT4G00620"
[6] "AT4G00650" "AT4G00720" "AT4G00755" "AT4G00900" "AT4G00975"
[1] "reg5.2"
[1] "AT5G53550" "AT5G53700" "AT5G53930" "AT5G53970" "AT5G54710"
[6] "AT5G54720" "AT5G54730" "AT5G54770" "AT5G55180" "AT5G55560"
#2.8 Run scanone for w/ and w/o covars
outall<-data.frame(chr=numeric(), pos=numeric(),lod=numeric(),
phe=character(),covar=character(),
gene=character(),expression_env=character())
for (i in all.focalphes){
temp<-cbind(as.data.frame(scanone(cross,pheno.col=i)),i,"nocovar","nocovar","nocovar")
colnames(temp)[4:7]<-c("phe","covar","gene","expression_env")
outall<-rbind(outall,temp)
}
for (j in 1:4){
for (i in colnames(exp.covars.list[j][[1]])[3:length(colnames(exp.covars.list[j][[1]]))]){
s1.out<-scanone(cross,pheno.col=all.focalphes[j],addcovar=exp.covars.list[[j]][,i])
s1.out<-cbind(as.data.frame(s1.out),as.character(all.focalphes[j]),as.character(i),
unlist(strsplit(i,"[.]"))[1],unlist(strsplit(i,"[.]"))[2])
colnames(s1.out)[4:7]<-c("phe","covar","gene","expression_env")
outall<-rbind(outall,s1.out)
}
}
##############################################
#2.9: Plot covariate scans
#for all phenotypes, just the focal ch
#inputs-
# toplot: the full scanout-out object,
#region: the region and phenotype: the phenotype
#maxcm- maximum cm to plot (mincm same way)
plotcovscan<-function(toplot, region, phenotype, maxcm, mincm, genes)
{
region.in<-as.numeric(region)
toplot.window<-toplot[toplot$chr==region.in[1] &
toplot$pos<maxcm & toplot$pos>mincm &
toplot$gene %in% genes|
toplot$chr==region.in[1] &
toplot$pos<maxcm & toplot$pos>mincm &
toplot$gene =="nocovar" &
toplot$phe==phenotype,]
palette1<-rainbow(length(genes))
line.cols <- c("lightgrey",rep(palette1,2))
line.width<-c(2,rep(.5,length(genes)*2))
line.shapes<-c(rep(1,length(genes)+1),rep(3,length(genes)))
ggplot(toplot.window,aes(x=pos,y=lod))+
geom_line(aes(col=covar,size=covar,linetype=covar))+
scale_color_manual(values=line.cols)+
scale_linetype_manual(values=line.shapes)+
scale_size_manual(values=line.width)+
geom_vline(xintercept=c(region.in[2:3]), lty=4)+
ggtitle(paste("Scanones for",phenotype,"(QTL region-- CH:",region.in[1],
", POS:",round(region.in[2],2), "-",round(region.in[3],2),")", "\n",
"LODs w/ and w/o gene expression covariates are plotted", sep=""))+
theme_bw()
}
#plot all regions
max.cm<-c(45,100,45,100)
min.cm<-c(0,45,0,45)
for (i in 1:4){
print(
plotcovscan(outall,region=all.regions[[i]],phenotype=all.focalphes[i],
genes=sig.genes.out[[i]],maxcm=max.cm[i], mincm=min.cm[i])
)
}
##############################################
#2.10: Summarize covariate analysis
for (i in 1:4) cat(all.focalphes[i],"-",all.region.names[i],": ", sig.genes.out[[i]], sep=" ", "\n")
ABAlyoph.dry - reg2.1 : AT2G03140 AT2G03820 AT2G04170 AT2G04378 AT2G04380 AT2G05810 AT2G05915 AT2G07215 AT2G11140 AT2G11240
proline.dry - reg2.2 : AT2G38530 AT2G39110 AT2G40960 AT2G41440 AT2G42170 AT2G42490 AT2G43210 AT2G43530 AT2G43680 AT2G43980
C13.wet - reg4.1 : AT4G00355 AT4G00430 AT4G00440 AT4G00525 AT4G00620 AT4G00650 AT4G00720 AT4G00755 AT4G00900 AT4G00975
C13.wet - reg5.2 : AT5G53550 AT5G53700 AT5G53930 AT5G53970 AT5G54710 AT5G54720 AT5G54730 AT5G54770 AT5G55180 AT5G55560
# ##############################################
# #2.9: Extract go terms from significant genes
# library(biomaRt)
# mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# results <- getBM(attributes = c("refseq_dna", "go_molecular_function_id",
# "go_molecular_function__dm_name_1006"), filters = "refseq_dna",
# values = c("NM_030621"), mart = mart)
# ##############################################
# #2.10: Summarize candidate genes
#
# ##############################################
# ##############################################
# #Part 3: Effect of cytoplasm
# ##############################################
# ###############################################
#
# ##############################################
# #3.1: T.tests and varcomp analysis of cytoplasm
#
# ##############################################
# #3.2: Plots and summary of cytoplasm stats
# cytocovar<-read.csv("dd2014_cytocovar.csv", header=T)
# covs<-read.csv("dd2014_cytocovar.csv", header=T)
# addcov<-NULL; intcov<-NULL #model w/out covariates
# cyto<-as.data.frame(covs$cyto.num)
# par(mfrow=c(2,1))
# for (i in qtlphes){
# nocovar<-scanone(grid,pheno.col=i, method="hk")
# addcov<-scanone(grid,pheno.col=i, method="hk",addcovar=cyto)
# intcov<-scanone(grid,pheno.col=i, method="hk",addcovar=cyto, intcovar=cyto)
# ylim<-c(0,max(max(nocovar$lod),max(addcov$lod),max(intcov$lod)))
# ylim2<-c(min(min(intcov$lod-addcov$lod),min(intcov$lod-nocovar$lod),min(addcov$lod-nocovar$lod)),
# max(max(intcov$lod-addcov$lod),max(intcov$lod-nocovar$lod),max(addcov$lod-nocovar$lod)))
# plot(nocovar,main=i,ylim=ylim);plot(addcov, add=T, col="red",lty=3);plot(intcov, add=T, col="blue",lty=2)
# plot(intcov-nocovar,main=paste(i, ylim=ylim2); abline(h=2.5,lty=3)
# plot(intcov-addcov,add=T, col="red",lty=2);plot(addcov-nocovar,add=T, col="blue",lty=3)
# }
#
#
# ##############################################
# #3.3: Bioinformatics of DNA sequence polymorphism in cytoplasm
#
# ##############################################
# #3.4: Determination of candidate genes in cytoplasm
#
# ##############################################
# #3.5: Cytoplasm main effect in QTLs (oneways)
#
# ##############################################
# #3.6: Cytoplasm main effect in multi-QTL models
#
# ##############################################
# #3.7: Cytoplasm interactive effects in QTLs (oneway)- compare w/ 3.3
#
# ##############################################
# #3.8: Interaction plots of QTL*cyto and cyto main effect at proline
#
# ##############################################
# #3.8: Bioinformatics of potential QTL-by-cytoplasm interactions
#
# ##############################################
# ##############################################
# #Part 4: Epistasis
# ##############################################
# ##############################################
#
# ##############################################
# #4.1: Extract epistasis components from stepwiseqtl
#
# ##############################################
# #4.2: Interaction plots of epistasis
#
# ##############################################
# #4.3: Scantwo of epistasis
#
# ##############################################
# #4.4: fitqtl of epistasis
#
# ##############################################
# ##############################################
# #Part 5: QTL*environment interactions
# ##############################################
# ##############################################
#
# ##############################################
# #5.1: work on later...