# Differential expression analysis of Hal Fil F1 population
# Script #2
# Author- JT Lovell
# Data - 25-March 2015
# Cis -trans test post-processing, plotting and generation of figures for MS
# Table of contents:
#1. Get data in shape, QC
#2.
#3.
#4.
setwd("~/Library/Mobile Documents/com~apple~CloudDocs/ph2015_eqtl")
library(qvalue)
# Part 1: Set up data
# 1.1: Read in the data
ct.stats<-read.csv("ct2015_lmstats.csv")
# resid<-read.csv("ct2015_allexpresiduals.csv")
# norm.counts<-read.csv("ct2015_deseqf012_normcounts.csv")
# time.stats<-read.csv("ct2015_timetrtstats.csv")
# 1.2: Generate adjusted p-values
for(i in colnames(ct.stats)[which(grepl("p_", colnames(ct.stats)))]){
dat<-ct.stats[,i]
qs<-data.frame(qvalue(as.numeric(dat))$qvalue)
colnames(qs)<-gsub("p_","q_",i)
ct.stats<-cbind(ct.stats,qs)
}
# 1.3: Generate q-value ranks of genes for plotting
for(i in colnames(ct.stats)[which(grepl("p_", colnames(ct.stats)))]){
dat<-ct.stats[,i]
rs<-data.frame(rank(dat))
colnames(rs)<-gsub("p_","rank_",i)
ct.stats<-cbind(ct.stats,rs)
}
#1.4: Generate "best" trans column
ct.stats$p_best.trans<-with(ct.stats,ifelse(p_trans.hal<p_trans.fil, p_trans.hal,p_trans.fil))
ct.stats$q_best.trans<-with(ct.stats,ifelse(p_trans.hal<p_trans.fil, q_trans.hal,q_trans.fil))
ct.stats$rank_best.trans<-rank(ct.stats$p_best.trans)
ct.stats$p_best.tt<-with(ct.stats,ifelse(p_th.trt<p_tf.trt, p_th.trt,p_tf.trt))
ct.stats$q_best.tt<-with(ct.stats,ifelse(p_th.trt<p_tf.trt, q_th.trt,q_tf.trt))
ct.stats$rank_best.tt<-rank(ct.stats$p_best.tt)
# 1.5: basic QC... make sure p-value/qvalue distributions look ok
par(mfrow=c(2,1))
for(i in colnames(ct.stats)[grep("p_", colnames(ct.stats))]){
j<-gsub("p_","q_",i)
hist(ct.stats[,i],main=i,xlim=c(0,1),breaks=((max(ct.stats[,i])-min(ct.stats[,i]))*1000))
hist(ct.stats[,j],main=j,xlim=c(0,1),breaks=((max(ct.stats[,j])-min(ct.stats[,j]))*1000))
cat(gsub("p_","",i),": n. p<0.05 = ", length(ct.stats[,i][ct.stats[,i]<0.05]),
" .. n. q<0.05 = ", length(ct.stats[,j][ct.stats[,j]<0.05]),
" .. n. q<0.1 = ", length(ct.stats[,j][ct.stats[,j]<0.1]),
"\n", sep="")
}

## intercept: n. p<0.05 = 7253 .. n. q<0.05 = 8791 .. n. q<0.1 = 10993

## cis: n. p<0.05 = 5753 .. n. q<0.05 = 5215 .. n. q<0.1 = 6385

## trans.fil: n. p<0.05 = 1412 .. n. q<0.05 = 96 .. n. q<0.1 = 186

## trans.hal: n. p<0.05 = 1199 .. n. q<0.05 = 45 .. n. q<0.1 = 88

## trt: n. p<0.05 = 831 .. n. q<0.05 = 4 .. n. q<0.1 = 15

## cis.trt: n. p<0.05 = 1124 .. n. q<0.05 = 164 .. n. q<0.1 = 260

## tf.trt: n. p<0.05 = 705 .. n. q<0.05 = 0 .. n. q<0.1 = 1

## th.trt: n. p<0.05 = 535 .. n. q<0.05 = 0 .. n. q<0.1 = 0

## best.trans: n. p<0.05 = 2416 .. n. q<0.05 = 137 .. n. q<0.1 = 266

## best.tt: n. p<0.05 = 1198 .. n. q<0.05 = 0 .. n. q<0.1 = 1
# length(ct.stats$q_th.trt[ct.stats$q_th.trt<0.05])
# length(ct.stats$p_best.trans[ct.stats$q_best.trans<0.05])
#
#
# with(ct.stats, plot(rank_cis,p_cis, pch="."))
#
# # Part 2: Summary plots:
# # 2.1: Histograms and volcano plots
# # 2.2: Venn diagrams
# # 2.3: Ternary plots
#
# # Part 3: Clustering
# # 3.1: All genes - categorize by significance
# # 3.2: Cis * treatment directional effects
# # 3.1:
#
# # Part 2:
# # Part 2:
#
# lmstats<-read.csv("ct2015_lmstats.csv")
# pdf("ct2015_statsfromresidmodel.pdf")
# par(mfrow=c(2,1))
# for( i in 1:8){
# dat<-stats2.out[,c(i,i+8,i+16)]
# hist(dat[,3], main=sn[i], xlab="pvalues", breaks=100)
# plot(dat[,1], -log10(dat[,3]), ylab="-log10 Pvalue", xlab="effect estimate", main=sn[i])
# }
# dev.off()
# r2<-summary(lm.out)[[8]]
# stats<-data.frame(ptime,ptt,r2)
# stats.out<-rbind(stats.out,stats)
# print(length(stats.out[,1]))
# }
#
# }
#
# stats2.out[stats2.out$p_th.trt<0.00001,]
# stats2.out[stats2.out$p_tf.trt<0.00001,]
# dds$Treatment <- relevel(dds$Treatment, "Dry")
# dds$generation <- relevel(dds$generation, "F0")
# dds$allele <- relevel(dds$allele, "fil")
#
# dge <- DGEList(counts=counts.t)
# dge <- calcNormFactors(dge)
# v <- voom(dge,plot=TRUE)
# i="Pahalv11b035147m.g"
# dat<-all.raw[,c("Treatment","generation","allele","cis","trans.fil","trans.hal",
# "time.sampled","VWC_July",i)]
# dat$Pahalv11b035147m.g[which(dat$Pahalv11b035147m.g<100)]<-NA
# dat$qn<-quantnorm(dat[,i])
#
# ggplot(dat, aes(x=time.sampled, y=qn,color=Treatment, shape=allele))+
# geom_point()+theme_classic()+
# geom_smooth(aes(group=Treatment), method="lm", fullrange=TRUE)
# colnames(dat)[9]<-"counts.knmm"
# dat$counts.knmm<-as.numeric(as.character(dat$counts.knmm))
# dat$qn<-quantnorm(dat$counts.knmm)
# nb1<-glm.nb(counts.knmm ~ cis + trans.fil + trans.hal +
# Treatment + time.sampled + Treatment*time.sampled +
# cis*Treatment + trans.fil*Treatment+ trans.hal*Treatment,link="log",
# data=dat)
#
# library(MASS)
# library(ggplot2)
# lm.out<-data.frame()
# for(i in all.genes){
# dat<-kd[,c("Treatment","generation","Allele","Cis","Trans_fil","Trans_hal",
# "time.sampled","MD_LWP_July",i)]
# colnames(dat)[9]<-"counts.knmm"
# dat$counts.knmm<-as.numeric(as.character(dat$counts.knmm))
# dat$qn<-quantnorm(dat$counts.knmm)
# nb1<-glm.nb(counts.knmm~generation+Allele+Treatment+
# #MD_LWP_July+
# time.sampled+
# #generation*MD_LWP_July+Allele*MD_LWP_July+
# #generation*time.sampled+Allele*time.sampled+
# generation*Treatment+Allele*Treatment,link="log",
# data=dat)
#
# hcc<-data.frame(t(hal.counts.cull))
# fcc<-data.frame(t(fil.counts.cull))
#
#
#
# all.raw2<-all.raw[,grep("Pahalv", colnames(all.raw))]
# raw.to.jmpgenomics<-data.frame(t(all.raw2))
# raw.to.jmpgenomics[1:5,1:5]
# write.csv(raw.to.jmpgenomics, file="ct2015_rawcounts_tojmpg.csv", row.names=T)
# write.csv(design, file="ct2015_design_tojmpg.csv", row.names=F)
#
# #look at some particular genes
# colnames(all.counts)
#
# xu<-data.frame(t(all.counts[c("Pahalv11b016018m", "Pahalv11b035147m","Pahalv11b011369m", "Pahalv11b055185m"),]))
# xu$jgi.id<-rownames(xu)
# # xu<-merge(info,xu,by="jgi.id")
# test<-melt(xu, id.vars="jgi.id")
# colnames(test)[2:3]<-c("gene","counts")
# xu<-merge(info,test,by="jgi.id")
# xu<-xu[-grep("FH",xu$id),]
# xu$id<-as.character(xu$id)
#
# time.sampled <- as.character(xu$Tissue_Time_July)
# time.sampled<-sapply(strsplit(time.sampled,":"),
# function(x) {
# x <- as.numeric(x)
# (x[1]-10)*60+(x[2]-58)
# }
# )
#
# xu<-xu[,c("id","Treatment","counts","id","gene")]
# xu$time<-time.sampled
# pdf("xiaoyu.plots.pdf")
# ggplot(xu, aes(x=id.1,y=counts,color=Treatment))+
# geom_boxplot(position="dodge")+
# facet_wrap(~gene, scales="free")+
# theme_bw()
# ggplot(xu, aes(x=time,y=counts,color=Treatment, shape=id.1))+
# geom_point()+
# facet_wrap(~gene, scales="free")+
# theme_bw()
# dev.off()
#