Lovell — Feb 11, 2015, 4:12 PM
#
# Script to make map from allele-specific counts for genes
# End goal - make file to import into JoinMap
# Author- JT Lovell
# Data - 10-Feb 2015
####################
####################
#Part 1:Get data in
rm(list=ls())
setwd("~/Library/Mobile Documents/com~apple~CloudDocs/ph2015_eqtl")
pkg <- c("RCurl","plyr","ggplot2","mclust","qtl")
invisible(lapply(pkg, function(x) {cat(x,"..."); library(x, character.only=T, verbose=F, warn.conflicts=F,quietly=T)} ))
RCurl ...
Warning: package 'RCurl' was built under R version 3.1.2
plyr ...ggplot2 ...mclust ...
Warning: package 'mclust' was built under R version 3.1.2
Package 'mclust' version 4.4
Type 'citation("mclust")' for citing this R package in publications.
qtl ...
Warning: package 'qtl' was built under R version 3.1.2
sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] qtl_1.35-3 mclust_4.4 ggplot2_1.0.0 plyr_1.8.1
[5] RCurl_1.95-4.5 bitops_1.0-6 knitr_1.9
loaded via a namespace (and not attached):
[1] colorspace_1.2-4 digest_0.6.8 evaluate_0.5.5 formatR_1.0
[5] grid_3.1.0 gtable_0.1.2 MASS_7.3-37 munsell_0.4.2
[9] parallel_3.1.0 proto_0.3-10 Rcpp_0.11.4 reshape2_1.4.1
[13] scales_0.2.4 stringr_0.6.2 tools_3.1.0
options(warn=-1)
# import necessary functions from github
function.names<-c("genobyMclust.R","multiplot.R")
us<-paste("https://raw.githubusercontent.com/jtlovell/eqtlanalysis/master/",function.names, sep="")
for(i in 1:length(function.names)){
script <- getURL(us[i], ssl.verifypeer = FALSE)
eval(parse(text = script))
}
counts.fil<-read.delim("Phallii-FIL.counts")
counts.hal<-read.delim("Phallii-HAL.counts")
#part 1 figure out the genes to work with
hf1.fil<-counts.fil[,c(grep("HAL",colnames(counts.fil)),grep("FIL",colnames(counts.fil)),grep("F1",colnames(counts.fil)))]
hf1.hal<-counts.hal[,c(grep("HAL",colnames(counts.hal)),grep("FIL",colnames(counts.hal)),grep("F1",colnames(counts.hal)))]
#look at the data-
hf1.fil[1:10,1:5]
HAL2_G997_338 HAL2_H101_237 HAL2_H126_821 HAL2_G950_542
Pahalv11b000001m.g 0 0 0 0
Pahalv11b000002m.g 0 0 0 0
Pahalv11b000003m.g 0 0 0 0
Pahalv11b000006m.g 0 0 0 0
Pahalv11b000007m.g 0 0 0 0
Pahalv11b000009m.g 0 0 0 0
Pahalv11b000011m.g 0 0 0 0
Pahalv11b000012m.g 0 0 0 0
Pahalv11b000013m.g 0 0 0 0
Pahalv11b000014m.g 0 0 0 0
HAL2_G988_482
Pahalv11b000001m.g 0
Pahalv11b000002m.g 0
Pahalv11b000003m.g 0
Pahalv11b000006m.g 0
Pahalv11b000007m.g 0
Pahalv11b000009m.g 0
Pahalv11b000011m.g 0
Pahalv11b000012m.g 0
Pahalv11b000013m.g 0
Pahalv11b000014m.g 0
hf1.hal[1:10,1:5]
HAL2_G997_338 HAL2_H101_237 HAL2_H126_821 HAL2_G950_542
Pahalv11b000001m.g 0 0 0 0
Pahalv11b000002m.g 0 0 0 0
Pahalv11b000003m.g 0 0 0 0
Pahalv11b000006m.g 0 0 0 0
Pahalv11b000007m.g 0 0 0 0
Pahalv11b000009m.g 135 195 182 149
Pahalv11b000011m.g 52 31 48 57
Pahalv11b000012m.g 162 136 180 180
Pahalv11b000013m.g 0 1 3 0
Pahalv11b000014m.g 113 205 220 123
HAL2_G988_482
Pahalv11b000001m.g 0
Pahalv11b000002m.g 0
Pahalv11b000003m.g 0
Pahalv11b000006m.g 0
Pahalv11b000007m.g 0
Pahalv11b000009m.g 266
Pahalv11b000011m.g 50
Pahalv11b000012m.g 208
Pahalv11b000013m.g 0
Pahalv11b000014m.g 269
####################
####################
#Part 2: find genes that are appropriate for ASE - high hal in hal, low hal in fil etc.
hal.counts.hal<-counts.hal[,grep("HAL",colnames(counts.hal))]
fil.counts.fil<-counts.fil[,grep("FIL",colnames(counts.fil))]
hal.counts.fil<-counts.hal[,grep("FIL",colnames(counts.hal))]
fil.counts.hal<-counts.fil[,grep("HAL",colnames(counts.fil))]
med.hal.hal<-as.numeric(apply(hal.counts.hal,1,median))
med.fil.fil<-as.numeric(apply(fil.counts.fil,1,median))
med.hal.fil<-as.numeric(apply(hal.counts.fil,1,median))
med.fil.hal<-as.numeric(apply(fil.counts.hal,1,median))
genes<-as.character(rownames(hal.counts.hal))
genes.parents<-genes[med.hal.hal>50 &
med.fil.fil>50 &
med.hal.fil<1 &
med.fil.hal<1]
length(genes.parents)
[1] 8479
#less stringent cuttoffs - 40% increase in markers
genes.parents.lax<-genes[med.hal.hal>10 &
med.fil.fil>10 &
med.hal.fil<2 &
med.fil.hal<2]
length(genes.parents.lax)
[1] 13712
#for remaining analyses, use 1/50 cutoffs - these will most likely give the strongest clustering for later
####################
####################
#Part 3: of the genes that look good in parents, which have both alleles in F1s?
fil.counts.f1<-counts.fil[,grep("F1",colnames(counts.fil))]
hal.counts.f1<-counts.hal[,grep("F1",colnames(counts.hal))]
med.fil.F1<-as.numeric(apply(fil.counts.f1,1,median))
med.hal.F1<-as.numeric(apply(hal.counts.f1,1,median))
genes<-as.character(rownames(fil.counts.f1))
genes.f1<-genes[med.fil.F1>10 &
med.hal.F1>10]
length(genes.f1)
[1] 13802
genes.parents.f1<-intersect(genes.parents,genes.f1)
length(genes.parents.f1) # number of genes that fit both parent and F1 criteria
[1] 8477
#we only lose 2 genes because of strong ASE in F1
####################
####################
#Part 4: parse the file to extract F2s
fil.counts.f2<-counts.fil[genes.parents.f1,grep("FH",colnames(counts.fil))]
fil.counts.f2$gene<-rownames(fil.counts.f2)
hal.counts.f2<-counts.hal[genes.parents.f1,grep("FH",colnames(counts.hal))]
hal.counts.f2$gene<-rownames(hal.counts.f2)
hal.counts.all<-data.frame(t(counts.hal[genes.parents.f1,]))
fil.counts.all<-data.frame(t(counts.fil[genes.parents.f1,]))
hal.counts.all$id<-rownames(hal.counts.all)
hal.counts.all$type<-substr(hal.counts.all$id,1,2)
fil.counts.all$id<-rownames(fil.counts.all)
fil.counts.all$type<-substr(fil.counts.all$id,1,2)
####################
####################
#Part 5 - call genotypes for all lines
#demostration of the function with plotting
for(i in 1:4){
test<-genobyMclust(gene.name=genes.parents.f1[i],
ids=c("id","type"),
fil.file=fil.counts.all,
hal.file=hal.counts.all,
n.clust=3,
p.tol=1e-10,
logtrans=TRUE,plot=T)
}
Pahalv11b000009m.g : 1 samples miscategorized for F1
Loading required package: grid
NULL
Pahalv11b000011m.g : 1 samples miscategorized for F1
NULL
Pahalv11b000012m.g : 1 samples miscategorized for F1
NULL
Pahalv11b000014m.g : 1 samples miscategorized for F1
NULL
test$group<-sapply(test[,1],function(x) strsplit(x,"_")[[1]][1])
#there may be a problem with F1_H294_525
#not run --- takes a while
# fil.counts.edit<-fil.counts.all[-which(rownames(fil.counts.all)=="F1_H294_525"),]
# hal.counts.edit<-hal.counts.all[-which(rownames(hal.counts.all)=="F1_H294_525"),]
# line.info<-hal.counts.edit[,c("id","type")]
# for(i in 1:length(genes.parents.f1)){
# geno<-genobyMclust(gene.name=genes.parents.f1[i],
# ids=c("id","type"),
# fil.file=fil.counts.edit,
# hal.file=hal.counts.edit,
# n.clust=3,
# p.tol=1e-10,
# logtrans=TRUE,plot=F)
# line.info<-merge(line.info,geno, by="id")
# cat(i," .")
# }
# write.csv(line.info,file="ph2015_rawgenocalls2.csv", row.names=F)
####################
####################
#Part 6 merge genotype calls with annotation
#from genobyMclust, we get a df with one column for each gene
#make files in R/qtl format
line.info<-read.csv("ph2015_rawgenocalls2.csv")
annot<-read.delim("Phalliiv1.1b.gene.gff3")
annot<-read.csv("ph2015_v1.1annot.edited.csv")
annot.cull<-annot[annot$id %in% genes.parents.f1,c("chr","start","id")]
annot.cull$chr<-tolower(annot.cull$chr)
annot.cull$chr <- gsub("_", "", annot.cull$chr)
annot.cull$chr <- gsub("super", "chr", annot.cull$chr)
colnames(annot.cull)[3]<-"gene"
line.info.cull<-line.info
line.info.cull<-line.info.cull[line.info.cull$type=="FH",]
rownames(line.info.cull)<-line.info.cull$id
####################
####################
#Part 7: count the number of genotypes, nas and likelihood of segregation distortion for each gene
counts<-apply(line.info.cull[,-c(1:2)],2,function(x){
nas<-length(x[is.na(x)])
fils<-length(x[x=="fil"])
hets<-length(x[x=="het"])
hals<-length(x[x=="hal"])
counts<-sum(nas,fils,hets,hals)
chi2p<-chisq.test(c(fils,hets,hals), p=c(.25,.5,.25))$p.value
kbstat<-sum((.25*log(.25/(fils/counts))),
(.5*log(.5/(hets/counts))),
(.25*log(.25/(hals/counts))))
prop.na<-nas/sum(fils,hets,hals)
out<-data.frame(fils,hets,hals,nas,chi2p,prop.na,kbstat)
return(out)
})
counts.df <- ldply(counts, data.frame)
colnames(counts.df)[1]<-"gene"
hist(counts.df$prop.na)
counts.95<-counts.df[counts.df$prop.na<=0.05,]
hist(counts.95$chi2p,breaks=1000, xlim=c(0,.1))
plot(log(counts.df$chi2p),counts.df$prop.na, main="n.NAs by seg. dist. for each gene")
####################
####################
#Part 8: count the number of genotypes, nas and likelihood of segregation distortion for each sample
counts.ind<-apply(line.info.cull[,-c(1:2)],1,function(x){
nas<-length(x[is.na(x)])
fils<-length(x[x=="fil"])
hets<-length(x[x=="het"])
hals<-length(x[x=="hal"])
counts<-sum(nas,fils,hets,hals)
chi2p<-chisq.test(c(fils,hets,hals), p=c(.25,.5,.25))$p.value
kbstat<-sum((.25*log(.25/(fils/counts))),
(.5*log(.5/(hets/counts))),
(.25*log(.25/(hals/counts))))
prop.na<-nas/sum(fils,hets,hals)
out<-data.frame(fils,hets,hals,nas,chi2p,prop.na,kbstat)
return(out)
})
counts.ind.df <- ldply(counts.ind, data.frame)
colnames(counts.ind.df)[1]<-"ind"
counts.ind.df[,1]<-line.info.cull[,1]
hist(counts.ind.df$prop.na)
hist(log(counts.ind.df$chi2p))
pairs(counts.ind.df[,c("fils","hets","hals")])
counts.ind.df$tot.counts<-counts.ind.df$fils + counts.ind.df$hets + counts.ind.df$hals
counts.ind.df$prop.het<-counts.ind.df$hets/counts.ind.df$tot.counts
counts.ind.df$prop.hal<-counts.ind.df$hals/counts.ind.df$tot.counts
counts.ind.df$prop.fil<-counts.ind.df$fils/counts.ind.df$tot.counts
pairs(counts.ind.df[,c("prop.het","prop.hal","prop.fil")], ylim=c(0,1), xlim=c(0,1))
mean(counts.ind.df$prop.het);mean(counts.ind.df$prop.hal);mean(counts.ind.df$prop.fil)
[1] 0.3450659
[1] 0.3610172
[1] 0.2939169
#cull by bad lines and genes - there are definitely a few contaminants here
#(probably 3 fils and 1 hal called f2s)
###############
###############
#part 9 - prepare data for joinmap
badinds<-apply(counts.ind.df[,c("prop.het","prop.hal","prop.fil")], 1, function (x) any(x<0.1))
good.lines<-counts.ind.df$ind[!badinds]
good.genes<-counts.95$gene
final.culled.genmatrix<-line.info.cull[good.lines,good.genes]
final.culled.genmatrix<-data.frame(t(final.culled.genmatrix))
final.culled.genmatrix<-as.matrix(final.culled.genmatrix)
final.culled.genmatrix[final.culled.genmatrix=="fil"]<-"a"
final.culled.genmatrix[final.culled.genmatrix=="het"]<-"h"
final.culled.genmatrix[final.culled.genmatrix=="hal"]<-"b"
write.csv(final.culled.genmatrix, file="ph2015_final.culled.genmatrix.csv", row.names=T)
####################
####################
# #Part 10 - make the R/qtl map files
# counts.95.nosd<-counts.95[counts.95$chi2p>1e-10,]
# genotypes.final<-line.info.cull[,c("id",counts.95.nosd$gene)]
#
# out.all<-data.frame(c(NA,NA,genotypes.final$id))
# colnames(out.all)<-"id"
# chr.nums<-substring(unique(annot.cull$chr),4,8)
# for(i in chr.nums){
# chr.name<-paste("chr",i,sep="")
#
# genes.chr<-as.character(annot.cull$gene[annot.cull$chr==chr.name])[
# as.character(annot.cull$gene[annot.cull$chr==chr.name]) %in% colnames(genotypes.final)]
# if(length(genes.chr)==0){
# next
# }else{
# chr.all<-genotypes.final[,c("id",genes.chr)]
# annot.chr<-annot.cull[annot.cull$gene %in% genes.chr,]
#
# chr.unique<-chr.all[,colnames(unique(as.matrix(chr.all),MARGIN=2))] #check for duplicates
# chr.unique[chr.unique=="het"]<-"H"
# chr.unique[chr.unique=="fil"]<-"A"
# chr.unique[chr.unique=="hal"]<-"B"
# chr.all<-rbind(c("id",rep(i,length(annot.chr$start)))
# ,c("id",annot.chr$start*100/max(annot.chr$start)),
# chr.unique) #bind the information with the data
# out.all<-cbind(out.all, chr.all)
# }
# }
#
# genfile.initial<-out.all[,-grep("id",colnames(out.all))[-1]]
# genfile.initial$id<-as.numeric(substring(genfile.initial$id,3,5))
# genfile.initial<-genfile.initial[,-2]
# genfile.initial[1:2,"id"]<-""
# genfile.initial2<- genfile.initial[!(duplicated(genfile.initial$id, incomparables="")),]
# write.csv(genfile.initial2,file="ph2015_newgen_initial.csv", row.names=F, quote=F)
#
# phefile.initial<-genfile.initial2[-c(1:2),1:2]
# phefile.initial$phe1<-rnorm(length(phefile.initial[,1]))
# write.csv(phefile.initial,file="ph2015_newphe.csv", row.names=F, quote=F)
#
# # next script - making the map in R/qtl ("ph2015_rqtlmap_fromASE.R)
#