ph2015_snpsfromase_tojoinmap.R

Lovell — Feb 11, 2015, 4:12 PM

#
# Script to make map from allele-specific counts for genes
# End goal - make file to import into JoinMap
# Author- JT Lovell
# Data - 10-Feb 2015

####################
####################
#Part 1:Get data in
rm(list=ls())
setwd("~/Library/Mobile Documents/com~apple~CloudDocs/ph2015_eqtl")
pkg <- c("RCurl","plyr","ggplot2","mclust","qtl")
invisible(lapply(pkg, function(x) {cat(x,"..."); library(x, character.only=T, verbose=F, warn.conflicts=F,quietly=T)} ))
RCurl ...
Warning: package 'RCurl' was built under R version 3.1.2
plyr ...ggplot2 ...mclust ...
Warning: package 'mclust' was built under R version 3.1.2
Package 'mclust' version 4.4
Type 'citation("mclust")' for citing this R package in publications.
qtl ...
Warning: package 'qtl' was built under R version 3.1.2
sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-apple-darwin13.1.0 (64-bit)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] qtl_1.35-3     mclust_4.4     ggplot2_1.0.0  plyr_1.8.1    
[5] RCurl_1.95-4.5 bitops_1.0-6   knitr_1.9     

loaded via a namespace (and not attached):
 [1] colorspace_1.2-4 digest_0.6.8     evaluate_0.5.5   formatR_1.0     
 [5] grid_3.1.0       gtable_0.1.2     MASS_7.3-37      munsell_0.4.2   
 [9] parallel_3.1.0   proto_0.3-10     Rcpp_0.11.4      reshape2_1.4.1  
[13] scales_0.2.4     stringr_0.6.2    tools_3.1.0     
options(warn=-1)
# import necessary functions from github
function.names<-c("genobyMclust.R","multiplot.R")
us<-paste("https://raw.githubusercontent.com/jtlovell/eqtlanalysis/master/",function.names, sep="")
for(i in 1:length(function.names)){
  script <- getURL(us[i], ssl.verifypeer = FALSE)
  eval(parse(text = script))
}

counts.fil<-read.delim("Phallii-FIL.counts")
counts.hal<-read.delim("Phallii-HAL.counts")
#part 1 figure out the genes to work with
hf1.fil<-counts.fil[,c(grep("HAL",colnames(counts.fil)),grep("FIL",colnames(counts.fil)),grep("F1",colnames(counts.fil)))]
hf1.hal<-counts.hal[,c(grep("HAL",colnames(counts.hal)),grep("FIL",colnames(counts.hal)),grep("F1",colnames(counts.hal)))]

#look at the data-
hf1.fil[1:10,1:5]
                   HAL2_G997_338 HAL2_H101_237 HAL2_H126_821 HAL2_G950_542
Pahalv11b000001m.g             0             0             0             0
Pahalv11b000002m.g             0             0             0             0
Pahalv11b000003m.g             0             0             0             0
Pahalv11b000006m.g             0             0             0             0
Pahalv11b000007m.g             0             0             0             0
Pahalv11b000009m.g             0             0             0             0
Pahalv11b000011m.g             0             0             0             0
Pahalv11b000012m.g             0             0             0             0
Pahalv11b000013m.g             0             0             0             0
Pahalv11b000014m.g             0             0             0             0
                   HAL2_G988_482
Pahalv11b000001m.g             0
Pahalv11b000002m.g             0
Pahalv11b000003m.g             0
Pahalv11b000006m.g             0
Pahalv11b000007m.g             0
Pahalv11b000009m.g             0
Pahalv11b000011m.g             0
Pahalv11b000012m.g             0
Pahalv11b000013m.g             0
Pahalv11b000014m.g             0
hf1.hal[1:10,1:5]
                   HAL2_G997_338 HAL2_H101_237 HAL2_H126_821 HAL2_G950_542
Pahalv11b000001m.g             0             0             0             0
Pahalv11b000002m.g             0             0             0             0
Pahalv11b000003m.g             0             0             0             0
Pahalv11b000006m.g             0             0             0             0
Pahalv11b000007m.g             0             0             0             0
Pahalv11b000009m.g           135           195           182           149
Pahalv11b000011m.g            52            31            48            57
Pahalv11b000012m.g           162           136           180           180
Pahalv11b000013m.g             0             1             3             0
Pahalv11b000014m.g           113           205           220           123
                   HAL2_G988_482
Pahalv11b000001m.g             0
Pahalv11b000002m.g             0
Pahalv11b000003m.g             0
Pahalv11b000006m.g             0
Pahalv11b000007m.g             0
Pahalv11b000009m.g           266
Pahalv11b000011m.g            50
Pahalv11b000012m.g           208
Pahalv11b000013m.g             0
Pahalv11b000014m.g           269
####################
####################
#Part 2: find genes that are appropriate for ASE - high hal in hal, low hal in fil etc. 
hal.counts.hal<-counts.hal[,grep("HAL",colnames(counts.hal))]
fil.counts.fil<-counts.fil[,grep("FIL",colnames(counts.fil))]
hal.counts.fil<-counts.hal[,grep("FIL",colnames(counts.hal))]
fil.counts.hal<-counts.fil[,grep("HAL",colnames(counts.fil))]
med.hal.hal<-as.numeric(apply(hal.counts.hal,1,median))
med.fil.fil<-as.numeric(apply(fil.counts.fil,1,median))
med.hal.fil<-as.numeric(apply(hal.counts.fil,1,median))
med.fil.hal<-as.numeric(apply(fil.counts.hal,1,median))
genes<-as.character(rownames(hal.counts.hal))
genes.parents<-genes[med.hal.hal>50 &
                med.fil.fil>50 &
                med.hal.fil<1 &
                med.fil.hal<1]
length(genes.parents)
[1] 8479
#less stringent cuttoffs - 40% increase in markers
genes.parents.lax<-genes[med.hal.hal>10 &
                     med.fil.fil>10 &
                     med.hal.fil<2 &
                     med.fil.hal<2]
length(genes.parents.lax)
[1] 13712
#for remaining analyses, use 1/50 cutoffs - these will most likely give the strongest clustering for later

####################
#################### 
#Part 3: of the genes that look good in parents, which have both alleles in F1s?
fil.counts.f1<-counts.fil[,grep("F1",colnames(counts.fil))]
hal.counts.f1<-counts.hal[,grep("F1",colnames(counts.hal))]
med.fil.F1<-as.numeric(apply(fil.counts.f1,1,median))
med.hal.F1<-as.numeric(apply(hal.counts.f1,1,median))
genes<-as.character(rownames(fil.counts.f1))
genes.f1<-genes[med.fil.F1>10 &
                  med.hal.F1>10]
length(genes.f1)
[1] 13802
genes.parents.f1<-intersect(genes.parents,genes.f1)
length(genes.parents.f1) # number of genes that fit both parent and F1 criteria
[1] 8477
#we only lose 2 genes because of strong ASE in F1

####################
####################
#Part 4: parse the file to extract F2s
fil.counts.f2<-counts.fil[genes.parents.f1,grep("FH",colnames(counts.fil))]
fil.counts.f2$gene<-rownames(fil.counts.f2)
hal.counts.f2<-counts.hal[genes.parents.f1,grep("FH",colnames(counts.hal))]
hal.counts.f2$gene<-rownames(hal.counts.f2)

hal.counts.all<-data.frame(t(counts.hal[genes.parents.f1,]))
fil.counts.all<-data.frame(t(counts.fil[genes.parents.f1,]))
hal.counts.all$id<-rownames(hal.counts.all)
hal.counts.all$type<-substr(hal.counts.all$id,1,2)
fil.counts.all$id<-rownames(fil.counts.all)
fil.counts.all$type<-substr(fil.counts.all$id,1,2)


####################
####################
#Part 5 - call genotypes for all lines
#demostration of the function with plotting
for(i in 1:4){
  test<-genobyMclust(gene.name=genes.parents.f1[i],
                     ids=c("id","type"),
                     fil.file=fil.counts.all,
                     hal.file=hal.counts.all,
                     n.clust=3,
                     p.tol=1e-10,
                     logtrans=TRUE,plot=T)
}
Pahalv11b000009m.g : 1 samples miscategorized for F1 
Loading required package: grid

plot of chunk unnamed-chunk-1

NULL
Pahalv11b000011m.g : 1 samples miscategorized for F1 

plot of chunk unnamed-chunk-1

NULL
Pahalv11b000012m.g : 1 samples miscategorized for F1 

plot of chunk unnamed-chunk-1

NULL
Pahalv11b000014m.g : 1 samples miscategorized for F1 

plot of chunk unnamed-chunk-1

NULL
test$group<-sapply(test[,1],function(x) strsplit(x,"_")[[1]][1])
#there may be a problem with F1_H294_525 
#not run --- takes a while
# fil.counts.edit<-fil.counts.all[-which(rownames(fil.counts.all)=="F1_H294_525"),]
# hal.counts.edit<-hal.counts.all[-which(rownames(hal.counts.all)=="F1_H294_525"),]
# line.info<-hal.counts.edit[,c("id","type")]
# for(i in 1:length(genes.parents.f1)){
#   geno<-genobyMclust(gene.name=genes.parents.f1[i],
#                      ids=c("id","type"),
#                      fil.file=fil.counts.edit,
#                      hal.file=hal.counts.edit,
#                      n.clust=3,
#                      p.tol=1e-10,
#                      logtrans=TRUE,plot=F)
#   line.info<-merge(line.info,geno, by="id")
#   cat(i," .")
# }
# write.csv(line.info,file="ph2015_rawgenocalls2.csv", row.names=F)


####################
####################
#Part 6 merge genotype calls with annotation
#from genobyMclust, we get a df with one column for each gene
#make files in R/qtl format
line.info<-read.csv("ph2015_rawgenocalls2.csv")
annot<-read.delim("Phalliiv1.1b.gene.gff3")
annot<-read.csv("ph2015_v1.1annot.edited.csv")
annot.cull<-annot[annot$id %in% genes.parents.f1,c("chr","start","id")]
annot.cull$chr<-tolower(annot.cull$chr)
annot.cull$chr <- gsub("_", "", annot.cull$chr)
annot.cull$chr <- gsub("super", "chr", annot.cull$chr)
colnames(annot.cull)[3]<-"gene"

line.info.cull<-line.info
line.info.cull<-line.info.cull[line.info.cull$type=="FH",]
rownames(line.info.cull)<-line.info.cull$id
####################
####################
#Part 7: count the number of genotypes, nas and likelihood of segregation distortion for each gene
counts<-apply(line.info.cull[,-c(1:2)],2,function(x){
  nas<-length(x[is.na(x)])
  fils<-length(x[x=="fil"])
  hets<-length(x[x=="het"])
  hals<-length(x[x=="hal"])
  counts<-sum(nas,fils,hets,hals)
  chi2p<-chisq.test(c(fils,hets,hals), p=c(.25,.5,.25))$p.value
  kbstat<-sum((.25*log(.25/(fils/counts))),
              (.5*log(.5/(hets/counts))),
              (.25*log(.25/(hals/counts))))
  prop.na<-nas/sum(fils,hets,hals)
  out<-data.frame(fils,hets,hals,nas,chi2p,prop.na,kbstat)
  return(out)
})
counts.df <- ldply(counts, data.frame)
colnames(counts.df)[1]<-"gene"
hist(counts.df$prop.na)

plot of chunk unnamed-chunk-1

counts.95<-counts.df[counts.df$prop.na<=0.05,]
hist(counts.95$chi2p,breaks=1000, xlim=c(0,.1))

plot of chunk unnamed-chunk-1

plot(log(counts.df$chi2p),counts.df$prop.na, main="n.NAs by seg. dist. for each gene")

plot of chunk unnamed-chunk-1

####################
####################
#Part 8: count the number of genotypes, nas and likelihood of segregation distortion for each sample
counts.ind<-apply(line.info.cull[,-c(1:2)],1,function(x){
  nas<-length(x[is.na(x)])
  fils<-length(x[x=="fil"])
  hets<-length(x[x=="het"])
  hals<-length(x[x=="hal"])
  counts<-sum(nas,fils,hets,hals)
  chi2p<-chisq.test(c(fils,hets,hals), p=c(.25,.5,.25))$p.value
  kbstat<-sum((.25*log(.25/(fils/counts))),
              (.5*log(.5/(hets/counts))),
              (.25*log(.25/(hals/counts))))
  prop.na<-nas/sum(fils,hets,hals)
  out<-data.frame(fils,hets,hals,nas,chi2p,prop.na,kbstat)
  return(out)
})
counts.ind.df <- ldply(counts.ind, data.frame)
colnames(counts.ind.df)[1]<-"ind"
counts.ind.df[,1]<-line.info.cull[,1]
hist(counts.ind.df$prop.na)

plot of chunk unnamed-chunk-1

hist(log(counts.ind.df$chi2p))

plot of chunk unnamed-chunk-1

pairs(counts.ind.df[,c("fils","hets","hals")])

plot of chunk unnamed-chunk-1

counts.ind.df$tot.counts<-counts.ind.df$fils + counts.ind.df$hets + counts.ind.df$hals
counts.ind.df$prop.het<-counts.ind.df$hets/counts.ind.df$tot.counts
counts.ind.df$prop.hal<-counts.ind.df$hals/counts.ind.df$tot.counts
counts.ind.df$prop.fil<-counts.ind.df$fils/counts.ind.df$tot.counts
pairs(counts.ind.df[,c("prop.het","prop.hal","prop.fil")], ylim=c(0,1), xlim=c(0,1))

plot of chunk unnamed-chunk-1

mean(counts.ind.df$prop.het);mean(counts.ind.df$prop.hal);mean(counts.ind.df$prop.fil)
[1] 0.3450659
[1] 0.3610172
[1] 0.2939169
#cull by bad lines and genes - there are definitely a few contaminants here 
#(probably 3 fils and 1 hal called f2s)

###############
###############
#part 9 - prepare data for joinmap
badinds<-apply(counts.ind.df[,c("prop.het","prop.hal","prop.fil")], 1, function (x) any(x<0.1))
good.lines<-counts.ind.df$ind[!badinds]
good.genes<-counts.95$gene
final.culled.genmatrix<-line.info.cull[good.lines,good.genes]
final.culled.genmatrix<-data.frame(t(final.culled.genmatrix))
final.culled.genmatrix<-as.matrix(final.culled.genmatrix)
final.culled.genmatrix[final.culled.genmatrix=="fil"]<-"a"
final.culled.genmatrix[final.culled.genmatrix=="het"]<-"h"
final.culled.genmatrix[final.culled.genmatrix=="hal"]<-"b"
write.csv(final.culled.genmatrix, file="ph2015_final.culled.genmatrix.csv", row.names=T)
####################
####################
# #Part 10 - make the R/qtl map files
# counts.95.nosd<-counts.95[counts.95$chi2p>1e-10,]
# genotypes.final<-line.info.cull[,c("id",counts.95.nosd$gene)]
# 
# out.all<-data.frame(c(NA,NA,genotypes.final$id))
# colnames(out.all)<-"id"
# chr.nums<-substring(unique(annot.cull$chr),4,8)
# for(i in chr.nums){
#   chr.name<-paste("chr",i,sep="")
#   
#   genes.chr<-as.character(annot.cull$gene[annot.cull$chr==chr.name])[
#     as.character(annot.cull$gene[annot.cull$chr==chr.name]) %in% colnames(genotypes.final)]
#   if(length(genes.chr)==0){
#     next
#   }else{
#     chr.all<-genotypes.final[,c("id",genes.chr)]
#     annot.chr<-annot.cull[annot.cull$gene %in% genes.chr,]
#     
#     chr.unique<-chr.all[,colnames(unique(as.matrix(chr.all),MARGIN=2))] #check for duplicates
#     chr.unique[chr.unique=="het"]<-"H"
#     chr.unique[chr.unique=="fil"]<-"A"
#     chr.unique[chr.unique=="hal"]<-"B"
#     chr.all<-rbind(c("id",rep(i,length(annot.chr$start)))
#                    ,c("id",annot.chr$start*100/max(annot.chr$start)),
#                    chr.unique) #bind the information with the data
#     out.all<-cbind(out.all, chr.all)
#   }
# }
# 
# genfile.initial<-out.all[,-grep("id",colnames(out.all))[-1]]
# genfile.initial$id<-as.numeric(substring(genfile.initial$id,3,5))
# genfile.initial<-genfile.initial[,-2]
# genfile.initial[1:2,"id"]<-""
# genfile.initial2<- genfile.initial[!(duplicated(genfile.initial$id, incomparables="")),]
# write.csv(genfile.initial2,file="ph2015_newgen_initial.csv", row.names=F, quote=F)
# 
# phefile.initial<-genfile.initial2[-c(1:2),1:2]
# phefile.initial$phe1<-rnorm(length(phefile.initial[,1]))
# write.csv(phefile.initial,file="ph2015_newphe.csv", row.names=F, quote=F)
# 
# # next script - making the map in R/qtl ("ph2015_rqtlmap_fromASE.R)
#