library(ggplot2)
setwd("~/projects/R_reports")
Get some data on relative genome size from HapMap2 and those reads mapped to Ensembl v3 version 22 of the cDNA with only “known” cDNAs
#Genome size from Chia
g=pipe(paste("curl -o - https://gist.githubusercontent.com/rossibarra/7638160/raw/62c4dab3f8a96a7d86cfe01a524dc1df2d03784f/genome_size"))
gsize<-read.table(g,header=T,colClasses=c("character","numeric","numeric"))
#Mapping to v3 cDNA ("known" cdna only)
m=pipe(paste("ssh farm 'cat ~/projects/genomesize/results/percent_hm2.txt'"))
mapped<-read.table(m,header=T,colClasses=c("character",rep("numeric",3)))
mapped$Line=gsub("_merged.fq.bam","",mapped$Line)
#Munge away: get lines with both genome size and percent mapping
data<-merge(gsize,mapped,by="Line")
Test correlation
ggplot(data=data,aes(x=GenomeSize,y=percent))+geom_point()+geom_smooth(method="lm")+xlab("1C genome size")+ylab("Fraction Reads Aligning to cDNA")
cor.test(data$GenomeSize,data$percent)
##
## Pearson's product-moment correlation
##
## data: data$GenomeSize and data$percent
## t = 4.981, df = 33, p-value = 1.95e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4118 0.8113
## sample estimates:
## cor
## 0.6551
Significant is good, but positive correlation is not what we predict. Let’s remove TILs and see what happens
x=data[c(1:24,34:35),]
ggplot(data=x,aes(x=GenomeSize,y=percent))+geom_point()+geom_smooth(method="lm")+xlab("1C genome size")+ylab("Fraction Reads Aligning to cDNA")
cor.test(x$GenomeSize,x$percent)
##
## Pearson's product-moment correlation
##
## data: x$GenomeSize and x$percent
## t = 1.398, df = 24, p-value = 0.175
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1264 0.5981
## sample estimates:
## cor
## 0.2744