gsize trials

library(ggplot2)
setwd("~/projects/R_reports")

Get some data on relative genome size from HapMap2 and those reads mapped to Ensembl v3 version 22 of the cDNA with only “known” cDNAs

#Genome size from Chia
g=pipe(paste("curl -o - https://gist.githubusercontent.com/rossibarra/7638160/raw/62c4dab3f8a96a7d86cfe01a524dc1df2d03784f/genome_size"))
gsize<-read.table(g,header=T,colClasses=c("character","numeric","numeric"))

#Mapping to v3 cDNA ("known" cdna only)
m=pipe(paste("ssh farm 'cat ~/projects/genomesize/results/percent_hm2.txt'"))
mapped<-read.table(m,header=T,colClasses=c("character",rep("numeric",3)))
mapped$Line=gsub("_merged.fq.bam","",mapped$Line)

#Munge away: get lines with both genome size and percent mapping
data<-merge(gsize,mapped,by="Line")

Test correlation

ggplot(data=data,aes(x=GenomeSize,y=percent))+geom_point()+geom_smooth(method="lm")+xlab("1C genome size")+ylab("Fraction Reads Aligning to cDNA")

plot of chunk unnamed-chunk-3

cor.test(data$GenomeSize,data$percent)

## 
##  Pearson's product-moment correlation
## 
## data:  data$GenomeSize and data$percent
## t = 4.981, df = 33, p-value = 1.95e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4118 0.8113
## sample estimates:
##    cor 
## 0.6551

Significant is good, but positive correlation is not what we predict. Let’s remove TILs and see what happens

x=data[c(1:24,34:35),]
ggplot(data=x,aes(x=GenomeSize,y=percent))+geom_point()+geom_smooth(method="lm")+xlab("1C genome size")+ylab("Fraction Reads Aligning to cDNA")

plot of chunk unnamed-chunk-4

cor.test(x$GenomeSize,x$percent)

## 
##  Pearson's product-moment correlation
## 
## data:  x$GenomeSize and x$percent
## t = 1.398, df = 24, p-value = 0.175
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1264  0.5981
## sample estimates:
##    cor 
## 0.2744