First 96 taxa from Centeromere Diversity Panel (CDP). Reads aligned with bwa-mem and depth calculated from bedtools. See repo for details on methods.
# set stuff up
library(ggplot2)
setwd("~/projects/R_reports")
# depth data system('curl -o -
# https://raw.github.com/rossibarra/gsize/master/results/depths.txt >
# data/depths.txt')
# redid with depths >1 cut
system("curl -o - https://raw.github.com/rossibarra/gsize/master/results/cutdepths.txt > data/depths.txt")
depth <- read.table("data/depths.txt")
colnames(depth) = c("Line", "totreads", "totbp", "fpkm", "w_fpkm")
depth$Line = as.factor(gsub(".fastq.bam", "", depth$Line))
# CDP data
system("curl -o - https://raw.github.com/rossibarra/gsize/master/data/CDP.csv > data/cdp.csv")
cdp <- read.csv("data/cdp.csv", header = T)
colnames(cdp)[2] = "gs1c"
# corrected for total number of reads in each file
depth$totbp = depth$totbp/depth$totreads/100 #divide by 100 and now fraction of all 100bp reads mapping
depth$fpkm = 10000 * depth$fpkm/depth$totreads
depth$w_fpkm = 1000 * depth$w_fpkm/depth$totreads
# make taxa column, merge
taxa = substr(depth$Line, 1, 5)
gsize = cbind(merge(depth, cdp, by = "Line"), taxa)
# play with subset with reasonable numbers: only maize, as teosinte gsize
# estimates are averages of 2 individuals not sequenced!
gs = subset(gsize, gsize$taxa == "RIMMA")
gs = subset(gs, gs$Percent_TE > 0.2) #drop strange outlier RIMMA0385.1 -> Tao Pueblo weirdness!
# example depth data
system("curl -o - https://raw.github.com/rossibarra/gsize/master/results/example_gene_depths.txt > data/ex_depths.txt")
eg <- read.table("data/ex_depths.txt")
ggplot(data = eg, aes(log(V2))) + geom_histogram() + xlab("log mean depth per gene")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(data = gsize, aes(x = Percent_180, y = Percent_TR.1, color = taxa)) +
geom_point() + geom_smooth(method = "lm")
ggplot(data = gsize, aes(x = gs1c, y = totbp, color = taxa)) + geom_point() +
geom_smooth(method = "lm") + xlab("1C genome size") + ylab("Fraction Reads Aligning")
## Warning: Removed 8 rows containing missing values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
ggplot(data = gsize, aes(x = gs1c, y = Percent_TE, color = taxa)) + geom_point() +
geom_smooth(method = "lm") + xlab("1C genome size") + ylab("Fraction Reads Aligning")
## Warning: Removed 8 rows containing missing values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
summary(lm(gs$gs1c ~ gs$Percent_TE))
##
## Call:
## lm(formula = gs$gs1c ~ gs$Percent_TE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3490 -0.1381 0.0182 0.0923 0.4279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.857 0.979 7.00 1.9e-08 ***
## gs$Percent_TE -6.215 1.573 -3.95 0.00031 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.183 on 40 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.281, Adjusted R-squared: 0.263
## F-statistic: 15.6 on 1 and 40 DF, p-value: 0.000307
summary(lm(gs$gs1c ~ gs$fpkm))
##
## Call:
## lm(formula = gs$gs1c ~ gs$fpkm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3490 -0.1286 -0.0077 0.1271 0.6933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.911 0.115 25.3 <2e-16 ***
## gs$fpkm 36.302 51.863 0.7 0.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.215 on 40 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.0121, Adjusted R-squared: -0.0126
## F-statistic: 0.49 on 1 and 40 DF, p-value: 0.488