# Overlap between forward and reverse reads, first dataset.
# NOTE(review): the hard-coded absolute path ties this analysis to one machine;
# it is kept because the later read.table() calls use file names relative to it.
setwd("/home/boussau/Data/Dengomix/TestMeasureNumberOfReadsWithRepeats")
# The file has no header row; the literal string "None" marks missing values.
d <- read.table("for_back_overlap.tsv", header = FALSE, na.strings = "None")
# Keep only the rows without any missing value.
dcomp <- d[complete.cases(d), ]
summary(dcomp)
V1 V2 V3 V4 V5 V6
SN7001146:348:HV5GLBCXY:1:1101:10000:45855: 1 Min. : 0 Min. : 29 Min. : 0 Min. : 29 Min. : 0.0
SN7001146:348:HV5GLBCXY:1:1101:10000:48346: 1 1st Qu.: 3050 1st Qu.: 3170 1st Qu.: 3050 1st Qu.: 3171 1st Qu.: 85.0
SN7001146:348:HV5GLBCXY:1:1101:10001:78299: 1 Median : 5716 Median : 5850 Median : 5716 Median : 5850 Median :113.0
SN7001146:348:HV5GLBCXY:1:1101:10001:83580: 1 Mean : 5531 Mean : 5655 Mean : 5531 Mean : 5655 Mean :116.6
SN7001146:348:HV5GLBCXY:1:1101:10002:53840: 1 3rd Qu.: 8063 3rd Qu.: 8224 3rd Qu.: 8063 3rd Qu.: 8224 3rd Qu.:147.0
SN7001146:348:HV5GLBCXY:1:1101:10003:22070: 1 Max. :10786 Max. :10807 Max. :10783 Max. :10807 Max. :205.0
(Other) :1546073
# Distribution of the overlap size (column V6); hist() ignores missing values.
hist(
  d$V6,
  main = "",
  xlab = "Size of overlap between forward and reverse reads",
  ylab = "Count"
)
# Number of rows and columns left after removing incomplete rows.
dim(dcomp)
[1] 1546079 6
# Rows and columns of the full table, before incomplete rows are removed.
dim(d)
[1] 1753013 6
# Number of fragments whose overlap (V6) exceeds 50; missing values are
# not counted.
sum(d$V6 > 50, na.rm = TRUE)
[1] 1477057
So, in frequency:
# Fraction of all fragments (every row of d, incomplete ones included) whose
# overlap exceeds 50, computed from the data rather than from retyped counts.
sum(d$V6 > 50, na.rm = TRUE) / nrow(d)
[1] 0.8425819
84% of all fragments (including those with missing values) have forward and reverse reads that overlap by more than 50 bases.
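5 000 forward reads and 5 000 reverse reads have been sampled (the fractions below are computed out of 5000; the figure of 20 000 originally written here presumably counted FASTQ lines rather than reads).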
# Within-read repeat tables for the forward and reverse reads; both files
# are tab-separated and start with a header row.
fo <- read.table("forward_repeat.tsv", header = TRUE, sep = "\t")
reve <- read.table("reverse_repeat.tsv", header = TRUE, sep = "\t")
summary(fo)
name longest_repeat
SN7001146:348:HV5GLBCXY:1:1107:10005:11405: 1 Min. :33.00
SN7001146:348:HV5GLBCXY:1:1107:10019:2375 : 1 1st Qu.:40.00
SN7001146:348:HV5GLBCXY:1:1107:10025:9493 : 1 Median :49.00
SN7001146:348:HV5GLBCXY:1:1107:10031:11309: 1 Mean :51.46
SN7001146:348:HV5GLBCXY:1:1107:10047:5911 : 1 3rd Qu.:68.00
SN7001146:348:HV5GLBCXY:1:1107:10053:9755 : 1 Max. :92.00
(Other) :3180
# Same summary for the reverse-read repeat table.
summary(reve)
name longest_repeat
SN7001146:348:HV5GLBCXY:1:1107:10005:11405: 1 Min. :33.00
SN7001146:348:HV5GLBCXY:1:1107:10019:2375 : 1 1st Qu.:41.00
SN7001146:348:HV5GLBCXY:1:1107:10025:9493 : 1 Median :53.00
SN7001146:348:HV5GLBCXY:1:1107:10031:11309: 1 Mean :53.91
SN7001146:348:HV5GLBCXY:1:1107:10047:5911 : 1 3rd Qu.:68.00
SN7001146:348:HV5GLBCXY:1:1107:10053:9755 : 1 Max. :68.00
(Other) :3495
# Overlay the forward (translucent black) and reverse (translucent red)
# repeat-length histograms. Both share the same 2-wide bins from 30 to 92
# so that the bars line up.
repeat_breaks <- 2 * (15:46)
forward_col <- rgb(0, 0, 0, 0.3)
reverse_col <- rgb(1, 0, 0, 0.3)
hist(fo$longest_repeat, col = forward_col, ylim = c(0, 1500), main = "",
     xlab = "Length of within-read repeat", breaks = repeat_breaks)
# add = TRUE draws on top of the existing plot instead of starting a new one.
hist(reve$longest_repeat, col = reverse_col, add = TRUE,
     breaks = repeat_breaks)
legend("topleft", col = c(forward_col, reverse_col),
       legend = c("Forward", "Reverse"), pch = 20)
There are repeats within the reads, with median and mean lengths of around 50. So CirSeq seems to have worked, at least for small repeats.
# Share of forward reads listed in the repeat table, out of 5000
# (presumably the number of reads sampled; TODO confirm).
nrow(fo) / 5000
[1] 0.6372
Repeats were found in 64% of the forward reads, and
# Share of reverse reads listed in the repeat table, out of 5000
# (presumably the number of reads sampled; TODO confirm).
nrow(reve) / 5000
[1] 0.7002
70% of the reverse reads.
# Same overlap analysis on the TD24D file; this overwrites d and dcomp.
# The file has no header row; the literal string "None" marks missing values.
d <- read.table(
  "TD24Dfor_back_overlap.tsv",
  header = FALSE,
  na.strings = "None"
)
# Keep only the rows without any missing value.
dcomp <- d[complete.cases(d), ]
summary(dcomp)
V1 V2 V3 V4 V5 V6
SN7001146:360:HJJ7JBCX2:1:1101:10000:22124: 1 Min. : 0 Min. : 166 Min. : 0 Min. : 166 Min. :162.0
SN7001146:360:HJJ7JBCX2:1:1101:10001:57897: 1 1st Qu.: 3225 1st Qu.: 3418 1st Qu.: 3225 1st Qu.: 3418 1st Qu.:189.0
SN7001146:360:HJJ7JBCX2:1:1101:10001:78603: 1 Median : 5666 Median : 5861 Median : 5666 Median : 5861 Median :199.0
SN7001146:360:HJJ7JBCX2:1:1101:10002:47548: 1 Mean : 5497 Mean : 5691 Mean : 5497 Mean : 5691 Mean :193.9
SN7001146:360:HJJ7JBCX2:1:1101:10005:80013: 1 3rd Qu.: 7824 3rd Qu.: 8015 3rd Qu.: 7824 3rd Qu.: 8015 3rd Qu.:200.0
SN7001146:360:HJJ7JBCX2:1:1101:10005:86589: 1 Max. :10636 Max. :10807 Max. :10636 Max. :10807 Max. :200.0
(Other) :1013091
# Distribution of the overlap size (column V6) over the complete rows.
hist(
  dcomp$V6,
  main = "",
  xlab = "Size of overlap between forward and reverse reads",
  ylab = "Count"
)
# Number of rows and columns left after removing incomplete rows.
dim(dcomp)
[1] 1013097 6
# Rows and columns of the full table, before incomplete rows are removed.
dim(d)
[1] 1013097 6
# Number of fragments whose overlap (V6) exceeds 50; missing values are
# not counted.
sum(d$V6 > 50, na.rm = TRUE)
[1] 1013097
So, in frequency:
# Fraction of all fragments (every row of d) whose overlap exceeds 50,
# computed from the data rather than from retyped counts.
sum(d$V6 > 50, na.rm = TRUE) / nrow(d)
[1] 1
100% of fragments have forward and reverse reads that overlap by more than 50 bases.
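5 000 forward reads and 5 000 reverse reads have been sampled (the fractions below are computed out of 5000; the figure of 20 000 originally written here presumably counted FASTQ lines rather than reads).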
# Within-read repeat tables for the second dataset; this overwrites fo and
# reve. Both files are tab-separated and start with a header row.
fo <- read.table("forward24.tsv", header = TRUE, sep = "\t")
reve <- read.table("reverse24.tsv", header = TRUE, sep = "\t")
summary(fo)
name longest_repeat
SN7001146:360:HJJ7JBCX2:1:1101:10001:3343: 1 Min. : 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10025:2580: 1 1st Qu.: 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10026:3008: 1 Median : 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10031:4291: 1 Mean : 72.15
SN7001146:360:HJJ7JBCX2:1:1101:10031:4360: 1 3rd Qu.: 76.00
SN7001146:360:HJJ7JBCX2:1:1101:10033:4578: 1 Max. :100.00
(Other) :4994
# Same summary for the reverse-read repeat table.
summary(reve)
name longest_repeat
SN7001146:360:HJJ7JBCX2:1:1101:10001:3343: 1 Min. : 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10025:2580: 1 1st Qu.: 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10026:3008: 1 Median : 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10031:4291: 1 Mean : 68.32
SN7001146:360:HJJ7JBCX2:1:1101:10031:4360: 1 3rd Qu.: 66.00
SN7001146:360:HJJ7JBCX2:1:1101:10033:4578: 1 Max. :100.00
(Other) :4994
# Overlay the forward (translucent black) and reverse (translucent red)
# repeat-length histograms on shared 2-wide bins from 64 to 100.
breaks <- 2 * (32:50)
hist(fo$longest_repeat, col = rgb(0, 0, 0, 0.3), ylim = c(0, 4500),
     main = "", xlab = "Length of within-read repeat", breaks = breaks)
# add = TRUE draws on top of the existing plot instead of starting a new one.
hist(reve$longest_repeat, col = rgb(1, 0, 0, 0.3), add = TRUE,
     breaks = breaks)
legend("topright", col = c(rgb(0, 0, 0, 0.3), rgb(1, 0, 0, 0.3)),
       legend = c("Forward", "Reverse"), pch = 20)
# Share of forward reads listed in the repeat table, out of 5000
# (presumably the number of reads sampled; TODO confirm).
nrow(fo) / 5000
[1] 1
Repeats were found in 100% of the forward reads, and
# Share of reverse reads listed in the repeat table, out of 5000
# (presumably the number of reads sampled; TODO confirm).
nrow(reve) / 5000
[1] 1
100% of the reverse reads.