Results for library HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A

setwd ("/home/boussau/Data/Dengomix/TestMeasureNumberOfReadsWithRepeats")
d <- read.table ("for_back_overlap.tsv", h=F, na.strings ="None")
dcomp <- d[complete.cases(d),]
summary(dcomp)
                                          V1                V2              V3              V4              V5              V6       
 SN7001146:348:HV5GLBCXY:1:1101:10000:45855:      1   Min.   :    0   Min.   :   29   Min.   :    0   Min.   :   29   Min.   :  0.0  
 SN7001146:348:HV5GLBCXY:1:1101:10000:48346:      1   1st Qu.: 3050   1st Qu.: 3170   1st Qu.: 3050   1st Qu.: 3171   1st Qu.: 85.0  
 SN7001146:348:HV5GLBCXY:1:1101:10001:78299:      1   Median : 5716   Median : 5850   Median : 5716   Median : 5850   Median :113.0  
 SN7001146:348:HV5GLBCXY:1:1101:10001:83580:      1   Mean   : 5531   Mean   : 5655   Mean   : 5531   Mean   : 5655   Mean   :116.6  
 SN7001146:348:HV5GLBCXY:1:1101:10002:53840:      1   3rd Qu.: 8063   3rd Qu.: 8224   3rd Qu.: 8063   3rd Qu.: 8224   3rd Qu.:147.0  
 SN7001146:348:HV5GLBCXY:1:1101:10003:22070:      1   Max.   :10786   Max.   :10807   Max.   :10783   Max.   :10807   Max.   :205.0  
 (Other)                                   :1546073                                                                                  
hist (d$V6, xlab="Size of overlap between forward and reverse reads", main ="", ylab="Count")

dim(dcomp)
[1] 1546079       6
dim(d)
[1] 1753013       6

How many fragments have a forward/reverse overlap of more than 50 bases?

length(which(d$V6>50))
[1] 1477057

So, in frequency:

1477057/1753013
[1] 0.8425819

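84% of fragments have forward and reverse reads overlapping by more than 50 bases. Note that the denominator is all 1,753,013 fragments, including the 206,934 fragments with missing values that are excluded from dcomp.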

Analysis of repeats within the forward read, or within the reverse read.

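5 000 forward reads and 5 000 reverse reads have been sampled (the proportions below are computed out of 5000; the figure of 20 000 presumably refers to the number of FASTQ lines sampled, at 4 lines per read).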

fo <- read.table ("forward_repeat.tsv", h=T, sep="\t")
reve <- read.table ("reverse_repeat.tsv", h=T, sep="\t")
summary(fo)
                                         name      longest_repeat 
 SN7001146:348:HV5GLBCXY:1:1107:10005:11405:   1   Min.   :33.00  
 SN7001146:348:HV5GLBCXY:1:1107:10019:2375 :   1   1st Qu.:40.00  
 SN7001146:348:HV5GLBCXY:1:1107:10025:9493 :   1   Median :49.00  
 SN7001146:348:HV5GLBCXY:1:1107:10031:11309:   1   Mean   :51.46  
 SN7001146:348:HV5GLBCXY:1:1107:10047:5911 :   1   3rd Qu.:68.00  
 SN7001146:348:HV5GLBCXY:1:1107:10053:9755 :   1   Max.   :92.00  
 (Other)                                   :3180                  
summary(reve)
                                         name      longest_repeat 
 SN7001146:348:HV5GLBCXY:1:1107:10005:11405:   1   Min.   :33.00  
 SN7001146:348:HV5GLBCXY:1:1107:10019:2375 :   1   1st Qu.:41.00  
 SN7001146:348:HV5GLBCXY:1:1107:10025:9493 :   1   Median :53.00  
 SN7001146:348:HV5GLBCXY:1:1107:10031:11309:   1   Mean   :53.91  
 SN7001146:348:HV5GLBCXY:1:1107:10047:5911 :   1   3rd Qu.:68.00  
 SN7001146:348:HV5GLBCXY:1:1107:10053:9755 :   1   Max.   :68.00  
 (Other)                                   :3495                  
hist(fo$longest_repeat, col=rgb(0,0,0,0.3), ylim=c(0,1500), main="", xlab="Length of within-read repeat", breaks=2*(15:46))
hist(reve$longest_repeat, col=rgb(1,0,0,0.3), add=T, breaks=2*(15:46))
legend("topleft", col=c(rgb(0,0,0,0.3), rgb(1,0,0,0.3)), legend=c("Forward", "Reverse"), pch=20)

There are repeats within the reads, with a median and mean length of around 50 bases. So CirSeq seems to have worked, for short repeats.

How many reads have repeats?

dim(fo)[1] / 5000
[1] 0.6372

64% of the forward reads, and

dim(reve)[1] / 5000
[1] 0.7002

70% of the reverse reads.

Results for library HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24D

d <- read.table ("TD24Dfor_back_overlap.tsv", h=F, na.strings ="None")
dcomp <- d[complete.cases(d),]
summary(dcomp)
                                          V1                V2              V3              V4              V5              V6       
 SN7001146:360:HJJ7JBCX2:1:1101:10000:22124:      1   Min.   :    0   Min.   :  166   Min.   :    0   Min.   :  166   Min.   :162.0  
 SN7001146:360:HJJ7JBCX2:1:1101:10001:57897:      1   1st Qu.: 3225   1st Qu.: 3418   1st Qu.: 3225   1st Qu.: 3418   1st Qu.:189.0  
 SN7001146:360:HJJ7JBCX2:1:1101:10001:78603:      1   Median : 5666   Median : 5861   Median : 5666   Median : 5861   Median :199.0  
 SN7001146:360:HJJ7JBCX2:1:1101:10002:47548:      1   Mean   : 5497   Mean   : 5691   Mean   : 5497   Mean   : 5691   Mean   :193.9  
 SN7001146:360:HJJ7JBCX2:1:1101:10005:80013:      1   3rd Qu.: 7824   3rd Qu.: 8015   3rd Qu.: 7824   3rd Qu.: 8015   3rd Qu.:200.0  
 SN7001146:360:HJJ7JBCX2:1:1101:10005:86589:      1   Max.   :10636   Max.   :10807   Max.   :10636   Max.   :10807   Max.   :200.0  
 (Other)                                   :1013091                                                                                  
hist (dcomp$V6, xlab="Size of overlap between forward and reverse reads", main ="", ylab="Count")

dim(dcomp)
[1] 1013097       6
dim(d)
[1] 1013097       6

How many fragments have a forward/reverse overlap of more than 50 bases?

length(which(d$V6>50))
[1] 1013097

So, in frequency:

1013097/1013097
[1] 1

100% of fragments have forward and reverse reads overlapping by more than 50 bases (in this library no fragment has missing values: d and dcomp have the same number of rows).

Analysis of repeats within the forward read, or within the reverse read.

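5 000 forward reads and 5 000 reverse reads have been sampled (the proportions below are computed out of 5000, and each table contains 5000 rows; the figure of 20 000 presumably refers to the number of FASTQ lines sampled, at 4 lines per read).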

fo <- read.table ("forward24.tsv", h=T, sep="\t")
reve <- read.table ("reverse24.tsv", h=T, sep="\t")
summary(fo)
                                        name      longest_repeat  
 SN7001146:360:HJJ7JBCX2:1:1101:10001:3343:   1   Min.   : 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10025:2580:   1   1st Qu.: 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10026:3008:   1   Median : 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10031:4291:   1   Mean   : 72.15  
 SN7001146:360:HJJ7JBCX2:1:1101:10031:4360:   1   3rd Qu.: 76.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10033:4578:   1   Max.   :100.00  
 (Other)                                  :4994                   
summary(reve)
                                        name      longest_repeat  
 SN7001146:360:HJJ7JBCX2:1:1101:10001:3343:   1   Min.   : 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10025:2580:   1   1st Qu.: 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10026:3008:   1   Median : 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10031:4291:   1   Mean   : 68.32  
 SN7001146:360:HJJ7JBCX2:1:1101:10031:4360:   1   3rd Qu.: 66.00  
 SN7001146:360:HJJ7JBCX2:1:1101:10033:4578:   1   Max.   :100.00  
 (Other)                                  :4994                   
breaks = 2*(32:50)
hist(fo$longest_repeat, col=rgb(0,0,0,0.3), ylim=c(0,4500), main="", xlab="Length of within-read repeat", breaks=breaks)
hist(reve$longest_repeat, col=rgb(1,0,0,0.3), add=T, breaks=breaks)
legend("topright", col=c(rgb(0,0,0,0.3), rgb(1,0,0,0.3)), legend=c("Forward", "Reverse"), pch=20)

How many reads have repeats?

dim(fo)[1] / 5000
[1] 1

100% of the forward reads, and

dim(reve)[1] / 5000
[1] 1

100% of the reverse reads.

LS0tCnRpdGxlOiAiT3ZlcmxhcCBhbmFseXNpcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBSZXN1bHRzIGZvciBsaWJyYXJ5IEhWNUdMQkNYWV9aSUtWXzE3czAwNjEzOS0xLTFfRFJFVVhfbGFuZTFERDEyQQoKYGBge3J9CnNldHdkICgiL2hvbWUvYm91c3NhdS9EYXRhL0RlbmdvbWl4L1Rlc3RNZWFzdXJlTnVtYmVyT2ZSZWFkc1dpdGhSZXBlYXRzIikKZCA8LSByZWFkLnRhYmxlICgiZm9yX2JhY2tfb3ZlcmxhcC50c3YiLCBoPUYsIG5hLnN0cmluZ3MgPSJOb25lIikKYGBgCgpgYGB7cn0KZGNvbXAgPC0gZFtjb21wbGV0ZS5jYXNlcyhkKSxdCmBgYAoKCmBgYHtyfQoKc3VtbWFyeShkY29tcCkKYGBgCgoKYGBge3J9Cmhpc3QgKGQkVjYsIHhsYWI9IlNpemUgb2Ygb3ZlcmxhcCBiZXR3ZWVuIGZvcndhcmQgYW5kIHJldmVyc2UgcmVhZHMiLCBtYWluID0iIiwgeWxhYj0iQ291bnQiKQpgYGAKCgoKYGBge3J9CmRpbShkY29tcCkKYGBgCgpgYGB7cn0KZGltKGQpCmBgYAoKIyBIb3cgbWFueSBoYXZlIHJlcGVhdHMgb2YgbW9yZSB0aGFuIDUwPwpgYGB7cn0KbGVuZ3RoKHdoaWNoKGQkVjY+NTApKQpgYGAKClNvLCBpbiBmcmVxdWVuY3k6CmBgYHtyfQoxNDc3MDU3LzE3NTMwMTMKYGBgCgo4NCUgb2YgZnJhZ21lbnRzIGhhdmUgZm9yd2FyZCBhbmQgcmV2ZXJzZSByZWFkcyBvdmVybGFwcGluZyBvbiBtb3JlIHRoYW4gNTAgYmFzZXMuCgoKIyMgQW5hbHlzaXMgb2YgcmVwZWF0cyB3aXRoaW4gdGhlIGZvcndhcmQgcmVhZCwgb3Igd2l0aGluIHRoZSByZXZlcnNlIHJlYWQuCjIwIDAwMCBmb3J3YXJkIHJlYWRzIGFuZCAyMCAwMDAgcmV2ZXJzZSByZWFkcyBoYXZlIGJlZW4gc2FtcGxlZC4KYGBge3J9CmZvIDwtIHJlYWQudGFibGUgKCJmb3J3YXJkX3JlcGVhdC50c3YiLCBoPVQsIHNlcD0iXHQiKQpyZXZlIDwtIHJlYWQudGFibGUgKCJyZXZlcnNlX3JlcGVhdC50c3YiLCBoPVQsIHNlcD0iXHQiKQpgYGAKCgoKYGBge3J9CnN1bW1hcnkoZm8pCmBgYAoKYGBge3J9CnN1bW1hcnkocmV2ZSkKYGBgCgoKYGBge3J9Cmhpc3QoZm8kbG9uZ2VzdF9yZXBlYXQsIGNvbD1yZ2IoMCwwLDAsMC4zKSwgeWxpbT1jKDAsMTUwMCksIG1haW49IiIsIHhsYWI9Ikxlbmd0aCBvZiB3aXRoaW4tcmVhZCByZXBlYXQiLCBicmVha3M9MiooMTU6NDYpKQpoaXN0KHJldmUkbG9uZ2VzdF9yZXBlYXQsIGNvbD1yZ2IoMSwwLDAsMC4zKSwgYWRkPVQsIGJyZWFrcz0yKigxNTo0NikpCmxlZ2VuZCgidG9wbGVmdCIsIGNvbD1jKHJnYigwLDAsMCwwLjMpLCByZ2IoMSwwLDAsMC4zKSksIGxlZ2VuZD1jKCJGb3J3YXJkIiwgIlJldmVyc2UiKSwgcGNoPTIwKQpgYGAKClRoZXJlIGFyZSByZXBlYXRzIHdpdGhpbiB0aGUgcmVhZHMsIG9mIG1lZGlhbiBvciBtZWFuIGxlbmd0aCBhcm91bmQgNTAuIFNvIGNpcnNlcSBzZWVtcyB0byBoYXZlIHdvcmtlZCwgZm9yIHNtYWxsIHJlcGVhdHMuCgojIyBIb3cgbWFueSByZWFkcyBoYXZlIHJlcGVhdHM/CmBgYHtyfQpkaW0oZm8p
WzFdIC8gNTAwMApgYGAKCjY0JSBvZiB0aGUgZm9yd2FyZCByZWFkcywgYW5kIAoKYGBge3J9CmRpbShyZXZlKVsxXSAvIDUwMDAKCmBgYAo3MCUgb2YgdGhlIHJldmVyc2UgcmVhZHMuCgojIFJlc3VsdHMgZm9yIGxpYnJhcnkgSEpKN0pCQ1gyX1pJS1Ytcy1hbmQtY18xOHMwMDQyNTgtMS0xX0RSRVVYX2xhbmUxVEQyNEQKCmBgYHtyfQpkIDwtIHJlYWQudGFibGUgKCJURDI0RGZvcl9iYWNrX292ZXJsYXAudHN2IiwgaD1GLCBuYS5zdHJpbmdzID0iTm9uZSIpCmBgYAoKYGBge3J9CmRjb21wIDwtIGRbY29tcGxldGUuY2FzZXMoZCksXQpgYGAKCgpgYGB7cn0KCnN1bW1hcnkoZGNvbXApCmBgYAoKYGBge3J9Cmhpc3QgKGRjb21wJFY2LCB4bGFiPSJTaXplIG9mIG92ZXJsYXAgYmV0d2VlbiBmb3J3YXJkIGFuZCByZXZlcnNlIHJlYWRzIiwgbWFpbiA9IiIsIHlsYWI9IkNvdW50IikKYGBgCgoKCgpgYGB7cn0KZGltKGRjb21wKQpgYGAKCmBgYHtyfQpkaW0oZCkKYGBgCgojIEhvdyBtYW55IGhhdmUgcmVwZWF0cyBvZiBtb3JlIHRoYW4gNTA/CmBgYHtyfQpsZW5ndGgod2hpY2goZCRWNj41MCkpCmBgYAoKU28sIGluIGZyZXF1ZW5jeToKYGBge3J9CjEwMTMwOTcvMTAxMzA5NwpgYGAKCjEwMCUgb2YgZnJhZ21lbnRzIGhhdmUgZm9yd2FyZCBhbmQgcmV2ZXJzZSByZWFkcyBvdmVybGFwcGluZyBvbiBtb3JlIHRoYW4gNTAgYmFzZXMuCgoKIyMgQW5hbHlzaXMgb2YgcmVwZWF0cyB3aXRoaW4gdGhlIGZvcndhcmQgcmVhZCwgb3Igd2l0aGluIHRoZSByZXZlcnNlIHJlYWQuCjIwIDAwMCBmb3J3YXJkIHJlYWRzIGFuZCAyMCAwMDAgcmV2ZXJzZSByZWFkcyBoYXZlIGJlZW4gc2FtcGxlZC4KYGBge3J9CmZvIDwtIHJlYWQudGFibGUgKCJmb3J3YXJkMjQudHN2IiwgaD1ULCBzZXA9Ilx0IikKcmV2ZSA8LSByZWFkLnRhYmxlICgicmV2ZXJzZTI0LnRzdiIsIGg9VCwgc2VwPSJcdCIpCmBgYAoKCgpgYGB7cn0Kc3VtbWFyeShmbykKYGBgCgpgYGB7cn0Kc3VtbWFyeShyZXZlKQpgYGAKCgpgYGB7cn0KYnJlYWtzID0gMiooMzI6NTApCmhpc3QoZm8kbG9uZ2VzdF9yZXBlYXQsIGNvbD1yZ2IoMCwwLDAsMC4zKSwgeWxpbT1jKDAsNDUwMCksIG1haW49IiIsIHhsYWI9Ikxlbmd0aCBvZiB3aXRoaW4tcmVhZCByZXBlYXQiLCBicmVha3M9YnJlYWtzKQpoaXN0KHJldmUkbG9uZ2VzdF9yZXBlYXQsIGNvbD1yZ2IoMSwwLDAsMC4zKSwgYWRkPVQsIGJyZWFrcz1icmVha3MpCmxlZ2VuZCgidG9wcmlnaHQiLCBjb2w9YyhyZ2IoMCwwLDAsMC4zKSwgcmdiKDEsMCwwLDAuMykpLCBsZWdlbmQ9YygiRm9yd2FyZCIsICJSZXZlcnNlIiksIHBjaD0yMCkKYGBgCgojIyBIb3cgbWFueSByZWFkcyBoYXZlIHJlcGVhdHM/CmBgYHtyfQpkaW0oZm8pWzFdIC8gNTAwMApgYGAKCjEwMCUgb2YgdGhlIGZvcndhcmQgcmVhZHMsIGFuZCAKCmBgYHtyfQpkaW0ocmV2ZSlbMV0gLyA1MDAwCgpgYGAKMTAwJSBvZiB0aGUgcmV2ZXJzZSByZWFkcy4K