From My blogger fun-with-genome-1
It’s final exam’s week. Every one is buried in their study. I’m now taking a break and making some fun with the human genome package BSgenome.Hsapiens.UCSC.hg19.
This package contains the entire genome sequences for human provided by University of California, Santa Cruz (UCSC).
library(BSgenome.Hsapiens.UCSC.hg19)
There is a set of generic functions for us to get or modify the sequence information like seqinfo, seqnames, seqlengths…
genome <- BSgenome.Hsapiens.UCSC.hg19
seqinfo(genome)
## Seqinfo object with 93 sequences (1 circular) from hg19 genome:
## seqnames seqlengths isCircular genome
## chr1 249250621 FALSE hg19
## chr2 243199373 FALSE hg19
## chr3 198022430 FALSE hg19
## chr4 191154276 FALSE hg19
## chr5 180915260 FALSE hg19
## ... ... ... ...
## chrUn_gl000245 36651 FALSE hg19
## chrUn_gl000246 38154 FALSE hg19
## chrUn_gl000247 36422 FALSE hg19
## chrUn_gl000248 39786 FALSE hg19
## chrUn_gl000249 38502 FALSE hg19
seqnames(genome)
## [1] "chr1" "chr2"
## [3] "chr3" "chr4"
## [5] "chr5" "chr6"
## [7] "chr7" "chr8"
## [9] "chr9" "chr10"
## [11] "chr11" "chr12"
## [13] "chr13" "chr14"
## [15] "chr15" "chr16"
## [17] "chr17" "chr18"
## [19] "chr19" "chr20"
## [21] "chr21" "chr22"
## [23] "chrX" "chrY"
## [25] "chrM" "chr1_gl000191_random"
## [27] "chr1_gl000192_random" "chr4_ctg9_hap1"
## [29] "chr4_gl000193_random" "chr4_gl000194_random"
## [31] "chr6_apd_hap1" "chr6_cox_hap2"
## [33] "chr6_dbb_hap3" "chr6_mann_hap4"
## [35] "chr6_mcf_hap5" "chr6_qbl_hap6"
## [37] "chr6_ssto_hap7" "chr7_gl000195_random"
## [39] "chr8_gl000196_random" "chr8_gl000197_random"
## [41] "chr9_gl000198_random" "chr9_gl000199_random"
## [43] "chr9_gl000200_random" "chr9_gl000201_random"
## [45] "chr11_gl000202_random" "chr17_ctg5_hap1"
## [47] "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random"
## [51] "chr18_gl000207_random" "chr19_gl000208_random"
## [53] "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211" "chrUn_gl000212"
## [57] "chrUn_gl000213" "chrUn_gl000214"
## [59] "chrUn_gl000215" "chrUn_gl000216"
## [61] "chrUn_gl000217" "chrUn_gl000218"
## [63] "chrUn_gl000219" "chrUn_gl000220"
## [65] "chrUn_gl000221" "chrUn_gl000222"
## [67] "chrUn_gl000223" "chrUn_gl000224"
## [69] "chrUn_gl000225" "chrUn_gl000226"
## [71] "chrUn_gl000227" "chrUn_gl000228"
## [73] "chrUn_gl000229" "chrUn_gl000230"
## [75] "chrUn_gl000231" "chrUn_gl000232"
## [77] "chrUn_gl000233" "chrUn_gl000234"
## [79] "chrUn_gl000235" "chrUn_gl000236"
## [81] "chrUn_gl000237" "chrUn_gl000238"
## [83] "chrUn_gl000239" "chrUn_gl000240"
## [85] "chrUn_gl000241" "chrUn_gl000242"
## [87] "chrUn_gl000243" "chrUn_gl000244"
## [89] "chrUn_gl000245" "chrUn_gl000246"
## [91] "chrUn_gl000247" "chrUn_gl000248"
## [93] "chrUn_gl000249"
seqlengths(genome)
## chr1 chr2 chr3
## 249250621 243199373 198022430
## chr4 chr5 chr6
## 191154276 180915260 171115067
## chr7 chr8 chr9
## 159138663 146364022 141213431
## chr10 chr11 chr12
## 135534747 135006516 133851895
## chr13 chr14 chr15
## 115169878 107349540 102531392
## chr16 chr17 chr18
## 90354753 81195210 78077248
## chr19 chr20 chr21
## 59128983 63025520 48129895
## chr22 chrX chrY
## 51304566 155270560 59373566
## chrM chr1_gl000191_random chr1_gl000192_random
## 16571 106433 547496
## chr4_ctg9_hap1 chr4_gl000193_random chr4_gl000194_random
## 590426 189789 191469
## chr6_apd_hap1 chr6_cox_hap2 chr6_dbb_hap3
## 4622290 4795371 4610396
## chr6_mann_hap4 chr6_mcf_hap5 chr6_qbl_hap6
## 4683263 4833398 4611984
## chr6_ssto_hap7 chr7_gl000195_random chr8_gl000196_random
## 4928567 182896 38914
## chr8_gl000197_random chr9_gl000198_random chr9_gl000199_random
## 37175 90085 169874
## chr9_gl000200_random chr9_gl000201_random chr11_gl000202_random
## 187035 36148 40103
## chr17_ctg5_hap1 chr17_gl000203_random chr17_gl000204_random
## 1680828 37498 81310
## chr17_gl000205_random chr17_gl000206_random chr18_gl000207_random
## 174588 41001 4262
## chr19_gl000208_random chr19_gl000209_random chr21_gl000210_random
## 92689 159169 27682
## chrUn_gl000211 chrUn_gl000212 chrUn_gl000213
## 166566 186858 164239
## chrUn_gl000214 chrUn_gl000215 chrUn_gl000216
## 137718 172545 172294
## chrUn_gl000217 chrUn_gl000218 chrUn_gl000219
## 172149 161147 179198
## chrUn_gl000220 chrUn_gl000221 chrUn_gl000222
## 161802 155397 186861
## chrUn_gl000223 chrUn_gl000224 chrUn_gl000225
## 180455 179693 211173
## chrUn_gl000226 chrUn_gl000227 chrUn_gl000228
## 15008 128374 129120
## chrUn_gl000229 chrUn_gl000230 chrUn_gl000231
## 19913 43691 27386
## chrUn_gl000232 chrUn_gl000233 chrUn_gl000234
## 40652 45941 40531
## chrUn_gl000235 chrUn_gl000236 chrUn_gl000237
## 34474 41934 45867
## chrUn_gl000238 chrUn_gl000239 chrUn_gl000240
## 39939 33824 41933
## chrUn_gl000241 chrUn_gl000242 chrUn_gl000243
## 42152 43523 43341
## chrUn_gl000244 chrUn_gl000245 chrUn_gl000246
## 39929 36651 38154
## chrUn_gl000247 chrUn_gl000248 chrUn_gl000249
## 36422 39786 38502
It is in April now, so I will take chromosome 4 for example.
chr4 <- genome$chr4
The DNA string is still too large. Let me cut it down.
s <- subseq(chr4, start=20160425, width=999)
I’m curious about What it looks like.
print(s)
## 999-letter "DNAString" instance
## seq: GTGTCTCTCTCTCTCTCTCTCTTTCTCCTGCTCT...CAATCTTCTCTTGCTGATGGTGCACCTGGTTCT
print(as.character(s))
## [1] "GTGTCTCTCTCTCTCTCTCTCTTTCTCCTGCTCTGGTGTGTGAAGACATGTTTACTTCCCTTTCGCCTTCCTCCATGATTGTAAGTTTCCTGAGGCCTCTCAGCCATGCCTTCTATACATCCAGCAGAATTGTTGAGTCTAAACCTCTTTTTTCATAAATTATCCCATCTCAGGTAGTTCTTTATAGCAGTGCAAGAATGGACTAATATATTTACCTAATAAGAAACCCACACAGATACAATATGAGGAATAACTAATAATACACCTAAAAGAATTTAAGACCATCATTTTATGTGGAATTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCCAGGCTGGAGTGCAGTGGCGCAATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCATGCCATTCTCCTGCCTCAGCCTCCCAGAGTAGCTGGGAGTACATGAGCCCGCCACCACGCCCGGCTAATTTTCTTTTCTGTGCTATCCACTCTTTCATATTTTATCTGTACATTTGGTCCAACTTTCTTCTTCTATCATCAGTCTACTGAACACAATCACCAGAACATGTTGTTTACTTGAAATCAGTACATCAAATATGAGCTAATAATAAAAAAGTGATTTCTCAATGGAGTTAAAATATTCTTGAATACTTCAATCACATGAAAAGTTTCTTGATTTACACAACATGACTTATAATAAAGGAATACAAGATAACTTTTATGCAAAAATTATATAAAATTCCATATAAGGCCAGGTGCGGTGGCTAAGTGTGTGTGTATGTGTGTGTGTATGAGAGAGTGAGAGTGTGTTTCTGGAAGAGATTAGCCCTTGAATTGACAGACAGAATAAAGCAGATGGCTGTTCCCATTGTGGCCGGGTATTATCCAATCCACTGAGGATCTGAGTAGAAAAAAAAGGCAGAAGAAGGGTGAATTACCTTTGCCAGACTGTTGAGCTGGCCCATCAATCTTCTCTTGCTGATGGTGCACCTGGTTCT"
Emm.. I can see nothing but four letters. I should ask more specifically. What is the GC-content of my new sequence?
sum(alphabetFrequency(s)[c("C","G")])
## [1] 404
How about a proportion?
letterFrequency(s,"CG",as.prob=TRUE)
## C|G
## 0.4044044
Here is a function I find useful. countPattern
countPattern("CG",s)
## [1] 12
It answers the question: how many times a given pattern occurs in a reference sequence.
I can also locate them
matchPattern("CG",s)
## Views on a 999-letter DNAString subject
## subject: GTGTCTCTCTCTCTCTCTCTCTTTCTCCTGCT...ATCTTCTCTTGCTGATGGTGCACCTGGTTCT
## views:
## start end width
## [1] 64 65 2 [CG]
## [2] 323 324 2 [CG]
## [3] 331 332 2 [CG]
## [4] 361 362 2 [CG]
## [5] 369 370 2 [CG]
## ... ... ... ... ...
## [8] 450 451 2 [CG]
## [9] 458 459 2 [CG]
## [10] 462 463 2 [CG]
## [11] 760 761 2 [CG]
## [12] 877 878 2 [CG]