title: "Annotate and analyze nucleotide and amino acid sequences using python"
author: "Bhagirathi Dash"
date: "December 1, 2018"
output: html_document
import sys
sys.stdout.write("Hello from Python %s\n" % (sys.version,))
Hello from Python 3.5.6 |Anaconda, Inc.| (default, Aug 26 2018, 16:05:27) [MSC v.1900 64 bit (AMD64)]
DNA = "atgcnatgcccnggttaatngcagNNNNNgaTCCCAGCCGGCTGAGGCGGGCAGGGCCGGGCGGGGCCGCGCCACGGAGCCCACAGCCCGGCGCTCCCTGCnnnnNNNNtga"
DNA[:3]
'atg'
DNA.lower()
'atgcnatgcccnggttaatngcagnnnnngatcccagccggctgaggcgggcagggccgggcggggccgcgccacggagcccacagcccggcgctccctgcnnnnnnnntga'
import random
def create_dna(n, alphabet='acgt'):
    """Return a random sequence of ``n`` characters drawn from ``alphabet``.

    Each position is chosen independently with ``random.choice``, so every
    character of ``alphabet`` is equally likely at every position.

    n        -- length of the sequence to generate; 0 yields ''.
    alphabet -- non-empty sequence of characters to draw from.
    """
    # A generator expression feeds join() directly; no intermediate list and
    # no named loop variable are needed.
    return ''.join(random.choice(alphabet) for _ in range(n))
dna = create_dna(1000)
dna
'gggctcctccagcgtctaaaggtgctagtcgcccattactctgggtatggacttagagagcataggttacgagccggagagatcaaatatacgacgcagcgagaaatacgatcgtacggtgggagtcacgacgtacatcctaatatgaaccagtttgttcaccgcaatatatcctttcagcggcgcgagtctccccgctcatttggggatagacttagatgggggtgaaggggcgcaacatgtagactaagaagtgaatgcgtgcttcaaatctgtacggtttcctaactcacactttaaaccgcagaataaaactgtataacctatcgtcctgtaaacgagaattcgccacgaacctagctctaaattgtcttgtggtttgtgatcgttgataggagagacacaaatagattaagatcaatcgaatgtgaccataggaccacctatcactttatccccagcagtaaatctactaagcctgaaaacgttgcaacatattatgtctcccagaacgcttagtaacataaggagaaatggagttaccaacgctgtccgccagatcgggcatcgggttgccccgtagtattgagcgtaagttaaagccagacaagtaaggatctgaactgtaaccgaggccgatagcacagcatcagtagcctctatgccgtgcgcgccgaatgtggcaccgcgacagtggaatttcgctgaagggtcttgcaccaattcgcagcttaaaatcctagaattccattatgcggataaatttaattaggtatcacggagtagagacgttagcaggaacagattatggtcgaatcttacacagcaccgaatgaatgttacttcactcattgcctaaattgacggggccagcaatccgggggcccctagccattgcgggtataccattacaagtgtagagtcaagtaggtcatctcttacggtggcggcgtggcatcccgtgcgactccatcctaacgtcgaacgatgctacctatac'
dna.find('tcc')
4
dna.count('n')
0
dna.count('a')
285
dna.count('t')
239
dna.count('g')
242
dna.count('c')
234
dna.count('gg')
43
# count total number of nucleotides
# len() reports the number of characters directly, so no manual counter
# loop is needed.
count_nts = len(dna)
print(count_nts)
1000
# count the number of unknown nucleotides
# One pass per symbol is enough: the previous loop recomputed the same count
# once for every base in the sequence. Both 'n' and 'N' are counted so the
# result agrees with the undefined-base check later in this file.
unknown_nts = dna.count('n') + dna.count('N')
print('The number of unknown nucleotides are %d.' % unknown_nts)
The number of unknown nucleotides are 0.
#splice donor site position
# str.find returns the index of the first 'gt', or -1 when there is none.
gt_position = dna.find('gt')
if gt_position == -1:
    # Without this guard the message below would report -1 as a position.
    print('No splice donor site found.')
else:
    print('Splice donor site position is at %d.' % gt_position)
Splice donor site position is at 13.
# make a dictionary of nucleotides and manipulate
# Map each of the four bases to the number of times it occurs in dna.
dna_counts = {base: dna.count(base) for base in 'tcga'}
dna_counts.items()
dict_items([('c', 234), ('a', 285), ('g', 242), ('t', 239)])
dna_counts
{'a': 285, 'c': 234, 'g': 242, 't': 239}
dna_counts.keys()
dict_keys(['c', 'a', 'g', 't'])
sorted(dna_counts.keys())
['a', 'c', 'g', 't']
del dna_counts['a']
dna_counts
{'c': 234, 'g': 242, 't': 239}
sorted(dna_counts.values())
[234, 239, 242]
# another approach to build nts dictionary: agnostic about unknown nucleotides
# Tally every symbol that actually occurs in dna, whatever it is.
nts_dict = {}
for nts in dna:
    if nts in nts_dict:
        nts_dict[nts] += 1
    else:
        # First time this symbol is seen.
        nts_dict[nts] = 1
print(nts_dict)
{'c': 234, 'a': 285, 't': 239, 'g': 242}
# get method for dictionaries
# Same tally as above, but dict.get supplies 0 for symbols not seen yet,
# which removes the explicit membership test.
nts_counts = {}
for nts in dna:
    seen_so_far = nts_counts.get(nts, 0)
    nts_counts[nts] = seen_so_far + 1
print(nts_counts)
{'c': 234, 'a': 285, 't': 239, 'g': 242}
#tabulate a dictionary
# Iterating a dict yields its keys; each count is then looked up by key.
for key in nts_counts:
    value = nts_counts[key]
    print(key, value)
c 234
a 285
t 239
g 242
#another approach to tabulate dictionary
# items() yields (key, value) pairs, so no second lookup is needed.
for k, v in nts_counts.items() :
    print(k, v)
c 234
a 285
t 239
g 242
# find whether there is a stop codon in your sequence
def has_stop_codon(dna):
    """Return True if ``dna`` contains a stop codon in reading frame 0.

    The sequence is read as consecutive, non-overlapping triplets starting
    at index 0. Each triplet is lower-cased and compared with the stop
    codons 'tga', 'tag' and 'taa', so the check is case-insensitive.
    Returns False for an empty sequence.
    """
    stop_codons = ('tga', 'tag', 'taa')
    # Stopping at len(dna) - 2 skips a trailing partial codon, which could
    # never equal a three-letter stop codon anyway.
    return any(dna[i:i + 3].lower() in stop_codons
               for i in range(0, len(dna) - 2, 3))
has_stop_codon(dna)
True
# check whether there is a stop codon in a given reading frame
def has_stop_codon(dna, frame=0):
    """Return True if ``dna`` contains a stop codon in the given frame.

    The sequence is read as consecutive, non-overlapping triplets starting
    at index ``frame``. Each triplet is lower-cased and compared with the
    stop codons 'tga', 'tag' and 'taa', so the check is case-insensitive.

    dna   -- sequence to scan.
    frame -- index of the first codon's first base (0, 1 or 2 for the three
             forward reading frames); defaults to 0.

    Raises ValueError if ``frame`` is negative: a negative start would make
    the slices wrap around the end of the sequence and give a meaningless
    answer.
    """
    if frame < 0:
        raise ValueError("frame must be non-negative, got %r" % (frame,))
    stop_codons = ('tga', 'tag', 'taa')
    # Stopping at len(dna) - 2 skips a trailing partial codon, which could
    # never equal a three-letter stop codon anyway.
    return any(dna[i:i + 3].lower() in stop_codons
               for i in range(frame, len(dna) - 2, 3))
#seq="atgcATGCatgctaa"
y=has_stop_codon(dna, 2)
print(y)
print('Done')
True
Done
# are there undefined bases in your sequence
# Report how many undefined bases (either case of N) the sequence contains.
if 'n' not in dna and 'N' not in dna:
    print("Dna sequence has no undefined bases")
else:
    nbases = dna.count('n') + dna.count('N')
    print("dna sequence has %d undefined bases" % nbases)
Dna sequence has no undefined bases
# calculate the gc percent in sequence
def gc(dna):
    """Return the GC content of ``dna`` as a percentage (float).

    Undefined bases ('n' / 'N') are excluded from the denominator, so the
    result is the share of G and C among the defined bases only. Both upper-
    and lower-case letters are counted.

    Returns 0.0 when the sequence has no defined bases (empty or all N),
    where the percentage would otherwise require dividing by zero.
    """
    nbases = dna.count('n') + dna.count('N')
    bases = len(dna) - nbases
    if bases == 0:
        return 0.0
    gccount = sum(dna.count(symbol) for symbol in 'cCgG')
    # Divide first, then scale: same operation order as before, so floating
    # point results are unchanged.
    return (gccount / bases) * 100
print(gc(dna))
print("done")
47.599999999999994
done
# reverse the DNA sequence
x=dna[::-1]
print(x)
catatccatcgtagcaagctgcaatcctacctcagcgtgccctacggtgcggcggtggcattctctactggatgaactgagatgtgaacattaccatatgggcgttaccgatccccgggggcctaacgaccggggcagttaaatccgttactcacttcattgtaagtaagccacgacacattctaagctggtattagacaaggacgattgcagagatgaggcactatggattaatttaaataggcgtattaccttaagatcctaaaattcgacgcttaaccacgttctgggaagtcgctttaaggtgacagcgccacggtgtaagccgcgcgtgccgtatctccgatgactacgacacgatagccggagccaatgtcaagtctaggaatgaacagaccgaaattgaatgcgagttatgatgccccgttgggctacgggctagaccgcctgtcgcaaccattgaggtaaagaggaatacaatgattcgcaagaccctctgtattatacaacgttgcaaaagtccgaatcatctaaatgacgacccctatttcactatccaccaggataccagtgtaagctaactagaattagataaacacagagaggatagttgctagtgtttggtgttctgttaaatctcgatccaagcaccgcttaagagcaaatgtcctgctatccaatatgtcaaaataagacgccaaatttcacactcaatcctttggcatgtctaaacttcgtgcgtaagtgaagaatcagatgtacaacgcggggaagtgggggtagattcagataggggtttactcgcccctctgagcgcggcgactttcctatataacgccacttgtttgaccaagtataatcctacatgcagcactgagggtggcatgctagcataaagagcgacgcagcatataaactagagaggccgagcattggatacgagagattcaggtatgggtctcattacccgctgatcgtggaaatctgcgacctcctcggg
#Make a complement of the DNA sequence
# Lookup table pairing every base with its complement, in both cases;
# the undefined base N maps to itself.
basecomplement = {
    'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
    'a': 't', 't': 'a', 'g': 'c', 'c': 'g',
    'N': 'N', 'n': 'n',
}
# Translate base by base; a symbol missing from the table raises KeyError.
letters = list(map(basecomplement.__getitem__, dna))
joinletters = ''.join(letters)
print(joinletters)
cccgaggaggtcgcagatttccacgatcagcgggtaatgagacccatacctgaatctctcgtatccaatgctcggcctctctagtttatatgctgcgtcgctctttatgctagcatgccaccctcagtgctgcatgtaggattatacttggtcaaacaagtggcgttatataggaaagtcgccgcgctcagaggggcgagtaaacccctatctgaatctacccccacttccccgcgttgtacatctgattcttcacttacgcacgaagtttagacatgccaaaggattgagtgtgaaatttggcgtcttattttgacatattggatagcaggacatttgctcttaagcggtgcttggatcgagatttaacagaacaccaaacactagcaactatcctctctgtgtttatctaattctagttagcttacactggtatcctggtggatagtgaaataggggtcgtcatttagatgattcggacttttgcaacgttgtataatacagagggtcttgcgaatcattgtattcctctttacctcaatggttgcgacaggcggtctagcccgtagcccaacggggcatcataactcgcattcaatttcggtctgttcattcctagacttgacattggctccggctatcgtgtcgtagtcatcggagatacggcacgcgcggcttacaccgtggcgctgtcaccttaaagcgacttcccagaacgtggttaagcgtcgaattttaggatcttaaggtaatacgcctatttaaattaatccatagtgcctcatctctgcaatcgtccttgtctaataccagcttagaatgtgtcgtggcttacttacaatgaagtgagtaacggatttaactgccccggtcgttaggcccccggggatcggtaacgcccatatggtaatgttcacatctcagttcatccagtagagaatgccaccgccgcaccgtagggcacgctgaggtaggattgcagcttgctacgatggatatg
# reverse the DNA seq using a function
def rev_string(seq):
    """Return ``seq`` with its elements in reverse order."""
    # A slice with step -1 walks the sequence from the last item to the first.
    backwards = seq[::-1]
    return backwards
rev_string(dna)
'catatccatcgtagcaagctgcaatcctacctcagcgtgccctacggtgcggcggtggcattctctactggatgaactgagatgtgaacattaccatatgggcgttaccgatccccgggggcctaacgaccggggcagttaaatccgttactcacttcattgtaagtaagccacgacacattctaagctggtattagacaaggacgattgcagagatgaggcactatggattaatttaaataggcgtattaccttaagatcctaaaattcgacgcttaaccacgttctgggaagtcgctttaaggtgacagcgccacggtgtaagccgcgcgtgccgtatctccgatgactacgacacgatagccggagccaatgtcaagtctaggaatgaacagaccgaaattgaatgcgagttatgatgccccgttgggctacgggctagaccgcctgtcgcaaccattgaggtaaagaggaatacaatgattcgcaagaccctctgtattatacaacgttgcaaaagtccgaatcatctaaatgacgacccctatttcactatccaccaggataccagtgtaagctaactagaattagataaacacagagaggatagttgctagtgtttggtgttctgttaaatctcgatccaagcaccgcttaagagcaaatgtcctgctatccaatatgtcaaaataagacgccaaatttcacactcaatcctttggcatgtctaaacttcgtgcgtaagtgaagaatcagatgtacaacgcggggaagtgggggtagattcagataggggtttactcgcccctctgagcgcggcgactttcctatataacgccacttgtttgaccaagtataatcctacatgcagcactgagggtggcatgctagcataaagagcgacgcagcatataaactagagaggccgagcattggatacgagagattcaggtatgggtctcattacccgctgatcgtggaaatctgcgacctcctcggg'
# make the reverse complement of the DNA sequence using a function
def rev_complement(dna_seq):
    """Return the reverse complement of ``dna_seq`` as a string.

    Every base is replaced by its complement (case is preserved and N maps
    to itself), then the result is reversed. A symbol that is not in the
    lookup table raises KeyError.
    """
    base_complement_dict = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
        'a': 't', 't': 'a', 'g': 'c', 'c': 'g',
        'N': 'N', 'n': 'n',
    }
    # Complement in forward order first, then reverse the joined string.
    complemented = ''.join(map(base_complement_dict.__getitem__, dna_seq))
    return complemented[::-1]
rev_complement(dna)
'gtataggtagcatcgttcgacgttaggatggagtcgcacgggatgccacgccgccaccgtaagagatgacctacttgactctacacttgtaatggtatacccgcaatggctaggggcccccggattgctggccccgtcaatttaggcaatgagtgaagtaacattcattcggtgctgtgtaagattcgaccataatctgttcctgctaacgtctctactccgtgatacctaattaaatttatccgcataatggaattctaggattttaagctgcgaattggtgcaagacccttcagcgaaattccactgtcgcggtgccacattcggcgcgcacggcatagaggctactgatgctgtgctatcggcctcggttacagttcagatccttacttgtctggctttaacttacgctcaatactacggggcaacccgatgcccgatctggcggacagcgttggtaactccatttctccttatgttactaagcgttctgggagacataatatgttgcaacgttttcaggcttagtagatttactgctggggataaagtgataggtggtcctatggtcacattcgattgatcttaatctatttgtgtctctcctatcaacgatcacaaaccacaagacaatttagagctaggttcgtggcgaattctcgtttacaggacgataggttatacagttttattctgcggtttaaagtgtgagttaggaaaccgtacagatttgaagcacgcattcacttcttagtctacatgttgcgccccttcacccccatctaagtctatccccaaatgagcggggagactcgcgccgctgaaaggatatattgcggtgaacaaactggttcatattaggatgtacgtcgtgactcccaccgtacgatcgtatttctcgctgcgtcgtatatttgatctctccggctcgtaacctatgctctctaagtccatacccagagtaatgggcgactagcacctttagacgctggaggagccc'
# Print all the steps in making a reverse complement
def myfunction(first, second, third, *therest):
    """Print each intermediate step of reverse-complementing ``dna``.

    NOTE: the parameters are accepted but never read. The sequence that is
    processed is always the module-level variable ``dna``.

    Four lines are printed: the list of bases, the list of complemented
    bases, the complemented bases joined into a string, and that string
    reversed. Returns None.
    """
    basecomplement = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
        'a': 't', 't': 'a', 'g': 'c', 'c': 'g',
        'N': 'N', 'n': 'n',
    }
    bases = list(dna)
    complemented = [basecomplement[symbol] for symbol in bases]
    complement_text = ''.join(complemented)
    reverse_complement_text = complement_text[::-1]
    # Pair each message template with the value it reports, in print order.
    steps = (
        ("first %s", bases),
        ("second %s", complemented),
        ("third %s", complement_text),
        ("And all the rest %s", reverse_complement_text),
    )
    for template, value in steps:
        print(template % value)
    return
print (myfunction(x,y,z,a))
first ['g', 'g', 'g', 'c', 't', 'c', 'c', 't', 'c', 'c', 'a', 'g', 'c', 'g', 't', 'c', 't', 'a', 'a', 'a', 'g', 'g', 't', 'g', 'c', 't', 'a', 'g', 't', 'c', 'g', 'c', 'c', 'c', 'a', 't', 't', 'a', 'c', 't', 'c', 't', 'g', 'g', 'g', 't', 'a', 't', 'g', 'g', 'a', 'c', 't', 't', 'a', 'g', 'a', 'g', 'a', 'g', 'c', 'a', 't', 'a', 'g', 'g', 't', 't', 'a', 'c', 'g', 'a', 'g', 'c', 'c', 'g', 'g', 'a', 'g', 'a', 'g', 'a', 't', 'c', 'a', 'a', 'a', 't', 'a', 't', 'a', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 'g', 'c', 'g', 'a', 'g', 'a', 'a', 'a', 't', 'a', 'c', 'g', 'a', 't', 'c', 'g', 't', 'a', 'c', 'g', 'g', 't', 'g', 'g', 'g', 'a', 'g', 't', 'c', 'a', 'c', 'g', 'a', 'c', 'g', 't', 'a', 'c', 'a', 't', 'c', 'c', 't', 'a', 'a', 't', 'a', 't', 'g', 'a', 'a', 'c', 'c', 'a', 'g', 't', 't', 't', 'g', 't', 't', 'c', 'a', 'c', 'c', 'g', 'c', 'a', 'a', 't', 'a', 't', 'a', 't', 'c', 'c', 't', 't', 't', 'c', 'a', 'g', 'c', 'g', 'g', 'c', 'g', 'c', 'g', 'a', 'g', 't', 'c', 't', 'c', 'c', 'c', 'c', 'g', 'c', 't', 'c', 'a', 't', 't', 't', 'g', 'g', 'g', 'g', 'a', 't', 'a', 'g', 'a', 'c', 't', 't', 'a', 'g', 'a', 't', 'g', 'g', 'g', 'g', 'g', 't', 'g', 'a', 'a', 'g', 'g', 'g', 'g', 'c', 'g', 'c', 'a', 'a', 'c', 'a', 't', 'g', 't', 'a', 'g', 'a', 'c', 't', 'a', 'a', 'g', 'a', 'a', 'g', 't', 'g', 'a', 'a', 't', 'g', 'c', 'g', 't', 'g', 'c', 't', 't', 'c', 'a', 'a', 'a', 't', 'c', 't', 'g', 't', 'a', 'c', 'g', 'g', 't', 't', 't', 'c', 'c', 't', 'a', 'a', 'c', 't', 'c', 'a', 'c', 'a', 'c', 't', 't', 't', 'a', 'a', 'a', 'c', 'c', 'g', 'c', 'a', 'g', 'a', 'a', 't', 'a', 'a', 'a', 'a', 'c', 't', 'g', 't', 'a', 't', 'a', 'a', 'c', 'c', 't', 'a', 't', 'c', 'g', 't', 'c', 'c', 't', 'g', 't', 'a', 'a', 'a', 'c', 'g', 'a', 'g', 'a', 'a', 't', 't', 'c', 'g', 'c', 'c', 'a', 'c', 'g', 'a', 'a', 'c', 'c', 't', 'a', 'g', 'c', 't', 'c', 't', 'a', 'a', 'a', 't', 't', 'g', 't', 'c', 't', 't', 'g', 't', 'g', 'g', 't', 't', 't', 'g', 't', 'g', 'a', 't', 'c', 'g', 't', 't', 'g', 'a', 't', 'a', 'g', 'g', 'a', 'g', 
'a', 'g', 'a', 'c', 'a', 'c', 'a', 'a', 'a', 't', 'a', 'g', 'a', 't', 't', 'a', 'a', 'g', 'a', 't', 'c', 'a', 'a', 't', 'c', 'g', 'a', 'a', 't', 'g', 't', 'g', 'a', 'c', 'c', 'a', 't', 'a', 'g', 'g', 'a', 'c', 'c', 'a', 'c', 'c', 't', 'a', 't', 'c', 'a', 'c', 't', 't', 't', 'a', 't', 'c', 'c', 'c', 'c', 'a', 'g', 'c', 'a', 'g', 't', 'a', 'a', 'a', 't', 'c', 't', 'a', 'c', 't', 'a', 'a', 'g', 'c', 'c', 't', 'g', 'a', 'a', 'a', 'a', 'c', 'g', 't', 't', 'g', 'c', 'a', 'a', 'c', 'a', 't', 'a', 't', 't', 'a', 't', 'g', 't', 'c', 't', 'c', 'c', 'c', 'a', 'g', 'a', 'a', 'c', 'g', 'c', 't', 't', 'a', 'g', 't', 'a', 'a', 'c', 'a', 't', 'a', 'a', 'g', 'g', 'a', 'g', 'a', 'a', 'a', 't', 'g', 'g', 'a', 'g', 't', 't', 'a', 'c', 'c', 'a', 'a', 'c', 'g', 'c', 't', 'g', 't', 'c', 'c', 'g', 'c', 'c', 'a', 'g', 'a', 't', 'c', 'g', 'g', 'g', 'c', 'a', 't', 'c', 'g', 'g', 'g', 't', 't', 'g', 'c', 'c', 'c', 'c', 'g', 't', 'a', 'g', 't', 'a', 't', 't', 'g', 'a', 'g', 'c', 'g', 't', 'a', 'a', 'g', 't', 't', 'a', 'a', 'a', 'g', 'c', 'c', 'a', 'g', 'a', 'c', 'a', 'a', 'g', 't', 'a', 'a', 'g', 'g', 'a', 't', 'c', 't', 'g', 'a', 'a', 'c', 't', 'g', 't', 'a', 'a', 'c', 'c', 'g', 'a', 'g', 'g', 'c', 'c', 'g', 'a', 't', 'a', 'g', 'c', 'a', 'c', 'a', 'g', 'c', 'a', 't', 'c', 'a', 'g', 't', 'a', 'g', 'c', 'c', 't', 'c', 't', 'a', 't', 'g', 'c', 'c', 'g', 't', 'g', 'c', 'g', 'c', 'g', 'c', 'c', 'g', 'a', 'a', 't', 'g', 't', 'g', 'g', 'c', 'a', 'c', 'c', 'g', 'c', 'g', 'a', 'c', 'a', 'g', 't', 'g', 'g', 'a', 'a', 't', 't', 't', 'c', 'g', 'c', 't', 'g', 'a', 'a', 'g', 'g', 'g', 't', 'c', 't', 't', 'g', 'c', 'a', 'c', 'c', 'a', 'a', 't', 't', 'c', 'g', 'c', 'a', 'g', 'c', 't', 't', 'a', 'a', 'a', 'a', 't', 'c', 'c', 't', 'a', 'g', 'a', 'a', 't', 't', 'c', 'c', 'a', 't', 't', 'a', 't', 'g', 'c', 'g', 'g', 'a', 't', 'a', 'a', 'a', 't', 't', 't', 'a', 'a', 't', 't', 'a', 'g', 'g', 't', 'a', 't', 'c', 'a', 'c', 'g', 'g', 'a', 'g', 't', 'a', 'g', 'a', 'g', 'a', 'c', 'g', 't', 't', 'a', 'g', 'c', 'a', 'g', 
'g', 'a', 'a', 'c', 'a', 'g', 'a', 't', 't', 'a', 't', 'g', 'g', 't', 'c', 'g', 'a', 'a', 't', 'c', 't', 't', 'a', 'c', 'a', 'c', 'a', 'g', 'c', 'a', 'c', 'c', 'g', 'a', 'a', 't', 'g', 'a', 'a', 't', 'g', 't', 't', 'a', 'c', 't', 't', 'c', 'a', 'c', 't', 'c', 'a', 't', 't', 'g', 'c', 'c', 't', 'a', 'a', 'a', 't', 't', 'g', 'a', 'c', 'g', 'g', 'g', 'g', 'c', 'c', 'a', 'g', 'c', 'a', 'a', 't', 'c', 'c', 'g', 'g', 'g', 'g', 'g', 'c', 'c', 'c', 'c', 't', 'a', 'g', 'c', 'c', 'a', 't', 't', 'g', 'c', 'g', 'g', 'g', 't', 'a', 't', 'a', 'c', 'c', 'a', 't', 't', 'a', 'c', 'a', 'a', 'g', 't', 'g', 't', 'a', 'g', 'a', 'g', 't', 'c', 'a', 'a', 'g', 't', 'a', 'g', 'g', 't', 'c', 'a', 't', 'c', 't', 'c', 't', 't', 'a', 'c', 'g', 'g', 't', 'g', 'g', 'c', 'g', 'g', 'c', 'g', 't', 'g', 'g', 'c', 'a', 't', 'c', 'c', 'c', 'g', 't', 'g', 'c', 'g', 'a', 'c', 't', 'c', 'c', 'a', 't', 'c', 'c', 't', 'a', 'a', 'c', 'g', 't', 'c', 'g', 'a', 'a', 'c', 'g', 'a', 't', 'g', 'c', 't', 'a', 'c', 'c', 't', 'a', 't', 'a', 'c']
second ['c', 'c', 'c', 'g', 'a', 'g', 'g', 'a', 'g', 'g', 't', 'c', 'g', 'c', 'a', 'g', 'a', 't', 't', 't', 'c', 'c', 'a', 'c', 'g', 'a', 't', 'c', 'a', 'g', 'c', 'g', 'g', 'g', 't', 'a', 'a', 't', 'g', 'a', 'g', 'a', 'c', 'c', 'c', 'a', 't', 'a', 'c', 'c', 't', 'g', 'a', 'a', 't', 'c', 't', 'c', 't', 'c', 'g', 't', 'a', 't', 'c', 'c', 'a', 'a', 't', 'g', 'c', 't', 'c', 'g', 'g', 'c', 'c', 't', 'c', 't', 'c', 't', 'a', 'g', 't', 't', 't', 'a', 't', 'a', 't', 'g', 'c', 't', 'g', 'c', 'g', 't', 'c', 'g', 'c', 't', 'c', 't', 't', 't', 'a', 't', 'g', 'c', 't', 'a', 'g', 'c', 'a', 't', 'g', 'c', 'c', 'a', 'c', 'c', 'c', 't', 'c', 'a', 'g', 't', 'g', 'c', 't', 'g', 'c', 'a', 't', 'g', 't', 'a', 'g', 'g', 'a', 't', 't', 'a', 't', 'a', 'c', 't', 't', 'g', 'g', 't', 'c', 'a', 'a', 'a', 'c', 'a', 'a', 'g', 't', 'g', 'g', 'c', 'g', 't', 't', 'a', 't', 'a', 't', 'a', 'g', 'g', 'a', 'a', 'a', 'g', 't', 'c', 'g', 'c', 'c', 'g', 'c', 'g', 'c', 't', 'c', 'a', 'g', 'a', 'g', 'g', 'g', 'g', 'c', 'g', 'a', 'g', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 't', 'a', 't', 'c', 't', 'g', 'a', 'a', 't', 'c', 't', 'a', 'c', 'c', 'c', 'c', 'c', 'a', 'c', 't', 't', 'c', 'c', 'c', 'c', 'g', 'c', 'g', 't', 't', 'g', 't', 'a', 'c', 'a', 't', 'c', 't', 'g', 'a', 't', 't', 'c', 't', 't', 'c', 'a', 'c', 't', 't', 'a', 'c', 'g', 'c', 'a', 'c', 'g', 'a', 'a', 'g', 't', 't', 't', 'a', 'g', 'a', 'c', 'a', 't', 'g', 'c', 'c', 'a', 'a', 'a', 'g', 'g', 'a', 't', 't', 'g', 'a', 'g', 't', 'g', 't', 'g', 'a', 'a', 'a', 't', 't', 't', 'g', 'g', 'c', 'g', 't', 'c', 't', 't', 'a', 't', 't', 't', 't', 'g', 'a', 'c', 'a', 't', 'a', 't', 't', 'g', 'g', 'a', 't', 'a', 'g', 'c', 'a', 'g', 'g', 'a', 'c', 'a', 't', 't', 't', 'g', 'c', 't', 'c', 't', 't', 'a', 'a', 'g', 'c', 'g', 'g', 't', 'g', 'c', 't', 't', 'g', 'g', 'a', 't', 'c', 'g', 'a', 'g', 'a', 't', 't', 't', 'a', 'a', 'c', 'a', 'g', 'a', 'a', 'c', 'a', 'c', 'c', 'a', 'a', 'a', 'c', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 'c', 't', 'a', 't', 'c', 'c', 't', 'c', 
't', 'c', 't', 'g', 't', 'g', 't', 't', 't', 'a', 't', 'c', 't', 'a', 'a', 't', 't', 'c', 't', 'a', 'g', 't', 't', 'a', 'g', 'c', 't', 't', 'a', 'c', 'a', 'c', 't', 'g', 'g', 't', 'a', 't', 'c', 'c', 't', 'g', 'g', 't', 'g', 'g', 'a', 't', 'a', 'g', 't', 'g', 'a', 'a', 'a', 't', 'a', 'g', 'g', 'g', 'g', 't', 'c', 'g', 't', 'c', 'a', 't', 't', 't', 'a', 'g', 'a', 't', 'g', 'a', 't', 't', 'c', 'g', 'g', 'a', 'c', 't', 't', 't', 't', 'g', 'c', 'a', 'a', 'c', 'g', 't', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'a', 'c', 'a', 'g', 'a', 'g', 'g', 'g', 't', 'c', 't', 't', 'g', 'c', 'g', 'a', 'a', 't', 'c', 'a', 't', 't', 'g', 't', 'a', 't', 't', 'c', 'c', 't', 'c', 't', 't', 't', 'a', 'c', 'c', 't', 'c', 'a', 'a', 't', 'g', 'g', 't', 't', 'g', 'c', 'g', 'a', 'c', 'a', 'g', 'g', 'c', 'g', 'g', 't', 'c', 't', 'a', 'g', 'c', 'c', 'c', 'g', 't', 'a', 'g', 'c', 'c', 'c', 'a', 'a', 'c', 'g', 'g', 'g', 'g', 'c', 'a', 't', 'c', 'a', 't', 'a', 'a', 'c', 't', 'c', 'g', 'c', 'a', 't', 't', 'c', 'a', 'a', 't', 't', 't', 'c', 'g', 'g', 't', 'c', 't', 'g', 't', 't', 'c', 'a', 't', 't', 'c', 'c', 't', 'a', 'g', 'a', 'c', 't', 't', 'g', 'a', 'c', 'a', 't', 't', 'g', 'g', 'c', 't', 'c', 'c', 'g', 'g', 'c', 't', 'a', 't', 'c', 'g', 't', 'g', 't', 'c', 'g', 't', 'a', 'g', 't', 'c', 'a', 't', 'c', 'g', 'g', 'a', 'g', 'a', 't', 'a', 'c', 'g', 'g', 'c', 'a', 'c', 'g', 'c', 'g', 'c', 'g', 'g', 'c', 't', 't', 'a', 'c', 'a', 'c', 'c', 'g', 't', 'g', 'g', 'c', 'g', 'c', 't', 'g', 't', 'c', 'a', 'c', 'c', 't', 't', 'a', 'a', 'a', 'g', 'c', 'g', 'a', 'c', 't', 't', 'c', 'c', 'c', 'a', 'g', 'a', 'a', 'c', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 'c', 'g', 't', 'c', 'g', 'a', 'a', 't', 't', 't', 't', 'a', 'g', 'g', 'a', 't', 'c', 't', 't', 'a', 'a', 'g', 'g', 't', 'a', 'a', 't', 'a', 'c', 'g', 'c', 'c', 't', 'a', 't', 't', 't', 'a', 'a', 'a', 't', 't', 'a', 'a', 't', 'c', 'c', 'a', 't', 'a', 'g', 't', 'g', 'c', 'c', 't', 'c', 'a', 't', 'c', 't', 'c', 't', 'g', 'c', 'a', 'a', 't', 'c', 'g', 't', 'c', 
'c', 't', 't', 'g', 't', 'c', 't', 'a', 'a', 't', 'a', 'c', 'c', 'a', 'g', 'c', 't', 't', 'a', 'g', 'a', 'a', 't', 'g', 't', 'g', 't', 'c', 'g', 't', 'g', 'g', 'c', 't', 't', 'a', 'c', 't', 't', 'a', 'c', 'a', 'a', 't', 'g', 'a', 'a', 'g', 't', 'g', 'a', 'g', 't', 'a', 'a', 'c', 'g', 'g', 'a', 't', 't', 't', 'a', 'a', 'c', 't', 'g', 'c', 'c', 'c', 'c', 'g', 'g', 't', 'c', 'g', 't', 't', 'a', 'g', 'g', 'c', 'c', 'c', 'c', 'c', 'g', 'g', 'g', 'g', 'a', 't', 'c', 'g', 'g', 't', 'a', 'a', 'c', 'g', 'c', 'c', 'c', 'a', 't', 'a', 't', 'g', 'g', 't', 'a', 'a', 't', 'g', 't', 't', 'c', 'a', 'c', 'a', 't', 'c', 't', 'c', 'a', 'g', 't', 't', 'c', 'a', 't', 'c', 'c', 'a', 'g', 't', 'a', 'g', 'a', 'g', 'a', 'a', 't', 'g', 'c', 'c', 'a', 'c', 'c', 'g', 'c', 'c', 'g', 'c', 'a', 'c', 'c', 'g', 't', 'a', 'g', 'g', 'g', 'c', 'a', 'c', 'g', 'c', 't', 'g', 'a', 'g', 'g', 't', 'a', 'g', 'g', 'a', 't', 't', 'g', 'c', 'a', 'g', 'c', 't', 't', 'g', 'c', 't', 'a', 'c', 'g', 'a', 't', 'g', 'g', 'a', 't', 'a', 't', 'g']
third cccgaggaggtcgcagatttccacgatcagcgggtaatgagacccatacctgaatctctcgtatccaatgctcggcctctctagtttatatgctgcgtcgctctttatgctagcatgccaccctcagtgctgcatgtaggattatacttggtcaaacaagtggcgttatataggaaagtcgccgcgctcagaggggcgagtaaacccctatctgaatctacccccacttccccgcgttgtacatctgattcttcacttacgcacgaagtttagacatgccaaaggattgagtgtgaaatttggcgtcttattttgacatattggatagcaggacatttgctcttaagcggtgcttggatcgagatttaacagaacaccaaacactagcaactatcctctctgtgtttatctaattctagttagcttacactggtatcctggtggatagtgaaataggggtcgtcatttagatgattcggacttttgcaacgttgtataatacagagggtcttgcgaatcattgtattcctctttacctcaatggttgcgacaggcggtctagcccgtagcccaacggggcatcataactcgcattcaatttcggtctgttcattcctagacttgacattggctccggctatcgtgtcgtagtcatcggagatacggcacgcgcggcttacaccgtggcgctgtcaccttaaagcgacttcccagaacgtggttaagcgtcgaattttaggatcttaaggtaatacgcctatttaaattaatccatagtgcctcatctctgcaatcgtccttgtctaataccagcttagaatgtgtcgtggcttacttacaatgaagtgagtaacggatttaactgccccggtcgttaggcccccggggatcggtaacgcccatatggtaatgttcacatctcagttcatccagtagagaatgccaccgccgcaccgtagggcacgctgaggtaggattgcagcttgctacgatggatatg
And all the rest gtataggtagcatcgttcgacgttaggatggagtcgcacgggatgccacgccgccaccgtaagagatgacctacttgactctacacttgtaatggtatacccgcaatggctaggggcccccggattgctggccccgtcaatttaggcaatgagtgaagtaacattcattcggtgctgtgtaagattcgaccataatctgttcctgctaacgtctctactccgtgatacctaattaaatttatccgcataatggaattctaggattttaagctgcgaattggtgcaagacccttcagcgaaattccactgtcgcggtgccacattcggcgcgcacggcatagaggctactgatgctgtgctatcggcctcggttacagttcagatccttacttgtctggctttaacttacgctcaatactacggggcaacccgatgcccgatctggcggacagcgttggtaactccatttctccttatgttactaagcgttctgggagacataatatgttgcaacgttttcaggcttagtagatttactgctggggataaagtgataggtggtcctatggtcacattcgattgatcttaatctatttgtgtctctcctatcaacgatcacaaaccacaagacaatttagagctaggttcgtggcgaattctcgtttacaggacgataggttatacagttttattctgcggtttaaagtgtgagttaggaaaccgtacagatttgaagcacgcattcacttcttagtctacatgttgcgccccttcacccccatctaagtctatccccaaatgagcggggagactcgcgccgctgaaaggatatattgcggtgaacaaactggttcatattaggatgtacgtcgtgactcccaccgtacgatcgtatttctcgctgcgtcgtatatttgatctctccggctcgtaacctatgctctctaagtccatacccagagtaatgggcgactagcacctttagacgctggaggagccc
None
def count(dna, base):
    """Return how many times ``base`` occurs in ``dna``.

    Delegates to ``dna.count``; for a str this counts non-overlapping
    occurrences of the substring ``base``.
    """
    occurrences = dna.count(base)
    return occurrences
count(dna, "atgc")
4
def count(dna, base):
    """Return the number of occurrences of ``base`` in ``dna``.

    Same definition as the earlier ``count`` in this file, repeated here.
    For a str, ``dna.count`` counts non-overlapping occurrences.
    """
    hits = dna.count(base)
    return hits
count(dna, "atg")
15
def count(dna, base):
    """Return the number of occurrences of ``base`` in ``dna``.

    Same definition as the earlier ``count`` in this file, repeated here.
    For a str, ``dna.count`` counts non-overlapping occurrences.
    """
    total = dna.count(base)
    return total
count(dna, "at")
75
# Read the whole FASTA file as one string. The with-block closes the file
# handle as soon as the read finishes, even if it raises.
with open('hv_nt_seq.fasta') as hv_fasta:
    hv_text = hv_fasta.read()
hv_text
'>DQ286061.1 Hydra vulgaris mitochondrial succinate dehydrogenase flavoprotein subunit mRNA, partial cds; nuclear gene for mitochondrial product\nTGTTGTGCTGTTGCTGATCGAACTGGTCATTCACTACTTCATACTCTTTATGGACAGTCATTGCGATACG\nATTGTAACTACTTTATAGAATACTTTGCATTGGATTTGTTGATGGATAAAGGAAAATGTGTTGGGATAAT\nTGCATTAAATCTTGAAGATGGATCTTTGCATAGGATTAAAGCAAAAAATACCGTCCTTGCAACCGGTGGT\nTCTGGAAGAACGTATTTCTCATGTACTTCAGCCCATACATGCACAGGAGATGGCACTGCTATGGTTACAA\nGAGCTGGTCTTGCAAATGAAGATTTAGAGTTCATTCAGTTTCATCCTACTGGTATTTATGGAGCTGGTTG\nTCTCATCACAGAAGGTTGTAGAGGAGAAGGAGGCTACTTGATTAATAGCGAGGGTGAACGCTTTATGGAA\nAGATATGCTCCTACTGCAAAGGATCTTGCCTCAAGAGATGTTGTTTCTCGATCGATGACAATTGAGATGA\nGAGAAGGGCGTGGATGTGGACCTGAAAAAGATCATGTATATTTACAATTGTCTCATCTTCCCCAAGAGAT\nACTTAAATCTCGTCTTCCTGGAATTTCTGAGACAGCCATGATATTTGCTGGTGTTGATGTAACTCGTGAT\nCCTATACCTGTTCTTCCAACTTGCCATTACAATATGGGTGGAATACCAACCAACTTTAACGGACAGGTAA\nTACAACATCATAATGGTAAAGATGTTATTGTGGAAGGTTTGTATGCTGCAGGTGAAGCTGCTTGTGCTTC\nAGTTCATGGTGCTAACCGTCTGGGAGCTAATTCTTTGCTCGATTTAGTCATATTTGGTCGTGCTTGTGCC\nTTAGATATTGCTGCCAAAAATAAGCCTGGAGACAGCATTCCTGATTTACCCAGTGATATTGGTGAAGTAT\nCTGTGGCCAATCTTGATAAGGTTCGGTTTGCCAATGGACACACACCAACTGCAAATTTGAGATTAAAGAT\nGCAAAAGATTATGCAAGGACATGCAGCTGTATTCAGGACTGGTGCTGTCTTGGCAGAAGGAGTGTCAAAG\nATTTATCAGGCTTATGATGAGTTGAAGGATCTTAAGCTTTATGACCGTGGTATGATATTGAATACAGATC\nTTGTTGAAGCTCTGGAACTTCAAAATTTAATGTTAAACTCCTGTCTAGCAATGGTGTCTGCAGAAGCCAG\nAAAAGAAAGTCGTGGTGCGCTT\n\n>DQ286057.1 Hydra vulgaris myosin heavy chain mRNA, partial 
cds\nTGACAACTTTCTGCTGTTGGGCGCCGGAAAGGTAGTGCTTTTCAAACTGTATCATTTCGTCACAAGGAAC\nAACTAAAAAATTTGCTTACTACTCTTGGAATGACTAGTCCTCATTTCGTAAGATGTATCATTCCTAATGA\nAAAGAAGGAACCAGGAGTTGTTGAGGGCCAACTTGTTCTTCATCAGTTGAGGTGTAATGGTGTCTTGGAA\nGGTATTCGCATATGTAGAAAAGGTTTTCCATCCAGAATGAATTTTCAAGATTTTAAGTTAAGGTACCAAA\nTACTAGCATCTAATGCGATCCCACCTGGTTTTATTGATGGCAAAGTAGCAGCTGAAAAATTAATTGAGGC\nACTTCAACTAGATCAAAGTGAATACAGAGTAGGAAAGACAAAAATATTCTTTAGAGCTGGTATTGTGGGA\nGAGTTAGAAGAAATGCGCGATGAGCGATTATCTAAAATTATTTCACAGTTCCAAGCATACTGTAAGGGCA\nGTATTATGCGCAGTGAATATAAAAAGATGGTGGCACAGCGTATTGGTCTAGCTGTTATTCAAAGAAATGT\nCAGAAAGTATTTATTCTTGCGCCATTGGTCTTGGTGGAAGTTGTACACTAAGGTTCAACCTTTATTGAGT\nGTTGCACGAGCAGAGGATGAAATGAGAGCAAAAGAAGAAGAGTTAGAAGCTGCTAAAGAACAATTAAAAA\nAAGATGCAGAAGCTAAGAAAAAAATGGAAGAAGAACTGACTGAGGCTATGGCTCAAAAAGAAAAACTTTA\nTGCAAGTTTACAAGCTGAGACTGACAGATTAATTACAATTGAAGACAAGCTTCTCAATCTGCAAACAGTT\nAAGGATAAACTTGAAAGTAGTCTAAATGAAGCATTAGAAAAGCTGGATGGAGAAGAACATAGTGTTTTAG\nTTCTTGAAGAAAAGATTCAAGAAGCAGAAGAAAAAATTGACGAACTTACTGAAAAGACTGAGGAACTCCA\nATCAAACATTAGTCGACTTGAAACTGAAAAACAAAATCGTGATAAACAAATTGATACCTTGAATGAAGAT\nATTCGCAAGCAAGATGAAACTATCTCTAAAATGAATGCAGAAAAGAAGCATGTAGATGAGGAGTTGAAAG\nATCGCACTGAACAACTACAGGCTGCTGAGGATAAATGCAACAACCTCAATAAAACAAAGAATAAATTAGA\nATCTTCTATTAGAGAGATTGAACAAGATTTAAAGAAAGAAAAAGACAGTAAAATGAAGTTAGAAAAAGAA\nAAAAAGAAAGTTGAGTCAGATCTTAAAGACAATCGAGATAAACTTTCAGAAACAGAAACTCGTCTAAAAG\nAAACTCAGGATCTTGTAACTAAACGAGAAAAGTCAATATCCGATTTAGAAAATGCAAAAGAAGGTCTTGA\nATCACAGATTAGTCAACTCCAAAGAAAAATACAAGAACTTCTTGCTAAAATTGAAGAATTAGAAGAAGAG\nCTTGAAAACGAAAGAAAGTTGAGGCAGAAATCAGAGCTACAAAGAAAAGAGTTAGAGTCAAGAATTGAGG\nAATTGCAAGACCAACTTGAAACAGCAGGCGGTGCTACATCAGCTCAAGTTGAAGTTGGTAAAAAACGTGA\nAGCTGAATGTAATCGCCTTAGAAAAGAGATTGAAGCCCTTAACATAGCAAATGATGCTGCCATCTCAGCT\nATTAAAGCAAAAACAAATGCTACAATAGCAGAAATTCAAGAGGAAAATGAAGCAATGAAAAAAGCAAAAG\nCAAAACTTGAGAAAGAAAAAAGTGCACTTAATAATGAATTAAATGAAACTAAAAACTCGCTTGATCAAAT\nTAAGAAGCAAAAAACTAATAGTGACAAGAACTCCCGTATGCTTGAAGAACAAATCAATGAACTAAACAGC\nAAGTTGGCTCAAGTTGATGAATTACATTCTCAAAGTGAGTCAAAGAATTCT
AAAGTTAACAGTGAGTTGT\nTGGCTCTTAACAGTCAATTGAGCGAATCAGAACATAATTTGGGAATAGCTACTAAAAATATAAAAACTTT\nAGAAAGTCAACTTGCAGAAAGTAAAAATTTTAATGAAGCTGAATCAAAGGCTAAACTTGAGAATTACAAC\nAGCTCGAATGCTTTT\n\n>DQ286058.1 Hydra vulgaris chitinase mRNA, partial cds\nATTGGTCGTTATTGCGGTCAAGGACGTTACCCATTAATGTCTTCAGTGGGTAAACTTCTTGGTGGATACG\nTTCCTCCCGTTGAGCCTACGTTCTCCCCAACCACAAAAGGACCATCAACGCCTAGCAAAAGTAGTACTGC\nCACTGATCGTCCTGCAACAAACCCCCCAACTGGAGCATGTAAGGCAATCGATGCAAGAGTAAAAGATCAA\nTGGTGTAATGATAACTGTCCCAAAGGATATTGCCCTACTGAGTTTTGTAAATGTTAAATAAATAAAAAAA\nGTGTTAACTTATTGCTCTTAAAAAAAAAAAAAAAAAAA\n\n>DQ286055.1 Hydra vulgaris putative solute carrier family 30 mRNA, partial cds\nGAAGTTTAGAATTCTTCCAGACTACTTATTAACAGATTATACTGACAGTAGATGAAAGAAGTTAGTCGGT\nTTACAAATAGTCAAGGAATTAAGATGAAATGTAAGTTTGGCCGAAATGCTACCTTTATTTTAATGTTGGT\nATAACAATGTCATTTTTTATTGTGGAGCTTGTTGTTGGTTATATGACTAAATCAATGGCATTGGTTGCTG\nACTCCTTTCAGATGTTATCGGATACAGTCTCTATTATTGTTGGCTTTGTTGCTTTTCACTGTTCGAAGCG\nTAGTGAAACCTCCAGCCGATTTACATATGGCTGGGTTCGTGCTGAAATACTTGGAGCTTTAGTTAATTCA\nGTATTTCTTGCTGCTCTTTGTTTTACAATTCTCATAGAATCATTTAAGCGGTCTGCTATTCCAGAAAGAG\nTTGAAAATCCTAAACTTGTTCTTATAGTCGGAGCAGTTGGTTTGCTTGTTAATATAATTGGGTTGTTTCT\nTTTTAATCACCACAGTAATGGCCATTCAAATAATAGTGAATCTGTTGAAAAAGGACATAATAATGAAGTT\nGTAGACAATATTGTTGCTGAATTCCCATTAGTTGATAGTAGTGAAGTGGTTATTTATGATAGTGATAAAA\nGCAATTCCCAAGTACCTCAAGTTGTAAGTAATAATGAGAATAGTAAAAAAAAATTAGGAGCATCTCGTCT\nAAATATTCGTGGAGTTTATTTGAATATTCTTGGAGATGCTTTAGGGTCAG\n\n>DQ286054.1 Hydra vulgaris putative solute carrier family 39 member 1 mRNA, partial 
cds\nTTCTTCAATATTTTAATCAACACAGAAAAAAAATATTTTGACTGTAAAAAAGAAGAATCCTTTTTTAGCT\nATGGGTTTTTATGAATTACCAGAATGGTCAGTTAAACTAATTATAATTATTATTTTATTTTTATTGGGCA\nTGATATTTGGTGTTGTACCATTAAATCTTTCACGAAGCTCCTCATTTGAAGGAAGAGTTTCTCCTACTCG\nCAATCTGCTTATTAGTTTGTCAAATTGCTTTGCTGGAGGGGTGTTTTTTAGCACTGTTATTCTTGATTTA\nTTTCCATTGGTAAAGTTAACAGTAAATAATGCACTAATATCTGTTTATATTGATACTGATTTTCCGCTAG\nGGGATTTTATTATTGGTATTGGATTTATCTTTATGTTAATTTTAGAGCATATAGTTCATTCTTGTTGCCA\nTCCTAATCAGTTATCTTATGAAGCTCCTAAAAATGTTAATAGTAACCAGGATGAATTATCATGTAATGAA\nAATAATCATCTTTTATCTCATGACAACAACTTTGATGTAGTTACTGATATTGAGATAAACACTTCAGAAA\nGACAGCTGCAACAA\n\n>DQ286051.1 Hydra vulgaris ribosomal protein S19 mRNA, partial cds\nCAGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAA\nGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGT\nTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCG\nTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATT\nGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCT\nTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATAT\nTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA\n\n>DQ286050.1 Hydra vulgaris ribosomal protein S9 mRNA, partial cds\nCATGCAATGGTCTTGTTCGTATTGGAGTACTTGATGAAGGAAGAATGAAGCTAGATTATGTTTTAGGTTT\nAAAAGTAGAAGATTTTTTGGAAAGACGTCTACAAACTCAAGTGCTTAAGTTGGGTCTCGCTAAGTCTATT\nCACCATGCTCGTGTTCTTATCCGACAAAAGCATATTAGAGTGCGAAAGCAGTTAGTCAACATCCCATCAT\nTTATCGTGAGACTTGACTCTCAAAAGCACATAGATTTCAGTACTAATTCACCATTCGGTGGTGGTCGACC\nAGGACGTGTTTCACGAAAGAACATGAAGAAAGGTGGCAGTGGAGGAAACGATGAAGAAGACGAAGATGAA\nTAGATTATATTGAAATCTGGCATGTGATTGTTTTGTTAGGCGGTTAATAAAGATCATTTGTCAAATAAAT\nCTAAATACTGTACAATAAAAAAAAAAAAAAAAAAA\n\n>DQ286049.1 Hydra vulgaris ribosomal protein S7 mRNA, partial 
cds\nACATATGGCTGGGTTCGTGAGCTAGAGAAAAAGTTTTCTGGAAAGCATGTCATTGTTGTTGGACAGAGAA\nGAATCTTGCCTAAACCCAGTCGTAAGACAAGAAATCAAAAGCAAATGAGACCAAGAAGTCGTACTCTAAC\nTGCTGTGCACGATGCCATTCTTGAAGATCTTTGTTTCCCATCGGAAATCGTTGGTAAAAGCATTCGAGTT\nAAATTAGATGGTTCAAGATTGATAAAAATAGTTTTAGAAAAAGCTCAGCAAACAAATGTTGAACATAAAC\nTTGACACGTTTGCAAATGTTTACAAGAAACTAACTGGTAAAGACACTCATTTTACTTTCGAAATATAAGT\nCATAACAGAGAAAAAAAAAAAAAAAAAA\n\n>NM_001309772.1 Hydra vulgaris phospholipid hydroperoxide glutathione peroxidase, mitochondrial-like (LOC100215761), mRNA\nATGGCTGCATCAGACCCTACAAAAGCTTCTTCTATATTTGAATTTCAAGCAAAAAGTATAGATGGTGAAG\nATATCAGTCTTTCGAAATATAAAGGTTTTGTTACACTTATTGTTAACGTGGCTAGCAAGGGTTTAACTGA\nACTCAACTATGCTCAGCTTGCTGATCTGCACACCAAGTATGCTGAGAAAGGTCTTCGAATTCTTGCTTTT\nCCTTGTAATCAGTTTGGTAACCAAGAGCCTGGTACAGATTTAGAAATAAAAGCGTTTGCATTAGCGCGAG\nGCGCCCACTATGACTTATTCAGTAAAATTGATGTTAATGGAGATAAGGCAGATCCTCTGTATAAATATTT\nGAAATCAAAGCAGAAAGGTATTTTGGGTAATAAAATCAAATGGAATTTTTCAAAGTTTATTTGTGATAAA\nAACGGTATCCCTGTTAAAAGATATGCTCCTACAACAGAACCTTTGTCATTAGTTCCAGATATCGAAAAGT\nATTTATGCCAATAA\n\n>NM_001309736.1 Hydra vulgaris superoxide dismutase [Mn], mitochondrial-like (LOC100209764), mRNA\nATGTTTTCTTTTGGAATCCACCGCCTTTCAGTTTTTCGAAAAATATCGAGAATAGCATTTGCTAATAAGC\nACACTCTTCCAGAATTGGGGTATGAATATAATGCATTGGAACCAACAATCAGCAGTCAAATTATGGAGAT\nACATCATCGCAAACACCACCAAGCTTATGTAAATAACTTAAATACAGCAGAAGAACAGTTAGCTGAAGCT\nCAGCATAAAGGAGATACGTCAAAGATTATTTCTTTAGCTCCTGCGTTAAAATTCAATGGAGGTGGGCACA\nTCAATCATTCCATTTTTTGGACTAATCTTTCGCCAAACGGTGGAGGAAAACCAACAGGTGAACTATTAGA\nAGCCATATTAAAAGACTTTGGGTCTTTTGAGGCAATGAAAACACGGTTATCGTCTCCAGCTGTTGCAGTG\nCAAGGTTCGGGTTGGGGTTGGTTGGGATACGATTCTGTCACTAAAAGACTTGCAATTACAGCTTTACCTA\nATCAAGATCCTTTGCAAGCTACTACTGGGTTAATACCGTTACTCGGTATTGATGTTTGGGAGCATGCGTA\nCTACTTGCAGTATAAGAATGTTCGTCTTGATTATGTCAACGCAATATTTAACATCATTGATTGGAAAAAT\nGTATCCGCAAGGTTTGTCGCAGCTAAATAA\n\n'
# Parse the FASTA file into hv_seqs: record identifier -> full sequence.
# The identifier is the first whitespace-delimited word of the header line
# with the leading '>' removed.
hv_chunks = {}
name = None
with open('hv_nt_seq.fasta') as hv_fasta:  # closed even if parsing fails
    for lines in hv_fasta:
        lines = lines.rstrip()
        if lines.startswith('>'):
            words = lines.split()
            name = words[0][1:]
            # A repeated identifier starts over, as a fresh empty record.
            hv_chunks[name] = []
        elif lines:
            if name is None:
                raise ValueError(
                    "sequence data found before the first FASTA header")
            hv_chunks[name].append(lines)
# Join each record once instead of growing a string line by line.
hv_seqs = {}
for name, chunks in hv_chunks.items():
    hv_seqs[name] = ''.join(chunks)
# Use a separate loop variable for the sequence: reusing the name hv_seqs
# would rebind it to a string and lose the dictionary after the loop.
for name, sequence in hv_seqs.items():
    print(name, sequence)
DQ286061.1 TGTTGTGCTGTTGCTGATCGAACTGGTCATTCACTACTTCATACTCTTTATGGACAGTCATTGCGATACGATTGTAACTACTTTATAGAATACTTTGCATTGGATTTGTTGATGGATAAAGGAAAATGTGTTGGGATAATTGCATTAAATCTTGAAGATGGATCTTTGCATAGGATTAAAGCAAAAAATACCGTCCTTGCAACCGGTGGTTCTGGAAGAACGTATTTCTCATGTACTTCAGCCCATACATGCACAGGAGATGGCACTGCTATGGTTACAAGAGCTGGTCTTGCAAATGAAGATTTAGAGTTCATTCAGTTTCATCCTACTGGTATTTATGGAGCTGGTTGTCTCATCACAGAAGGTTGTAGAGGAGAAGGAGGCTACTTGATTAATAGCGAGGGTGAACGCTTTATGGAAAGATATGCTCCTACTGCAAAGGATCTTGCCTCAAGAGATGTTGTTTCTCGATCGATGACAATTGAGATGAGAGAAGGGCGTGGATGTGGACCTGAAAAAGATCATGTATATTTACAATTGTCTCATCTTCCCCAAGAGATACTTAAATCTCGTCTTCCTGGAATTTCTGAGACAGCCATGATATTTGCTGGTGTTGATGTAACTCGTGATCCTATACCTGTTCTTCCAACTTGCCATTACAATATGGGTGGAATACCAACCAACTTTAACGGACAGGTAATACAACATCATAATGGTAAAGATGTTATTGTGGAAGGTTTGTATGCTGCAGGTGAAGCTGCTTGTGCTTCAGTTCATGGTGCTAACCGTCTGGGAGCTAATTCTTTGCTCGATTTAGTCATATTTGGTCGTGCTTGTGCCTTAGATATTGCTGCCAAAAATAAGCCTGGAGACAGCATTCCTGATTTACCCAGTGATATTGGTGAAGTATCTGTGGCCAATCTTGATAAGGTTCGGTTTGCCAATGGACACACACCAACTGCAAATTTGAGATTAAAGATGCAAAAGATTATGCAAGGACATGCAGCTGTATTCAGGACTGGTGCTGTCTTGGCAGAAGGAGTGTCAAAGATTTATCAGGCTTATGATGAGTTGAAGGATCTTAAGCTTTATGACCGTGGTATGATATTGAATACAGATCTTGTTGAAGCTCTGGAACTTCAAAATTTAATGTTAAACTCCTGTCTAGCAATGGTGTCTGCAGAAGCCAGAAAAGAAAGTCGTGGTGCGCTT
DQ286058.1 ATTGGTCGTTATTGCGGTCAAGGACGTTACCCATTAATGTCTTCAGTGGGTAAACTTCTTGGTGGATACGTTCCTCCCGTTGAGCCTACGTTCTCCCCAACCACAAAAGGACCATCAACGCCTAGCAAAAGTAGTACTGCCACTGATCGTCCTGCAACAAACCCCCCAACTGGAGCATGTAAGGCAATCGATGCAAGAGTAAAAGATCAATGGTGTAATGATAACTGTCCCAAAGGATATTGCCCTACTGAGTTTTGTAAATGTTAAATAAATAAAAAAAGTGTTAACTTATTGCTCTTAAAAAAAAAAAAAAAAAAA
DQ286054.1 TTCTTCAATATTTTAATCAACACAGAAAAAAAATATTTTGACTGTAAAAAAGAAGAATCCTTTTTTAGCTATGGGTTTTTATGAATTACCAGAATGGTCAGTTAAACTAATTATAATTATTATTTTATTTTTATTGGGCATGATATTTGGTGTTGTACCATTAAATCTTTCACGAAGCTCCTCATTTGAAGGAAGAGTTTCTCCTACTCGCAATCTGCTTATTAGTTTGTCAAATTGCTTTGCTGGAGGGGTGTTTTTTAGCACTGTTATTCTTGATTTATTTCCATTGGTAAAGTTAACAGTAAATAATGCACTAATATCTGTTTATATTGATACTGATTTTCCGCTAGGGGATTTTATTATTGGTATTGGATTTATCTTTATGTTAATTTTAGAGCATATAGTTCATTCTTGTTGCCATCCTAATCAGTTATCTTATGAAGCTCCTAAAAATGTTAATAGTAACCAGGATGAATTATCATGTAATGAAAATAATCATCTTTTATCTCATGACAACAACTTTGATGTAGTTACTGATATTGAGATAAACACTTCAGAAAGACAGCTGCAACAA
NM_001309736.1 ATGTTTTCTTTTGGAATCCACCGCCTTTCAGTTTTTCGAAAAATATCGAGAATAGCATTTGCTAATAAGCACACTCTTCCAGAATTGGGGTATGAATATAATGCATTGGAACCAACAATCAGCAGTCAAATTATGGAGATACATCATCGCAAACACCACCAAGCTTATGTAAATAACTTAAATACAGCAGAAGAACAGTTAGCTGAAGCTCAGCATAAAGGAGATACGTCAAAGATTATTTCTTTAGCTCCTGCGTTAAAATTCAATGGAGGTGGGCACATCAATCATTCCATTTTTTGGACTAATCTTTCGCCAAACGGTGGAGGAAAACCAACAGGTGAACTATTAGAAGCCATATTAAAAGACTTTGGGTCTTTTGAGGCAATGAAAACACGGTTATCGTCTCCAGCTGTTGCAGTGCAAGGTTCGGGTTGGGGTTGGTTGGGATACGATTCTGTCACTAAAAGACTTGCAATTACAGCTTTACCTAATCAAGATCCTTTGCAAGCTACTACTGGGTTAATACCGTTACTCGGTATTGATGTTTGGGAGCATGCGTACTACTTGCAGTATAAGAATGTTCGTCTTGATTATGTCAACGCAATATTTAACATCATTGATTGGAAAAATGTATCCGCAAGGTTTGTCGCAGCTAAATAA
NM_001309772.1 ATGGCTGCATCAGACCCTACAAAAGCTTCTTCTATATTTGAATTTCAAGCAAAAAGTATAGATGGTGAAGATATCAGTCTTTCGAAATATAAAGGTTTTGTTACACTTATTGTTAACGTGGCTAGCAAGGGTTTAACTGAACTCAACTATGCTCAGCTTGCTGATCTGCACACCAAGTATGCTGAGAAAGGTCTTCGAATTCTTGCTTTTCCTTGTAATCAGTTTGGTAACCAAGAGCCTGGTACAGATTTAGAAATAAAAGCGTTTGCATTAGCGCGAGGCGCCCACTATGACTTATTCAGTAAAATTGATGTTAATGGAGATAAGGCAGATCCTCTGTATAAATATTTGAAATCAAAGCAGAAAGGTATTTTGGGTAATAAAATCAAATGGAATTTTTCAAAGTTTATTTGTGATAAAAACGGTATCCCTGTTAAAAGATATGCTCCTACAACAGAACCTTTGTCATTAGTTCCAGATATCGAAAAGTATTTATGCCAATAA
DQ286050.1 CATGCAATGGTCTTGTTCGTATTGGAGTACTTGATGAAGGAAGAATGAAGCTAGATTATGTTTTAGGTTTAAAAGTAGAAGATTTTTTGGAAAGACGTCTACAAACTCAAGTGCTTAAGTTGGGTCTCGCTAAGTCTATTCACCATGCTCGTGTTCTTATCCGACAAAAGCATATTAGAGTGCGAAAGCAGTTAGTCAACATCCCATCATTTATCGTGAGACTTGACTCTCAAAAGCACATAGATTTCAGTACTAATTCACCATTCGGTGGTGGTCGACCAGGACGTGTTTCACGAAAGAACATGAAGAAAGGTGGCAGTGGAGGAAACGATGAAGAAGACGAAGATGAATAGATTATATTGAAATCTGGCATGTGATTGTTTTGTTAGGCGGTTAATAAAGATCATTTGTCAAATAAATCTAAATACTGTACAATAAAAAAAAAAAAAAAAAAA
DQ286049.1 ACATATGGCTGGGTTCGTGAGCTAGAGAAAAAGTTTTCTGGAAAGCATGTCATTGTTGTTGGACAGAGAAGAATCTTGCCTAAACCCAGTCGTAAGACAAGAAATCAAAAGCAAATGAGACCAAGAAGTCGTACTCTAACTGCTGTGCACGATGCCATTCTTGAAGATCTTTGTTTCCCATCGGAAATCGTTGGTAAAAGCATTCGAGTTAAATTAGATGGTTCAAGATTGATAAAAATAGTTTTAGAAAAAGCTCAGCAAACAAATGTTGAACATAAACTTGACACGTTTGCAAATGTTTACAAGAAACTAACTGGTAAAGACACTCATTTTACTTTCGAAATATAAGTCATAACAGAGAAAAAAAAAAAAAAAAAA
DQ286057.1 TGACAACTTTCTGCTGTTGGGCGCCGGAAAGGTAGTGCTTTTCAAACTGTATCATTTCGTCACAAGGAACAACTAAAAAATTTGCTTACTACTCTTGGAATGACTAGTCCTCATTTCGTAAGATGTATCATTCCTAATGAAAAGAAGGAACCAGGAGTTGTTGAGGGCCAACTTGTTCTTCATCAGTTGAGGTGTAATGGTGTCTTGGAAGGTATTCGCATATGTAGAAAAGGTTTTCCATCCAGAATGAATTTTCAAGATTTTAAGTTAAGGTACCAAATACTAGCATCTAATGCGATCCCACCTGGTTTTATTGATGGCAAAGTAGCAGCTGAAAAATTAATTGAGGCACTTCAACTAGATCAAAGTGAATACAGAGTAGGAAAGACAAAAATATTCTTTAGAGCTGGTATTGTGGGAGAGTTAGAAGAAATGCGCGATGAGCGATTATCTAAAATTATTTCACAGTTCCAAGCATACTGTAAGGGCAGTATTATGCGCAGTGAATATAAAAAGATGGTGGCACAGCGTATTGGTCTAGCTGTTATTCAAAGAAATGTCAGAAAGTATTTATTCTTGCGCCATTGGTCTTGGTGGAAGTTGTACACTAAGGTTCAACCTTTATTGAGTGTTGCACGAGCAGAGGATGAAATGAGAGCAAAAGAAGAAGAGTTAGAAGCTGCTAAAGAACAATTAAAAAAAGATGCAGAAGCTAAGAAAAAAATGGAAGAAGAACTGACTGAGGCTATGGCTCAAAAAGAAAAACTTTATGCAAGTTTACAAGCTGAGACTGACAGATTAATTACAATTGAAGACAAGCTTCTCAATCTGCAAACAGTTAAGGATAAACTTGAAAGTAGTCTAAATGAAGCATTAGAAAAGCTGGATGGAGAAGAACATAGTGTTTTAGTTCTTGAAGAAAAGATTCAAGAAGCAGAAGAAAAAATTGACGAACTTACTGAAAAGACTGAGGAACTCCAATCAAACATTAGTCGACTTGAAACTGAAAAACAAAATCGTGATAAACAAATTGATACCTTGAATGAAGATATTCGCAAGCAAGATGAAACTATCTCTAAAATGAATGCAGAAAAGAAGCATGTAGATGAGGAGTTGAAAGATCGCACTGAACAACTACAGGCTGCTGAGGATAAATGCAACAACCTCAATAAAACAAAGAATAAATTAGAATCTTCTATTAGAGAGATTGAACAAGATTTAAAGAAAGAAAAAGACAGTAAAATGAAGTTAGAAAAAGAAAAAAAGAAAGTTGAGTCAGATCTTAAAGACAATCGAGATAAACTTTCAGAAACAGAAACTCGTCTAAAAGAAACTCAGGATCTTGTAACTAAACGAGAAAAGTCAATATCCGATTTAGAAAATGCAAAAGAAGGTCTTGAATCACAGATTAGTCAACTCCAAAGAAAAATACAAGAACTTCTTGCTAAAATTGAAGAATTAGAAGAAGAGCTTGAAAACGAAAGAAAGTTGAGGCAGAAATCAGAGCTACAAAGAAAAGAGTTAGAGTCAAGAATTGAGGAATTGCAAGACCAACTTGAAACAGCAGGCGGTGCTACATCAGCTCAAGTTGAAGTTGGTAAAAAACGTGAAGCTGAATGTAATCGCCTTAGAAAAGAGATTGAAGCCCTTAACATAGCAAATGATGCTGCCATCTCAGCTATTAAAGCAAAAACAAATGCTACAATAGCAGAAATTCAAGAGGAAAATGAAGCAATGAAAAAAGCAAAAGCAAAACTTGAGAAAGAAAAAAGTGCACTTAATAATGAATTAAATGAAACTAAAAACTCGCTTGATCAAATTAAGAAGCAAAAAACTAATAGTGACAAGAACTCCCGTATGCTTGAAGAACAAATCAATGAACTAAACAGCAAGTTGGCTCAAGTTGATGAATTACATTCTCAAAGTGAGTCAAAGAATTCTAAAGTTAACAGTGAGTTGTTGGCTCTTAACAGTCAATTGAGCGAATCA
GAACATAATTTGGGAATAGCTACTAAAAATATAAAAACTTTAGAAAGTCAACTTGCAGAAAGTAAAAATTTTAATGAAGCTGAATCAAAGGCTAAACTTGAGAATTACAACAGCTCGAATGCTTTT
DQ286055.1 GAAGTTTAGAATTCTTCCAGACTACTTATTAACAGATTATACTGACAGTAGATGAAAGAAGTTAGTCGGTTTACAAATAGTCAAGGAATTAAGATGAAATGTAAGTTTGGCCGAAATGCTACCTTTATTTTAATGTTGGTATAACAATGTCATTTTTTATTGTGGAGCTTGTTGTTGGTTATATGACTAAATCAATGGCATTGGTTGCTGACTCCTTTCAGATGTTATCGGATACAGTCTCTATTATTGTTGGCTTTGTTGCTTTTCACTGTTCGAAGCGTAGTGAAACCTCCAGCCGATTTACATATGGCTGGGTTCGTGCTGAAATACTTGGAGCTTTAGTTAATTCAGTATTTCTTGCTGCTCTTTGTTTTACAATTCTCATAGAATCATTTAAGCGGTCTGCTATTCCAGAAAGAGTTGAAAATCCTAAACTTGTTCTTATAGTCGGAGCAGTTGGTTTGCTTGTTAATATAATTGGGTTGTTTCTTTTTAATCACCACAGTAATGGCCATTCAAATAATAGTGAATCTGTTGAAAAAGGACATAATAATGAAGTTGTAGACAATATTGTTGCTGAATTCCCATTAGTTGATAGTAGTGAAGTGGTTATTTATGATAGTGATAAAAGCAATTCCCAAGTACCTCAAGTTGTAAGTAATAATGAGAATAGTAAAAAAAAATTAGGAGCATCTCGTCTAAATATTCGTGGAGTTTATTTGAATATTCTTGGAGATGCTTTAGGGTCAG
DQ286051.1 CAGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAAGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGTTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCGTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATTGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCTTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATATTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA
# hv_seqs was rebound by the print loop above to the last sequence string,
# so this slice displays that sequence without its first base.
hv_seqs[1:]
'AGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAAGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGTTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCGTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATTGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCTTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATATTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA'
# Keep the sliced sequence (first base dropped) for the examples that follow.
hv_seq_1 = hv_seqs[1:]
# str.count reports non-overlapping occurrences of the trinucleotide "AGC".
hv_seq_1.count("AGC")
9
def revs_complement(dna):
    """Return the reverse complement of a DNA sequence.

    Upper- and lower-case bases are both accepted and keep their case, and
    the ambiguity code N/n is passed through unchanged, so sequences such as
    the mixed-case, N-containing examples earlier in this file can be
    processed.

    Args:
        dna: string made of the symbols A, C, G, T, N in either case.

    Returns:
        The complemented sequence read in the opposite direction.

    Raises:
        KeyError: if *dna* contains any other symbol.
    """
    pairs = {
        "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
        "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
    }
    # Complementing while walking the sequence backwards is equivalent to
    # complementing first and reversing afterwards.
    return "".join(pairs[base] for base in reversed(dna))
# Reverse-complement the sequence stored in hv_seq_1.
revs_complement(hv_seq_1)
'TTTTTTTTTTTTTTTTTTAATATTAACTCTGTAATATATTTTTAAACTTTGTTCACAACCTGTCCAGCTATTCGATCCATATCCCTTCGACCTGCAGGAGTTAAGCTACGTCCACCTGCTGCATCTTTAGTAACAAGTTTCATTGCTTCTAGAGATTGTAAGACCTTTCGAGCAATTGAGGCTGATCCTAAGCATGAATGTGATGGCCTTGTTCCATTCCTTTGAGCACGACCATAAATTTTACGAATAGCACCAACACCAAGGTTTGGTCTTAAATAAATATGTCTAGCAACAGCTGCAGCACGGATATAAAACCAATCAGGATCAGAAGGTCCAAGTTCACAAGCTTTCGAAGTTTTTACTATCTCAACAAATTCTGGCACTTTAAATTTATTACCTTTTTTTAGGTGTCCAGCAAAAGCTTTTACAAATTCTTGAGGGCCATTAAAGCT'
class dna_tool_sets ():
    """Record statistics and small sequence helpers for a multi-FASTA file."""

    def __init__(self, file_name):
        """Load every record of *file_name* into ``self.dict``.

        Args:
            file_name: path of the multi-FASTA file to read.

        Raises:
            ValueError: if sequence data appears before the first header.
        """
        self.file_name = file_name
        # Maps each full header line (leading ">" included) to the
        # concatenation of the sequence lines that follow it.
        self.dict = {}
        header = None
        # "with" closes the file even when parsing raises part-way through.
        with open(self.file_name) as f_reader:
            for line in f_reader:
                line = line.strip("\n")
                if ">" in line:  # this line is a header
                    header = line
                    self.dict[header] = ""  # start with an empty sequence
                elif line:
                    if header is None:
                        raise ValueError(
                            "sequence data found before the first FASTA header")
                    self.dict[header] += line  # append sequence data

    def count_records (self):
        """Print the number of records that were read from the file."""
        number_of_records = len(self.dict)
        print(" The number of records are in the multi-FASTA file: %d \n"
              % number_of_records)

    def check_length(self):
        """Print the longest and shortest sequence lengths and their counts.

        Raises:
            ValueError: if no records were loaded (max/min of nothing).
        """
        lengths = [len(sequence) for sequence in self.dict.values()]
        max_length = max(lengths)
        min_length = min(lengths)
        # How many records share the extreme lengths.
        record_max_length = [n for n in lengths if n == max_length]
        record_min_length = [n for n in lengths if n == min_length]
        print(" The length of the longest sequence: %d \n" % max_length,
              "The number of longest sequence: %d \n" % len(record_max_length))
        print(" The length of the shortest sequence: %d \n" % min_length,
              "The number of shortest sequence: %d \n" % len(record_min_length))

    @staticmethod
    def revs_complement(dna):
        """Return the reverse complement of *dna*.

        Declared as a static method so it can be called on the class and on
        an instance alike. Upper- and lower-case bases keep their case and
        N/n is passed through unchanged.

        Raises:
            KeyError: if *dna* contains a symbol other than A, C, G, T, N.
        """
        pairs = {
            "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
            "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
        }
        return "".join(pairs[base] for base in reversed(dna))

    def find_repeats(self, dna, n):
        """Count every length-*n* substring of *dna*.

        Args:
            dna: sequence to scan.
            n: substring length.

        Returns:
            Dict mapping each substring of exactly *n* characters to the
            number of (possibly overlapping) positions where it occurs.
        """
        repeats = {}
        for i in range(len(dna)):
            repeat = dna[i:i + n]
            # Windows that run off the end of the sequence are too short.
            if len(repeat) == n:
                repeats[repeat] = repeats.get(repeat, 0) + 1
        return repeats
# Script entry point: load hv_nt_seq.fasta and print the record count and the
# longest/shortest sequence statistics.
if __name__ == "__main__":
file_name = "hv_nt_seq.fasta"
dna_tools = dna_tool_sets (file_name)
dna_tools.count_records()
dna_tools.check_length()
The number of records are in the multi-FASTA file: 10
The length of the longest sequence: 2115
The number of longest sequence: 1
The length of the shortest sequence: 318
The number of shortest sequence: 1
class dna_tool_sets ():
    """ORF and repeat analysis for the records of a multi-FASTA file."""

    def __init__(self, file_name):
        """Load every record of *file_name* into ``self.dict``.

        Args:
            file_name: path of the multi-FASTA file to read.

        Raises:
            ValueError: if sequence data appears before the first header.
        """
        self.file_name = file_name
        # Maps each full header line (leading ">" included) to the
        # concatenation of the sequence lines that follow it.
        self.dict = {}
        header = None
        # "with" closes the file even when parsing raises part-way through.
        with open(self.file_name) as f_reader:
            for line in f_reader:
                line = line.strip("\n")
                if ">" in line:
                    header = line
                    self.dict[header] = ""
                elif line:
                    if header is None:
                        raise ValueError(
                            "sequence data found before the first FASTA header")
                    self.dict[header] += line

    def find_pos(self, dna):
        """Locate ORFs in the three forward reading frames of *dna*.

        Every in-frame ATG is paired with the nearest in-frame stop codon
        (TAA, TAG or TGA) downstream of it, so several starts may share one
        stop.

        Args:
            dna: upper-case nucleotide sequence.

        Returns:
            Dict with keys "frame1", "frame2" and "frame3". Each value is a
            list of ``(start, length)`` tuples in order of start position,
            where ``start`` is the 1-based position of the A of ATG and
            ``length`` is the ORF length in nucleotides including the stop
            codon. A frame without any complete ORF holds ``[(-1, 0)]``.
        """
        start_code = "ATG"
        stop_codes = ("TAA", "TAG", "TGA")
        pos_dict = {}
        for offset in range(3):
            starts = []
            stops = []
            # Only complete codons can match, so stop before a trailing
            # fragment shorter than three bases.
            for begin in range(offset, len(dna) - 2, 3):
                codon = dna[begin:begin + 3]
                if codon == start_code:
                    starts.append(begin)
                elif codon in stop_codes:
                    stops.append(begin)

            pos = []
            nxt = 0  # index into stops of the first stop beyond the start
            for start in starts:
                # Both lists are ascending, so the pointer only moves forward.
                while nxt < len(stops) and stops[nxt] <= start:
                    nxt += 1
                if nxt == len(stops):
                    # No stop after this start, hence none after later ones.
                    break
                pos.append((start + 1, stops[nxt] - start + 3))

            # Sentinel entry keeps max(..., key=length) usable downstream.
            pos_dict["frame%d" % (offset + 1)] = pos if pos else [(-1, 0)]
        return pos_dict

    @staticmethod
    def revs_complement(dna):
        """Return the reverse complement of *dna*.

        Declared as a static method so it can be called on the class and on
        an instance alike. Upper- and lower-case bases keep their case and
        N/n is passed through unchanged.

        Raises:
            KeyError: if *dna* contains a symbol other than A, C, G, T, N.
        """
        pairs = {
            "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
            "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
        }
        return "".join(pairs[base] for base in reversed(dna))

    def orf_identifier (self, seq_id="DQ286061.1"):
        """Print ORF statistics for the whole file and for one record.

        Args:
            seq_id: text identifying the record of interest; the first
                header that contains it is used.

        Raises:
            IndexError: if no header contains *seq_id*.
            ValueError: if no records were loaded.
        """
        # ORF positions for every record, keyed by header.
        orf = {header: self.find_pos(dna_seq)
               for header, dna_seq in self.dict.items()}

        id_key = [key for key in orf if seq_id in key]
        if not id_key:
            raise IndexError("no record header contains %r" % (seq_id,))
        idx = id_key[0]

        frame1, frame2, all_frames, id_frames = [], [], [], []
        for key, dict_value in orf.items():
            frame1 += dict_value["frame1"]
            frame2 += dict_value["frame2"]
            frames = (dict_value["frame1"] + dict_value["frame2"]
                      + dict_value["frame3"])
            all_frames += frames
            if key == idx:
                id_frames = frames

        # Entries are (start, length); compare on the length component.
        frame2_max_length = max(frame2, key=lambda x: x[1])
        print("The length of longest ORF in frame2: %d\n" % frame2_max_length[1])
        frame1_max_length_pos = max(frame1, key=lambda x: x[1])
        print("The start position of longest ORF in frame1: %d\n"
              % frame1_max_length_pos[0])
        max_length = max(all_frames, key=lambda x: x[1])
        print("The longest ORF of all frames and sequences: %d\n" % max_length[1])
        max_length_id = max(id_frames, key=lambda x: x[1])
        print("The length of longest ORF for ", idx,
              "is: %d \n" % max_length_id[1])

    def find_repeats(self, dna, n):
        """Count every length-*n* substring of *dna*.

        Args:
            dna: sequence to scan.
            n: substring length.

        Returns:
            Dict mapping each substring of exactly *n* characters to the
            number of (possibly overlapping) positions where it occurs.
        """
        repeats = {}
        for i in range(len(dna)):
            repeat = dna[i:i + n]
            # Windows that run off the end of the sequence are too short.
            if len(repeat) == n:
                repeats[repeat] = repeats.get(repeat, 0) + 1
        return repeats

    def repeats_identifier(self, n):
        """Print repeat statistics over all records for window length *n*.

        Counts are summed across every record. For ``n == 6`` the highest
        count and the substrings reaching it are printed; for ``n == 12`` the
        number of distinct substrings reaching the highest count is printed.
        Other values of *n* print nothing.

        Raises:
            ValueError: for n of 6 or 12 when no substring of that length
                exists in any record.
        """
        combined_repeats = {}
        for dna_seq in self.dict.values():
            for repeat, count in self.find_repeats(dna_seq, n).items():
                combined_repeats[repeat] = combined_repeats.get(repeat, 0) + count

        if n != 6 and n != 12:
            return

        # Computed once instead of once per dictionary entry.
        highest = max(combined_repeats.values())
        most_frequent = [key for key in combined_repeats
                         if combined_repeats[key] == highest]
        if n == 6:
            print("The most frequently repeats occur: %d times \n" % highest)
            print("The following repeats occured most frequently: \n",
                  most_frequent)
        if n == 12:
            print("The number of different 12-base sequences occur max times: %d \n"
                  % len(most_frequent))
# Script entry point: load hv_nt_seq.fasta, print the ORF statistics, then the
# repeat statistics for window lengths 6 and 12.
if __name__ == "__main__":
file_name = "hv_nt_seq.fasta"
dna_tools = dna_tool_sets (file_name)
dna_tools.orf_identifier()
dna_tools.repeats_identifier(6)
dna_tools.repeats_identifier(12)
The length of longest ORF in frame2: 150
The start position of longest ORF in frame1: 1
The longest ORF of all frames and sequences: 660
The length of longest ORF for >DQ286061.1 Hydra vulgaris mitochondrial succinate dehydrogenase flavoprotein subunit mRNA, partial cds; nuclear gene for mitochondrial product is: 150
The most frequently repeats occur: 79 times
The following repeats occured most frequently:
['AAAAAA']
The number of different 12-base sequences occur max times: 1
# Example protein sequence with a run of non-standard residues at the front.
protein = 'UUUUUMFSFGIHRLSVFRKISRIAFANKHTLPELGYEYNALEPTISSQIMEIHHRKHHQAYVNNLNTAEEQLAEAQHKGDTSKIIS'
# Show every residue that is not one of the 20 standard amino acid letters,
# together with its 0-based position.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print(residue, i)
U 0
U 1
U 2
U 3
U 4
# Report once, at the first non-standard residue, that the sequence is invalid.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print("this is not a valid protein sequence!")
    break
this is not a valid protein sequence!
# Describe each non-standard residue and the 0-based position where it occurs.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print("protein contains invalid amino acid %s at position %d" %(residue,i))
protein contains invalid amino acid U at position 0
protein contains invalid amino acid U at position 1
protein contains invalid amino acid U at position 2
protein contains invalid amino acid U at position 3
protein contains invalid amino acid U at position 4
# Rebuild the sequence from its standard residues only, then print the result.
corrected_protein = ''
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        corrected_protein += residue
print("Corrected protein sequence is: %s" %corrected_protein)
Corrected protein sequence is: MFSFGIHRLSVFRKISRIAFANKHTLPELGYEYNALEPTISSQIMEIHHRKHHQAYVNNLNTAEEQLAEAQHKGDTSKIIS