title: "Annotate and analyze nucleotide and amino acid sequences using python"
author: "Bhagirathi Dash"
date: "December 1, 2018"
output: html_document

Nucleotide and Amino Acid Sequence Manipulation

import sys
sys.stdout.write("Hello from Python %s\n" % (sys.version,))
Hello from Python 3.5.6 |Anaconda, Inc.| (default, Aug 26 2018, 16:05:27) [MSC v.1900 64 bit (AMD64)]

Manipulate a nucleotide sequence

DNA = "atgcnatgcccnggttaatngcagNNNNNgaTCCCAGCCGGCTGAGGCGGGCAGGGCCGGGCGGGGCCGCGCCACGGAGCCCACAGCCCGGCGCTCCCTGCnnnnNNNNtga"
DNA[:3]
'atg'
DNA.lower()
'atgcnatgcccnggttaatngcagnnnnngatcccagccggctgaggcgggcagggccgggcggggccgcgccacggagcccacagcccggcgctccctgcnnnnnnnntga'

Create and manipulate a random DNA sequence

import random
def create_dna(n, alphabet='acgt'):
    """Return a random sequence of ``n`` characters drawn from ``alphabet``.

    Every position is chosen independently with ``random.choice``; the
    default alphabet is the four lower-case DNA bases.
    """
    picks = []
    for _ in range(n):
        picks.append(random.choice(alphabet))
    return ''.join(picks)
dna = create_dna(1000)
dna
'gggctcctccagcgtctaaaggtgctagtcgcccattactctgggtatggacttagagagcataggttacgagccggagagatcaaatatacgacgcagcgagaaatacgatcgtacggtgggagtcacgacgtacatcctaatatgaaccagtttgttcaccgcaatatatcctttcagcggcgcgagtctccccgctcatttggggatagacttagatgggggtgaaggggcgcaacatgtagactaagaagtgaatgcgtgcttcaaatctgtacggtttcctaactcacactttaaaccgcagaataaaactgtataacctatcgtcctgtaaacgagaattcgccacgaacctagctctaaattgtcttgtggtttgtgatcgttgataggagagacacaaatagattaagatcaatcgaatgtgaccataggaccacctatcactttatccccagcagtaaatctactaagcctgaaaacgttgcaacatattatgtctcccagaacgcttagtaacataaggagaaatggagttaccaacgctgtccgccagatcgggcatcgggttgccccgtagtattgagcgtaagttaaagccagacaagtaaggatctgaactgtaaccgaggccgatagcacagcatcagtagcctctatgccgtgcgcgccgaatgtggcaccgcgacagtggaatttcgctgaagggtcttgcaccaattcgcagcttaaaatcctagaattccattatgcggataaatttaattaggtatcacggagtagagacgttagcaggaacagattatggtcgaatcttacacagcaccgaatgaatgttacttcactcattgcctaaattgacggggccagcaatccgggggcccctagccattgcgggtataccattacaagtgtagagtcaagtaggtcatctcttacggtggcggcgtggcatcccgtgcgactccatcctaacgtcgaacgatgctacctatac'
dna.find('tcc')
4
dna.count('n')
0
dna.count('a')
285
dna.count('t')
239
dna.count('g')
242
dna.count('c')
234
dna.count('gg')
43
# count total number of nucleotides

# len() reports the number of characters directly, so no per-base
# counting loop is needed.
count_nts = len(dna)

print(count_nts)
1000
# count the number of unknown nucleotides

# str.count already scans the whole sequence, so one call per letter is
# enough (no loop).  Both cases of 'n' are counted, and the name is always
# bound, even when dna is empty.
unknown_nts = dna.count('n') + dna.count('N')

print('The number of unknown nucleotides are %d.' % unknown_nts)
The number of unknown nucleotides are 0.
#splice donor site position

# str.find returns the index of the first 'gt' dinucleotide, or -1 when the
# sequence contains none.
gt_position = dna.find('gt', 0)

if gt_position == -1:
    # -1 is find()'s "not found" marker, not a real position
    print('No splice donor site (gt) found.')
else:
    print('Splice donor site position is at %d.' % gt_position)
Splice donor site position is at 13.
# make a dictionary of nucleotides and manipulate

# Tally each of the four bases with str.count, keyed by the base letter.
dna_counts = {base: dna.count(base) for base in ('t', 'c', 'g', 'a')}
dna_counts.items()
dict_items([('c', 234), ('a', 285), ('g', 242), ('t', 239)])
dna_counts
{'a': 285, 'c': 234, 'g': 242, 't': 239}
dna_counts.keys()
dict_keys(['c', 'a', 'g', 't'])
sorted(dna_counts.keys())
['a', 'c', 'g', 't']
del dna_counts['a']
dna_counts
{'c': 234, 'g': 242, 't': 239}
sorted(dna_counts.values())
[234, 239, 242]
# another approach to build nts dictionary: agnostic about unknown nucleotides

# Build the tally one character at a time, so any symbol that occurs
# (including unknown bases) gets its own entry.
nts_dict = {}

for nts in dna:
    if nts in nts_dict:
        nts_dict[nts] += 1
    else:
        # first sighting of this symbol
        nts_dict[nts] = 1
print(nts_dict)
{'c': 234, 'a': 285, 't': 239, 'g': 242}
# get method for dictionaries

nts_counts = {}

for nts in dna:
    # get() supplies 0 the first time a symbol is seen, which removes the
    # need for an explicit membership test
    nts_counts[nts] = 1 + nts_counts.get(nts, 0)

print(nts_counts)
{'c': 234, 'a': 285, 't': 239, 'g': 242}
#tabulate a dictionary
for key in nts_counts:
    # iterating a dict yields its keys; each count is looked up by key
    print(key, nts_counts[key])
c 234
a 285
t 239
g 242
#another approach to tabulate dictionary

for k, v in nts_counts.items() :
    # items() yields (key, value) pairs, unpacked here into k and v
    print(k, v)
c 234
a 285
t 239
g 242
# find whether there is a stop codon in your sequence

def has_stop_codon(dna):
    """Report whether ``dna`` has a stop codon in its first reading frame.

    The sequence is read as consecutive triplets starting at index 0.  Each
    triplet is lower-cased before comparison, so the check ignores case.
    Returns True if any triplet is 'tga', 'tag' or 'taa', otherwise False.
    """
    stops = ('tga', 'tag', 'taa')
    triplets = (dna[start:start + 3] for start in range(0, len(dna), 3))
    return any(triplet.lower() in stops for triplet in triplets)

has_stop_codon(dna)  # the sequence variable, not the literal string 'dna'
True
# check whether there is a stop codon in a given reading frame

def has_stop_codon(dna, frame=0):
    """Report whether ``dna`` has a stop codon in the given reading frame.

    Triplets are read starting at index ``frame`` (0 by default) and
    compared case-insensitively with 'tga', 'tag' and 'taa'.  Returns True
    at the first match, otherwise False.
    """
    stops = ('tga', 'tag', 'taa')
    for start in range(frame, len(dna), 3):
        if dna[start:start + 3].lower() in stops:
            return True
    return False

#seq="atgcATGCatgctaa"

# pass the sequence variable itself; the literal string 'dna' is only three
# letters long and can never contain a codon in frame 2
y = has_stop_codon(dna, 2)

print(y)

print('Done')
True
Done
# are there undefined bases in your sequence

# Report how many undefined bases (either case of 'n') the sequence holds.
if 'n' not in dna and 'N' not in dna:
    print("Dna sequence has no undefined bases")

else:
    nbases = dna.count('n') + dna.count('N')
    print("dna sequence has %d undefined bases" % nbases)
Dna sequence has no undefined bases
# calculate the gc percent in sequence

def gc(dna):
    """Return the GC content of ``dna`` as a percentage between 0 and 100.

    Undefined bases ('n' / 'N') are left out of the denominator, and both
    upper- and lower-case letters are counted.  A sequence with no defined
    bases (empty, or consisting only of 'n' / 'N') yields 0.0 instead of
    raising ZeroDivisionError.
    """
    nbases = dna.count('n') + dna.count('N')
    bases = len(dna) - nbases
    if bases == 0:
        # nothing to divide by: report 0 percent rather than crash
        return 0.0
    gccount = dna.count('c') + dna.count('C') + dna.count('g') + dna.count('G')
    gcpercent = (gccount/bases)*100
    return gcpercent

print(gc(dna))

print("done")
47.599999999999994
done
# reverse the DNA sequence

x=dna[::-1]
print(x)
catatccatcgtagcaagctgcaatcctacctcagcgtgccctacggtgcggcggtggcattctctactggatgaactgagatgtgaacattaccatatgggcgttaccgatccccgggggcctaacgaccggggcagttaaatccgttactcacttcattgtaagtaagccacgacacattctaagctggtattagacaaggacgattgcagagatgaggcactatggattaatttaaataggcgtattaccttaagatcctaaaattcgacgcttaaccacgttctgggaagtcgctttaaggtgacagcgccacggtgtaagccgcgcgtgccgtatctccgatgactacgacacgatagccggagccaatgtcaagtctaggaatgaacagaccgaaattgaatgcgagttatgatgccccgttgggctacgggctagaccgcctgtcgcaaccattgaggtaaagaggaatacaatgattcgcaagaccctctgtattatacaacgttgcaaaagtccgaatcatctaaatgacgacccctatttcactatccaccaggataccagtgtaagctaactagaattagataaacacagagaggatagttgctagtgtttggtgttctgttaaatctcgatccaagcaccgcttaagagcaaatgtcctgctatccaatatgtcaaaataagacgccaaatttcacactcaatcctttggcatgtctaaacttcgtgcgtaagtgaagaatcagatgtacaacgcggggaagtgggggtagattcagataggggtttactcgcccctctgagcgcggcgactttcctatataacgccacttgtttgaccaagtataatcctacatgcagcactgagggtggcatgctagcataaagagcgacgcagcatataaactagagaggccgagcattggatacgagagattcaggtatgggtctcattacccgctgatcgtggaaatctgcgacctcctcggg
#Make a complement of the DNA sequence

# Lookup table pairing every base (either case) with its complement; the
# unknown base N/n maps to itself.
basecomplement = dict(zip('ATGCatgcNn', 'TACGtacgNn'))

# complement each base in order; a character missing from the table
# raises KeyError
letters = [basecomplement[base] for base in dna]

joinletters = ''.join(letters)

print(joinletters)
cccgaggaggtcgcagatttccacgatcagcgggtaatgagacccatacctgaatctctcgtatccaatgctcggcctctctagtttatatgctgcgtcgctctttatgctagcatgccaccctcagtgctgcatgtaggattatacttggtcaaacaagtggcgttatataggaaagtcgccgcgctcagaggggcgagtaaacccctatctgaatctacccccacttccccgcgttgtacatctgattcttcacttacgcacgaagtttagacatgccaaaggattgagtgtgaaatttggcgtcttattttgacatattggatagcaggacatttgctcttaagcggtgcttggatcgagatttaacagaacaccaaacactagcaactatcctctctgtgtttatctaattctagttagcttacactggtatcctggtggatagtgaaataggggtcgtcatttagatgattcggacttttgcaacgttgtataatacagagggtcttgcgaatcattgtattcctctttacctcaatggttgcgacaggcggtctagcccgtagcccaacggggcatcataactcgcattcaatttcggtctgttcattcctagacttgacattggctccggctatcgtgtcgtagtcatcggagatacggcacgcgcggcttacaccgtggcgctgtcaccttaaagcgacttcccagaacgtggttaagcgtcgaattttaggatcttaaggtaatacgcctatttaaattaatccatagtgcctcatctctgcaatcgtccttgtctaataccagcttagaatgtgtcgtggcttacttacaatgaagtgagtaacggatttaactgccccggtcgttaggcccccggggatcggtaacgcccatatggtaatgttcacatctcagttcatccagtagagaatgccaccgccgcaccgtagggcacgctgaggtaggattgcagcttgctacgatggatatg
# reverse the DNA seq using a function

def rev_string(seq):
    """Return ``seq`` with its elements in reverse order."""
    backwards = slice(None, None, -1)
    return seq[backwards]

rev_string(dna)
'catatccatcgtagcaagctgcaatcctacctcagcgtgccctacggtgcggcggtggcattctctactggatgaactgagatgtgaacattaccatatgggcgttaccgatccccgggggcctaacgaccggggcagttaaatccgttactcacttcattgtaagtaagccacgacacattctaagctggtattagacaaggacgattgcagagatgaggcactatggattaatttaaataggcgtattaccttaagatcctaaaattcgacgcttaaccacgttctgggaagtcgctttaaggtgacagcgccacggtgtaagccgcgcgtgccgtatctccgatgactacgacacgatagccggagccaatgtcaagtctaggaatgaacagaccgaaattgaatgcgagttatgatgccccgttgggctacgggctagaccgcctgtcgcaaccattgaggtaaagaggaatacaatgattcgcaagaccctctgtattatacaacgttgcaaaagtccgaatcatctaaatgacgacccctatttcactatccaccaggataccagtgtaagctaactagaattagataaacacagagaggatagttgctagtgtttggtgttctgttaaatctcgatccaagcaccgcttaagagcaaatgtcctgctatccaatatgtcaaaataagacgccaaatttcacactcaatcctttggcatgtctaaacttcgtgcgtaagtgaagaatcagatgtacaacgcggggaagtgggggtagattcagataggggtttactcgcccctctgagcgcggcgactttcctatataacgccacttgtttgaccaagtataatcctacatgcagcactgagggtggcatgctagcataaagagcgacgcagcatataaactagagaggccgagcattggatacgagagattcaggtatgggtctcattacccgctgatcgtggaaatctgcgacctcctcggg'
# make a reverse complement of the DNA sequence using a function

def rev_complement(dna_seq):
    """Return the reverse complement of ``dna_seq``.

    Each base is swapped for its pairing partner (case is preserved and
    'N' / 'n' stay as they are), then the complemented string is reversed.
    A character outside ACGT / acgt / Nn raises KeyError.
    """
    partner = dict(zip('ATGCatgcNn', 'TACGtacgNn'))
    complemented = ''.join(partner[nt] for nt in dna_seq)
    return complemented[::-1]

rev_complement(dna)
'gtataggtagcatcgttcgacgttaggatggagtcgcacgggatgccacgccgccaccgtaagagatgacctacttgactctacacttgtaatggtatacccgcaatggctaggggcccccggattgctggccccgtcaatttaggcaatgagtgaagtaacattcattcggtgctgtgtaagattcgaccataatctgttcctgctaacgtctctactccgtgatacctaattaaatttatccgcataatggaattctaggattttaagctgcgaattggtgcaagacccttcagcgaaattccactgtcgcggtgccacattcggcgcgcacggcatagaggctactgatgctgtgctatcggcctcggttacagttcagatccttacttgtctggctttaacttacgctcaatactacggggcaacccgatgcccgatctggcggacagcgttggtaactccatttctccttatgttactaagcgttctgggagacataatatgttgcaacgttttcaggcttagtagatttactgctggggataaagtgataggtggtcctatggtcacattcgattgatcttaatctatttgtgtctctcctatcaacgatcacaaaccacaagacaatttagagctaggttcgtggcgaattctcgtttacaggacgataggttatacagttttattctgcggtttaaagtgtgagttaggaaaccgtacagatttgaagcacgcattcacttcttagtctacatgttgcgccccttcacccccatctaagtctatccccaaatgagcggggagactcgcgccgctgaaaggatatattgcggtgaacaaactggttcatattaggatgtacgtcgtgactcccaccgtacgatcgtatttctcgctgcgtcgtatatttgatctctccggctcgtaacctatgctctctaagtccatacccagagtaatgggcgactagcacctttagacgctggaggagccc'
# Print all the steps in making a reverse complement

def myfunction(first, second, third, *therest):
    """Print each intermediate step of reverse-complementing ``dna``.

    Four stages are printed in order: the sequence split into a list of
    bases, the list of complemented bases, those bases joined into a
    string, and that string reversed.

    The arguments are accepted but never used; the sequence is read from
    the module-level name ``dna``.  Returns None.
    """
    basecomplement = dict(zip('ATGCatgcNn', 'TACGtacgNn'))
    letters = list(dna)
    compl_letters = [basecomplement[nt] for nt in letters]
    join_compl_letters = ''.join(compl_letters)
    rev_join_compl_letters = join_compl_letters[::-1]

    steps = (
        ("first %s", letters),
        ("second %s", compl_letters),
        ("third %s", join_compl_letters),
        ("And all the rest %s", rev_join_compl_letters),
    )
    for template, value in steps:
        # one-element tuple so the value is always formatted as a single item
        print(template % (value,))

print (myfunction(x,y,z,a))
first ['g', 'g', 'g', 'c', 't', 'c', 'c', 't', 'c', 'c', 'a', 'g', 'c', 'g', 't', 'c', 't', 'a', 'a', 'a', 'g', 'g', 't', 'g', 'c', 't', 'a', 'g', 't', 'c', 'g', 'c', 'c', 'c', 'a', 't', 't', 'a', 'c', 't', 'c', 't', 'g', 'g', 'g', 't', 'a', 't', 'g', 'g', 'a', 'c', 't', 't', 'a', 'g', 'a', 'g', 'a', 'g', 'c', 'a', 't', 'a', 'g', 'g', 't', 't', 'a', 'c', 'g', 'a', 'g', 'c', 'c', 'g', 'g', 'a', 'g', 'a', 'g', 'a', 't', 'c', 'a', 'a', 'a', 't', 'a', 't', 'a', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 'g', 'c', 'g', 'a', 'g', 'a', 'a', 'a', 't', 'a', 'c', 'g', 'a', 't', 'c', 'g', 't', 'a', 'c', 'g', 'g', 't', 'g', 'g', 'g', 'a', 'g', 't', 'c', 'a', 'c', 'g', 'a', 'c', 'g', 't', 'a', 'c', 'a', 't', 'c', 'c', 't', 'a', 'a', 't', 'a', 't', 'g', 'a', 'a', 'c', 'c', 'a', 'g', 't', 't', 't', 'g', 't', 't', 'c', 'a', 'c', 'c', 'g', 'c', 'a', 'a', 't', 'a', 't', 'a', 't', 'c', 'c', 't', 't', 't', 'c', 'a', 'g', 'c', 'g', 'g', 'c', 'g', 'c', 'g', 'a', 'g', 't', 'c', 't', 'c', 'c', 'c', 'c', 'g', 'c', 't', 'c', 'a', 't', 't', 't', 'g', 'g', 'g', 'g', 'a', 't', 'a', 'g', 'a', 'c', 't', 't', 'a', 'g', 'a', 't', 'g', 'g', 'g', 'g', 'g', 't', 'g', 'a', 'a', 'g', 'g', 'g', 'g', 'c', 'g', 'c', 'a', 'a', 'c', 'a', 't', 'g', 't', 'a', 'g', 'a', 'c', 't', 'a', 'a', 'g', 'a', 'a', 'g', 't', 'g', 'a', 'a', 't', 'g', 'c', 'g', 't', 'g', 'c', 't', 't', 'c', 'a', 'a', 'a', 't', 'c', 't', 'g', 't', 'a', 'c', 'g', 'g', 't', 't', 't', 'c', 'c', 't', 'a', 'a', 'c', 't', 'c', 'a', 'c', 'a', 'c', 't', 't', 't', 'a', 'a', 'a', 'c', 'c', 'g', 'c', 'a', 'g', 'a', 'a', 't', 'a', 'a', 'a', 'a', 'c', 't', 'g', 't', 'a', 't', 'a', 'a', 'c', 'c', 't', 'a', 't', 'c', 'g', 't', 'c', 'c', 't', 'g', 't', 'a', 'a', 'a', 'c', 'g', 'a', 'g', 'a', 'a', 't', 't', 'c', 'g', 'c', 'c', 'a', 'c', 'g', 'a', 'a', 'c', 'c', 't', 'a', 'g', 'c', 't', 'c', 't', 'a', 'a', 'a', 't', 't', 'g', 't', 'c', 't', 't', 'g', 't', 'g', 'g', 't', 't', 't', 'g', 't', 'g', 'a', 't', 'c', 'g', 't', 't', 'g', 'a', 't', 'a', 'g', 'g', 'a', 'g', 
'a', 'g', 'a', 'c', 'a', 'c', 'a', 'a', 'a', 't', 'a', 'g', 'a', 't', 't', 'a', 'a', 'g', 'a', 't', 'c', 'a', 'a', 't', 'c', 'g', 'a', 'a', 't', 'g', 't', 'g', 'a', 'c', 'c', 'a', 't', 'a', 'g', 'g', 'a', 'c', 'c', 'a', 'c', 'c', 't', 'a', 't', 'c', 'a', 'c', 't', 't', 't', 'a', 't', 'c', 'c', 'c', 'c', 'a', 'g', 'c', 'a', 'g', 't', 'a', 'a', 'a', 't', 'c', 't', 'a', 'c', 't', 'a', 'a', 'g', 'c', 'c', 't', 'g', 'a', 'a', 'a', 'a', 'c', 'g', 't', 't', 'g', 'c', 'a', 'a', 'c', 'a', 't', 'a', 't', 't', 'a', 't', 'g', 't', 'c', 't', 'c', 'c', 'c', 'a', 'g', 'a', 'a', 'c', 'g', 'c', 't', 't', 'a', 'g', 't', 'a', 'a', 'c', 'a', 't', 'a', 'a', 'g', 'g', 'a', 'g', 'a', 'a', 'a', 't', 'g', 'g', 'a', 'g', 't', 't', 'a', 'c', 'c', 'a', 'a', 'c', 'g', 'c', 't', 'g', 't', 'c', 'c', 'g', 'c', 'c', 'a', 'g', 'a', 't', 'c', 'g', 'g', 'g', 'c', 'a', 't', 'c', 'g', 'g', 'g', 't', 't', 'g', 'c', 'c', 'c', 'c', 'g', 't', 'a', 'g', 't', 'a', 't', 't', 'g', 'a', 'g', 'c', 'g', 't', 'a', 'a', 'g', 't', 't', 'a', 'a', 'a', 'g', 'c', 'c', 'a', 'g', 'a', 'c', 'a', 'a', 'g', 't', 'a', 'a', 'g', 'g', 'a', 't', 'c', 't', 'g', 'a', 'a', 'c', 't', 'g', 't', 'a', 'a', 'c', 'c', 'g', 'a', 'g', 'g', 'c', 'c', 'g', 'a', 't', 'a', 'g', 'c', 'a', 'c', 'a', 'g', 'c', 'a', 't', 'c', 'a', 'g', 't', 'a', 'g', 'c', 'c', 't', 'c', 't', 'a', 't', 'g', 'c', 'c', 'g', 't', 'g', 'c', 'g', 'c', 'g', 'c', 'c', 'g', 'a', 'a', 't', 'g', 't', 'g', 'g', 'c', 'a', 'c', 'c', 'g', 'c', 'g', 'a', 'c', 'a', 'g', 't', 'g', 'g', 'a', 'a', 't', 't', 't', 'c', 'g', 'c', 't', 'g', 'a', 'a', 'g', 'g', 'g', 't', 'c', 't', 't', 'g', 'c', 'a', 'c', 'c', 'a', 'a', 't', 't', 'c', 'g', 'c', 'a', 'g', 'c', 't', 't', 'a', 'a', 'a', 'a', 't', 'c', 'c', 't', 'a', 'g', 'a', 'a', 't', 't', 'c', 'c', 'a', 't', 't', 'a', 't', 'g', 'c', 'g', 'g', 'a', 't', 'a', 'a', 'a', 't', 't', 't', 'a', 'a', 't', 't', 'a', 'g', 'g', 't', 'a', 't', 'c', 'a', 'c', 'g', 'g', 'a', 'g', 't', 'a', 'g', 'a', 'g', 'a', 'c', 'g', 't', 't', 'a', 'g', 'c', 'a', 'g', 
'g', 'a', 'a', 'c', 'a', 'g', 'a', 't', 't', 'a', 't', 'g', 'g', 't', 'c', 'g', 'a', 'a', 't', 'c', 't', 't', 'a', 'c', 'a', 'c', 'a', 'g', 'c', 'a', 'c', 'c', 'g', 'a', 'a', 't', 'g', 'a', 'a', 't', 'g', 't', 't', 'a', 'c', 't', 't', 'c', 'a', 'c', 't', 'c', 'a', 't', 't', 'g', 'c', 'c', 't', 'a', 'a', 'a', 't', 't', 'g', 'a', 'c', 'g', 'g', 'g', 'g', 'c', 'c', 'a', 'g', 'c', 'a', 'a', 't', 'c', 'c', 'g', 'g', 'g', 'g', 'g', 'c', 'c', 'c', 'c', 't', 'a', 'g', 'c', 'c', 'a', 't', 't', 'g', 'c', 'g', 'g', 'g', 't', 'a', 't', 'a', 'c', 'c', 'a', 't', 't', 'a', 'c', 'a', 'a', 'g', 't', 'g', 't', 'a', 'g', 'a', 'g', 't', 'c', 'a', 'a', 'g', 't', 'a', 'g', 'g', 't', 'c', 'a', 't', 'c', 't', 'c', 't', 't', 'a', 'c', 'g', 'g', 't', 'g', 'g', 'c', 'g', 'g', 'c', 'g', 't', 'g', 'g', 'c', 'a', 't', 'c', 'c', 'c', 'g', 't', 'g', 'c', 'g', 'a', 'c', 't', 'c', 'c', 'a', 't', 'c', 'c', 't', 'a', 'a', 'c', 'g', 't', 'c', 'g', 'a', 'a', 'c', 'g', 'a', 't', 'g', 'c', 't', 'a', 'c', 'c', 't', 'a', 't', 'a', 'c']
second ['c', 'c', 'c', 'g', 'a', 'g', 'g', 'a', 'g', 'g', 't', 'c', 'g', 'c', 'a', 'g', 'a', 't', 't', 't', 'c', 'c', 'a', 'c', 'g', 'a', 't', 'c', 'a', 'g', 'c', 'g', 'g', 'g', 't', 'a', 'a', 't', 'g', 'a', 'g', 'a', 'c', 'c', 'c', 'a', 't', 'a', 'c', 'c', 't', 'g', 'a', 'a', 't', 'c', 't', 'c', 't', 'c', 'g', 't', 'a', 't', 'c', 'c', 'a', 'a', 't', 'g', 'c', 't', 'c', 'g', 'g', 'c', 'c', 't', 'c', 't', 'c', 't', 'a', 'g', 't', 't', 't', 'a', 't', 'a', 't', 'g', 'c', 't', 'g', 'c', 'g', 't', 'c', 'g', 'c', 't', 'c', 't', 't', 't', 'a', 't', 'g', 'c', 't', 'a', 'g', 'c', 'a', 't', 'g', 'c', 'c', 'a', 'c', 'c', 'c', 't', 'c', 'a', 'g', 't', 'g', 'c', 't', 'g', 'c', 'a', 't', 'g', 't', 'a', 'g', 'g', 'a', 't', 't', 'a', 't', 'a', 'c', 't', 't', 'g', 'g', 't', 'c', 'a', 'a', 'a', 'c', 'a', 'a', 'g', 't', 'g', 'g', 'c', 'g', 't', 't', 'a', 't', 'a', 't', 'a', 'g', 'g', 'a', 'a', 'a', 'g', 't', 'c', 'g', 'c', 'c', 'g', 'c', 'g', 'c', 't', 'c', 'a', 'g', 'a', 'g', 'g', 'g', 'g', 'c', 'g', 'a', 'g', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 't', 'a', 't', 'c', 't', 'g', 'a', 'a', 't', 'c', 't', 'a', 'c', 'c', 'c', 'c', 'c', 'a', 'c', 't', 't', 'c', 'c', 'c', 'c', 'g', 'c', 'g', 't', 't', 'g', 't', 'a', 'c', 'a', 't', 'c', 't', 'g', 'a', 't', 't', 'c', 't', 't', 'c', 'a', 'c', 't', 't', 'a', 'c', 'g', 'c', 'a', 'c', 'g', 'a', 'a', 'g', 't', 't', 't', 'a', 'g', 'a', 'c', 'a', 't', 'g', 'c', 'c', 'a', 'a', 'a', 'g', 'g', 'a', 't', 't', 'g', 'a', 'g', 't', 'g', 't', 'g', 'a', 'a', 'a', 't', 't', 't', 'g', 'g', 'c', 'g', 't', 'c', 't', 't', 'a', 't', 't', 't', 't', 'g', 'a', 'c', 'a', 't', 'a', 't', 't', 'g', 'g', 'a', 't', 'a', 'g', 'c', 'a', 'g', 'g', 'a', 'c', 'a', 't', 't', 't', 'g', 'c', 't', 'c', 't', 't', 'a', 'a', 'g', 'c', 'g', 'g', 't', 'g', 'c', 't', 't', 'g', 'g', 'a', 't', 'c', 'g', 'a', 'g', 'a', 't', 't', 't', 'a', 'a', 'c', 'a', 'g', 'a', 'a', 'c', 'a', 'c', 'c', 'a', 'a', 'a', 'c', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 'c', 't', 'a', 't', 'c', 'c', 't', 'c', 
't', 'c', 't', 'g', 't', 'g', 't', 't', 't', 'a', 't', 'c', 't', 'a', 'a', 't', 't', 'c', 't', 'a', 'g', 't', 't', 'a', 'g', 'c', 't', 't', 'a', 'c', 'a', 'c', 't', 'g', 'g', 't', 'a', 't', 'c', 'c', 't', 'g', 'g', 't', 'g', 'g', 'a', 't', 'a', 'g', 't', 'g', 'a', 'a', 'a', 't', 'a', 'g', 'g', 'g', 'g', 't', 'c', 'g', 't', 'c', 'a', 't', 't', 't', 'a', 'g', 'a', 't', 'g', 'a', 't', 't', 'c', 'g', 'g', 'a', 'c', 't', 't', 't', 't', 'g', 'c', 'a', 'a', 'c', 'g', 't', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'a', 'c', 'a', 'g', 'a', 'g', 'g', 'g', 't', 'c', 't', 't', 'g', 'c', 'g', 'a', 'a', 't', 'c', 'a', 't', 't', 'g', 't', 'a', 't', 't', 'c', 'c', 't', 'c', 't', 't', 't', 'a', 'c', 'c', 't', 'c', 'a', 'a', 't', 'g', 'g', 't', 't', 'g', 'c', 'g', 'a', 'c', 'a', 'g', 'g', 'c', 'g', 'g', 't', 'c', 't', 'a', 'g', 'c', 'c', 'c', 'g', 't', 'a', 'g', 'c', 'c', 'c', 'a', 'a', 'c', 'g', 'g', 'g', 'g', 'c', 'a', 't', 'c', 'a', 't', 'a', 'a', 'c', 't', 'c', 'g', 'c', 'a', 't', 't', 'c', 'a', 'a', 't', 't', 't', 'c', 'g', 'g', 't', 'c', 't', 'g', 't', 't', 'c', 'a', 't', 't', 'c', 'c', 't', 'a', 'g', 'a', 'c', 't', 't', 'g', 'a', 'c', 'a', 't', 't', 'g', 'g', 'c', 't', 'c', 'c', 'g', 'g', 'c', 't', 'a', 't', 'c', 'g', 't', 'g', 't', 'c', 'g', 't', 'a', 'g', 't', 'c', 'a', 't', 'c', 'g', 'g', 'a', 'g', 'a', 't', 'a', 'c', 'g', 'g', 'c', 'a', 'c', 'g', 'c', 'g', 'c', 'g', 'g', 'c', 't', 't', 'a', 'c', 'a', 'c', 'c', 'g', 't', 'g', 'g', 'c', 'g', 'c', 't', 'g', 't', 'c', 'a', 'c', 'c', 't', 't', 'a', 'a', 'a', 'g', 'c', 'g', 'a', 'c', 't', 't', 'c', 'c', 'c', 'a', 'g', 'a', 'a', 'c', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 'c', 'g', 't', 'c', 'g', 'a', 'a', 't', 't', 't', 't', 'a', 'g', 'g', 'a', 't', 'c', 't', 't', 'a', 'a', 'g', 'g', 't', 'a', 'a', 't', 'a', 'c', 'g', 'c', 'c', 't', 'a', 't', 't', 't', 'a', 'a', 'a', 't', 't', 'a', 'a', 't', 'c', 'c', 'a', 't', 'a', 'g', 't', 'g', 'c', 'c', 't', 'c', 'a', 't', 'c', 't', 'c', 't', 'g', 'c', 'a', 'a', 't', 'c', 'g', 't', 'c', 
'c', 't', 't', 'g', 't', 'c', 't', 'a', 'a', 't', 'a', 'c', 'c', 'a', 'g', 'c', 't', 't', 'a', 'g', 'a', 'a', 't', 'g', 't', 'g', 't', 'c', 'g', 't', 'g', 'g', 'c', 't', 't', 'a', 'c', 't', 't', 'a', 'c', 'a', 'a', 't', 'g', 'a', 'a', 'g', 't', 'g', 'a', 'g', 't', 'a', 'a', 'c', 'g', 'g', 'a', 't', 't', 't', 'a', 'a', 'c', 't', 'g', 'c', 'c', 'c', 'c', 'g', 'g', 't', 'c', 'g', 't', 't', 'a', 'g', 'g', 'c', 'c', 'c', 'c', 'c', 'g', 'g', 'g', 'g', 'a', 't', 'c', 'g', 'g', 't', 'a', 'a', 'c', 'g', 'c', 'c', 'c', 'a', 't', 'a', 't', 'g', 'g', 't', 'a', 'a', 't', 'g', 't', 't', 'c', 'a', 'c', 'a', 't', 'c', 't', 'c', 'a', 'g', 't', 't', 'c', 'a', 't', 'c', 'c', 'a', 'g', 't', 'a', 'g', 'a', 'g', 'a', 'a', 't', 'g', 'c', 'c', 'a', 'c', 'c', 'g', 'c', 'c', 'g', 'c', 'a', 'c', 'c', 'g', 't', 'a', 'g', 'g', 'g', 'c', 'a', 'c', 'g', 'c', 't', 'g', 'a', 'g', 'g', 't', 'a', 'g', 'g', 'a', 't', 't', 'g', 'c', 'a', 'g', 'c', 't', 't', 'g', 'c', 't', 'a', 'c', 'g', 'a', 't', 'g', 'g', 'a', 't', 'a', 't', 'g']
third cccgaggaggtcgcagatttccacgatcagcgggtaatgagacccatacctgaatctctcgtatccaatgctcggcctctctagtttatatgctgcgtcgctctttatgctagcatgccaccctcagtgctgcatgtaggattatacttggtcaaacaagtggcgttatataggaaagtcgccgcgctcagaggggcgagtaaacccctatctgaatctacccccacttccccgcgttgtacatctgattcttcacttacgcacgaagtttagacatgccaaaggattgagtgtgaaatttggcgtcttattttgacatattggatagcaggacatttgctcttaagcggtgcttggatcgagatttaacagaacaccaaacactagcaactatcctctctgtgtttatctaattctagttagcttacactggtatcctggtggatagtgaaataggggtcgtcatttagatgattcggacttttgcaacgttgtataatacagagggtcttgcgaatcattgtattcctctttacctcaatggttgcgacaggcggtctagcccgtagcccaacggggcatcataactcgcattcaatttcggtctgttcattcctagacttgacattggctccggctatcgtgtcgtagtcatcggagatacggcacgcgcggcttacaccgtggcgctgtcaccttaaagcgacttcccagaacgtggttaagcgtcgaattttaggatcttaaggtaatacgcctatttaaattaatccatagtgcctcatctctgcaatcgtccttgtctaataccagcttagaatgtgtcgtggcttacttacaatgaagtgagtaacggatttaactgccccggtcgttaggcccccggggatcggtaacgcccatatggtaatgttcacatctcagttcatccagtagagaatgccaccgccgcaccgtagggcacgctgaggtaggattgcagcttgctacgatggatatg
And all the rest gtataggtagcatcgttcgacgttaggatggagtcgcacgggatgccacgccgccaccgtaagagatgacctacttgactctacacttgtaatggtatacccgcaatggctaggggcccccggattgctggccccgtcaatttaggcaatgagtgaagtaacattcattcggtgctgtgtaagattcgaccataatctgttcctgctaacgtctctactccgtgatacctaattaaatttatccgcataatggaattctaggattttaagctgcgaattggtgcaagacccttcagcgaaattccactgtcgcggtgccacattcggcgcgcacggcatagaggctactgatgctgtgctatcggcctcggttacagttcagatccttacttgtctggctttaacttacgctcaatactacggggcaacccgatgcccgatctggcggacagcgttggtaactccatttctccttatgttactaagcgttctgggagacataatatgttgcaacgttttcaggcttagtagatttactgctggggataaagtgataggtggtcctatggtcacattcgattgatcttaatctatttgtgtctctcctatcaacgatcacaaaccacaagacaatttagagctaggttcgtggcgaattctcgtttacaggacgataggttatacagttttattctgcggtttaaagtgtgagttaggaaaccgtacagatttgaagcacgcattcacttcttagtctacatgttgcgccccttcacccccatctaagtctatccccaaatgagcggggagactcgcgccgctgaaaggatatattgcggtgaacaaactggttcatattaggatgtacgtcgtgactcccaccgtacgatcgtatttctcgctgcgtcgtatatttgatctctccggctcgtaacctatgctctctaagtccatacccagagtaatgggcgactagcacctttagacgctggaggagccc
None
def count(dna, base):
    """Return how many times ``base`` occurs in ``dna``.

    Delegates to the ``count`` method of ``dna``.  For strings this counts
    non-overlapping occurrences, so ``base`` may also be a longer motif.
    """
    occurrences = dna.count(base)
    return occurrences

count(dna, "atgc")
4
def count(dna, base):
    """Return how many times ``base`` occurs in ``dna``.

    Delegates to the ``count`` method of ``dna``.  For strings this counts
    non-overlapping occurrences, so ``base`` may also be a longer motif.
    """
    occurrences = dna.count(base)
    return occurrences

count(dna, "atg")
15
def count(dna, base):
    """Return how many times ``base`` occurs in ``dna``.

    Delegates to the ``count`` method of ``dna``.  For strings this counts
    non-overlapping occurrences, so ``base`` may also be a longer motif.
    """
    occurrences = dna.count(base)
    return occurrences

count(dna, "at")
75

Read and manipulate fasta sequences from a local file

# Read the whole file inside a with-block so the handle is closed again as
# soon as the text has been loaded.
with open('hv_nt_seq.fasta') as hv_fasta:
    hv_text = hv_fasta.read()
hv_text
'>DQ286061.1 Hydra vulgaris mitochondrial succinate dehydrogenase flavoprotein subunit mRNA, partial cds; nuclear gene for mitochondrial product\nTGTTGTGCTGTTGCTGATCGAACTGGTCATTCACTACTTCATACTCTTTATGGACAGTCATTGCGATACG\nATTGTAACTACTTTATAGAATACTTTGCATTGGATTTGTTGATGGATAAAGGAAAATGTGTTGGGATAAT\nTGCATTAAATCTTGAAGATGGATCTTTGCATAGGATTAAAGCAAAAAATACCGTCCTTGCAACCGGTGGT\nTCTGGAAGAACGTATTTCTCATGTACTTCAGCCCATACATGCACAGGAGATGGCACTGCTATGGTTACAA\nGAGCTGGTCTTGCAAATGAAGATTTAGAGTTCATTCAGTTTCATCCTACTGGTATTTATGGAGCTGGTTG\nTCTCATCACAGAAGGTTGTAGAGGAGAAGGAGGCTACTTGATTAATAGCGAGGGTGAACGCTTTATGGAA\nAGATATGCTCCTACTGCAAAGGATCTTGCCTCAAGAGATGTTGTTTCTCGATCGATGACAATTGAGATGA\nGAGAAGGGCGTGGATGTGGACCTGAAAAAGATCATGTATATTTACAATTGTCTCATCTTCCCCAAGAGAT\nACTTAAATCTCGTCTTCCTGGAATTTCTGAGACAGCCATGATATTTGCTGGTGTTGATGTAACTCGTGAT\nCCTATACCTGTTCTTCCAACTTGCCATTACAATATGGGTGGAATACCAACCAACTTTAACGGACAGGTAA\nTACAACATCATAATGGTAAAGATGTTATTGTGGAAGGTTTGTATGCTGCAGGTGAAGCTGCTTGTGCTTC\nAGTTCATGGTGCTAACCGTCTGGGAGCTAATTCTTTGCTCGATTTAGTCATATTTGGTCGTGCTTGTGCC\nTTAGATATTGCTGCCAAAAATAAGCCTGGAGACAGCATTCCTGATTTACCCAGTGATATTGGTGAAGTAT\nCTGTGGCCAATCTTGATAAGGTTCGGTTTGCCAATGGACACACACCAACTGCAAATTTGAGATTAAAGAT\nGCAAAAGATTATGCAAGGACATGCAGCTGTATTCAGGACTGGTGCTGTCTTGGCAGAAGGAGTGTCAAAG\nATTTATCAGGCTTATGATGAGTTGAAGGATCTTAAGCTTTATGACCGTGGTATGATATTGAATACAGATC\nTTGTTGAAGCTCTGGAACTTCAAAATTTAATGTTAAACTCCTGTCTAGCAATGGTGTCTGCAGAAGCCAG\nAAAAGAAAGTCGTGGTGCGCTT\n\n>DQ286057.1 Hydra vulgaris myosin heavy chain mRNA, partial 
cds\nTGACAACTTTCTGCTGTTGGGCGCCGGAAAGGTAGTGCTTTTCAAACTGTATCATTTCGTCACAAGGAAC\nAACTAAAAAATTTGCTTACTACTCTTGGAATGACTAGTCCTCATTTCGTAAGATGTATCATTCCTAATGA\nAAAGAAGGAACCAGGAGTTGTTGAGGGCCAACTTGTTCTTCATCAGTTGAGGTGTAATGGTGTCTTGGAA\nGGTATTCGCATATGTAGAAAAGGTTTTCCATCCAGAATGAATTTTCAAGATTTTAAGTTAAGGTACCAAA\nTACTAGCATCTAATGCGATCCCACCTGGTTTTATTGATGGCAAAGTAGCAGCTGAAAAATTAATTGAGGC\nACTTCAACTAGATCAAAGTGAATACAGAGTAGGAAAGACAAAAATATTCTTTAGAGCTGGTATTGTGGGA\nGAGTTAGAAGAAATGCGCGATGAGCGATTATCTAAAATTATTTCACAGTTCCAAGCATACTGTAAGGGCA\nGTATTATGCGCAGTGAATATAAAAAGATGGTGGCACAGCGTATTGGTCTAGCTGTTATTCAAAGAAATGT\nCAGAAAGTATTTATTCTTGCGCCATTGGTCTTGGTGGAAGTTGTACACTAAGGTTCAACCTTTATTGAGT\nGTTGCACGAGCAGAGGATGAAATGAGAGCAAAAGAAGAAGAGTTAGAAGCTGCTAAAGAACAATTAAAAA\nAAGATGCAGAAGCTAAGAAAAAAATGGAAGAAGAACTGACTGAGGCTATGGCTCAAAAAGAAAAACTTTA\nTGCAAGTTTACAAGCTGAGACTGACAGATTAATTACAATTGAAGACAAGCTTCTCAATCTGCAAACAGTT\nAAGGATAAACTTGAAAGTAGTCTAAATGAAGCATTAGAAAAGCTGGATGGAGAAGAACATAGTGTTTTAG\nTTCTTGAAGAAAAGATTCAAGAAGCAGAAGAAAAAATTGACGAACTTACTGAAAAGACTGAGGAACTCCA\nATCAAACATTAGTCGACTTGAAACTGAAAAACAAAATCGTGATAAACAAATTGATACCTTGAATGAAGAT\nATTCGCAAGCAAGATGAAACTATCTCTAAAATGAATGCAGAAAAGAAGCATGTAGATGAGGAGTTGAAAG\nATCGCACTGAACAACTACAGGCTGCTGAGGATAAATGCAACAACCTCAATAAAACAAAGAATAAATTAGA\nATCTTCTATTAGAGAGATTGAACAAGATTTAAAGAAAGAAAAAGACAGTAAAATGAAGTTAGAAAAAGAA\nAAAAAGAAAGTTGAGTCAGATCTTAAAGACAATCGAGATAAACTTTCAGAAACAGAAACTCGTCTAAAAG\nAAACTCAGGATCTTGTAACTAAACGAGAAAAGTCAATATCCGATTTAGAAAATGCAAAAGAAGGTCTTGA\nATCACAGATTAGTCAACTCCAAAGAAAAATACAAGAACTTCTTGCTAAAATTGAAGAATTAGAAGAAGAG\nCTTGAAAACGAAAGAAAGTTGAGGCAGAAATCAGAGCTACAAAGAAAAGAGTTAGAGTCAAGAATTGAGG\nAATTGCAAGACCAACTTGAAACAGCAGGCGGTGCTACATCAGCTCAAGTTGAAGTTGGTAAAAAACGTGA\nAGCTGAATGTAATCGCCTTAGAAAAGAGATTGAAGCCCTTAACATAGCAAATGATGCTGCCATCTCAGCT\nATTAAAGCAAAAACAAATGCTACAATAGCAGAAATTCAAGAGGAAAATGAAGCAATGAAAAAAGCAAAAG\nCAAAACTTGAGAAAGAAAAAAGTGCACTTAATAATGAATTAAATGAAACTAAAAACTCGCTTGATCAAAT\nTAAGAAGCAAAAAACTAATAGTGACAAGAACTCCCGTATGCTTGAAGAACAAATCAATGAACTAAACAGC\nAAGTTGGCTCAAGTTGATGAATTACATTCTCAAAGTGAGTCAAAGAATTCT
AAAGTTAACAGTGAGTTGT\nTGGCTCTTAACAGTCAATTGAGCGAATCAGAACATAATTTGGGAATAGCTACTAAAAATATAAAAACTTT\nAGAAAGTCAACTTGCAGAAAGTAAAAATTTTAATGAAGCTGAATCAAAGGCTAAACTTGAGAATTACAAC\nAGCTCGAATGCTTTT\n\n>DQ286058.1 Hydra vulgaris chitinase mRNA, partial cds\nATTGGTCGTTATTGCGGTCAAGGACGTTACCCATTAATGTCTTCAGTGGGTAAACTTCTTGGTGGATACG\nTTCCTCCCGTTGAGCCTACGTTCTCCCCAACCACAAAAGGACCATCAACGCCTAGCAAAAGTAGTACTGC\nCACTGATCGTCCTGCAACAAACCCCCCAACTGGAGCATGTAAGGCAATCGATGCAAGAGTAAAAGATCAA\nTGGTGTAATGATAACTGTCCCAAAGGATATTGCCCTACTGAGTTTTGTAAATGTTAAATAAATAAAAAAA\nGTGTTAACTTATTGCTCTTAAAAAAAAAAAAAAAAAAA\n\n>DQ286055.1 Hydra vulgaris putative solute carrier family 30 mRNA, partial cds\nGAAGTTTAGAATTCTTCCAGACTACTTATTAACAGATTATACTGACAGTAGATGAAAGAAGTTAGTCGGT\nTTACAAATAGTCAAGGAATTAAGATGAAATGTAAGTTTGGCCGAAATGCTACCTTTATTTTAATGTTGGT\nATAACAATGTCATTTTTTATTGTGGAGCTTGTTGTTGGTTATATGACTAAATCAATGGCATTGGTTGCTG\nACTCCTTTCAGATGTTATCGGATACAGTCTCTATTATTGTTGGCTTTGTTGCTTTTCACTGTTCGAAGCG\nTAGTGAAACCTCCAGCCGATTTACATATGGCTGGGTTCGTGCTGAAATACTTGGAGCTTTAGTTAATTCA\nGTATTTCTTGCTGCTCTTTGTTTTACAATTCTCATAGAATCATTTAAGCGGTCTGCTATTCCAGAAAGAG\nTTGAAAATCCTAAACTTGTTCTTATAGTCGGAGCAGTTGGTTTGCTTGTTAATATAATTGGGTTGTTTCT\nTTTTAATCACCACAGTAATGGCCATTCAAATAATAGTGAATCTGTTGAAAAAGGACATAATAATGAAGTT\nGTAGACAATATTGTTGCTGAATTCCCATTAGTTGATAGTAGTGAAGTGGTTATTTATGATAGTGATAAAA\nGCAATTCCCAAGTACCTCAAGTTGTAAGTAATAATGAGAATAGTAAAAAAAAATTAGGAGCATCTCGTCT\nAAATATTCGTGGAGTTTATTTGAATATTCTTGGAGATGCTTTAGGGTCAG\n\n>DQ286054.1 Hydra vulgaris putative solute carrier family 39 member 1 mRNA, partial 
cds\nTTCTTCAATATTTTAATCAACACAGAAAAAAAATATTTTGACTGTAAAAAAGAAGAATCCTTTTTTAGCT\nATGGGTTTTTATGAATTACCAGAATGGTCAGTTAAACTAATTATAATTATTATTTTATTTTTATTGGGCA\nTGATATTTGGTGTTGTACCATTAAATCTTTCACGAAGCTCCTCATTTGAAGGAAGAGTTTCTCCTACTCG\nCAATCTGCTTATTAGTTTGTCAAATTGCTTTGCTGGAGGGGTGTTTTTTAGCACTGTTATTCTTGATTTA\nTTTCCATTGGTAAAGTTAACAGTAAATAATGCACTAATATCTGTTTATATTGATACTGATTTTCCGCTAG\nGGGATTTTATTATTGGTATTGGATTTATCTTTATGTTAATTTTAGAGCATATAGTTCATTCTTGTTGCCA\nTCCTAATCAGTTATCTTATGAAGCTCCTAAAAATGTTAATAGTAACCAGGATGAATTATCATGTAATGAA\nAATAATCATCTTTTATCTCATGACAACAACTTTGATGTAGTTACTGATATTGAGATAAACACTTCAGAAA\nGACAGCTGCAACAA\n\n>DQ286051.1 Hydra vulgaris ribosomal protein S19 mRNA, partial cds\nCAGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAA\nGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGT\nTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCG\nTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATT\nGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCT\nTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATAT\nTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA\n\n>DQ286050.1 Hydra vulgaris ribosomal protein S9 mRNA, partial cds\nCATGCAATGGTCTTGTTCGTATTGGAGTACTTGATGAAGGAAGAATGAAGCTAGATTATGTTTTAGGTTT\nAAAAGTAGAAGATTTTTTGGAAAGACGTCTACAAACTCAAGTGCTTAAGTTGGGTCTCGCTAAGTCTATT\nCACCATGCTCGTGTTCTTATCCGACAAAAGCATATTAGAGTGCGAAAGCAGTTAGTCAACATCCCATCAT\nTTATCGTGAGACTTGACTCTCAAAAGCACATAGATTTCAGTACTAATTCACCATTCGGTGGTGGTCGACC\nAGGACGTGTTTCACGAAAGAACATGAAGAAAGGTGGCAGTGGAGGAAACGATGAAGAAGACGAAGATGAA\nTAGATTATATTGAAATCTGGCATGTGATTGTTTTGTTAGGCGGTTAATAAAGATCATTTGTCAAATAAAT\nCTAAATACTGTACAATAAAAAAAAAAAAAAAAAAA\n\n>DQ286049.1 Hydra vulgaris ribosomal protein S7 mRNA, partial 
cds\nACATATGGCTGGGTTCGTGAGCTAGAGAAAAAGTTTTCTGGAAAGCATGTCATTGTTGTTGGACAGAGAA\nGAATCTTGCCTAAACCCAGTCGTAAGACAAGAAATCAAAAGCAAATGAGACCAAGAAGTCGTACTCTAAC\nTGCTGTGCACGATGCCATTCTTGAAGATCTTTGTTTCCCATCGGAAATCGTTGGTAAAAGCATTCGAGTT\nAAATTAGATGGTTCAAGATTGATAAAAATAGTTTTAGAAAAAGCTCAGCAAACAAATGTTGAACATAAAC\nTTGACACGTTTGCAAATGTTTACAAGAAACTAACTGGTAAAGACACTCATTTTACTTTCGAAATATAAGT\nCATAACAGAGAAAAAAAAAAAAAAAAAA\n\n>NM_001309772.1 Hydra vulgaris phospholipid hydroperoxide glutathione peroxidase, mitochondrial-like (LOC100215761), mRNA\nATGGCTGCATCAGACCCTACAAAAGCTTCTTCTATATTTGAATTTCAAGCAAAAAGTATAGATGGTGAAG\nATATCAGTCTTTCGAAATATAAAGGTTTTGTTACACTTATTGTTAACGTGGCTAGCAAGGGTTTAACTGA\nACTCAACTATGCTCAGCTTGCTGATCTGCACACCAAGTATGCTGAGAAAGGTCTTCGAATTCTTGCTTTT\nCCTTGTAATCAGTTTGGTAACCAAGAGCCTGGTACAGATTTAGAAATAAAAGCGTTTGCATTAGCGCGAG\nGCGCCCACTATGACTTATTCAGTAAAATTGATGTTAATGGAGATAAGGCAGATCCTCTGTATAAATATTT\nGAAATCAAAGCAGAAAGGTATTTTGGGTAATAAAATCAAATGGAATTTTTCAAAGTTTATTTGTGATAAA\nAACGGTATCCCTGTTAAAAGATATGCTCCTACAACAGAACCTTTGTCATTAGTTCCAGATATCGAAAAGT\nATTTATGCCAATAA\n\n>NM_001309736.1 Hydra vulgaris superoxide dismutase [Mn], mitochondrial-like (LOC100209764), mRNA\nATGTTTTCTTTTGGAATCCACCGCCTTTCAGTTTTTCGAAAAATATCGAGAATAGCATTTGCTAATAAGC\nACACTCTTCCAGAATTGGGGTATGAATATAATGCATTGGAACCAACAATCAGCAGTCAAATTATGGAGAT\nACATCATCGCAAACACCACCAAGCTTATGTAAATAACTTAAATACAGCAGAAGAACAGTTAGCTGAAGCT\nCAGCATAAAGGAGATACGTCAAAGATTATTTCTTTAGCTCCTGCGTTAAAATTCAATGGAGGTGGGCACA\nTCAATCATTCCATTTTTTGGACTAATCTTTCGCCAAACGGTGGAGGAAAACCAACAGGTGAACTATTAGA\nAGCCATATTAAAAGACTTTGGGTCTTTTGAGGCAATGAAAACACGGTTATCGTCTCCAGCTGTTGCAGTG\nCAAGGTTCGGGTTGGGGTTGGTTGGGATACGATTCTGTCACTAAAAGACTTGCAATTACAGCTTTACCTA\nATCAAGATCCTTTGCAAGCTACTACTGGGTTAATACCGTTACTCGGTATTGATGTTTGGGAGCATGCGTA\nCTACTTGCAGTATAAGAATGTTCGTCTTGATTATGTCAACGCAATATTTAACATCATTGATTGGAAAAAT\nGTATCCGCAAGGTTTGTCGCAGCTAAATAA\n\n'
# Parse the FASTA file into a dict mapping record id -> sequence string.
hv_seqs = {}
name = None

# the with-block closes the file even if parsing fails part-way through
with open('hv_nt_seq.fasta') as hv_fasta:
    for lines in hv_fasta:
        lines = lines.rstrip()
        if lines.startswith('>'):
            # header line: the record id is its first word without the '>'
            words = lines.split()
            name = words[0][1:]
            hv_seqs[name] = ''
        elif name is not None:
            # sequence line: append to the current record; anything that
            # appears before the first header is ignored
            hv_seqs[name] = hv_seqs[name] + lines

# a separate loop variable keeps hv_seqs bound to the dictionary
for name, seq in hv_seqs.items():
    print(name, seq)
DQ286061.1 TGTTGTGCTGTTGCTGATCGAACTGGTCATTCACTACTTCATACTCTTTATGGACAGTCATTGCGATACGATTGTAACTACTTTATAGAATACTTTGCATTGGATTTGTTGATGGATAAAGGAAAATGTGTTGGGATAATTGCATTAAATCTTGAAGATGGATCTTTGCATAGGATTAAAGCAAAAAATACCGTCCTTGCAACCGGTGGTTCTGGAAGAACGTATTTCTCATGTACTTCAGCCCATACATGCACAGGAGATGGCACTGCTATGGTTACAAGAGCTGGTCTTGCAAATGAAGATTTAGAGTTCATTCAGTTTCATCCTACTGGTATTTATGGAGCTGGTTGTCTCATCACAGAAGGTTGTAGAGGAGAAGGAGGCTACTTGATTAATAGCGAGGGTGAACGCTTTATGGAAAGATATGCTCCTACTGCAAAGGATCTTGCCTCAAGAGATGTTGTTTCTCGATCGATGACAATTGAGATGAGAGAAGGGCGTGGATGTGGACCTGAAAAAGATCATGTATATTTACAATTGTCTCATCTTCCCCAAGAGATACTTAAATCTCGTCTTCCTGGAATTTCTGAGACAGCCATGATATTTGCTGGTGTTGATGTAACTCGTGATCCTATACCTGTTCTTCCAACTTGCCATTACAATATGGGTGGAATACCAACCAACTTTAACGGACAGGTAATACAACATCATAATGGTAAAGATGTTATTGTGGAAGGTTTGTATGCTGCAGGTGAAGCTGCTTGTGCTTCAGTTCATGGTGCTAACCGTCTGGGAGCTAATTCTTTGCTCGATTTAGTCATATTTGGTCGTGCTTGTGCCTTAGATATTGCTGCCAAAAATAAGCCTGGAGACAGCATTCCTGATTTACCCAGTGATATTGGTGAAGTATCTGTGGCCAATCTTGATAAGGTTCGGTTTGCCAATGGACACACACCAACTGCAAATTTGAGATTAAAGATGCAAAAGATTATGCAAGGACATGCAGCTGTATTCAGGACTGGTGCTGTCTTGGCAGAAGGAGTGTCAAAGATTTATCAGGCTTATGATGAGTTGAAGGATCTTAAGCTTTATGACCGTGGTATGATATTGAATACAGATCTTGTTGAAGCTCTGGAACTTCAAAATTTAATGTTAAACTCCTGTCTAGCAATGGTGTCTGCAGAAGCCAGAAAAGAAAGTCGTGGTGCGCTT
DQ286058.1 ATTGGTCGTTATTGCGGTCAAGGACGTTACCCATTAATGTCTTCAGTGGGTAAACTTCTTGGTGGATACGTTCCTCCCGTTGAGCCTACGTTCTCCCCAACCACAAAAGGACCATCAACGCCTAGCAAAAGTAGTACTGCCACTGATCGTCCTGCAACAAACCCCCCAACTGGAGCATGTAAGGCAATCGATGCAAGAGTAAAAGATCAATGGTGTAATGATAACTGTCCCAAAGGATATTGCCCTACTGAGTTTTGTAAATGTTAAATAAATAAAAAAAGTGTTAACTTATTGCTCTTAAAAAAAAAAAAAAAAAAA
DQ286054.1 TTCTTCAATATTTTAATCAACACAGAAAAAAAATATTTTGACTGTAAAAAAGAAGAATCCTTTTTTAGCTATGGGTTTTTATGAATTACCAGAATGGTCAGTTAAACTAATTATAATTATTATTTTATTTTTATTGGGCATGATATTTGGTGTTGTACCATTAAATCTTTCACGAAGCTCCTCATTTGAAGGAAGAGTTTCTCCTACTCGCAATCTGCTTATTAGTTTGTCAAATTGCTTTGCTGGAGGGGTGTTTTTTAGCACTGTTATTCTTGATTTATTTCCATTGGTAAAGTTAACAGTAAATAATGCACTAATATCTGTTTATATTGATACTGATTTTCCGCTAGGGGATTTTATTATTGGTATTGGATTTATCTTTATGTTAATTTTAGAGCATATAGTTCATTCTTGTTGCCATCCTAATCAGTTATCTTATGAAGCTCCTAAAAATGTTAATAGTAACCAGGATGAATTATCATGTAATGAAAATAATCATCTTTTATCTCATGACAACAACTTTGATGTAGTTACTGATATTGAGATAAACACTTCAGAAAGACAGCTGCAACAA
NM_001309736.1 ATGTTTTCTTTTGGAATCCACCGCCTTTCAGTTTTTCGAAAAATATCGAGAATAGCATTTGCTAATAAGCACACTCTTCCAGAATTGGGGTATGAATATAATGCATTGGAACCAACAATCAGCAGTCAAATTATGGAGATACATCATCGCAAACACCACCAAGCTTATGTAAATAACTTAAATACAGCAGAAGAACAGTTAGCTGAAGCTCAGCATAAAGGAGATACGTCAAAGATTATTTCTTTAGCTCCTGCGTTAAAATTCAATGGAGGTGGGCACATCAATCATTCCATTTTTTGGACTAATCTTTCGCCAAACGGTGGAGGAAAACCAACAGGTGAACTATTAGAAGCCATATTAAAAGACTTTGGGTCTTTTGAGGCAATGAAAACACGGTTATCGTCTCCAGCTGTTGCAGTGCAAGGTTCGGGTTGGGGTTGGTTGGGATACGATTCTGTCACTAAAAGACTTGCAATTACAGCTTTACCTAATCAAGATCCTTTGCAAGCTACTACTGGGTTAATACCGTTACTCGGTATTGATGTTTGGGAGCATGCGTACTACTTGCAGTATAAGAATGTTCGTCTTGATTATGTCAACGCAATATTTAACATCATTGATTGGAAAAATGTATCCGCAAGGTTTGTCGCAGCTAAATAA
NM_001309772.1 ATGGCTGCATCAGACCCTACAAAAGCTTCTTCTATATTTGAATTTCAAGCAAAAAGTATAGATGGTGAAGATATCAGTCTTTCGAAATATAAAGGTTTTGTTACACTTATTGTTAACGTGGCTAGCAAGGGTTTAACTGAACTCAACTATGCTCAGCTTGCTGATCTGCACACCAAGTATGCTGAGAAAGGTCTTCGAATTCTTGCTTTTCCTTGTAATCAGTTTGGTAACCAAGAGCCTGGTACAGATTTAGAAATAAAAGCGTTTGCATTAGCGCGAGGCGCCCACTATGACTTATTCAGTAAAATTGATGTTAATGGAGATAAGGCAGATCCTCTGTATAAATATTTGAAATCAAAGCAGAAAGGTATTTTGGGTAATAAAATCAAATGGAATTTTTCAAAGTTTATTTGTGATAAAAACGGTATCCCTGTTAAAAGATATGCTCCTACAACAGAACCTTTGTCATTAGTTCCAGATATCGAAAAGTATTTATGCCAATAA
DQ286050.1 CATGCAATGGTCTTGTTCGTATTGGAGTACTTGATGAAGGAAGAATGAAGCTAGATTATGTTTTAGGTTTAAAAGTAGAAGATTTTTTGGAAAGACGTCTACAAACTCAAGTGCTTAAGTTGGGTCTCGCTAAGTCTATTCACCATGCTCGTGTTCTTATCCGACAAAAGCATATTAGAGTGCGAAAGCAGTTAGTCAACATCCCATCATTTATCGTGAGACTTGACTCTCAAAAGCACATAGATTTCAGTACTAATTCACCATTCGGTGGTGGTCGACCAGGACGTGTTTCACGAAAGAACATGAAGAAAGGTGGCAGTGGAGGAAACGATGAAGAAGACGAAGATGAATAGATTATATTGAAATCTGGCATGTGATTGTTTTGTTAGGCGGTTAATAAAGATCATTTGTCAAATAAATCTAAATACTGTACAATAAAAAAAAAAAAAAAAAAA
DQ286049.1 ACATATGGCTGGGTTCGTGAGCTAGAGAAAAAGTTTTCTGGAAAGCATGTCATTGTTGTTGGACAGAGAAGAATCTTGCCTAAACCCAGTCGTAAGACAAGAAATCAAAAGCAAATGAGACCAAGAAGTCGTACTCTAACTGCTGTGCACGATGCCATTCTTGAAGATCTTTGTTTCCCATCGGAAATCGTTGGTAAAAGCATTCGAGTTAAATTAGATGGTTCAAGATTGATAAAAATAGTTTTAGAAAAAGCTCAGCAAACAAATGTTGAACATAAACTTGACACGTTTGCAAATGTTTACAAGAAACTAACTGGTAAAGACACTCATTTTACTTTCGAAATATAAGTCATAACAGAGAAAAAAAAAAAAAAAAAA
DQ286057.1 TGACAACTTTCTGCTGTTGGGCGCCGGAAAGGTAGTGCTTTTCAAACTGTATCATTTCGTCACAAGGAACAACTAAAAAATTTGCTTACTACTCTTGGAATGACTAGTCCTCATTTCGTAAGATGTATCATTCCTAATGAAAAGAAGGAACCAGGAGTTGTTGAGGGCCAACTTGTTCTTCATCAGTTGAGGTGTAATGGTGTCTTGGAAGGTATTCGCATATGTAGAAAAGGTTTTCCATCCAGAATGAATTTTCAAGATTTTAAGTTAAGGTACCAAATACTAGCATCTAATGCGATCCCACCTGGTTTTATTGATGGCAAAGTAGCAGCTGAAAAATTAATTGAGGCACTTCAACTAGATCAAAGTGAATACAGAGTAGGAAAGACAAAAATATTCTTTAGAGCTGGTATTGTGGGAGAGTTAGAAGAAATGCGCGATGAGCGATTATCTAAAATTATTTCACAGTTCCAAGCATACTGTAAGGGCAGTATTATGCGCAGTGAATATAAAAAGATGGTGGCACAGCGTATTGGTCTAGCTGTTATTCAAAGAAATGTCAGAAAGTATTTATTCTTGCGCCATTGGTCTTGGTGGAAGTTGTACACTAAGGTTCAACCTTTATTGAGTGTTGCACGAGCAGAGGATGAAATGAGAGCAAAAGAAGAAGAGTTAGAAGCTGCTAAAGAACAATTAAAAAAAGATGCAGAAGCTAAGAAAAAAATGGAAGAAGAACTGACTGAGGCTATGGCTCAAAAAGAAAAACTTTATGCAAGTTTACAAGCTGAGACTGACAGATTAATTACAATTGAAGACAAGCTTCTCAATCTGCAAACAGTTAAGGATAAACTTGAAAGTAGTCTAAATGAAGCATTAGAAAAGCTGGATGGAGAAGAACATAGTGTTTTAGTTCTTGAAGAAAAGATTCAAGAAGCAGAAGAAAAAATTGACGAACTTACTGAAAAGACTGAGGAACTCCAATCAAACATTAGTCGACTTGAAACTGAAAAACAAAATCGTGATAAACAAATTGATACCTTGAATGAAGATATTCGCAAGCAAGATGAAACTATCTCTAAAATGAATGCAGAAAAGAAGCATGTAGATGAGGAGTTGAAAGATCGCACTGAACAACTACAGGCTGCTGAGGATAAATGCAACAACCTCAATAAAACAAAGAATAAATTAGAATCTTCTATTAGAGAGATTGAACAAGATTTAAAGAAAGAAAAAGACAGTAAAATGAAGTTAGAAAAAGAAAAAAAGAAAGTTGAGTCAGATCTTAAAGACAATCGAGATAAACTTTCAGAAACAGAAACTCGTCTAAAAGAAACTCAGGATCTTGTAACTAAACGAGAAAAGTCAATATCCGATTTAGAAAATGCAAAAGAAGGTCTTGAATCACAGATTAGTCAACTCCAAAGAAAAATACAAGAACTTCTTGCTAAAATTGAAGAATTAGAAGAAGAGCTTGAAAACGAAAGAAAGTTGAGGCAGAAATCAGAGCTACAAAGAAAAGAGTTAGAGTCAAGAATTGAGGAATTGCAAGACCAACTTGAAACAGCAGGCGGTGCTACATCAGCTCAAGTTGAAGTTGGTAAAAAACGTGAAGCTGAATGTAATCGCCTTAGAAAAGAGATTGAAGCCCTTAACATAGCAAATGATGCTGCCATCTCAGCTATTAAAGCAAAAACAAATGCTACAATAGCAGAAATTCAAGAGGAAAATGAAGCAATGAAAAAAGCAAAAGCAAAACTTGAGAAAGAAAAAAGTGCACTTAATAATGAATTAAATGAAACTAAAAACTCGCTTGATCAAATTAAGAAGCAAAAAACTAATAGTGACAAGAACTCCCGTATGCTTGAAGAACAAATCAATGAACTAAACAGCAAGTTGGCTCAAGTTGATGAATTACATTCTCAAAGTGAGTCAAAGAATTCTAAAGTTAACAGTGAGTTGTTGGCTCTTAACAGTCAATTGAGCGAATCA
GAACATAATTTGGGAATAGCTACTAAAAATATAAAAACTTTAGAAAGTCAACTTGCAGAAAGTAAAAATTTTAATGAAGCTGAATCAAAGGCTAAACTTGAGAATTACAACAGCTCGAATGCTTTT
DQ286055.1 GAAGTTTAGAATTCTTCCAGACTACTTATTAACAGATTATACTGACAGTAGATGAAAGAAGTTAGTCGGTTTACAAATAGTCAAGGAATTAAGATGAAATGTAAGTTTGGCCGAAATGCTACCTTTATTTTAATGTTGGTATAACAATGTCATTTTTTATTGTGGAGCTTGTTGTTGGTTATATGACTAAATCAATGGCATTGGTTGCTGACTCCTTTCAGATGTTATCGGATACAGTCTCTATTATTGTTGGCTTTGTTGCTTTTCACTGTTCGAAGCGTAGTGAAACCTCCAGCCGATTTACATATGGCTGGGTTCGTGCTGAAATACTTGGAGCTTTAGTTAATTCAGTATTTCTTGCTGCTCTTTGTTTTACAATTCTCATAGAATCATTTAAGCGGTCTGCTATTCCAGAAAGAGTTGAAAATCCTAAACTTGTTCTTATAGTCGGAGCAGTTGGTTTGCTTGTTAATATAATTGGGTTGTTTCTTTTTAATCACCACAGTAATGGCCATTCAAATAATAGTGAATCTGTTGAAAAAGGACATAATAATGAAGTTGTAGACAATATTGTTGCTGAATTCCCATTAGTTGATAGTAGTGAAGTGGTTATTTATGATAGTGATAAAAGCAATTCCCAAGTACCTCAAGTTGTAAGTAATAATGAGAATAGTAAAAAAAAATTAGGAGCATCTCGTCTAAATATTCGTGGAGTTTATTTGAATATTCTTGGAGATGCTTTAGGGTCAG
DQ286051.1 CAGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAAGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGTTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCGTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATTGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCTTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATATTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA
# `hv_seqs` was rebound to the last printed sequence string by the print loop
# above, so this slice shows that sequence without its first base.
hv_seqs[1:]
'AGCTTTAATGGCCCTCAAGAATTTGTAAAAGCTTTTGCTGGACACCTAAAAAAAGGTAATAAATTTAAAGTGCCAGAATTTGTTGAGATAGTAAAAACTTCGAAAGCTTGTGAACTTGGACCTTCTGATCCTGATTGGTTTTATATCCGTGCTGCAGCTGTTGCTAGACATATTTATTTAAGACCAAACCTTGGTGTTGGTGCTATTCGTAAAATTTATGGTCGTGCTCAAAGGAATGGAACAAGGCCATCACATTCATGCTTAGGATCAGCCTCAATTGCTCGAAAGGTCTTACAATCTCTAGAAGCAATGAAACTTGTTACTAAAGATGCAGCAGGTGGACGTAGCTTAACTCCTGCAGGTCGAAGGGATATGGATCGAATAGCTGGACAGGTTGTGAACAAAGTTTAAAAATATATTACAGAGTTAATATTAAAAAAAAAAAAAAAAAA'
# Keep the sliced sequence (first base dropped) for the examples that follow.
hv_seq_1 = hv_seqs[1:]
# Count the non-overlapping occurrences of the trinucleotide "AGC".
hv_seq_1.count("AGC")
9
def revs_complement(dna):
    """Return the reverse complement of a DNA string.

    Upper- and lower-case A/C/G/T are complemented with their case
    preserved, and the ambiguity code N/n maps to itself.

    Args:
        dna: string of nucleotide letters.

    Returns:
        The complemented sequence, read in reverse order.

    Raises:
        KeyError: if `dna` contains any other symbol.
    """
    pairs = {
        "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
        "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
    }
    # Walking the input backwards yields the result already reversed.
    return "".join(pairs[base] for base in reversed(dna))
    
# Reverse-complement the sequence stored in hv_seq_1.
revs_complement(hv_seq_1)
'TTTTTTTTTTTTTTTTTTAATATTAACTCTGTAATATATTTTTAAACTTTGTTCACAACCTGTCCAGCTATTCGATCCATATCCCTTCGACCTGCAGGAGTTAAGCTACGTCCACCTGCTGCATCTTTAGTAACAAGTTTCATTGCTTCTAGAGATTGTAAGACCTTTCGAGCAATTGAGGCTGATCCTAAGCATGAATGTGATGGCCTTGTTCCATTCCTTTGAGCACGACCATAAATTTTACGAATAGCACCAACACCAAGGTTTGGTCTTAAATAAATATGTCTAGCAACAGCTGCAGCACGGATATAAAACCAATCAGGATCAGAAGGTCCAAGTTCACAAGCTTTCGAAGTTTTTACTATCTCAACAAATTCTGGCACTTTAAATTTATTACCTTTTTTTAGGTGTCCAGCAAAAGCTTTTACAAATTCTTGAGGGCCATTAAAGCT'
class dna_tool_sets ():
    """Load a multi-FASTA file and report simple statistics on its records."""

    def __init__(self, file_name):
        """Read `file_name` and store every record in `self.dict`.

        Args:
            file_name: path of the multi-FASTA file to read.

        Raises:
            ValueError: if sequence data appears before the first header.
        """
        self.file_name = file_name
        # Maps the full header line (leading ">" included) to its sequence.
        self.dict = {}
        header = None
        # The context manager closes the file even if parsing fails.
        with open(self.file_name) as f_reader:
            for line in f_reader:
                line = line.strip("\n")
                if line.startswith(">"):
                    # A header opens a new record with an empty sequence.
                    header = line
                    self.dict[header] = ""
                elif line:
                    if header is None:
                        raise ValueError(
                            "sequence data found before the first FASTA header")
                    self.dict[header] += line

    def count_records(self):
        """Print how many records were loaded from the file."""
        number_of_records = len(self.dict)
        print(" The number of records are in the multi-FASTA file: %d \n"
              % number_of_records)

    def check_length(self):
        """Print the longest and shortest sequence lengths and their counts.

        Raises:
            ValueError: if no records were loaded.
        """
        lengths = [len(sequence) for sequence in self.dict.values()]
        max_length = max(lengths)
        min_length = min(lengths)

        print(" The length of the longest sequence: %d \n" % max_length,
              "The number of longest sequence: %d \n" % lengths.count(max_length))

        print(" The length of the shortest sequence: %d \n" % min_length,
              "The number of shortest sequence: %d \n" % lengths.count(min_length))

    @staticmethod
    def revs_complement(dna):
        """Return the reverse complement of a DNA string.

        Declared static so it can be called on the class or on an instance.
        Upper- and lower-case A/C/G/T keep their case; N/n maps to itself.

        Raises:
            KeyError: if `dna` contains any other symbol.
        """
        pairs = {
            "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
            "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
        }
        return "".join(pairs[base] for base in reversed(dna))

    def find_repeats(self, dna, n):
        """Count every overlapping substring of length `n` in `dna`.

        Args:
            dna: sequence string to scan.
            n: length of the substrings to count.

        Returns:
            A dict mapping each length-`n` substring to its occurrence count.
        """
        repeats = {}
        for i in range(len(dna)):
            repeat = dna[i:i + n]
            # Slices near the end are shorter than n and are not counted.
            if len(repeat) == n:
                repeats[repeat] = repeats.get(repeat, 0) + 1
        return repeats
    
      
# Script entry point: load the FASTA file and print its record statistics.
if __name__ == "__main__":

    # Multi-FASTA file to analyse, given relative to the working directory.
    file_name = "hv_nt_seq.fasta"

    # Reads and parses the whole file on construction.
    dna_tools = dna_tool_sets (file_name)

    # Print the number of records.
    dna_tools.count_records()

    # Print the longest/shortest sequence lengths and how many records have each.
    dna_tools.check_length()
 The number of records are in the multi-FASTA file: 10 

 The length of the longest sequence: 2115 
 The number of longest sequence: 1 

 The length of the shortest sequence: 318 
 The number of shortest sequence: 1 
class dna_tool_sets ():
    """Load a multi-FASTA file and report ORF and repeat statistics."""

    def __init__(self, file_name):
        """Read `file_name` and store every record in `self.dict`.

        Args:
            file_name: path of the multi-FASTA file to read.

        Raises:
            ValueError: if sequence data appears before the first header.
        """
        self.file_name = file_name
        # Maps the full header line (leading ">" included) to its sequence.
        self.dict = {}
        header = None
        # The context manager closes the file even if parsing fails.
        with open(self.file_name) as f_reader:
            for line in f_reader:
                line = line.strip("\n")
                if line.startswith(">"):
                    # A header opens a new record with an empty sequence.
                    header = line
                    self.dict[header] = ""
                elif line:
                    if header is None:
                        raise ValueError(
                            "sequence data found before the first FASTA header")
                    self.dict[header] += line

    def find_pos(self, dna):
        """Locate open reading frames in the three forward frames of `dna`.

        Each "ATG" codon is paired with the nearest in-frame stop codon
        ("TAA", "TAG" or "TGA") that follows it.

        Args:
            dna: sequence string; codons are compared case-sensitively.

        Returns:
            A dict with keys "frame1", "frame2" and "frame3". Each value is
            a list of (start, length) tuples ordered by start, where start
            is the 1-based position of the "ATG" and length counts the
            bases from the "ATG" through the end of the stop codon. A frame
            without any start/stop pair holds the placeholder [(-1, 0)].
        """
        start_code = "ATG"
        stop_codes = ("TAA", "TAG", "TGA")

        pos_dict = {}
        for i in range(3):
            pos = []
            # 0-based offsets of start codons still waiting for a stop codon.
            pending = []
            for j in range(i, len(dna), 3):
                codon = dna[j:j + 3]
                if codon == start_code:
                    pending.append(j)
                elif codon in stop_codes:
                    # This stop closes every start seen since the last stop.
                    for start in pending:
                        pos.append((start + 1, j - start + 3))
                    pending = []
            if not pos:
                pos.append((-1, 0))
            pos_dict["frame%d" % (i + 1)] = pos

        return pos_dict

    @staticmethod
    def revs_complement(dna):
        """Return the reverse complement of a DNA string.

        Declared static so it can be called on the class or on an instance.
        Upper- and lower-case A/C/G/T keep their case; N/n maps to itself.

        Raises:
            KeyError: if `dna` contains any other symbol.
        """
        pairs = {
            "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
            "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
        }
        return "".join(pairs[base] for base in reversed(dna))

    def orf_identifier(self, seq_id="DQ286061.1"):
        """Print summary statistics about the ORFs of every loaded record.

        Prints the longest ORF length in frame 2, the start position of the
        longest ORF in frame 1, the longest ORF over all frames and records,
        and the longest ORF of the record selected by `seq_id`.

        Args:
            seq_id: text identifying one record; the first header that
                contains it is used.

        Raises:
            IndexError: if no header contains `seq_id`.
        """
        orf = {}
        for header, dna_seq in self.dict.items():
            orf[header] = self.find_pos(dna_seq)

        # Resolve the selected record before printing anything.
        id_key = [key for key in orf if seq_id in key]
        if not id_key:
            raise IndexError("no record header contains %r" % seq_id)
        idx = id_key[0]

        # Pool the (start, length) tuples per frame, overall and for `idx`.
        frame1, frame2, all_frames, id_frames = [], [], [], []
        for key, dict_value in orf.items():
            frame1 += dict_value["frame1"]
            frame2 += dict_value["frame2"]
            frames = (dict_value["frame1"] + dict_value["frame2"]
                      + dict_value["frame3"])
            all_frames += frames
            if key == idx:
                id_frames = frames

        frame2_max_length = max(frame2, key=lambda entry: entry[1])
        print("The length of longest ORF in frame2: %d\n" % frame2_max_length[1])

        frame1_max_length_pos = max(frame1, key=lambda entry: entry[1])
        print("The start position of longest ORF in frame1: %d\n"
              % frame1_max_length_pos[0])

        max_length = max(all_frames, key=lambda entry: entry[1])
        print("The longest ORF of all frames and sequences: %d\n" % max_length[1])

        max_length_id = max(id_frames, key=lambda entry: entry[1])
        print("The length of longest ORF for ", idx, "is: %d \n" % max_length_id[1])

    def find_repeats(self, dna, n):
        """Count every overlapping substring of length `n` in `dna`.

        Args:
            dna: sequence string to scan.
            n: length of the substrings to count.

        Returns:
            A dict mapping each length-`n` substring to its occurrence count.
        """
        repeats = {}
        for i in range(len(dna)):
            repeat = dna[i:i + n]
            # Slices near the end are shorter than n and are not counted.
            if len(repeat) == n:
                repeats[repeat] = repeats.get(repeat, 0) + 1
        return repeats

    def repeats_identifier(self, n):
        """Print statistics about length-`n` repeats over all records.

        For n == 6 the highest occurrence count and the repeats reaching it
        are printed. For n == 12 the number of distinct repeats reaching
        the highest count is printed. Any other `n` prints nothing.

        Args:
            n: repeat length.

        Raises:
            ValueError: if n is 6 or 12 and no record is at least n long.
        """
        # Sum the per-record counts into one table for the whole file.
        combined_repeats = {}
        for dna_seq in self.dict.values():
            for repeat, count in self.find_repeats(dna_seq, n).items():
                combined_repeats[repeat] = combined_repeats.get(repeat, 0) + count

        if n not in (6, 12):
            return

        # Computed once instead of once per table entry.
        highest = max(combined_repeats.values())

        if n == 6:
            print("The most frequently repeats occur: %d times \n" % highest)
            most_frequent = [key for key in combined_repeats
                             if combined_repeats[key] == highest]
            print("The following repeats occured most frequently: \n", most_frequent)
        else:
            count_most_frequent = sum(
                1 for value in combined_repeats.values() if value == highest)
            print("The number of different 12-base sequences occur max times: %d \n"
                  % count_most_frequent)
        
           
        

        
# Script entry point: load the FASTA file and print ORF and repeat statistics.
if __name__ == "__main__":

    # Multi-FASTA file to analyse, given relative to the working directory.
    file_name = "hv_nt_seq.fasta"
    # Reads and parses the whole file on construction.
    dna_tools = dna_tool_sets (file_name)

    # Print the ORF statistics for all records.
    dna_tools.orf_identifier()

    # Print the most frequent 6-base repeats.
    dna_tools.repeats_identifier(6)

    # Print how many distinct 12-base repeats reach the highest count.
    dna_tools.repeats_identifier(12)
    
The length of longest ORF in frame2: 150

The start position of longest ORF in frame1: 1

The longest ORF of all frames and sequences: 660

The length of longest ORF for  >DQ286061.1 Hydra vulgaris mitochondrial succinate dehydrogenase flavoprotein subunit mRNA, partial cds; nuclear gene for mitochondrial product is: 150 

The most frequently repeats occur: 79 times 

The following repeats occured most frequently: 
 ['AAAAAA']
The number of different 12-base sequences occur max times: 1 

Manipulate Amino Acid Sequences

# Example sequence used by the validation cells that follow.
protein = 'UUUUUMFSFGIHRLSVFRKISRIAFANKHTLPELGYEYNALEPTISSQIMEIHHRKHHQAYVNNLNTAEEQLAEAQHKGDTSKIIS'
# Print every character outside the accepted alphabet with its 0-based index.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print(residue, i)
U 0
U 1
U 2
U 3
U 4
# Report once, at the first character outside the accepted alphabet, then stop.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print("this is not a valid protein sequence!")
    break
this is not a valid protein sequence!
# Describe each character outside the accepted alphabet and its 0-based index.
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        continue
    print("protein contains invalid amino acid %s at position %d" %(residue,i))
protein contains invalid amino acid U at position 0
protein contains invalid amino acid U at position 1
protein contains invalid amino acid U at position 2
protein contains invalid amino acid U at position 3
protein contains invalid amino acid U at position 4
# Collect the characters that belong to the accepted alphabet, in order.
kept_residues = []
for i, residue in enumerate(protein):
    if residue in 'ACDEFGHIKLMNPQRSTVWY':
        kept_residues.append(residue)

# Join once at the end to build the cleaned sequence.
corrected_protein = ''.join(kept_residues)

print("Corrected protein sequence is: %s" %corrected_protein)
Corrected protein sequence is: MFSFGIHRLSVFRKISRIAFANKHTLPELGYEYNALEPTISSQIMEIHHRKHHQAYVNNLNTAEEQLAEAQHKGDTSKIIS