import operator
import time
import string
import re
from collections import defaultdict
filepath = '/Users/JoJo/Desktop/Projects/650/Week 6/'
f = open(filepath + 'beauty&thebeast.txt', 'rb')
start = time.time()
beautybeast = defaultdict(int)
punc = string.punctuation
for line in f:
    cln_line = re.sub('[' + punc + ']', '', line.decode('utf-8'))
    spl_line = cln_line.split()
    for word in spl_line:
        lower_word = word.lower()
beautybeast.setdefault(lower_word, 0)
0
beautybeast[lower_word] += 1
sorted_beautybeast = sorted(beautybeast.items(), key=operator.itemgetter(1), reverse=True)
elapsed = time.time() - start
print('Run took', elapsed, ' seconds.')
Run took 507.92300391197205  seconds.
print('Number of distinct words:', len(sorted_beautybeast))
Number of distinct words: 1
top_n = 10
for pair in range(top_n):
    print(sorted_beautybeast[pair])
('end', 1)



---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-72-c3fcfe71def9> in <module>()
      1 for pair in range(top_n):
----> 2     print(sorted_beautybeast[pair])


IndexError: list index out of range
filepath = '/Users/JoJo/Desktop/Projects/650/Week 6/'
f = open(filepath + 'peterpan.txt', 'rb')
start = time.time()
peterpan = defaultdict(int)
punc = string.punctuation
for line in f:
    cln_line = re.sub('[' + punc + ']', '', line.decode('utf-8'))
    spl_line = cln_line.split()
    for word in spl_line:
        lower_word = word.lower()
        if lower_word in peterpan:
            peterpan[lower_word] += 1
        else:
            peterpan[lower_word] = 1
sorted_peterpan = sorted(peterpan.items(), key=operator.itemgetter(1), reverse=True)
elapsed = time.time() - start
print('Run took', elapsed, ' seconds.')
Run took 1085.910615682602  seconds.
print('Number of distinct words: ', len(sorted_peterpan))
Number of distinct words:  1
top_n = 10
y[]
  File "<ipython-input-91-2b8b5195ff99>", line 1
    y[]
      ^
SyntaxError: invalid syntax
LS0tCnRpdGxlOiAiUHl0aG9uIFRleHQgQW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHB5dGhvbgppbXBvcnQgb3BlcmF0b3IKYGBgCgoKYGBgcHl0aG9uCmltcG9ydCB0aW1lCmBgYAoKCmBgYHB5dGhvbgppbXBvcnQgc3RyaW5nCmBgYAoKCmBgYHB5dGhvbgppbXBvcnQgcmUKYGBgCgoKYGBgcHl0aG9uCmZyb20gY29sbGVjdGlvbnMgaW1wb3J0IGRlZmF1bHRkaWN0CmBgYAoKCmBgYHB5dGhvbgpmaWxlcGF0aCA9ICcvVXNlcnMvSm9Kby9EZXNrdG9wL1Byb2plY3RzLzY1MC9XZWVrIDYvJwpgYGAKCgpgYGBweXRob24KZiA9IG9wZW4oZmlsZXBhdGggKyAnYmVhdXR5JnRoZWJlYXN0LnR4dCcsICdyYicpCmBgYAoKCmBgYHB5dGhvbgpzdGFydCA9IHRpbWUudGltZSgpCmBgYAoKCmBgYHB5dGhvbgpiZWF1dHliZWFzdCA9IGRlZmF1bHRkaWN0KGludCkKYGBgCgoKYGBgcHl0aG9uCnB1bmMgPSBzdHJpbmcucHVuY3R1YXRpb24KYGBgCgoKYGBgcHl0aG9uCmZvciBsaW5lIGluIGY6CiAgICBjbG5fbGluZSA9IHJlLnN1YignWycgKyBwdW5jICsgJ10nLCAnJywgbGluZS5kZWNvZGUoJ3V0Zi04JykpCiAgICBzcGxfbGluZSA9IGNsbl9saW5lLnNwbGl0KCkKICAgIGZvciB3b3JkIGluIHNwbF9saW5lOgogICAgICAgIGxvd2VyX3dvcmQgPSB3b3JkLmxvd2VyKCkKYGBgCgoKYGBgcHl0aG9uCmJlYXV0eWJlYXN0LnNldGRlZmF1bHQobG93ZXJfd29yZCwgMCkKYGBgCgoKCgogICAgMAoKCgoKYGBgcHl0aG9uCmJlYXV0eWJlYXN0W2xvd2VyX3dvcmRdICs9IDEKYGBgCgoKYGBgcHl0aG9uCnNvcnRlZF9iZWF1dHliZWFzdCA9IHNvcnRlZChiZWF1dHliZWFzdC5pdGVtcygpLCBrZXk9b3BlcmF0b3IuaXRlbWdldHRlcigxKSwgcmV2ZXJzZT1UcnVlKQpgYGAKCgpgYGBweXRob24KZWxhcHNlZCA9IHRpbWUudGltZSgpIC0gc3RhcnQKYGBgCgoKYGBgcHl0aG9uCnByaW50KCdSdW4gdG9vaycsIGVsYXBzZWQsICcgc2Vjb25kcy4nKQpgYGAKCiAgICBSdW4gdG9vayA1MDcuOTIzMDAzOTExOTcyMDUgIHNlY29uZHMuCgoKCmBgYHB5dGhvbgpwcmludCgnTnVtYmVyIG9mIGRpc3RpbmN0IHdvcmRzOicsIGxlbihzb3J0ZWRfYmVhdXR5YmVhc3QpKQpgYGAKCiAgICBOdW1iZXIgb2YgZGlzdGluY3Qgd29yZHM6IDEKCgoKYGBgcHl0aG9uCnRvcF9uID0gMTAKYGBgCgoKYGBgcHl0aG9uCmZvciBwYWlyIGluIHJhbmdlKHRvcF9uKToKICAgIHByaW50KHNvcnRlZF9iZWF1dHliZWFzdFtwYWlyXSkKYGBgCgogICAgKCdlbmQnLCAxKQoKCgogICAgLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCgogICAgSW5kZXhFcnJvciAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgVHJhY2ViYWNrIChtb3N0IHJlY2VudCBjYWxsIGxhc3QpCgogICAgPGlweXRob24taW5wdXQtNzItYzNmY2ZlNzFkZWY5PiBpbiA8bW9kdWxlPigpCiAgICAgICAgICAxIGZvciBwYWlyIGluIHJhbmdlKHRvcF9uKToKICAgIC0tLS0+IDIgICAgIHByaW50KHNvcnRlZF9iZWF1dHliZWFzdFtwYWlyXSkKICAgIAoKICAgIEluZGV4RXJyb3I6IGxpc3QgaW5kZXggb3V0IG9mIHJhbmdlCgoKCmBgYHB5dGhvbgpmaWxlcGF0aCA9ICcvVXNlcnMvSm9Kby9EZXNrdG9wL1Byb2plY3RzLzY1MC9XZWVrIDYvJwpgYGAKCgpgYGBweXRob24KZiA9IG9wZW4oZmlsZXBhdGggKyAncGV0ZXJwYW4udHh0JywgJ3JiJykKYGBgCgoKYGBgcHl0aG9uCnN0YXJ0ID0gdGltZS50aW1lKCkKYGBgCgoKYGBgcHl0aG9uCnBldGVycGFuID0gZGVmYXVsdGRpY3QoaW50KQpgYGAKCgpgYGBweXRob24KcHVuYyA9IHN0cmluZy5wdW5jdHVhdGlvbgpgYGAKCgpgYGBweXRob24KZm9yIGxpbmUgaW4gZjoKICAgIGNsbl9saW5lID0gcmUuc3ViKCdbJyArIHB1bmMgKyAnXScsICcnLCBsaW5lLmRlY29kZSgndXRmLTgnKSkKICAgIHNwbF9saW5lID0gY2xuX2xpbmUuc3BsaXQoKQogICAgZm9yIHdvcmQgaW4gc3BsX2xpbmU6CiAgICAgICAgbG93ZXJfd29yZCA9IHdvcmQubG93ZXIoKQogICAgICAgIGlmIGxvd2VyX3dvcmQgaW4gcGV0ZXJwYW46CiAgICAgICAgICAgIHBldGVycGFuW2xvd2VyX3dvcmRdICs9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICBwZXRlcnBhbltsb3dlcl93b3JkXSA9IDEKYGBgCgoKYGBgcHl0aG9uCnNvcnRlZF9wZXRlcnBhbiA9IHNvcnRlZChwZXRlcnBhbi5pdGVtcygpLCBrZXk9b3BlcmF0b3IuaXRlbWdldHRlcigxKSwgcmV2ZXJzZT1UcnVlKQpgYGAKCgpgYGBweXRob24KZWxhcHNlZCA9IHRpbWUudGltZSgpIC0gc3RhcnQKYGBgCgoKYGBgcHl0aG9uCnByaW50KCdSdW4gdG9vaycsIGVsYXBzZWQsICcgc2Vjb25kcy4nKQpgYGAKCiAgICBSdW4gdG9vayAxMDg1LjkxMDYxNTY4MjYwMiAgc2Vjb25kcy4KCgoKYGBgcHl0aG9uCnByaW50KCdOdW1iZXIgb2YgZGlzdGluY3Qgd29yZHM6ICcsIGxlbihzb3J0ZWRfcGV0ZXJwYW4pKQpgYGAKCiAgICBOdW1iZXIgb2YgZGlzdGluY3Qgd29yZHM6ICAxCgoKCmBgYHB5dGhvbgp0b3BfbiA9IDEwCmBgYAoKCmBgYHB5dGhvbgp5W10KYGBgCgoKICAgICAgRmlsZSAiPGlweXRob24taW5wdXQtOTEtMmI4YjUxOTVmZjk5PiIsIGxpbmUgMQogICAgICAgIHlbXQogICAgICAgICAgXgogICAgU3ludGF4RXJyb3I6IGludmFsaWQgc3ludGF4CgoKCgpgYGBweXRob24KCmBgYAo=