In this analysis we will use the TextBlob library for tokenization. TextBlob is an object-oriented Natural Language Processing (NLP) library built on top of the NLTK (Natural Language Toolkit) and Pattern NLP libraries. TextBlob can be used to perform a variety of NLP tasks, ranging from part-of-speech tagging and sentiment analysis to language translation and text classification.
from operator import itemgetter
from textblob import TextBlob
import nltk
nltk.download('punkt')
## True
##
## [nltk_data] Downloading package punkt to /home/r789995/nltk_data...
## [nltk_data] Package punkt is already up-to-date!
nltk.download('brown')
## True
##
## [nltk_data] Downloading package brown to /home/r789995/nltk_data...
## [nltk_data] Package brown is already up-to-date!
nltk.download('stopwords')
## True
##
## [nltk_data] Downloading package stopwords to
## [nltk_data] /home/r789995/nltk_data...
## [nltk_data] Package stopwords is already up-to-date!
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
You can download Shakespeare’s Romeo & Juliet from the Project Gutenberg website: https://www.gutenberg.org/ebooks/1513
# Create Blobs
# Read the whole book. The file is closed as soon as it has been read, and it
# is decoded explicitly as UTF-8 (the encoding the Gutenberg header declares)
# instead of relying on the platform's default encoding.
with open('RomeoJuliet.txt', encoding='utf-8') as book_file:
    textTotal = book_file.read()
blobTotal = TextBlob(textTotal)
# Small sample taken from the start of the book for the short examples.
numChars = 1000
# NOTE(review): the slice end is numChars + 1, so the sample holds the first
# 1001 characters, not 1000. Left unchanged so results derived from the sample
# stay comparable; confirm whether exactly numChars characters were intended.
text1000 = textTotal[0:numChars+1]
blob1000 = TextBlob(text1000)
def print_words(n):
    """Print the words of ``n`` as numbered entries, 10 words per line.

    Every printed line is a list that alternates a 0-based position with the
    word found at that position, e.g. ``[0, 'Project', 1, 'Gutenberg', ...]``.
    The final line may hold fewer than 10 words, so no word is left out.

    Args:
        n: Object exposing a ``words`` sequence (a ``TextBlob`` in this file).

    Returns:
        None. The numbered words are written to standard output.
    """
    words_per_line = 10

    # Build the numbered view in a separate list so that ``n.words`` is only
    # read; inserting the numbers into ``n.words`` itself would leave them in
    # the caller's blob.
    numbered = []
    for position, word in enumerate(n.words):
        numbered.extend((position, word))

    # Each word occupies two slots (its position and the word itself).
    step = 2 * words_per_line
    for start in range(0, len(numbered), step):
        print(numbered[start:start + step])
print_words(blob1000)
## [0, 'Project', 1, 'Gutenberg', 2, '’', 3, 's', 4, 'Romeo', 5, 'and', 6, 'Juliet', 7, 'by', 8, 'William', 9, 'Shakespeare']
## [10, 'This', 11, 'eBook', 12, 'is', 13, 'for', 14, 'the', 15, 'use', 16, 'of', 17, 'anyone', 18, 'anywhere', 19, 'in']
## [20, 'the', 21, 'United', 22, 'States', 23, 'and', 24, 'most', 25, 'other', 26, 'parts', 27, 'of', 28, 'the', 29, 'world']
## [30, 'at', 31, 'no', 32, 'cost', 33, 'and', 34, 'with', 35, 'almost', 36, 'no', 37, 'restrictions', 38, 'whatsoever', 39, 'You']
## [40, 'may', 41, 'copy', 42, 'it', 43, 'give', 44, 'it', 45, 'away', 46, 'or', 47, 're-use', 48, 'it', 49, 'under']
## [50, 'the', 51, 'terms', 52, 'of', 53, 'the', 54, 'Project', 55, 'Gutenberg', 56, 'License', 57, 'included', 58, 'with', 59, 'this']
## [60, 'eBook', 61, 'or', 62, 'online', 63, 'at', 64, 'www.gutenberg.org', 65, 'If', 66, 'you', 67, 'are', 68, 'not', 69, 'located']
## [70, 'in', 71, 'the', 72, 'United', 73, 'States', 74, 'you', 75, '’', 76, 'll', 77, 'have', 78, 'to', 79, 'check']
## [80, 'the', 81, 'laws', 82, 'of', 83, 'the', 84, 'country', 85, 'where', 86, 'you', 87, 'are', 88, 'located', 89, 'before']
## [90, 'using', 91, 'this', 92, 'ebook', 93, 'Title', 94, 'Romeo', 95, 'and', 96, 'Juliet', 97, 'Author', 98, 'William', 99, 'Shakespeare']
## [100, 'Release', 101, 'Date', 102, 'November', 103, '1998', 104, 'Etext', 105, '1513', 106, 'Last', 107, 'Updated', 108, 'January', 109, '30']
## [110, '2019', 111, 'Language', 112, 'English', 113, 'Character', 114, 'set', 115, 'encoding', 116, 'UTF-8', 117, 'START', 118, 'OF', 119, 'THIS']
## [120, 'PROJECT', 121, 'GUTENBERG', 122, 'EBOOK', 123, 'ROMEO', 124, 'AND', 125, 'JULIET', 126, 'This', 127, 'etext', 128, 'was', 129, 'produced']
## [130, 'by', 131, 'the', 132, 'PG', 133, 'Shakespeare', 134, 'Team', 135, 'a', 136, 'team', 137, 'of', 138, 'about', 139, 'twenty']
## [140, 'Project', 141, 'Gutenberg', 142, 'volunteers', 143, 'THE', 144, 'TRAGEDY', 145, 'OF', 146, 'ROMEO', 147, 'AND', 148, 'JULIET', 149, 'by']
## [150, 'William', 151, 'Shakespeare', 152, 'Contents', 153, 'THE', 154, 'PROLOGUE', 155, 'ACT', 156, 'I', 157, 'Scene', 158, 'I', 159, 'A']
# Build a fresh blob from the text1000 sample before counting its words.
blob1000 = TextBlob(text1000)
# Number of word tokens in the sample.
print('Count of words = ', len(blob1000.words))
## Count of words = 167
# Number of word tokens in the whole book.
print('Count of words = ', len(blobTotal.words))
## Count of words = 30796
# word_counts has one entry per distinct word, so the number of its items is
# the number of unique words.
unique_words = blobTotal.word_counts.items()
print("Total Unique Words = ", len(unique_words))
## Total Unique Words = 4145
# NLTK's English stop-word list.
stop_words = stopwords.words('english')
# Keep only the distinct words that are not stop words.
no_sw_list = [word for word in blobTotal.word_counts if word not in stop_words]
print('Total number of unique words AFTER removing stop Words = ',len(no_sw_list) )
## Total number of unique words AFTER removing stop Words = 4017
# Word frequencies for the whole book as (word, count) pairs.
word_feq = blobTotal.word_counts.items()
# Put the pairs into a two-column data frame.
word_feq_df = pd.DataFrame(word_feq,columns=['word','frequency'])
# Order the rows from most frequent to least frequent.
word_feq_df_sorted = word_feq_df.sort_values(by="frequency", ascending = False)
# Show the 10 most frequent words (stop words are still included here).
word_feq_df_sorted.iloc[0:10,]
## word frequency
## 14 the 876
## 2 ’ 869
## 5 and 808
## 92 i 655
## 53 to 626
## 84 a 542
## 16 of 519
## 19 in 395
## 12 is 372
## 292 that 369
# Start again from NLTK's English stop-word list.
stop_words = stopwords.words('english')
# The typographic apostrophe is counted as a word of its own (it ranks second
# in the unfiltered frequency table), so treat it as a stop word as well.
stop_words.append("’")
# Lower-case every word of the book and drop the stop words; repeated words are
# kept so that their frequencies can still be counted.
no_sw_list = [word for word in blobTotal.words.lower() if word not in stop_words]
# Re-join the remaining words into one text and count word frequencies on a
# new blob built from it.
word_feq = TextBlob(" ".join(no_sw_list)).word_counts.items()
# Put the (word, count) pairs into a two-column data frame.
word_feq_df = pd.DataFrame(word_feq,columns=['word','frequency'])
# Order the rows from most frequent to least frequent.
word_feq_df_sorted = word_feq_df.sort_values(by="frequency", ascending = False)
# Show the 10 most frequent non-stop words.
word_feq_df_sorted.iloc[0:10,]
## word frequency
## 2 romeo 320
## 223 thou 278
## 3 juliet 195
## 273 thy 170
## 69 capulet 163
## 122 nurse 149
## 186 love 148
## 237 thee 138
## 108 lady 117
## 200 shall 112
Display the noun-phrases by printing 3 noun-phrases per line.
blob1000 = TextBlob(text1000)
def print_nounphrase(n):
    """Print the noun phrases of ``n`` as numbered entries, 3 phrases per line.

    Every printed line is a list that alternates a 1-based number with the
    noun phrase it labels, e.g. ``[1, 'project gutenberg', 2, 'romeo', ...]``.
    Every phrase receives a number, and the final line may hold fewer than
    3 phrases, so no phrase is left out.

    Args:
        n: Object exposing a ``noun_phrases`` sequence (a ``TextBlob`` in this
            file).

    Returns:
        None. The numbered phrases are written to standard output.
    """
    phrases_per_line = 3

    # Build the numbered view in a separate list so that ``n.noun_phrases`` is
    # only read; inserting the numbers into it would leave them in the
    # caller's blob.
    numbered = []
    for number, phrase in enumerate(n.noun_phrases, start=1):
        numbered.extend((number, phrase))

    # Each phrase occupies two slots (its number and the phrase itself).
    step = 2 * phrases_per_line
    for start in range(0, len(numbered), step):
        print(numbered[start:start + step])
print_nounphrase(blob1000)
## [1, 'project gutenberg', 2, '’ s', 3, 'romeo']
## [4, 'juliet', 5, 'william shakespeare', 6, 'restrictions whatsoever']
## [7, 'project gutenberg license', 8, '’ ll', 9, 'title']
## [10, 'romeo', 11, 'juliet author', 12, 'william shakespeare release date']
## [13, 'november', 14, 'etext', 15, 'updated']
## [16, 'january', 17, 'language', 18, 'english character']
## [19, 'utf-8', 20, '* * *', 21, 'start of this project gutenberg ebook romeo and juliet']
## [22, '* * *', 23, 'pg shakespeare team', 24, 'project gutenberg']
## [25, 'the tragedy of romeo and juliet', 26, 'william shakespeare contents the prologue', 27, 'act']
## [28, 'scene', 29, 'public place', 30, 'scene ii']
# Total number of noun phrases found in the whole book.
print("Total noun phrases in text = ", len(blobTotal.noun_phrases))
## Total noun phrases in text = 5039
# np_counts holds a count per noun phrase; take its (phrase, count) pairs.
nounphrase_freq = blobTotal.np_counts.items()
# Put the pairs into a two-column data frame.
nounphrase_freq_df = pd.DataFrame(nounphrase_freq, columns = ["Noun Phrase", "Freq"])
# Order the rows from most frequent to least frequent.
nounphrase_freq_df_sorted = nounphrase_freq_df.sort_values(by = "Freq", ascending = False)
# Show the 10 most frequent noun phrases.
nounphrase_freq_df_sorted.iloc[0:10,]
## Noun Phrase Freq
## 2 romeo 297
## 3 juliet 177
## 66 nurse 132
## 28 capulet 106
## 1 ’ s 98
## 51 mercutio 86
## 63 tybalt 75
## 58 benvolio 72
## 36 friar lawrence 69
## 7 ’ ll 68
# Substring to look for. The `in` test below is case-sensitive, so only words
# containing an upper-case "WA" are matched.
pattern = "WA"
# Every occurrence is kept (no de-duplication), in the order the words appear.
WA_List = [word for word in blobTotal.words if pattern in word]
# Show the matches as a single-column data frame.
pd.DataFrame(WA_List, columns = ["Words Containing 'WA'"])
## Words Containing 'WA'
## 0 WATCH
## 1 WATCH
## 2 WATCH
## 3 WATCH
## 4 WATCH
## 5 WATCH
## 6 WATCH
## 7 WATCH
## 8 WARRANTY
## 9 WARRANTY
## 10 WARRANTIES
## 11 WARRANTIES
# Reading the mask image that gives the word cloud its shape.
# The v2 API is imported explicitly: calling imread through the top-level
# imageio module emits a DeprecationWarning because its behavior changes in
# ImageIO v3, and the warning itself recommends this import.
import imageio.v2 as imageio
mask_image = imageio.imread("mask_star.png")
# Creating the word cloud from the full text of the book.
from wordcloud import WordCloud
wordcloud = WordCloud(colormap = 'prism', mask = mask_image, background_color = 'white')
wordcloudImage = wordcloud.generate(textTotal)
# Saving the word cloud as a file.
wordcloudfile = wordcloudImage.to_file('rj2.png')
# Rendering the word cloud as an image. to_image is a method and has to be
# called; referencing it without parentheses only yields the bound method.
wordcloudImage.to_image()
# Plotting the saved word cloud with matplotlib.
from PIL import Image
import matplotlib.pyplot as plt
im = Image.open('rj2.png')
fig, aux = plt.subplots()
aux.imshow(im)