Libraries / R Setup
- In this section, include the R setup needed to run Python.
##r chunk
library(reticulate)
## Warning: package 'reticulate' was built under R version 3.6.3
- In this section, include import functions to load the packages you will use for Python.
##python chunk
import nltk
import spacy
from spacy import displacy
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
Import the grammar
- Modify `grammar1` from the lecture notes to account for the following sentences:
- New sentence: “The dog ate the food.”
- New sentence: “The dog walked by the cat in the park.”
##python chunk
# Context-free grammar used by both parsers below.
# - S is the start symbol; VP and NP each have alternatives with trailing PPs,
#   so a prepositional phrase can attach either to the verb or to a noun.
# - Det lists both "the" and "The": the sentences are tokenized with a plain
#   str.split() and are not lower-cased, so the sentence-initial token is "The".
# - The terminals cover "The dog ate the food" and
#   "The dog walked by the cat in the park".
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP | V PP |V PP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my" | "The"
N -> "man" | "dog" | "cat" | "telescope" | "park" | "food"
P -> "in" | "on" | "by" | "with"
""")
Process the sentences
- Process the sentences with both the `RecursiveDescentParser` and the `ShiftReduceParser`.
##python chunk
# Build both parsers over the same grammar so their results can be compared.
RD_parser = nltk.RecursiveDescentParser(grammar1)
SR_parser = nltk.ShiftReduceParser(grammar1)

# First sentence: whitespace tokenization is sufficient (no punctuation).
Sentence1 = 'The dog ate the food'.split()
print(Sentence1)
## ['The', 'dog', 'ate', 'the', 'food']

# Print every tree each parser finds, recursive-descent first, then
# shift-reduce; the two recorded lines below are in that order.
for sentence_parser in (RD_parser, SR_parser):
    for parse_tree in sentence_parser.parse(Sentence1):
        print(parse_tree)
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))
# Second sentence: contains two prepositional phrases.
Sentence2 = 'The dog walked by the cat in the park'.split()
print(Sentence2)
## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']

# Recursive-descent parse. Two trees are printed: "in the park" attached to
# "the cat" (first tree) or directly to the verb phrase (second tree).
rd_parses = RD_parser.parse(Sentence2)
for rd_tree in rd_parses:
    print(rd_tree)
## (S
## (NP (Det The) (N dog))
## (VP
## (V walked)
## (PP
## (P by)
## (NP (Det the) (N cat) (PP (P in) (NP (Det the) (N park)))))))
## (S
## (NP (Det The) (N dog))
## (VP
## (V walked)
## (PP (P by) (NP (Det the) (N cat)))
## (PP (P in) (NP (Det the) (N park)))))
# Second run over Sentence2, this time with the shift-reduce parser, so the
# sentence is processed by both parsers exactly like Sentence1. This loop
# previously re-ran RD_parser, which only repeated the two trees already
# printed by the recursive-descent loop above.
# NOTE(review): per the NLTK documentation, ShiftReduceParser does not
# backtrack and yields at most one tree, so this loop may print fewer trees
# than RD_parser did (possibly none) for this ambiguous sentence.
for tree in SR_parser.parse(Sentence2):
    print(tree)
## (recorded output removed: it was the duplicated RD_parser result; re-knit to capture the SR_parser output)
Training Data
- Use two of your tweets from the previous assignment and modify the training data for dependency parsing.
##python chunk
# Training examples for the dependency parser.
# Each entry is a (sentence, annotations) pair. "heads" and "deps" hold one
# value per token: "heads" gives the index of the token's head word (the ROOT
# token points at itself) and "deps" gives the dependency label of that arc.
TRAIN_DATA = [
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]
Build the model
- Create a blank spacy pipeline.
- Add the parser to the pipeline.
- Add the labels to the pipeline.
##python chunk
# Start from an empty English pipeline.
nlp = spacy.blank('en')

# Create the dependency-parser component and insert it at the front of the pipeline.
parser = nlp.create_pipe('parser')
nlp.add_pipe(parser, first=True)

# Register every dependency label found in the training annotations, in the
# order the labels appear in TRAIN_DATA (repeats included).
dep_labels = [
    label
    for _sentence, annots in TRAIN_DATA
    for label in annots.get('deps', [])
]
for label in dep_labels:
    parser.add_label(label)
Train the model
- Train the model with 10 iterations of the data.
##python chunk
# begin_training() returns the optimizer that is handed to every nlp.update()
# call in the loop below.
optimizer = nlp.begin_training()
## C:\Users\punthakur\AppData\Local\Programs\Python\Python36\lib\site-packages\spacy\language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
## **kwargs

# Number of passes over the training data.
n_iter = 10

for epoch in range(n_iter):
    # Visit the examples in a fresh random order each pass (shuffles TRAIN_DATA in place).
    random.shuffle(TRAIN_DATA)
    # Loss accumulator for this pass only; nlp.update() fills it in.
    losses = {}
    for sentence, gold in TRAIN_DATA:
        nlp.update([sentence], [gold], sgd=optimizer, losses=losses)
    # One loss report per pass.
    print(losses)
## {'parser': 6.133924755267799}
## {'parser': 5.985019829124212}
## {'parser': 5.584429318085313}
## {'parser': 4.747710921801627}
## {'parser': 3.7625065388856456}
## {'parser': 4.3902202763274545}
## {'parser': 2.6134939238618244}
## {'parser': 5.599591394420713}
## {'parser': 5.545804456225596}
## {'parser': 5.07607504868065}
Test the model
- Test your dependency model on a similar tweet.
##python chunk
# Run the trained pipeline on a short sentence whose words all occur in the
# training sentences.
test_text = "I like securities."
doc = nlp(test_text)

# One (token, dependency label, head token) triple per token.
dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
print('Dependencies', dependencies)
## Dependencies [('I', 'nsubj', 'like'), ('like', 'ROOT', 'like'), ('securities', 'dobj', 'like'), ('.', 'cc', 'securities')]
Visualize
- Include a visualization of the tweet you just tested.
- Remember, you should modify the chunk options to show the picture in the knitted document, since it does not display inline.
##python chunk
# Layout settings for the dependency diagram of the parsed test sentence.
render_options = {'distance': 110, 'arrow_stroke': 2, 'arrow_width': 8}
# Left as the final expression so the chunk still echoes the returned markup.
displacy.render(doc, options=render_options)
#Save the model
## '<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="bdd6c1f5eed04d0badb73c00ae03b1fc-0" class="displacy" width="380" height="192.0" direction="ltr" style="max-width: none; height: 192.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n <tspan class="displacy-word" fill="currentColor" x="50">I</tspan>\n <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50"></tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n <tspan class="displacy-word" fill="currentColor" x="160">like</tspan>\n <tspan class="displacy-tag" dy="2em" fill="currentColor" x="160"></tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n <tspan class="displacy-word" fill="currentColor" x="270">securities.</tspan>\n <tspan class="displacy-tag" dy="2em" fill="currentColor" x="270"></tspan>\n</text>\n\n<g class="displacy-arrow">\n <path class="displacy-arc" id="arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-0" stroke-width="2px" d="M70,57.0 C70,2.0 160.0,2.0 160.0,57.0" fill="none" stroke="currentColor"/>\n <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">\n <textPath xlink:href="#arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-0" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">nsubj</textPath>\n </text>\n <path class="displacy-arrowhead" d="M70,59.0 L64,49.0 76,49.0" fill="currentColor"/>\n</g>\n\n<g class="displacy-arrow">\n <path class="displacy-arc" id="arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-1" stroke-width="2px" d="M180,57.0 C180,2.0 270.0,2.0 270.0,57.0" fill="none" stroke="currentColor"/>\n <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">\n <textPath xlink:href="#arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-1" class="displacy-label" startOffset="50%" side="left" 
fill="currentColor" text-anchor="middle">dobj</textPath>\n </text>\n <path class="displacy-arrowhead" d="M270.0,59.0 L276.0,49.0 264.0,49.0" fill="currentColor"/>\n</g>\n</svg>'
# Save the trained pipeline to the "./model" path, relative to the working directory.
nlp.to_disk("./model")