Constituency and Dependency Parsing

Libraries / R Setup

In this section, include the R setup needed for Python to run.

##r chunk
library(reticulate)

In this section, include import functions to load the packages you will use for Python.

##python chunk
# Future statements must come before every other import in a module;
# placing them later is a SyntaxError when this code is run as a script.
from __future__ import unicode_literals, print_function

# Standard library
import random
from pathlib import Path

# Third-party
import nltk
import plac
import spacy
from spacy import displacy

Import the grammar

Modify grammar1 from the lecture notes to account for the following sentences:
- New sentence: “The dog ate the food.”
- New sentence: “The dog walked by the cat in the park.”

##python chunk
# Toy context-free grammar covering the two target sentences:
#   "The dog ate the food"
#   "The dog walked by the cat in the park"
# "VP -> V PP" is required for the second sentence: "walked" is followed
# directly by a prepositional phrase ("by the cat ..."), not by an NP, so the
# verb phrase could not be built from "V NP" or "V NP PP" alone.
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP | V PP
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my" | "The"
  N -> "food" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
  """)

Process the sentences

Process the sentences with both the RecursiveDescentParser and ShiftReduceParser.

##python chunk
# --- Recursive descent parser -------------------------------------------
# Prints every parse tree the grammar yields for each sentence; nothing is
# printed when the grammar does not cover the sentence.
rd_parser = nltk.RecursiveDescentParser(grammar1)
sent1 = "The dog ate the food".split()
print(sent1)

## ['The', 'dog', 'ate', 'the', 'food']

for tree1 in rd_parser.parse(sent1):
    print(tree1)

## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))

sent2 = "The dog walked by the cat in the park".split()
print(sent2)

## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']

for tree2 in rd_parser.parse(sent2):
    print(tree2)

# --- Shift-reduce parser ------------------------------------------------
# The same two sentences are parsed again, this time with sr_parser (the
# loops previously reused rd_parser, so the shift-reduce parser never ran).
# NOTE(review): NLTK's ShiftReduceParser does not backtrack, so it can fail
# to find a parse that the recursive descent parser finds.
sr_parser = nltk.ShiftReduceParser(grammar1)
sent3 = "The dog ate the food".split()
print(sent3)

## ['The', 'dog', 'ate', 'the', 'food']

for tree3 in sr_parser.parse(sent3):
    print(tree3)

## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))

sent4 = "The dog walked by the cat in the park".split()
print(sent4)

## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']

for tree4 in sr_parser.parse(sent4):
    print(tree4)

Training Data

Use two of your tweets from the previous assignment and modify the training data for dependency parsing.

##python chunk
# Load the pretrained English pipeline and parse the first tweet.
nlp = spacy.load('en_core_web_sm')
sentence1 = "I will be the best by far in fighting terror"
sentence1_nlp = nlp(sentence1)

# One line per token: text/tag <--dependency label-- head text/head tag
for token in sentence1_nlp:
    fields = (token.text, token.tag_, token.dep_,
              token.head.text, token.head.tag_)
    print("{0}/{1} <--{2}-- {3}/{4}".format(*fields))

## I/PRP <--nsubj-- be/VB
## will/MD <--aux-- be/VB
## be/VB <--ROOT-- be/VB
## the/DT <--det-- best/JJS
## best/JJS <--attr-- be/VB
## by/IN <--advmod-- far/RB
## far/RB <--advmod-- be/VB
## in/IN <--prep-- far/RB
## fighting/VBG <--pcomp-- in/IN
## terror/NN <--dobj-- fighting/VBG

# Layout settings for the dependency diagram of the first tweet.
render_options = {
    'distance': 110,
    'arrow_stroke': 2,
    'arrow_width': 8,
}
displacy.render(sentence1_nlp, options=render_options)

## ''

# Parse the second tweet with the pipeline loaded earlier.
sentence2 = "I am in Las Vegas at the best hotel Trump International"
sentence2_nlp = nlp(sentence2)

# One line per token: text/tag <--dependency label-- head text/head tag
for token in sentence2_nlp:
    fields = (token.text, token.tag_, token.dep_,
              token.head.text, token.head.tag_)
    print("{0}/{1} <--{2}-- {3}/{4}".format(*fields))

## I/PRP <--nsubj-- am/VBP
## am/VBP <--ROOT-- am/VBP
## in/IN <--prep-- am/VBP
## Las/NNP <--compound-- Vegas/NNP
## Vegas/NNP <--pobj-- in/IN
## at/IN <--prep-- am/VBP
## the/DT <--det-- hotel/NN
## best/JJS <--amod-- hotel/NN
## hotel/NN <--pobj-- at/IN
## Trump/NNP <--compound-- International/NNP
## International/NNP <--appos-- hotel/NN

# Layout settings for the dependency diagram of the second tweet.
render_options = {
    'distance': 110,
    'arrow_stroke': 2,
    'arrow_width': 8,
}
displacy.render(sentence2_nlp, options=render_options)

## ''

# Gold dependency annotations for the two tweets, as (text, annotations) pairs.
#   'heads'[i] is the 0-based index of the token that token i attaches to;
#              the ROOT token points at itself.
#   'deps'[i]  is the dependency label of token i.
# The head indices follow the en_core_web_sm parses of the same sentences.
# Previously several non-ROOT tokens pointed at themselves, which made them
# additional roots and contradicted their labels.
train_data = [
    (
        "I will be the best by far in fighting terror",
        {
            # I(0) will(1) be(2) the(3) best(4) by(5) far(6) in(7)
            # fighting(8) terror(9)
            'heads': [2, 2, 2, 4, 2, 6, 2, 6, 7, 8],
            'deps': ['nsubj', 'aux', 'ROOT', 'det', 'attr', 'advmod',
                     'advmod', 'prep', 'pcomp', 'dobj'],
        },
    ),
    (
        "I am in Las Vegas at the best hotel Trump International",
        {
            # I(0) am(1) in(2) Las(3) Vegas(4) at(5) the(6) best(7)
            # hotel(8) Trump(9) International(10)
            'heads': [1, 1, 1, 4, 2, 1, 8, 8, 5, 10, 8],
            'deps': ['nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep',
                     'det', 'amod', 'pobj', 'compound', 'appos'],
        },
    ),
]

Build the model

Create a blank spacy pipeline.
Add the parser to the pipeline.
Add the labels to the pipeline.

##python chunk
# Start from an empty English pipeline (no pretrained components).
nlp = spacy.blank("en")

# Create a dependency parser component and put it first in the pipeline.
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)

# Register every dependency label that occurs in the training annotations.
for _text, gold in train_data:
    for label in gold.get('deps', []):
        parser.add_label(label)

Train the model

Train the model with 10 iterations of the data.


##python chunk
# Start training: the object returned here is passed as `sgd` to nlp.update() in the loop below.
optimizer = nlp.begin_training()
# Number of times to run: n_iter is assigned after the warning output below.

## /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/spacy/language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
##   **kwargs

# Number of passes over the training data.
n_iter = 10

for epoch in range(n_iter):
    # Shuffle in place so each pass visits the examples in a new order.
    random.shuffle(train_data)
    # Fresh dict per pass; it is handed to nlp.update() for every example.
    losses = {}
    for tweet, gold in train_data:
        nlp.update([tweet], [gold], sgd=optimizer, losses=losses)
    # Report the losses recorded during this pass.
    print(losses)

## {'parser': 5.743537811562419}
## {'parser': 5.535966164432466}
## {'parser': 5.3011279832571745}
## {'parser': 5.120166735723615}
## {'parser': 5.062085444340482}
## {'parser': 6.826412809197791}
## {'parser': 6.596749360440299}
## {'parser': 7.0100842011743225}
## {'parser': 6.189121451054234}
## {'parser': 4.000978839059826}

Test the model

Test your dependency model on a similar tweet.

##python chunk
# Run the trained pipeline on an unseen tweet.
test_text = "I am at Trump National Doral-best resort in U.S."
doc = nlp(test_text)

# Collect (token, dependency label, head token) for every token, then print.
dependencies = []
for t in doc:
    dependencies.append((t.text, t.dep_, t.head.text))
print('Dependencies', dependencies)

## Dependencies [('I', 'nsubj', 'am'), ('am', 'ROOT', 'am'), ('at', 'det', 'Trump'), ('Trump', 'det', 'Doral'), ('National', 'det', 'Doral'), ('Doral', 'det', '-'), ('-', 'prep', 'am'), ('best', 'ROOT', 'best'), ('resort', 'ROOT', 'resort'), ('in', 'ROOT', 'in'), ('U.S.', 'appos', 'in')]

Visualize

Include a visualization of the tweet you just tested.
Remember, you should modify the chunk options to show the picture in the knitted document, since it does not display inline.

##python chunk
# Layout settings for the dependency diagram of the test tweet.
render_options = {
    'distance': 110,
    'arrow_stroke': 2,
    'arrow_width': 8,
}
displacy.render(doc, options=render_options)

## ''