Libraries / R Setup

##r chunk

library(reticulate)
## Warning: package 'reticulate' was built under R version 3.6.3
##python chunk

import nltk

import spacy
from spacy import displacy
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path

Import the grammar

##python chunk

# Toy context-free grammar for the example sentences parsed further down.
# Sentences are rooted at S (a noun phrase followed by a verb phrase).
# A VP may carry up to two PPs and an NP may carry a PP after its noun, so a
# sentence containing prepositional phrases can receive more than one parse.
# Det lists both "the" and "The", which covers a capitalised determiner at the
# start of a sentence.
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP | V PP |V PP PP
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my" | "The"
  N -> "man" | "dog" | "cat" | "telescope" | "park" | "food"
  P -> "in" | "on" | "by" | "with"
  """)

Process the sentences

##python chunk

# Build two parsers over the same grammar so their results can be compared:
# a recursive-descent parser and a shift-reduce parser.
RD_parser = nltk.RecursiveDescentParser(grammar1)
SR_parser = nltk.ShiftReduceParser(grammar1)

# Split the sentence on whitespace to get the list of word tokens to parse.
Sentence1 = "The dog ate the food".split()
print(Sentence1)
## ['The', 'dog', 'ate', 'the', 'food']

# Print every tree the recursive-descent parser produces for the sentence.
for tree in RD_parser.parse(Sentence1):
    print(tree)
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))

# Run the same tokens through the shift-reduce parser.
for tree in SR_parser.parse(Sentence1):
    print(tree)
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))
# A sentence with two prepositional phrases ("by the cat", "in the park").
Sentence2 = "The dog walked by the cat in the park".split()
print(Sentence2)
## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']

# The recursive-descent parser returns two trees: "in the park" attached to
# "the cat", and "in the park" attached to the verb phrase.
for tree in RD_parser.parse(Sentence2):
    print(tree)
## (S
##   (NP (Det The) (N dog))
##   (VP
##     (V walked)
##     (PP
##       (P by)
##       (NP (Det the) (N cat) (PP (P in) (NP (Det the) (N park)))))))
## (S
##   (NP (Det The) (N dog))
##   (VP
##     (V walked)
##     (PP (P by) (NP (Det the) (N cat)))
##     (PP (P in) (NP (Det the) (N park)))))

# NOTE(review): this loop calls RD_parser a second time and prints the same
# two trees as the loop above. Sentence1 was run through SR_parser at this
# point, so SR_parser may have been intended here -- confirm before changing,
# since the recorded output below belongs to the RD_parser call.
for tree in RD_parser.parse(Sentence2):
    print(tree)
## (S
##   (NP (Det The) (N dog))
##   (VP
##     (V walked)
##     (PP
##       (P by)
##       (NP (Det the) (N cat) (PP (P in) (NP (Det the) (N park)))))))
## (S
##   (NP (Det The) (N dog))
##   (VP
##     (V walked)
##     (PP (P by) (NP (Det the) (N cat)))
##     (PP (P in) (NP (Det the) (N park)))))

Training Data

##python chunk 


# Hand-annotated training examples for the dependency parser.
# Each item is a (sentence, annotations) pair. The annotations dict holds two
# parallel lists:
#   'heads' - for every position, the index of the position it attaches to
#             (the 'ROOT' entry points at its own index), and
#   'deps'  - the dependency label for the same position.
TRAIN_DATA = [
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": [
                "nsubj",
                "ROOT",
                "compound",
                "punct",
                "nmod",
                "dobj",
                "punct",
            ],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]

Build the model

##python chunk

# Start from an empty English pipeline (no pretrained components).
nlp = spacy.blank("en")

# Create a parser component and insert it at the front of the pipeline.
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)

# Register every dependency label that occurs in the training annotations.
# Examples without a 'deps' entry contribute no labels.
for _, annotations in TRAIN_DATA:
    dep_labels = annotations.get("deps", [])
    for dep_label in dep_labels:
        parser.add_label(dep_label)

Train the model

##python chunk

# Begin training; the returned optimizer is handed to every nlp.update() call.
optimizer = nlp.begin_training()
## C:\Users\punthakur\AppData\Local\Programs\Python\Python36\lib\site-packages\spacy\language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
##   **kwargs

# Number of passes over the training data.
n_iter = 10

for epoch in range(n_iter):
    # Visit the examples in a fresh random order on every pass
    # (shuffles TRAIN_DATA in place; no seed is set, so runs differ).
    random.shuffle(TRAIN_DATA)
    # Reset per pass, so each printed dict covers one pass only.
    losses = {}
    for sentence, gold in TRAIN_DATA:
        # One example per update step.
        nlp.update([sentence], [gold], sgd=optimizer, losses=losses)
    print(losses)
# NOTE(review): the recorded losses below fluctuate rather than fall steadily.
## {'parser': 6.133924755267799}
## {'parser': 5.985019829124212}
## {'parser': 5.584429318085313}
## {'parser': 4.747710921801627}
## {'parser': 3.7625065388856456}
## {'parser': 4.3902202763274545}
## {'parser': 2.6134939238618244}
## {'parser': 5.599591394420713}
## {'parser': 5.545804456225596}
## {'parser': 5.07607504868065}

Test the model

##python chunk

# Run the trained pipeline on a sentence that is not one of the TRAIN_DATA texts.
test_text = "I like securities."
doc = nlp(test_text)

# One (token text, dependency label, head token text) triple per token.
dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
print("Dependencies", dependencies)
# NOTE(review): in the recorded output the final '.' is labelled 'cc' with head
# 'securities', whereas TRAIN_DATA labels the sentence-final '.' as 'punct'.
## Dependencies [('I', 'nsubj', 'like'), ('like', 'ROOT', 'like'), ('securities', 'dobj', 'like'), ('.', 'cc', 'securities')]

Visualize

##python chunk

# Layout settings handed to displaCy unchanged.
render_options = {
    "distance": 110,
    "arrow_stroke": 2,
    "arrow_width": 8,
}
# Render the parse of `doc`; the call's result is the SVG markup shown in the
# chunk output.
displacy.render(doc, options=render_options)
                         
#Save the model
## '<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="bdd6c1f5eed04d0badb73c00ae03b1fc-0" class="displacy" width="380" height="192.0" direction="ltr" style="max-width: none; height: 192.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n    <tspan class="displacy-word" fill="currentColor" x="50">I</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50"></tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n    <tspan class="displacy-word" fill="currentColor" x="160">like</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="160"></tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="102.0">\n    <tspan class="displacy-word" fill="currentColor" x="270">securities.</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="270"></tspan>\n</text>\n\n<g class="displacy-arrow">\n    <path class="displacy-arc" id="arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-0" stroke-width="2px" d="M70,57.0 C70,2.0 160.0,2.0 160.0,57.0" fill="none" stroke="currentColor"/>\n    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">\n        <textPath xlink:href="#arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-0" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">nsubj</textPath>\n    </text>\n    <path class="displacy-arrowhead" d="M70,59.0 L64,49.0 76,49.0" fill="currentColor"/>\n</g>\n\n<g class="displacy-arrow">\n    <path class="displacy-arc" id="arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-1" stroke-width="2px" d="M180,57.0 C180,2.0 270.0,2.0 270.0,57.0" fill="none" stroke="currentColor"/>\n    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">\n        <textPath xlink:href="#arrow-bdd6c1f5eed04d0badb73c00ae03b1fc-0-1" 
## class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">dobj</textPath>\n    </text>\n    <path class="displacy-arrowhead" d="M270.0,59.0 L276.0,49.0 264.0,49.0" fill="currentColor"/>\n</g>\n</svg>'
# Write the trained pipeline to the ./model directory (relative to the
# current working directory).
output_dir = Path("./model")
nlp.to_disk(output_dir)