##r chunk
library(reticulate)
##python chunk
from __future__ import unicode_literals, print_function
import nltk
import spacy
from spacy import displacy
import plac
import random
from pathlib import Path
Extend grammar1 from the lecture notes to account for the following sentences:
##python chunk
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my" | "The"
  N -> "food" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
""")
Parse both sentences with the RecursiveDescentParser and the ShiftReduceParser.
##python chunk
rd_parser = nltk.RecursiveDescentParser(grammar1)
sent1 = "The dog ate the food".split()
print(sent1)
## ['The', 'dog', 'ate', 'the', 'food']
for tree1 in rd_parser.parse(sent1):
    print(tree1)
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))
sent2 = "The dog walked by the cat in the park".split()
print(sent2)
## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']
for tree2 in rd_parser.parse(sent2):
    print(tree2)
sr_parser = nltk.ShiftReduceParser(grammar1)
sent3 = "The dog ate the food".split()
print(sent3)
## ['The', 'dog', 'ate', 'the', 'food']
for tree3 in sr_parser.parse(sent3):
    print(tree3)
## (S (NP (Det The) (N dog)) (VP (V ate) (NP (Det the) (N food))))
sent4 = "The dog walked by the cat in the park".split()
print(sent4)
## ['The', 'dog', 'walked', 'by', 'the', 'cat', 'in', 'the', 'park']
for tree4 in sr_parser.parse(sent4):
    print(tree4)
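Because grammar1 only expands VP to V NP or V NP PP, there is no rule that lets "walked" combine directly with the following prepositional phrases, which is why neither parser prints a tree for the second sentence. As an optional comparison (not part of the original output), a chart parser over the same grammar yields the same single tree for the first sentence:
##python chunk
# optional sketch: the same grammar with NLTK's chart parser
chart_parser = nltk.ChartParser(grammar1)
for tree in chart_parser.parse("The dog ate the food".split()):
    print(tree)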
##python chunk
nlp = spacy.load('en_core_web_sm')
sentence1 = "I will be the best by far in fighting terror"
sentence1_nlp = nlp(sentence1)
for token in sentence1_nlp:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
## I/PRP <--nsubj-- be/VB
## will/MD <--aux-- be/VB
## be/VB <--ROOT-- be/VB
## the/DT <--det-- best/JJS
## best/JJS <--attr-- be/VB
## by/IN <--advmod-- far/RB
## far/RB <--advmod-- be/VB
## in/IN <--prep-- far/RB
## fighting/VBG <--pcomp-- in/IN
## terror/NN <--dobj-- fighting/VBG
displacy.render(sentence1_nlp,
options={'distance': 110,
'arrow_stroke': 2,
'arrow_width': 8})
sentence2 = "I am in Las Vegas at the best hotel Trump International"
sentence2_nlp = nlp(sentence2)
for token in sentence2_nlp:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
## I/PRP <--nsubj-- am/VBP
## am/VBP <--ROOT-- am/VBP
## in/IN <--prep-- am/VBP
## Las/NNP <--compound-- Vegas/NNP
## Vegas/NNP <--pobj-- in/IN
## at/IN <--prep-- am/VBP
## the/DT <--det-- hotel/NN
## best/JJS <--amod-- hotel/NN
## hotel/NN <--pobj-- at/IN
## Trump/NNP <--compound-- International/NNP
## International/NNP <--appos-- hotel/NN
displacy.render(sentence2_nlp,
options={'distance': 110,
'arrow_stroke': 2,
'arrow_width': 8})
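As an optional aside (not in the original), the pretrained parse can also be queried directly; for example, the noun chunks it implies for the second sentence:
##python chunk
# sketch: noun chunks derived from the pretrained dependency parse
print([chunk.text for chunk in sentence2_nlp.noun_chunks])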
train_data = [
    ("I will be the best by far in fighting terror",  # sentence
     {  # open dictionary
         # the index of each token's head word
         'heads': [2, 2, 2, 4, 2, 5, 2, 7, 8, 9],
         # the dependency label for each token
         'deps': ['nsubj', 'aux', 'ROOT', 'det', 'attr', 'advmod', 'advmod', 'prep',
                  'pcomp', 'dobj']
     }  # close dictionary
    ),  # close sentence
    ("I am in Las Vegas at the best hotel Trump International",  # sentence
     {  # open dictionary
         # the index of each token's head word
         'heads': [1, 1, 1, 3, 2, 1, 6, 7, 8, 9, 9],
         # the dependency label for each token
         'deps': ['nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'det', 'amod',
                  'pobj', 'compound', 'appos']
     }  # close dictionary
    )  # close sentence
]  # close list
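Before training, a small check (not in the original code) can confirm that each heads/deps list lines up one-to-one with the tokens of its sentence; the whitespace split below is only an approximation of spaCy's tokenizer, but it produces the same tokens for these two sentences:
##python chunk
# sketch: annotation lengths should match the token counts
for text, annotations in train_data:
    n_tokens = len(text.split())  # approximation of spaCy's tokenization here
    print(n_tokens, len(annotations['heads']), len(annotations['deps']))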
##python chunk
# Create a blank spacy pipeline
nlp = spacy.blank("en")
# add the parser
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)
# add the labels from the training data
for _, annotations in train_data:
    for dep in annotations.get('deps', []):
        parser.add_label(dep)
##python chunk
#start training
optimizer = nlp.begin_training()
## /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/spacy/language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
##   **kwargs
#number of times to run
n_iter = 10
#run training
for itn in range(n_iter):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        nlp.update([text], [annotations], sgd=optimizer, losses=losses)
    print(losses)
## {'parser': 5.743537811562419}
## {'parser': 5.535966164432466}
## {'parser': 5.3011279832571745}
## {'parser': 5.120166735723615}
## {'parser': 5.062085444340482}
## {'parser': 6.826412809197791}
## {'parser': 6.596749360440299}
## {'parser': 7.0100842011743225}
## {'parser': 6.189121451054234}
## {'parser': 4.000978839059826}
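The trained pipeline only exists in memory at this point; below is a short sketch (not part of the original) of how it could be saved and reloaded with spaCy's to_disk and spacy.load, where the directory name is hypothetical:
##python chunk
# sketch: persist the newly trained parser so it can be reloaded later
from pathlib import Path
output_dir = Path("trained_parser")  # hypothetical directory name
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
nlp_reloaded = spacy.load(output_dir)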
##python chunk
test_text = "I am at Trump National Doral-best resort in U.S."
doc = nlp(test_text)
print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
## Dependencies [('I', 'nsubj', 'am'), ('am', 'ROOT', 'am'), ('at', 'det', 'Trump'), ('Trump', 'det', 'Doral'), ('National', 'det', 'Doral'), ('Doral', 'det', '-'), ('-', 'prep', 'am'), ('best', 'ROOT', 'best'), ('resort', 'ROOT', 'resort'), ('in', 'ROOT', 'in'), ('U.S.', 'appos', 'in')]
##python chunk
displacy.render(doc,
options={'distance': 110,
'arrow_stroke': 2,
'arrow_width': 8})