Libraries / R Setup

##r chunk

#install.packages("wordnet")
library(reticulate)
## Warning: package 'reticulate' was built under R version 3.6.3
#py_install("nltk")

library(wordnet)
## Warning: package 'wordnet' was built under R version 3.6.3
## Warning in initDict(): cannot find WordNet 'dict' directory: please set the
## environment variable WNHOME to its parent
library(qdap)
## Warning: package 'qdap' was built under R version 3.6.3
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 3.6.3
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 3.6.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following object is masked from 'package:wordnet':
## 
##     synonyms
## The following object is masked from 'package:base':
## 
##     Filter
##python chunk

from nltk.corpus import wordnet as wn

import pandas as pd


from nltk.corpus import wordnet_ic

import spacy

Synsets

##python chunk

# Every WordNet synset whose lemmas include "base".
base_sets_py = wn.synsets("base")
print(base_sets_py)
## [Synset('base.n.01'), Synset('foundation.n.03'), Synset('base.n.03'), Synset('base.n.04'), Synset('base.n.05'), Synset('floor.n.03'), Synset('basis.n.02'), Synset('base.n.08'), Synset('nucleotide.n.01'), Synset('base.n.10'), Synset('base.n.11'), Synset('basis.n.03'), Synset('base.n.13'), Synset('base.n.14'), Synset('al-qaeda.n.01'), Synset('root.n.03'), Synset('infrastructure.n.02'), Synset('base.n.18'), Synset('base.n.19'), Synset('base.n.20'), Synset('establish.v.08'), Synset('base.v.02'), Synset('free-base.v.01'), Synset('basal.s.02'), Synset('base.s.02'), Synset('base.s.03'), Synset('base.s.04'), Synset('base.s.05'), Synset('base.s.06'), Synset('base.s.07')]

# One record per synset; the dict keys become the DataFrame columns.
synset_rows = []
for sense in base_sets_py:
    synset_rows.append({
        "Synset": sense,
        "Part of Speech": sense.pos(),
        "Definition": sense.definition(),
        "Lemmas": sense.lemma_names(),
        "Examples": sense.examples(),
    })
base_df = pd.DataFrame(synset_rows)

base_df
##                            Synset  ...                                           Examples
## 0             Synset('base.n.01')  ...           [the attack wiped out our forward bases]
## 1       Synset('foundation.n.03')  ...  [it was built on a base of solid rock, he stoo...
## 2             Synset('base.n.03')  ...              [he scrambled to get back to the bag]
## 3             Synset('base.n.04')  ...                         [the base of the mountain]
## 4             Synset('base.n.05')  ...                            [the base of the skull]
## 5            Synset('floor.n.03')  ...          [the government established a wage floor]
## 6            Synset('basis.n.02')  ...  [the whole argument rested on a basis of conje...
## 7             Synset('base.n.08')  ...                             [the base of the lamp]
## 8       Synset('nucleotide.n.01')  ...                                                 []
## 9             Synset('base.n.10')  ...  [bases include oxides and hydroxides of metals...
## 10            Synset('base.n.11')  ...                         [the base of the triangle]
## 11           Synset('basis.n.03')  ...          [the basis of this drink is orange juice]
## 12            Synset('base.n.13')  ...            [10 is the radix of the decimal system]
## 13            Synset('base.n.14')  ...                                                 []
## 14        Synset('al-qaeda.n.01')  ...                                                 []
## 15            Synset('root.n.03')  ...             [thematic vowels are part of the stem]
## 16  Synset('infrastructure.n.02')  ...                     [the industrial base of Japan]
## 17            Synset('base.n.18')  ...  [glycerinated gelatin is used as a base for ma...
## 18            Synset('base.n.19')  ...                 [a tub should sit on its own base]
## 19            Synset('base.n.20')  ...                                                 []
## 20       Synset('establish.v.08')  ...                 [base a claim on some observation]
## 21            Synset('base.v.02')  ...         [we will base this project in the new lab]
## 22       Synset('free-base.v.01')  ...                                                 []
## 23           Synset('basal.s.02')  ...  [the painter applied a base coat followed by t...
## 24            Synset('base.s.02')  ...  [baseborn wretches with dirty faces, of humble...
## 25            Synset('base.s.03')  ...             [base coins of aluminum, a base metal]
## 26            Synset('base.s.04')  ...  [base and unpatriotic motives, a base, degradi...
## 27            Synset('base.s.05')  ...  [that liberal obedience without which your arm...
## 28            Synset('base.s.06')  ...                                                 []
## 29            Synset('base.s.07')  ...         [an attempt to eliminate the base coinage]
## 
## [30 rows x 5 columns]

Nyms

##python chunk

# Look up all senses of "base" again and echo them.
synset = wn.synsets("base")
synset

## [Synset('base.n.01'), Synset('foundation.n.03'), Synset('base.n.03'), Synset('base.n.04'), Synset('base.n.05'), Synset('floor.n.03'), Synset('basis.n.02'), Synset('base.n.08'), Synset('nucleotide.n.01'), Synset('base.n.10'), Synset('base.n.11'), Synset('basis.n.03'), Synset('base.n.13'), Synset('base.n.14'), Synset('al-qaeda.n.01'), Synset('root.n.03'), Synset('infrastructure.n.02'), Synset('base.n.18'), Synset('base.n.19'), Synset('base.n.20'), Synset('establish.v.08'), Synset('base.v.02'), Synset('free-base.v.01'), Synset('basal.s.02'), Synset('base.s.02'), Synset('base.s.03'), Synset('base.s.04'), Synset('base.s.05'), Synset('base.s.06'), Synset('base.s.07')]
# Work with the first sense in the list (base.n.01) from here on.
base_syn = synset[0]

# More general concepts directly above this sense.
base_syn.hypernyms()
## [Synset('military_installation.n.01')]
# More specific concepts directly below this sense.
base_syn.hyponyms()
## [Synset('air_base.n.01'), Synset('army_base.n.01'), Synset('firebase.n.01'), Synset('navy_base.n.01'), Synset('rocket_base.n.01')]

Similarity

##python chunk

# Take the first listed sense of each comparison word; the echoed
# definition shows which sense that is.
lower_senses = wn.synsets("lower")
lower_senses[0].definition()
## 'the lower of two berths'
lower = lower_senses[0]

foot_senses = wn.synsets("foot")
foot_senses[0].definition()
## 'the part of the leg of a human being below the ankle joint'
foot = foot_senses[0]



# Information-content table passed to both similarity measures below.
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# jcn_similarity: "lower" vs "base", then "base" vs "foot".
lower.jcn_similarity(base_syn, semcor_ic)
## 1e-300
base_syn.jcn_similarity(foot, semcor_ic)
## 0.06087219729222118

# lin_similarity for the same two pairs.
lower.lin_similarity(base_syn, semcor_ic)
## 4.9867681714978775e-300
base_syn.lin_similarity(foot, semcor_ic)
## 0.06959033260101762

NER Tagging

##python chunk

import spacy

# Start from an empty English pipeline; the NER component is added and
# trained from scratch in the chunks that follow. The pretrained
# "en_core_web_sm" model is deliberately not loaded here: its result would
# be discarded by this assignment, so loading it only adds start-up time and
# an unnecessary dependency on that model package being installed.
nlp = spacy.blank("en")
##python chunk

# Build an (untrained) named-entity recognizer component ...
ner = nlp.create_pipe("ner")
# ... and append it as the final stage of the blank pipeline.
nlp.add_pipe(ner, last=True)
##python chunk

# Hand-labelled training examples as (text, {'entities': [...]}) pairs.
# Each entity is (start_char, end_char, label) with an exclusive end offset;
# the trailing comment on each line shows the text that the span covers.
training_data = [
    (
        "These Media Posts will serve as notification to the United States Congress that should Iran strike any U.S. person or target, the United States will quickly & fully strike back, & perhaps in a disproportionate manner. Such legal notice is not required, but is given nevertheless!",
        {'entities': [(87, 91, 'COUNTRY')]},  # "Iran"
    ),
    (
        "The United States just spent Two Trillion Dollars on Military Equipment. We are the biggest and by far the BEST in the World! If Iran attacks an American Base, or any American, we will be sending some of that brand new beautiful equipment their way...and without hesitation!",
        {'entities': [(0, 17, 'COUNTRY')]},  # "The United States"
    ),
    (
        "They attacked us, & we hit back. If they attack again, which I would strongly advise them not to do, we will hit them harder than they have ever been hit before! https://t.co/qI5RfWsSCH",
        {'entities': [(162, 185, 'URL')]},  # "https://t.co/qI5RfWsSCH"
    ),
    (
        "Nobody but Donald Trump will save Israel. You are wasting your time with these politicians and political clowns. Best!",
        {'entities': [
            (11, 23, 'PERSON'),   # "Donald Trump"
            (34, 40, 'COUNTRY'),  # "Israel"
        ]},
    ),
    (
        "The talks between the U.S. and Iran are going on forever,  WORLD'S LONGEST NEGOTIATION. Obama has no idea what he is doing - incompetent!",
        {'entities': [(88, 93, 'PERSON')]},  # "Obama"
    ),
]
##python chunk

# Register every label used in training_data with the NER component.
for entity_label in ('PERSON', 'COUNTRY', 'URL'):
    nlp.entity.add_label(entity_label)
##python chunk

import random

# Initialise the model weights and get the optimizer used for every update.
optimizer = nlp.begin_training()
## C:\Users\punthakur\AppData\Local\Programs\Python\Python36\lib\site-packages\spacy\language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
##   **kwargs

# 20 passes over the data; the examples are reshuffled (in place) before
# each pass and fed to the model one at a time.
for _epoch in range(20):
    random.shuffle(training_data)
    for example_text, gold_annotations in training_data:
        nlp.update([example_text], [gold_annotations], sgd=optimizer)

# Save the trained pipeline to the ./model directory.
nlp.to_disk("./model")

Using your NER Tagger

##python chunk

# Run the pipeline on a sentence that is not in training_data.
Test1 = nlp(u"#We are one step closer to delivering MASSIVE tax cuts for working families across America.")

# Print each predicted entity as "<label>  |  <text>".
for ent in Test1.ents:
    print(ent.label_, ' | ', ent.text)