Libraries / R Setup

##r chunk
# reticulate is loaded so the Python chunks in this document can run from R
library(reticulate)
# Print the Python configuration reticulate is using (interpreter, libpython, numpy)
py_config()
## python:         /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/bin/python
## libpython:      /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/libpython3.6m.dylib
## pythonhome:     /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate:/Users/zosiajiang/Library/r-miniconda/envs/r-reticulate
## version:        3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43)  [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy:          /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/numpy
## numpy_version:  1.19.0
# wordnet library
# WNHOME points at the local WordNet 3.0 install; it is set before the
# wordnet package is attached.
Sys.setenv(WNHOME = "/Users/zosiajiang/Desktop/WordNet-3.0/")
library(wordnet)

# NOTE(review): this builds the relative path "usr/share/dict" and nothing in
# the visible code uses it -- confirm whether it was meant for setDict().
path <- file.path("usr", "share", "dict")

# Term filter that matches exactly the word "fruit", ignoring case
filter <- getTermFilter(
  "ExactMatchFilter",
  word = "fruit",
  ignoreCase = TRUE
)
##python chunk
# WordNet corpus reader, used below for synset lookups
from nltk.corpus import wordnet as wn
# pandas, used below to tabulate synset information
import pandas as pd

# spaCy with the small English model; note that `nlp` is rebound later in the
# NER Tagging section
import spacy
nlp = spacy.load("en_core_web_sm")

Synsets

##python chunk
from nltk.corpus import wordnet as wn
import pandas as pd

# Look up every WordNet synset for the word "minute" and print the list
minute_sets_py = wn.synsets("minute")
print(minute_sets_py)
# dataframe
## [Synset('minute.n.01'), Synset('moment.n.02'), Synset('moment.n.01'), Synset('minute.n.04'), Synset('minute.n.05'), Synset('hour.n.04'), Synset('infinitesimal.s.01'), Synset('minute.s.02')]
def _synset_row(synset):
    """Collect the fields of one WordNet synset into a row dict."""
    return {
        "Synset": synset,
        "Part of Speech": synset.pos(),
        "Definition": synset.definition(),
        "Lemmas": synset.lemma_names(),
        "Examples": synset.examples(),
    }


# One row per synset in minute_sets_py; columns follow the key order above
minute_df = pd.DataFrame([_synset_row(s) for s in minute_sets_py])
print(minute_df)
##                          Synset  ...                                           Examples
## 0         Synset('minute.n.01')  ...                           [he ran a 4 minute mile]
## 1         Synset('moment.n.02')  ...  [wait just a moment, in a mo, it only takes a ...
## 2         Synset('moment.n.01')  ...            [the moment he arrived the party began]
## 3         Synset('minute.n.04')  ...                                                 []
## 4         Synset('minute.n.05')  ...   [the secretary keeps the minutes of the meeting]
## 5           Synset('hour.n.04')  ...  [we live an hour from the airport, its just 10...
## 6  Synset('infinitesimal.s.01')  ...  [two minute whiplike threads of protoplasm, re...
## 7         Synset('minute.s.02')  ...  [a minute inspection of the grounds, a narrow ...
## 
## [8 rows x 5 columns]
# Display only the Definition column (long entries are truncated in the printout)
minute_df["Definition"]
## 0    a unit of time equal to 60 seconds or 1/60th o...
## 1                           an indefinitely short time
## 2                           a particular point in time
## 3    a unit of angular distance equal to a 60th of ...
## 4                                         a short note
## 5      distance measured by the time taken to cover it
## 6                     infinitely or immeasurably small
## 7    characterized by painstaking care and detailed...
## Name: Definition, dtype: object

Nyms

##python chunk
# Print every sense of "minute" as "<synset name>  -  <definition>"
minute_senses = wn.synsets("minute")
for synset in minute_senses:
    print(synset.name(), " - ", synset.definition())
## minute.n.01  -  a unit of time equal to 60 seconds or 1/60th of an hour
## moment.n.02  -  an indefinitely short time
## moment.n.01  -  a particular point in time
## minute.n.04  -  a unit of angular distance equal to a 60th of a degree
## minute.n.05  -  a short note
## hour.n.04  -  distance measured by the time taken to cover it
## infinitesimal.s.01  -  infinitely or immeasurably small
## minute.s.02  -  characterized by painstaking care and detailed examination
term = "minute"
term_synset = wn.synsets(term)

# For each sense of the term, report the synset of its first lemma together
# with the synset of that lemma's first antonym (None when it has no antonym).
for each_synset in term_synset:
    term_l = each_synset.lemmas()[0]
    term_s = term_l.synset()

    # Start from "no antonym" and overwrite only when one exists
    term_a = antonym = definition = None
    antonym_lemmas = term_l.antonyms()
    if antonym_lemmas:
        term_a = antonym_lemmas[0].synset()
        antonym = term_a.name()
        definition = term_a.definition()

    print("Synonym: ", term_s.name())
    print("Definition: ", term_s.definition())
    print("Antonym: ", antonym)
    print("Definition: ", definition)
#
# # hypernyms
## Synonym:  minute.n.01
## Definition:  a unit of time equal to 60 seconds or 1/60th of an hour
## Antonym:  None
## Definition:  None
## Synonym:  moment.n.02
## Definition:  an indefinitely short time
## Antonym:  None
## Definition:  None
## Synonym:  moment.n.01
## Definition:  a particular point in time
## Antonym:  None
## Definition:  None
## Synonym:  minute.n.04
## Definition:  a unit of angular distance equal to a 60th of a degree
## Antonym:  None
## Definition:  None
## Synonym:  minute.n.05
## Definition:  a short note
## Antonym:  None
## Definition:  None
## Synonym:  hour.n.04
## Definition:  distance measured by the time taken to cover it
## Antonym:  None
## Definition:  None
## Synonym:  infinitesimal.s.01
## Definition:  infinitely or immeasurably small
## Antonym:  None
## Definition:  None
## Synonym:  minute.s.02
## Definition:  characterized by painstaking care and detailed examination
## Antonym:  None
## Definition:  None
# Fetch the synsets for "minute" again and display them
minute_synset = wn.synsets("minute")
minute_synset
## [Synset('minute.n.01'), Synset('moment.n.02'), Synset('moment.n.01'), Synset('minute.n.04'), Synset('minute.n.05'), Synset('hour.n.04'), Synset('infinitesimal.s.01'), Synset('minute.s.02')]
# Take the first synset in that list and show its hypernyms
minute = minute_synset[0]
minute.hypernyms()
## [Synset('time_unit.n.01')]

Similarity

##python chunk
# Use the first synset of each word for the comparisons that follow
apple = wn.synsets("apple")[0]
tree = wn.synsets("tree")[0]
farm = wn.synsets("farm")[0]

# Lowest hypernym(s) shared by apple and tree
apple.lowest_common_hypernyms(tree)
## [Synset('whole.n.02')]
# Lowest hypernym(s) shared by apple and farm
apple.lowest_common_hypernyms(farm)

# JCN similarity, scored with the information-content file 'ic-semcor.dat'
## [Synset('object.n.01')]
from nltk.corpus import wordnet_ic
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
apple.jcn_similarity(tree, semcor_ic)
## 0.06339120383123907
apple.jcn_similarity(farm, semcor_ic)

# LIN similarity, using the same information-content data
## 0.05830880605033221
apple.lin_similarity(tree, semcor_ic)
## 0.14794993333781561
apple.lin_similarity(farm, semcor_ic)
## 0.11998891781501196

NER Tagging

##python chunk
import spacy

# Start from an empty English pipeline so the NER component added below is
# trained from scratch. The pretrained "en_core_web_sm" model used to be
# loaded on the line before this one and was immediately replaced by the
# blank pipeline, so that redundant (and slow) load has been dropped.
nlp = spacy.blank("en")
##python chunk
# Create the NER pipeline component
ner = nlp.create_pipe('ner')
# Append it as the last component of the blank pipeline created above
nlp.add_pipe(ner, last=True)
##python chunk
# training data
# Each item is (text, {'entities': [(start, end, label), ...]}), where start
# and end are character offsets into text with end exclusive -- e.g.
# (110, 113) in the first text spans "Ted". Labels used: PERSON and LOCATION.
training_data = [
  (u"I will be the best by far in fighting terror. I’m the only one that was right from the beginning, & now Lyin’ Ted & others are copying me.", 
  {'entities': [ (110,113,'PERSON') ] }),
  (u"I am in Las Vegas, at the best hotel (by far), Trump International. I will be working with my wonderful teams and volunteers to WIN Nevada!", 
  {'entities': [ (8,17,'LOCATION'), (47, 52, 'PERSON') ] }),
  (u"I am at Trump National Doral-best resort in U.S. Rory and Adam Scott are doing great! Watch on NBC at 3:00 P.M.  MAKE AMERICA GREAT AGAIN!", 
  {'entities': [ (44,48,'LOCATION'), (58, 68, 'PERSON') ] }),
  (u"Donald Trump's Palm Beach mansion which I turned into the greatest club in the world-many jobs!", 
  {'entities': [ (0,12,'PERSON'), (15, 25, 'LOCATION') ] }),
  (u"Despite biggest ever job gains and a V shaped recovery, Joe Biden said, I would shut it down, referring to our Country. He has no clue!", 
  {'entities': [ (56,65,'PERSON')]})
]
##python chunk
# Register the two entity labels that appear in training_data
nlp.entity.add_label('PERSON')
nlp.entity.add_label('LOCATION')
##python chunk
# Begin training; the returned optimizer is passed to nlp.update() as `sgd` below
optimizer = nlp.begin_training()
## /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/spacy/language.py:639: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
##   **kwargs
import random

# Run through training: 20 passes over the examples, reshuffled on every pass
n_iterations = 20
for i in range(n_iterations):
    random.shuffle(training_data)
    for text, annotations in training_data:
        # One update per example (a batch of size one)
        nlp.update([text], [annotations], sgd=optimizer)

# Save the model so it can be reused later
nlp.to_disk("./model")

Using your NER Tagger

##python chunk
# Run the trained pipeline on a tweet that was not in the training data
tweet6_text = u"Kimberly will work with the Donald Trump Administration and we will bring Baltimore back, and fast. Don’t blow it Baltimore, the Democrats have destroyed your city!"
tweet6 = nlp(tweet6_text)

# Print each recognised entity as "<label>  |  <text>"
for entity in tweet6.ents:
    print(entity.label_, ' | ', entity.text)
  
## PERSON  |  Donald Trump
# Show the tuple of entity spans found in the tweet
tweet6.ents
## (Donald Trump,)