Libraries / R Setup

##r chunk
library(reticulate)
py_config()
## python:         /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/bin/python
## libpython:      /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/lib/libpython3.6m.so
## pythonhome:     /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate:/home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate
## version:        3.6.12 | packaged by conda-forge | (default, Dec  9 2020, 00:36:02)  [GCC 9.3.0]
## numpy:          /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/numpy
## numpy_version:  1.19.5
devtools::install_github("bradleyboehmke/harrypotter", force = TRUE)
## Downloading GitHub repo bradleyboehmke/harrypotter@HEAD
## ✓  checking for file ‘/tmp/RtmprHcFCi/remotes1172c44a2a53b/bradleyboehmke-harrypotter-51f7146/DESCRIPTION’
## ─  preparing ‘harrypotter’:
## ✓  checking DESCRIPTION meta-information
## ─  checking for LF line-endings in source and make files and shell scripts
## ─  checking for empty or unneeded directories
## ─  building ‘harrypotter_0.1.0.tar.gz’
## Installing package into '/home/sudan_rahul/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library(tagger)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rJava)
library(RDRPOSTagger)
library(harrypotter)
data("prisoner_of_azkaban")   # data() loads the dataset into the environment
book_r <- head(prisoner_of_azkaban)

#head(book_r)
##pick one of the harrypotter books to analyze with your POS taggers
##https://github.com/bradleyboehmke/harrypotter check out the options
##load it using data(book title)
##python chunk
import spacy
import pandas as pd
import nltk
from nltk.corpus import brown
nlp = spacy.load('en_core_web_sm')
# pull the R object into Python and coerce it to a single string for tagging
book_py = str(r.book_r)

Tagger Package

##r chunk
#tag_pos(book_r)
tag_pos(book_r[1]) %>% plot()

tag_pos(book_r[1]) %>% as_universal() %>% plot()

#penn_tags()

RDR POS Tagger

##r chunk
models <- rdr_available_models()
models
## Ripple down Rule based available taggers:
## ----------------------------------------
## 
## 1/ POS tagging for languages:
## English, French, German, Hindi, Italian, Thai, Vietnamese
## 
## 2/ MORPH tagging for languages:
## Bulgarian, Czech, Dutch, French, German, Portuguese, Spanish, Swedish
## 
## 3/ UniversalPOS tagging for languages:
## Ancient_Greek-PROIEL, Ancient_Greek, Arabic, Basque, Belarusian, Bulgarian, Catalan, Chinese, Coptic, Croatian, Czech-CAC, Czech-CLTT, Czech, Danish, Dutch-LassySmall, Dutch, English-LinES, English-ParTUT, English, Estonian, Finnish-FTB, Finnish, French-ParTUT, French-Sequoia, French, Galician-TreeGal, Galician, German, Gothic, Greek, Hebrew, Hindi, Hungarian, Indonesian, Irish, Italian-ParTUT, Italian, Japanese, Korean, Latin-ITTB, Latin-PROIEL, Latin, Latvian, Lithuanian, Norwegian-Bokmaal, Norwegian-Nynorsk, Old_Church_Slavonic, Persian, Polish, Portuguese-BR, Portuguese, Romanian, Russian-SynTagRus, Russian, Slovak, Slovenian-SST, Slovenian, Spanish-AnCora, Spanish, Swedish-LinES, Swedish, Tamil, Turkish, Urdu, Vietnamese
create_tagger <- rdr_model(language = "English", annotation = "POS")
rdrpos <- rdr_pos(create_tagger, x = book_r[1])
head(rdrpos)
##   doc_id token_id         token pos
## 1     d1        1           OWL  NN
## 2     d1        2 POST  Harry NNP
## 3     d1        3        Potter NNP
## 4     d1        4           was VBD
## 5     d1        5             a  DT
## 6     d1        6        highly  RB
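
The same tagged data frame can also be inspected from the Python side, since reticulate exposes R objects through the r object. A minimal sketch (assuming the r chunk above has been run), tallying the Penn tags RDR assigned to the first chapter:

##python chunk
rdr_df = pd.DataFrame(r.rdrpos)        # R data frame converted to pandas
rdr_df["pos"].value_counts().head(10)  # most frequent Penn tags in chapter 1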

spaCy

##python chunk
import nltk
import pandas as pd
nlp = spacy.load('en_core_web_sm')
spacy_book_py_tag = [(word, word.tag_, word.pos_) for word in nlp(book_py)]
pd.DataFrame(spacy_book_py_tag)[1:10]
##                        0    1      2
## 1                      '   ''  PUNCT
## 2        \u3000\u3000OWL  NNP  PROPN
## 3  POST\u3000\u3000Harry  NNP  PROPN
## 4                 Potter  NNP  PROPN
## 5                    was  VBD    AUX
## 6                      a   DT    DET
## 7                 highly   RB    ADV
## 8                unusual   JJ    ADJ
## 9                    boy   NN   NOUN
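
As a quick check that is not part of the original output, the tagged tuples can be tallied to see which universal POS categories dominate the opening text. The column names below are placeholders chosen here, not anything spaCy returns:

##python chunk
spacy_df = pd.DataFrame(spacy_book_py_tag, columns = ["token", "penn_tag", "upos"])
spacy_df["upos"].value_counts().head(10)  # most common universal POS categories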

Training your own tagger

##python chunk
default_tagger = nltk.DefaultTagger('NN')
tokens = brown.words(categories = "news") 
#pd.DataFrame(default_tagger.tag(tokens)).head()

brown_tagged_sents = brown.tagged_sents()
default_tagger.evaluate(brown_tagged_sents)
## 0.13130472824476916
books_data_py = nltk.word_tokenize(book_py)
#pd.DataFrame(default_tagger.tag(books_data_py)).head()
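
Between the default tagger and the n-gram taggers, a regular-expression tagger in the style of the NLTK book makes another useful baseline. This is a sketch with a handful of illustrative suffix rules, not a tuned rule set:

##python chunk
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd person singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # everything else defaults to NN
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.evaluate(brown_tagged_sents)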

Unigram Tagger

##python chunk
from nltk.corpus import brown
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
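
If desired, the backoff chain can be extended one more level with a trigram tagger; a minimal sketch:

##python chunk
t3 = nltk.TrigramTagger(train_sents, backoff = t2)  # backs off to bigram -> unigram -> default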

Evaluate

##python chunk
unigram_tagger.evaluate(test_sents)
## 0.8849353534083527
t1.evaluate(test_sents)
## 0.8912742817627459
t2.evaluate(test_sents)
## 0.9125751765470128
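
Once a tagger performs well enough, it can be saved so it does not have to be retrained each session. A minimal sketch using pickle (the file name here is arbitrary):

##python chunk
import pickle
with open("t2_brown_tagger.pkl", "wb") as f:   # save the trained bigram tagger
    pickle.dump(t2, f)
with open("t2_brown_tagger.pkl", "rb") as f:   # reload it later without retraining
    t2_reloaded = pickle.load(f)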

Apply to Harry Potter

##python chunk
import nltk
token_py = nltk.word_tokenize(book_py)
book_py_tag = unigram_tagger.tag(token_py)
pd.DataFrame(book_py_tag)[1:10]
##                        0     1
## 1       '\u3000\u3000OWL  None
## 2  POST\u3000\u3000Harry  None
## 3                 Potter    NP
## 4                    was  BEDZ
## 5                      a    AT
## 6                 highly    QL
## 7                unusual    JJ
## 8                    boy    NN
## 9                     in    IN
book_py_token_tag_t1 = t1.tag(token_py)
pd.DataFrame(book_py_token_tag_t1)[1:10]
##                        0     1
## 1       '\u3000\u3000OWL    NN
## 2  POST\u3000\u3000Harry    NN
## 3                 Potter    NP
## 4                    was  BEDZ
## 5                      a    AT
## 6                 highly    QL
## 7                unusual    JJ
## 8                    boy    NN
## 9                     in    IN
book_py_token_tag_t2 = t2.tag(token_py)
pd.DataFrame(book_py_token_tag_t2)[1:10]
##                        0     1
## 1       '\u3000\u3000OWL    NN
## 2  POST\u3000\u3000Harry    NN
## 3                 Potter    NP
## 4                    was  BEDZ
## 5                      a    AT
## 6                 highly    QL
## 7                unusual    JJ
## 8                    boy    NN
## 9                     in    IN

Compare Results
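
On the Brown test sentences, the default tagger scored about 0.13, the plain unigram tagger about 0.88, the unigram tagger with a default backoff about 0.89, and the bigram tagger with backoff about 0.91. On the Harry Potter text, spaCy and RDR report Penn Treebank tags (NNP, VBD, DT), while the Brown-trained NLTK taggers use the Brown tagset (NP, BEDZ, AT), so the labels differ even where the taggers agree on the underlying category. A minimal sketch for lining the outputs up side by side (the tokenizers differ slightly, so rows only align approximately; the variable names are chosen here):

##python chunk
spacy_tags = [(tok.text, tok.tag_) for tok in nlp(book_py)]
compare = pd.DataFrame({
    "spacy_token": [t[0] for t in spacy_tags[:10]],
    "spacy_tag":   [t[1] for t in spacy_tags[:10]],
    "nltk_token":  [t[0] for t in book_py_tag[:10]],
    "unigram_tag": [t[1] for t in book_py_tag[:10]],
    "bigram_tag":  [t[1] for t in book_py_token_tag_t2[:10]],
})
compare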