##r chunk
library(reticulate)
# Print the Python configuration reticulate is bound to (interpreter path,
# libpython, version, numpy) so the environment is recorded in the output.
reticulate::py_config()
## python: /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/bin/python
## libpython: /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/lib/libpython3.6m.so
## pythonhome: /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate:/home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate
## version: 3.6.12 | packaged by conda-forge | (default, Dec 9 2020, 00:36:02) [GCC 9.3.0]
## numpy: /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/numpy
## numpy_version: 1.19.5
# Install the harrypotter data package from GitHub only when it is missing.
# Forcing a reinstall on every run (force = TRUE) rebuilds the package and
# needs network access each time the document is knitted.
if (!requireNamespace("harrypotter", quietly = TRUE)) {
  devtools::install_github("bradleyboehmke/harrypotter")
}
## Downloading GitHub repo bradleyboehmke/harrypotter@HEAD
##
/bin/bash: /home/sudan_rahul/.local/share/r-miniconda/envs/r-reticulate/lib/libtinfo.so.6: no version information available (required by /bin/bash)
##
checking for file ‘/tmp/RtmprHcFCi/remotes1172c44a2a53b/bradleyboehmke-harrypotter-51f7146/DESCRIPTION’ ...
✓ checking for file ‘/tmp/RtmprHcFCi/remotes1172c44a2a53b/bradleyboehmke-harrypotter-51f7146/DESCRIPTION’
##
─ preparing ‘harrypotter’:
##
checking DESCRIPTION meta-information ...
✓ checking DESCRIPTION meta-information
##
─ checking for LF line-endings in source and make files and shell scripts
##
─ checking for empty or unneeded directories
##
─ building ‘harrypotter_0.1.0.tar.gz’
##
##
## Installing package into '/home/sudan_rahul/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library(tagger)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rJava)
library(RDRPOSTagger)
library(harrypotter)
# data() loads `prisoner_of_azkaban` into the workspace as a side effect; its
# return value is only the dataset *name*, so assigning that result would leave
# `book` holding the string "prisoner_of_azkaban" instead of the text.
data("prisoner_of_azkaban")
book <- prisoner_of_azkaban
# Keep the first six elements (the head() default) for the tagging examples.
book_r <- head(book)
#head(book)
##pick one of the harrypotter books to analyze with your POS text
##https://github.com/bradleyboehmke/harrypotter check out the options
##load it using data(book title)
Transfer the book_data from R into Python.
##python chunk
import nltk
import pandas
import pandas as pd
import spacy
from nltk.corpus import brown

# Load the small English spaCy pipeline used for tagging.
nlp = spacy.load('en_core_web_sm')

# `r.book_r` is the R object `book_r` exposed through reticulate. An R
# character vector with several elements arrives as a Python list of strings,
# while a length-one vector arrives as a plain str.
chapters = r.book_r
if isinstance(chapters, str):
    chapters = [chapters]

# Join the texts instead of calling str() on the list: str() would produce the
# list's repr, which adds brackets and quote characters and escapes characters
# such as U+3000, all of which would then be tokenised as if they were text.
book_py = " ".join(chapters)
Use the tagger package to tag your chosen book and print out the first chapter only (i.e., row 1 of the book you chose). Use something like (book[1])[[1]][1:10] to print out the first few tags.
##r chunk
#tag_pos(book_r)
# Tag only the first element of book_r once and reuse the result for both plots.
first_chapter_tags <- tag_pos(book_r[1])
# Plot the tags as returned by tag_pos().
first_chapter_tags %>% plot()
# Plot the same tags after conversion with as_universal().
first_chapter_tags %>%
  as_universal() %>%
  plot()
#penn_tags()
# NOTE(review): `p` is not defined anywhere in the visible code - confirm
# whether this line is intentional or leftover text.
p
Use head(...(book[1])) to print out the first few examples.
##r chunk
# List the taggers that RDRPOSTagger ships with and display them.
models <- rdr_available_models()
print(models)
## Riple down Rule based available taggers:
## ----------------------------------------
##
## 1/ POS tagging for languages:
## English, French, German, Hindi, Italian, Thai, Vietnamese
##
## 2/ MORPH tagging for languages:
## Bulgarian, Czech, Dutch, French, German, Portuguese, Spanish, Swedish
##
## 3/ UniversalPOS tagging for languages:
## Ancient_Greek-PROIEL, Ancient_Greek, Arabic, Basque, Belarusian, Bulgarian, Catalan, Chinese, Coptic, Croatian, Czech-CAC, Czech-CLTT, Czech, Danish, Dutch-LassySmall, Dutch, English-LinES, English-ParTUT, English, Estonian, Finnish-FTB, Finnish, French-ParTUT, French-Sequoia, French, Galician-TreeGal, Galician, German, Gothic, Greek, Hebrew, Hindi, Hungarian, Indonesian, Irish, Italian-ParTUT, Italian, Japanese, Korean, Latin-ITTB, Latin-PROIEL, Latin, Latvian, Lithuanian, Norwegian-Bokmaal, Norwegian-Nynorsk, Old_Church_Slavonic, Persian, Polish, Portuguese-BR, Portuguese, Romanian, Russian-SynTagRus, Russian, Slovak, Slovenian-SST, Slovenian, Spanish-AnCora, Spanish, Swedish-LinES, Swedish, Tamil, Turkish, Urdu, Vietnamese
# Build an English POS model and tag the first element of book_r with it.
create_tagger <- rdr_model(language = "English", annotation = "POS")
rdrpos <- rdr_pos(create_tagger, x = book_r[1])
# Show the first rows of the tagged output.
head(rdrpos)
## doc_id token_id token pos
## 1 d1 1 OWL NN
## 2 d1 2 POST Harry NNP
## 3 d1 3 Potter NNP
## 4 d1 4 was VBD
## 5 d1 5 a DT
## 6 d1 6 highly RB
Use the pandas option at the beginning of the lecture to print out only a few rows.
##python chunk
import nltk
import pandas

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# One (token, fine-grained tag, coarse POS) triple per spaCy token in book_py.
spacy_book_py_tag = [
    (token, token.tag_, token.pos_)
    for token in nlp(book_py)
]

# Display rows 1-9 only (row 0 is skipped by this slice).
pd.DataFrame(spacy_book_py_tag).iloc[1:10]
## 0 1 2
## 1 ' '' PUNCT
## 2 \u3000\u3000OWL NNP PROPN
## 3 POST\u3000\u3000Harry NNP PROPN
## 4 Potter NNP PROPN
## 5 was VBD AUX
## 6 a DT DET
## 7 highly RB ADV
## 8 unusual JJ ADJ
## 9 boy NN NOUN
Create a default tagger using nltk.
##python chunk
# Baseline tagger: assigns the tag 'NN' to every token it is given.
default_tagger = nltk.DefaultTagger('NN')
# Words of the Brown corpus "news" category (only referenced by the
# commented-out preview below).
tokens = brown.words(categories = "news")
#pd.DataFrame(default_tagger.tag(tokens)).head
# Tagged sentences of the whole Brown corpus, used to score the baseline.
brown_tagged_sents = brown.tagged_sents()
default_tagger.evaluate(brown_tagged_sents)
## 0.13130472824476916
# Tokenise the book text with NLTK's word tokenizer.
books_data_py = nltk.word_tokenize(book_py)
#pd.DataFrame(default_tagger.tag(books_data_py)).head
##python chunk
from nltk.corpus import brown

# Split the tagged Brown sentences: first 90% for training, the rest for testing.
size = int(len(brown_tagged_sents) * 0.9)
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]

# Stand-alone unigram tagger with no backoff.
unigram_tagger = nltk.UnigramTagger(train_sents)

# Backoff chain: the bigram tagger (t2) falls back to the unigram tagger (t1),
# which falls back to the default 'NN' tagger (t0).
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
Use the .evaluate function on your testing data to determine the accuracy of your tagger.
##python chunk
# Score each tagger on the held-out test sentences.
# Unigram tagger without any backoff:
unigram_tagger.evaluate(test_sents)
## 0.8849353534083527
# Unigram tagger backing off to the default 'NN' tagger:
t1.evaluate(test_sents)
## 0.8912742817627459
# Bigram tagger backing off to t1:
t2.evaluate(test_sents)
## 0.9125751765470128
Use tagger.tag(book)[1:10] to print out only the first ten tags.
##python chunk
import nltk

# Tokenise the book text, then tag it with the stand-alone unigram tagger.
token_py = nltk.word_tokenize(book_py)
book_py_tag = unigram_tagger.tag(token_py)

# Display rows 1-9 of the (token, tag) pairs.
pd.DataFrame(book_py_tag).iloc[1:10]
## 0 1
## 1 '\u3000\u3000OWL None
## 2 POST\u3000\u3000Harry None
## 3 Potter NP
## 4 was BEDZ
## 5 a AT
## 6 highly QL
## 7 unusual JJ
## 8 boy NN
## 9 in IN
# Tag the same tokens with t1 (unigram tagger backing off to the 'NN' default).
book_py_token_tag_t1 = t1.tag(token_py)

# Display rows 1-9 of the (token, tag) pairs.
pd.DataFrame(book_py_token_tag_t1).iloc[1:10]
## 0 1
## 1 '\u3000\u3000OWL NN
## 2 POST\u3000\u3000Harry NN
## 3 Potter NP
## 4 was BEDZ
## 5 a AT
## 6 highly QL
## 7 unusual JJ
## 8 boy NN
## 9 in IN
# Tag the tokens with t2, the bigram tagger that backs off to t1. The t2
# result must come from t2 itself; calling t1 here would only repeat the
# unigram-with-backoff tagging.
book_py_token_tag_t2 = t2.tag(token_py)

# Display rows 1-9 of the (token, tag) pairs.
pd.DataFrame(book_py_token_tag_t2)[1:10]
## 0 1
## 1 '\u3000\u3000OWL NN
## 2 POST\u3000\u3000Harry NN
## 3 Potter NP
## 4 was BEDZ
## 5 a AT
## 6 highly QL
## 7 unusual JJ
## 8 boy NN
## 9 in IN