Objective: A quick introduction to NLP. Word and sentence tokenization. POS tagging.
Data Source: http://www.anb.org/articles/16/16-03109.html
Data Description: Biography excerpt of Jarena Lee.
Approach: Use NLP techniques to explore the data set by tokenizing it into sentences and words and by extracting person, location, and organization entities. Then create POS tags.
# Build the sentence and word annotators ----
sent_ann <- Maxent_Sent_Token_Annotator()
word_ann <- Maxent_Word_Token_Annotator()
# Annotate the biography; the pipeline lists the sentence annotator first,
# then the word annotator.
bio_annotations <- annotate(bio, list(sent_ann, word_ann))
head(bio_annotations)
## id type start end features
## 1 sentence 1 110 constituents=<<integer,20>>
## 2 sentence 112 240 constituents=<<integer,20>>
## 3 sentence 242 386 constituents=<<integer,25>>
## 4 sentence 388 412 constituents=<<integer,6>>
## 5 sentence 414 693 constituents=<<integer,49>>
## 6 sentence 695 791 constituents=<<integer,19>>
# Pair the raw text with its annotations in a single document object.
bio_doc <- AnnotatedPlainTextDocument(bio, bio_annotations)
# Show the first 2 sentences (each sentence is printed as its word tokens).
head(sents(bio_doc), 2)
## [[1]]
## [1] "In" "1804" "," "after"
## [5] "several" "months" "of" "profound"
## [9] "spiritual" "anxiety" "," "Jarena"
## [13] "Lee" "moved" "from" "New"
## [17] "Jersey" "to" "Philadelphia" "."
##
## [[2]]
## [1] "There" "she" "labored" "as"
## [5] "a" "domestic" "and" "worshiped"
## [9] "among" "white" "congregations" "of"
## [13] "Roman" "Catholics" "and" "mixed"
## [17] "congregations" "of" "Methodists" "."
# Show the first 10 word tokens of the biography.
head(words(bio_doc), 10)
## [1] "In" "1804" "," "after" "several"
## [6] "months" "of" "profound" "spiritual" "anxiety"
# NOTE: Uses named entity recognition (NER)
# Create one entity annotator per kind: people, locations, and organizations.
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")
# Create pipeline for sentences, words, people, locations, and organizations.
# NOTE(review): the sentence and word annotators are listed first; presumably
# the entity annotators depend on their output -- confirm against openNLP docs.
pipeline <- list(sent_ann, word_ann, person_ann, location_ann, organization_ann)
# Re-annotate the biography with the full pipeline. This reassigns
# bio_annotations and bio_doc, so both now include the entity annotations.
bio_annotations <- annotate(bio, pipeline)
bio_doc <- AnnotatedPlainTextDocument(bio, bio_annotations)
# Extract entities from an AnnotatedPlainTextDocument.
#
# Arguments:
#   doc  - document whose `content` and first `annotations()` element are read.
#   kind - optional single string; when supplied, only annotations whose
#          "kind" feature equals it are kept (e.g. "person", "location").
#          When omitted, every annotation of type "entity" is kept.
#
# Returns the document content subset by the selected annotations
# (presumably one string per matched entity -- confirm against the NLP
# package's String subsetting).
entities <- function(doc, kind) {
  s <- doc$content
  a <- annotations(doc)[[1]]

  # No kind requested: return every entity annotation regardless of kind.
  if (missing(kind)) {
    return(s[a[a$type == "entity"]])
  }

  # Annotations without a "kind" feature (e.g. sentences, words) yield NULL
  # from `[[`; map those to NA so the result is always a character vector
  # instead of the list that sapply() would produce.
  kinds <- vapply(
    a$features,
    function(feature) {
      value <- feature[["kind"]]
      if (is.null(value)) NA_character_ else as.character(value)[1L]
    },
    character(1)
  )

  # NA-safe mask: annotations lacking a kind never match.
  s[a[!is.na(kinds) & kinds == kind]]
}
# Extract all person entities from [bio_doc] using entities().
ent_people <- entities(bio_doc, kind = "person")
# Total people mentioned: length of the whole vector, repeats included.
# (sapply(length) would instead report 1 for every individual mention.)
ent_people %>%
  length()
## [1] 4
# Unique people mentioned: de-duplicate the whole vector, then count.
ent_people %>%
  unique() %>%
  length()
## [1] 4
# Extract all location entities from [bio_doc] using entities().
ent_locations <- entities(bio_doc, kind = "location")
# Total places mentioned.
ent_locations %>%
  length()
## [1] 3
# Unique places mentioned ("New Jersey" appears twice, so it counts once).
ent_locations %>%
  unique() %>%
  length()
## [1] 2
# Extract all organization entities from [bio_doc] using entities().
ent_organization <- entities(bio_doc, kind = "organization")
# Total organizations mentioned.
ent_organization %>%
  length()
## [1] 1
# Unique organizations mentioned.
ent_organization %>%
  unique() %>%
  length()
## [1] 1