Objective: A quick introduction to NLP. Word and sentence tokenization. POS tagging.

Data Source: http://www.anb.org/articles/16/16-03109.html

Data Description: Biography excerpt of Jarena Lee.

Approach: Use NLP techniques to explore the data set by tokenizing it into sentences and words and by recognizing named entities (people, locations, and organizations). Then create part-of-speech (POS) tags for the word tokens.

1.) Annotating Sentences and Words

## Build the sentence and word token annotators
sent_ann <- Maxent_Sent_Token_Annotator()
word_ann <- Maxent_Word_Token_Annotator()

## Annotate the biography with both annotators (sentence annotator listed
## first, word annotator second) and preview the first annotations.
bio_annotations <- bio %>%
  annotate(list(sent_ann, word_ann))
bio_annotations %>%
  head()
##  id type     start end features
##   1 sentence     1 110 constituents=<<integer,20>>
##   2 sentence   112 240 constituents=<<integer,20>>
##   3 sentence   242 386 constituents=<<integer,25>>
##   4 sentence   388 412 constituents=<<integer,6>>
##   5 sentence   414 693 constituents=<<integer,49>>
##   6 sentence   695 791 constituents=<<integer,19>>
## Wrap the biography text and its annotations into one annotated document.
bio_doc <- AnnotatedPlainTextDocument(bio, bio_annotations)

## Show the first 2 sentences of the biography
head(sents(bio_doc), 2)
## [[1]]
##  [1] "In"           "1804"         ","            "after"       
##  [5] "several"      "months"       "of"           "profound"    
##  [9] "spiritual"    "anxiety"      ","            "Jarena"      
## [13] "Lee"          "moved"        "from"         "New"         
## [17] "Jersey"       "to"           "Philadelphia" "."           
## 
## [[2]]
##  [1] "There"         "she"           "labored"       "as"           
##  [5] "a"             "domestic"      "and"           "worshiped"    
##  [9] "among"         "white"         "congregations" "of"           
## [13] "Roman"         "Catholics"     "and"           "mixed"        
## [17] "congregations" "of"            "Methodists"    "."
## Show the first 10 word tokens of the biography
head(words(bio_doc), 10)
##  [1] "In"        "1804"      ","         "after"     "several"  
##  [6] "months"    "of"        "profound"  "spiritual" "anxiety"

2.) Annotating People, Places, and Organizations

# NOTE: this section relies on named entity recognition (NER).

## Create one entity annotator per kind: people, locations, organizations
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")

# Assemble the pipeline (sentences, words, people, locations, organizations),
# then re-annotate the biography and rebuild the annotated document.
# This overwrites the earlier bio_annotations and bio_doc.
pipeline <- list(sent_ann, word_ann, person_ann, location_ann, organization_ann)
bio_annotations <- bio %>%
  annotate(pipeline)
bio_doc <- bio %>%
  AnnotatedPlainTextDocument(bio_annotations)

3.) Create a function that extracts the desired entities from an annotated document.

# Extract entities from an AnnotatedPlainTextDocument.
#
# Args:
#   doc:  annotated document; the function reads `doc$content` and the first
#         element of `annotations(doc)`.
#   kind: optional character vector of entity kinds to keep (e.g. "person").
#         When omitted, every annotation whose type is "entity" is used.
#
# Returns:
#   The pieces of `doc$content` covered by the selected annotations
#   (one element per mention, so repeated mentions appear repeatedly).
entities <- function(doc, kind) {
  s <- doc$content
  # NOTE(review): newer NLP releases may expose a single annotation object via
  # annotation(doc) instead of a list -- confirm against the installed version.
  a <- annotations(doc)[[1]]

  if (missing(kind)) {
    return(s[a[a$type == "entity"]])
  }

  # Annotations without a "kind" feature yield NULL from `[[`. Map those to NA
  # explicitly so the result is always a character vector, rather than relying
  # on comparing a list that contains NULLs against a string.
  k <- vapply(
    a$features,
    function(f) {
      v <- f[["kind"]]
      if (is.null(v)) NA_character_ else as.character(v)[1L]
    },
    character(1)
  )

  # %in% never yields NA and also supports several kinds at once.
  s[a[!is.na(k) & k %in% kind]]
}





# extract all of the person entities from [bio_doc] using created entities()
ent_people <- entities(bio_doc, kind = "person")

# total people mentioned (one element per mention, so count the elements;
# sapply(length) would only report 1 for each individual mention)
ent_people %>% 
  length()
## [1] 4
# unique people mentioned (distinct name strings)
ent_people %>% 
  unique() %>% 
  length()
## [1] 4
# extract all of the location entities from [bio_doc] using created entities()
ent_locations <- entities(bio_doc, kind = "location")

# total places mentioned (one element per mention, so count the elements;
# sapply(length) would only report 1 for each individual mention)
ent_locations %>% 
  length()
## [1] 3
# unique places mentioned (distinct name strings; "New Jersey" occurs twice)
ent_locations %>% 
  unique() %>% 
  length()
## [1] 2
# extract all of the organization entities from [bio_doc] using created entities()
ent_organization <- entities(bio_doc, kind = "organization")

# total organizations mentioned (one element per mention, so count the
# elements; sapply(length) would only report 1 for each individual mention)
ent_organization %>% 
  length()
## [1] 1
# unique organizations mentioned (distinct name strings)
ent_organization %>% 
  unique() %>% 
  length()
## [1] 1

4.) Create POS tags

# Add POS tags on top of the existing annotations (passed as third argument).
pos_ann <- Maxent_POS_Tag_Annotator()
bio_annotations2 <- bio %>%
  annotate(pos_ann, bio_annotations)

# Keep only the word tokens and pull the "POS" feature out of each one,
# then display the full tag sequence.
bio_annotations2_words <- subset(bio_annotations2, type == "word")
tags <- sapply(
  bio_annotations2_words$features,
  function(feature) feature[["POS"]]
)
print(tags)
##   [1] "IN"   "CD"   ","    "IN"   "JJ"   "NNS"  "IN"   "JJ"   "JJ"   "NN"  
##  [11] ","    "NNP"  "NNP"  "VBD"  "IN"   "NNP"  "NNP"  "TO"   "NNP"  "."   
##  [21] "EX"   "PRP"  "VBD"  "IN"   "DT"   "JJ"   "CC"   "JJ"   "IN"   "JJ"  
##  [31] "NNS"  "IN"   "NNP"  "NNPS" "CC"   "JJ"   "NNS"  "IN"   "NNS"  "."   
##  [41] "IN"   "VBG"  "DT"   "VBN"  "NN"   "IN"   "DT"   "NNP"  "NNP"  "NNP" 
##  [51] ","    "NN"   "IN"   "DT"   "NNP"  "NNP"  "NNP"  "NNP"  "NNP"  ","   
##  [61] "NNP"  "VBD"  "DT"   "NNPS" "."    "PRP"  "VBD"  "VBN"  "IN"   "CD"  
##  [71] "."    "RB"   "TO"   "PRP$" "NN"   ","    "PRP"  "VBD"  "DT"   "JJ"  
##  [81] "JJ"   "CC"   "JJ"   "NNS"  "IN"   "NN"   ":"    "JJ"   "NNS"  "IN"  
##  [91] "NNS"  "CC"   "JJ"   "NN"   ":"    "JJ"   "NNS"  "IN"   "NN"   "CC"  
## [101] "NN"   ":"    "VBN"  "NNS"  "IN"   "NN"   ","    "VBG"  ","    "CC"  
## [111] "NN"   ":"    "NN"   "CC"   "NN"   ":"    "NN"   "CC"   "NN"   "."   
## [121] "IN"   "CD"   "PRP"  "VBD"  "NNP"  "NNP"  ","    "WP"   "VBD"  "DT"  
## [131] "JJ"   "NN"   "IN"   "NNP"  "NNP"  ","    "NNP"  "NNP"  "."    "PRP" 
## [141] "VBD"  "CD"   "NNS"  ","    "CD"   "IN"   "WP"   "VBD"  "IN"   "NN"  
## [151] "."
# Tabulate how many word tokens carry each POS tag.
table(tags)
## tags
##    ,    :    .   CC   CD   DT   EX   IN   JJ   NN  NNP NNPS  NNS  PRP PRP$ 
##   10    5    7    8    5    7    1   20   14   16   21    2   10    5    1 
##   RB   TO  VBD  VBG  VBN   WP 
##    1    2    9    2    3    2
# Pair every word token with its POS tag, formatted as "token/TAG"
bio[bio_annotations2_words] %>%
  sprintf("%s/%s", ., tags)
##   [1] "In/IN"               "1804/CD"             ",/,"                
##   [4] "after/IN"            "several/JJ"          "months/NNS"         
##   [7] "of/IN"               "profound/JJ"         "spiritual/JJ"       
##  [10] "anxiety/NN"          ",/,"                 "Jarena/NNP"         
##  [13] "Lee/NNP"             "moved/VBD"           "from/IN"            
##  [16] "New/NNP"             "Jersey/NNP"          "to/TO"              
##  [19] "Philadelphia/NNP"    "./."                 "There/EX"           
##  [22] "she/PRP"             "labored/VBD"         "as/IN"              
##  [25] "a/DT"                "domestic/JJ"         "and/CC"             
##  [28] "worshiped/JJ"        "among/IN"            "white/JJ"           
##  [31] "congregations/NNS"   "of/IN"               "Roman/NNP"          
##  [34] "Catholics/NNPS"      "and/CC"              "mixed/JJ"           
##  [37] "congregations/NNS"   "of/IN"               "Methodists/NNS"     
##  [40] "./."                 "On/IN"               "hearing/VBG"        
##  [43] "an/DT"               "inspired/VBN"        "sermon/NN"          
##  [46] "by/IN"               "the/DT"              "Reverend/NNP"       
##  [49] "Richard/NNP"         "Allen/NNP"           ",/,"                
##  [52] "founder/NN"          "of/IN"               "the/DT"             
##  [55] "Bethel/NNP"          "African/NNP"         "Methodist/NNP"      
##  [58] "Episcopal/NNP"       "Church/NNP"          ",/,"                
##  [61] "Lee/NNP"             "joined/VBD"          "the/DT"             
##  [64] "Methodists/NNPS"     "./."                 "She/PRP"            
##  [67] "was/VBD"             "baptized/VBN"        "in/IN"              
##  [70] "1807/CD"             "./."                 "Prior/RB"           
##  [73] "to/TO"               "her/PRP$"            "baptism/NN"         
##  [76] ",/,"                 "she/PRP"             "experienced/VBD"    
##  [79] "the/DT"              "various/JJ"          "physical/JJ"        
##  [82] "and/CC"              "emotional/JJ"        "stages/NNS"         
##  [85] "of/IN"               "conversion/NN"       ":/:"                
##  [88] "terrifying/JJ"       "visions/NNS"         "of/IN"              
##  [91] "demons/NNS"          "and/CC"              "eternal/JJ"         
##  [94] "perdition/NN"        ";/:"                 "extreme/JJ"         
##  [97] "feelings/NNS"        "of/IN"               "ecstasy/NN"         
## [100] "and/CC"              "depression/NN"       ";/:"                
## [103] "protracted/VBN"      "periods/NNS"         "of/IN"              
## [106] "meditation/NN"       ",/,"                 "fasting/VBG"        
## [109] ",/,"                 "and/CC"              "prayer/NN"          
## [112] ";/:"                 "ennui/NN"            "and/CC"             
## [115] "fever/NN"            ";/:"                 "energy/NN"          
## [118] "and/CC"              "vigor/NN"            "./."                
## [121] "In/IN"               "1811/CD"             "she/PRP"            
## [124] "married/VBD"         "Joseph/NNP"          "Lee/NNP"            
## [127] ",/,"                 "who/WP"              "pastored/VBD"       
## [130] "an/DT"               "African-American/JJ" "church/NN"          
## [133] "in/IN"               "Snow/NNP"            "Hill/NNP"           
## [136] ",/,"                 "New/NNP"             "Jersey/NNP"         
## [139] "./."                 "They/PRP"            "had/VBD"            
## [142] "six/CD"              "children/NNS"        ",/,"                
## [145] "four/CD"             "of/IN"               "whom/WP"            
## [148] "died/VBD"            "in/IN"               "infancy/NN"         
## [151] "./."
# Extract pairs of word tokens and POS tags for the third sentence
# (the sentence annotations are indexed with 3L below):
bio_annotations2_words_sent <- annotations_in_spans(
  subset(bio_annotations2, type == "word"),
  subset(bio_annotations2, type == "sentence")[3L]
)[[1L]]
# vapply() pins the result to exactly one tag string per token.
sprintf(
  "%s/%s",
  bio[bio_annotations2_words_sent],
  vapply(bio_annotations2_words_sent$features, `[[`, character(1), "POS")
)
##  [1] "On/IN"           "hearing/VBG"     "an/DT"          
##  [4] "inspired/VBN"    "sermon/NN"       "by/IN"          
##  [7] "the/DT"          "Reverend/NNP"    "Richard/NNP"    
## [10] "Allen/NNP"       ",/,"             "founder/NN"     
## [13] "of/IN"           "the/DT"          "Bethel/NNP"     
## [16] "African/NNP"     "Methodist/NNP"   "Episcopal/NNP"  
## [19] "Church/NNP"      ",/,"             "Lee/NNP"        
## [22] "joined/VBD"      "the/DT"          "Methodists/NNPS"
## [25] "./."