NLP_intro

Objective: A quick introduction to NLP: word and sentence tokenization, named entity recognition (NER), and part-of-speech (POS) tagging.

Data Source: http://www.anb.org/articles/16/16-03109.html

Data Description: Biography excerpt of Jarena Lee.

Approach: Use NLP techniques to explore the data set: tokenize the text into sentences and words, recognize named entities (people, locations, and organizations), and then assign POS tags to the word tokens.

1.) Annotating Sentences and Words

## Create annotators for words and sentences
## NOTE(review): the Maxent_* constructors are not defined in the visible
## code; presumably they come from the openNLP package -- confirm that the
## corresponding library() calls run before this chunk.
word_ann <- Maxent_Word_Token_Annotator()
sent_ann <- Maxent_Sent_Token_Annotator()

## Create annotation of the biography.
## NOTE(review): `bio` is not defined in the visible code; presumably it holds
## the biography text loaded in an earlier (hidden) chunk -- confirm.
## The sentence annotator is listed ahead of the word annotator.
bio_annotations <- annotate(bio, list(sent_ann, word_ann))
## Preview the first few annotations.
head(bio_annotations)

##  id type     start end features
##   1 sentence     1 110 constituents=<<integer,20>>
##   2 sentence   112 240 constituents=<<integer,20>>
##   3 sentence   242 386 constituents=<<integer,25>>
##   4 sentence   388 412 constituents=<<integer,6>>
##   5 sentence   414 693 constituents=<<integer,49>>
##   6 sentence   695 791 constituents=<<integer,19>>

## Bundle the raw text and its annotations into a single document object.
bio_doc <- AnnotatedPlainTextDocument(bio, bio_annotations)

## Show the first two sentences of the biography (one token vector each).
head(sents(bio_doc), 2)

## [[1]]
##  [1] "In"           "1804"         ","            "after"       
##  [5] "several"      "months"       "of"           "profound"    
##  [9] "spiritual"    "anxiety"      ","            "Jarena"      
## [13] "Lee"          "moved"        "from"         "New"         
## [17] "Jersey"       "to"           "Philadelphia" "."           
## 
## [[2]]
##  [1] "There"         "she"           "labored"       "as"           
##  [5] "a"             "domestic"      "and"           "worshiped"    
##  [9] "among"         "white"         "congregations" "of"           
## [13] "Roman"         "Catholics"     "and"           "mixed"        
## [17] "congregations" "of"            "Methodists"    "."

## Show the first ten word tokens of the biography.
head(words(bio_doc), 10)

##  [1] "In"        "1804"      ","         "after"     "several"  
##  [6] "months"    "of"        "profound"  "spiritual" "anxiety"

2.) Annotating People and Places

#   NOTE: Uses named entity recognition (NER)

## Create annotators for people, locations, and organizations
## (one entity annotator per `kind`).
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")




# Create pipeline for sentences, words, people, locations, and organizations
pipeline <- list(sent_ann, word_ann, person_ann, location_ann, organization_ann)
# Re-annotate the biography with the full pipeline. This overwrites the
# earlier `bio_annotations` and `bio_doc`, which only had sentences and words.
bio_annotations <- annotate(bio, pipeline)
bio_doc <- AnnotatedPlainTextDocument(bio, bio_annotations)

3.) Create a function that extracts the desired entities from an annotated document.

# Pull the text of entity annotations out of an annotated document.
#
# Arguments:
#   doc  - annotated document; its `content` element and the first element of
#          annotations(doc) are used.
#   kind - optional; when supplied, only annotations whose "kind" feature
#          equals this value are kept (e.g. "person", "location").
#
# Returns doc$content subset by the selected annotations. Without `kind`,
# every annotation of type "entity" is selected.
entities <- function(doc, kind) {
  text <- doc$content
  ann <- annotations(doc)[[1]]

  # No kind requested: return every entity annotation.
  if (!hasArg(kind)) {
    return(text[ann[ann$type == "entity"]])
  }

  # Look up the "kind" feature of each annotation and keep the matches.
  ann_kinds <- sapply(ann$features, `[[`, "kind")
  text[ann[ann_kinds == kind]]
}





# NOTE: the previous version piped each entity vector through sapply(length),
# which returns 1 for every element of a character vector and therefore never
# produced a total or a distinct count. The counts below use table(),
# length() and unique() on the whole vector instead.

# extract all person entities from [bio_doc] using the entities() helper
ent_people <- entities(bio_doc, kind = "person")

# number of mentions per person
table(ent_people)

## ent_people
##    Jarena Lee    Joseph Lee           Lee Richard Allen 
##             1             1             1             1

# total people mentioned
length(ent_people)

## [1] 4

# unique people mentioned
length(unique(ent_people))

## [1] 4

# extract all location entities from [bio_doc] using the entities() helper
ent_locations <- entities(bio_doc, kind = "location")

# number of mentions per place
table(ent_locations)

## ent_locations
##   New Jersey Philadelphia 
##            2            1

# total places mentioned
length(ent_locations)

## [1] 3

# unique places mentioned
length(unique(ent_locations))

## [1] 2

# extract all organization entities from [bio_doc] using the entities() helper
ent_organization <- entities(bio_doc, kind = "organization")

# number of mentions per organization
table(ent_organization)

## ent_organization
## Bethel African Methodist Episcopal Church 
##                                         1

# total organizations mentioned
length(ent_organization)

## [1] 1

# unique organizations mentioned
length(unique(ent_organization))

## [1] 1

4.) Create POS tags

# Create the POS tag annotator and run it, passing the existing
# annotations (`bio_annotations`) as the third argument.
pos_ann <- Maxent_POS_Tag_Annotator()
bio_annotations2 <- annotate(bio, pos_ann, bio_annotations)

# Determine the distribution of POS tags for word tokens.
# Keep only the annotations of type "word", then pull the "POS" feature out
# of each one.
bio_annotations2_words <- subset(bio_annotations2, type == "word")
tags <- sapply(bio_annotations2_words$features, `[[`, "POS")
tags

##   [1] "IN"   "CD"   ","    "IN"   "JJ"   "NNS"  "IN"   "JJ"   "JJ"   "NN"  
##  [11] ","    "NNP"  "NNP"  "VBD"  "IN"   "NNP"  "NNP"  "TO"   "NNP"  "."   
##  [21] "EX"   "PRP"  "VBD"  "IN"   "DT"   "JJ"   "CC"   "JJ"   "IN"   "JJ"  
##  [31] "NNS"  "IN"   "NNP"  "NNPS" "CC"   "JJ"   "NNS"  "IN"   "NNS"  "."   
##  [41] "IN"   "VBG"  "DT"   "VBN"  "NN"   "IN"   "DT"   "NNP"  "NNP"  "NNP" 
##  [51] ","    "NN"   "IN"   "DT"   "NNP"  "NNP"  "NNP"  "NNP"  "NNP"  ","   
##  [61] "NNP"  "VBD"  "DT"   "NNPS" "."    "PRP"  "VBD"  "VBN"  "IN"   "CD"  
##  [71] "."    "RB"   "TO"   "PRP$" "NN"   ","    "PRP"  "VBD"  "DT"   "JJ"  
##  [81] "JJ"   "CC"   "JJ"   "NNS"  "IN"   "NN"   ":"    "JJ"   "NNS"  "IN"  
##  [91] "NNS"  "CC"   "JJ"   "NN"   ":"    "JJ"   "NNS"  "IN"   "NN"   "CC"  
## [101] "NN"   ":"    "VBN"  "NNS"  "IN"   "NN"   ","    "VBG"  ","    "CC"  
## [111] "NN"   ":"    "NN"   "CC"   "NN"   ":"    "NN"   "CC"   "NN"   "."   
## [121] "IN"   "CD"   "PRP"  "VBD"  "NNP"  "NNP"  ","    "WP"   "VBD"  "DT"  
## [131] "JJ"   "NN"   "IN"   "NNP"  "NNP"  ","    "NNP"  "NNP"  "."    "PRP" 
## [141] "VBD"  "CD"   "NNS"  ","    "CD"   "IN"   "WP"   "VBD"  "IN"   "NN"  
## [151] "."

# Count how many times each POS tag occurs.
table(tags)

## tags
##    ,    :    .   CC   CD   DT   EX   IN   JJ   NN  NNP NNPS  NNS  PRP PRP$ 
##   10    5    7    8    5    7    1   20   14   16   21    2   10    5    1 
##   RB   TO  VBD  VBG  VBN   WP 
##    1    2    9    2    3    2

# Pair every word token with its POS tag, formatted as "token/TAG".
paste(bio[bio_annotations2_words], tags, sep = "/")

##   [1] "In/IN"               "1804/CD"             ",/,"                
##   [4] "after/IN"            "several/JJ"          "months/NNS"         
##   [7] "of/IN"               "profound/JJ"         "spiritual/JJ"       
##  [10] "anxiety/NN"          ",/,"                 "Jarena/NNP"         
##  [13] "Lee/NNP"             "moved/VBD"           "from/IN"            
##  [16] "New/NNP"             "Jersey/NNP"          "to/TO"              
##  [19] "Philadelphia/NNP"    "./."                 "There/EX"           
##  [22] "she/PRP"             "labored/VBD"         "as/IN"              
##  [25] "a/DT"                "domestic/JJ"         "and/CC"             
##  [28] "worshiped/JJ"        "among/IN"            "white/JJ"           
##  [31] "congregations/NNS"   "of/IN"               "Roman/NNP"          
##  [34] "Catholics/NNPS"      "and/CC"              "mixed/JJ"           
##  [37] "congregations/NNS"   "of/IN"               "Methodists/NNS"     
##  [40] "./."                 "On/IN"               "hearing/VBG"        
##  [43] "an/DT"               "inspired/VBN"        "sermon/NN"          
##  [46] "by/IN"               "the/DT"              "Reverend/NNP"       
##  [49] "Richard/NNP"         "Allen/NNP"           ",/,"                
##  [52] "founder/NN"          "of/IN"               "the/DT"             
##  [55] "Bethel/NNP"          "African/NNP"         "Methodist/NNP"      
##  [58] "Episcopal/NNP"       "Church/NNP"          ",/,"                
##  [61] "Lee/NNP"             "joined/VBD"          "the/DT"             
##  [64] "Methodists/NNPS"     "./."                 "She/PRP"            
##  [67] "was/VBD"             "baptized/VBN"        "in/IN"              
##  [70] "1807/CD"             "./."                 "Prior/RB"           
##  [73] "to/TO"               "her/PRP$"            "baptism/NN"         
##  [76] ",/,"                 "she/PRP"             "experienced/VBD"    
##  [79] "the/DT"              "various/JJ"          "physical/JJ"        
##  [82] "and/CC"              "emotional/JJ"        "stages/NNS"         
##  [85] "of/IN"               "conversion/NN"       ":/:"                
##  [88] "terrifying/JJ"       "visions/NNS"         "of/IN"              
##  [91] "demons/NNS"          "and/CC"              "eternal/JJ"         
##  [94] "perdition/NN"        ";/:"                 "extreme/JJ"         
##  [97] "feelings/NNS"        "of/IN"               "ecstasy/NN"         
## [100] "and/CC"              "depression/NN"       ";/:"                
## [103] "protracted/VBN"      "periods/NNS"         "of/IN"              
## [106] "meditation/NN"       ",/,"                 "fasting/VBG"        
## [109] ",/,"                 "and/CC"              "prayer/NN"          
## [112] ";/:"                 "ennui/NN"            "and/CC"             
## [115] "fever/NN"            ";/:"                 "energy/NN"          
## [118] "and/CC"              "vigor/NN"            "./."                
## [121] "In/IN"               "1811/CD"             "she/PRP"            
## [124] "married/VBD"         "Joseph/NNP"          "Lee/NNP"            
## [127] ",/,"                 "who/WP"              "pastored/VBD"       
## [130] "an/DT"               "African-American/JJ" "church/NN"          
## [133] "in/IN"               "Snow/NNP"            "Hill/NNP"           
## [136] ",/,"                 "New/NNP"             "Jersey/NNP"         
## [139] "./."                 "They/PRP"            "had/VBD"            
## [142] "six/CD"              "children/NNS"        ",/,"                
## [145] "four/CD"             "of/IN"               "whom/WP"            
## [148] "died/VBD"            "in/IN"               "infancy/NN"         
## [151] "./."

# Extract pairs of word tokens and POS tags for the third sentence
# (the sentence annotations are indexed with [3L] below):
bio_annotations2_words_sent <-  annotations_in_spans(subset(bio_annotations2, type == "word"),
                                subset(bio_annotations2, type == "sentence")[3L])[[1L]]
# Format each selected word as "token/TAG".
sprintf("%s/%s", bio[bio_annotations2_words_sent], sapply(bio_annotations2_words_sent$features, `[[`, "POS"))

##  [1] "On/IN"           "hearing/VBG"     "an/DT"          
##  [4] "inspired/VBN"    "sermon/NN"       "by/IN"          
##  [7] "the/DT"          "Reverend/NNP"    "Richard/NNP"    
## [10] "Allen/NNP"       ",/,"             "founder/NN"     
## [13] "of/IN"           "the/DT"          "Bethel/NNP"     
## [16] "African/NNP"     "Methodist/NNP"   "Episcopal/NNP"  
## [19] "Church/NNP"      ",/,"             "Lee/NNP"        
## [22] "joined/VBD"      "the/DT"          "Methodists/NNPS"
## [25] "./."

NLP_intro

Rich Leung

December 30, 2015

1.) Annotating Sentences and Words

2.) Annotating People and Places

3.) Create a function that extracts the desired entities from an annotated document.

4.) Create POS tags