Introduction

The goal of this document is to show a sample script for pattern-based entity recognition over text documents using the openNLP (natural language processing) and the tm (text mining) packages in R.

I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks to everyone for sharing.

Preparation

Check working directory

Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.

# Print the current working directory.
getwd()
## [1] "/Users/raul/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R"
# Switch to the author's course folder; this path is machine-specific and
# must be adapted before running the script elsewhere.
setwd("~/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R")

Load libraries

Now we load the required libraries. Only a couple of things to mention:

  • Calling the annotate function (which belongs to the NLP package that openNLP builds on) requires explicitly including the package name (i.e., NLP::annotate) whenever ggplot2 is also loaded, due to a name clash with ggplot2's own annotate
  • We need to increase the memory allocated to Java to avoid out-of-memory problems
# Needed for OutOfMemoryError: Java heap space 
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging

library(NLP) 
library(openNLP) 
library(openNLPmodels.en)
library(tm)
library(stringr)

Auxiliary functions

getAnnotationsFromDocument

getAnnotationsFromDocument returns annotations for the text document: sentence, word, and part-of-speech annotations. (Penn Treebank parse annotations are not produced, because the function does not run a parse annotator.)

As an alternative, the koRpus package uses TreeTagger for POS tagging.

# Annotates one document with sentence, word and part-of-speech information.
#
# doc: a text document (anything as.String can convert to a single string).
# Returns the annotation object produced by the POS tagging pass, i.e. the
# sentence and word annotations with a POS feature added to the words.
getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)
  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  word_token_annotator <- Maxent_Word_Token_Annotator()
  pos_tag_annotator <- Maxent_POS_Tag_Annotator()
  # Qualify annotate(): ggplot2 exports a function with the same name, so an
  # unqualified call resolves to the wrong function when ggplot2 is attached
  # after NLP.
  tokens <- NLP::annotate(text,
                          list(sent_token_annotator, word_token_annotator))
  NLP::annotate(text, pos_tag_annotator, tokens)
}

getAnnotatedMergedDocument

getAnnotatedMergedDocument returns the text document merged with the annotations.

# Renders a document as one string of space-separated "token/TAG" pairs.
#
# doc:         the text document.
# annotations: its annotations; only entries of type "word" are used and
#              their POS feature supplies the tag.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  pos <- sapply(words$features, `[[`, "POS")
  # Indexing the String with the word annotations yields the token texts.
  paste(sprintf("%s/%s", text[words], pos), collapse = " ")
}

getAnnotatedPlainTextDocument

getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.

# Bundles a document's text and its annotations into a single
# AnnotatedPlainTextDocument.
#
# doc:         the text document.
# annotations: the annotations previously computed for that document.
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

detectPatternOnDocument

detectPatternOnDocument returns the pattern detected on an AnnotatedPlainTextDocument.

# Finds every occurrence of a regular expression in a document.
#
# doc:     the document to search (converted with as.String).
# pattern: a regular expression, optionally with capture groups.
# Returns NA when nothing matches; otherwise a single string in which the
# matches are joined by ", ". Without capture groups the whole match is
# reported; with one group, that group; with several groups, the groups of
# each match joined by a space.
detectPatternOnDocument <- function(doc, pattern) {
  text <- as.String(doc)
  hits <- str_match_all(text, pattern)[[1]]
  n_hits <- dim(hits)[1]
  n_cols <- dim(hits)[2]

  # No rows means the pattern was not found at all.
  if (n_hits == 0) {
    return(NA)
  }

  if (n_cols == 1) {
    # Column 1 is the full match; there are no capture groups.
    found <- hits[, 1]
  } else if (n_cols == 2) {
    found <- hits[, 2]
  } else {
    # Several capture groups: glue the groups of each match together.
    found <- apply(hits[, 2:n_cols, drop = FALSE], 1, paste, collapse = " ")
  }
  paste(found, collapse = ", ")
}

detectPatternOnDocumentWithContext

detectPatternOnDocumentWithContext returns the pattern detected on an AnnotatedPlainTextDocument with some context.

# Returns the first occurrence of a pattern together with surrounding text.
#
# doc:     the document to search (converted with as.String).
# pattern: a regular expression.
# context: number of characters to include before and after the match
#          (defaults to 50, the value that used to be hard-coded).
# Only the first match is reported, because str_locate() stops at the first
# occurrence. When the pattern is absent the located positions are NA and
# the result is NA as well.
detectPatternOnDocumentWithContext <- function(doc, pattern, context = 50) {
  text <- as.String(doc)
  span <- str_locate(text, pattern)
  # span[1] is the match start and span[2] the match end.
  substr(text, span[1] - context, span[2] + context)
}

detectPatternsInCorpus

detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.

# Runs every pattern over every document of a corpus.
#
# corpus:   the documents to search; each must expose an id through meta().
# patterns: character vector of regular expressions.
# Returns a data frame with one row per document: a "File" column holding
# the document id and one column per pattern (named after the pattern)
# holding the result of detectPatternOnDocument (NA when nothing matched).
detectPatternsInCorpus <- function(corpus, patterns) {
  n_docs <- length(corpus)
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = n_docs))
  names(vallEntities) <- c("File", patterns)

  # With no documents there is nothing to fill in; assigning the empty
  # result of unlist() to a column would delete that column instead.
  if (n_docs == 0) {
    return(vallEntities)
  }

  # seq_along()/seq_len() skip the loop for empty input, unlike 1:length(x).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(corpus, detectPatternOnDocument,
                                          pattern = patterns[i]))
  }
  for (i in seq_len(n_docs)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}

detectPatternsInTaggedCorpus

detectPatternsInTaggedCorpus returns a data frame with all the patterns detected in an annotated corpus.

# Runs every pattern over every document of an annotated corpus.
#
# corpus:       the original documents; only used to read each id via meta().
# taggedCorpus: the annotated counterparts, in the same order as `corpus`;
#               these are the texts actually searched.
# patterns:     character vector of regular expressions.
# Returns a data frame with one row per document: a "File" column holding
# the document id and one column per pattern (named after the pattern)
# holding the result of detectPatternOnDocument (NA when nothing matched).
detectPatternsInTaggedCorpus <- function(corpus, taggedCorpus, patterns) {
  n_docs <- length(corpus)
  # Rows are sized from `corpus`, so a different number of tagged documents
  # would either fail later or be recycled silently.
  stopifnot(length(taggedCorpus) == n_docs)

  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = n_docs))
  names(vallEntities) <- c("File", patterns)

  # With no documents there is nothing to fill in; assigning the empty
  # result of unlist() to a column would delete that column instead.
  if (n_docs == 0) {
    return(vallEntities)
  }

  # seq_along()/seq_len() skip the loop for empty input, unlike 1:length(x).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(taggedCorpus,
                                          detectPatternOnDocument,
                                          pattern = patterns[i]))
  }
  for (i in seq_len(n_docs)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}

countMatchesPerColumn

countMatchesPerColumn returns the number of matches per pattern/column.

Counts, for each pattern (column), the number of rows (files) that have a non-NA value.

# Counts, for every pattern column, how many rows hold a non-NA value.
#
# df: a matches data frame whose first column identifies the file and whose
#     remaining columns hold one pattern each.
# Returns a data frame with columns "Entity" (the pattern column name) and
# "Count" (number of non-NA rows); it has zero rows when df has no pattern
# columns.
countMatchesPerColumn <- function(df) {
  # Dropping the first column by position also works when it is the only
  # column, where 2:length(names(df)) would count backwards.
  pattern_cols <- df[-1]
  counts <- vapply(pattern_cols, function(col) sum(!is.na(col)), integer(1))
  data.frame(Entity = names(pattern_cols),
             Count = unname(counts),
             stringsAsFactors = FALSE)
}

countMatchesPerRow

countMatchesPerRow returns the number of entities per file/row.

Counts, for each file (row), the number of pattern columns that have a non-NA value; files without any match are left out of the result.

# Counts, for every file, how many pattern columns hold a non-NA value.
#
# df: a matches data frame with a "File" column first, followed by one
#     column per pattern.
# Returns a data frame with columns "File" and "Count", restricted to the
# files that have at least one match; row names keep the positions of those
# files in df.
countMatchesPerRow <- function(df) {
  # is.na() on the pattern columns gives a logical matrix, so rowSums()
  # counts per file without an index loop; this also covers data frames
  # with no rows or no pattern columns.
  counts <- as.integer(rowSums(!is.na(df[-1])))
  entityCountPerFile <- data.frame(File = df$File,
                                   Count = counts,
                                   stringsAsFactors = FALSE)
  entityCountPerFile[entityCountPerFile$Count != 0, , drop = FALSE]
}

printMatchesPerPattern

printMatchesPerPattern prints the matches found per pattern.

# Prints, pattern by pattern, the non-NA matches found in the corpus.
#
# patterns: character vector of patterns, in the same order as the pattern
#           columns of `matches`.
# matches:  a matches data frame whose first column is the file and whose
#           column i + 1 belongs to patterns[i].
# Called for its printed output; returns NULL invisibly.
printMatchesPerPattern <- function(patterns, matches) {
  # seq_along() prints nothing for an empty pattern vector, whereas
  # 1:length(patterns) would iterate over c(1, 0).
  for (i in seq_along(patterns)) {
    print(paste("PATTERN: ", patterns[i]))
    column <- matches[, i + 1]
    print(column[!is.na(unlist(column))])
    print(" ")
  }
  invisible(NULL)
}

mergeAllMatchesInLists

mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.

# Collects all the matches of each file into a single list per file.
#
# df: a matches data frame whose first column identifies the file and whose
#     remaining columns hold, per pattern, either NA or a string of matches
#     separated by commas.
# Returns a data frame with columns "Files" (first column of df) and
# "Matches" (a list column; each entry is the list of distinct, trimmed
# matches found for that file across all patterns).
mergeAllMatchesInLists <- function(df) {
  # Splits one cell into its individual matches (not yet trimmed).
  splitCell <- function(cell) {
    if (is.na(cell)) {
      return(character(0))
    }
    if (grepl(",", cell)) {
      strsplit(as.character(cell), split = ",")[[1]]
    } else {
      cell
    }
  }

  # Pattern columns are every column but the first; computing them this way
  # yields an empty set when df only has the file column.
  patternCols <- seq_len(ncol(df))[-1]

  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    found <- list()
    for (j in patternCols) {
      found <- c(found, as.list(str_trim(splitCell(df[i, j]))))
    }
    unique(found)
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  # Assigning a list with one element per row creates a list column.
  allMatches$Matches <- matchesPerFile
  allMatches
}

mergeGoldStandardInLists

mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.

# Collects the gold-standard entities of each file into one list per file.
#
# df: a gold-standard data frame whose first column identifies the file and
#     whose remaining columns hold the expected entities (NA where unused).
# Returns a data frame with columns "Files" (first column of df) and
# "Matches" (a list column; each entry is the list of non-NA entities of
# that file, named after the columns they came from).
mergeGoldStandardInLists <- function(df) {
  # seq_len() and negative indexing keep this working for data frames with
  # no rows or with only the file column, where 1:nrow(df) and 2:length(df)
  # would count backwards.
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    entities <- df[i, -1, drop = FALSE]
    as.list(unlist(Filter(Negate(is.na), entities)))
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  # Assigning a list with one element per row creates a list column.
  allMatches$Matches <- matchesPerFile
  allMatches
}

calculateMetrics

calculateMetrics calculates precision, recall and f-measure according to a gold standard.

# Computes precision, recall and F-measure against a gold standard.
#
# matches:    data frame whose second column is a list column with the
#             matches found for each file.
# matches.gs: data frame with a "Matches" list column (second column)
#             holding the expected entities, row-aligned with `matches`.
# beta:       weight of recall relative to precision in the F-measure;
#             the default of 1 gives the balanced F1 score.
# Files without gold-standard entities are ignored. Returns a one-row data
# frame with columns "Precision", "Recall" and "Fmeasure". A metric whose
# denominator is zero is reported as 0 instead of failing on 0 / 0.
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0

  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) != 0) {
      found <- str_trim(unlist(matches[i, 2]))
      expected <- unname(unlist(matches.gs[i, 2]))
      numCorrect <- numCorrect + length(intersect(found, expected))
      allAnswers <- allAnswers + length(found)
      possibleAnswers <- possibleAnswers + length(expected)
    }
  }

  # Guard the divisions: 0 / 0 is NaN, and NaN == 0 is NA, which cannot be
  # used as an if() condition.
  precision <- if (allAnswers > 0) numCorrect / allAnswers else 0
  recall <- if (possibleAnswers > 0) numCorrect / possibleAnswers else 0

  if (precision == 0 && recall == 0) {
    fmeasure <- 0
  } else {
    # Standard F-beta weights precision by beta squared; for beta = 1 this
    # equals the previous sqrt(beta) form.
    fmeasure <- ((beta^2 + 1) * precision * recall) /
      ((beta^2 * precision) + recall)
  }

  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}

Load corpus

We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.

Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.

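Note that the code below loads every review found in that folder; to work with only the first 500 reviews, subset the corpus afterwards (e.g., corpus[1:500]).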

# Read every file in the positive-review folder into a tm corpus.
source.pos <- DirSource("../Corpus/review_polarity/txt_sentoken/pos",
                        encoding = "UTF-8")
corpus <- Corpus(source.pos)

Inspect corpus

Let’s take a look at the document in the first entry.

# Show the metadata summary and the full text of the first review.
inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 4226
## 
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
## in other words , don't dismiss this film because of its source . 
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ? 
## the ghetto in question is , of course , whitechapel in 1888 london's east end . 
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision . 
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case . 
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium . 
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach . 
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay . 
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end . 
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts . 
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) . 
## don't worry - it'll all make sense when you see it . 
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) . 
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic . 
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place . 
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent . 
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham . 
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad . 
## the film , however , is all good . 
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content

Annotate corpus

We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.

This step may take long depending on the size of the corpus and on the annotations that we want to identify.

# One annotation object per document, in corpus order.
annotations <- lapply(corpus, getAnnotationsFromDocument)

The first annotations are sentence annotations. They indicate where the sentence starts and where it ends. In constituents we can access the tokens in the sentence (and check the number of tokens it has). A parse feature holding the parse tree would also appear here, but only if a parse annotator had been run, which this script does not do.

# First six annotations of the first document.
head(annotations[[1]])
##  id type     start end  features
##   1 sentence     1  265 constituents=<<integer,54>>
##   2 sentence   268  439 constituents=<<integer,36>>
##   3 sentence   442  591 constituents=<<integer,27>>
##   4 sentence   594  797 constituents=<<integer,44>>
##   5 sentence   800  939 constituents=<<integer,28>>
##   6 sentence   942 1299 constituents=<<integer,70>>

Word annotations are also defined. They indicate where the word starts, where it ends, and the part-of-speech tag.

# Last six annotations of the first document.
tail(annotations[[1]])
##  id  type start end  features
##  844 word  4189 4197 POS=NN
##  845 word  4199 4199 POS=,
##  846 word  4201 4208 POS=NN
##  847 word  4210 4212 POS=CC
##  848 word  4214 4217 POS=NN
##  849 word  4219 4225 POS=NN

We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Pair each document with its annotations. Map() returns a plain list, so
# the result is kept separately from the original corpus.
corpus.tagged <- Map(getAnnotatedPlainTextDocument, corpus, annotations)
inspect(corpus.tagged[[1]])
## <<AnnotatedPlainTextDocument>>
## Metadata:  0
## Annotations:  length: 849
## Content:  chars: 4226
## 
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
## in other words , don't dismiss this film because of its source . 
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ? 
## the ghetto in question is , of course , whitechapel in 1888 london's east end . 
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision . 
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case . 
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium . 
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach . 
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay . 
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end . 
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts . 
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) . 
## don't worry - it'll all make sense when you see it . 
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) . 
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic . 
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place . 
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent . 
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham . 
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad . 
## the film , however , is all good . 
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content

We can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Build the inline "token/TAG" text of every document. Map() returns a plain
# list, so the result is kept separately from the original corpus.
corpus.taggedText <- Map(getAnnotatedMergedDocument, corpus, annotations)
corpus.taggedText[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"

Find simple patterns

Based on the first file, we define some simple string patterns to try to identify mentions of people.

# Cue phrases that tend to precede a person's name in the reviews.
pattern0 <- c(
  "created by",
  "screenwriter[s]?",
  "cinematographer",
  "oscar winner"
)

We detect those patterns in the corpus and we can see in which files they do appear.

# Search the whole corpus, then list the files in which the second pattern
# (column 3) was found, next to what it matched.
matches0 <- detectPatternsInCorpus(corpus, pattern0)
matches0[!is.na(matches0[, 3]), c(1, 3)]
##                File                                        screenwriter[s]?
## 1   cv000_29590.txt                                           screenwriters
## 29  cv028_26746.txt                                           screenwriters
## 30  cv029_18643.txt                                            screenwriter
## 77  cv076_24945.txt                                            screenwriter
## 79  cv078_14730.txt                              screenwriter, screenwriter
## 87  cv086_18371.txt                                            screenwriter
## 95  cv094_27889.txt                                           screenwriters
## 116 cv115_25396.txt                                            screenwriter
## 122 cv121_17302.txt                                            screenwriter
## 136 cv135_11603.txt                                           screenwriters
## 143 cv142_22516.txt                                            screenwriter
## 144 cv143_19666.txt                                            screenwriter
## 159 cv158_10390.txt                                            screenwriter
## 163 cv162_10424.txt                screenwriter, screenwriter, screenwriter
## 179 cv178_12972.txt                                            screenwriter
## 191 cv190_27052.txt                                            screenwriter
## 192 cv191_29719.txt                                            screenwriter
## 209  cv208_9020.txt                                           screenwriters
## 226 cv225_29224.txt                                            screenwriter
## 236 cv235_10217.txt                                           screenwriters
## 241 cv240_14336.txt                                           screenwriters
## 242 cv241_23130.txt                                            screenwriter
## 275 cv274_25253.txt                                            screenwriter
## 319 cv318_10493.txt                                            screenwriter
## 337 cv336_10143.txt                                            screenwriter
## 360  cv359_6647.txt                                            screenwriter
## 366 cv365_11576.txt                                            screenwriter
## 371  cv370_5221.txt                                            screenwriter
## 396 cv395_10849.txt                                           screenwriters
## 405 cv404_20315.txt                              screenwriter, screenwriter
## 406 cv405_20399.txt                                            screenwriter
## 411 cv410_24266.txt                                           screenwriters
## 433 cv432_14224.txt                                            screenwriter
## 453  cv452_5088.txt                                           screenwriters
## 457 cv456_18985.txt                                           screenwriters
## 465 cv464_15650.txt                                           screenwriters
## 467 cv466_18722.txt                                            screenwriter
## 475 cv474_10209.txt                                            screenwriter
## 477 cv476_16856.txt                                            screenwriter
## 507 cv506_15956.txt                                            screenwriter
## 527 cv526_12083.txt                                            screenwriter
## 544  cv543_5045.txt                                            screenwriter
## 553 cv552_10016.txt                                            screenwriter
## 556 cv555_23922.txt                                            screenwriter
## 566 cv565_29572.txt                                            screenwriter
## 570 cv569_26381.txt                                            screenwriter
## 579 cv578_15094.txt                                            screenwriter
## 584 cv583_29692.txt                                            screenwriter
## 614 cv613_21796.txt                                            screenwriter
## 621 cv620_24265.txt                                           screenwriters
## 625 cv624_10744.txt                                           screenwriters
## 645 cv644_17154.txt                                            screenwriter
## 669 cv668_17604.txt                                           screenwriters
## 671 cv670_25826.txt                                            screenwriter
## 673 cv672_28083.txt                                           screenwriters
## 683 cv682_16139.txt                                           screenwriters
## 699 cv698_15253.txt                                           screenwriters
## 706 cv705_11059.txt                                           screenwriters
## 713 cv712_22920.txt                                           screenwriters
## 717 cv716_10514.txt                                            screenwriter
## 729 cv728_16133.txt                                            screenwriter
## 747 cv746_10147.txt                                            screenwriter
## 748 cv747_16556.txt                                            screenwriter
## 750 cv749_17765.txt                                            screenwriter
## 770  cv769_8123.txt                                            screenwriter
## 776 cv775_16237.txt                                           screenwriters
## 795 cv794_15868.txt                                            screenwriter
## 813 cv812_17924.txt                                            screenwriter
## 819 cv818_10211.txt                                            screenwriter
## 830 cv829_20289.txt screenwriter, screenwriter, screenwriters, screenwriter
## 839 cv838_24728.txt                                            screenwriter
## 870 cv869_23611.txt                                           screenwriters
## 873 cv872_12591.txt                                            screenwriter
## 879 cv878_15694.txt                                            screenwriter
## 899 cv898_14187.txt                                            screenwriter
## 923 cv922_10073.txt                                           screenwriters
## 929  cv928_9168.txt                                            screenwriter
## 940 cv939_10583.txt                                            screenwriter
## 946 cv945_12160.txt                                            screenwriter
## 959 cv958_12162.txt                                            screenwriter
## 961 cv960_29007.txt                                            screenwriter
## 975 cv974_22941.txt                                            screenwriter
## 980 cv979_18921.txt                                           screenwriters
## 981 cv980_10953.txt                                           screenwriters

We check how many patterns we have found in each file.

# Per-file summary of matches0 (File / Count table).
# NOTE(review): judging by the printed output, Count appears to be the number
# of distinct patterns found in a file rather than the number of occurrences
# -- confirm against the definition of countMatchesPerRow.
countMatchesPerRow(matches0) 
##                File Count
## 1   cv000_29590.txt     4
## 29  cv028_26746.txt     1
## 30  cv029_18643.txt     1
## 68  cv067_19774.txt     1
## 77  cv076_24945.txt     1
## 79  cv078_14730.txt     1
## 87  cv086_18371.txt     1
## 95  cv094_27889.txt     1
## 100 cv099_10534.txt     1
## 109 cv108_15571.txt     1
## 116 cv115_25396.txt     1
## 122 cv121_17302.txt     2
## 136 cv135_11603.txt     1
## 143 cv142_22516.txt     1
## 144 cv143_19666.txt     1
## 155  cv154_9328.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     1
## 163 cv162_10424.txt     1
## 179 cv178_12972.txt     1
## 191 cv190_27052.txt     1
## 192 cv191_29719.txt     1
## 206  cv205_9457.txt     1
## 209  cv208_9020.txt     1
## 216 cv215_22240.txt     1
## 221 cv220_29059.txt     1
## 226 cv225_29224.txt     1
## 236 cv235_10217.txt     1
## 241 cv240_14336.txt     1
## 242 cv241_23130.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     1
## 286 cv285_16494.txt     1
## 295 cv294_11684.txt     1
## 298 cv297_10047.txt     1
## 301 cv300_22284.txt     1
## 315 cv314_14422.txt     1
## 318 cv317_24049.txt     1
## 319 cv318_10493.txt     1
## 324 cv323_29805.txt     1
## 325  cv324_7082.txt     1
## 337 cv336_10143.txt     1
## 352 cv351_15458.txt     1
## 360  cv359_6647.txt     1
## 363 cv362_15341.txt     1
## 366 cv365_11576.txt     1
## 371  cv370_5221.txt     1
## 372  cv371_7630.txt     1
## 387 cv386_10080.txt     1
## 396 cv395_10849.txt     1
## 398 cv397_29023.txt     1
## 405 cv404_20315.txt     1
## 406 cv405_20399.txt     1
## 410 cv409_29786.txt     1
## 411 cv410_24266.txt     1
## 428 cv427_10825.txt     1
## 432  cv431_7085.txt     1
## 433 cv432_14224.txt     1
## 453  cv452_5088.txt     1
## 457 cv456_18985.txt     1
## 465 cv464_15650.txt     1
## 467 cv466_18722.txt     1
## 475 cv474_10209.txt     1
## 476 cv475_21692.txt     1
## 477 cv476_16856.txt     1
## 485 cv484_25054.txt     1
## 506 cv505_12090.txt     1
## 507 cv506_15956.txt     1
## 520 cv519_14661.txt     1
## 527 cv526_12083.txt     1
## 533  cv532_6522.txt     1
## 544  cv543_5045.txt     1
## 553 cv552_10016.txt     2
## 556 cv555_23922.txt     1
## 559 cv558_29507.txt     1
## 566 cv565_29572.txt     1
## 570 cv569_26381.txt     1
## 579 cv578_15094.txt     1
## 584 cv583_29692.txt     2
## 590 cv589_12064.txt     1
## 614 cv613_21796.txt     1
## 621 cv620_24265.txt     1
## 625 cv624_10744.txt     1
## 629 cv628_19325.txt     1
## 642 cv641_12349.txt     1
## 645 cv644_17154.txt     1
## 662  cv661_2450.txt     1
## 666 cv665_29538.txt     1
## 669 cv668_17604.txt     1
## 671 cv670_25826.txt     1
## 673 cv672_28083.txt     1
## 683 cv682_16139.txt     1
## 690 cv689_12587.txt     2
## 699 cv698_15253.txt     1
## 701 cv700_21947.txt     1
## 706 cv705_11059.txt     1
## 711 cv710_22577.txt     1
## 713 cv712_22920.txt     1
## 714 cv713_29155.txt     1
## 717 cv716_10514.txt     1
## 729 cv728_16133.txt     1
## 744 cv743_15449.txt     1
## 747 cv746_10147.txt     1
## 748 cv747_16556.txt     1
## 750 cv749_17765.txt     1
## 770  cv769_8123.txt     1
## 776 cv775_16237.txt     1
## 795 cv794_15868.txt     1
## 803 cv802_28664.txt     1
## 813 cv812_17924.txt     1
## 819 cv818_10211.txt     1
## 830 cv829_20289.txt     1
## 838 cv837_27325.txt     1
## 839 cv838_24728.txt     1
## 852 cv851_20469.txt     1
## 869 cv868_11948.txt     1
## 870 cv869_23611.txt     1
## 873 cv872_12591.txt     2
## 879 cv878_15694.txt     1
## 888  cv887_5126.txt     1
## 893 cv892_17576.txt     1
## 899 cv898_14187.txt     1
## 923 cv922_10073.txt     1
## 929  cv928_9168.txt     1
## 940 cv939_10583.txt     1
## 946 cv945_12160.txt     2
## 950 cv949_20112.txt     1
## 959 cv958_12162.txt     2
## 961 cv960_29007.txt     1
## 964  cv963_6895.txt     1
## 975 cv974_22941.txt     1
## 980 cv979_18921.txt     1
## 981 cv980_10953.txt     1

And we check in how many files each pattern has been found.

# Per-pattern summary of matches0 (Entity / Count table).
# NOTE(review): judging by the printed output, Count appears to be the number
# of files in which a pattern occurs, not the total number of occurrences
# -- confirm against the definition of countMatchesPerColumn.
countMatchesPerColumn(matches0) 
##             Entity Count
## 1       created by     9
## 2 screenwriter[s]?    84
## 3  cinematographer    42
## 4     oscar winner     8

And we print the context in which the patterns are found, to see if we can build better patterns.

# For every pattern in pattern0, print the context strings returned by
# detectPatternOnDocumentWithContext over the documents in corpus, so the
# surrounding text can be inspected when refining the patterns.
# seq_along() is used instead of 1:length() so that an empty pattern0 skips
# the loop instead of iterating over c(1, 0).
for (i in seq_along(pattern0)) {
  print(paste("PATTERN: ", pattern0[i]))
  # lapply yields one result per element of corpus.
  strings <- lapply(corpus, detectPatternOnDocumentWithContext, pattern = pattern0[i])
  # Keep only the non-NA results (presumably NA marks a document without a
  # match -- confirm against detectPatternOnDocumentWithContext).
  print(unlist(strings[!is.na(unlist(strings))]))
  print(" ")  # visual separator between patterns
}
## [1] "PATTERN:  created by"
##                                                                                                    cv000_29590.txt 
##  "ok like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought t" 
##                                                                                                     cv205_9457.txt 
##  "turvy . \nrobert zemeckis , back from the euphoria created by his last film , forrest gump , once again proves " 
##                                                                                                    cv285_16494.txt 
##  "ve got mail like dried-up mistletoe . \nthe sparks created by the earlier movie are , by necessity , not eviden" 
##                                                                                                     cv324_7082.txt 
##  "the real thing . \nthe two of them , as characters created by fingal's imagination , serve as aspects of his pe" 
##                                                                                                     cv371_7630.txt 
##  "nd always right on the mark , enhancing the moods created by the animated scenery . \nas far as the subtitles g" 
##                                                                                                    cv484_25054.txt 
##   " and there are cliches , but the walls of water , created by fluid dynamics simulating real-life phenomena , a" 
##                                                                                                    cv628_19325.txt 
##  " on the story `tarzan of the apes' and characters created by edgar rice burroughs . \nseen july 4 , 1999 at 7 :" 
##                                                                                                    cv743_15449.txt 
##  "nt of a bug's life is the quality of animation . \ncreated by pixar , the same people who brought us toy story " 
##                                                                                                    cv892_17576.txt 
## " antagonists in the movie , a group of \" agents \" created by the matrix in its computer program , all dress in" 
## [1] " "
## [1] "PATTERN:  screenwriter[s]?"
##                                                                                                          cv000_29590.txt 
##      " bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesia" 
##                                                                                                          cv028_26746.txt 
##   "d the story is so complex and \" clever \" that the screenwriters are the first to get lost in it . \nthere is is no" 
##                                                                                                          cv029_18643.txt 
##       "mise certainly is interesting and director and co-screenwriter alex proyas is able to keep the film consistently" 
##                                                                                                          cv076_24945.txt 
##    " logical confusion . \ndirector gregory hoblit and screenwriter toby emmerich structure \" frequency \" as good hol" 
##                                                                                                          cv078_14730.txt 
##       "different , it would be easy for the director and screenwriter to dumb it down and appeal to the lowest common d" 
##                                                                                                          cv086_18371.txt 
##      "itively made . \ndirector michael winterbottom and screenwriter frank cottrell boyce vividly express the societal" 
##                                                                                                          cv094_27889.txt 
##     "it prince acquit themselves admirably . \nkudos to screenwriters james schamus , wang hui ling and tsai kuo jing ," 
##                                                                                                          cv115_25396.txt 
##      "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n" 
##                                                                                                          cv121_17302.txt 
##      "i relish those rare opportunities when a talented screenwriter can make me feel like a fool . \ni spent the first" 
##                                                                                                          cv135_11603.txt 
##     "is a highly enjoyable ride . \nonce again , chan's screenwriters ( here edward tang and fibe ma ) have taken the e" 
##                                                                                                          cv142_22516.txt 
##       "e hallstrom ( what's eating gilbert grape ? ) and screenwriter/novelist john irving ( the world according to gar" 
##                                                                                                          cv143_19666.txt 
##       "sy to have written vivian as merely a flake , but screenwriter j . f . lawton clearly cared too much about his c" 
##                                                                                                          cv158_10390.txt 
##      " potentially hilarious comedy , and pitched it to screenwriter paul rudnick . \nit's true that if this same film " 
##                                                                                                          cv162_10424.txt 
##       "rise to fame , quickly become the most well-known screenwriter amongst the entertainment weekly-reading , box of" 
##                                                                                                          cv178_12972.txt 
##       "aseball bat , but i'm pretty sure andrew niccol , screenwriter for the truman show , has had the same curious th" 
##                                                                                                          cv190_27052.txt 
##      "has been sharply pieced together by tony gilroy , screenwriter of the devil's advocate and dolores claiborne . \n" 
##                                                                                                          cv191_29719.txt 
##     "\" was right after all . \nfortunately , first-time screenwriter marc klein has sketched strong , well-rounded , c" 
##                                                                                                           cv208_9020.txt 
##      "ble , but after all the star power , mega bucks , screenwriters , directors , and cool trailers , men in black is" 
##                                                                                                          cv225_29224.txt 
##      "efall the participants en route to silver city . \nscreenwriter andy breckman adds a nice touch by not having the" 
##                                                                                                          cv235_10217.txt 
##      "too much to it , but capra and the gang ( various screenwriters , composers , actors ) plumet the material to its" 
##                                                                                                          cv240_14336.txt 
##     "sh anything , it just comes natural to him . \nthe screenwriters use the right words and phrases to describe the m" 
##                                                                                                          cv241_23130.txt 
##       "lf more seriously this time ; maybe so , or maybe screenwriter ehren kruger ( arlington road ) , who took over a" 
##                                                                                                          cv274_25253.txt 
##      "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n" 
##                                                                                                          cv318_10493.txt 
##       "nd-up from veteran horror director wes craven and screenwriter kevin williamson that seemed to breathe new life " 
##                                                                                                          cv336_10143.txt 
##       "ed for scream 2 , including director wes craven , screenwriter kevin williamson , and actors neve campbell , cou" 
##                                                                                                           cv359_6647.txt 
##      "y after all . \nfavreau also doubled as the film's screenwriter , and he proves he has the gift for creating enga" 
##                                                                                                          cv365_11576.txt 
##     "ing . \nthat's not to say the movie isn't funny . \nscreenwriter tim herlihy has written for sandler before ( bill" 
##                                                                                                           cv370_5221.txt 
##      " . \nit tore at my heart to watch a gifted lesbian screenwriter explain that , as a rule , gay audiences hunger f" 
##                                                                                                          cv395_10849.txt 
##                   "jake kasdan , son of one of the best screenwriters around , breaks into filmmaking by writing and di" 
##                                                                                                          cv404_20315.txt 
##       "albert brooks plays steven phillips , a hollywood screenwriter who after winning a humanitarian award for his wo" 
##                                                                                                          cv405_20399.txt 
##      " the 1999 film outside providence ( 6 . 5/10 ) . \nscreenwriter w . peter iliff also had a part in writing the sc" 
##                                                                                                          cv410_24266.txt 
##     " . \nminkoff likes to point out scenes where other screenwriters came in and polished up the script , namely write" 
##                                                                                                          cv432_14224.txt 
##      "ments work very well -- for a comic book story . \nscreenwriter david goyer ( who also wrote the crow ) incorpora" 
##                                                                                                           cv452_5088.txt 
##     "tober is distinguished by its water-tight plot . \nscreenwriters larry ferguson and donald stewart have gracefully" 
##                                                                                                          cv456_18985.txt 
##     "d , and all are handled exceptionally well by the screenwriters . \nthere is no shred of doubt left to ponder afte" 
##                                                                                                          cv464_15650.txt 
##     "ture and charm holds right up to the last reel . \nscreenwriters john eskow , ted elliot and terry rosio have unfo" 
##                                                                                                          cv466_18722.txt 
##                  "david mamet has long been my favorite screenwriter and director . \nwith his distinctive , more often" 
##                                                                                                          cv474_10209.txt 
##     "s that \" genius is insanity with some success \" , screenwriter fierstein is taking a lazy shortcut ) , pryce mak" 
##                                                                                                          cv476_16856.txt 
##       "threat of class struggle ; for george pal and his screenwriter david duncan , who produced the film in the worst" 
##                                                                                                          cv506_15956.txt 
##       "ents or mistakes done on the part of lyne and his screenwriter , stephen schiff , but just parts of a whole new " 
##                                                                                                          cv526_12083.txt 
##      "antly british slang . \npossibly an attempt by the screenwriter to balance the british so that american audiences" 
##                                                                                                           cv543_5045.txt 
##      " , and outstanding acting . \ndirector kleiser and screenwriter elizabeth jane howard ( adapting her own highly a" 
##                                                                                                          cv552_10016.txt 
##      "int clearly a greater talent as a director than a screenwriter . \nwhile boogie nights shows great inventiveness " 
##                                                                                                          cv555_23922.txt 
##      "ious efforts good burger and varsity blues . \nand screenwriter steven brill ( the epic mighty ducks trilogy , la" 
##                                                                                                          cv565_29572.txt 
##     "ile . \" \nbased on stephen king stories adapted by screenwriter william goldman and directed by scott hicks , it'" 
##                                                                                                          cv569_26381.txt 
##       "gh a lot of this honor should be addressed to the screenwriter ______ and frears , the director , the acting is " 
##                                                                                                          cv578_15094.txt 
##       "tory gazillions of times , but debut director and screenwriter mark christopher keeps things moving with lively " 
##                                                                                                          cv583_29692.txt 
##      "im that we're more scared by what we don't see . \nscreenwriter and legendary film critic james agee does a beaut" 
##                                                                                                          cv613_21796.txt 
##      " have to lie in the conception of this film ; the screenwriter and the director . \nthree kings ranks among the b" 
##                                                                                                          cv620_24265.txt 
##     "re with american psycho there is no difference . \nscreenwriters mary harron and guinevere turner had the unenviab" 
##                                                                                                          cv624_10744.txt 
##     " another part of life generally ignored by modern screenwriters . \nwith his well worn bible in hand , sonny leaps" 
##                                                                                                          cv644_17154.txt 
##       " his character , not only by thornton but also by screenwriter scott b . smith ( adapting his own book , by the " 
##                                                                                                          cv668_17604.txt 
##      " of note is larisa oleynik who , with the help of screenwriters lutz and smith , is able to turn bianca into a pe" 
##                                                                                                          cv670_25826.txt 
##      "ars . \nthe quick script written by barry fanaro , screenwriter of kingpin , is carried by plenty of subtly dry h" 
##                                                                                                          cv672_28083.txt 
##     "ject is going up soon that will block her view . \nscreenwriters and actors jean-pierre bacri and agnes jaoui wrot" 
##                                                                                                          cv682_16139.txt 
##     "es like worm just aren't cut out for the cards . \nscreenwriters david levien and brian koppelman entertain and ed" 
##                                                                                                          cv698_15253.txt 
##      "gets rather heady at times , but it's a credit to screenwriters zwick , lawrence wright , and menno meyjes that t" 
##                                                                                                          cv705_11059.txt 
##     "and thrilling . \nbased on an original story , the screenwriters did a good job at imagining all the potential and" 
##                                                                                                          cv712_22920.txt 
##     "has opened new doors for excellent young actors , screenwriters , and directors . \nscream is also an extremely fu" 
##                                                                                                          cv716_10514.txt 
##       "ccomplished , quite credibly , but then boyle and screenwriter john hodge seem to flinch and opt for a more conv" 
##                                                                                                          cv728_16133.txt 
##                             "playwright tom stoppard and screenwriter marc norman took on an astonishingly difficult ta" 
##                                                                                                          cv746_10147.txt 
##      " -- but why ? \nlike the first scream , craven and screenwriter kevin williamson inaugurate things with a bang . " 
##                                                                                                          cv747_16556.txt 
##     "res and murders a suspect . \nthe future zwick and screenwriter lawrence wright offer is all too possible . \nwhat" 
##                                                                                                          cv749_17765.txt 
##      " \ndirector liman ( who also photographed go ) and screenwriter august celebrate the drug/rave scene in l . a . ," 
##                                                                                                           cv769_8123.txt 
##     "in another big alien film from the past summer . \nscreenwriter ed solomon , writer of super mario bros . \nand th" 
##                                                                                                          cv775_16237.txt 
##     "d be held up as an example to all those hollywood screenwriters . \nscripts as creative and endearing as this shou" 
##                                                                                                          cv794_15868.txt 
## "perado \" and 1996's \" from dusk till dawn \" ) and screenwriter kevin williamson ( 1996's \" scream \" and 1997's \"" 
##                                                                                                          cv812_17924.txt 
##      "r in extreme ways . \nin order to test this theory screenwriter john august and director doug liman assemble a ca" 
##                                                                                                          cv818_10211.txt 
##       "-director of the wildly uneven four rooms and the screenwriter of the gory vampire-fest , from dusk till dawn . " 
##                                                                                                          cv829_20289.txt 
##       "'s latest comedy , he tells a story of a troubled screenwriter ( brooks ) who's losing his edge , as his busines" 
##                                                                                                          cv838_24728.txt 
##     "estination \" is the best so far . \ntalented young screenwriter jeffrey reddick offers a fresh variation on a fam" 
##                                                                                                          cv869_23611.txt 
##      "s of the slasher genre , writer-director wong and screenwriters glen morgan and jeffrey reddick have created a pr" 
##                                                                                                          cv872_12591.txt 
##  "es of madison county \" ( richard lagravenese , co-screenwriter here also adapted \" bridges \" ) . \nalthough \" the" 
##                                                                                                          cv878_15694.txt 
##     "after 1996's \" scream , \" and written by the same screenwriter , kevin williamson , is a stylish , effective hor" 
##                                                                                                          cv898_14187.txt 
##      "esy mark steven johnson , the film's director and screenwriter . \nfortunately , simon birch isn't the real star " 
##                                                                                                          cv922_10073.txt 
##      " straight-forward , but for some odd reason , the screenwriters wanted to surprise everyone by giving us somethin" 
##                                                                                                           cv928_9168.txt 
##      " guess the outcome of the film . \nbut leave it to screenwriter david mamet to add humor and a few surprises to m" 
##                                                                                                          cv939_10583.txt 
##      "e same thing on january 1 , 1999 . \nif craven and screenwriter kevin williamson are able to keep the quality hig" 
##                                                                                                          cv945_12160.txt 
##  "cause director brad silberling ( \" casper \" ) and screenwriter dana stevens ( \" blink \" ) wanted to make \" city " 
##                                                                                                          cv958_12162.txt 
##      "osterous mystery smilla's sense of snow . \nhe and screenwriter rafael yglesias bring the sprawling tale into cle" 
##                                                                                                          cv960_29007.txt 
##      ") , who is seeing his old girlfriend . \nmike ( co-screenwriter steven gevedon ) has discovered a cache of audiot" 
##                                                                                                          cv974_22941.txt 
##      "ed teacher . \nit's like payne and taylor , his co-screenwriter , have taken tracy , a girl desperate for friends" 
##                                                                                                          cv979_18921.txt 
##     "tle bit of pity , but a whole lot of affection . \nscreenwriters sitch , santo cilauro , tom gleisner , and jane k" 
##                                                                                                          cv980_10953.txt 
##     "ten , totatly unoffensive and funny comedy . \nthe screenwriters , director ramis and danny rubin , have written a" 
## [1] " "
## [1] "PATTERN:  cinematographer"
##                                                                                                           cv000_29590.txt 
##     "zed , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures t" 
##                                                                                                           cv067_19774.txt 
##     "atmospherically shot by the silence of the lambs' cinematographer , tak fujimoto ) is actually a drama-its spooky ," 
##                                                                                                           cv099_10534.txt 
##    "nger . \nas depicted by hoblit ( primal fear ) and cinematographer newton thomas sigel , philadelphia is a dark , dr" 
##                                                                                                           cv121_17302.txt 
##     "occasionally the hyper-real approach works , with cinematographer elliot davis creating a world of fantasy romance " 
##                                                                                                            cv154_9328.txt 
##     "rom editors karen schmeer and shondra merrill and cinematographer robert richardson ( oliver stone's longtime colla" 
##                                                                                                           cv159_29505.txt 
##    "ly amazing piotr sobocinski , the oscar-nominated cinematographer behind krzysztof kieslowski's red . \nthe acting i" 
##                                                                                                           cv215_22240.txt 
##    " echoing the loneliness of the protagonists . \n ( cinematographer remi adafarasin often allows space to engulf them" 
##                                                                                                           cv220_29059.txt 
##     ", and comic-panel framed with a virtuoso grace by cinematographer amy vincent ( _death in venice , ca_ ) , while in" 
##                                                                                                           cv273_29112.txt 
##    "han he expected at the beginning . \nit helps that cinematographer matthieu poirot-delpech's crisp lensing complimen" 
##                                                                                                           cv294_11684.txt 
##   "hudsucker proxy . \" \nthe film was shot by veteran cinematographer roger deakins , who has worked with the coens on " 
##                                                                                                           cv297_10047.txt 
##    "work done by production designer nigel phelps and cinematographer darius khondji . \nas technically adept as jeunet'" 
##                                                                                                           cv323_29805.txt 
##     "sarossy who directs spent most of his career as a cinematographer and like the kingpin's lair , he has molded image" 
##                                                                                                           cv351_15458.txt 
##    "s the result of a perfect pairing of director and cinematographer . \nkapur and his cinematographer remi adefarasin " 
##                                                                                                           cv362_15341.txt 
##    "eth are shekhar kapur's visual delights . \nhe and cinematographer remi adefarasin have crafted a film with a rich c" 
##                                                                                                           cv386_10080.txt 
##     "keen eye for the stylish ; his collaboration with cinematographer slawomir idziak , production designer jan roelfs " 
##                                                                                                           cv397_29023.txt 
##    "of skilled craftsmen to work behind the camera . \ncinematographer matthew f . leonetti has had a long career of sho" 
##                                                                                                           cv409_29786.txt 
##     "( once again wielded by superb and ever-attentive cinematographer eric gautier ) is less appropriate here than in h" 
##                                                                                                           cv427_10825.txt 
##    "e telephones are old-fashioned , rotary models . \ncinematographer bill butler is given an opportunity to use unconv" 
##                                                                                                            cv431_7085.txt 
##    "is death in the early 80's . \nmore practiced as a cinematographer than a director , bava nonetheless sat in the dir" 
##                                                                                                           cv505_12090.txt 
##    "r whose love has turned into fear . \nscorsese and cinematographer michael chapman elected to shoot the bulk of ragi" 
##                                                                                                           cv519_14661.txt 
##    "oviding some interesting and colorful costumes . \ncinematographer eduardo serra , whose work was last seen in the l" 
##                                                                                                            cv532_6522.txt 
##    " in production long before that film's release . \ncinematographer peter suschitzky , who makes the most of wynn tho" 
##                                                                                                           cv552_10016.txt 
##   "t of \" there are shadows in light , baby \" to his cinematographer's complaint about poor set lighting echoes back t" 
##                                                                                                           cv558_29507.txt 
##     "nishing foresight , shackleton brought australian cinematographer and photographer frank hurley along on the journe" 
##                                                                                                           cv583_29692.txt 
##    " . \nand credit must be handed out to laughton and cinematographer stanley cortez , who create a series of haunting " 
##                                                                                                           cv589_12064.txt 
##    "ud of constant violence to the action . \naltman's cinematographer , changwei gu , gives the film a dark , soaked lo" 
##                                                                                                           cv641_12349.txt 
##   "o let his actors do most of the work - but he and cinematographer slavomir idziak ( \" gattaca \" ) successfully evok" 
##                                                                                                            cv661_2450.txt 
##     "asse hallstrom ( my life as a dog ) and legendary cinematographer sven nykvist create a magnificent visual backdrop" 
##                                                                                                           cv665_29538.txt 
##    "nd always very strange . \njunichiro hayashi , the cinematographer who recently has been doing all of kurosawa's fil" 
##                                                                                                           cv689_12587.txt 
##    "himmering fields of wheat--all are resplendent in cinematographer robert richardson's viewfinder . \nveteran english" 
##                                                                                                           cv710_22577.txt 
##     "ional stars that couldn't possibly be ignored are cinematographer frank griebe and editor mathilde bonnefroy , who " 
##                                                                                                           cv713_29155.txt 
##  "in the cyclic screenplay . \ntran anh hung and his cinematographer mark lee ping-bin ( \" flowers of shanghai \" ) lin" 
##                                                                                                           cv802_28664.txt 
##    "ected in a coen brothers venture , first class . \ncinematographer roger deakins , who has worked on five previous c" 
##                                                                                                           cv837_27325.txt 
##    "ents of each genre to be found within the film . \ncinematographer roger pratt brings an atmospheric , fairy tale lo" 
##                                                                                                           cv851_20469.txt 
##     "read of blair , a creepy closed-in feel thanks to cinematographer fred murphy and great performances by the whole c" 
##                                                                                                           cv868_11948.txt 
##    "o the senses in a way that few love stories do . \ncinematographer john seale ( the english patient ) provides some " 
##                                                                                                           cv872_12591.txt 
## "e land . \nto further emphasize this , redford and cinematographer robert richardson ( \" natural born killers , \" \" " 
##                                                                                                            cv887_5126.txt 
##   "plots , perhaps in homage to d . w . \ngriffith . \ncinematographer ernest dickerson , who has worked on all of spike" 
##                                                                                                           cv945_12160.txt 
##  "vie . \nit is beautifully filmed in lush colors by cinematographer john seale ( \" the english patient \" ) , whose sw" 
##                                                                                                           cv949_20112.txt 
##     "tin's nonsensical explanation to dave , his loyal cinematographer ( jamie kennedy ) , that every movie , in the end" 
##                                                                                                           cv958_12162.txt 
##     "a asp , costume designer gabriella pescucci , and cinematographer jorgen persson give les miserables a sumptuous pe" 
##                                                                                                            cv963_6895.txt 
##    "eit a visually interesting one . \ncampion and her cinematographer stuart dryburgh come up with a great variety of e" 
## [1] " "
## [1] "PATTERN:  oscar winner"
##                                                                                                        cv000_29590.txt 
##    "omparison to that in the black-and-white comic . \noscar winner martin childs' ( shakespeare in love ) production" 
##                                                                                                        cv108_15571.txt 
##    "sshoppers is hopper , who is fiendishly voiced by oscar winner kevin spacey . \nwhen the offering is lost hopper " 
##                                                                                                        cv300_22284.txt 
##    " he stole every scene he was in away from veteran oscar winner tom hanks . \nrockwell , an independent film veter" 
##                                                                                                        cv314_14422.txt 
##     "eresting to see that this movie was one of future oscar winner susan sarandon's ( dead man walking ) first film-" 
##                                                                                                        cv317_24049.txt 
##     " the former actually had the insight to follow up oscar winner usual suspects with a pauly shore vehicle , and j" 
##                                                                                                        cv475_21692.txt 
##    "on and ben affleck in the starring roles . \nbeing oscar winners for the classic 'good will hunting' they give gr" 
##                                                                                                        cv689_12587.txt 
## "ances with wolves \" by way of \" out of africa \" ( oscar winners both for barry ) . \nfortunately , barry's replac" 
##                                                                                                        cv700_21947.txt 
##    "the talent with her that one would expect from an oscar winner . \nrobbie coltrane was easily one of the best par" 
## [1] " "

Find entities

Now we define more complex regular expressions, with capture groups, that help identify mentions of people.

# Plain-text patterns; the capture group in each one is meant to grab the
# two words (first name, last name) that follow a cue phrase.
# Note: `[A-z]` spans every ASCII character from `A` to `z`, which also
# includes `[`, `]`, `^`, `_` and the backtick, and `*` accepts empty words,
# so these patterns are deliberately loose and produce noisy captures.
pattern1 <- c(
  "created by ([A-z]* [A-z]*)",
  "created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)",
  "screenwriter[s]? ([A-z]* [A-z]*)",
  "cinematographer(?: ,)? ([A-z]* [A-z]*)",
  "oscar winner ([A-z]* [A-z]*)"
)

We detect those patterns in the corpus and see in which files they appear. As an example, we show the files that match the screenwriter pattern (the fourth column of the result).

# Run every pattern over the corpus: the result has a File column followed
# by one column per pattern.
matches1 <- detectPatternsInCorpus(corpus, pattern1)
# Keep the rows where the 4th column (the screenwriter pattern) has a match
# and show only the file name next to that match.
matches1[!is.na(matches1[[4]]), c(1, 4)]
##                File screenwriter[s]? ([A-z]* [A-z]*)
## 1   cv000_29590.txt                      terry hayes
## 29  cv028_26746.txt                          are the
## 30  cv029_18643.txt                      alex proyas
## 77  cv076_24945.txt                    toby emmerich
## 79  cv078_14730.txt                          to dumb
## 87  cv086_18371.txt                   frank cottrell
## 95  cv094_27889.txt                    james schamus
## 116 cv115_25396.txt                karey kirkpatrick
## 122 cv121_17302.txt                         can make
## 144 cv143_19666.txt                               j 
## 159 cv158_10390.txt                     paul rudnick
## 163 cv162_10424.txt                      amongst the
## 179 cv178_12972.txt                          for the
## 191 cv190_27052.txt                           of the
## 192 cv191_29719.txt                       marc klein
## 226 cv225_29224.txt                    andy breckman
## 241 cv240_14336.txt                          use the
## 242 cv241_23130.txt                     ehren kruger
## 275 cv274_25253.txt                karey kirkpatrick
## 319 cv318_10493.txt                 kevin williamson
## 337 cv336_10143.txt                 kevin williamson
## 366 cv365_11576.txt                      tim herlihy
## 371  cv370_5221.txt                     explain that
## 396 cv395_10849.txt                          around 
## 405 cv404_20315.txt                        who after
## 406 cv405_20399.txt                               w 
## 411 cv410_24266.txt                          came in
## 433 cv432_14224.txt                      david goyer
## 453  cv452_5088.txt                   larry ferguson
## 465 cv464_15650.txt                       john eskow
## 467 cv466_18722.txt                     and director
## 475 cv474_10209.txt                     fierstein is
## 477 cv476_16856.txt                     david duncan
## 527 cv526_12083.txt                       to balance
## 544  cv543_5045.txt                   elizabeth jane
## 556 cv555_23922.txt                     steven brill
## 566 cv565_29572.txt                  william goldman
## 570 cv569_26381.txt                       ______ and
## 579 cv578_15094.txt                 mark christopher
## 584 cv583_29692.txt                    and legendary
## 614 cv613_21796.txt                          and the
## 621 cv620_24265.txt                      mary harron
## 645 cv644_17154.txt                          scott b
## 669 cv668_17604.txt                         lutz and
## 671 cv670_25826.txt                       of kingpin
## 673 cv672_28083.txt                       and actors
## 683 cv682_16139.txt                     david levien
## 699 cv698_15253.txt                           zwick 
## 706 cv705_11059.txt                            did a
## 717 cv716_10514.txt                       john hodge
## 729 cv728_16133.txt                      marc norman
## 747 cv746_10147.txt                 kevin williamson
## 748 cv747_16556.txt                  lawrence wright
## 750 cv749_17765.txt                 august celebrate
## 770  cv769_8123.txt                       ed solomon
## 795 cv794_15868.txt                 kevin williamson
## 813 cv812_17924.txt                      john august
## 819 cv818_10211.txt                           of the
## 830 cv829_20289.txt               in the, really get
## 839 cv838_24728.txt                  jeffrey reddick
## 870 cv869_23611.txt                      glen morgan
## 873 cv872_12591.txt                        here also
## 923 cv922_10073.txt                        wanted to
## 929  cv928_9168.txt                      david mamet
## 940 cv939_10583.txt                 kevin williamson
## 946 cv945_12160.txt                     dana stevens
## 959 cv958_12162.txt                  rafael yglesias
## 961 cv960_29007.txt                   steven gevedon
## 980 cv979_18921.txt                           sitch

We print the matches found per pattern.

# For each pattern, print the pattern itself followed by the entities it
# captured in the corpus.
printMatchesPerPattern(pattern1, matches1)
## [1] "PATTERN:  created by ([A-z]* [A-z]*)"
## [1] "alan moore"     "his last"       "the earlier"    "the animated"  
## [5] "fluid dynamics" "edgar rice"     "pixar "         "the matrix"    
## [1] " "
## [1] "PATTERN:  created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN:  screenwriter[s]? ([A-z]* [A-z]*)"
##  [1] "terry hayes"        "are the"            "alex proyas"       
##  [4] "toby emmerich"      "to dumb"            "frank cottrell"    
##  [7] "james schamus"      "karey kirkpatrick"  "can make"          
## [10] "j "                 "paul rudnick"       "amongst the"       
## [13] "for the"            "of the"             "marc klein"        
## [16] "andy breckman"      "use the"            "ehren kruger"      
## [19] "karey kirkpatrick"  "kevin williamson"   "kevin williamson"  
## [22] "tim herlihy"        "explain that"       "around "           
## [25] "who after"          "w "                 "came in"           
## [28] "david goyer"        "larry ferguson"     "john eskow"        
## [31] "and director"       "fierstein is"       "david duncan"      
## [34] "to balance"         "elizabeth jane"     "steven brill"      
## [37] "william goldman"    "______ and"         "mark christopher"  
## [40] "and legendary"      "and the"            "mary harron"       
## [43] "scott b"            "lutz and"           "of kingpin"        
## [46] "and actors"         "david levien"       "zwick "            
## [49] "did a"              "john hodge"         "marc norman"       
## [52] "kevin williamson"   "lawrence wright"    "august celebrate"  
## [55] "ed solomon"         "kevin williamson"   "john august"       
## [58] "of the"             "in the, really get" "jeffrey reddick"   
## [61] "glen morgan"        "here also"          "wanted to"         
## [64] "david mamet"        "kevin williamson"   "dana stevens"      
## [67] "rafael yglesias"    "steven gevedon"     "sitch "            
## [1] " "
## [1] "PATTERN:  cinematographer(?: ,)? ([A-z]* [A-z]*)"
##  [1] "peter deming"      "tak fujimoto"      "newton thomas"    
##  [4] "elliot davis"      "robert richardson" "behind krzysztof" 
##  [7] "remi adafarasin"   "amy vincent"       "matthieu poirot"  
## [10] "roger deakins"     "darius khondji"    "and like"         
## [13] "remi adefarasin"   "remi adefarasin"   "slawomir idziak"  
## [16] "matthew f"         "eric gautier"      "bill butler"      
## [19] "than a"            "michael chapman"   "eduardo serra"    
## [22] "peter suschitzky"  "and photographer"  "stanley cortez"   
## [25] "changwei gu"       "slavomir idziak"   "sven nykvist"     
## [28] "who recently"      "robert richardson" "frank griebe"     
## [31] "mark lee"          "roger deakins"     "roger pratt"      
## [34] "fred murphy"       "john seale"        "robert richardson"
## [37] "ernest dickerson"  "john seale"        "jorgen persson"   
## [40] "stuart dryburgh"  
## [1] " "
## [1] "PATTERN:  oscar winner ([A-z]* [A-z]*)"
## [1] "martin childs"  "kevin spacey"   "tom hanks"      "susan sarandon"
## [5] "usual suspects"
## [1] " "

We check how many patterns we have found in each file.

# Per-file count of matched patterns (judging by the output, files without
# any match are left out of the table).
countMatchesPerRow(matches1) 
##                File Count
## 1   cv000_29590.txt     5
## 29  cv028_26746.txt     1
## 30  cv029_18643.txt     1
## 68  cv067_19774.txt     1
## 77  cv076_24945.txt     1
## 79  cv078_14730.txt     1
## 87  cv086_18371.txt     1
## 95  cv094_27889.txt     1
## 100 cv099_10534.txt     1
## 109 cv108_15571.txt     1
## 116 cv115_25396.txt     1
## 122 cv121_17302.txt     2
## 144 cv143_19666.txt     1
## 155  cv154_9328.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     1
## 163 cv162_10424.txt     1
## 179 cv178_12972.txt     1
## 191 cv190_27052.txt     1
## 192 cv191_29719.txt     1
## 206  cv205_9457.txt     1
## 216 cv215_22240.txt     1
## 221 cv220_29059.txt     1
## 226 cv225_29224.txt     1
## 241 cv240_14336.txt     1
## 242 cv241_23130.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     1
## 286 cv285_16494.txt     1
## 295 cv294_11684.txt     1
## 298 cv297_10047.txt     1
## 301 cv300_22284.txt     1
## 315 cv314_14422.txt     1
## 318 cv317_24049.txt     1
## 319 cv318_10493.txt     1
## 324 cv323_29805.txt     1
## 337 cv336_10143.txt     1
## 352 cv351_15458.txt     1
## 363 cv362_15341.txt     1
## 366 cv365_11576.txt     1
## 371  cv370_5221.txt     1
## 372  cv371_7630.txt     1
## 387 cv386_10080.txt     1
## 396 cv395_10849.txt     1
## 398 cv397_29023.txt     1
## 405 cv404_20315.txt     1
## 406 cv405_20399.txt     1
## 410 cv409_29786.txt     1
## 411 cv410_24266.txt     1
## 428 cv427_10825.txt     1
## 432  cv431_7085.txt     1
## 433 cv432_14224.txt     1
## 453  cv452_5088.txt     1
## 465 cv464_15650.txt     1
## 467 cv466_18722.txt     1
## 475 cv474_10209.txt     1
## 477 cv476_16856.txt     1
## 485 cv484_25054.txt     1
## 506 cv505_12090.txt     1
## 520 cv519_14661.txt     1
## 527 cv526_12083.txt     1
## 533  cv532_6522.txt     1
## 544  cv543_5045.txt     1
## 556 cv555_23922.txt     1
## 559 cv558_29507.txt     1
## 566 cv565_29572.txt     1
## 570 cv569_26381.txt     1
## 579 cv578_15094.txt     1
## 584 cv583_29692.txt     2
## 590 cv589_12064.txt     1
## 614 cv613_21796.txt     1
## 621 cv620_24265.txt     1
## 629 cv628_19325.txt     1
## 642 cv641_12349.txt     1
## 645 cv644_17154.txt     1
## 662  cv661_2450.txt     1
## 666 cv665_29538.txt     1
## 669 cv668_17604.txt     1
## 671 cv670_25826.txt     1
## 673 cv672_28083.txt     1
## 683 cv682_16139.txt     1
## 690 cv689_12587.txt     1
## 699 cv698_15253.txt     1
## 706 cv705_11059.txt     1
## 711 cv710_22577.txt     1
## 714 cv713_29155.txt     1
## 717 cv716_10514.txt     1
## 729 cv728_16133.txt     1
## 744 cv743_15449.txt     1
## 747 cv746_10147.txt     1
## 748 cv747_16556.txt     1
## 750 cv749_17765.txt     1
## 770  cv769_8123.txt     1
## 795 cv794_15868.txt     1
## 803 cv802_28664.txt     1
## 813 cv812_17924.txt     1
## 819 cv818_10211.txt     1
## 830 cv829_20289.txt     1
## 838 cv837_27325.txt     1
## 839 cv838_24728.txt     1
## 852 cv851_20469.txt     1
## 869 cv868_11948.txt     1
## 870 cv869_23611.txt     1
## 873 cv872_12591.txt     2
## 888  cv887_5126.txt     1
## 893 cv892_17576.txt     1
## 923 cv922_10073.txt     1
## 929  cv928_9168.txt     1
## 940 cv939_10583.txt     1
## 946 cv945_12160.txt     2
## 959 cv958_12162.txt     2
## 961 cv960_29007.txt     1
## 964  cv963_6895.txt     1
## 980 cv979_18921.txt     1

And we check how many times each pattern has been found.

# Per-pattern count of the matches found in the corpus.
countMatchesPerColumn(matches1) 
##                                             Entity Count
## 1                       created by ([A-z]* [A-z]*)     8
## 2 created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)     1
## 3                 screenwriter[s]? ([A-z]* [A-z]*)    69
## 4           cinematographer(?: ,)? ([A-z]* [A-z]*)    40
## 5                     oscar winner ([A-z]* [A-z]*)     5

Find entities using part-of-speech (POS) tags

Now we include part-of-speech information in our regular expressions to avoid incorrect matches.

# Same cue phrases as before, but written against the tagged text, where
# every token has the form word/TAG. Constraining the tags of the captured
# words filters out most of the spurious captures of the untagged patterns.
pattern2 <- c(
  "created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN",
  "created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN",
  "screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)",
  "cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN",
  "cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN",
  "oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS"
)

We detect those patterns in the POS-tagged corpus.

# Match the POS-aware patterns against the tagged version of the corpus.
allEntities <- detectPatternsInTaggedCorpus(corpus, corpus.taggedText, pattern2)
# Show file name and entity for the rows where the 4th column (the
# screenwriter pattern) found something.
allEntities[!is.na(allEntities[[4]]), c(1, 4)]
##                File
## 1   cv000_29590.txt
## 30  cv029_18643.txt
## 77  cv076_24945.txt
## 87  cv086_18371.txt
## 95  cv094_27889.txt
## 116 cv115_25396.txt
## 192 cv191_29719.txt
## 226 cv225_29224.txt
## 242 cv241_23130.txt
## 275 cv274_25253.txt
## 433 cv432_14224.txt
## 453  cv452_5088.txt
## 465 cv464_15650.txt
## 477 cv476_16856.txt
## 544  cv543_5045.txt
## 579 cv578_15094.txt
## 621 cv620_24265.txt
## 645 cv644_17154.txt
## 717 cv716_10514.txt
## 729 cv728_16133.txt
## 748 cv747_16556.txt
## 813 cv812_17924.txt
## 839 cv838_24728.txt
## 870 cv869_23611.txt
## 946 cv945_12160.txt
## 959 cv958_12162.txt
## 961 cv960_29007.txt
##     screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 1                                                         terry hayes
## 30                                                        alex proyas
## 77                                                      toby emmerich
## 87                                                     frank cottrell
## 95                                                      james schamus
## 116                                                 karey kirkpatrick
## 192                                                        marc klein
## 226                                                     andy breckman
## 242                                                      ehren kruger
## 275                                                 karey kirkpatrick
## 433                                                       david goyer
## 453                                                    larry ferguson
## 465                                                        john eskow
## 477                                                      david duncan
## 544                                                    elizabeth jane
## 579                                                  mark christopher
## 621                                                       mary harron
## 645                                                           scott b
## 717                                                        john hodge
## 729                                                       marc norman
## 748                                                   lawrence wright
## 813                                                       john august
## 839                                                   jeffrey reddick
## 870                                                       glen morgan
## 946                                                      dana stevens
## 959                                                   rafael yglesias
## 961                                                    steven gevedon

We can also view the entities for a certain pattern.

# Entities captured by the pattern stored in the 4th column, dropping the
# files where it did not match (NA).
Filter(function(entity) !is.na(entity), allEntities[[4]])
##  [1] "terry hayes"       "alex proyas"       "toby emmerich"    
##  [4] "frank cottrell"    "james schamus"     "karey kirkpatrick"
##  [7] "marc klein"        "andy breckman"     "ehren kruger"     
## [10] "karey kirkpatrick" "david goyer"       "larry ferguson"   
## [13] "john eskow"        "david duncan"      "elizabeth jane"   
## [16] "mark christopher"  "mary harron"       "scott b"          
## [19] "john hodge"        "marc norman"       "lawrence wright"  
## [22] "john august"       "jeffrey reddick"   "glen morgan"      
## [25] "dana stevens"      "rafael yglesias"   "steven gevedon"
# For each POS-aware pattern, print the pattern followed by the entities it
# captured.
printMatchesPerPattern(pattern2, allEntities)
## [1] "PATTERN:  created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN"
## [1] "alan moore"     "fluid dynamics" "edgar rice"    
## [1] " "
## [1] "PATTERN:  created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN:  screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)"
##  [1] "terry hayes"       "alex proyas"       "toby emmerich"    
##  [4] "frank cottrell"    "james schamus"     "karey kirkpatrick"
##  [7] "marc klein"        "andy breckman"     "ehren kruger"     
## [10] "karey kirkpatrick" "david goyer"       "larry ferguson"   
## [13] "john eskow"        "david duncan"      "elizabeth jane"   
## [16] "mark christopher"  "mary harron"       "scott b"          
## [19] "john hodge"        "marc norman"       "lawrence wright"  
## [22] "john august"       "jeffrey reddick"   "glen morgan"      
## [25] "dana stevens"      "rafael yglesias"   "steven gevedon"   
## [1] " "
## [1] "PATTERN:  cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN"
##  [1] "peter deming"      "tak fujimoto"      "elliot davis"     
##  [4] "robert richardson" "remi adafarasin"   "roger deakins"    
##  [7] "remi adefarasin"   "slawomir idziak"   "matthew f"        
## [10] "bill butler"       "michael chapman"   "eduardo serra"    
## [13] "changwei gu"       "robert richardson" "mark lee"         
## [16] "roger deakins"     "roger pratt"       "john seale"       
## [19] "robert richardson" "john seale"        "jorgen persson"   
## [22] "stuart dryburgh"  
## [1] " "
## [1] "PATTERN:  cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN"
## [1] "newton thomas sigel"
## [1] " "
## [1] "PATTERN:  oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS"
## [1] "martin childs"
## [1] " "

We count all the entities per pattern.

And we can also draw a histogram of the counts.

# Number of entities found by each pattern; keep it for the histogram and
# display it.
entityCountPerPattern <- countMatchesPerColumn(allEntities)
entityCountPerPattern
##                                                                           Entity
## 1                                      created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN
## 2 created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN
## 3              screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 4                            cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN
## 5                cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN
## 6                                   oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS
##   Count
## 1     3
## 2     1
## 3    27
## 4    22
## 5     1
## 6     1
# Distribution of the number of entities found per pattern.
hist(entityCountPerPattern$Count)

We count all the entities per file.

And we can also draw a histogram of the counts.

# Number of entities found in each file; keep it for the histogram and
# display it.
entityCountPerFile <- countMatchesPerRow(allEntities)
entityCountPerFile
##                File Count
## 1   cv000_29590.txt     5
## 30  cv029_18643.txt     1
## 68  cv067_19774.txt     1
## 77  cv076_24945.txt     1
## 87  cv086_18371.txt     1
## 95  cv094_27889.txt     1
## 100 cv099_10534.txt     1
## 116 cv115_25396.txt     1
## 122 cv121_17302.txt     1
## 155  cv154_9328.txt     1
## 192 cv191_29719.txt     1
## 216 cv215_22240.txt     1
## 226 cv225_29224.txt     1
## 242 cv241_23130.txt     1
## 275 cv274_25253.txt     1
## 295 cv294_11684.txt     1
## 352 cv351_15458.txt     1
## 387 cv386_10080.txt     1
## 398 cv397_29023.txt     1
## 428 cv427_10825.txt     1
## 433 cv432_14224.txt     1
## 453  cv452_5088.txt     1
## 465 cv464_15650.txt     1
## 477 cv476_16856.txt     1
## 485 cv484_25054.txt     1
## 506 cv505_12090.txt     1
## 520 cv519_14661.txt     1
## 544  cv543_5045.txt     1
## 579 cv578_15094.txt     1
## 590 cv589_12064.txt     1
## 621 cv620_24265.txt     1
## 629 cv628_19325.txt     1
## 645 cv644_17154.txt     1
## 690 cv689_12587.txt     1
## 714 cv713_29155.txt     1
## 717 cv716_10514.txt     1
## 729 cv728_16133.txt     1
## 748 cv747_16556.txt     1
## 803 cv802_28664.txt     1
## 813 cv812_17924.txt     1
## 838 cv837_27325.txt     1
## 839 cv838_24728.txt     1
## 869 cv868_11948.txt     1
## 870 cv869_23611.txt     1
## 873 cv872_12591.txt     1
## 946 cv945_12160.txt     2
## 959 cv958_12162.txt     2
## 961 cv960_29007.txt     1
## 964  cv963_6895.txt     1
# Distribution of the number of entities found per file.
hist(entityCountPerFile$Count)

Write results to a file

We can write our results to a CSV file, so we can use them in other places.

# Export the entity table as a semicolon-separated file: no row names, and
# missing values (files where a pattern did not match) written as empty cells.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently switch row names back on.
write.table(allEntities, file = "allEntities.csv", row.names = FALSE,
            na = "", sep = ";")

Compare with a gold standard

Put all matches in a list for comparison with a gold standard.

# Collapse the per-pattern columns into one list of matches per file, then
# peek at the first rows (6 is the default for head()).
allMatches <- mergeAllMatchesInLists(allEntities)
head(allMatches, n = 6L)
##             Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
##                                                                Matches
## 1 alan moore, eddie campbell, terry hayes, peter deming, martin childs
## 2                                                                 NULL
## 3                                                                 NULL
## 4                                                                 NULL
## 5                                                                 NULL
## 6                                                                 NULL

Load the gold standard and put all gold standard matches in a list for comparison.

# Read the gold standard as text only: semicolon-separated, quoting disabled,
# and empty cells treated as missing values.
goldStandard <- read.table(
  file = "goldStandard.csv",
  sep = ";",
  quote = "",
  na.strings = "",
  colClasses = "character"
)
# Put the gold-standard entities of each file in a single list and show the
# first rows.
allMatchesGold <- mergeGoldStandardInLists(goldStandard)
head(allMatchesGold)
##             Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
##                                                                                                                                                                                                                                                                                                                                                     Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2                                                                                                                                                 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3                                                                                                                                                                                                                                                                                   ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4                                                                          john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5                                                                                                                                                                                                                    herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6                                                                                                                                                                                             raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi

Calculate the metrics (precision, recall, and F-measure).

# Compare the extracted matches against the gold standard matches with
# calculateMetrics, then display the result.
metrics <- calculateMetrics(allMatches, allMatchesGold)
metrics
##   Precision      Recall    Fmeasure
## 1 0.8181818 0.003370029 0.006712411