Intelligent Systems. Hands-on 3

Álvaro Arranz Domínguez (a.arranzd@alumnos.upm.es)

Introduction

The goal of this document is to show a sample script for pattern-based entity recognition over text documents using the openNLP (natural language processing) and the tm (text mining) packages in R.

I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks to everyone for sharing.

Preparation

Check working directory

Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.

# Show the current working directory (the "##" line is the recorded output).
getwd()
## [1] "/Users/alvaro.arranz/Universidad/Intelligent Systems/HandsOn-3"
# NOTE(review): machine-specific path -- point it at the folder holding the data.
setwd("~/Universidad/Intelligent Systems/HandsOn-3")

Load libraries

Now we load the required libraries. Only a couple of things to mention:

  • Calling the annotate function (used with the openNLP annotators) may require explicitly including the package name (i.e., `NLP::annotate`) due to a name clash with ggplot2
  • Need to change the memory allocated to Java to avoid out-of-memory problems
# Needed for OutOfMemoryError: Java heap space 
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging

library(NLP) 
library(openNLP) 
library(openNLPmodels.en)
library(tm)
library(stringr)

Auxiliary functions

getAnnotationsFromDocument

getAnnotationsFromDocument returns annotations for the text document: word, sentence, part-of-speech, and Penn Treebank parse annotations.

As an alternative, the koRpus package uses TreeTagger for POS tagging.

# getAnnotationsFromDocument: annotate one document with sentence, word,
# part-of-speech and parse annotations.
#
# doc: a text document (anything as.String() can convert).
# Returns the annotation object produced by the final annotate() call, i.e. the
# sentence/word/POS annotations passed through the parse annotator.
getAnnotationsFromDocument <- function(doc) {
  x <- as.String(doc)
  # annotate() is called as NLP::annotate so the call keeps resolving to the
  # NLP implementation even when another attached package (e.g. ggplot2)
  # masks the name.
  tokens <- NLP::annotate(x, list(Maxent_Sent_Token_Annotator(),
                                  Maxent_Word_Token_Annotator()))
  tagged <- NLP::annotate(x, Maxent_POS_Tag_Annotator(), tokens)
  NLP::annotate(x, Parse_Annotator(), tagged)
}

getAnnotatedMergedDocument

getAnnotatedMergedDocument returns the text document merged with the annotations.

# getAnnotatedMergedDocument: render a document as "token/TAG" pairs joined by
# single spaces.
#
# doc:         text document (converted with as.String()).
# annotations: annotation object for doc; only its "word" entries are used and
#              the "POS" feature of each one is read.
# Returns a single character string.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  posTags <- sapply(words$features, `[[`, "POS")
  tokenWithTag <- sprintf("%s/%s", text[words], posTags)
  paste(tokenWithTag, collapse = " ")
}

getAnnotatedPlainTextDocument

getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.

# getAnnotatedPlainTextDocument: bundle a document's text (via as.String())
# and its annotations into one AnnotatedPlainTextDocument.
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

detectPatternOnDocument

detectPatternOnDocument returns the pattern detected on an AnnotatedPlainTextDocument.

# detectPatternOnDocument: find every occurrence of a regular expression in a
# document and return them as one comma-separated string.
#
# doc:     text document (converted with as.String()).
# pattern: regular expression passed to str_match_all().
#
# Returns:
#   * NA_character_ when nothing is found;
#   * the full matches joined by ", " when the pattern has no capture groups;
#   * otherwise, for each match, its capture groups joined by " ", and those
#     per-match strings joined by ", ".
# Capture groups that did not participate in a match (reported as NA by
# str_match_all) are left out instead of being pasted as the text "NA"; a
# match whose groups are all NA contributes nothing.
detectPatternOnDocument <- function(doc, pattern) {
  x <- as.String(doc)
  found <- str_match_all(x, pattern)[[1]]

  # No rows means no matches.
  if (nrow(found) == 0) {
    return(NA_character_)
  }

  if (ncol(found) == 1) {
    # No capture groups: report the whole match.
    hits <- found[, 1]
  } else {
    groups <- found[, -1, drop = FALSE]
    hasGroup <- rowSums(!is.na(groups)) > 0
    hits <- apply(groups, 1, function(g) paste(g[!is.na(g)], collapse = " "))
    hits <- hits[hasGroup]
    if (length(hits) == 0) {
      return(NA_character_)
    }
  }

  # All results for the document, separated by ', '.
  paste(hits, collapse = ", ")
}

detectPatternOnDocumentWithContext

detectPatternOnDocumentWithContext returns the pattern detected on an AnnotatedPlainTextDocument with some context.

# detectPatternOnDocumentWithContext: return the first occurrence of a pattern
# together with the text around it.
#
# doc:     text document (converted with as.String()).
# pattern: regular expression passed to str_locate(), which reports only the
#          first match.
# number:  how many characters of context to keep on each side of the match
#          (default 50, the value that used to be hard-coded).
#
# Returns the substring from `number` characters before the match start to
# `number` characters after the match end; NA when the pattern is not found
# (str_locate() then yields NA positions).
detectPatternOnDocumentWithContext <- function(doc, pattern, number = 50) {
  txt <- as.String(doc)
  coord <- str_locate(txt, pattern)
  substr(txt, coord[1] - number, coord[2] + number)
}

detectPatternsInCorpus

detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.

# detectPatternsInCorpus: apply every pattern to every document of a corpus.
#
# corpus:   corpus whose documents are searched; each document's `id`
#           metadata fills the File column.
# patterns: character vector of regular expressions.
#
# Returns a data frame with one row per document: column "File" plus one
# column per pattern (named after the pattern) holding the result of
# detectPatternOnDocument() for that document.
detectPatternsInCorpus <- function(corpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)

  # seq_along()/seq_len() iterate zero times on empty input, unlike 1:length().
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(corpus, detectPatternOnDocument,
                                           pattern = patterns[i]))
  }
  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}

detectPatternsInTaggedCorpus

detectPatternsInTaggedCorpus returns a data frame with all the patterns detected in an annotated corpus.

# detectPatternsInTaggedCorpus: apply every pattern to every document of an
# annotated corpus.
#
# corpus:       original corpus; only used for its length and for each
#               document's `id` metadata (File column).
# taggedCorpus: the annotated documents that are actually searched.
# patterns:     character vector of regular expressions.
#
# Returns a data frame with one row per document: column "File" plus one
# column per pattern (named after the pattern) holding the result of
# detectPatternOnDocument() on the tagged document.
detectPatternsInTaggedCorpus <- function(corpus, taggedCorpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)

  # seq_along()/seq_len() iterate zero times on empty input, unlike 1:length().
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(taggedCorpus, detectPatternOnDocument,
                                           pattern = patterns[i]))
  }
  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}

countMatchesPerColumn

countMatchesPerColumn returns the number of matches per pattern/column.

Counts the number of rows with non-NA values for each pattern.

# countMatchesPerColumn: number of documents in which each pattern matched.
#
# df: data frame whose first column identifies the file and whose remaining
#     columns hold the matches of one pattern each (NA = no match).
#
# Returns a data frame with columns Entity (the pattern column name) and
# Count (number of non-NA rows in that column). A data frame with no pattern
# columns yields an empty result instead of an error.
countMatchesPerColumn <- function(df) {
  patternCols <- df[-1]
  data.frame(
    Entity = names(patternCols),
    Count = vapply(patternCols, function(col) sum(!is.na(col)), integer(1),
                   USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
}

countMatchesPerRow

countMatchesPerRow returns the number of entities per file/row.

Counts the number of columns with non-NA values for each file.

# countMatchesPerRow: number of patterns that matched in each file.
#
# df: data frame with a File column first, followed by one column per pattern
#     (NA = no match).
#
# Returns a data frame with columns File and Count (number of non-NA pattern
# columns in that row), keeping only the files with at least one match. Row
# names are the positions of those files in df. Empty input and input without
# pattern columns yield an empty result instead of an error.
countMatchesPerRow <- function(df) {
  # One vectorized pass over all pattern columns replaces the per-row loop.
  counts <- as.integer(rowSums(!is.na(df[-1])))
  entityCountPerFile <- data.frame(File = df$File, Count = counts,
                                   stringsAsFactors = FALSE)
  entityCountPerFile[entityCountPerFile$Count != 0, ]
}

printMatchesPerPattern

printMatchesPerPattern prints the matches found per pattern.

# printMatchesPerPattern: print, for each pattern, the non-NA matches found.
#
# patterns: character vector of patterns, in the same order as the pattern
#           columns of `matches`.
# matches:  data frame whose column i + 1 holds the matches of patterns[i].
#
# Prints a "PATTERN:" header, the non-NA entries of the column and a blank
# separator per pattern. Nothing is printed for an empty pattern vector.
printMatchesPerPattern <- function(patterns, matches) {
  for (i in seq_along(patterns)) {
    print(paste("PATTERN: ", patterns[i]))
    column <- matches[, i + 1]
    print(column[!is.na(unlist(column))])
    print(" ")
  }
}

mergeAllMatchesInLists

mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.

# mergeAllMatchesInLists: collect, for every file, all matches of all patterns
# into a single de-duplicated list.
#
# df: data frame whose first column identifies the file and whose remaining
#     columns hold comma-separated matches per pattern (NA = no match).
#
# Returns a data frame with columns Files (first column of df) and Matches (a
# list column; each entry is a list of the distinct matches of that file, in
# order of first appearance).
#
# Cells are split on "," and each piece is trimmed with base trimws(), which
# strips spaces, tabs and line breaks.
mergeAllMatchesInLists <- function(df) {
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    cells <- as.character(unlist(df[i, -1, drop = FALSE], use.names = FALSE))
    cells <- cells[!is.na(cells)]
    pieces <- trimws(unlist(strsplit(cells, split = ",", fixed = TRUE)))
    as.list(unique(pieces))
  })

  allMatches <- data.frame(Files = df[, 1], stringsAsFactors = FALSE)
  allMatches$Matches <- matchesPerFile
  allMatches
}

mergeGoldStandardInLists

mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.

# mergeGoldStandardInLists: collect, for every file, its gold-standard entries
# into a single list.
#
# df: data frame whose first column identifies the file and whose remaining
#     columns hold one gold-standard entry each (NA = unused slot).
#
# Returns a data frame with columns Files (first column of df) and Matches (a
# list column; each entry is the list of non-NA values of that row, named
# after the columns they came from).
mergeGoldStandardInLists <- function(df) {
  goldCols <- df[-1]
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    # drop = FALSE keeps a one-row data frame even with a single gold column.
    rowValues <- goldCols[i, , drop = FALSE]
    as.list(unlist(Filter(Negate(is.na), rowValues)))
  })

  allMatches <- data.frame(Files = df[, 1], stringsAsFactors = FALSE)
  allMatches$Matches <- matchesPerFile
  allMatches
}

calculateMetrics

calculateMetrics calculates precision, recall and f-measure according to a gold standard.

# calculateMetrics: precision, recall and F-measure of the detected matches
# against a gold standard.
#
# matches:    data frame whose second column is a list column with the
#             detected matches of each file.
# matches.gs: data frame with a Matches list column (second column) holding
#             the gold-standard entries of the same files, row by row.
# beta:       weight of recall relative to precision in the F-measure
#             (default 1, the balanced F1).
#
# Only files with at least one gold-standard entry are evaluated.
# Returns a one-row data frame with columns Precision, Recall and Fmeasure.
# A metric whose denominator is zero (no answers given, or no gold-standard
# entries at all) is reported as 0 rather than NaN.
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0

  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) != 0) {
      found <- trimws(unlist(matches[i, 2]))
      expected <- unname(unlist(matches.gs[i, 2]))
      numCorrect <- numCorrect + length(intersect(found, expected))
      allAnswers <- allAnswers + length(found)
      possibleAnswers <- possibleAnswers + length(expected)
    }
  }

  precision <- if (allAnswers > 0) numCorrect / allAnswers else 0
  recall <- if (possibleAnswers > 0) numCorrect / possibleAnswers else 0

  # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); defined as 0 when both
  # precision and recall are 0.
  if (precision == 0 && recall == 0) {
    fmeasure <- 0
  } else {
    fmeasure <- ((1 + beta^2) * precision * recall) /
      ((beta^2 * precision) + recall)
  }

  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}

Load corpus

We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.

Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.

We are only going to load the first 500 reviews.

# Directory source over the positive reviews (files read as UTF-8), then the
# corpus built from it.
source.pos = DirSource("./Corpus/review_polarity_small/txt_sentoken/pos", encoding = "UTF-8")
corpus = Corpus(source.pos)

Inspect corpus

Let’s take a look at the document in the first entry.

# Show the first document of the corpus.
inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 4226
## 
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
## in other words , don't dismiss this film because of its source . 
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ? 
## the ghetto in question is , of course , whitechapel in 1888 london's east end . 
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision . 
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case . 
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium . 
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach . 
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay . 
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end . 
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts . 
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) . 
## don't worry - it'll all make sense when you see it . 
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) . 
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic . 
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place . 
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent . 
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham . 
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad . 
## the film , however , is all good . 
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content

Annotate corpus

We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.

This step may take long depending on the size of the corpus and on the annotations that we want to identify.

# One annotation object per document, in corpus order.
annotations = lapply(corpus, getAnnotationsFromDocument)

The first annotations are sentence annotations. They indicate where the sentence starts and where it ends. In constituents we can access the tokens in the sentence (and check the number of tokens it has). In parse we can access the parse tree.

# First annotations of the first document.
head(annotations[[1]])
##  id type     start end  features
##   1 sentence     1  265 constituents=<<integer,54>>,
##                         parse=<<character,1>>
##   2 sentence   268  439 constituents=<<integer,36>>,
##                         parse=<<character,1>>
##   3 sentence   442  591 constituents=<<integer,27>>,
##                         parse=<<character,1>>
##   4 sentence   594  797 constituents=<<integer,44>>,
##                         parse=<<character,1>>
##   5 sentence   800  939 constituents=<<integer,28>>,
##                         parse=<<character,1>>
##   6 sentence   942 1299 constituents=<<integer,70>>,
##                         parse=<<character,1>>

Word annotations also are defined. They indicate where the word starts, where it ends, and the part-of-speech tag.

# Last annotations of the first document.
tail(annotations[[1]])
##  id  type start end  features
##  844 word  4189 4197 POS=NN
##  845 word  4199 4199 POS=,
##  846 word  4201 4208 POS=NN
##  847 word  4210 4212 POS=CC
##  848 word  4214 4217 POS=NN
##  849 word  4219 4225 POS=NN

We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Pair each document with its annotations; Map() returns a plain list, so the
# result is kept separate from `corpus`.
corpus.tagged = Map(getAnnotatedPlainTextDocument, corpus, annotations)
# Show the first annotated document.
corpus.tagged[[1]]
## <<AnnotatedPlainTextDocument>>
## Metadata:  0
## Annotations:  length: 849
## Content:  chars: 4226

We can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Build the inline "token/TAG" text of every document (a plain list of strings).
corpus.taggedText = Map(getAnnotatedMergedDocument, corpus, annotations)
# Show the tagged text of the first document.
corpus.taggedText[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"

Find simple patterns

Based on the first file, we define some simple string patterns to try to identify people appearances.

# Seed patterns (plain strings / simple regular expressions).
pattern0 <- c(
  "created by",
  "screenwriter[s]?",
  "cinematographer",
  "oscar winner"
)

We detect those patterns in the corpus and we can see in which files they do appear.

# Search every pattern in every document of the corpus.
matches0 = detectPatternsInCorpus(corpus, pattern0)
# Rows where the third column (second pattern) is not NA; show File and that column.
matches0[!is.na(matches0[3]),c(1,3)]
##                File                         screenwriter[s]?
## 1   cv000_29590.txt                            screenwriters
## 29  cv028_26746.txt                            screenwriters
## 30  cv029_18643.txt                             screenwriter
## 77  cv076_24945.txt                             screenwriter
## 79  cv078_14730.txt               screenwriter, screenwriter
## 87  cv086_18371.txt                             screenwriter
## 95  cv094_27889.txt                            screenwriters
## 116 cv115_25396.txt                             screenwriter
## 122 cv121_17302.txt                             screenwriter
## 136 cv135_11603.txt                            screenwriters
## 143 cv142_22516.txt                             screenwriter
## 144 cv143_19666.txt                             screenwriter
## 159 cv158_10390.txt                             screenwriter
## 163 cv162_10424.txt screenwriter, screenwriter, screenwriter
## 179 cv178_12972.txt                             screenwriter
## 191 cv190_27052.txt                             screenwriter
## 192 cv191_29719.txt                             screenwriter
## 209  cv208_9020.txt                            screenwriters
## 226 cv225_29224.txt                             screenwriter
## 236 cv235_10217.txt                            screenwriters
## 241 cv240_14336.txt                            screenwriters
## 242 cv241_23130.txt                             screenwriter
## 275 cv274_25253.txt                             screenwriter
## 319 cv318_10493.txt                             screenwriter
## 337 cv336_10143.txt                             screenwriter
## 360  cv359_6647.txt                             screenwriter
## 366 cv365_11576.txt                             screenwriter
## 371  cv370_5221.txt                             screenwriter
## 396 cv395_10849.txt                            screenwriters
## 405 cv404_20315.txt               screenwriter, screenwriter
## 406 cv405_20399.txt                             screenwriter
## 411 cv410_24266.txt                            screenwriters
## 433 cv432_14224.txt                             screenwriter
## 453  cv452_5088.txt                            screenwriters
## 457 cv456_18985.txt                            screenwriters
## 465 cv464_15650.txt                            screenwriters
## 467 cv466_18722.txt                             screenwriter
## 475 cv474_10209.txt                             screenwriter
## 477 cv476_16856.txt                             screenwriter

We check how many patterns we have found in each file.

# Number of patterns found per file (files without matches are omitted).
countMatchesPerRow(matches0) 
##                File Count
## 1   cv000_29590.txt     4
## 29  cv028_26746.txt     1
## 30  cv029_18643.txt     1
## 68  cv067_19774.txt     1
## 77  cv076_24945.txt     1
## 79  cv078_14730.txt     1
## 87  cv086_18371.txt     1
## 95  cv094_27889.txt     1
## 100 cv099_10534.txt     1
## 109 cv108_15571.txt     1
## 116 cv115_25396.txt     1
## 122 cv121_17302.txt     2
## 136 cv135_11603.txt     1
## 143 cv142_22516.txt     1
## 144 cv143_19666.txt     1
## 155  cv154_9328.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     1
## 163 cv162_10424.txt     1
## 179 cv178_12972.txt     1
## 191 cv190_27052.txt     1
## 192 cv191_29719.txt     1
## 206  cv205_9457.txt     1
## 209  cv208_9020.txt     1
## 216 cv215_22240.txt     1
## 221 cv220_29059.txt     1
## 226 cv225_29224.txt     1
## 236 cv235_10217.txt     1
## 241 cv240_14336.txt     1
## 242 cv241_23130.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     1
## 286 cv285_16494.txt     1
## 295 cv294_11684.txt     1
## 298 cv297_10047.txt     1
## 301 cv300_22284.txt     1
## 315 cv314_14422.txt     1
## 318 cv317_24049.txt     1
## 319 cv318_10493.txt     1
## 324 cv323_29805.txt     1
## 325  cv324_7082.txt     1
## 337 cv336_10143.txt     1
## 352 cv351_15458.txt     1
## 360  cv359_6647.txt     1
## 363 cv362_15341.txt     1
## 366 cv365_11576.txt     1
## 371  cv370_5221.txt     1
## 372  cv371_7630.txt     1
## 387 cv386_10080.txt     1
## 396 cv395_10849.txt     1
## 398 cv397_29023.txt     1
## 405 cv404_20315.txt     1
## 406 cv405_20399.txt     1
## 410 cv409_29786.txt     1
## 411 cv410_24266.txt     1
## 428 cv427_10825.txt     1
## 432  cv431_7085.txt     1
## 433 cv432_14224.txt     1
## 453  cv452_5088.txt     1
## 457 cv456_18985.txt     1
## 465 cv464_15650.txt     1
## 467 cv466_18722.txt     1
## 475 cv474_10209.txt     1
## 476 cv475_21692.txt     1
## 477 cv476_16856.txt     1
## 485 cv484_25054.txt     1

And we check how many times each pattern has been found.

# Number of files in which each pattern was found.
countMatchesPerColumn(matches0) 
##             Entity Count
## 1       created by     6
## 2 screenwriter[s]?    39
## 3  cinematographer    19
## 4     oscar winner     6

And we print the context in which the patterns are found, to see if we can build better patterns.

# For each pattern, print the text surrounding its occurrence in every
# document where detectPatternOnDocumentWithContext() finds it.
# seq_along() iterates zero times when pattern0 is empty, unlike 1:length().
for (i in seq_along(pattern0)) {
  print(paste("PATTERN: ", pattern0[i]))
  strings <- lapply(corpus, detectPatternOnDocumentWithContext,
                    pattern = pattern0[i])
  # Keep only the documents where the pattern was found.
  print(unlist(strings[!is.na(unlist(strings))]))
  print(" ")
}
## [1] "PATTERN:  created by"
##                                                                                                   cv000_29590.txt 
## "ok like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought t" 
##                                                                                                    cv205_9457.txt 
## "turvy . \nrobert zemeckis , back from the euphoria created by his last film , forrest gump , once again proves " 
##                                                                                                   cv285_16494.txt 
## "ve got mail like dried-up mistletoe . \nthe sparks created by the earlier movie are , by necessity , not eviden" 
##                                                                                                    cv324_7082.txt 
## "the real thing . \nthe two of them , as characters created by fingal's imagination , serve as aspects of his pe" 
##                                                                                                    cv371_7630.txt 
## "nd always right on the mark , enhancing the moods created by the animated scenery . \nas far as the subtitles g" 
##                                                                                                   cv484_25054.txt 
##  " and there are cliches , but the walls of water , created by fluid dynamics simulating real-life phenomena , a" 
## [1] " "
## [1] "PATTERN:  screenwriter[s]?"
##                                                                                                        cv000_29590.txt 
##    " bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesia" 
##                                                                                                        cv028_26746.txt 
## "d the story is so complex and \" clever \" that the screenwriters are the first to get lost in it . \nthere is is no" 
##                                                                                                        cv029_18643.txt 
##     "mise certainly is interesting and director and co-screenwriter alex proyas is able to keep the film consistently" 
##                                                                                                        cv076_24945.txt 
##  " logical confusion . \ndirector gregory hoblit and screenwriter toby emmerich structure \" frequency \" as good hol" 
##                                                                                                        cv078_14730.txt 
##     "different , it would be easy for the director and screenwriter to dumb it down and appeal to the lowest common d" 
##                                                                                                        cv086_18371.txt 
##    "itively made . \ndirector michael winterbottom and screenwriter frank cottrell boyce vividly express the societal" 
##                                                                                                        cv094_27889.txt 
##   "it prince acquit themselves admirably . \nkudos to screenwriters james schamus , wang hui ling and tsai kuo jing ," 
##                                                                                                        cv115_25396.txt 
##    "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n" 
##                                                                                                        cv121_17302.txt 
##    "i relish those rare opportunities when a talented screenwriter can make me feel like a fool . \ni spent the first" 
##                                                                                                        cv135_11603.txt 
##   "is a highly enjoyable ride . \nonce again , chan's screenwriters ( here edward tang and fibe ma ) have taken the e" 
##                                                                                                        cv142_22516.txt 
##     "e hallstrom ( what's eating gilbert grape ? ) and screenwriter/novelist john irving ( the world according to gar" 
##                                                                                                        cv143_19666.txt 
##     "sy to have written vivian as merely a flake , but screenwriter j . f . lawton clearly cared too much about his c" 
##                                                                                                        cv158_10390.txt 
##    " potentially hilarious comedy , and pitched it to screenwriter paul rudnick . \nit's true that if this same film " 
##                                                                                                        cv162_10424.txt 
##     "rise to fame , quickly become the most well-known screenwriter amongst the entertainment weekly-reading , box of" 
##                                                                                                        cv178_12972.txt 
##     "aseball bat , but i'm pretty sure andrew niccol , screenwriter for the truman show , has had the same curious th" 
##                                                                                                        cv190_27052.txt 
##    "has been sharply pieced together by tony gilroy , screenwriter of the devil's advocate and dolores claiborne . \n" 
##                                                                                                        cv191_29719.txt 
##   "\" was right after all . \nfortunately , first-time screenwriter marc klein has sketched strong , well-rounded , c" 
##                                                                                                         cv208_9020.txt 
##    "ble , but after all the star power , mega bucks , screenwriters , directors , and cool trailers , men in black is" 
##                                                                                                        cv225_29224.txt 
##    "efall the participants en route to silver city . \nscreenwriter andy breckman adds a nice touch by not having the" 
##                                                                                                        cv235_10217.txt 
##    "too much to it , but capra and the gang ( various screenwriters , composers , actors ) plumet the material to its" 
##                                                                                                        cv240_14336.txt 
##   "sh anything , it just comes natural to him . \nthe screenwriters use the right words and phrases to describe the m" 
##                                                                                                        cv241_23130.txt 
##     "lf more seriously this time ; maybe so , or maybe screenwriter ehren kruger ( arlington road ) , who took over a" 
##                                                                                                        cv274_25253.txt 
##    "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n" 
##                                                                                                        cv318_10493.txt 
##     "nd-up from veteran horror director wes craven and screenwriter kevin williamson that seemed to breathe new life " 
##                                                                                                        cv336_10143.txt 
##     "ed for scream 2 , including director wes craven , screenwriter kevin williamson , and actors neve campbell , cou" 
##                                                                                                         cv359_6647.txt 
##    "y after all . \nfavreau also doubled as the film's screenwriter , and he proves he has the gift for creating enga" 
##                                                                                                        cv365_11576.txt 
##   "ing . \nthat's not to say the movie isn't funny . \nscreenwriter tim herlihy has written for sandler before ( bill" 
##                                                                                                         cv370_5221.txt 
##    " . \nit tore at my heart to watch a gifted lesbian screenwriter explain that , as a rule , gay audiences hunger f" 
##                                                                                                        cv395_10849.txt 
##                 "jake kasdan , son of one of the best screenwriters around , breaks into filmmaking by writing and di" 
##                                                                                                        cv404_20315.txt 
##     "albert brooks plays steven phillips , a hollywood screenwriter who after winning a humanitarian award for his wo" 
##                                                                                                        cv405_20399.txt 
##    " the 1999 film outside providence ( 6 . 5/10 ) . \nscreenwriter w . peter iliff also had a part in writing the sc" 
##                                                                                                        cv410_24266.txt 
##   " . \nminkoff likes to point out scenes where other screenwriters came in and polished up the script , namely write" 
##                                                                                                        cv432_14224.txt 
##    "ments work very well -- for a comic book story . \nscreenwriter david goyer ( who also wrote the crow ) incorpora" 
##                                                                                                         cv452_5088.txt 
##   "tober is distinguished by its water-tight plot . \nscreenwriters larry ferguson and donald stewart have gracefully" 
##                                                                                                        cv456_18985.txt 
##   "d , and all are handled exceptionally well by the screenwriters . \nthere is no shred of doubt left to ponder afte" 
##                                                                                                        cv464_15650.txt 
##   "ture and charm holds right up to the last reel . \nscreenwriters john eskow , ted elliot and terry rosio have unfo" 
##                                                                                                        cv466_18722.txt 
##                "david mamet has long been my favorite screenwriter and director . \nwith his distinctive , more often" 
##                                                                                                        cv474_10209.txt 
##   "s that \" genius is insanity with some success \" , screenwriter fierstein is taking a lazy shortcut ) , pryce mak" 
##                                                                                                        cv476_16856.txt 
##     "threat of class struggle ; for george pal and his screenwriter david duncan , who produced the film in the worst" 
## [1] " "
## [1] "PATTERN:  cinematographer"
##                                                                                                         cv000_29590.txt 
##   "zed , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures t" 
##                                                                                                         cv067_19774.txt 
##   "atmospherically shot by the silence of the lambs' cinematographer , tak fujimoto ) is actually a drama-its spooky ," 
##                                                                                                         cv099_10534.txt 
##  "nger . \nas depicted by hoblit ( primal fear ) and cinematographer newton thomas sigel , philadelphia is a dark , dr" 
##                                                                                                         cv121_17302.txt 
##   "occasionally the hyper-real approach works , with cinematographer elliot davis creating a world of fantasy romance " 
##                                                                                                          cv154_9328.txt 
##   "rom editors karen schmeer and shondra merrill and cinematographer robert richardson ( oliver stone's longtime colla" 
##                                                                                                         cv159_29505.txt 
##  "ly amazing piotr sobocinski , the oscar-nominated cinematographer behind krzysztof kieslowski's red . \nthe acting i" 
##                                                                                                         cv215_22240.txt 
##  " echoing the loneliness of the protagonists . \n ( cinematographer remi adafarasin often allows space to engulf them" 
##                                                                                                         cv220_29059.txt 
##   ", and comic-panel framed with a virtuoso grace by cinematographer amy vincent ( _death in venice , ca_ ) , while in" 
##                                                                                                         cv273_29112.txt 
##  "han he expected at the beginning . \nit helps that cinematographer matthieu poirot-delpech's crisp lensing complimen" 
##                                                                                                         cv294_11684.txt 
## "hudsucker proxy . \" \nthe film was shot by veteran cinematographer roger deakins , who has worked with the coens on " 
##                                                                                                         cv297_10047.txt 
##  "work done by production designer nigel phelps and cinematographer darius khondji . \nas technically adept as jeunet'" 
##                                                                                                         cv323_29805.txt 
##   "sarossy who directs spent most of his career as a cinematographer and like the kingpin's lair , he has molded image" 
##                                                                                                         cv351_15458.txt 
##  "s the result of a perfect pairing of director and cinematographer . \nkapur and his cinematographer remi adefarasin " 
##                                                                                                         cv362_15341.txt 
##  "eth are shekhar kapur's visual delights . \nhe and cinematographer remi adefarasin have crafted a film with a rich c" 
##                                                                                                         cv386_10080.txt 
##   "keen eye for the stylish ; his collaboration with cinematographer slawomir idziak , production designer jan roelfs " 
##                                                                                                         cv397_29023.txt 
##  "of skilled craftsmen to work behind the camera . \ncinematographer matthew f . leonetti has had a long career of sho" 
##                                                                                                         cv409_29786.txt 
##   "( once again wielded by superb and ever-attentive cinematographer eric gautier ) is less appropriate here than in h" 
##                                                                                                         cv427_10825.txt 
##  "e telephones are old-fashioned , rotary models . \ncinematographer bill butler is given an opportunity to use unconv" 
##                                                                                                          cv431_7085.txt 
##  "is death in the early 80's . \nmore practiced as a cinematographer than a director , bava nonetheless sat in the dir" 
## [1] " "
## [1] "PATTERN:  oscar winner"
##                                                                                                     cv000_29590.txt 
## "omparison to that in the black-and-white comic . \noscar winner martin childs' ( shakespeare in love ) production" 
##                                                                                                     cv108_15571.txt 
## "sshoppers is hopper , who is fiendishly voiced by oscar winner kevin spacey . \nwhen the offering is lost hopper " 
##                                                                                                     cv300_22284.txt 
## " he stole every scene he was in away from veteran oscar winner tom hanks . \nrockwell , an independent film veter" 
##                                                                                                     cv314_14422.txt 
##  "eresting to see that this movie was one of future oscar winner susan sarandon's ( dead man walking ) first film-" 
##                                                                                                     cv317_24049.txt 
##  " the former actually had the insight to follow up oscar winner usual suspects with a pauly shore vehicle , and j" 
##                                                                                                     cv475_21692.txt 
## "on and ben affleck in the starring roles . \nbeing oscar winners for the classic 'good will hunting' they give gr" 
## [1] " "

Find entities

Now we define more complex regular expressions that help identify mentions of people.

# Surface-text patterns: a cue phrase followed by a capture group holding two
# space-separated runs of [A-z] characters (the candidate first/last name).
# NOTE(review): `[A-z]` is wider than `[A-Za-z]` -- in ASCII it also covers
# the characters between "Z" and "a" ([ \ ] ^ _ `) -- and `*` lets a run be
# empty. Kept as-is because the printed results in this document depend on it.
pattern1 <- c(
  "created by ([A-z]* [A-z]*)",
  "created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)",
  "screenwriter[s]? ([A-z]* [A-z]*)",
  "cinematographer(?: ,)? ([A-z]* [A-z]*)",
  "oscar winner ([A-z]* [A-z]*)"
)

We detect those patterns in the corpus and see in which files they appear.

# Apply every regex in pattern1 to the corpus, then list the file name
# (column 1) beside the screenwriter capture (column 4), keeping only the
# rows where that column is not NA.
matches1 <- detectPatternsInCorpus(corpus, pattern1)
matches1[!is.na(matches1[4]), c(1, 4)]
##                File screenwriter[s]? ([A-z]* [A-z]*)
## 1   cv000_29590.txt                      terry hayes
## 29  cv028_26746.txt                          are the
## 30  cv029_18643.txt                      alex proyas
## 77  cv076_24945.txt                    toby emmerich
## 79  cv078_14730.txt                          to dumb
## 87  cv086_18371.txt                   frank cottrell
## 95  cv094_27889.txt                    james schamus
## 116 cv115_25396.txt                karey kirkpatrick
## 122 cv121_17302.txt                         can make
## 144 cv143_19666.txt                               j 
## 159 cv158_10390.txt                     paul rudnick
## 163 cv162_10424.txt                      amongst the
## 179 cv178_12972.txt                          for the
## 191 cv190_27052.txt                           of the
## 192 cv191_29719.txt                       marc klein
## 226 cv225_29224.txt                    andy breckman
## 241 cv240_14336.txt                          use the
## 242 cv241_23130.txt                     ehren kruger
## 275 cv274_25253.txt                karey kirkpatrick
## 319 cv318_10493.txt                 kevin williamson
## 337 cv336_10143.txt                 kevin williamson
## 366 cv365_11576.txt                      tim herlihy
## 371  cv370_5221.txt                     explain that
## 396 cv395_10849.txt                          around 
## 405 cv404_20315.txt                        who after
## 406 cv405_20399.txt                               w 
## 411 cv410_24266.txt                          came in
## 433 cv432_14224.txt                      david goyer
## 453  cv452_5088.txt                   larry ferguson
## 465 cv464_15650.txt                       john eskow
## 467 cv466_18722.txt                     and director
## 475 cv474_10209.txt                     fierstein is
## 477 cv476_16856.txt                     david duncan

We print the matches found per pattern.

# For each regex in pattern1, print a "PATTERN: ..." header followed by the
# strings captured for it in matches1 (helper defined earlier in the document).
printMatchesPerPattern(pattern1, matches1)
## [1] "PATTERN:  created by ([A-z]* [A-z]*)"
## [1] "alan moore"     "his last"       "the earlier"    "the animated"  
## [5] "fluid dynamics"
## [1] " "
## [1] "PATTERN:  created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN:  screenwriter[s]? ([A-z]* [A-z]*)"
##  [1] "terry hayes"       "are the"           "alex proyas"      
##  [4] "toby emmerich"     "to dumb"           "frank cottrell"   
##  [7] "james schamus"     "karey kirkpatrick" "can make"         
## [10] "j "                "paul rudnick"      "amongst the"      
## [13] "for the"           "of the"            "marc klein"       
## [16] "andy breckman"     "use the"           "ehren kruger"     
## [19] "karey kirkpatrick" "kevin williamson"  "kevin williamson" 
## [22] "tim herlihy"       "explain that"      "around "          
## [25] "who after"         "w "                "came in"          
## [28] "david goyer"       "larry ferguson"    "john eskow"       
## [31] "and director"      "fierstein is"      "david duncan"     
## [1] " "
## [1] "PATTERN:  cinematographer(?: ,)? ([A-z]* [A-z]*)"
##  [1] "peter deming"      "tak fujimoto"      "newton thomas"    
##  [4] "elliot davis"      "robert richardson" "behind krzysztof" 
##  [7] "remi adafarasin"   "amy vincent"       "matthieu poirot"  
## [10] "roger deakins"     "darius khondji"    "and like"         
## [13] "remi adefarasin"   "remi adefarasin"   "slawomir idziak"  
## [16] "matthew f"         "eric gautier"      "bill butler"      
## [19] "than a"           
## [1] " "
## [1] "PATTERN:  oscar winner ([A-z]* [A-z]*)"
## [1] "martin childs"  "kevin spacey"   "tom hanks"      "susan sarandon"
## [5] "usual suspects"
## [1] " "

We check how many patterns we have found in each file.

# Per-file tally over matches1. Judging by the output, Count looks like the
# number of pattern columns with a match in that file, and only files with
# Count >= 1 are listed -- confirm against the helper's definition.
countMatchesPerRow(matches1)
##                File Count
## 1   cv000_29590.txt     5
## 29  cv028_26746.txt     1
## 30  cv029_18643.txt     1
## 68  cv067_19774.txt     1
## 77  cv076_24945.txt     1
## 79  cv078_14730.txt     1
## 87  cv086_18371.txt     1
## 95  cv094_27889.txt     1
## 100 cv099_10534.txt     1
## 109 cv108_15571.txt     1
## 116 cv115_25396.txt     1
## 122 cv121_17302.txt     2
## 144 cv143_19666.txt     1
## 155  cv154_9328.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     1
## 163 cv162_10424.txt     1
## 179 cv178_12972.txt     1
## 191 cv190_27052.txt     1
## 192 cv191_29719.txt     1
## 206  cv205_9457.txt     1
## 216 cv215_22240.txt     1
## 221 cv220_29059.txt     1
## 226 cv225_29224.txt     1
## 241 cv240_14336.txt     1
## 242 cv241_23130.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     1
## 286 cv285_16494.txt     1
## 295 cv294_11684.txt     1
## 298 cv297_10047.txt     1
## 301 cv300_22284.txt     1
## 315 cv314_14422.txt     1
## 318 cv317_24049.txt     1
## 319 cv318_10493.txt     1
## 324 cv323_29805.txt     1
## 337 cv336_10143.txt     1
## 352 cv351_15458.txt     1
## 363 cv362_15341.txt     1
## 366 cv365_11576.txt     1
## 371  cv370_5221.txt     1
## 372  cv371_7630.txt     1
## 387 cv386_10080.txt     1
## 396 cv395_10849.txt     1
## 398 cv397_29023.txt     1
## 405 cv404_20315.txt     1
## 406 cv405_20399.txt     1
## 410 cv409_29786.txt     1
## 411 cv410_24266.txt     1
## 428 cv427_10825.txt     1
## 432  cv431_7085.txt     1
## 433 cv432_14224.txt     1
## 453  cv452_5088.txt     1
## 465 cv464_15650.txt     1
## 467 cv466_18722.txt     1
## 475 cv474_10209.txt     1
## 477 cv476_16856.txt     1
## 485 cv484_25054.txt     1

And we check how many times each pattern has been found.

# Per-pattern tally over matches1: one output row per regex, where Count
# appears to be the number of files in which that regex matched -- confirm
# against the helper's definition.
countMatchesPerColumn(matches1)
##                                             Entity Count
## 1                       created by ([A-z]* [A-z]*)     5
## 2 created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)     1
## 3                 screenwriter[s]? ([A-z]* [A-z]*)    33
## 4           cinematographer(?: ,)? ([A-z]* [A-z]*)    19
## 5                     oscar winner ([A-z]* [A-z]*)     5

Find entities using part-of-speech (POS) tags

Now we add part-of-speech information to our regular expressions to filter out incorrect matches.

# POS-aware patterns: every token is written as "word/TAG", and only the word
# part of each name token is captured. The tags (NN, NNS, JJ, IN, VBN, VBG, CC,
# -LRB-, -RRB-) look like Penn Treebank labels -- presumably the format
# produced for corpus.taggedText; confirm against the tagging helper.
# The last three patterns pick up two-token names inside parentheses.
pattern2 <- c(
  "created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN",
  "created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN",
  "screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)",
  "cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN",
  "cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN",
  "oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS",
  "\\(/-LRB- ([A-z]*)/NN ([A-z]*)/NN \\)/-RRB-",
  "\\(/-LRB- ([A-z]*)/JJ ([A-z]*)/NN \\)/-RRB-",
  "\\(/-LRB- ([A-z]*)/NN ([A-z]*)/NNS \\)/-RRB-"
)

We detect those patterns in the POS-tagged corpus.

# Match the POS-aware regexes in pattern2 against the tagged text, then list
# the file name (column 1) beside the screenwriter capture (column 4), keeping
# only the rows where that column is not NA.
allEntities <- detectPatternsInTaggedCorpus(corpus, corpus.taggedText, pattern2)
allEntities[!is.na(allEntities[4]), c(1, 4)]
##                File
## 1   cv000_29590.txt
## 30  cv029_18643.txt
## 77  cv076_24945.txt
## 87  cv086_18371.txt
## 95  cv094_27889.txt
## 116 cv115_25396.txt
## 192 cv191_29719.txt
## 226 cv225_29224.txt
## 242 cv241_23130.txt
## 275 cv274_25253.txt
## 433 cv432_14224.txt
## 453  cv452_5088.txt
## 465 cv464_15650.txt
## 477 cv476_16856.txt
##     screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 1                                                         terry hayes
## 30                                                        alex proyas
## 77                                                      toby emmerich
## 87                                                     frank cottrell
## 95                                                      james schamus
## 116                                                 karey kirkpatrick
## 192                                                        marc klein
## 226                                                     andy breckman
## 242                                                      ehren kruger
## 275                                                 karey kirkpatrick
## 433                                                       david goyer
## 453                                                    larry ferguson
## 465                                                        john eskow
## 477                                                      david duncan

We can also view the entities for a certain pattern.

# Column 4 (the screenwriter pattern) as a plain vector, NA entries dropped.
allEntities[[4]][!is.na(allEntities[[4]])]
##  [1] "terry hayes"       "alex proyas"       "toby emmerich"    
##  [4] "frank cottrell"    "james schamus"     "karey kirkpatrick"
##  [7] "marc klein"        "andy breckman"     "ehren kruger"     
## [10] "karey kirkpatrick" "david goyer"       "larry ferguson"   
## [13] "john eskow"        "david duncan"
# For each regex in pattern2, print a "PATTERN: ..." header followed by the
# entities found for it in allEntities; in the output, several hits from the
# same file appear joined by ", " in a single string.
printMatchesPerPattern(pattern2, allEntities)
## [1] "PATTERN:  created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN"
## [1] "alan moore"     "fluid dynamics"
## [1] " "
## [1] "PATTERN:  created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN:  screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)"
##  [1] "terry hayes"       "alex proyas"       "toby emmerich"    
##  [4] "frank cottrell"    "james schamus"     "karey kirkpatrick"
##  [7] "marc klein"        "andy breckman"     "ehren kruger"     
## [10] "karey kirkpatrick" "david goyer"       "larry ferguson"   
## [13] "john eskow"        "david duncan"     
## [1] " "
## [1] "PATTERN:  cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN"
##  [1] "peter deming"      "tak fujimoto"      "elliot davis"     
##  [4] "robert richardson" "remi adafarasin"   "roger deakins"    
##  [7] "remi adefarasin"   "slawomir idziak"   "matthew f"        
## [10] "bill butler"      
## [1] " "
## [1] "PATTERN:  cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN"
## [1] "newton thomas sigel"
## [1] " "
## [1] "PATTERN:  oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS"
## [1] "martin childs"
## [1] " "
## [1] "PATTERN:  \\(/-LRB- ([A-z]*)/NN ([A-z]*)/NN \\)/-RRB-"
##   [1] "ghost world"                                                                                             
##   [2] "_election _, _rushmore _, matthew broderick, bill murray, _rushmore _, _election _"                      
##   [3] "roy scheider, murray hamilton, robert shaw"                                                              
##   [4] "maka kotto"                                                                                              
##   [5] "jennifer lien"                                                                                           
##   [6] "maka kotto"                                                                                              
##   [7] "john cleese, rowan atkinson, wayne knight, kathy najimy, vince vieluf, dean cain"                        
##   [8] "hope davis"                                                                                              
##   [9] "forrest gump, tin cup"                                                                                   
##  [10] "patrick stewart, donna murphy, brent spiner"                                                             
##  [11] "leslie mann"                                                                                             
##  [12] "graham beckel, halle berry"                                                                              
##  [13] "fernanda montenegro, _la promesse_"                                                                      
##  [14] "liam neeson, ewan mcgregor, natalie portman, jake lloyd"                                                 
##  [15] "johnathon schaech, liv tyler"                                                                            
##  [16] "allison janney, chris cooper"                                                                            
##  [17] "matt damon"                                                                                              
##  [18] "paul campbell, winston bell"                                                                             
##  [19] "john goodman"                                                                                            
##  [20] "maaia sethna"                                                                                            
##  [21] "gene hackman, rene russo"                                                                                
##  [22] "patrick stewart"                                                                                         
##  [23] "john lithgow, cameron diaz"                                                                              
##  [24] "tsutomu yamazaki, nobuko miyamoto, rikiya yasuoka"                                                       
##  [25] "mark hamill, carrie fisher, peter mayhew, harrison ford"                                                 
##  [26] "jennifer aniston, christopher mcdonald"                                                                  
##  [27] "michel aumont, alexandra vandernoot, thierry lhermite"                                                   
##  [28] "joaquin phoenix"                                                                                         
##  [29] "giorgio cantarini"                                                                                       
##  [30] "max shreck"                                                                                              
##  [31] "bill pullman, angela featherstone"                                                                       
##  [32] "dennis quaid"                                                                                            
##  [33] "stephen dorff"                                                                                           
##  [34] "stephen rea"                                                                                             
##  [35] "mel gibson"                                                                                              
##  [36] "jake gyllenhaal, chris cooper, laura dern, jurassic park"                                                
##  [37] "john belushi"                                                                                            
##  [38] "minnie driver"                                                                                           
##  [39] "stephen dillane, marisa tomei"                                                                           
##  [40] "andre dussollier"                                                                                        
##  [41] "paul newman, linda fiorentino, dylan mcdermott"                                                          
##  [42] "bruce davison"                                                                                           
##  [43] "jason priestley"                                                                                         
##  [44] "tobey maguire"                                                                                           
##  [45] "denzel washington, john goodman"                                                                         
##  [46] "julie christie"                                                                                          
##  [47] "angela featherstone, matthew glave"                                                                      
##  [48] "robert deniro, dustin hoffman"                                                                           
##  [49] "mel gibson, miranda richardson"                                                                          
##  [50] "sam neill, laura dern"                                                                                   
##  [51] "john cusack"                                                                                             
##  [52] "linda hamilton"                                                                                          
##  [53] "maura tierney, sandra bullock"                                                                           
##  [54] "walter matthau"                                                                                          
##  [55] "kate nelligan, alfre woodard, winona ryder"                                                              
##  [56] "giorgio cantarini"                                                                                       
##  [57] "liam cunningham, kate winslet, nudge nudge"                                                              
##  [58] "gabrielle fitzpatrick"                                                                                   
##  [59] "elke sommer, bernard bresslaw, jack douglas, sherrie hewson"                                             
##  [60] "bruce willis, chris tucker"                                                                              
##  [61] "cameron diaz, leland orser, jeremy piven, daniel stern, carla scott"                                     
##  [62] "bruce willis"                                                                                            
##  [63] "val kilmer, patrick stewart"                                                                             
##  [64] "diane lane"                                                                                              
##  [65] "christopher walken, chuck aspegren"                                                                      
##  [66] "joan cusack, matt dillon, bob newhart, tom selleck"                                                      
##  [67] "anton yelchin, mika boorem"                                                                              
##  [68] "jessica hester, valerie chow"                                                                            
##  [69] "peter wang, shen guanglan, sharon iwai"                                                                  
##  [70] "neve campbell, jamie kennedy, courteney cox, liev schreiber, laurie metcalf"                             
##  [71] "peter billingsley"                                                                                       
##  [72] "tom cruise, jeremy blackman"                                                                             
##  [73] "matthew mcconaughey, tom skerritt"                                                                       
##  [74] "giorgio cantarini"                                                                                       
##  [75] "gretchen mol"                                                                                            
##  [76] "cameron diaz, matt dillon, chris elliot, lin shaye"                                                      
##  [77] "jim carrey, laura linney"                                                                                
##  [78] "ashley judd"                                                                                             
##  [79] "sally field, pierce brosnan"                                                                             
##  [80] "john leguizamo, mira sorvino, jennifer esposito"                                                         
##  [81] "john cusack"                                                                                             
##  [82] "ashley judd"                                                                                             
##  [83] "natascha mcelhone"                                                                                       
##  [84] "eliza dushku"                                                                                            
##  [85] "charlotte rampling, bruno cremer"                                                                        
##  [86] "kate winslet, leonardo dicaprio"                                                                         
##  [87] "robert carlyle"                                                                                          
##  [88] "matthew mcconaughey"                                                                                     
##  [89] "ralph cotterill, claire benito, carmel johnson"                                                          
##  [90] "tamiyo kusakari, naoto takenaka"                                                                         
##  [91] "tom skerritt"                                                                                            
##  [92] "pat morita"                                                                                              
##  [93] "bill murray, chris rock, laurence fishburne"                                                             
##  [94] "paul reiser, jenette goldstein, michael biehn, lance henriksen, carrie henn"                             
##  [95] "matt damon"                                                                                              
##  [96] "george coe"                                                                                              
##  [97] "dennis quaid"                                                                                            
##  [98] "robert deniro, lisa kudrow"                                                                              
##  [99] "colm feore, aunjanue ellis"                                                                              
## [100] "estella warren"                                                                                          
## [101] "mark hamill"                                                                                             
## [102] "john cleese, rowan atkinson"                                                                             
## [103] "john lovitz"                                                                                             
## [104] "bill paxton"                                                                                             
## [105] "matt damon"                                                                                              
## [106] "pam grier, robert forster, bridget fonda"                                                                
## [107] "joe ranft, denis leary, madeliene kahn, bonnie hunt, michael mcshane"                                    
## [108] "matt dillon, neve campbell, bill murray"                                                                 
## [109] "gretchen mol"                                                                                            
## [110] "neve campbell, liev schreiber"                                                                           
## [111] "minnie driver"                                                                                           
## [112] "cameron diaz, catherine keener"                                                                          
## [113] "tamiyo kusakari, hideko hara, naoto takenaka"                                                            
## [114] "jennifer lopez"                                                                                          
## [115] "abu warda"                                                                                               
## [116] "guy pearce, stephen spinella, robert carlyle"                                                            
## [117] "sylvester stallone, sharon stone, gene hackman, christopher walkin"                                      
## [118] "halle barry"                                                                                             
## [119] "christopher walken"                                                                                      
## [120] "robert deniro, lisa kudrow"                                                                              
## [121] "stephen ramsey"                                                                                          
## [122] "charlie sheen"                                                                                           
## [123] "natalie portman, ewan mcgregor, liam neeson"                                                             
## [124] "philippe garziano, ariane ascaride"                                                                      
## [125] "mel gibson, miranda richardson"                                                                          
## [126] "john leguizamo"                                                                                          
## [127] "martin donovan, queen latifah"                                                                           
## [128] "robert duvall, daryl hannah"                                                                             
## [129] "jake gyllenhaal, laura dern"                                                                             
## [130] "glenn close, lance henricksen, wayne knight"                                                             
## [131] "tom mcinnerny"                                                                                           
## [132] "patrick stewart, brent spiner, marina sirtis, alice krige, alfre woodard"                                
## [133] "morgan freeman, matthew mcconaughey"                                                                     
## [134] "brendan fraser"                                                                                          
## [135] "john goodman, tara reid, john turturro"                                                                  
## [136] "jon voight, gene hackman"                                                                                
## [137] "sigourney weaver"                                                                                        
## [138] "wayne knight, wallace shawn, john ratzenberger, jim varney, joan cusack"                                 
## [139] "cate blanchett, hilary swank"                                                                            
## [140] "alexa vega, daryl sabara, carla gugino, cheech marin"                                                    
## [141] "eugene levy, parker posey"                                                                               
## [142] "henning mortizen, paprika steen"                                                                         
## [143] "john lithgow"                                                                                            
## [144] "matthew broderick, jeanine jackson, chris klein, jessica campbell"                                       
## [145] "bill paxton, gloria stuart"                                                                              
## [146] "courtney cox, jamie kennedy, neve campbell"                                                              
## [147] "robert carlyle, mark addy, paul barber, tom wilkinson"                                                   
## [148] "uma thurman"                                                                                             
## [149] "sylvester stallone, hulk hogan, burgess meredith"                                                        
## [150] "jack warden"                                                                                             
## [151] "bridget fonda"                                                                                           
## [152] "wright penn"                                                                                             
## [153] "delroy lindo, rebecca pidgeon"                                                                           
## [154] "mira sorvino, val kilmer, bruce davison"                                                                 
## [155] "neve campbell, jamie kennedy, courtney cox, liev schreiber, laurie metcalf"                              
## [156] "tommy boy"                                                                                               
## [157] "melanie chisolm, emma bunton"                                                                            
## [158] "joaquin phoenix, connie nielsen"                                                                         
## [159] "sigourney weaver"                                                                                        
## [160] "jennifer lopez"                                                                                          
## [161] "kevin spacey, annette bening"                                                                            
## [162] "sharon stone"                                                                                            
## [163] "tom wilkinson"                                                                                           
## [164] "sarah patterson, angela lansbury"                                                                        
## [165] "ron livingston"                                                                                          
## [166] "harrison ford"                                                                                           
## [167] "pat carroll"                                                                                             
## [168] "cate blanchett, kathy burke"                                                                             
## [169] "peter falk"                                                                                              
## [170] "jim dale"                                                                                                
## [171] "jamey sheridan, sigourney weaver"                                                                        
## [172] "stephen rea, julianne moore"                                                                             
## [173] "leonardo dicaprio, kate winslet, bernard hill"                                                           
## [174] "laura linney, natasha mcelhone"                                                                          
## [175] "annette bening"                                                                                          
## [176] "liam neeson"                                                                                             
## [177] "heather graham"                                                                                          
## [178] "john travolta, bruce willis, amanda plummer, uma thurman, harvey keitel, quentin tarantino, peter greene"
## [179] "uma thurman"                                                                                             
## [180] "bill pullman, patricia arquette"                                                                         
## [181] "alice poon, rosanna arquette, peter macneill"                                                            
## [182] "bill pullman, angela featherstone"                                                                       
## [183] "chris tucker"                                                                                            
## [184] "kate winslet, leonardo dicaprio"                                                                         
## [185] "jack lemmon"                                                                                             
## [186] "sharon stone"                                                                                            
## [187] "carrie fisher, kenny baker, mark hamill, alec guiness, harrison ford, peter mayhew"                      
## [188] "jim caviezel, dennis quaid"                                                                              
## [189] "pam grier, robert forster"                                                                               
## [190] "pat morita, jerry tondo, eddie murphy"                                                                   
## [191] "dina meyer"                                                                                              
## [192] "nina hartley, tracy wright"                                                                              
## [193] "jake busey"                                                                                              
## [194] "renee zellweger, chris penn, michael rooker, rosanna arquette"                                           
## [195] "gloria stuart, kate winslett"                                                                            
## [196] "chris tucker"                                                                                            
## [197] "bill paxton, jamie gertz"                                                                                
## [198] "tom skerrit, john hurt, matthew mcconaughey, jake busey"                                                 
## [199] "charlize theron, gretchen mol"                                                                           
## [200] "harve presnell"                                                                                          
## [201] "will smith"                                                                                              
## [202] "bruce willis"                                                                                            
## [203] "jason miller, linda blair"                                                                               
## [204] "robert loggia"                                                                                           
## [205] "bill paxton, kate winslett, leonardo dicaprio"                                                           
## [206] "harrison ford"                                                                                           
## [207] "wayne knight"                                                                                            
## [208] "brad pitt, tom cruise, stephen rea"                                                                      
## [209] "alice hardford"                                                                                          
## [210] "stuart wilson, matthew letscher"                                                                         
## [211] "sal mineo"                                                                                               
## [212] "rebecca pidgeon"                                                                                         
## [213] "jennifer lopez, vince vaughn"                                                                            
## [214] "cherie lunghi"                                                                                           
## [215] "cate blanchette"                                                                                         
## [216] "carrie fisher"                                                                                           
## [217] "chris rock, jason lee, linda fiorentino"                                                                 
## [218] "rod taylor, yvette mimieux"                                                                              
## [219] "albert hall, dennis hopper"                                                                              
## [220] "linus roache, alison elliot"                                                                             
## [221] "george clooney, mark walhberg, diane lane, william fichtner"                                             
## [222] "chazz palminteri, kevin spacey"                                                                          
## [223] "forrest gump"                                                                                            
## [224] "laurence fishburne"                                                                                      
## [225] "sessue hayakawa, william holden"                                                                         
## [226] "lisa kudrow"                                                                                             
## [227] "blythe danner, dustin hoffman, bill nunn"                                                                
## [1] " "
## [1] "PATTERN:  \\(/-LRB- ([A-z]*)/JJ ([A-z]*)/NN \\)/-RRB-"
##   [1] "vertical limit"                                                                  
##   [2] "reese witherspoon, matthew broderick"                                            
##   [3] "richard dreyfuss"                                                                
##   [4] "eriq ebouaney, pascal nzonzi"                                                    
##   [5] "rudi delhem, pascal nzonzi"                                                      
##   [6] "whoopi goldberg, lanai chapman, jon lovitz, seth green, breckin meyer, amy smart"
##   [7] "djimon hounsou"                                                                  
##   [8] "kevin spacey, david morse"                                                       
##   [9] "barbara windsor"                                                                 
##  [10] "alan tilvern, stubby kaye"                                                       
##  [11] "broken arrow"                                                                    
##  [12] "oliver platt"                                                                    
##  [13] "universal solider"                                                               
##  [14] "alan rickman, salma hayek"                                                       
##  [15] "steve zahn, ethan embry"                                                         
##  [16] "kevin spacey, annette bening, thora birch, wes bentley"                          
##  [17] "stellan skarsgard"                                                               
##  [18] "rahul khanna"                                                                    
##  [19] "danny devito"                                                                    
##  [20] "eddie murphy"                                                                    
##  [21] "kenny baker"                                                                     
##  [22] "vin diesel"                                                                      
##  [23] "daniel auteil, michele laroque, stanislas crevillen, gerard depardieu"           
##  [24] "richard harris, russell crowe, oliver reed"                                      
##  [25] "gwyneth paltrow, anne bancroft"                                                  
##  [26] "greta schroeder"                                                                 
##  [27] "fiona shaw, alan boyle"                                                          
##  [28] "ben stiller"                                                                     
##  [29] "elizabeth mitchell, daniel henson"                                               
##  [30] "fiona shaw"                                                                      
##  [31] "danny glover"                                                                    
##  [32] "david duchovny"                                                                  
##  [33] "woody harrelson"                                                                 
##  [34] "daniel auteuil, daniel auteuil, emmanuelle beart"                                
##  [35] "jay hernandez, kirsten dunst"                                                    
##  [36] "fiona loewi"                                                                     
##  [37] "michael douglas"                                                                 
##  [38] "embeth davidtz, donald sutherland, primal fear"                                  
##  [39] "nick nolte"                                                                      
##  [40] "sela ward"                                                                       
##  [41] "steve buscemi"                                                                   
##  [42] "anthony franciosa"                                                               
##  [43] "nick nolte, sissy spacek"                                                        
##  [44] "anne heche, woody harrelson"                                                     
##  [45] "tony haygarth"                                                                   
##  [46] "todd louiso"                                                                     
##  [47] "ben affleck"                                                                     
##  [48] "henry fonda, larry hagman"                                                       
##  [49] "jonathan pryce, jimmy nail"                                                      
##  [50] "ann bancroft, ellen burstyn, lois smith"                                         
##  [51] "nicoletta braschi"                                                               
##  [52] "ben chaplin"                                                                     
##  [53] "kenneth connor"                                                                  
##  [54] "ian holm, maiwenn lebesco"                                                       
##  [55] "jon favreau, christian slater"                                                   
##  [56] "richard gere"                                                                    
##  [57] "michelle pfeiffer"                                                               
##  [58] "liar liar"                                                                       
##  [59] "anne heche, david schwimmer"                                                     
##  [60] "helen hunt, greg kinnear"                                                        
##  [61] "george dzundza"                                                                  
##  [62] "michael caine"                                                                   
##  [63] "hy xiaoguang, li qinqin"                                                         
##  [64] "david arquette, timothy olyphant, duane martin"                                  
##  [65] "julianne moore"                                                                  
##  [66] "david morse"                                                                     
##  [67] "like existenz"                                                                   
##  [68] "brian benben"                                                                    
##  [69] "ben stiller"                                                                     
##  [70] "francesca neri, javier bardem"                                                   
##  [71] "tori spelling"                                                                   
##  [72] "ed harris, noah emmerich"                                                        
##  [73] "brent briscoe"                                                                   
##  [74] "joseph mazzello, david strathairn, oliver platt"                                 
##  [75] "adrien brody"                                                                    
##  [76] "jude law"                                                                        
##  [77] "david morse"                                                                     
##  [78] "joseph mazzello, oliver platt"                                                   
##  [79] "shannon elizabeth, ali larter"                                                   
##  [80] "ewan mcgregor, ewan bremner, kevin mckidd"                                       
##  [81] "djimon hounsou"                                                                  
##  [82] "nicholas hope"                                                                   
##  [83] "miguel ferrer"                                                                   
##  [84] "runny nose"                                                                      
##  [85] "kathleen quinlan"                                                                
##  [86] "danny devito, danny glover"                                                      
##  [87] "like solaris"                                                                    
##  [88] "tim dekay, eric schweig"                                                         
##  [89] "tamara tunie, _practical magic_"                                                 
##  [90] "whoopi goldberg, lanai chapman, jon lovitz, breckin meyer"                       
##  [91] "michael bowen"                                                                   
##  [92] "jonathan harris"                                                                 
##  [93] "jimmy stewart"                                                                   
##  [94] "kristy swanson, jon stewart"                                                     
##  [95] "david schwimmer"                                                                 
##  [96] "david arquette, arlington road"                                                  
##  [97] "christian slater"                                                                
##  [98] "geoffrey rush"                                                                   
##  [99] "sean gullette"                                                                   
## [100] "neal mcdonough, david arquette"                                                  
## [101] "stellan skarsgard"                                                               
## [102] "anne bancroft, danny glover"                                                     
## [103] "oliver platt"                                                                    
## [104] "shannon elizabeth"                                                               
## [105] "billy crystal"                                                                   
## [106] "tony haygarth"                                                                   
## [107] "ewan mcgregor, nicole kidman, richard roxburgh"                                  
## [108] "holly hunter, danny devito"                                                      
## [109] "kenneth branagh, embeth davidzt"                                                 
## [110] "nicky katt"                                                                      
## [111] "minnie driver, nigel hawthorne, brian blessed"                                   
## [112] "levar burton, michael dorn"                                                      
## [113] "djimon hounsou, stellan skarsgard, arliss howard"                                
## [114] "steve buscemi, david huddleston, julianne moore, ben gazzara"                    
## [115] "tim allen, kelsey grammer"                                                       
## [116] "michael jeter"                                                                   
## [117] "giovanni ribisi, rosemary harris, greg kinnear, gary cole"                       
## [118] "alan cumming"                                                                    
## [119] "stellan skarsgard, ben affleck, minnie driver"                                   
## [120] "lee ving"                                                                        
## [121] "david duchovny, gillian anderson"                                                
## [122] "gbatokai dakinah"                                                                
## [123] "reese witherspoon"                                                               
## [124] "billy zane"                                                                      
## [125] "gary sinise, delroy lindo, rene russo"                                           
## [126] "loren dean, tony shalhoub, jude law"                                             
## [127] "david calder"                                                                    
## [128] "michael keaton"                                                                  
## [129] "ricky jay, danny devito, sam rockwell"                                           
## [130] "elie arroway"                                                                    
## [131] "kelly mcgillis"                                                                  
## [132] "danny glover"                                                                    
## [133] "david arquette, duane martin, david warner"                                      
## [134] "executive decision"                                                              
## [135] "russell crowe, richard harris, djimon hounsou, oliver reed"                      
## [136] "margaret welsh"                                                                  
## [137] "steve zahn"                                                                      
## [138] "thora birch"                                                                     
## [139] "arnold schwarzenegger, fake memory"                                              
## [140] "brian cox, gerard mcsorley, ciaran fitzgerald"                                   
## [141] "gary oldman"                                                                     
## [142] "jon pertwee"                                                                     
## [143] "elijah wood, kevin kline"                                                        
## [144] "ian hart"                                                                        
## [145] "billy zane"                                                                      
## [146] "ed harris, noah emmerich"                                                        
## [147] "ben kingsley"                                                                    
## [148] "arliss howard"                                                                   
## [149] "johnny depp, michael gambon"                                                     
## [150] "eddie murphy"                                                                    
## [151] "tim roth, eric stoltz, duane whitaker, frank whaley"                             
## [152] "stellan skarsgard"                                                               
## [153] "jude law"                                                                        
## [154] "donal mccann"                                                                    
## [155] "robert blake"                                                                    
## [156] "ben stiller"                                                                     
## [157] "michael york"                                                                    
## [158] "dan aykroyd"                                                                     
## [159] "andy macdowell"                                                                  
## [160] "jonathan lipnicki"                                                               
## [161] "michael bowen, michael keaton"                                                   
## [162] "joseph ashton, tantoo cardinal"                                                  
## [163] "miguel ferrer, gedde watanabe"                                                   
## [164] "clive owen"                                                                      
## [165] "donald sutherland"                                                               
## [166] "shauny sexton, daniel macivor"                                                   
## [167] "ann todd"                                                                        
## [168] "tim roth"                                                                        
## [169] "billy zane"                                                                      
## [170] "michael bowen"                                                                   
## [171] "helen hunt"                                                                      
## [172] "dan hedaya"                                                                      
## [173] "david morse"                                                                     
## [174] "jude law"                                                                        
## [175] "joe mantegna"                                                                    
## [176] "denzel washington, annette bening, tony shalhoub"                                
## [177] "kitty winn, ellen burstyn"                                                       
## [178] "alan cumming, william hurt"                                                      
## [179] "steven bauer, michelle pfeiffer"                                                 
## [180] "billy zane"                                                                      
## [181] "andreas katsulas"                                                                
## [182] "brian blessed"                                                                   
## [183] "christian slater, kirsten dunst"                                                 
## [184] "nick nightingale"                                                                
## [185] "ian hunter"                                                                      
## [186] "natalie wood"                                                                    
## [187] "kenneth branagh, ian holm"                                                       
## [188] "daniel craig"                                                                    
## [189] "american pie"                                                                    
## [190] "frederic forrest"                                                                
## [191] "elizabeth mcgovern"                                                              
## [192] "jimmy keogh, ian bannen, david kelly, eileen dromey"                             
## [193] "allen payne"                                                                     
## [194] "gabriel byrne, kevin pollak"                                                     
## [195] "leila hatami, ali mosaffa, jamileh sheikhi"                                      
## [196] "alec guinness"                                                                   
## [197] "tim roth, michael rappaport"                                                     
## [1] " "
## [1] "PATTERN:  \\(/-LRB- ([A-z]*)/NN ([A-z]*)/NNS \\)/-RRB-"
##  [1] "bob hoskins"                             
##  [2] "jason mewes"                             
##  [3] "robin williams"                          
##  [4] "jeff daniels"                            
##  [5] "mark danvers"                            
##  [6] "claire danes, robin williams"            
##  [7] "anthony daniels"                         
##  [8] "miramax films"                           
##  [9] "kim dickens"                             
## [10] "elias koteas"                            
## [11] "antonio banderas"                        
## [12] "jean simmons, maya angelou"              
## [13] "joan sims, windsor davies, carol hawkins"
## [14] "ralph fiennes"                           
## [15] "jason robards"                           
## [16] "anthony hopkins"                         
## [17] "denise richards"                         
## [18] "jacqueline obradors"                     
## [19] "jeffrey jones, jeremy davies"            
## [20] "anthony hopkins"                         
## [21] "tom hanks"                               
## [22] "julia roberts"                           
## [23] "keanu reeves, katie holmes, kim dickens" 
## [24] "antonio banderas"                        
## [25] "linda griffiths"                         
## [26] "stacy edwards"                           
## [27] "victoria adams"                          
## [28] "keanu reeves"                            
## [29] "jason isaacs"                            
## [30] "jane adams"                              
## [31] "sidney james, angela douglas"            
## [32] "christina ricci"                         
## [33] "ralph fiennes"                           
## [34] "elias kosteas"                           
## [35] "kim dickens"                             
## [36] "jeff bridges"                            
## [37] "miriam margoyles"                        
## [38] "denise richards"                         
## [39] "tom hanks"                               
## [40] "antonio banderas"                        
## [41] "ralph fiennes"                           
## [42] "jason mewes"                             
## [43] "sam bottoms"                             
## [44] "john hawkes"                             
## [45] "jack hawkins"                            
## [1] " "

We count all the entities per pattern.

And we can also draw a histogram of the counts.

# Tally the entities found by each pattern and display the resulting table.
entityCountPerPattern <- countMatchesPerColumn(allEntities)
entityCountPerPattern
##                                                                           Entity
## 1                                      created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN
## 2 created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN
## 3              screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 4                            cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/NN
## 5                cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN
## 6                                   oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS
## 7                                    \\(/-LRB- ([A-z]*)/NN ([A-z]*)/NN \\)/-RRB-
## 8                                    \\(/-LRB- ([A-z]*)/JJ ([A-z]*)/NN \\)/-RRB-
## 9                                   \\(/-LRB- ([A-z]*)/NN ([A-z]*)/NNS \\)/-RRB-
##   Count
## 1     2
## 2     1
## 3    14
## 4    10
## 5     1
## 6     1
## 7   227
## 8   197
## 9    45
# Histogram of the Count column of the per-pattern table.
hist(entityCountPerPattern$Count)

We count all the entities per file.

And we can also draw a histogram of the counts.

# Tally the entities found in each file and display the resulting table.
entityCountPerFile <- countMatchesPerRow(allEntities)
entityCountPerFile
##                File Count
## 1   cv000_29590.txt     7
## 2   cv001_18431.txt     2
## 4   cv003_11664.txt     2
## 6   cv005_29443.txt     2
## 7   cv006_15448.txt     1
## 9   cv008_29435.txt     2
## 11  cv010_29198.txt     2
## 13  cv012_29576.txt     1
## 14  cv013_10159.txt     1
## 15  cv014_13924.txt     1
## 17   cv016_4659.txt     1
## 19  cv018_20137.txt     2
## 21   cv020_8825.txt     2
## 22  cv021_15838.txt     1
## 23  cv022_12864.txt     1
## 24  cv023_12672.txt     2
## 26   cv025_3108.txt     1
## 30  cv029_18643.txt     1
## 31  cv030_21593.txt     2
## 37  cv036_16831.txt     1
## 38  cv037_18510.txt     1
## 40   cv039_6170.txt     2
## 42  cv041_21113.txt     2
## 43  cv042_10982.txt     2
## 44  cv043_15013.txt     1
## 46  cv045_23923.txt     2
## 48   cv047_1754.txt     1
## 52  cv051_10306.txt     2
## 54  cv053_21822.txt     2
## 55   cv054_4230.txt     2
## 56   cv055_8338.txt     1
## 60  cv059_28885.txt     2
## 61  cv060_10844.txt     1
## 62   cv061_8837.txt     3
## 63  cv062_23115.txt     2
## 64  cv063_28997.txt     3
## 65  cv064_24576.txt     2
## 66  cv065_15248.txt     1
## 67  cv066_10821.txt     1
## 68  cv067_19774.txt     1
## 69  cv068_13400.txt     2
## 71  cv070_12289.txt     1
## 72  cv071_12095.txt     3
## 77  cv076_24945.txt     3
## 78  cv077_22138.txt     1
## 80  cv079_11933.txt     2
## 81  cv080_13465.txt     2
## 82  cv081_16582.txt     1
## 83  cv082_11080.txt     1
## 84  cv083_24234.txt     2
## 87  cv086_18371.txt     3
## 88   cv087_1989.txt     2
## 89  cv088_24113.txt     1
## 95  cv094_27889.txt     1
## 96  cv095_28892.txt     2
## 97  cv096_11474.txt     2
## 98  cv097_24970.txt     2
## 100 cv099_10534.txt     4
## 102 cv101_10175.txt     2
## 103  cv102_7846.txt     1
## 104 cv103_11021.txt     2
## 106 cv105_17990.txt     1
## 107 cv106_16807.txt     1
## 113 cv112_11193.txt     2
## 116 cv115_25396.txt     3
## 117 cv116_28942.txt     1
## 118 cv117_24295.txt     2
## 120  cv119_9867.txt     1
## 122 cv121_17302.txt     3
## 123  cv122_7392.txt     2
## 124 cv123_11182.txt     2
## 125  cv124_4122.txt     3
## 130 cv129_16741.txt     2
## 132 cv131_10713.txt     1
## 134 cv133_16336.txt     1
## 136 cv135_11603.txt     1
## 140 cv139_12873.txt     3
## 141  cv140_7479.txt     2
## 142 cv141_15686.txt     2
## 144 cv143_19666.txt     1
## 148 cv147_21193.txt     2
## 150 cv149_15670.txt     1
## 151 cv150_12916.txt     1
## 152 cv151_15771.txt     2
## 153  cv152_8736.txt     1
## 154 cv153_10779.txt     1
## 155  cv154_9328.txt     1
## 156  cv155_7308.txt     2
## 158 cv157_29372.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     1
## 161 cv160_10362.txt     1
## 162 cv161_11425.txt     2
## 163 cv162_10424.txt     2
## 165 cv164_22447.txt     1
## 166 cv165_22619.txt     3
## 167 cv166_11052.txt     2
## 168 cv167_16376.txt     1
## 170 cv169_23778.txt     2
## 171  cv170_3006.txt     1
## 172 cv171_13537.txt     2
## 173 cv172_11131.txt     1
## 178 cv177_10367.txt     1
## 179 cv178_12972.txt     2
## 181 cv180_16113.txt     1
## 182 cv181_14401.txt     2
## 187  cv186_2269.txt     1
## 189 cv188_19226.txt     2
## 190 cv189_22934.txt     1
## 191 cv190_27052.txt     1
## 192 cv191_29719.txt     2
## 193 cv192_14395.txt     2
## 196 cv195_14528.txt     1
## 197 cv196_29027.txt     2
## 198 cv197_29328.txt     1
## 200  cv199_9629.txt     1
## 202  cv201_6997.txt     2
## 203 cv202_10654.txt     3
## 204 cv203_17986.txt     2
## 205  cv204_8451.txt     1
## 206  cv205_9457.txt     1
## 207 cv206_14293.txt     2
## 210 cv209_29118.txt     2
## 211  cv210_9312.txt     1
## 212  cv211_9953.txt     1
## 213 cv212_10027.txt     2
## 214 cv213_18934.txt     1
## 216 cv215_22240.txt     1
## 218 cv217_28842.txt     2
## 219 cv218_24352.txt     1
## 220 cv219_18626.txt     1
## 221 cv220_29059.txt     2
## 224 cv223_29066.txt     1
## 225 cv224_17661.txt     1
## 226 cv225_29224.txt     3
## 227  cv226_2618.txt     1
## 228 cv227_24215.txt     1
## 230 cv229_13611.txt     1
## 232 cv231_10425.txt     2
## 234 cv233_15964.txt     2
## 236 cv235_10217.txt     1
## 237 cv236_11565.txt     2
## 238 cv237_19221.txt     1
## 239 cv238_12931.txt     2
## 241 cv240_14336.txt     1
## 242 cv241_23130.txt     3
## 243 cv242_10638.txt     2
## 245 cv244_21649.txt     1
## 246  cv245_8569.txt     1
## 247 cv246_28807.txt     1
## 248 cv247_13142.txt     1
## 249 cv248_13987.txt     1
## 251 cv250_25616.txt     1
## 253 cv252_23779.txt     3
## 254 cv253_10077.txt     1
## 257 cv256_14740.txt     2
## 263 cv262_12649.txt     2
## 264 cv263_19259.txt     1
## 268 cv267_14952.txt     1
## 269 cv268_18834.txt     2
## 270 cv269_21732.txt     1
## 272 cv271_13837.txt     1
## 273 cv272_18974.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     3
## 276 cv275_28887.txt     2
## 277 cv276_15684.txt     2
## 279 cv278_13041.txt     2
## 280 cv279_18329.txt     1
## 282 cv281_23253.txt     1
## 285 cv284_19119.txt     2
## 289 cv288_18791.txt     1
## 290  cv289_6463.txt     2
## 291 cv290_11084.txt     3
## 292 cv291_26635.txt     1
## 295 cv294_11684.txt     3
## 296 cv295_15570.txt     1
## 298 cv297_10047.txt     1
## 299 cv298_23111.txt     2
## 301 cv300_22284.txt     2
## 302 cv301_12146.txt     1
## 304 cv303_27520.txt     3
## 305 cv304_28706.txt     3
## 306  cv305_9946.txt     1
## 307 cv306_10364.txt     1
## 308 cv307_25270.txt     1
## 311 cv310_13091.txt     1
## 312 cv311_16002.txt     2
## 313 cv312_29377.txt     1
## 314 cv313_18198.txt     2
## 316 cv315_11629.txt     2
## 317  cv316_6370.txt     1
## 319 cv318_10493.txt     1
## 320 cv319_14727.txt     1
## 321  cv320_9530.txt     2
## 324 cv323_29805.txt     1
## 325  cv324_7082.txt     1
## 327 cv326_13295.txt     1
## 328 cv327_20292.txt     1
## 329 cv328_10373.txt     2
## 330 cv329_29370.txt     1
## 331 cv330_29809.txt     2
## 332  cv331_8273.txt     1
## 333 cv332_16307.txt     2
## 334  cv333_8916.txt     1
## 336 cv335_14665.txt     1
## 337 cv336_10143.txt     2
## 339  cv338_8821.txt     2
## 341 cv340_13287.txt     2
## 342 cv341_24430.txt     2
## 344 cv343_10368.txt     1
## 346  cv345_9954.txt     1
## 348 cv347_13194.txt     2
## 349 cv348_18176.txt     1
## 351 cv350_20670.txt     2
## 352 cv351_15458.txt     1
## 353  cv352_5524.txt     2
## 357 cv356_25163.txt     2
## 358 cv357_13156.txt     1
## 359 cv358_10691.txt     1
## 360  cv359_6647.txt     1
## 361  cv360_8398.txt     2
## 362 cv361_28944.txt     2
## 363 cv362_15341.txt     1
## 364 cv363_29332.txt     1
## 365 cv364_12901.txt     3
## 367 cv366_10221.txt     3
## 368 cv367_22792.txt     3
## 369 cv368_10466.txt     2
## 370 cv369_12886.txt     2
## 375 cv374_25436.txt     2
## 376  cv375_9929.txt     1
## 378  cv377_7946.txt     1
## 380 cv379_21963.txt     1
## 382 cv381_20172.txt     2
## 383  cv382_7897.txt     2
## 386 cv385_29741.txt     1
## 387 cv386_10080.txt     3
## 388 cv387_11507.txt     1
## 390  cv389_9369.txt     2
## 392 cv391_10802.txt     2
## 396 cv395_10849.txt     3
## 397 cv396_17989.txt     1
## 398 cv397_29023.txt     2
## 402 cv401_12605.txt     1
## 404  cv403_6621.txt     2
## 405 cv404_20315.txt     3
## 411 cv410_24266.txt     1
## 412 cv411_15007.txt     1
## 413 cv412_24095.txt     1
## 415 cv414_10518.txt     2
## 417 cv416_11136.txt     1
## 418 cv417_13115.txt     3
## 420 cv419_13394.txt     1
## 421 cv420_28795.txt     1
## 422  cv421_9709.txt     2
## 423  cv422_9381.txt     2
## 426  cv425_8250.txt     1
## 427 cv426_10421.txt     1
## 428 cv427_10825.txt     3
## 429 cv428_11347.txt     2
## 433 cv432_14224.txt     1
## 434 cv433_10144.txt     2
## 435  cv434_5793.txt     2
## 436 cv435_23110.txt     1
## 437 cv436_19179.txt     2
## 438 cv437_22849.txt     1
## 440 cv439_15970.txt     2
## 442 cv441_13711.txt     2
## 445  cv444_9974.txt     1
## 446 cv445_25882.txt     2
## 448 cv447_27332.txt     2
## 449 cv448_14695.txt     1
## 451  cv450_7890.txt     2
## 453  cv452_5088.txt     1
## 454 cv453_10379.txt     2
## 455  cv454_2053.txt     2
## 457 cv456_18985.txt     2
## 462 cv461_19600.txt     3
## 463 cv462_19350.txt     2
## 464 cv463_10343.txt     1
## 465 cv464_15650.txt     2
## 466 cv465_22431.txt     2
## 467 cv466_18722.txt     1
## 468 cv467_25773.txt     1
## 469 cv468_15228.txt     1
## 470 cv469_20630.txt     2
## 471 cv470_15952.txt     1
## 472 cv471_16858.txt     1
## 473 cv472_29280.txt     3
## 476 cv475_21692.txt     1
## 477 cv476_16856.txt     2
## 482  cv481_7436.txt     3
## 483 cv482_10580.txt     2
## 484 cv483_16378.txt     1
## 485 cv484_25054.txt     4
## 488 cv487_10446.txt     2
## 489 cv488_19856.txt     1
## 491 cv490_17872.txt     1
## 493 cv492_18271.txt     1
## 495 cv494_17389.txt     3
## 498 cv497_26980.txt     2
## 501 cv500_10251.txt     1
# Histogram of the Count column of the per-file table.
hist(entityCountPerFile$Count)

Write results to a file

We can write our results to a CSV file, so that we can use them elsewhere.

# Export the entity table as a semicolon-separated file: row names are
# omitted and missing values are written as empty fields.
# FALSE is spelled out because the alias F is an ordinary variable that can be
# reassigned, which would silently change what gets written.
write.table(allEntities, file = "allEntities.csv", row.names = FALSE,
            na = "", sep = ";")

Compare with a gold standard

Put all matches in a list for comparison with a gold standard.

# Gather the matches of every file into a single list per file, ready to be
# compared against the gold standard, and preview the first rows.
allMatches <- mergeAllMatchesInLists(allEntities)
head(allMatches)
##             Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
##                                                                                             Matches
## 1 alan moore, eddie campbell, terry hayes, peter deming, martin childs, ghost world, vertical limit
## 2                       _election _, _rushmore _, matthew broderick, bill murray, reese witherspoon
## 3                                                                                              NULL
## 4                                      roy scheider, murray hamilton, robert shaw, richard dreyfuss
## 5                                                                                              NULL
## 6                                                          maka kotto, eriq ebouaney, pascal nzonzi

Load the gold standard and put all gold standard matches in a list for comparison.

# Read the semicolon-separated gold standard: quote handling is disabled,
# empty fields become NA and every column is kept as character.
goldStandard <- read.table(
  file = "goldStandard.csv",
  sep = ";",
  quote = "",
  na.strings = "",
  colClasses = "character"
)
# Gather the gold standard matches into a list per file and preview them.
allMatchesGold <- mergeGoldStandardInLists(goldStandard)
head(allMatchesGold)
##             Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
##                                                                                                                                                                                                                                                                                                                                                     Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2                                                                                                                                                 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3                                                                                                                                                                                                                                                                                   ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4                                                                          john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5                                                                                                                                                                                                                    herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6                                                                                                                                                                                             raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi

Calculate the metrics (precision, recall, f-measure).

# Score the extracted matches against the gold standard (precision, recall
# and F-measure) and display the result.
metrics <- calculateMetrics(allMatches, allMatchesGold)
metrics
##   Precision  Recall  Fmeasure
## 1 0.9364508 0.12492 0.2204347