The goal of this document is to show a sample script for pattern-based entity recognition over text documents using the openNLP (natural language processing) and the tm (text mining) packages in R.
I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks to everyone for sharing.
Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.
# Print the current working directory.
getwd()
## [1] "/Users/raul/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R"
# Switch to the folder that the relative data paths used later start from.
# The path is specific to the author's machine; adjust it for your setup.
setwd("~/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R")
Now we load the required libraries. Only a couple of things to mention:
The annotate function (from the NLP package, on which openNLP builds) has to be called with its package name (i.e., NLP::annotate) when ggplot2 is also loaded, because both packages define a function with that name.
# Needed for OutOfMemoryError: Java heap space
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging
library(NLP)
library(openNLP)
library(openNLPmodels.en)
library(tm)
library(stringr)
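getAnnotationsFromDocument returns annotations for the text document: sentence, word, and part-of-speech annotations. (Penn Treebank parse annotations are not computed by the function below; they would require running an additional parse annotator.)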
As an alternative, the koRpus package uses TreeTagger for POS tagging.
# Compute sentence, word and part-of-speech annotations for one document.
#
# Args:
#   doc: a text document; it is coerced to a String with as.String().
#
# Returns:
#   The result of the POS-tagging annotation pass, which is seeded with the
#   sentence and word annotations computed in the first pass.
getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)

  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  word_token_annotator <- Maxent_Word_Token_Annotator()
  pos_tag_annotator <- Maxent_POS_Tag_Annotator()

  # annotate() is called through its namespace so that another attached
  # package exporting a function with the same name (e.g. ggplot2) cannot
  # mask it.
  tokens <- NLP::annotate(
    text,
    list(sent_token_annotator, word_token_annotator)
  )
  NLP::annotate(text, pos_tag_annotator, tokens)
}
getAnnotatedMergedDocument returns the text document merged with the annotations.
# Render a document as a single string of "token/POS" pairs.
#
# Args:
#   doc:         a text document; coerced with as.String().
#   annotations: annotations for that document; the entries of type "word"
#                are expected to carry a "POS" feature.
#
# Returns:
#   One character string with every word written as token/tag, the pairs
#   separated by single spaces.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  pos_tags <- sapply(words$features, `[[`, "POS")
  tagged_tokens <- sprintf("%s/%s", text[words], pos_tags)
  paste(tagged_tokens, collapse = " ")
}
getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.
# Bundle a document and its annotations into an AnnotatedPlainTextDocument.
#
# Args:
#   doc:         a text document; coerced with as.String().
#   annotations: the annotations to attach to the text.
#
# Returns:
#   The object built by AnnotatedPlainTextDocument() from the text and the
#   annotations.
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}
detectPatternOnDocument returns the pattern detected on an AnnotatedPlainTextDocument.
# Find every occurrence of a regular expression in a document.
#
# Args:
#   doc:     a text document; coerced with as.String().
#   pattern: a regular expression understood by stringr::str_match_all().
#
# Returns:
#   NA when the pattern does not occur. Otherwise one character string with
#   all occurrences joined by ", ". If the pattern has capture groups, each
#   occurrence is represented by its captured groups joined by a space;
#   without capture groups the whole match is used.
detectPatternOnDocument <- function(doc, pattern) {
  text <- as.String(doc)
  # One row per occurrence; column 1 is the whole match, the remaining
  # columns are the capture groups.
  found <- str_match_all(text, pattern)[[1]]

  if (nrow(found) == 0) {
    return(NA)
  }

  if (ncol(found) == 1) {
    hits <- found[, 1]
  } else {
    groups <- found[, -1, drop = FALSE]
    # An optional group that did not take part in a match is reported as NA.
    # Pasting it would leak the literal text "NA" into the entity, so such
    # groups are skipped, and occurrences with no captured group at all are
    # dropped.
    has_capture <- rowSums(!is.na(groups)) > 0
    if (!any(has_capture)) {
      return(NA)
    }
    hits <- vapply(
      which(has_capture),
      function(r) {
        captured <- groups[r, ]
        paste(captured[!is.na(captured)], collapse = " ")
      },
      character(1)
    )
  }

  paste(hits, collapse = ", ")
}
detectPatternOnDocumentWithContext returns the pattern detected on an AnnotatedPlainTextDocument with some context.
# Return the first occurrence of a pattern together with surrounding text.
#
# Args:
#   doc:     a text document; coerced with as.String().
#   pattern: a regular expression understood by stringr::str_locate().
#   context: number of characters to keep on each side of the match
#            (defaults to 50, the value that used to be hard-coded).
#
# Returns:
#   The substring running from `context` characters before the start of the
#   first match to `context` characters after its end. Positions outside the
#   text are clipped by substr(). When there is no match the located
#   positions are NA and so is the result.
detectPatternOnDocumentWithContext <- function(doc, pattern, context = 50) {
  txt <- as.String(doc)
  # str_locate() reports the start and end of the first match only.
  coord <- str_locate(txt, pattern)
  substr(txt, coord[1] - context, coord[2] + context)
}
detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.
# Apply a set of patterns to every document of a corpus.
#
# Args:
#   corpus:   a corpus whose documents expose an id through meta().
#   patterns: character vector of regular expressions.
#
# Returns:
#   A data frame with one row per document: a "File" column holding the
#   document id, followed by one column per pattern (named after the
#   pattern) holding what detectPatternOnDocument() returned for it.
detectPatternsInCorpus <- function(corpus, patterns) {
  n_docs <- length(corpus)
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = n_docs))
  names(vallEntities) <- c("File", patterns)

  # seq_along()/seq_len() keep both loops empty when there are no patterns
  # or no documents (1:0 would iterate over 1 and 0).
  for (i in seq_along(patterns)) {
    found <- unlist(lapply(corpus, detectPatternOnDocument,
                           pattern = patterns[i]))
    # unlist() yields NULL for an empty corpus; assigning NULL would delete
    # the column instead of leaving it empty.
    if (!is.null(found)) {
      vallEntities[, i + 1] <- found
    }
  }
  for (i in seq_len(n_docs)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}
detectPatternsInTaggedCorpus returns a data frame with all the patterns detected in an annotated corpus.
# Apply a set of patterns to every document of an annotated corpus.
#
# Args:
#   corpus:       the original corpus; used only for the number of documents
#                 and for the document ids (read through meta()).
#   taggedCorpus: the annotated counterpart of `corpus`, one entry per
#                 document and in the same order; the patterns are matched
#                 against these entries.
#   patterns:     character vector of regular expressions.
#
# Returns:
#   A data frame with one row per document: a "File" column holding the
#   document id, followed by one column per pattern (named after the
#   pattern) holding what detectPatternOnDocument() returned for it.
detectPatternsInTaggedCorpus <- function(corpus, taggedCorpus, patterns) {
  n_docs <- length(corpus)
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = n_docs))
  names(vallEntities) <- c("File", patterns)

  # seq_along()/seq_len() keep both loops empty when there are no patterns
  # or no documents (1:0 would iterate over 1 and 0).
  for (i in seq_along(patterns)) {
    found <- unlist(lapply(taggedCorpus, detectPatternOnDocument,
                           pattern = patterns[i]))
    # unlist() yields NULL for an empty corpus; assigning NULL would delete
    # the column instead of leaving it empty.
    if (!is.null(found)) {
      vallEntities[, i + 1] <- found
    }
  }
  for (i in seq_len(n_docs)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}
countMatchesPerColumn returns the number of matches per pattern/column.
Counts, for each pattern (column), the number of rows (files) with a non-NA value.
# Count, for every pattern column, how many rows hold a match.
#
# Args:
#   df: a data frame whose first column identifies the file and whose
#       remaining columns hold the matches per pattern (NA = no match).
#
# Returns:
#   A data frame with one row per pattern column: "Entity" is the column
#   name and "Count" the number of non-NA values in that column. The result
#   has zero rows when `df` has no pattern columns.
countMatchesPerColumn <- function(df) {
  # Everything but the first (file) column; an empty selection when df has a
  # single column, where 2:length(names(df)) would have counted backwards.
  pattern_cols <- df[-1]
  counts <- vapply(pattern_cols, function(column) sum(!is.na(column)),
                   integer(1))
  data.frame(Entity = names(pattern_cols), Count = unname(counts),
             stringsAsFactors = FALSE)
}
countMatchesPerRow returns the number of entities per file/row.
Counts, for each file (row), the number of columns (patterns) with a non-NA value.
# Count, for every file, how many patterns produced a match.
#
# Args:
#   df: a data frame with a "File" column first, followed by one column per
#       pattern (NA = no match).
#
# Returns:
#   A data frame with columns "File" and "Count" (number of non-NA pattern
#   columns in that row), restricted to the files with at least one match.
#   Row names keep the row positions of `df`.
countMatchesPerRow <- function(df) {
  # is.na() on the pattern columns gives a logical matrix; summing each row
  # replaces the per-row loop and also copes with a data frame without rows.
  counts <- as.integer(rowSums(!is.na(df[, -1, drop = FALSE])))
  entityCountPerFile <- data.frame(File = unname(df$File), Count = counts,
                                   row.names = NULL,
                                   stringsAsFactors = FALSE)
  entityCountPerFile[entityCountPerFile$Count != 0, ]
}
printMatchesPerPattern prints the matches found per pattern.
# Print, pattern by pattern, the matches that were found.
#
# Args:
#   patterns: character vector of patterns, in the same order as the pattern
#             columns of `matches`.
#   matches:  data frame whose column i + 1 holds the matches for
#             patterns[i] (NA = no match).
#
# Returns:
#   NULL, invisibly; the function is called for its printed output.
printMatchesPerPattern <- function(patterns, matches) {
  # seq_along() prints nothing for an empty pattern vector, where
  # 1:length(patterns) would have visited indices 1 and 0.
  for (i in seq_along(patterns)) {
    print(paste("PATTERN: ", patterns[i]))
    column <- matches[, i + 1]
    print(column[!is.na(unlist(column))])
    print(" ")
  }
  invisible(NULL)
}
mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.
# Collect all the matches of each file into one list per file.
#
# Args:
#   df: a data frame whose first column identifies the file and whose
#       remaining columns hold the matches per pattern; a cell is NA (no
#       match), a single match, or several matches separated by commas.
#
# Returns:
#   A data frame with a "Files" column (the first column of `df`) and a
#   "Matches" list column; each entry is the list of distinct, trimmed
#   matches found in that row, in order of first appearance.
mergeAllMatchesInLists <- function(df) {
  # Pattern columns only; empty when df has just the file column, where
  # 2:ncol(df) would have counted backwards.
  match_cols <- seq_len(ncol(df))[-1]

  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    found <- list()
    # The original inner loop reused the outer index name; the split parts
    # are now handled in one vectorised step instead.
    for (j in match_cols) {
      cell <- df[i, j]
      if (grepl(",", cell)) {
        parts <- strsplit(as.character(cell), split = ",")[[1]]
        found <- c(found, as.list(str_trim(parts)))
      } else if (!is.na(cell)) {
        found <- c(found, str_trim(cell))
      }
    }
    unique(found)
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile
  allMatches
}
mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.
# Collect the gold-standard entries of each file into one list per file.
#
# Args:
#   df: a data frame whose first column identifies the file and whose
#       remaining columns hold the gold-standard entries (NA = empty cell).
#
# Returns:
#   A data frame with a "Files" column (the first column of `df`) and a
#   "Matches" list column; each entry is the list of non-NA values found in
#   the remaining columns of that row.
mergeGoldStandardInLists <- function(df) {
  # Gold-standard columns only; empty when df has just the file column,
  # where 2:length(df) would have counted backwards.
  gold_cols <- seq_len(ncol(df))[-1]

  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    as.list(unlist(Filter(Negate(is.na), df[i, gold_cols])))
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile
  allMatches
}
calculateMetrics calculates precision, recall and f-measure according to a gold standard.
# Compute precision, recall and F-measure against a gold standard.
#
# Args:
#   matches:    data frame whose second column is a list column with the
#               detected matches per file.
#   matches.gs: data frame with a "Matches" list column (second column)
#               holding the gold-standard entries for the same files, in the
#               same order.
#   beta:       weight of recall relative to precision in the F-measure
#               (defaults to 1, the value that used to be hard-coded).
#
# Returns:
#   A one-row data frame with columns "Precision", "Recall" and "Fmeasure".
#   Files whose gold standard is empty are ignored. Precision and recall are
#   reported as 0 when their denominator is 0.
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0

  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) != 0) {
      l <- str_trim(unlist(matches[i, 2]))
      l.gs <- unname(unlist(matches.gs[i, 2]))
      numCorrect <- numCorrect + length(intersect(l, l.gs))
      allAnswers <- allAnswers + length(l)
      possibleAnswers <- possibleAnswers + length(l.gs)
    }
  }

  # 0 / 0 would give NaN, which then makes the comparison below fail with
  # "missing value where TRUE/FALSE needed".
  precision <- if (allAnswers > 0) numCorrect / allAnswers else 0
  recall <- if (possibleAnswers > 0) numCorrect / possibleAnswers else 0

  if (precision == 0 && recall == 0) {
    fmeasure <- 0
  } else {
    # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R). The previous
    # sqrt(beta) only agreed with this for beta = 1.
    fmeasure <- ((beta^2 + 1) * precision * recall) /
      ((beta^2 * precision) + recall)
  }

  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}
We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.
Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.
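Note that the code below loads every review found in that directory (the outputs further down show row numbers well above 500). If you only want to work with the first 500 reviews, subset the corpus after loading it, e.g. corpus[1:500].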
# Describe the folder with the positive reviews as a source read as UTF-8,
# then build the corpus from that source.
source.pos = DirSource("../Corpus/review_polarity/txt_sentoken/pos", encoding = "UTF-8")
corpus = Corpus(source.pos)
Let’s take a look at the document in the first entry.
# Show the first document of the corpus: a short summary followed by its text.
inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4226
##
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.
This step may take long depending on the size of the corpus and on the annotations that we want to identify.
# One set of annotations per document, in corpus order.
annotations = lapply(corpus, getAnnotationsFromDocument)
The first annotations are sentence annotations. They indicate where the sentence starts and where it ends. In constituents we can access the tokens in the sentence (and check the number of tokens it has). A parse feature with the parse tree would only be present if a parse annotator had also been run, which is not the case in this script.
# First annotations of the first document (the sentence annotations come first).
head(annotations[[1]])
## id type start end features
## 1 sentence 1 265 constituents=<<integer,54>>
## 2 sentence 268 439 constituents=<<integer,36>>
## 3 sentence 442 591 constituents=<<integer,27>>
## 4 sentence 594 797 constituents=<<integer,44>>
## 5 sentence 800 939 constituents=<<integer,28>>
## 6 sentence 942 1299 constituents=<<integer,70>>
Word annotations also are defined. They indicate where the word starts, where it ends, and the part-of-speech tag.
# Last annotations of the first document (word annotations with their POS tag).
tail(annotations[[1]])
## id type start end features
## 844 word 4189 4197 POS=NN
## 845 word 4199 4199 POS=,
## 846 word 4201 4208 POS=NN
## 847 word 4210 4212 POS=CC
## 848 word 4214 4217 POS=NN
## 849 word 4219 4225 POS=NN
We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Pair every document with its annotations; the result is a plain list of
# annotated documents, kept separate from the original corpus.
corpus.tagged = Map(getAnnotatedPlainTextDocument, corpus, annotations)
# Show the first annotated document.
inspect(corpus.tagged[[1]])
## <<AnnotatedPlainTextDocument>>
## Metadata: 0
## Annotations: length: 849
## Content: chars: 4226
##
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
We can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Build, for every document, a single string of token/POS pairs.
corpus.taggedText = Map(getAnnotatedMergedDocument, corpus, annotations)
# Show the tagged text of the first document.
corpus.taggedText[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"
Based on the first file, we define some simple string patterns to try to identify people appearances.
# Plain-string patterns that tend to precede a person's name.
pattern0 <- c(
  "created by",
  "screenwriter[s]?",
  "cinematographer",
  "oscar winner"
)
We detect those patterns in the corpus and we can see in which files they do appear.
# Look for every pattern in every document of the corpus.
matches0 = detectPatternsInCorpus(corpus, pattern0)
# Keep the rows with a match in the third column (the second pattern,
# "screenwriter[s]?") and show only the file name and that column.
matches0[!is.na(matches0[3]),c(1,3)]
## File screenwriter[s]?
## 1 cv000_29590.txt screenwriters
## 29 cv028_26746.txt screenwriters
## 30 cv029_18643.txt screenwriter
## 77 cv076_24945.txt screenwriter
## 79 cv078_14730.txt screenwriter, screenwriter
## 87 cv086_18371.txt screenwriter
## 95 cv094_27889.txt screenwriters
## 116 cv115_25396.txt screenwriter
## 122 cv121_17302.txt screenwriter
## 136 cv135_11603.txt screenwriters
## 143 cv142_22516.txt screenwriter
## 144 cv143_19666.txt screenwriter
## 159 cv158_10390.txt screenwriter
## 163 cv162_10424.txt screenwriter, screenwriter, screenwriter
## 179 cv178_12972.txt screenwriter
## 191 cv190_27052.txt screenwriter
## 192 cv191_29719.txt screenwriter
## 209 cv208_9020.txt screenwriters
## 226 cv225_29224.txt screenwriter
## 236 cv235_10217.txt screenwriters
## 241 cv240_14336.txt screenwriters
## 242 cv241_23130.txt screenwriter
## 275 cv274_25253.txt screenwriter
## 319 cv318_10493.txt screenwriter
## 337 cv336_10143.txt screenwriter
## 360 cv359_6647.txt screenwriter
## 366 cv365_11576.txt screenwriter
## 371 cv370_5221.txt screenwriter
## 396 cv395_10849.txt screenwriters
## 405 cv404_20315.txt screenwriter, screenwriter
## 406 cv405_20399.txt screenwriter
## 411 cv410_24266.txt screenwriters
## 433 cv432_14224.txt screenwriter
## 453 cv452_5088.txt screenwriters
## 457 cv456_18985.txt screenwriters
## 465 cv464_15650.txt screenwriters
## 467 cv466_18722.txt screenwriter
## 475 cv474_10209.txt screenwriter
## 477 cv476_16856.txt screenwriter
## 507 cv506_15956.txt screenwriter
## 527 cv526_12083.txt screenwriter
## 544 cv543_5045.txt screenwriter
## 553 cv552_10016.txt screenwriter
## 556 cv555_23922.txt screenwriter
## 566 cv565_29572.txt screenwriter
## 570 cv569_26381.txt screenwriter
## 579 cv578_15094.txt screenwriter
## 584 cv583_29692.txt screenwriter
## 614 cv613_21796.txt screenwriter
## 621 cv620_24265.txt screenwriters
## 625 cv624_10744.txt screenwriters
## 645 cv644_17154.txt screenwriter
## 669 cv668_17604.txt screenwriters
## 671 cv670_25826.txt screenwriter
## 673 cv672_28083.txt screenwriters
## 683 cv682_16139.txt screenwriters
## 699 cv698_15253.txt screenwriters
## 706 cv705_11059.txt screenwriters
## 713 cv712_22920.txt screenwriters
## 717 cv716_10514.txt screenwriter
## 729 cv728_16133.txt screenwriter
## 747 cv746_10147.txt screenwriter
## 748 cv747_16556.txt screenwriter
## 750 cv749_17765.txt screenwriter
## 770 cv769_8123.txt screenwriter
## 776 cv775_16237.txt screenwriters
## 795 cv794_15868.txt screenwriter
## 813 cv812_17924.txt screenwriter
## 819 cv818_10211.txt screenwriter
## 830 cv829_20289.txt screenwriter, screenwriter, screenwriters, screenwriter
## 839 cv838_24728.txt screenwriter
## 870 cv869_23611.txt screenwriters
## 873 cv872_12591.txt screenwriter
## 879 cv878_15694.txt screenwriter
## 899 cv898_14187.txt screenwriter
## 923 cv922_10073.txt screenwriters
## 929 cv928_9168.txt screenwriter
## 940 cv939_10583.txt screenwriter
## 946 cv945_12160.txt screenwriter
## 959 cv958_12162.txt screenwriter
## 961 cv960_29007.txt screenwriter
## 975 cv974_22941.txt screenwriter
## 980 cv979_18921.txt screenwriters
## 981 cv980_10953.txt screenwriters
We check how many patterns we have found in each file.
# Number of patterns with at least one match, per file (files without matches are left out).
countMatchesPerRow(matches0)
## File Count
## 1 cv000_29590.txt 4
## 29 cv028_26746.txt 1
## 30 cv029_18643.txt 1
## 68 cv067_19774.txt 1
## 77 cv076_24945.txt 1
## 79 cv078_14730.txt 1
## 87 cv086_18371.txt 1
## 95 cv094_27889.txt 1
## 100 cv099_10534.txt 1
## 109 cv108_15571.txt 1
## 116 cv115_25396.txt 1
## 122 cv121_17302.txt 2
## 136 cv135_11603.txt 1
## 143 cv142_22516.txt 1
## 144 cv143_19666.txt 1
## 155 cv154_9328.txt 1
## 159 cv158_10390.txt 1
## 160 cv159_29505.txt 1
## 163 cv162_10424.txt 1
## 179 cv178_12972.txt 1
## 191 cv190_27052.txt 1
## 192 cv191_29719.txt 1
## 206 cv205_9457.txt 1
## 209 cv208_9020.txt 1
## 216 cv215_22240.txt 1
## 221 cv220_29059.txt 1
## 226 cv225_29224.txt 1
## 236 cv235_10217.txt 1
## 241 cv240_14336.txt 1
## 242 cv241_23130.txt 1
## 274 cv273_29112.txt 1
## 275 cv274_25253.txt 1
## 286 cv285_16494.txt 1
## 295 cv294_11684.txt 1
## 298 cv297_10047.txt 1
## 301 cv300_22284.txt 1
## 315 cv314_14422.txt 1
## 318 cv317_24049.txt 1
## 319 cv318_10493.txt 1
## 324 cv323_29805.txt 1
## 325 cv324_7082.txt 1
## 337 cv336_10143.txt 1
## 352 cv351_15458.txt 1
## 360 cv359_6647.txt 1
## 363 cv362_15341.txt 1
## 366 cv365_11576.txt 1
## 371 cv370_5221.txt 1
## 372 cv371_7630.txt 1
## 387 cv386_10080.txt 1
## 396 cv395_10849.txt 1
## 398 cv397_29023.txt 1
## 405 cv404_20315.txt 1
## 406 cv405_20399.txt 1
## 410 cv409_29786.txt 1
## 411 cv410_24266.txt 1
## 428 cv427_10825.txt 1
## 432 cv431_7085.txt 1
## 433 cv432_14224.txt 1
## 453 cv452_5088.txt 1
## 457 cv456_18985.txt 1
## 465 cv464_15650.txt 1
## 467 cv466_18722.txt 1
## 475 cv474_10209.txt 1
## 476 cv475_21692.txt 1
## 477 cv476_16856.txt 1
## 485 cv484_25054.txt 1
## 506 cv505_12090.txt 1
## 507 cv506_15956.txt 1
## 520 cv519_14661.txt 1
## 527 cv526_12083.txt 1
## 533 cv532_6522.txt 1
## 544 cv543_5045.txt 1
## 553 cv552_10016.txt 2
## 556 cv555_23922.txt 1
## 559 cv558_29507.txt 1
## 566 cv565_29572.txt 1
## 570 cv569_26381.txt 1
## 579 cv578_15094.txt 1
## 584 cv583_29692.txt 2
## 590 cv589_12064.txt 1
## 614 cv613_21796.txt 1
## 621 cv620_24265.txt 1
## 625 cv624_10744.txt 1
## 629 cv628_19325.txt 1
## 642 cv641_12349.txt 1
## 645 cv644_17154.txt 1
## 662 cv661_2450.txt 1
## 666 cv665_29538.txt 1
## 669 cv668_17604.txt 1
## 671 cv670_25826.txt 1
## 673 cv672_28083.txt 1
## 683 cv682_16139.txt 1
## 690 cv689_12587.txt 2
## 699 cv698_15253.txt 1
## 701 cv700_21947.txt 1
## 706 cv705_11059.txt 1
## 711 cv710_22577.txt 1
## 713 cv712_22920.txt 1
## 714 cv713_29155.txt 1
## 717 cv716_10514.txt 1
## 729 cv728_16133.txt 1
## 744 cv743_15449.txt 1
## 747 cv746_10147.txt 1
## 748 cv747_16556.txt 1
## 750 cv749_17765.txt 1
## 770 cv769_8123.txt 1
## 776 cv775_16237.txt 1
## 795 cv794_15868.txt 1
## 803 cv802_28664.txt 1
## 813 cv812_17924.txt 1
## 819 cv818_10211.txt 1
## 830 cv829_20289.txt 1
## 838 cv837_27325.txt 1
## 839 cv838_24728.txt 1
## 852 cv851_20469.txt 1
## 869 cv868_11948.txt 1
## 870 cv869_23611.txt 1
## 873 cv872_12591.txt 2
## 879 cv878_15694.txt 1
## 888 cv887_5126.txt 1
## 893 cv892_17576.txt 1
## 899 cv898_14187.txt 1
## 923 cv922_10073.txt 1
## 929 cv928_9168.txt 1
## 940 cv939_10583.txt 1
## 946 cv945_12160.txt 2
## 950 cv949_20112.txt 1
## 959 cv958_12162.txt 2
## 961 cv960_29007.txt 1
## 964 cv963_6895.txt 1
## 975 cv974_22941.txt 1
## 980 cv979_18921.txt 1
## 981 cv980_10953.txt 1
And we check how many times each pattern has been found.
# Per-pattern totals for the simple patterns; the result is printed as a
# table with one row per pattern (columns Entity and Count).
countMatchesPerColumn(matches0)
## Entity Count
## 1 created by 9
## 2 screenwriter[s]? 84
## 3 cinematographer 42
## 4 oscar winner 8
And we print the context in which the patterns are found, to see if we can build better patterns.
# For every simple pattern, print a header line followed by the text that
# surrounds its occurrences in the corpus, so the contexts can be inspected
# when designing more precise patterns.
# seq_along() is used instead of 1:length() so that an empty pattern0 runs
# zero iterations rather than looping over c(1, 0).
for (i in seq_along(pattern0)) {
  print(paste("PATTERN: ", pattern0[i]))
  # One result per document, computed by detectPatternOnDocumentWithContext.
  strings <- lapply(corpus, detectPatternOnDocumentWithContext,
                    pattern = pattern0[i])
  # Drop the NA entries before printing.
  # NOTE(review): the mask has one value per element of unlist(strings), so
  # this presumably relies on each document yielding a single value - confirm.
  print(unlist(strings[!is.na(unlist(strings))]))
  print(" ")
}
## [1] "PATTERN: created by"
## cv000_29590.txt
## "ok like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought t"
## cv205_9457.txt
## "turvy . \nrobert zemeckis , back from the euphoria created by his last film , forrest gump , once again proves "
## cv285_16494.txt
## "ve got mail like dried-up mistletoe . \nthe sparks created by the earlier movie are , by necessity , not eviden"
## cv324_7082.txt
## "the real thing . \nthe two of them , as characters created by fingal's imagination , serve as aspects of his pe"
## cv371_7630.txt
## "nd always right on the mark , enhancing the moods created by the animated scenery . \nas far as the subtitles g"
## cv484_25054.txt
## " and there are cliches , but the walls of water , created by fluid dynamics simulating real-life phenomena , a"
## cv628_19325.txt
## " on the story `tarzan of the apes' and characters created by edgar rice burroughs . \nseen july 4 , 1999 at 7 :"
## cv743_15449.txt
## "nt of a bug's life is the quality of animation . \ncreated by pixar , the same people who brought us toy story "
## cv892_17576.txt
## " antagonists in the movie , a group of \" agents \" created by the matrix in its computer program , all dress in"
## [1] " "
## [1] "PATTERN: screenwriter[s]?"
## cv000_29590.txt
## " bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesia"
## cv028_26746.txt
## "d the story is so complex and \" clever \" that the screenwriters are the first to get lost in it . \nthere is is no"
## cv029_18643.txt
## "mise certainly is interesting and director and co-screenwriter alex proyas is able to keep the film consistently"
## cv076_24945.txt
## " logical confusion . \ndirector gregory hoblit and screenwriter toby emmerich structure \" frequency \" as good hol"
## cv078_14730.txt
## "different , it would be easy for the director and screenwriter to dumb it down and appeal to the lowest common d"
## cv086_18371.txt
## "itively made . \ndirector michael winterbottom and screenwriter frank cottrell boyce vividly express the societal"
## cv094_27889.txt
## "it prince acquit themselves admirably . \nkudos to screenwriters james schamus , wang hui ling and tsai kuo jing ,"
## cv115_25396.txt
## "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n"
## cv121_17302.txt
## "i relish those rare opportunities when a talented screenwriter can make me feel like a fool . \ni spent the first"
## cv135_11603.txt
## "is a highly enjoyable ride . \nonce again , chan's screenwriters ( here edward tang and fibe ma ) have taken the e"
## cv142_22516.txt
## "e hallstrom ( what's eating gilbert grape ? ) and screenwriter/novelist john irving ( the world according to gar"
## cv143_19666.txt
## "sy to have written vivian as merely a flake , but screenwriter j . f . lawton clearly cared too much about his c"
## cv158_10390.txt
## " potentially hilarious comedy , and pitched it to screenwriter paul rudnick . \nit's true that if this same film "
## cv162_10424.txt
## "rise to fame , quickly become the most well-known screenwriter amongst the entertainment weekly-reading , box of"
## cv178_12972.txt
## "aseball bat , but i'm pretty sure andrew niccol , screenwriter for the truman show , has had the same curious th"
## cv190_27052.txt
## "has been sharply pieced together by tony gilroy , screenwriter of the devil's advocate and dolores claiborne . \n"
## cv191_29719.txt
## "\" was right after all . \nfortunately , first-time screenwriter marc klein has sketched strong , well-rounded , c"
## cv208_9020.txt
## "ble , but after all the star power , mega bucks , screenwriters , directors , and cool trailers , men in black is"
## cv225_29224.txt
## "efall the participants en route to silver city . \nscreenwriter andy breckman adds a nice touch by not having the"
## cv235_10217.txt
## "too much to it , but capra and the gang ( various screenwriters , composers , actors ) plumet the material to its"
## cv240_14336.txt
## "sh anything , it just comes natural to him . \nthe screenwriters use the right words and phrases to describe the m"
## cv241_23130.txt
## "lf more seriously this time ; maybe so , or maybe screenwriter ehren kruger ( arlington road ) , who took over a"
## cv274_25253.txt
## "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n"
## cv318_10493.txt
## "nd-up from veteran horror director wes craven and screenwriter kevin williamson that seemed to breathe new life "
## cv336_10143.txt
## "ed for scream 2 , including director wes craven , screenwriter kevin williamson , and actors neve campbell , cou"
## cv359_6647.txt
## "y after all . \nfavreau also doubled as the film's screenwriter , and he proves he has the gift for creating enga"
## cv365_11576.txt
## "ing . \nthat's not to say the movie isn't funny . \nscreenwriter tim herlihy has written for sandler before ( bill"
## cv370_5221.txt
## " . \nit tore at my heart to watch a gifted lesbian screenwriter explain that , as a rule , gay audiences hunger f"
## cv395_10849.txt
## "jake kasdan , son of one of the best screenwriters around , breaks into filmmaking by writing and di"
## cv404_20315.txt
## "albert brooks plays steven phillips , a hollywood screenwriter who after winning a humanitarian award for his wo"
## cv405_20399.txt
## " the 1999 film outside providence ( 6 . 5/10 ) . \nscreenwriter w . peter iliff also had a part in writing the sc"
## cv410_24266.txt
## " . \nminkoff likes to point out scenes where other screenwriters came in and polished up the script , namely write"
## cv432_14224.txt
## "ments work very well -- for a comic book story . \nscreenwriter david goyer ( who also wrote the crow ) incorpora"
## cv452_5088.txt
## "tober is distinguished by its water-tight plot . \nscreenwriters larry ferguson and donald stewart have gracefully"
## cv456_18985.txt
## "d , and all are handled exceptionally well by the screenwriters . \nthere is no shred of doubt left to ponder afte"
## cv464_15650.txt
## "ture and charm holds right up to the last reel . \nscreenwriters john eskow , ted elliot and terry rosio have unfo"
## cv466_18722.txt
## "david mamet has long been my favorite screenwriter and director . \nwith his distinctive , more often"
## cv474_10209.txt
## "s that \" genius is insanity with some success \" , screenwriter fierstein is taking a lazy shortcut ) , pryce mak"
## cv476_16856.txt
## "threat of class struggle ; for george pal and his screenwriter david duncan , who produced the film in the worst"
## cv506_15956.txt
## "ents or mistakes done on the part of lyne and his screenwriter , stephen schiff , but just parts of a whole new "
## cv526_12083.txt
## "antly british slang . \npossibly an attempt by the screenwriter to balance the british so that american audiences"
## cv543_5045.txt
## " , and outstanding acting . \ndirector kleiser and screenwriter elizabeth jane howard ( adapting her own highly a"
## cv552_10016.txt
## "int clearly a greater talent as a director than a screenwriter . \nwhile boogie nights shows great inventiveness "
## cv555_23922.txt
## "ious efforts good burger and varsity blues . \nand screenwriter steven brill ( the epic mighty ducks trilogy , la"
## cv565_29572.txt
## "ile . \" \nbased on stephen king stories adapted by screenwriter william goldman and directed by scott hicks , it'"
## cv569_26381.txt
## "gh a lot of this honor should be addressed to the screenwriter ______ and frears , the director , the acting is "
## cv578_15094.txt
## "tory gazillions of times , but debut director and screenwriter mark christopher keeps things moving with lively "
## cv583_29692.txt
## "im that we're more scared by what we don't see . \nscreenwriter and legendary film critic james agee does a beaut"
## cv613_21796.txt
## " have to lie in the conception of this film ; the screenwriter and the director . \nthree kings ranks among the b"
## cv620_24265.txt
## "re with american psycho there is no difference . \nscreenwriters mary harron and guinevere turner had the unenviab"
## cv624_10744.txt
## " another part of life generally ignored by modern screenwriters . \nwith his well worn bible in hand , sonny leaps"
## cv644_17154.txt
## " his character , not only by thornton but also by screenwriter scott b . smith ( adapting his own book , by the "
## cv668_17604.txt
## " of note is larisa oleynik who , with the help of screenwriters lutz and smith , is able to turn bianca into a pe"
## cv670_25826.txt
## "ars . \nthe quick script written by barry fanaro , screenwriter of kingpin , is carried by plenty of subtly dry h"
## cv672_28083.txt
## "ject is going up soon that will block her view . \nscreenwriters and actors jean-pierre bacri and agnes jaoui wrot"
## cv682_16139.txt
## "es like worm just aren't cut out for the cards . \nscreenwriters david levien and brian koppelman entertain and ed"
## cv698_15253.txt
## "gets rather heady at times , but it's a credit to screenwriters zwick , lawrence wright , and menno meyjes that t"
## cv705_11059.txt
## "and thrilling . \nbased on an original story , the screenwriters did a good job at imagining all the potential and"
## cv712_22920.txt
## "has opened new doors for excellent young actors , screenwriters , and directors . \nscream is also an extremely fu"
## cv716_10514.txt
## "ccomplished , quite credibly , but then boyle and screenwriter john hodge seem to flinch and opt for a more conv"
## cv728_16133.txt
## "playwright tom stoppard and screenwriter marc norman took on an astonishingly difficult ta"
## cv746_10147.txt
## " -- but why ? \nlike the first scream , craven and screenwriter kevin williamson inaugurate things with a bang . "
## cv747_16556.txt
## "res and murders a suspect . \nthe future zwick and screenwriter lawrence wright offer is all too possible . \nwhat"
## cv749_17765.txt
## " \ndirector liman ( who also photographed go ) and screenwriter august celebrate the drug/rave scene in l . a . ,"
## cv769_8123.txt
## "in another big alien film from the past summer . \nscreenwriter ed solomon , writer of super mario bros . \nand th"
## cv775_16237.txt
## "d be held up as an example to all those hollywood screenwriters . \nscripts as creative and endearing as this shou"
## cv794_15868.txt
## "perado \" and 1996's \" from dusk till dawn \" ) and screenwriter kevin williamson ( 1996's \" scream \" and 1997's \""
## cv812_17924.txt
## "r in extreme ways . \nin order to test this theory screenwriter john august and director doug liman assemble a ca"
## cv818_10211.txt
## "-director of the wildly uneven four rooms and the screenwriter of the gory vampire-fest , from dusk till dawn . "
## cv829_20289.txt
## "'s latest comedy , he tells a story of a troubled screenwriter ( brooks ) who's losing his edge , as his busines"
## cv838_24728.txt
## "estination \" is the best so far . \ntalented young screenwriter jeffrey reddick offers a fresh variation on a fam"
## cv869_23611.txt
## "s of the slasher genre , writer-director wong and screenwriters glen morgan and jeffrey reddick have created a pr"
## cv872_12591.txt
## "es of madison county \" ( richard lagravenese , co-screenwriter here also adapted \" bridges \" ) . \nalthough \" the"
## cv878_15694.txt
## "after 1996's \" scream , \" and written by the same screenwriter , kevin williamson , is a stylish , effective hor"
## cv898_14187.txt
## "esy mark steven johnson , the film's director and screenwriter . \nfortunately , simon birch isn't the real star "
## cv922_10073.txt
## " straight-forward , but for some odd reason , the screenwriters wanted to surprise everyone by giving us somethin"
## cv928_9168.txt
## " guess the outcome of the film . \nbut leave it to screenwriter david mamet to add humor and a few surprises to m"
## cv939_10583.txt
## "e same thing on january 1 , 1999 . \nif craven and screenwriter kevin williamson are able to keep the quality hig"
## cv945_12160.txt
## "cause director brad silberling ( \" casper \" ) and screenwriter dana stevens ( \" blink \" ) wanted to make \" city "
## cv958_12162.txt
## "osterous mystery smilla's sense of snow . \nhe and screenwriter rafael yglesias bring the sprawling tale into cle"
## cv960_29007.txt
## ") , who is seeing his old girlfriend . \nmike ( co-screenwriter steven gevedon ) has discovered a cache of audiot"
## cv974_22941.txt
## "ed teacher . \nit's like payne and taylor , his co-screenwriter , have taken tracy , a girl desperate for friends"
## cv979_18921.txt
## "tle bit of pity , but a whole lot of affection . \nscreenwriters sitch , santo cilauro , tom gleisner , and jane k"
## cv980_10953.txt
## "ten , totatly unoffensive and funny comedy . \nthe screenwriters , director ramis and danny rubin , have written a"
## [1] " "
## [1] "PATTERN: cinematographer"
## cv000_29590.txt
## "zed , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures t"
## cv067_19774.txt
## "atmospherically shot by the silence of the lambs' cinematographer , tak fujimoto ) is actually a drama-its spooky ,"
## cv099_10534.txt
## "nger . \nas depicted by hoblit ( primal fear ) and cinematographer newton thomas sigel , philadelphia is a dark , dr"
## cv121_17302.txt
## "occasionally the hyper-real approach works , with cinematographer elliot davis creating a world of fantasy romance "
## cv154_9328.txt
## "rom editors karen schmeer and shondra merrill and cinematographer robert richardson ( oliver stone's longtime colla"
## cv159_29505.txt
## "ly amazing piotr sobocinski , the oscar-nominated cinematographer behind krzysztof kieslowski's red . \nthe acting i"
## cv215_22240.txt
## " echoing the loneliness of the protagonists . \n ( cinematographer remi adafarasin often allows space to engulf them"
## cv220_29059.txt
## ", and comic-panel framed with a virtuoso grace by cinematographer amy vincent ( _death in venice , ca_ ) , while in"
## cv273_29112.txt
## "han he expected at the beginning . \nit helps that cinematographer matthieu poirot-delpech's crisp lensing complimen"
## cv294_11684.txt
## "hudsucker proxy . \" \nthe film was shot by veteran cinematographer roger deakins , who has worked with the coens on "
## cv297_10047.txt
## "work done by production designer nigel phelps and cinematographer darius khondji . \nas technically adept as jeunet'"
## cv323_29805.txt
## "sarossy who directs spent most of his career as a cinematographer and like the kingpin's lair , he has molded image"
## cv351_15458.txt
## "s the result of a perfect pairing of director and cinematographer . \nkapur and his cinematographer remi adefarasin "
## cv362_15341.txt
## "eth are shekhar kapur's visual delights . \nhe and cinematographer remi adefarasin have crafted a film with a rich c"
## cv386_10080.txt
## "keen eye for the stylish ; his collaboration with cinematographer slawomir idziak , production designer jan roelfs "
## cv397_29023.txt
## "of skilled craftsmen to work behind the camera . \ncinematographer matthew f . leonetti has had a long career of sho"
## cv409_29786.txt
## "( once again wielded by superb and ever-attentive cinematographer eric gautier ) is less appropriate here than in h"
## cv427_10825.txt
## "e telephones are old-fashioned , rotary models . \ncinematographer bill butler is given an opportunity to use unconv"
## cv431_7085.txt
## "is death in the early 80's . \nmore practiced as a cinematographer than a director , bava nonetheless sat in the dir"
## cv505_12090.txt
## "r whose love has turned into fear . \nscorsese and cinematographer michael chapman elected to shoot the bulk of ragi"
## cv519_14661.txt
## "oviding some interesting and colorful costumes . \ncinematographer eduardo serra , whose work was last seen in the l"
## cv532_6522.txt
## " in production long before that film's release . \ncinematographer peter suschitzky , who makes the most of wynn tho"
## cv552_10016.txt
## "t of \" there are shadows in light , baby \" to his cinematographer's complaint about poor set lighting echoes back t"
## cv558_29507.txt
## "nishing foresight , shackleton brought australian cinematographer and photographer frank hurley along on the journe"
## cv583_29692.txt
## " . \nand credit must be handed out to laughton and cinematographer stanley cortez , who create a series of haunting "
## cv589_12064.txt
## "ud of constant violence to the action . \naltman's cinematographer , changwei gu , gives the film a dark , soaked lo"
## cv641_12349.txt
## "o let his actors do most of the work - but he and cinematographer slavomir idziak ( \" gattaca \" ) successfully evok"
## cv661_2450.txt
## "asse hallstrom ( my life as a dog ) and legendary cinematographer sven nykvist create a magnificent visual backdrop"
## cv665_29538.txt
## "nd always very strange . \njunichiro hayashi , the cinematographer who recently has been doing all of kurosawa's fil"
## cv689_12587.txt
## "himmering fields of wheat--all are resplendent in cinematographer robert richardson's viewfinder . \nveteran english"
## cv710_22577.txt
## "ional stars that couldn't possibly be ignored are cinematographer frank griebe and editor mathilde bonnefroy , who "
## cv713_29155.txt
## "in the cyclic screenplay . \ntran anh hung and his cinematographer mark lee ping-bin ( \" flowers of shanghai \" ) lin"
## cv802_28664.txt
## "ected in a coen brothers venture , first class . \ncinematographer roger deakins , who has worked on five previous c"
## cv837_27325.txt
## "ents of each genre to be found within the film . \ncinematographer roger pratt brings an atmospheric , fairy tale lo"
## cv851_20469.txt
## "read of blair , a creepy closed-in feel thanks to cinematographer fred murphy and great performances by the whole c"
## cv868_11948.txt
## "o the senses in a way that few love stories do . \ncinematographer john seale ( the english patient ) provides some "
## cv872_12591.txt
## "e land . \nto further emphasize this , redford and cinematographer robert richardson ( \" natural born killers , \" \" "
## cv887_5126.txt
## "plots , perhaps in homage to d . w . \ngriffith . \ncinematographer ernest dickerson , who has worked on all of spike"
## cv945_12160.txt
## "vie . \nit is beautifully filmed in lush colors by cinematographer john seale ( \" the english patient \" ) , whose sw"
## cv949_20112.txt
## "tin's nonsensical explanation to dave , his loyal cinematographer ( jamie kennedy ) , that every movie , in the end"
## cv958_12162.txt
## "a asp , costume designer gabriella pescucci , and cinematographer jorgen persson give les miserables a sumptuous pe"
## cv963_6895.txt
## "eit a visually interesting one . \ncampion and her cinematographer stuart dryburgh come up with a great variety of e"
## [1] " "
## [1] "PATTERN: oscar winner"
## cv000_29590.txt
## "omparison to that in the black-and-white comic . \noscar winner martin childs' ( shakespeare in love ) production"
## cv108_15571.txt
## "sshoppers is hopper , who is fiendishly voiced by oscar winner kevin spacey . \nwhen the offering is lost hopper "
## cv300_22284.txt
## " he stole every scene he was in away from veteran oscar winner tom hanks . \nrockwell , an independent film veter"
## cv314_14422.txt
## "eresting to see that this movie was one of future oscar winner susan sarandon's ( dead man walking ) first film-"
## cv317_24049.txt
## " the former actually had the insight to follow up oscar winner usual suspects with a pauly shore vehicle , and j"
## cv475_21692.txt
## "on and ben affleck in the starring roles . \nbeing oscar winners for the classic 'good will hunting' they give gr"
## cv689_12587.txt
## "ances with wolves \" by way of \" out of africa \" ( oscar winners both for barry ) . \nfortunately , barry's replac"
## cv700_21947.txt
## "the talent with her that one would expect from an oscar winner . \nrobbie coltrane was easily one of the best par"
## [1] " "
Now we define more complex regular expressions that help identify the people mentioned in the text.
# Refined patterns: each one keeps the cue phrase of a simple pattern and
# adds a capture group for the two words that follow it, which is where a
# person's name is expected.
# Note: in ASCII order the range [A-z] also spans the characters between
# "Z" and "a" (such as "_"), which is consistent with "______ and" showing
# up among the matches. The expressions are kept unchanged so the recorded
# results stay comparable.
pattern1 <- c(
  "created by ([A-z]* [A-z]*)",
  # Second creator introduced by "( and" after the first creator's name.
  "created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)",
  "screenwriter[s]? ([A-z]* [A-z]*)",
  # Optional " ," between the cue word and the name.
  "cinematographer(?: ,)? ([A-z]* [A-z]*)",
  "oscar winner ([A-z]* [A-z]*)"
)
We detect those patterns in the corpus and, as an example, list the files in which the screenwriter pattern appears together with the text it captured.
# Apply every refined pattern to the corpus.
matches1 <- detectPatternsInCorpus(corpus, pattern1)
# Keep only the rows whose 4th column (the screenwriter pattern) is not NA,
# and show the file name (column 1) next to the captured text (column 4).
matches1[!is.na(matches1[4]), c(1, 4)]
## File screenwriter[s]? ([A-z]* [A-z]*)
## 1 cv000_29590.txt terry hayes
## 29 cv028_26746.txt are the
## 30 cv029_18643.txt alex proyas
## 77 cv076_24945.txt toby emmerich
## 79 cv078_14730.txt to dumb
## 87 cv086_18371.txt frank cottrell
## 95 cv094_27889.txt james schamus
## 116 cv115_25396.txt karey kirkpatrick
## 122 cv121_17302.txt can make
## 144 cv143_19666.txt j
## 159 cv158_10390.txt paul rudnick
## 163 cv162_10424.txt amongst the
## 179 cv178_12972.txt for the
## 191 cv190_27052.txt of the
## 192 cv191_29719.txt marc klein
## 226 cv225_29224.txt andy breckman
## 241 cv240_14336.txt use the
## 242 cv241_23130.txt ehren kruger
## 275 cv274_25253.txt karey kirkpatrick
## 319 cv318_10493.txt kevin williamson
## 337 cv336_10143.txt kevin williamson
## 366 cv365_11576.txt tim herlihy
## 371 cv370_5221.txt explain that
## 396 cv395_10849.txt around
## 405 cv404_20315.txt who after
## 406 cv405_20399.txt w
## 411 cv410_24266.txt came in
## 433 cv432_14224.txt david goyer
## 453 cv452_5088.txt larry ferguson
## 465 cv464_15650.txt john eskow
## 467 cv466_18722.txt and director
## 475 cv474_10209.txt fierstein is
## 477 cv476_16856.txt david duncan
## 527 cv526_12083.txt to balance
## 544 cv543_5045.txt elizabeth jane
## 556 cv555_23922.txt steven brill
## 566 cv565_29572.txt william goldman
## 570 cv569_26381.txt ______ and
## 579 cv578_15094.txt mark christopher
## 584 cv583_29692.txt and legendary
## 614 cv613_21796.txt and the
## 621 cv620_24265.txt mary harron
## 645 cv644_17154.txt scott b
## 669 cv668_17604.txt lutz and
## 671 cv670_25826.txt of kingpin
## 673 cv672_28083.txt and actors
## 683 cv682_16139.txt david levien
## 699 cv698_15253.txt zwick
## 706 cv705_11059.txt did a
## 717 cv716_10514.txt john hodge
## 729 cv728_16133.txt marc norman
## 747 cv746_10147.txt kevin williamson
## 748 cv747_16556.txt lawrence wright
## 750 cv749_17765.txt august celebrate
## 770 cv769_8123.txt ed solomon
## 795 cv794_15868.txt kevin williamson
## 813 cv812_17924.txt john august
## 819 cv818_10211.txt of the
## 830 cv829_20289.txt in the, really get
## 839 cv838_24728.txt jeffrey reddick
## 870 cv869_23611.txt glen morgan
## 873 cv872_12591.txt here also
## 923 cv922_10073.txt wanted to
## 929 cv928_9168.txt david mamet
## 940 cv939_10583.txt kevin williamson
## 946 cv945_12160.txt dana stevens
## 959 cv958_12162.txt rafael yglesias
## 961 cv960_29007.txt steven gevedon
## 980 cv979_18921.txt sitch
We print the matches found per pattern.
# For each refined pattern, print a "PATTERN: ..." header followed by the
# text captured for it across the corpus.
printMatchesPerPattern(pattern1, matches1)
## [1] "PATTERN: created by ([A-z]* [A-z]*)"
## [1] "alan moore" "his last" "the earlier" "the animated"
## [5] "fluid dynamics" "edgar rice" "pixar " "the matrix"
## [1] " "
## [1] "PATTERN: created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN: screenwriter[s]? ([A-z]* [A-z]*)"
## [1] "terry hayes" "are the" "alex proyas"
## [4] "toby emmerich" "to dumb" "frank cottrell"
## [7] "james schamus" "karey kirkpatrick" "can make"
## [10] "j " "paul rudnick" "amongst the"
## [13] "for the" "of the" "marc klein"
## [16] "andy breckman" "use the" "ehren kruger"
## [19] "karey kirkpatrick" "kevin williamson" "kevin williamson"
## [22] "tim herlihy" "explain that" "around "
## [25] "who after" "w " "came in"
## [28] "david goyer" "larry ferguson" "john eskow"
## [31] "and director" "fierstein is" "david duncan"
## [34] "to balance" "elizabeth jane" "steven brill"
## [37] "william goldman" "______ and" "mark christopher"
## [40] "and legendary" "and the" "mary harron"
## [43] "scott b" "lutz and" "of kingpin"
## [46] "and actors" "david levien" "zwick "
## [49] "did a" "john hodge" "marc norman"
## [52] "kevin williamson" "lawrence wright" "august celebrate"
## [55] "ed solomon" "kevin williamson" "john august"
## [58] "of the" "in the, really get" "jeffrey reddick"
## [61] "glen morgan" "here also" "wanted to"
## [64] "david mamet" "kevin williamson" "dana stevens"
## [67] "rafael yglesias" "steven gevedon" "sitch "
## [1] " "
## [1] "PATTERN: cinematographer(?: ,)? ([A-z]* [A-z]*)"
## [1] "peter deming" "tak fujimoto" "newton thomas"
## [4] "elliot davis" "robert richardson" "behind krzysztof"
## [7] "remi adafarasin" "amy vincent" "matthieu poirot"
## [10] "roger deakins" "darius khondji" "and like"
## [13] "remi adefarasin" "remi adefarasin" "slawomir idziak"
## [16] "matthew f" "eric gautier" "bill butler"
## [19] "than a" "michael chapman" "eduardo serra"
## [22] "peter suschitzky" "and photographer" "stanley cortez"
## [25] "changwei gu" "slavomir idziak" "sven nykvist"
## [28] "who recently" "robert richardson" "frank griebe"
## [31] "mark lee" "roger deakins" "roger pratt"
## [34] "fred murphy" "john seale" "robert richardson"
## [37] "ernest dickerson" "john seale" "jorgen persson"
## [40] "stuart dryburgh"
## [1] " "
## [1] "PATTERN: oscar winner ([A-z]* [A-z]*)"
## [1] "martin childs" "kevin spacey" "tom hanks" "susan sarandon"
## [5] "usual suspects"
## [1] " "
We check how many patterns we have found in each file.
# Per-file totals for the refined patterns; the result is printed as a
# table with columns File and Count.
countMatchesPerRow(matches1)
## File Count
## 1 cv000_29590.txt 5
## 29 cv028_26746.txt 1
## 30 cv029_18643.txt 1
## 68 cv067_19774.txt 1
## 77 cv076_24945.txt 1
## 79 cv078_14730.txt 1
## 87 cv086_18371.txt 1
## 95 cv094_27889.txt 1
## 100 cv099_10534.txt 1
## 109 cv108_15571.txt 1
## 116 cv115_25396.txt 1
## 122 cv121_17302.txt 2
## 144 cv143_19666.txt 1
## 155 cv154_9328.txt 1
## 159 cv158_10390.txt 1
## 160 cv159_29505.txt 1
## 163 cv162_10424.txt 1
## 179 cv178_12972.txt 1
## 191 cv190_27052.txt 1
## 192 cv191_29719.txt 1
## 206 cv205_9457.txt 1
## 216 cv215_22240.txt 1
## 221 cv220_29059.txt 1
## 226 cv225_29224.txt 1
## 241 cv240_14336.txt 1
## 242 cv241_23130.txt 1
## 274 cv273_29112.txt 1
## 275 cv274_25253.txt 1
## 286 cv285_16494.txt 1
## 295 cv294_11684.txt 1
## 298 cv297_10047.txt 1
## 301 cv300_22284.txt 1
## 315 cv314_14422.txt 1
## 318 cv317_24049.txt 1
## 319 cv318_10493.txt 1
## 324 cv323_29805.txt 1
## 337 cv336_10143.txt 1
## 352 cv351_15458.txt 1
## 363 cv362_15341.txt 1
## 366 cv365_11576.txt 1
## 371 cv370_5221.txt 1
## 372 cv371_7630.txt 1
## 387 cv386_10080.txt 1
## 396 cv395_10849.txt 1
## 398 cv397_29023.txt 1
## 405 cv404_20315.txt 1
## 406 cv405_20399.txt 1
## 410 cv409_29786.txt 1
## 411 cv410_24266.txt 1
## 428 cv427_10825.txt 1
## 432 cv431_7085.txt 1
## 433 cv432_14224.txt 1
## 453 cv452_5088.txt 1
## 465 cv464_15650.txt 1
## 467 cv466_18722.txt 1
## 475 cv474_10209.txt 1
## 477 cv476_16856.txt 1
## 485 cv484_25054.txt 1
## 506 cv505_12090.txt 1
## 520 cv519_14661.txt 1
## 527 cv526_12083.txt 1
## 533 cv532_6522.txt 1
## 544 cv543_5045.txt 1
## 556 cv555_23922.txt 1
## 559 cv558_29507.txt 1
## 566 cv565_29572.txt 1
## 570 cv569_26381.txt 1
## 579 cv578_15094.txt 1
## 584 cv583_29692.txt 2
## 590 cv589_12064.txt 1
## 614 cv613_21796.txt 1
## 621 cv620_24265.txt 1
## 629 cv628_19325.txt 1
## 642 cv641_12349.txt 1
## 645 cv644_17154.txt 1
## 662 cv661_2450.txt 1
## 666 cv665_29538.txt 1
## 669 cv668_17604.txt 1
## 671 cv670_25826.txt 1
## 673 cv672_28083.txt 1
## 683 cv682_16139.txt 1
## 690 cv689_12587.txt 1
## 699 cv698_15253.txt 1
## 706 cv705_11059.txt 1
## 711 cv710_22577.txt 1
## 714 cv713_29155.txt 1
## 717 cv716_10514.txt 1
## 729 cv728_16133.txt 1
## 744 cv743_15449.txt 1
## 747 cv746_10147.txt 1
## 748 cv747_16556.txt 1
## 750 cv749_17765.txt 1
## 770 cv769_8123.txt 1
## 795 cv794_15868.txt 1
## 803 cv802_28664.txt 1
## 813 cv812_17924.txt 1
## 819 cv818_10211.txt 1
## 830 cv829_20289.txt 1
## 838 cv837_27325.txt 1
## 839 cv838_24728.txt 1
## 852 cv851_20469.txt 1
## 869 cv868_11948.txt 1
## 870 cv869_23611.txt 1
## 873 cv872_12591.txt 2
## 888 cv887_5126.txt 1
## 893 cv892_17576.txt 1
## 923 cv922_10073.txt 1
## 929 cv928_9168.txt 1
## 940 cv939_10583.txt 1
## 946 cv945_12160.txt 2
## 959 cv958_12162.txt 2
## 961 cv960_29007.txt 1
## 964 cv963_6895.txt 1
## 980 cv979_18921.txt 1
And we check how many times each pattern has been found.
# Per-pattern totals for the refined patterns; the result is printed as a
# table with one row per pattern (columns Entity and Count).
countMatchesPerColumn(matches1)
## Entity Count
## 1 created by ([A-z]* [A-z]*) 8
## 2 created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*) 1
## 3 screenwriter[s]? ([A-z]* [A-z]*) 69
## 4 cinematographer(?: ,)? ([A-z]* [A-z]*) 40
## 5 oscar winner ([A-z]* [A-z]*) 5
Put all matches in a list for comparison with a gold standard.
# Collapse the detected entities into one list of matches per file; the
# result has the columns Files and Matches.
# NOTE(review): allEntities is not assigned in this part of the script; the
# first row printed below holds the same five entities found for
# cv000_29590.txt in matches1 - confirm where allEntities comes from.
allMatches <- mergeAllMatchesInLists(allEntities)
# Preview the first rows.
head(allMatches)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## Matches
## 1 alan moore, eddie campbell, terry hayes, peter deming, martin childs
## 2 NULL
## 3 NULL
## 4 NULL
## 5 NULL
## 6 NULL
Load the gold standard and put all gold standard matches in a list for comparison.
# Read the manually annotated gold standard: a semicolon-separated file in
# which every field is read as character, quoting is disabled and empty
# fields become NA.
goldStandard <- read.table(
  file = "goldStandard.csv",
  quote = "",
  na.strings = c(""),
  colClasses = "character",
  sep = ";"
)
# Collapse the gold standard into one list of entities per file, with the
# same Files / Matches layout as the detected matches.
allMatchesGold <- mergeGoldStandardInLists(goldStandard)
# Preview the first rows.
head(allMatchesGold)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3 ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4 john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5 herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6 raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi
Calculate the metrics (precision, recall, f-measure).
# Compare the detected entities against the gold standard; the result is a
# one-row table with the columns Precision, Recall and Fmeasure.
metrics <- calculateMetrics(allMatches, allMatchesGold)
# Display the scores.
metrics
## Precision Recall Fmeasure
## 1 0.8181818 0.003370029 0.006712411