The goal of this document is to show a sample script for pattern-based entity recognition over text documents using the openNLP (natural language processing) and the tm (text mining) packages in R.
I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks everyone for sharing.
Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.
getwd()
## [1] "/Users/alvaro.arranz/Universidad/Intelligent Systems/HandsOn-3"
setwd("~/Universidad/Intelligent Systems/HandsOn-3")
Now we load the required libraries. Only a couple of things to mention:
The annotate function used with openNLP may need to be called with its package name made explicit (i.e., `NLP::annotate`) due to a name clash with ggplot2.
# Needed for OutOfMemoryError: Java heap space
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging
library(NLP)
library(openNLP)
library(openNLPmodels.en)
library(tm)
library(stringr)
getAnnotationsFromDocument returns annotations for the text document: word, sentence, part-of-speech, and Penn Treebank parse annotations.
As an alternative, the koRpus package uses TreeTagger for POS tagging.
# Annotate a document with sentence, word, part-of-speech and parse information.
#
# Args:
#   doc: a text document; it is converted with as.String() before annotating.
# Returns:
#   The annotations produced by the last (parse) pass, which is run on top of
#   the sentence/word and part-of-speech passes.
getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)
  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  word_token_annotator <- Maxent_Word_Token_Annotator()
  pos_tag_annotator <- Maxent_POS_Tag_Annotator()
  parse_annotator <- Parse_Annotator()
  # annotate() is called through its namespace on purpose: ggplot2 also exports
  # an annotate() that masks this one whenever ggplot2 is attached afterwards.
  tokens <- NLP::annotate(text, list(sent_token_annotator, word_token_annotator))
  tagged <- NLP::annotate(text, pos_tag_annotator, tokens)
  NLP::annotate(text, parse_annotator, tagged)
}
getAnnotatedMergedDocument returns the text document merged with the annotations.
# Render a document as a single string of "token/POS" pairs separated by spaces.
#
# Args:
#   doc: the text document the annotations refer to.
#   annotations: annotations of that document; only those of type "word" are
#     used, and their "POS" feature supplies the tag.
# Returns:
#   One character string with every word followed by "/" and its POS tag.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  pos_tags <- sapply(words$features, `[[`, "POS")
  tagged_words <- sprintf("%s/%s", text[words], pos_tags)
  paste(tagged_words, collapse = " ")
}
getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.
# Bundle a document and its annotations into an AnnotatedPlainTextDocument.
#
# Args:
#   doc: the text document; converted with as.String().
#   annotations: the annotations computed for that document.
# Returns:
#   The AnnotatedPlainTextDocument built from both.
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}
detectPatternOnDocument returns the pattern detected on an AnnotatedPlainTextDocument.
# Find every occurrence of a regular expression in a document.
#
# Args:
#   doc: the document to search; converted with as.String().
#   pattern: a regular expression understood by stringr::str_match_all().
# Returns:
#   NA when nothing is found. Otherwise one string with all hits joined by
#   ", ". If the pattern has no capture groups a hit is the whole match; if it
#   has groups, a hit is the captured groups of that match joined by " ".
detectPatternOnDocument <- function(doc, pattern) {
  text <- as.String(doc)
  found <- str_match_all(text, pattern)[[1]]
  # No rows means no matches at all.
  if (nrow(found) == 0) {
    return(NA)
  }
  if (ncol(found) == 1) {
    # No capture groups: report the whole match.
    hits <- found[, 1]
  } else {
    groups <- found[, -1, drop = FALSE]
    # str_match_all() reports a group that did not take part in the match as
    # NA. Pasting it would put the literal text "NA" into the result, which
    # later is.na() checks would mistake for a real hit, so such groups are
    # left out.
    hits <- apply(groups, 1, function(captured) {
      captured <- captured[!is.na(captured)]
      if (length(captured) == 0) {
        NA_character_
      } else {
        paste(captured, collapse = " ")
      }
    })
    # A match in which no group captured anything yields no entity.
    hits <- hits[!is.na(hits)]
    if (length(hits) == 0) {
      return(NA)
    }
  }
  paste(hits, collapse = ", ")
}
detectPatternOnDocumentWithContext returns the pattern detected on an AnnotatedPlainTextDocument with some context.
# Return the first occurrence of a pattern together with its surrounding text.
#
# Args:
#   doc: the document to search; converted with as.String().
#   pattern: a regular expression understood by stringr::str_locate().
#   context: number of characters to keep on each side of the match
#     (defaults to 50, the width that used to be hard-coded).
# Returns:
#   The matched text plus up to `context` characters before and after it, or
#   NA when the pattern is not found (str_locate() then reports NA positions).
detectPatternOnDocumentWithContext <- function(doc, pattern, context = 50) {
  text <- as.String(doc)
  span <- str_locate(text, pattern)
  # Column 1 is the start and column 2 the end of the first match; substr()
  # clips positions that fall outside the text.
  substr(text, span[1, 1] - context, span[1, 2] + context)
}
detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.
# Search every document of a corpus for every pattern.
#
# Args:
#   corpus: a tm corpus; each document must expose an "id" via meta().
#   patterns: character vector of regular expressions.
# Returns:
#   A data frame with one row per document: column "File" holds the document
#   id and one further column per pattern (named after the pattern) holds the
#   result of detectPatternOnDocument() for that document.
detectPatternsInCorpus <- function(corpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)
  # Nothing to fill in for an empty corpus; assigning an empty result to a
  # column would drop that column instead.
  if (length(corpus) == 0) {
    return(vallEntities)
  }
  # seq_along()/seq_len() instead of 1:n so that an empty input runs zero
  # iterations rather than iterating over c(1, 0).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(corpus, detectPatternOnDocument,
                                           pattern = patterns[i]))
  }
  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}
detectPatternsInTaggedCorpus returns a data frame with all the patterns detected in an annotated corpus.
# Search every document of an annotated corpus for every pattern.
#
# Args:
#   corpus: the original tm corpus; only used to read each document's "id"
#     through meta().
#   taggedCorpus: the annotated counterpart of `corpus`, in the same order;
#     this is the text the patterns are matched against.
#   patterns: character vector of regular expressions.
# Returns:
#   A data frame with one row per document: column "File" holds the document
#   id and one further column per pattern (named after the pattern) holds the
#   result of detectPatternOnDocument() on the tagged document.
detectPatternsInTaggedCorpus <- function(corpus, taggedCorpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)
  # Nothing to fill in for an empty corpus; assigning an empty result to a
  # column would drop that column instead.
  if (length(corpus) == 0) {
    return(vallEntities)
  }
  # seq_along()/seq_len() instead of 1:n so that an empty input runs zero
  # iterations rather than iterating over c(1, 0).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(lapply(taggedCorpus, detectPatternOnDocument,
                                           pattern = patterns[i]))
  }
  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}
countMatchesPerColumn returns the number of matches per pattern/column.
Counts, for each pattern (column), the number of rows (files) with non-NA values.
# Count, for every pattern column, how many rows hold a non-NA value.
#
# Args:
#   df: a matches data frame whose first column identifies the file and whose
#     remaining columns hold the matches of one pattern each.
# Returns:
#   A data frame with columns "Entity" (the pattern column name) and "Count"
#   (number of non-NA rows in that column). It has zero rows when `df` has no
#   pattern columns.
countMatchesPerColumn <- function(df) {
  # Dropping the first column by position also copes with a frame that has no
  # pattern columns, where 2:length(names(df)) would count backwards.
  pattern_cols <- df[-1]
  non_na <- vapply(pattern_cols, function(column) sum(!is.na(column)),
                   integer(1))
  data.frame(
    Entity = names(pattern_cols),
    Count = unname(non_na),
    stringsAsFactors = FALSE
  )
}
countMatchesPerRow returns the number of entities per file/row.
Counts, for each file (row), the number of columns (patterns) with non-NA values.
# Count, for every file, how many pattern columns hold a non-NA value.
#
# Args:
#   df: a matches data frame with a "File" column first and one column per
#     pattern after it.
# Returns:
#   A data frame with columns "File" and "Count", restricted to the files with
#   at least one match. Row names keep the row positions of `df`.
countMatchesPerRow <- function(df) {
  # Vectorised over all rows; unlike 1:nrow(df) and 2:length(...), this also
  # works for a frame with zero rows or with no pattern columns.
  counts <- data.frame(
    File = df$File,
    Count = as.integer(rowSums(!is.na(df[-1]))),
    stringsAsFactors = FALSE
  )
  counts[counts$Count != 0, , drop = FALSE]
}
printMatchesPerPattern prints the matches found per pattern.
# Print, pattern by pattern, the non-NA matches that were found.
#
# Args:
#   patterns: character vector of patterns, in the same order as the pattern
#     columns of `matches`.
#   matches: a matches data frame; column i + 1 holds the matches of
#     patterns[i].
# Prints a header line, the matches and a blank separator for each pattern.
printMatchesPerPattern <- function(patterns, matches) {
  # seq_along() prints nothing for an empty pattern vector, where
  # 1:length(patterns) would iterate over c(1, 0).
  for (i in seq_along(patterns)) {
    print(paste("PATTERN: ", patterns[i]))
    column <- matches[, i + 1]
    print(column[!is.na(unlist(column))])
    print(" ")
  }
}
mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.
# Collapse all pattern columns of a matches data frame into one list per file.
#
# Args:
#   df: a matches data frame; the first column identifies the file, every
#     other column holds either NA, a single match, or several matches
#     separated by ",".
# Returns:
#   A data frame with columns "Files" (first column of `df`) and "Matches"
#   (a list column: the distinct, whitespace-trimmed matches of each file).
mergeAllMatchesInLists <- function(df) {
  # Empty for a frame without pattern columns, where 2:ncol(df) would count
  # backwards.
  pattern_cols <- seq_len(ncol(df))[-1]
  matchesPerFile <- lapply(seq_len(nrow(df)), function(row) {
    found <- list()
    for (col in pattern_cols) {
      cell <- df[row, col]
      if (grepl(",", cell)) {
        # Several matches in one cell: split them and keep each one. The pieces
        # are handled as a vector, so no inner loop reuses the column index.
        pieces <- strsplit(as.character(cell), split = ",")[[1]]
        found <- c(found, as.list(str_trim(pieces)))
      } else if (!is.na(cell)) {
        found <- c(found, str_trim(cell))
      }
    }
    unique(found)
  })
  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile
  allMatches
}
mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.
# Collapse the gold-standard columns of a data frame into one list per file.
#
# Args:
#   df: a gold-standard data frame; the first column identifies the file and
#     the remaining columns hold the expected matches (NA where there is none).
# Returns:
#   A data frame with columns "Files" (first column of `df`) and "Matches"
#   (a list column with the non-NA gold-standard values of each file).
mergeGoldStandardInLists <- function(df) {
  # seq_len() keeps both index ranges empty for degenerate frames, where
  # 1:nrow(df) and 2:length(df) would count backwards.
  value_cols <- seq_len(ncol(df))[-1]
  matchesPerFile <- lapply(seq_len(nrow(df)), function(row) {
    as.list(unlist(Filter(Negate(is.na), df[row, value_cols])))
  })
  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile
  allMatches
}
calculateMetrics calculates precision, recall and f-measure according to a gold standard.
# Compute precision, recall and F-measure of detected matches against a gold
# standard.
#
# Args:
#   matches: data frame whose second column is a list column with the matches
#     detected for each file.
#   matches.gs: data frame with a "Matches" list column (also its second
#     column) holding the gold-standard matches, row-aligned with `matches`.
#   beta: weight of recall relative to precision in the F-measure
#     (defaults to 1, the balanced F1 score).
# Returns:
#   A one-row data frame with columns "Precision", "Recall" and "Fmeasure".
#   Files without gold-standard matches are ignored. A metric whose
#   denominator is zero is reported as 0.
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0
  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) != 0) {
      found <- str_trim(unlist(matches[i, 2]))
      gold <- unname(unlist(matches.gs[i, 2]))
      numCorrect <- numCorrect + length(intersect(found, gold))
      allAnswers <- allAnswers + length(found)
      possibleAnswers <- possibleAnswers + length(gold)
    }
  }
  # Guard the divisions: 0 / 0 is NaN, and comparing NaN inside if() fails
  # with "missing value where TRUE/FALSE needed".
  precision <- if (allAnswers > 0) numCorrect / allAnswers else 0
  recall <- if (possibleAnswers > 0) numCorrect / possibleAnswers else 0
  if (precision == 0 && recall == 0) {
    fmeasure <- 0
  } else {
    # F-beta weights recall by beta squared, not by its square root; both
    # forms coincide only for beta = 1.
    fmeasure <- ((beta^2 + 1) * precision * recall) /
      ((beta^2 * precision) + recall)
  }
  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}
We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.
Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.
We are only going to load the first 500 reviews.
# Point tm at the directory of positive reviews; files are read as UTF-8.
source.pos = DirSource("./Corpus/review_polarity_small/txt_sentoken/pos", encoding = "UTF-8")
# Build the corpus from that directory source.
corpus = Corpus(source.pos)
Let’s take a look at the document in the first entry.
inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4226
##
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.
This step may take long depending on the size of the corpus and on the annotations that we want to identify.
# One set of annotations per document, in corpus order.
annotations = lapply(corpus, getAnnotationsFromDocument)
The first annotations are sentence annotations. They indicate where the sentence starts and where it ends. In constituents we can access the tokens in the sentence (and check the number of tokens it has). In parse we can access the parse tree.
head(annotations[[1]])
## id type start end features
## 1 sentence 1 265 constituents=<<integer,54>>,
## parse=<<character,1>>
## 2 sentence 268 439 constituents=<<integer,36>>,
## parse=<<character,1>>
## 3 sentence 442 591 constituents=<<integer,27>>,
## parse=<<character,1>>
## 4 sentence 594 797 constituents=<<integer,44>>,
## parse=<<character,1>>
## 5 sentence 800 939 constituents=<<integer,28>>,
## parse=<<character,1>>
## 6 sentence 942 1299 constituents=<<integer,70>>,
## parse=<<character,1>>
Word annotations are also defined. They indicate where the word starts, where it ends, and the part-of-speech tag.
tail(annotations[[1]])
## id type start end features
## 844 word 4189 4197 POS=NN
## 845 word 4199 4199 POS=,
## 846 word 4201 4208 POS=NN
## 847 word 4210 4212 POS=CC
## 848 word 4214 4217 POS=NN
## 849 word 4219 4225 POS=NN
We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Pair each document with its annotations; Map() walks both in parallel.
corpus.tagged = Map(getAnnotatedPlainTextDocument, corpus, annotations)
# Show the first annotated document.
corpus.tagged[[1]]
## <<AnnotatedPlainTextDocument>>
## Metadata: 0
## Annotations: length: 849
## Content: chars: 4226
We can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Build the inline "token/POS" text of each document from its annotations.
corpus.taggedText = Map(getAnnotatedMergedDocument, corpus, annotations)
# Show the tagged text of the first document.
corpus.taggedText[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"
Based on the first file, we define some simple string patterns to try to identify people appearances.
# Seed patterns taken from the first review: phrases that introduce people.
pattern0 <- c(
  "created by",
  "screenwriter[s]?",
  "cinematographer",
  "oscar winner"
)
We detect those patterns in the corpus and we can see in which files they do appear.
# One row per file, one column per pattern in pattern0.
matches0 = detectPatternsInCorpus(corpus, pattern0)
# Keep the rows where the second pattern (column 3) matched; show file and match.
matches0[!is.na(matches0[3]),c(1,3)]
## File screenwriter[s]?
## 1 cv000_29590.txt screenwriters
## 29 cv028_26746.txt screenwriters
## 30 cv029_18643.txt screenwriter
## 77 cv076_24945.txt screenwriter
## 79 cv078_14730.txt screenwriter, screenwriter
## 87 cv086_18371.txt screenwriter
## 95 cv094_27889.txt screenwriters
## 116 cv115_25396.txt screenwriter
## 122 cv121_17302.txt screenwriter
## 136 cv135_11603.txt screenwriters
## 143 cv142_22516.txt screenwriter
## 144 cv143_19666.txt screenwriter
## 159 cv158_10390.txt screenwriter
## 163 cv162_10424.txt screenwriter, screenwriter, screenwriter
## 179 cv178_12972.txt screenwriter
## 191 cv190_27052.txt screenwriter
## 192 cv191_29719.txt screenwriter
## 209 cv208_9020.txt screenwriters
## 226 cv225_29224.txt screenwriter
## 236 cv235_10217.txt screenwriters
## 241 cv240_14336.txt screenwriters
## 242 cv241_23130.txt screenwriter
## 275 cv274_25253.txt screenwriter
## 319 cv318_10493.txt screenwriter
## 337 cv336_10143.txt screenwriter
## 360 cv359_6647.txt screenwriter
## 366 cv365_11576.txt screenwriter
## 371 cv370_5221.txt screenwriter
## 396 cv395_10849.txt screenwriters
## 405 cv404_20315.txt screenwriter, screenwriter
## 406 cv405_20399.txt screenwriter
## 411 cv410_24266.txt screenwriters
## 433 cv432_14224.txt screenwriter
## 453 cv452_5088.txt screenwriters
## 457 cv456_18985.txt screenwriters
## 465 cv464_15650.txt screenwriters
## 467 cv466_18722.txt screenwriter
## 475 cv474_10209.txt screenwriter
## 477 cv476_16856.txt screenwriter
We check how many patterns we have found in each file.
countMatchesPerRow(matches0)
## File Count
## 1 cv000_29590.txt 4
## 29 cv028_26746.txt 1
## 30 cv029_18643.txt 1
## 68 cv067_19774.txt 1
## 77 cv076_24945.txt 1
## 79 cv078_14730.txt 1
## 87 cv086_18371.txt 1
## 95 cv094_27889.txt 1
## 100 cv099_10534.txt 1
## 109 cv108_15571.txt 1
## 116 cv115_25396.txt 1
## 122 cv121_17302.txt 2
## 136 cv135_11603.txt 1
## 143 cv142_22516.txt 1
## 144 cv143_19666.txt 1
## 155 cv154_9328.txt 1
## 159 cv158_10390.txt 1
## 160 cv159_29505.txt 1
## 163 cv162_10424.txt 1
## 179 cv178_12972.txt 1
## 191 cv190_27052.txt 1
## 192 cv191_29719.txt 1
## 206 cv205_9457.txt 1
## 209 cv208_9020.txt 1
## 216 cv215_22240.txt 1
## 221 cv220_29059.txt 1
## 226 cv225_29224.txt 1
## 236 cv235_10217.txt 1
## 241 cv240_14336.txt 1
## 242 cv241_23130.txt 1
## 274 cv273_29112.txt 1
## 275 cv274_25253.txt 1
## 286 cv285_16494.txt 1
## 295 cv294_11684.txt 1
## 298 cv297_10047.txt 1
## 301 cv300_22284.txt 1
## 315 cv314_14422.txt 1
## 318 cv317_24049.txt 1
## 319 cv318_10493.txt 1
## 324 cv323_29805.txt 1
## 325 cv324_7082.txt 1
## 337 cv336_10143.txt 1
## 352 cv351_15458.txt 1
## 360 cv359_6647.txt 1
## 363 cv362_15341.txt 1
## 366 cv365_11576.txt 1
## 371 cv370_5221.txt 1
## 372 cv371_7630.txt 1
## 387 cv386_10080.txt 1
## 396 cv395_10849.txt 1
## 398 cv397_29023.txt 1
## 405 cv404_20315.txt 1
## 406 cv405_20399.txt 1
## 410 cv409_29786.txt 1
## 411 cv410_24266.txt 1
## 428 cv427_10825.txt 1
## 432 cv431_7085.txt 1
## 433 cv432_14224.txt 1
## 453 cv452_5088.txt 1
## 457 cv456_18985.txt 1
## 465 cv464_15650.txt 1
## 467 cv466_18722.txt 1
## 475 cv474_10209.txt 1
## 476 cv475_21692.txt 1
## 477 cv476_16856.txt 1
## 485 cv484_25054.txt 1
And we check how many times each pattern has been found.
countMatchesPerColumn(matches0)
## Entity Count
## 1 created by 6
## 2 screenwriter[s]? 39
## 3 cinematographer 19
## 4 oscar winner 6
And we print the context in which the patterns are found, to see if we can build better patterns.
# For every pattern, print the first occurrence in each document together with
# its surrounding text; documents without a match (NA) are left out.
# seq_along() runs zero iterations if pattern0 is empty, where
# 1:length(pattern0) would iterate over c(1, 0).
for (i in seq_along(pattern0)) {
  print(paste("PATTERN: ", pattern0[i]))
  strings <- lapply(corpus, detectPatternOnDocumentWithContext,
                    pattern = pattern0[i])
  print(unlist(strings[!is.na(unlist(strings))]))
  print(" ")
}
## [1] "PATTERN: created by"
## cv000_29590.txt
## "ok like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought t"
## cv205_9457.txt
## "turvy . \nrobert zemeckis , back from the euphoria created by his last film , forrest gump , once again proves "
## cv285_16494.txt
## "ve got mail like dried-up mistletoe . \nthe sparks created by the earlier movie are , by necessity , not eviden"
## cv324_7082.txt
## "the real thing . \nthe two of them , as characters created by fingal's imagination , serve as aspects of his pe"
## cv371_7630.txt
## "nd always right on the mark , enhancing the moods created by the animated scenery . \nas far as the subtitles g"
## cv484_25054.txt
## " and there are cliches , but the walls of water , created by fluid dynamics simulating real-life phenomena , a"
## [1] " "
## [1] "PATTERN: screenwriter[s]?"
## cv000_29590.txt
## " bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesia"
## cv028_26746.txt
## "d the story is so complex and \" clever \" that the screenwriters are the first to get lost in it . \nthere is is no"
## cv029_18643.txt
## "mise certainly is interesting and director and co-screenwriter alex proyas is able to keep the film consistently"
## cv076_24945.txt
## " logical confusion . \ndirector gregory hoblit and screenwriter toby emmerich structure \" frequency \" as good hol"
## cv078_14730.txt
## "different , it would be easy for the director and screenwriter to dumb it down and appeal to the lowest common d"
## cv086_18371.txt
## "itively made . \ndirector michael winterbottom and screenwriter frank cottrell boyce vividly express the societal"
## cv094_27889.txt
## "it prince acquit themselves admirably . \nkudos to screenwriters james schamus , wang hui ling and tsai kuo jing ,"
## cv115_25396.txt
## "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n"
## cv121_17302.txt
## "i relish those rare opportunities when a talented screenwriter can make me feel like a fool . \ni spent the first"
## cv135_11603.txt
## "is a highly enjoyable ride . \nonce again , chan's screenwriters ( here edward tang and fibe ma ) have taken the e"
## cv142_22516.txt
## "e hallstrom ( what's eating gilbert grape ? ) and screenwriter/novelist john irving ( the world according to gar"
## cv143_19666.txt
## "sy to have written vivian as merely a flake , but screenwriter j . f . lawton clearly cared too much about his c"
## cv158_10390.txt
## " potentially hilarious comedy , and pitched it to screenwriter paul rudnick . \nit's true that if this same film "
## cv162_10424.txt
## "rise to fame , quickly become the most well-known screenwriter amongst the entertainment weekly-reading , box of"
## cv178_12972.txt
## "aseball bat , but i'm pretty sure andrew niccol , screenwriter for the truman show , has had the same curious th"
## cv190_27052.txt
## "has been sharply pieced together by tony gilroy , screenwriter of the devil's advocate and dolores claiborne . \n"
## cv191_29719.txt
## "\" was right after all . \nfortunately , first-time screenwriter marc klein has sketched strong , well-rounded , c"
## cv208_9020.txt
## "ble , but after all the star power , mega bucks , screenwriters , directors , and cool trailers , men in black is"
## cv225_29224.txt
## "efall the participants en route to silver city . \nscreenwriter andy breckman adds a nice touch by not having the"
## cv235_10217.txt
## "too much to it , but capra and the gang ( various screenwriters , composers , actors ) plumet the material to its"
## cv240_14336.txt
## "sh anything , it just comes natural to him . \nthe screenwriters use the right words and phrases to describe the m"
## cv241_23130.txt
## "lf more seriously this time ; maybe so , or maybe screenwriter ehren kruger ( arlington road ) , who took over a"
## cv274_25253.txt
## "ins an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does n"
## cv318_10493.txt
## "nd-up from veteran horror director wes craven and screenwriter kevin williamson that seemed to breathe new life "
## cv336_10143.txt
## "ed for scream 2 , including director wes craven , screenwriter kevin williamson , and actors neve campbell , cou"
## cv359_6647.txt
## "y after all . \nfavreau also doubled as the film's screenwriter , and he proves he has the gift for creating enga"
## cv365_11576.txt
## "ing . \nthat's not to say the movie isn't funny . \nscreenwriter tim herlihy has written for sandler before ( bill"
## cv370_5221.txt
## " . \nit tore at my heart to watch a gifted lesbian screenwriter explain that , as a rule , gay audiences hunger f"
## cv395_10849.txt
## "jake kasdan , son of one of the best screenwriters around , breaks into filmmaking by writing and di"
## cv404_20315.txt
## "albert brooks plays steven phillips , a hollywood screenwriter who after winning a humanitarian award for his wo"
## cv405_20399.txt
## " the 1999 film outside providence ( 6 . 5/10 ) . \nscreenwriter w . peter iliff also had a part in writing the sc"
## cv410_24266.txt
## " . \nminkoff likes to point out scenes where other screenwriters came in and polished up the script , namely write"
## cv432_14224.txt
## "ments work very well -- for a comic book story . \nscreenwriter david goyer ( who also wrote the crow ) incorpora"
## cv452_5088.txt
## "tober is distinguished by its water-tight plot . \nscreenwriters larry ferguson and donald stewart have gracefully"
## cv456_18985.txt
## "d , and all are handled exceptionally well by the screenwriters . \nthere is no shred of doubt left to ponder afte"
## cv464_15650.txt
## "ture and charm holds right up to the last reel . \nscreenwriters john eskow , ted elliot and terry rosio have unfo"
## cv466_18722.txt
## "david mamet has long been my favorite screenwriter and director . \nwith his distinctive , more often"
## cv474_10209.txt
## "s that \" genius is insanity with some success \" , screenwriter fierstein is taking a lazy shortcut ) , pryce mak"
## cv476_16856.txt
## "threat of class struggle ; for george pal and his screenwriter david duncan , who produced the film in the worst"
## [1] " "
## [1] "PATTERN: cinematographer"
## cv000_29590.txt
## "zed , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures t"
## cv067_19774.txt
## "atmospherically shot by the silence of the lambs' cinematographer , tak fujimoto ) is actually a drama-its spooky ,"
## cv099_10534.txt
## "nger . \nas depicted by hoblit ( primal fear ) and cinematographer newton thomas sigel , philadelphia is a dark , dr"
## cv121_17302.txt
## "occasionally the hyper-real approach works , with cinematographer elliot davis creating a world of fantasy romance "
## cv154_9328.txt
## "rom editors karen schmeer and shondra merrill and cinematographer robert richardson ( oliver stone's longtime colla"
## cv159_29505.txt
## "ly amazing piotr sobocinski , the oscar-nominated cinematographer behind krzysztof kieslowski's red . \nthe acting i"
## cv215_22240.txt
## " echoing the loneliness of the protagonists . \n ( cinematographer remi adafarasin often allows space to engulf them"
## cv220_29059.txt
## ", and comic-panel framed with a virtuoso grace by cinematographer amy vincent ( _death in venice , ca_ ) , while in"
## cv273_29112.txt
## "han he expected at the beginning . \nit helps that cinematographer matthieu poirot-delpech's crisp lensing complimen"
## cv294_11684.txt
## "hudsucker proxy . \" \nthe film was shot by veteran cinematographer roger deakins , who has worked with the coens on "
## cv297_10047.txt
## "work done by production designer nigel phelps and cinematographer darius khondji . \nas technically adept as jeunet'"
## cv323_29805.txt
## "sarossy who directs spent most of his career as a cinematographer and like the kingpin's lair , he has molded image"
## cv351_15458.txt
## "s the result of a perfect pairing of director and cinematographer . \nkapur and his cinematographer remi adefarasin "
## cv362_15341.txt
## "eth are shekhar kapur's visual delights . \nhe and cinematographer remi adefarasin have crafted a film with a rich c"
## cv386_10080.txt
## "keen eye for the stylish ; his collaboration with cinematographer slawomir idziak , production designer jan roelfs "
## cv397_29023.txt
## "of skilled craftsmen to work behind the camera . \ncinematographer matthew f . leonetti has had a long career of sho"
## cv409_29786.txt
## "( once again wielded by superb and ever-attentive cinematographer eric gautier ) is less appropriate here than in h"
## cv427_10825.txt
## "e telephones are old-fashioned , rotary models . \ncinematographer bill butler is given an opportunity to use unconv"
## cv431_7085.txt
## "is death in the early 80's . \nmore practiced as a cinematographer than a director , bava nonetheless sat in the dir"
## [1] " "
## [1] "PATTERN: oscar winner"
## cv000_29590.txt
## "omparison to that in the black-and-white comic . \noscar winner martin childs' ( shakespeare in love ) production"
## cv108_15571.txt
## "sshoppers is hopper , who is fiendishly voiced by oscar winner kevin spacey . \nwhen the offering is lost hopper "
## cv300_22284.txt
## " he stole every scene he was in away from veteran oscar winner tom hanks . \nrockwell , an independent film veter"
## cv314_14422.txt
## "eresting to see that this movie was one of future oscar winner susan sarandon's ( dead man walking ) first film-"
## cv317_24049.txt
## " the former actually had the insight to follow up oscar winner usual suspects with a pauly shore vehicle , and j"
## cv475_21692.txt
## "on and ben affleck in the starring roles . \nbeing oscar winners for the classic 'good will hunting' they give gr"
## [1] " "
Now we define more complex regular expressions that help identify appearances of people.
# Regular expressions used to spot people: each one anchors on a cue phrase
# ("created by", "screenwriter", ...) and captures the two words that follow.
# NOTE(review): in ASCII order `[A-z]` also admits `[`, `\`, `]`, `^`, `_` and
# the backtick, and `*` lets either captured word be empty (see "j " and
# "around " in the output); `[A-Za-z]+` may be what was intended. The patterns
# are left untouched because the recorded output uses them verbatim as labels.
pattern1 <- c(
  "created by ([A-z]* [A-z]*)",
  "created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)",
  "screenwriter[s]? ([A-z]* [A-z]*)",
  "cinematographer(?: ,)? ([A-z]* [A-z]*)",
  "oscar winner ([A-z]* [A-z]*)"
)
We detect those patterns in the corpus and see in which files they appear.
# Run every pattern in pattern1 over the corpus (helper defined outside this
# section), then show columns 1 and 4 for the rows whose 4th column is not NA.
# In the output below, column 1 is the file name and column 4 the
# screenwriter pattern.
matches1 <- detectPatternsInCorpus(corpus, pattern1)
matches1[!is.na(matches1[4]), c(1, 4)]
## File screenwriter[s]? ([A-z]* [A-z]*)
## 1 cv000_29590.txt terry hayes
## 29 cv028_26746.txt are the
## 30 cv029_18643.txt alex proyas
## 77 cv076_24945.txt toby emmerich
## 79 cv078_14730.txt to dumb
## 87 cv086_18371.txt frank cottrell
## 95 cv094_27889.txt james schamus
## 116 cv115_25396.txt karey kirkpatrick
## 122 cv121_17302.txt can make
## 144 cv143_19666.txt j
## 159 cv158_10390.txt paul rudnick
## 163 cv162_10424.txt amongst the
## 179 cv178_12972.txt for the
## 191 cv190_27052.txt of the
## 192 cv191_29719.txt marc klein
## 226 cv225_29224.txt andy breckman
## 241 cv240_14336.txt use the
## 242 cv241_23130.txt ehren kruger
## 275 cv274_25253.txt karey kirkpatrick
## 319 cv318_10493.txt kevin williamson
## 337 cv336_10143.txt kevin williamson
## 366 cv365_11576.txt tim herlihy
## 371 cv370_5221.txt explain that
## 396 cv395_10849.txt around
## 405 cv404_20315.txt who after
## 406 cv405_20399.txt w
## 411 cv410_24266.txt came in
## 433 cv432_14224.txt david goyer
## 453 cv452_5088.txt larry ferguson
## 465 cv464_15650.txt john eskow
## 467 cv466_18722.txt and director
## 475 cv474_10209.txt fierstein is
## 477 cv476_16856.txt david duncan
We print the matches found per pattern.
# For each pattern in pattern1, print the pattern followed by the strings it
# captured in matches1 (helper defined outside this section).
printMatchesPerPattern(pattern1, matches1)
## [1] "PATTERN: created by ([A-z]* [A-z]*)"
## [1] "alan moore" "his last" "the earlier" "the animated"
## [5] "fluid dynamics"
## [1] " "
## [1] "PATTERN: created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*)"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN: screenwriter[s]? ([A-z]* [A-z]*)"
## [1] "terry hayes" "are the" "alex proyas"
## [4] "toby emmerich" "to dumb" "frank cottrell"
## [7] "james schamus" "karey kirkpatrick" "can make"
## [10] "j " "paul rudnick" "amongst the"
## [13] "for the" "of the" "marc klein"
## [16] "andy breckman" "use the" "ehren kruger"
## [19] "karey kirkpatrick" "kevin williamson" "kevin williamson"
## [22] "tim herlihy" "explain that" "around "
## [25] "who after" "w " "came in"
## [28] "david goyer" "larry ferguson" "john eskow"
## [31] "and director" "fierstein is" "david duncan"
## [1] " "
## [1] "PATTERN: cinematographer(?: ,)? ([A-z]* [A-z]*)"
## [1] "peter deming" "tak fujimoto" "newton thomas"
## [4] "elliot davis" "robert richardson" "behind krzysztof"
## [7] "remi adafarasin" "amy vincent" "matthieu poirot"
## [10] "roger deakins" "darius khondji" "and like"
## [13] "remi adefarasin" "remi adefarasin" "slawomir idziak"
## [16] "matthew f" "eric gautier" "bill butler"
## [19] "than a"
## [1] " "
## [1] "PATTERN: oscar winner ([A-z]* [A-z]*)"
## [1] "martin childs" "kevin spacey" "tom hanks" "susan sarandon"
## [5] "usual suspects"
## [1] " "
We check how many patterns we have found in each file.
# Tally, per file, how many of the patterns produced a match; judging by the
# output, only files with at least one match are listed.
countMatchesPerRow(matches1)
## File Count
## 1 cv000_29590.txt 5
## 29 cv028_26746.txt 1
## 30 cv029_18643.txt 1
## 68 cv067_19774.txt 1
## 77 cv076_24945.txt 1
## 79 cv078_14730.txt 1
## 87 cv086_18371.txt 1
## 95 cv094_27889.txt 1
## 100 cv099_10534.txt 1
## 109 cv108_15571.txt 1
## 116 cv115_25396.txt 1
## 122 cv121_17302.txt 2
## 144 cv143_19666.txt 1
## 155 cv154_9328.txt 1
## 159 cv158_10390.txt 1
## 160 cv159_29505.txt 1
## 163 cv162_10424.txt 1
## 179 cv178_12972.txt 1
## 191 cv190_27052.txt 1
## 192 cv191_29719.txt 1
## 206 cv205_9457.txt 1
## 216 cv215_22240.txt 1
## 221 cv220_29059.txt 1
## 226 cv225_29224.txt 1
## 241 cv240_14336.txt 1
## 242 cv241_23130.txt 1
## 274 cv273_29112.txt 1
## 275 cv274_25253.txt 1
## 286 cv285_16494.txt 1
## 295 cv294_11684.txt 1
## 298 cv297_10047.txt 1
## 301 cv300_22284.txt 1
## 315 cv314_14422.txt 1
## 318 cv317_24049.txt 1
## 319 cv318_10493.txt 1
## 324 cv323_29805.txt 1
## 337 cv336_10143.txt 1
## 352 cv351_15458.txt 1
## 363 cv362_15341.txt 1
## 366 cv365_11576.txt 1
## 371 cv370_5221.txt 1
## 372 cv371_7630.txt 1
## 387 cv386_10080.txt 1
## 396 cv395_10849.txt 1
## 398 cv397_29023.txt 1
## 405 cv404_20315.txt 1
## 406 cv405_20399.txt 1
## 410 cv409_29786.txt 1
## 411 cv410_24266.txt 1
## 428 cv427_10825.txt 1
## 432 cv431_7085.txt 1
## 433 cv432_14224.txt 1
## 453 cv452_5088.txt 1
## 465 cv464_15650.txt 1
## 467 cv466_18722.txt 1
## 475 cv474_10209.txt 1
## 477 cv476_16856.txt 1
## 485 cv484_25054.txt 1
And we check how many times each pattern has been found.
# Tally how many matches were recorded for each pattern (one row per pattern).
countMatchesPerColumn(matches1)
## Entity Count
## 1 created by ([A-z]* [A-z]*) 5
## 2 created by [A-z]* [A-z]* \\( and ([A-z]* [A-z]*) 1
## 3 screenwriter[s]? ([A-z]* [A-z]*) 33
## 4 cinematographer(?: ,)? ([A-z]* [A-z]*) 19
## 5 oscar winner ([A-z]* [A-z]*) 5
Put all matches in a list for comparison with a gold standard.
# Collapse the detected entities into one list of matches per file and peek at
# the first rows. NOTE(review): allEntities is built outside this section;
# presumably it combines several match tables, not only matches1 -- confirm.
allMatches <- mergeAllMatchesInLists(allEntities)
head(allMatches)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## Matches
## 1 alan moore, eddie campbell, terry hayes, peter deming, martin childs, ghost world, vertical limit
## 2 _election _, _rushmore _, matthew broderick, bill murray, reese witherspoon
## 3 NULL
## 4 roy scheider, murray hamilton, robert shaw, richard dreyfuss
## 5 NULL
## 6 maka kotto, eriq ebouaney, pascal nzonzi
Load the gold standard and put all gold standard matches in a list for comparison.
# Read the semicolon-separated gold standard with every column as character;
# quoting is disabled and empty fields are turned into NA.
goldStandard <- read.table(
  file = "goldStandard.csv",
  sep = ";",
  quote = "",
  na.strings = "",
  colClasses = "character"
)

# Collapse the gold-standard entities into one list per file, mirroring the
# layout of the detected matches, and show the first rows.
allMatchesGold <- mergeGoldStandardInLists(goldStandard)
head(allMatchesGold)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3 ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4 john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5 herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6 raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi
Calculate the metrics (precision, recall, f-measure).
# Compare the detected matches against the gold standard and display the
# resulting precision, recall and F-measure.
metrics <- calculateMetrics(allMatches, allMatchesGold)
metrics
## Precision Recall Fmeasure
## 1 0.9364508 0.12492 0.2204347