The goal of this document is to show a sample script for pattern-based entity recognition over text documents using a gazetteer. It mainly uses the openNLP (natural language processing), the tm (text mining) and the SPARQL packages in R.
I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks to everyone for sharing.
Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.
# Print the current working directory.
getwd()
## [1] "/Users/raul/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R"
# Move to the folder that holds the script's data; the relative paths used
# further down (corpus folder, output files) are resolved from here.
setwd("~/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R")
Now we load the required libraries. Only a couple of things to mention:
The annotate function used with openNLP has to be called with its package name (i.e., NLP::annotate) because of a name clash with ggplot2.
# Needed for OutOfMemoryError: Java heap space
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging
library(NLP)
library(openNLP)
library(openNLPmodels.en)
library(tm)
library(stringr)
library(SPARQL)
library(parallel)
getAnnotationsFromDocument returns annotations for the text document: word, sentence, and part-of-speech annotations.
As an alternative, the koRpus package uses TreeTagger for POS tagging.
# Returns annotations for the text document: sentence, word and
# part-of-speech (POS) annotations.
# As an alternative, the koRpus package uses TreeTagger for POS tagging.
#
# Args:
#   doc: a text document; it is coerced to a String with as.String().
# Returns:
#   The result of the final NLP::annotate() call, i.e. the sentence/word
#   annotations extended with the output of the POS tagger.
getAnnotationsFromDocument <- function(doc) {
  x <- as.String(doc)
  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  word_token_annotator <- Maxent_Word_Token_Annotator()
  pos_tag_annotator <- Maxent_POS_Tag_Annotator()
  # Call annotate() through its namespace: other packages (e.g. ggplot2) also
  # export a function named annotate(), and the one attached last would
  # otherwise mask the NLP version.
  y1 <- NLP::annotate(x, list(sent_token_annotator, word_token_annotator))
  # The POS tagger needs the sentence and word annotations computed above.
  y2 <- NLP::annotate(x, pos_tag_annotator, y1)
  y2
}
getAnnotatedMergedDocument returns the text document merged with the annotations.
# Renders a document as a single string of "token/POS" pairs separated by
# spaces.
#
# Args:
#   doc: a text document; it is coerced to a String with as.String().
#   annotations: annotations of that document; only those of type "word" are
#     used, and the "POS" entry of their features is read.
# Returns:
#   One character string with every word followed by "/" and its POS tag.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  word_annotations <- subset(annotations, type == "word")
  pos_tags <- sapply(word_annotations$features, function(f) f[["POS"]])
  # Indexing the String with the word annotations yields the word tokens.
  tagged_tokens <- sprintf("%s/%s", text[word_annotations], pos_tags)
  paste(tagged_tokens, collapse = " ")
}
getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.
# Bundles a document and its annotations into one AnnotatedPlainTextDocument.
#
# Args:
#   doc: a text document; it is coerced to a String with as.String().
#   annotations: the annotations to attach to the text.
# Returns:
#   The object built by AnnotatedPlainTextDocument() from both inputs.
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}
detectPatternOnDocument returns the pattern detected on an AnnotatedPlainTextDocument.
# Looks for the first occurrence of a regular expression in a document.
#
# Args:
#   doc: a text document; it is coerced to a single String with as.String().
#   pattern: regular expression handed to str_match().
# Returns:
#   - Pattern without capture groups: the 1x1 result of str_match(), i.e. the
#     matched text or NA when there is no match.
#   - Pattern with capture groups: NA when no group matched; otherwise the
#     matched groups joined by single spaces, with one leading space (kept so
#     the output format is the same as before; callers trim it).
detectPatternOnDocument <- function(doc, pattern) {
  x <- as.String(doc)
  res <- str_match(x, pattern)
  # A single cell means the pattern has no capture groups.
  if (length(res) == 1) {
    return(res)
  }
  # Columns 2..k hold the capture groups. Groups that did not take part in the
  # match are NA and are dropped, so they are not pasted as the text "NA".
  groups <- res[, 2:length(res)]
  groups <- groups[!is.na(groups)]
  if (length(groups) == 0) {
    return(NA)
  }
  paste0(" ", paste(groups, collapse = " "))
}
detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.
# Builds a data frame with all the patterns detected in a corpus.
#
# Args:
#   corpus: collection of documents; each element is passed to
#     detectPatternOnDocument() and its id is read with meta().
#   patterns: character vector of regular expressions.
# Returns:
#   A data frame with one row per document: column "File" holds the document
#   id and there is one further column per pattern (named after the pattern)
#   holding the detected text or NA.
detectPatternsInCorpus <- function(corpus, patterns) {
  vallEntities <- data.frame(
    matrix(NA, ncol = length(patterns) + 1, nrow = length(corpus))
  )
  names(vallEntities) <- c("File", patterns)
  # Nothing to search: return the empty, correctly named frame. Assigning the
  # NULL produced by unlist(list()) below would delete the pattern columns.
  if (length(corpus) == 0) {
    return(vallEntities)
  }
  # seq_along() keeps the loop from running when there are no patterns
  # (1:length(patterns) would iterate over 1 and 0).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(
      mclapply(corpus, detectPatternOnDocument, pattern = patterns[i])
    )
  }
  for (i in seq_along(corpus)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}
countMatchesPerColumn returns the number of matches per pattern/column.
Counts the number of rows (files) with non-NA values for each pattern.
# Counts, for every pattern column, how many rows hold a non-NA value.
#
# Args:
#   df: data frame whose first column identifies the file and whose remaining
#     columns hold one pattern each (NA meaning "no match").
# Returns:
#   A data frame with one row per pattern column: "Entity" (the column name)
#   and "Count" (number of non-NA cells). It has zero rows when df has no
#   pattern columns.
countMatchesPerColumn <- function(df) {
  pattern_columns <- df[-1]
  counts <- vapply(
    pattern_columns,
    function(column) sum(!is.na(column)),
    integer(1),
    USE.NAMES = FALSE
  )
  data.frame(
    Entity = names(df)[-1],
    Count = counts,
    stringsAsFactors = FALSE
  )
}
countMatchesPerRow returns the number of entities per file/row.
Counts the number of columns (patterns) with non-NA values for each file.
# Counts, for every file (row), how many pattern columns hold a non-NA value.
#
# Args:
#   df: data frame with a "File" column first, followed by one column per
#     pattern (NA meaning "no match").
# Returns:
#   A data frame with columns "File" and "Count" containing only the files
#   with at least one match; row names keep the position of the file in df.
countMatchesPerRow <- function(df) {
  # Logical matrix: TRUE where a pattern column has a value for that file.
  # Works for zero rows and for zero pattern columns as well.
  is_match <- !is.na(df[-1])
  counts <- as.integer(unname(rowSums(is_match)))
  entityCountPerFile <- data.frame(
    File = df$File,
    Count = counts,
    stringsAsFactors = FALSE
  )
  entityCountPerFile[entityCountPerFile$Count != 0, ]
}
mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.
# Collects all the matches of each file into a single list per file.
#
# Args:
#   df: data frame whose first column identifies the file and whose remaining
#     columns hold the matches (NA meaning "no match").
# Returns:
#   A data frame with columns "Files" (first column of df) and "Matches" (a
#   list column; each entry is an unnamed list with the non-NA values of that
#   row, empty when the row has no match).
mergeAllMatchesInLists <- function(df) {
  match_columns <- df[-1]
  # seq_len() copes with zero rows, and df[-1] with a frame that has no match
  # columns (2:length(df) would then index columns 2 and 1).
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    row_values <- unlist(match_columns[i, , drop = FALSE], use.names = FALSE)
    as.list(row_values[!is.na(row_values)])
  })
  allMatches <- data.frame(Files = df[, 1], stringsAsFactors = FALSE)
  allMatches$Matches <- matchesPerFile
  allMatches
}
mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.
# Collects the gold standard matches of each file into a single list per file.
#
# Args:
#   df: gold standard data frame; the first column identifies the file and
#     the remaining columns hold the expected matches (NA for unused cells).
# Returns:
#   A data frame with columns "Files" (first column of df) and "Matches" (a
#   list column; each entry is a list with the non-NA values of that row,
#   named after the columns they came from).
mergeGoldStandardInLists <- function(df) {
  match_columns <- df[-1]
  # seq_len() copes with zero rows, and df[-1] with a frame that has no match
  # columns (2:length(df) would then index columns 2 and 1).
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    row_values <- unlist(match_columns[i, , drop = FALSE])
    as.list(row_values[!is.na(row_values)])
  })
  allMatches <- data.frame(Files = df[, 1], stringsAsFactors = FALSE)
  allMatches$Matches <- matchesPerFile
  allMatches
}
calculateMetrics calculates precision, recall and f-measure according to a gold standard.
# Calculates precision, recall and F-measure against a gold standard.
#
# Args:
#   matches: data frame whose second column is a list column with the matches
#     found for each file.
#   matches.gs: data frame with a "Matches" list column (second column) with
#     the gold standard matches of each file.
#     NOTE(review): row i of both frames is compared directly, so both are
#     assumed to list the same files in the same order - confirm at the call.
#   beta: weight of recall relative to precision in the F-measure; the
#     default 1 gives the balanced F1 score.
# Returns:
#   A one-row data frame with columns "Precision", "Recall" and "Fmeasure".
#   Files with an empty gold standard entry are ignored. When nothing is
#   counted the ratios are 0/0 and therefore NaN.
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0
  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) == 0) {
      next
    }
    # Detected matches may carry surrounding spaces; strip them before
    # comparing. trimws() is the base R counterpart of stringr::str_trim().
    found <- trimws(unlist(matches[i, 2]))
    expected <- unname(unlist(matches.gs[i, 2]))
    numCorrect <- numCorrect + length(intersect(found, expected))
    allAnswers <- allAnswers + length(found)
    possibleAnswers <- possibleAnswers + length(expected)
  }
  precision <- numCorrect / allAnswers
  recall <- numCorrect / possibleAnswers
  # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R). The weight is beta
  # squared, not its square root; both agree only for beta = 1.
  fmeasure <- ((beta^2 + 1) * precision * recall) /
    ((beta^2 * precision) + recall)
  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}
We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.
Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.
We are going to load all the reviews found in that directory.
# Directory source over the positive-review folder, read as UTF-8.
source.pos = DirSource("../Corpus/review_polarity/txt_sentoken/pos", encoding = "UTF-8")
# Build the corpus from that directory source.
corpus = Corpus(source.pos)
Let’s take a look at the document in the first entry.
inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4226
##
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.
This step may take long depending on the size of the corpus and on the annotations that we want to identify.
# One annotation result per document, in corpus order.
annotations = lapply(corpus, getAnnotationsFromDocument)
We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Pair every document with its annotations (Map walks both in parallel).
corpus.tagged = Map(getAnnotatedPlainTextDocument, corpus, annotations)
And we can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).
# Same pairing, but producing one "token/POS" string per document.
corpus.taggedText = Map(getAnnotatedMergedDocument, corpus, annotations)
We define a query to obtain (some) actor names in DBpedia.
# Namespace prefix pair that is later handed to SPARQL() through its ns argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")
# PREFIX header that is prepended to the query text.
# NOTE(review): the query below uses rdfs:label but only owl: is declared
# here - presumably the endpoint predefines the rdfs prefix; confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"
# Query: the distinct rdfs:label values of the resources typed as the YAGO
# class Actor109765278, limited to the first 10000 results.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
?actor a <http://dbpedia.org/class/yago/Actor109765278> .
?actor rdfs:label ?label .
}
LIMIT 10000
OFFSET 0
")
Let’s evaluate the query against the SPARQL endpoint.
# DBpedia SPARQL endpoint the query is sent to.
endpointT <- "http://dbpedia.org/sparql"
# No extra request options are passed.
optionsT=""
# Run the query and keep only the results component of what SPARQL() returns.
actors <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results
And take a look at the output of the query.
length(actors)
## [1] 10000
actors[1:30]
## label label.1 label.2
## 1 "Megan Lawrence"@it "Megan Lawrence"@en "Barry James"@it
## label.3 label.4 label.5 label.6
## 1 "Barry James"@en "Al Pacino"@en "آل باتشينو"@ar "Al Pacino"@de
## label.7 label.8 label.9 label.10
## 1 "Al Pacino"@es "Al Pacino"@fr "Al Pacino"@it "アル・パチーノ"@ja
## label.11 label.12 label.13 label.14
## 1 "Al Pacino"@nl "Al Pacino"@pl "Al Pacino"@pt "Пачино, Аль"@ru
## label.15 label.16 label.17 label.18
## 1 "艾尔·帕西诺"@zh "Alan Rickman"@en "ألان ريكمان"@ar "Alan Rickman"@de
## label.19 label.20 label.21
## 1 "Alan Rickman"@es "Alan Rickman"@fr "Alan Rickman"@it
## label.22 label.23 label.24
## 1 "アラン・リックマン"@ja "Alan Rickman"@nl "Alan Rickman"@pl
## label.25 label.26 label.27 label.28
## 1 "Alan Rickman"@pt "Рикман, Алан"@ru "艾倫·瑞克曼"@zh "Albert Finney"@en
## label.29
## 1 "ألبرت فيني"@ar
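We need to clean the output of the query. We need to: keep only the text between the double quotes (dropping the language tag), cut off anything from an opening parenthesis onwards, remove duplicates, replace dots with spaces, and convert the names to lower case.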
# Keep the second piece after splitting on double quotes, i.e. the label
# text without its language tag.
actors.2 <- mclapply(actors, function(x) strsplit(x,'"')[[1]][2])
# Keep only what precedes the first " (" (drops a parenthesised suffix).
actors.3 <- mclapply(actors.2, function(x) strsplit(x,' \\(')[[1]][1])
# Remove duplicated names.
# NOTE(review): this runs before dots are replaced and case is lowered, so
# names differing only in those respects may stay duplicated - confirm
# whether that matters downstream.
actor.names <- unique(actors.3)
# Replace every dot with a space.
actor.names <- mclapply(actor.names, gsub, pattern="\\.", replacement=" ")
# Lower-case the names.
actor.names <- mclapply(actor.names, tolower)
length(actor.names)
## [1] 4565
head(actor.names,10)
## [[1]]
## [1] "megan lawrence"
##
## [[2]]
## [1] "barry james"
##
## [[3]]
## [1] "al pacino"
##
## [[4]]
## [1] "آل باتشينو"
##
## [[5]]
## [1] "アル・パチーノ"
##
## [[6]]
## [1] "пачино, аль"
##
## [[7]]
## [1] "艾尔·帕西诺"
##
## [[8]]
## [1] "alan rickman"
##
## [[9]]
## [1] "ألان ريكمان"
##
## [[10]]
## [1] "アラン・リックマン"
Now we write the gazetteer to a file.
# One name per line, without header or row names. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
write.table(unlist(actor.names), file = "gazetteer.txt", row.names = FALSE, col.names = FALSE, na = "", sep = ";")
We include spaces at both sides of the names, to only match full words.
And we detect the patterns in the corpus.
# Surround every name with one space on each side.
pattern.an <- mclapply(actor.names, function(x) return(paste(" ",x," ",sep = "")))
# Flatten the list into a vector of patterns.
pattern.an=unlist(pattern.an)
# There is some actor named "you" that is spoiling our results; we remove it
pattern.an = pattern.an[grep("^ you $", pattern.an, invert = TRUE)]
# NOTE(review): the names are used as regular expressions without escaping
# (detectPatternOnDocument passes them to str_match) - confirm that none of
# them contains regex metacharacters.
# Search every pattern in every document of the corpus.
matches.an = detectPatternsInCorpus(corpus, pattern.an)
Let’s see how many patterns we have found per file.
countMatchesPerRow(matches.an)
## File Count
## 1 cv000_29590.txt 1
## 3 cv002_15918.txt 1
## 4 cv003_11664.txt 1
## 5 cv004_11636.txt 1
## 6 cv005_29443.txt 1
## 7 cv006_15448.txt 1
## 8 cv007_4968.txt 1
## 10 cv009_29592.txt 1
## 11 cv010_29198.txt 1
## 15 cv014_13924.txt 1
## 17 cv016_4659.txt 1
## 19 cv018_20137.txt 1
## 21 cv020_8825.txt 1
## 22 cv021_15838.txt 1
## 25 cv024_6778.txt 2
## 26 cv025_3108.txt 1
## 27 cv026_29325.txt 1
## 28 cv027_25219.txt 1
## 31 cv030_21593.txt 2
## 32 cv031_18452.txt 1
## 34 cv033_24444.txt 2
## 35 cv034_29647.txt 2
## 38 cv037_18510.txt 1
## 41 cv040_8276.txt 1
## 42 cv041_21113.txt 1
## 43 cv042_10982.txt 1
## 45 cv044_16969.txt 1
## 52 cv051_10306.txt 1
## 54 cv053_21822.txt 1
## 56 cv055_8338.txt 1
## 60 cv059_28885.txt 1
## 61 cv060_10844.txt 1
## 62 cv061_8837.txt 1
## 63 cv062_23115.txt 1
## 65 cv064_24576.txt 1
## 66 cv065_15248.txt 1
## 67 cv066_10821.txt 2
## 72 cv071_12095.txt 1
## 74 cv073_21785.txt 2
## 77 cv076_24945.txt 1
## 79 cv078_14730.txt 1
## 81 cv080_13465.txt 1
## 83 cv082_11080.txt 1
## 85 cv084_13566.txt 1
## 87 cv086_18371.txt 1
## 89 cv088_24113.txt 1
## 90 cv089_11418.txt 1
## 95 cv094_27889.txt 1
## 97 cv096_11474.txt 1
## 99 cv098_15435.txt 1
## 100 cv099_10534.txt 1
## 108 cv107_24319.txt 1
## 110 cv109_21172.txt 1
## 112 cv111_11473.txt 1
## 113 cv112_11193.txt 1
## 116 cv115_25396.txt 1
## 117 cv116_28942.txt 1
## 118 cv117_24295.txt 2
## 119 cv118_28980.txt 1
## 122 cv121_17302.txt 1
## 123 cv122_7392.txt 2
## 124 cv123_11182.txt 2
## 125 cv124_4122.txt 1
## 127 cv126_28971.txt 1
## 128 cv127_14711.txt 1
## 130 cv129_16741.txt 1
## 131 cv130_17083.txt 1
## 132 cv131_10713.txt 1
## 135 cv134_22246.txt 1
## 137 cv136_11505.txt 1
## 140 cv139_12873.txt 2
## 141 cv140_7479.txt 1
## 142 cv141_15686.txt 1
## 144 cv143_19666.txt 1
## 146 cv145_11472.txt 1
## 150 cv149_15670.txt 1
## 151 cv150_12916.txt 1
## 152 cv151_15771.txt 3
## 154 cv153_10779.txt 1
## 157 cv156_10481.txt 1
## 159 cv158_10390.txt 1
## 160 cv159_29505.txt 2
## 163 cv162_10424.txt 2
## 164 cv163_10052.txt 3
## 166 cv165_22619.txt 2
## 167 cv166_11052.txt 1
## 172 cv171_13537.txt 2
## 173 cv172_11131.txt 1
## 178 cv177_10367.txt 1
## 182 cv181_14401.txt 1
## 183 cv182_7281.txt 1
## 185 cv184_2673.txt 1
## 187 cv186_2269.txt 2
## 191 cv190_27052.txt 2
## 193 cv192_14395.txt 1
## 196 cv195_14528.txt 1
## 200 cv199_9629.txt 2
## 205 cv204_8451.txt 1
## 207 cv206_14293.txt 1
## 211 cv210_9312.txt 1
## 212 cv211_9953.txt 1
## 214 cv213_18934.txt 1
## 219 cv218_24352.txt 1
## 221 cv220_29059.txt 1
## 224 cv223_29066.txt 2
## 225 cv224_17661.txt 1
## 226 cv225_29224.txt 1
## 230 cv229_13611.txt 1
## 234 cv233_15964.txt 1
## 237 cv236_11565.txt 1
## 239 cv238_12931.txt 2
## 242 cv241_23130.txt 1
## 245 cv244_21649.txt 1
## 246 cv245_8569.txt 1
## 249 cv248_13987.txt 2
## 252 cv251_22636.txt 1
## 254 cv253_10077.txt 1
## 257 cv256_14740.txt 1
## 261 cv260_13959.txt 1
## 265 cv264_12801.txt 1
## 267 cv266_25779.txt 3
## 271 cv270_6079.txt 1
## 272 cv271_13837.txt 1
## 273 cv272_18974.txt 1
## 274 cv273_29112.txt 1
## 275 cv274_25253.txt 1
## 279 cv278_13041.txt 1
## 282 cv281_23253.txt 1
## 284 cv283_11055.txt 1
## 285 cv284_19119.txt 3
## 286 cv285_16494.txt 2
## 288 cv287_15900.txt 1
## 290 cv289_6463.txt 1
## 295 cv294_11684.txt 1
## 298 cv297_10047.txt 1
## 299 cv298_23111.txt 2
## 301 cv300_22284.txt 3
## 302 cv301_12146.txt 1
## 304 cv303_27520.txt 1
## 305 cv304_28706.txt 1
## 307 cv306_10364.txt 1
## 309 cv308_5016.txt 1
## 310 cv309_22571.txt 1
## 311 cv310_13091.txt 2
## 312 cv311_16002.txt 2
## 313 cv312_29377.txt 1
## 315 cv314_14422.txt 1
## 316 cv315_11629.txt 4
## 317 cv316_6370.txt 1
## 318 cv317_24049.txt 4
## 319 cv318_10493.txt 1
## 322 cv321_12843.txt 1
## 325 cv324_7082.txt 1
## 326 cv325_16629.txt 1
## 333 cv332_16307.txt 4
## 337 cv336_10143.txt 2
## 342 cv341_24430.txt 2
## 347 cv346_18168.txt 1
## 351 cv350_20670.txt 1
## 355 cv354_8132.txt 1
## 356 cv355_16413.txt 1
## 357 cv356_25163.txt 1
## 360 cv359_6647.txt 1
## 361 cv360_8398.txt 1
## 364 cv363_29332.txt 1
## 365 cv364_12901.txt 2
## 369 cv368_10466.txt 2
## 371 cv370_5221.txt 1
## 374 cv373_20404.txt 1
## 376 cv375_9929.txt 1
## 380 cv379_21963.txt 1
## 382 cv381_20172.txt 1
## 383 cv382_7897.txt 1
## 384 cv383_13116.txt 1
## 387 cv386_10080.txt 1
## 389 cv388_12009.txt 1
## 391 cv390_11345.txt 1
## 392 cv391_10802.txt 1
## 397 cv396_17989.txt 1
## 398 cv397_29023.txt 2
## 399 cv398_15537.txt 3
## 402 cv401_12605.txt 2
## 404 cv403_6621.txt 1
## 405 cv404_20315.txt 1
## 406 cv405_20399.txt 1
## 407 cv406_21020.txt 1
## 410 cv409_29786.txt 1
## 412 cv411_15007.txt 2
## 413 cv412_24095.txt 1
## 415 cv414_10518.txt 2
## 422 cv421_9709.txt 1
## 424 cv423_11155.txt 1
## 428 cv427_10825.txt 1
## 430 cv429_7439.txt 1
## 435 cv434_5793.txt 2
## 437 cv436_19179.txt 1
## 440 cv439_15970.txt 4
## 442 cv441_13711.txt 1
## 450 cv449_8785.txt 1
## 451 cv450_7890.txt 1
## 453 cv452_5088.txt 2
## 454 cv453_10379.txt 1
## 455 cv454_2053.txt 1
## 457 cv456_18985.txt 2
## 458 cv457_18453.txt 1
## 462 cv461_19600.txt 4
## 463 cv462_19350.txt 1
## 466 cv465_22431.txt 1
## 469 cv468_15228.txt 2
## 470 cv469_20630.txt 3
## 474 cv473_7367.txt 2
## 476 cv475_21692.txt 1
## 479 cv478_14309.txt 1
## 484 cv483_16378.txt 1
## 485 cv484_25054.txt 1
## 490 cv489_17906.txt 1
## 492 cv491_12145.txt 1
## 497 cv496_10530.txt 1
## 498 cv497_26980.txt 1
## 499 cv498_8832.txt 1
## 501 cv500_10251.txt 1
## 502 cv501_11657.txt 2
## 503 cv502_10406.txt 1
## 504 cv503_10558.txt 1
## 506 cv505_12090.txt 1
## 508 cv507_9220.txt 1
## 509 cv508_16006.txt 2
## 510 cv509_15888.txt 2
## 511 cv510_23360.txt 1
## 512 cv511_10132.txt 3
## 513 cv512_15965.txt 1
## 514 cv513_6923.txt 2
## 517 cv516_11172.txt 1
## 519 cv518_13331.txt 1
## 520 cv519_14661.txt 2
## 521 cv520_12295.txt 1
## 523 cv522_5583.txt 1
## 524 cv523_16615.txt 1
## 525 cv524_23627.txt 1
## 527 cv526_12083.txt 2
## 529 cv528_10822.txt 1
## 531 cv530_16212.txt 2
## 532 cv531_26486.txt 1
## 533 cv532_6522.txt 3
## 537 cv536_27134.txt 3
## 538 cv537_12370.txt 1
## 539 cv538_28667.txt 1
## 542 cv541_28835.txt 2
## 543 cv542_18980.txt 1
## 544 cv543_5045.txt 2
## 547 cv546_11767.txt 1
## 548 cv547_16324.txt 1
## 553 cv552_10016.txt 2
## 555 cv554_13151.txt 1
## 558 cv557_11449.txt 1
## 561 cv560_17175.txt 1
## 564 cv563_17257.txt 1
## 566 cv565_29572.txt 2
## 575 cv574_22156.txt 2
## 576 cv575_21150.txt 1
## 577 cv576_14094.txt 1
## 578 cv577_28549.txt 1
## 579 cv578_15094.txt 1
## 580 cv579_11605.txt 1
## 581 cv580_14064.txt 1
## 582 cv581_19381.txt 1
## 587 cv586_7543.txt 3
## 589 cv588_13008.txt 2
## 590 cv589_12064.txt 1
## 592 cv591_23640.txt 1
## 594 cv593_10987.txt 2
## 596 cv595_25335.txt 2
## 597 cv596_28311.txt 3
## 598 cv597_26360.txt 1
## 600 cv599_20988.txt 1
## 604 cv603_17694.txt 1
## 605 cv604_2230.txt 1
## 606 cv605_11800.txt 1
## 607 cv606_15985.txt 1
## 609 cv608_23231.txt 1
## 610 cv609_23877.txt 1
## 614 cv613_21796.txt 2
## 615 cv614_10626.txt 1
## 619 cv618_8974.txt 1
## 620 cv619_12462.txt 1
## 621 cv620_24265.txt 2
## 624 cv623_15356.txt 1
## 628 cv627_11620.txt 1
## 629 cv628_19325.txt 1
## 630 cv629_14909.txt 1
## 632 cv631_4967.txt 1
## 633 cv632_9610.txt 2
## 634 cv633_29837.txt 3
## 636 cv635_10022.txt 1
## 640 cv639_10308.txt 1
## 641 cv640_5378.txt 1
## 645 cv644_17154.txt 1
## 647 cv646_15065.txt 1
## 648 cv647_13691.txt 1
## 650 cv649_12735.txt 1
## 654 cv653_19583.txt 1
## 659 cv658_10532.txt 1
## 661 cv660_21893.txt 1
## 662 cv661_2450.txt 2
## 663 cv662_13320.txt 1
## 667 cv666_18963.txt 3
## 668 cv667_18467.txt 1
## 672 cv671_5054.txt 1
## 675 cv674_10732.txt 1
## 676 cv675_21588.txt 2
## 677 cv676_21090.txt 1
## 681 cv680_10160.txt 1
## 685 cv684_11798.txt 1
## 688 cv687_21100.txt 3
## 689 cv688_7368.txt 1
## 693 cv692_15451.txt 1
## 701 cv700_21947.txt 3
## 704 cv703_16143.txt 2
## 705 cv704_15969.txt 3
## 709 cv708_28729.txt 2
## 713 cv712_22920.txt 2
## 715 cv714_18502.txt 1
## 718 cv717_15953.txt 1
## 719 cv718_11434.txt 1
## 720 cv719_5713.txt 1
## 721 cv720_5389.txt 1
## 722 cv721_29121.txt 1
## 724 cv723_8648.txt 1
## 725 cv724_13681.txt 2
## 727 cv726_4719.txt 1
## 728 cv727_4978.txt 1
## 729 cv728_16133.txt 2
## 731 cv730_10279.txt 1
## 732 cv731_4136.txt 2
## 734 cv733_9839.txt 1
## 736 cv735_18801.txt 1
## 737 cv736_23670.txt 1
## 740 cv739_11209.txt 1
## 741 cv740_12445.txt 1
## 742 cv741_11890.txt 2
## 745 cv744_10038.txt 1
## 746 cv745_12773.txt 1
## 747 cv746_10147.txt 3
## 750 cv749_17765.txt 1
## 751 cv750_10180.txt 1
## 752 cv751_15719.txt 1
## 758 cv757_10189.txt 1
## 759 cv758_9671.txt 2
## 762 cv761_12620.txt 4
## 769 cv768_11751.txt 1
## 774 cv773_18817.txt 2
## 778 cv777_10094.txt 1
## 780 cv779_17881.txt 1
## 781 cv780_7984.txt 3
## 782 cv781_5262.txt 1
## 783 cv782_19526.txt 2
## 786 cv785_22600.txt 3
## 788 cv787_13743.txt 1
## 791 cv790_14600.txt 1
## 795 cv794_15868.txt 3
## 797 cv796_15782.txt 4
## 798 cv797_6957.txt 1
## 799 cv798_23531.txt 1
## 800 cv799_18543.txt 1
## 803 cv802_28664.txt 4
## 806 cv805_19601.txt 1
## 808 cv807_21740.txt 1
## 810 cv809_5009.txt 2
## 812 cv811_21386.txt 1
## 814 cv813_6534.txt 2
## 816 cv815_22456.txt 1
## 818 cv817_4041.txt 1
## 819 cv818_10211.txt 2
## 820 cv819_9364.txt 1
## 821 cv820_22892.txt 1
## 825 cv824_8838.txt 1
## 827 cv826_11834.txt 1
## 828 cv827_18331.txt 1
## 830 cv829_20289.txt 2
## 831 cv830_6014.txt 2
## 835 cv834_22195.txt 2
## 837 cv836_12968.txt 1
## 838 cv837_27325.txt 1
## 840 cv839_21467.txt 1
## 841 cv840_16321.txt 3
## 842 cv841_3967.txt 3
## 846 cv845_14290.txt 1
## 847 cv846_29497.txt 1
## 848 cv847_1941.txt 2
## 850 cv849_15729.txt 1
## 853 cv852_27523.txt 1
## 857 cv856_29013.txt 1
## 859 cv858_18819.txt 1
## 861 cv860_13853.txt 1
## 866 cv865_2895.txt 2
## 867 cv866_29691.txt 1
## 869 cv868_11948.txt 1
## 870 cv869_23611.txt 1
## 871 cv870_16348.txt 2
## 872 cv871_24888.txt 1
## 873 cv872_12591.txt 1
## 874 cv873_18636.txt 1
## 876 cv875_5754.txt 1
## 877 cv876_9390.txt 2
## 879 cv878_15694.txt 1
## 881 cv880_29800.txt 2
## 883 cv882_10026.txt 3
## 885 cv884_13632.txt 1
## 891 cv890_3977.txt 1
## 892 cv891_6385.txt 1
## 894 cv893_26269.txt 3
## 898 cv897_10837.txt 1
## 900 cv899_16014.txt 2
## 901 cv900_10331.txt 1
## 904 cv903_17822.txt 2
## 907 cv906_11491.txt 1
## 909 cv908_16009.txt 2
## 911 cv910_20488.txt 1
## 913 cv912_5674.txt 1
## 914 cv913_29252.txt 2
## 917 cv916_15467.txt 1
## 919 cv918_2693.txt 1
## 920 cv919_16380.txt 1
## 922 cv921_12747.txt 1
## 923 cv922_10073.txt 3
## 925 cv924_29540.txt 1
## 927 cv926_17059.txt 1
## 928 cv927_10681.txt 1
## 930 cv929_16908.txt 2
## 931 cv930_13475.txt 1
## 934 cv933_23776.txt 2
## 935 cv934_19027.txt 1
## 936 cv935_23841.txt 1
## 939 cv938_10220.txt 1
## 943 cv942_17082.txt 2
## 946 cv945_12160.txt 1
## 947 cv946_18658.txt 1
## 950 cv949_20112.txt 2
## 952 cv951_10926.txt 1
## 953 cv952_25240.txt 1
## 955 cv954_18628.txt 1
## 961 cv960_29007.txt 1
## 963 cv962_9803.txt 2
## 964 cv963_6895.txt 2
## 965 cv964_6021.txt 1
## 968 cv967_5788.txt 2
## 971 cv970_18450.txt 1
## 972 cv971_10874.txt 2
## 974 cv973_10066.txt 1
## 979 cv978_20929.txt 1
## 980 cv979_18921.txt 1
## 982 cv981_14989.txt 2
## 986 cv985_6359.txt 1
## 987 cv986_13527.txt 1
## 989 cv988_18740.txt 1
## 991 cv990_11591.txt 3
## 996 cv995_21821.txt 1
## 998 cv997_5046.txt 1
## 999 cv998_14111.txt 2
Let’s see which patterns we have found.
countColum = countMatchesPerColumn(matches.an)
countColum[countColum$Count != 0,]
## Entity Count
## 3 al pacino 6
## 8 alan rickman 4
## 13 albert finney 1
## 18 alex cox 1
## 25 andie macdowell 3
## 34 antonio banderas 5
## 43 ashley judd 6
## 53 ava gardner 1
## 69 blake edwards 1
## 75 brad pitt 11
## 80 bruce lee 5
## 85 buster keaton 1
## 90 cameron diaz 15
## 105 cary grant 2
## 116 charlton heston 3
## 129 christopher lee 2
## 134 clark gable 1
## 144 clint eastwood 5
## 152 dan aykroyd 2
## 161 dark angel 2
## 167 data 5
## 177 denise richards 9
## 182 desmond llewelyn 2
## 193 dustin hoffman 8
## 232 fish 20
## 235 franco zeffirelli 1
## 239 fred gwynne 1
## 241 freddie prinze 3
## 242 gary cooper 1
## 251 gillian anderson 5
## 256 glenn close 9
## 261 goldie 2
## 262 goldie hawn 1
## 272 gwyneth paltrow 8
## 282 harrison ford 13
## 300 helen hunt 5
## 305 henry fonda 1
## 331 ingrid bergman 1
## 358 jean reno 1
## 363 jeff bridges 9
## 368 jennifer aniston 2
## 383 joan crawford 1
## 388 john carpenter 3
## 393 john denver 4
## 405 john travolta 16
## 410 jon pertwee 2
## 413 judy garland 1
## 418 juliette binoche 1
## 428 kate winslet 8
## 439 kenneth williams 6
## 462 leonardo dicaprio 8
## 467 leslie nielsen 2
## 472 luc besson 3
## 487 marty feldman 1
## 490 mary martin 1
## 503 meg ryan 13
## 512 mel gibson 12
## 535 milton berle 1
## 544 natalie portman 14
## 565 olivia newton-john 2
## 584 paul newman 3
## 595 prince 19
## 619 richard burton 1
## 635 roberto benigni 5
## 645 robin williams 18
## 650 rosie o'donnell 5
## 656 rutger hauer 1
## 671 sam raimi 5
## 681 sandra bullock 9
## 695 sergio leone 2
## 719 sophie marceau 2
## 724 spencer tracy 1
## 734 ted demme 1
## 750 terry gilliam 4
## 760 the three stooges 1
## 764 three stooges 1
## 778 thora birch 3
## 788 tim burton 11
## 793 tim curry 3
## 805 tom cruise 15
## 810 tom hanks 19
## 828 tricky 5
## 830 tupac shakur 1
## 835 val kilmer 6
## 857 vivien leigh 1
## 862 voice-over 15
## 875 wallace shawn 3
## 880 werner herzog 2
## 890 whoopi goldberg 4
## 920 william shakespeare 3
## 927 winona ryder 8
## 934 woody allen 16
## 1025 dennis quaid 7
## 1113 joe flynn 1
## 1168 peter weller 2
## 1186 robert hays 1
## 1224 aaron spelling 2
## 1248 adrienne posta 1
## 1285 alice krige 1
## 1289 alicia witt 2
## 1384 bebe 2
## 1388 bebe neuwirth 2
## 1436 brendan sexton iii 1
## 1502 charles keating 1
## 1543 christopher lloyd 3
## 1569 colm feore 1
## 1620 david morse 7
## 1677 don knotts 1
## 1680 donald stewart 1
## 1804 george marshall 1
## 1874 heather matarazzo 1
## 1905 henry thomas 1
## 2055 jamie kennedy 9
## 2101 jennifer lien 1
## 2163 john henderson 1
## 2168 john landis 1
## 2281 kathleen quinlan 2
## 2307 kieran culkin 1
## 2501 max schreck 2
## 2514 michael chapman 1
## 2538 mika boorem 2
## 2599 nanni moretti 1
## 2656 oded fehr 1
## 2663 orson bean 1
## 2714 peter macneill 2
## 2732 piper laurie 1
## 2890 ron perlman 2
## 2946 sam neill 4
## 2965 santo 1
## 3060 straight up 1
## 3100 thunder 3
## 3109 timothy olyphant 5
## 3114 timothy spall 2
## 3149 tsutomu yamazaki 1
## 3156 twilight 8
## 3170 vanity 8
## 3203 william hickey 1
## 3211 william petersen 1
## 3220 wink 5
## 3584 eartha 1
## 3618 eriq ebouaney 1
## 3655 gem 28
## 3784 jerry springer 7
## 4117 oliver stone 7
## 4123 oprah winfrey 1
## 4276 sami bouajila 1
Now we write the results to a file.
# Semicolon-separated export with empty cells for NA. FALSE is spelled out
# because F is an ordinary variable that can be reassigned.
write.table(matches.an, file = "allEntitiesGazetteer.csv", row.names = FALSE, na = "", sep = ";")
Let’s put all matches in a list for comparison with a gold standard.
allMatches = mergeAllMatchesInLists(matches.an)
head(allMatches,10)
## Files Matches
## 1 cv000_29590.txt tim burton
## 2 cv001_18431.txt NULL
## 3 cv002_15918.txt meg ryan
## 4 cv003_11664.txt paul newman
## 5 cv004_11636.txt bruce lee
## 6 cv005_29443.txt eriq ebouaney
## 7 cv006_15448.txt jennifer lien
## 8 cv007_4968.txt woody allen
## 9 cv008_29435.txt NULL
## 10 cv009_29592.txt gem
Now we load the gold standard and put all gold standard matches in a list for comparison.
goldStandard = read.table(file = "goldStandard.csv", quote = "", na.strings=c(""), colClasses="character", sep=";")
allMatchesGold = mergeGoldStandardInLists(goldStandard)
head(allMatchesGold,10)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## 7 cv006_15448.txt
## 8 cv007_4968.txt
## 9 cv008_29435.txt
## 10 cv009_29592.txt
## Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3 ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4 john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5 herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6 raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi
## 7 tony kaye, edward norton, norton, derek vinyard, danny, edward furlong, beverly d'angelo, davin, jennifer lien, derek, kaye, avery brooks, furlong, d'angelo, lien
## 8 betsy, molly ringwald, alan alda, ringwald, alda, dylan walsh, walsh, madeline kahn, ally sheedy, sheedy, anthony lapaglia, lapaglia, stevie dee, robert de niro, alec baldwin, de niro, joe pesci, catherine o'hara, woody allen
## 9 lumumba, janssens, rudi delhem, moise tshombe, pascal nzonzi, mobutu, joseph kasa vubu, maka kotto, peck, bonitzer, ebouaney
## 10 schwartznager, stallone, van damme, rongguang yu, wong fei-hong, jackie chan, fei-hong, sze-man tsang, wong kei-ying, yen chi dan, yuen wo ping, fox
Finally, we calculate the metrics.
metrics = calculateMetrics(allMatches, allMatchesGold)
metrics
## Precision Recall Fmeasure
## 1 0.7340659 0.03414435 0.06525349