Introduction

The goal of this document is to show a sample script for pattern-based entity recognition over text documents using a gazetteer. It mainly uses the openNLP (natural language processing), the tm (text mining) and the SPARQL packages in R.

I cannot claim full authorship of this document, since I have taken code snippets from, and have been inspired by, multiple books and documents on the Web. Thanks to everyone for sharing.

Preparation

Check working directory

Check the working directory with getwd. If it is not the one where your data are located, change it with setwd.

# Print the current working directory.
getwd()
## [1] "/Users/raul/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R"
# Machine-specific path: replace it with the folder that holds your own data.
setwd("~/ownCloud/Trabajo/Docencia/2015 Intelligent Systems/R")

Load libraries

Now we load the required libraries. Only a couple of things to mention:

  • The annotate function belongs to the NLP package; if ggplot2 is also loaded, it must be called with its package name (i.e., NLP::annotate), because both packages export a function called annotate
  • The memory allocated to Java has to be increased to avoid out-of-memory problems
# Needed for OutOfMemoryError: Java heap space 
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging

library(NLP)
library(openNLP) 
library(openNLPmodels.en)
library(tm)
library(stringr)
library(SPARQL)
library(parallel)

Auxiliary functions

getAnnotationsFromDocument

getAnnotationsFromDocument returns annotations for the text document: word, sentence, and part-of-speech annotations.

As an alternative, the koRpus package uses TreeTagger for POS tagging.

# Returns annotations for the text document: sentence, word and POS.
# As an alternative, the koRpus package uses TreeTagger for POS tagging.
#
# Args:
#   doc: a document; it is converted to a single string with as.String().
# Returns:
#   The result of the second NLP::annotate() call, i.e. the sentence and
#   word annotations after the POS tagger has been run on top of them.
getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)
  tokenizers <- list(
    Maxent_Sent_Token_Annotator(),
    Maxent_Word_Token_Annotator()
  )
  # annotate() is namespace-qualified so that it keeps resolving to the NLP
  # version even when another attached package (e.g. ggplot2) exports a
  # function with the same name.
  tokens <- NLP::annotate(text, tokenizers)
  # The POS tagger is run on top of the sentence/word annotations.
  NLP::annotate(text, Maxent_POS_Tag_Annotator(), tokens)
}

getAnnotatedMergedDocument

getAnnotatedMergedDocument returns the text document merged with the annotations.

# Renders a document as one string of "token/TAG" pairs separated by spaces.
#
# Args:
#   doc: the document that was annotated.
#   annotations: its annotations; the POS feature of every "word"
#     annotation is read.
# Returns:
#   A single character string with one "token/TAG" entry per word.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  pos_tags <- sapply(words$features, function(feature) feature[["POS"]])
  # Indexing the String with the word annotations yields the token texts.
  paste(sprintf("%s/%s", text[words], pos_tags), collapse = " ")
}

getAnnotatedPlainTextDocument

getAnnotatedPlainTextDocument returns the text document along with its annotations in an AnnotatedPlainTextDocument.

# Bundles a document and its annotations in an AnnotatedPlainTextDocument.
#
# Args:
#   doc: the document; it is converted to a single string with as.String().
#   annotations: the annotations computed for that document.
# Returns:
#   The object built by AnnotatedPlainTextDocument().
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

detectPatternOnDocument

detectPatternOnDocument returns the text matched by a pattern in a document (anything that can be converted to a string), or NA when the pattern is not found.

# Looks for a pattern in a document.
#
# Args:
#   doc: a document; it is converted to a single string with as.String().
#   pattern: the pattern handed to str_match().
# Returns:
#   - pattern without capture groups: the one-cell result of str_match()
#     (the matched text, or NA when nothing matched);
#   - pattern with capture groups: NA when every group is NA, otherwise the
#     groups pasted into one string, each preceded by a single space.
detectPatternOnDocument <- function(doc, pattern) {
  found <- str_match(as.String(doc), pattern)
  n_cells <- length(found)

  # A single cell means the pattern has no capture groups.
  if (n_cells == 1) {
    return(found)
  }

  if (all(is.na(found[, 2:n_cells]))) {
    return(NA)
  }

  # Equivalent to pasting the groups one by one onto an empty start value,
  # which is why the result begins with a space.
  paste(c("", found[2:n_cells]), collapse = " ")
}

detectPatternsInCorpus

detectPatternsInCorpus returns a data frame with all the patterns detected in a corpus.

# Runs every pattern over every document of a corpus.
#
# Args:
#   corpus: the corpus to scan; meta(corpus[[i]])$id is used as file name.
#   patterns: character vector of patterns; each one becomes a column.
# Returns:
#   A data frame with one row per document: a "File" column followed by one
#   column per pattern holding what detectPatternOnDocument() returned for
#   that document (NA when the pattern was not found).
detectPatternsInCorpus <- function(corpus, patterns) {
  n_docs <- length(corpus)
  vallEntities <- data.frame(
    matrix(NA, ncol = length(patterns) + 1, nrow = n_docs)
  )
  names(vallEntities) <- c("File", patterns)

  # Empty corpus: return the empty frame as is. Assigning the NULL that
  # unlist() yields for zero documents would silently drop the columns.
  if (n_docs == 0) {
    return(vallEntities)
  }

  # seq_along()/seq_len() instead of 1:n, so that zero patterns really means
  # zero iterations (1:0 would iterate over 1 and 0).
  for (i in seq_along(patterns)) {
    vallEntities[, i + 1] <- unlist(
      mclapply(corpus, detectPatternOnDocument, pattern = patterns[i])
    )
  }
  for (i in seq_len(n_docs)) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }
  vallEntities
}

countMatchesPerColumn

countMatchesPerColumn returns the number of matches per pattern/column.

Counts the number of rows (files) with non-NA values for each pattern column.

# Counts, for every pattern column, how many rows hold a non-NA value.
#
# Args:
#   df: data frame whose first column identifies the file and whose
#     remaining columns hold the matches of one pattern each.
# Returns:
#   A data frame with columns "Entity" (the pattern column name) and "Count"
#   (number of non-NA rows). It has zero rows when df has no pattern
#   columns; the former 2:length(names(df)) loop failed in that case.
countMatchesPerColumn <- function(df) {
  pattern_cols <- df[-1]
  data.frame(
    Entity = names(pattern_cols),
    Count = as.integer(colSums(!is.na(pattern_cols))),
    stringsAsFactors = FALSE
  )
}

countMatchesPerRow

countMatchesPerRow returns the number of entities per file/row.

Counts the number of columns (patterns) with non-NA values for each file; files without any match are left out of the result.

# Counts, for every file, how many pattern columns hold a non-NA value.
#
# Args:
#   df: data frame with a "File" column first and one column per pattern.
# Returns:
#   A data frame with columns "File" and "Count", restricted to the files
#   with at least one match. Row names keep the position of the file in df.
#   Empty frames and frames without pattern columns give a zero-row result;
#   the former 1:nrow(df) / 2:length(...) loops failed on them.
countMatchesPerRow <- function(df) {
  counts <- data.frame(
    File = df$File,
    Count = as.integer(rowSums(!is.na(df[-1]))),
    stringsAsFactors = FALSE
  )
  counts[counts$Count != 0, ]
}

mergeAllMatchesInLists

mergeAllMatchesInLists returns a data frame with all the files and their matches in a single list per file.

# Collects the matches of every file in a single list per file.
#
# Args:
#   df: data frame whose first column identifies the file and whose
#     remaining columns hold the matches (NA where a pattern did not match).
# Returns:
#   A data frame with columns "Files" (first column of df) and "Matches"
#   (a list column; each entry is the unnamed list of non-NA values of that
#   row, possibly empty). Frames with zero rows or without pattern columns
#   are handled; the former 1:nrow(df) / 2:length(...) ranges were not.
mergeAllMatchesInLists <- function(df) {
  pattern_cols <- df[-1]

  matches_per_file <- lapply(seq_len(nrow(df)), function(i) {
    row <- as.list(pattern_cols[i, , drop = FALSE])
    present <- row[!vapply(row, is.na, logical(1))]
    as.list(unname(unlist(present)))
  })

  all_matches <- data.frame(Files = df[[1]], stringsAsFactors = FALSE)
  all_matches$Matches <- matches_per_file
  all_matches
}

mergeGoldStandardInLists

mergeGoldStandardInLists returns a data frame with all the files and the gold standard matches in a single list per file.

# Collects the gold-standard entries of every file in a single list per file.
#
# Args:
#   df: data frame whose first column identifies the file and whose
#     remaining columns hold the expected entities (NA where there is none).
# Returns:
#   A data frame with columns "Files" (first column of df) and "Matches"
#   (a list column; each entry is the list of non-NA values of that row,
#   named after the column they came from, possibly empty). Frames with
#   zero rows or without entity columns are handled; the former
#   1:nrow(df) / 2:length(df) ranges were not.
mergeGoldStandardInLists <- function(df) {
  gold_cols <- df[-1]

  gold_per_file <- lapply(seq_len(nrow(df)), function(i) {
    # drop = FALSE keeps the column names even with a single entity column.
    row <- as.list(gold_cols[i, , drop = FALSE])
    present <- row[!vapply(row, is.na, logical(1))]
    as.list(unlist(present))
  })

  all_matches <- data.frame(Files = df[[1]], stringsAsFactors = FALSE)
  all_matches$Matches <- gold_per_file
  all_matches
}

calculateMetrics

calculateMetrics calculates precision, recall and f-measure according to a gold standard.

# Calculates precision, recall and F-measure against a gold standard.
#
# Args:
#   matches: data frame with the detected entities; its second column is a
#     list column with the matches of each file.
#   matches.gs: data frame with the gold standard, row-aligned with matches;
#     its "Matches" column (second column) lists the expected entities.
#   beta: weight of recall relative to precision in the F-measure
#     (default 1, the balanced F1 that was previously hard-coded).
# Returns:
#   A one-row data frame with columns "Precision", "Recall" and "Fmeasure".
#   Files whose gold standard is empty are skipped altogether. The ratios
#   are NaN when there is nothing to divide by (e.g. no answers at all).
calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0

  # seq_len() so that an empty data frame means zero iterations.
  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) != 0) {
      # Detected matches carry the padding spaces of their patterns.
      found <- trimws(unlist(matches[i, 2]))
      expected <- unname(unlist(matches.gs[i, 2]))

      numCorrect <- numCorrect + length(intersect(found, expected))
      allAnswers <- allAnswers + length(found)
      possibleAnswers <- possibleAnswers + length(expected)
    }
  }

  precision <- numCorrect / allAnswers
  recall <- numCorrect / possibleAnswers

  # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R). The weight is beta
  # squared, not its square root; both only coincide for beta = 1.
  beta_sq <- beta^2
  f_measure <- ((beta_sq + 1) * precision * recall) /
    ((beta_sq * precision) + recall)

  data.frame(Precision = precision, Recall = recall, Fmeasure = f_measure)
}

Load corpus

We are going to use the Movie review data version 2.0, created by Bo Pang and Lillian Lee.

Once unzipped, the data splits the different documents into positive and negative opinions. In this script we are going to use the positive opinions located in ./txt_sentoken/pos.

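Note that the code below loads every review in that directory (the results further down include files well beyond the 500th). To work with only the first 500 reviews, subset the corpus after loading it, e.g. corpus[1:500].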

# Every file in the positive-review folder becomes one document.
source.pos <- DirSource(
  directory = "../Corpus/review_polarity/txt_sentoken/pos",
  encoding = "UTF-8"
)
corpus <- Corpus(source.pos)

Inspect corpus

Let’s take a look at the document in the first entry.

inspect(corpus[[1]])
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 4226
## 
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
## in other words , don't dismiss this film because of its source . 
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ? 
## the ghetto in question is , of course , whitechapel in 1888 london's east end . 
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision . 
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case . 
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium . 
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach . 
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay . 
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end . 
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts . 
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) . 
## don't worry - it'll all make sense when you see it . 
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) . 
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic . 
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place . 
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent . 
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham . 
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad . 
## the film , however , is all good . 
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content

Annotate corpus

We just apply the getAnnotationsFromDocument function to every document in the corpus using lapply.

This step may take a long time, depending on the size of the corpus and on the annotations that we want to identify.

# One annotation object per document, in corpus order.
annotations <- lapply(X = corpus, FUN = getAnnotationsFromDocument)

We can create AnnotatedPlainTextDocuments that attach the annotations to the document and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Pair every document with its annotations (Map is mapply without
# simplification, so the result is a plain list).
corpus.tagged <- mapply(
  getAnnotatedPlainTextDocument, corpus, annotations,
  SIMPLIFY = FALSE
)

And we can also store all the annotations inline with the text and store the annotated corpus in another variable (since we destroy the corpus metadata).

# Same pairing as above, but rendering each document as inline-tagged text.
corpus.taggedText <- mapply(
  getAnnotatedMergedDocument, corpus, annotations,
  SIMPLIFY = FALSE
)

Get actor names from DBpedia

We define a query to obtain (some) actor names in DBpedia.

# Namespace prefix handed to SPARQL() through its ns argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# PREFIX declarations placed in front of the query. rdfs is declared
# explicitly because the query uses rdfs:label; without the declaration the
# query only works on endpoints that predefine that prefix.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
"

# Labels (in every available language) of up to 10000 resources typed as
# actors.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/Actor109765278> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

Let’s evaluate the query against the SPARQL endpoint.

# Endpoint to query and extra options (none) for the request.
endpointT <- "http://dbpedia.org/sparql"
optionsT <- ""

# Only the results component of the answer is kept.
actors <- SPARQL(
  url = endpointT,
  query = qT,
  ns = prefixT,
  extra = optionsT
)$results

And take a look at the output of the query.

length(actors)
## [1] 10000
actors[1:30]
##                 label             label.1          label.2
## 1 "Megan Lawrence"@it "Megan Lawrence"@en "Barry James"@it
##            label.3        label.4         label.5        label.6
## 1 "Barry James"@en "Al Pacino"@en "آل باتشينو"@ar "Al Pacino"@de
##          label.7        label.8        label.9            label.10
## 1 "Al Pacino"@es "Al Pacino"@fr "Al Pacino"@it "アル・パチーノ"@ja
##         label.11       label.12       label.13         label.14
## 1 "Al Pacino"@nl "Al Pacino"@pl "Al Pacino"@pt "Пачино, Аль"@ru
##           label.15          label.16         label.17          label.18
## 1 "艾尔·帕西诺"@zh "Alan Rickman"@en "ألان ريكمان"@ar "Alan Rickman"@de
##            label.19          label.20          label.21
## 1 "Alan Rickman"@es "Alan Rickman"@fr "Alan Rickman"@it
##                  label.22          label.23          label.24
## 1 "アラン・リックマン"@ja "Alan Rickman"@nl "Alan Rickman"@pl
##            label.25          label.26         label.27           label.28
## 1 "Alan Rickman"@pt "Рикман, Алан"@ru "艾倫·瑞克曼"@zh "Albert Finney"@en
##          label.29
## 1 "ألبرت فيني"@ar

Clean the query result

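We need to clean the output of the query. We need to:

  • Keep only the text between the double quotes, dropping the language tag (e.g., @en)
  • Remove any parenthesised suffix from the name (the text from " (" onwards)
  • Remove duplicated names
  • Replace dots with spaces
  • Convert the names to lowercase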

# Keep the text between the first pair of double quotes (drops the @lang tag).
actors.2 <- mclapply(actors, function(label) {
  pieces <- strsplit(label, '"')[[1]]
  pieces[2]
})
# Keep what comes before the first " (" (drops a parenthesised suffix).
actors.3 <- mclapply(actors.2, function(name) {
  strsplit(name, ' \\(')[[1]][1]
})
# Duplicates are removed before dots and case are normalised.
actor.names <- unique(actors.3)
actor.names <- mclapply(actor.names, function(name) {
  gsub(pattern = "\\.", replacement = " ", x = name)
})
actor.names <- mclapply(actor.names, tolower)
length(actor.names)
## [1] 4565
head(actor.names,10)
## [[1]]
## [1] "megan lawrence"
## 
## [[2]]
## [1] "barry james"
## 
## [[3]]
## [1] "al pacino"
## 
## [[4]]
## [1] "آل باتشينو"
## 
## [[5]]
## [1] "アル・パチーノ"
## 
## [[6]]
## [1] "пачино, аль"
## 
## [[7]]
## [1] "艾尔·帕西诺"
## 
## [[8]]
## [1] "alan rickman"
## 
## [[9]]
## [1] "ألان ريكمان"
## 
## [[10]]
## [1] "アラン・リックマン"

Write gazetteer to a file

Now we write the gazetteer to a file.

# One name per line, without header or row names.
write.table(
  unlist(actor.names),
  file = "gazetteer.txt",
  sep = ";",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)

Detect patterns

We include spaces at both sides of the names, to only match full words.

And we detect the patterns in the corpus.

# Pad every name with one space on each side.
pattern.an <- unlist(
  mclapply(actor.names, function(name) paste0(" ", name, " "))
)

# There is some actor named "you" that is spoiling our results; we remove it
pattern.an <- pattern.an[!grepl("^ you $", pattern.an)]

# One column per remaining pattern, one row per document.
matches.an <- detectPatternsInCorpus(corpus, pattern.an)

Let’s see how many patterns we have found per file.

countMatchesPerRow(matches.an) 
##                File Count
## 1   cv000_29590.txt     1
## 3   cv002_15918.txt     1
## 4   cv003_11664.txt     1
## 5   cv004_11636.txt     1
## 6   cv005_29443.txt     1
## 7   cv006_15448.txt     1
## 8    cv007_4968.txt     1
## 10  cv009_29592.txt     1
## 11  cv010_29198.txt     1
## 15  cv014_13924.txt     1
## 17   cv016_4659.txt     1
## 19  cv018_20137.txt     1
## 21   cv020_8825.txt     1
## 22  cv021_15838.txt     1
## 25   cv024_6778.txt     2
## 26   cv025_3108.txt     1
## 27  cv026_29325.txt     1
## 28  cv027_25219.txt     1
## 31  cv030_21593.txt     2
## 32  cv031_18452.txt     1
## 34  cv033_24444.txt     2
## 35  cv034_29647.txt     2
## 38  cv037_18510.txt     1
## 41   cv040_8276.txt     1
## 42  cv041_21113.txt     1
## 43  cv042_10982.txt     1
## 45  cv044_16969.txt     1
## 52  cv051_10306.txt     1
## 54  cv053_21822.txt     1
## 56   cv055_8338.txt     1
## 60  cv059_28885.txt     1
## 61  cv060_10844.txt     1
## 62   cv061_8837.txt     1
## 63  cv062_23115.txt     1
## 65  cv064_24576.txt     1
## 66  cv065_15248.txt     1
## 67  cv066_10821.txt     2
## 72  cv071_12095.txt     1
## 74  cv073_21785.txt     2
## 77  cv076_24945.txt     1
## 79  cv078_14730.txt     1
## 81  cv080_13465.txt     1
## 83  cv082_11080.txt     1
## 85  cv084_13566.txt     1
## 87  cv086_18371.txt     1
## 89  cv088_24113.txt     1
## 90  cv089_11418.txt     1
## 95  cv094_27889.txt     1
## 97  cv096_11474.txt     1
## 99  cv098_15435.txt     1
## 100 cv099_10534.txt     1
## 108 cv107_24319.txt     1
## 110 cv109_21172.txt     1
## 112 cv111_11473.txt     1
## 113 cv112_11193.txt     1
## 116 cv115_25396.txt     1
## 117 cv116_28942.txt     1
## 118 cv117_24295.txt     2
## 119 cv118_28980.txt     1
## 122 cv121_17302.txt     1
## 123  cv122_7392.txt     2
## 124 cv123_11182.txt     2
## 125  cv124_4122.txt     1
## 127 cv126_28971.txt     1
## 128 cv127_14711.txt     1
## 130 cv129_16741.txt     1
## 131 cv130_17083.txt     1
## 132 cv131_10713.txt     1
## 135 cv134_22246.txt     1
## 137 cv136_11505.txt     1
## 140 cv139_12873.txt     2
## 141  cv140_7479.txt     1
## 142 cv141_15686.txt     1
## 144 cv143_19666.txt     1
## 146 cv145_11472.txt     1
## 150 cv149_15670.txt     1
## 151 cv150_12916.txt     1
## 152 cv151_15771.txt     3
## 154 cv153_10779.txt     1
## 157 cv156_10481.txt     1
## 159 cv158_10390.txt     1
## 160 cv159_29505.txt     2
## 163 cv162_10424.txt     2
## 164 cv163_10052.txt     3
## 166 cv165_22619.txt     2
## 167 cv166_11052.txt     1
## 172 cv171_13537.txt     2
## 173 cv172_11131.txt     1
## 178 cv177_10367.txt     1
## 182 cv181_14401.txt     1
## 183  cv182_7281.txt     1
## 185  cv184_2673.txt     1
## 187  cv186_2269.txt     2
## 191 cv190_27052.txt     2
## 193 cv192_14395.txt     1
## 196 cv195_14528.txt     1
## 200  cv199_9629.txt     2
## 205  cv204_8451.txt     1
## 207 cv206_14293.txt     1
## 211  cv210_9312.txt     1
## 212  cv211_9953.txt     1
## 214 cv213_18934.txt     1
## 219 cv218_24352.txt     1
## 221 cv220_29059.txt     1
## 224 cv223_29066.txt     2
## 225 cv224_17661.txt     1
## 226 cv225_29224.txt     1
## 230 cv229_13611.txt     1
## 234 cv233_15964.txt     1
## 237 cv236_11565.txt     1
## 239 cv238_12931.txt     2
## 242 cv241_23130.txt     1
## 245 cv244_21649.txt     1
## 246  cv245_8569.txt     1
## 249 cv248_13987.txt     2
## 252 cv251_22636.txt     1
## 254 cv253_10077.txt     1
## 257 cv256_14740.txt     1
## 261 cv260_13959.txt     1
## 265 cv264_12801.txt     1
## 267 cv266_25779.txt     3
## 271  cv270_6079.txt     1
## 272 cv271_13837.txt     1
## 273 cv272_18974.txt     1
## 274 cv273_29112.txt     1
## 275 cv274_25253.txt     1
## 279 cv278_13041.txt     1
## 282 cv281_23253.txt     1
## 284 cv283_11055.txt     1
## 285 cv284_19119.txt     3
## 286 cv285_16494.txt     2
## 288 cv287_15900.txt     1
## 290  cv289_6463.txt     1
## 295 cv294_11684.txt     1
## 298 cv297_10047.txt     1
## 299 cv298_23111.txt     2
## 301 cv300_22284.txt     3
## 302 cv301_12146.txt     1
## 304 cv303_27520.txt     1
## 305 cv304_28706.txt     1
## 307 cv306_10364.txt     1
## 309  cv308_5016.txt     1
## 310 cv309_22571.txt     1
## 311 cv310_13091.txt     2
## 312 cv311_16002.txt     2
## 313 cv312_29377.txt     1
## 315 cv314_14422.txt     1
## 316 cv315_11629.txt     4
## 317  cv316_6370.txt     1
## 318 cv317_24049.txt     4
## 319 cv318_10493.txt     1
## 322 cv321_12843.txt     1
## 325  cv324_7082.txt     1
## 326 cv325_16629.txt     1
## 333 cv332_16307.txt     4
## 337 cv336_10143.txt     2
## 342 cv341_24430.txt     2
## 347 cv346_18168.txt     1
## 351 cv350_20670.txt     1
## 355  cv354_8132.txt     1
## 356 cv355_16413.txt     1
## 357 cv356_25163.txt     1
## 360  cv359_6647.txt     1
## 361  cv360_8398.txt     1
## 364 cv363_29332.txt     1
## 365 cv364_12901.txt     2
## 369 cv368_10466.txt     2
## 371  cv370_5221.txt     1
## 374 cv373_20404.txt     1
## 376  cv375_9929.txt     1
## 380 cv379_21963.txt     1
## 382 cv381_20172.txt     1
## 383  cv382_7897.txt     1
## 384 cv383_13116.txt     1
## 387 cv386_10080.txt     1
## 389 cv388_12009.txt     1
## 391 cv390_11345.txt     1
## 392 cv391_10802.txt     1
## 397 cv396_17989.txt     1
## 398 cv397_29023.txt     2
## 399 cv398_15537.txt     3
## 402 cv401_12605.txt     2
## 404  cv403_6621.txt     1
## 405 cv404_20315.txt     1
## 406 cv405_20399.txt     1
## 407 cv406_21020.txt     1
## 410 cv409_29786.txt     1
## 412 cv411_15007.txt     2
## 413 cv412_24095.txt     1
## 415 cv414_10518.txt     2
## 422  cv421_9709.txt     1
## 424 cv423_11155.txt     1
## 428 cv427_10825.txt     1
## 430  cv429_7439.txt     1
## 435  cv434_5793.txt     2
## 437 cv436_19179.txt     1
## 440 cv439_15970.txt     4
## 442 cv441_13711.txt     1
## 450  cv449_8785.txt     1
## 451  cv450_7890.txt     1
## 453  cv452_5088.txt     2
## 454 cv453_10379.txt     1
## 455  cv454_2053.txt     1
## 457 cv456_18985.txt     2
## 458 cv457_18453.txt     1
## 462 cv461_19600.txt     4
## 463 cv462_19350.txt     1
## 466 cv465_22431.txt     1
## 469 cv468_15228.txt     2
## 470 cv469_20630.txt     3
## 474  cv473_7367.txt     2
## 476 cv475_21692.txt     1
## 479 cv478_14309.txt     1
## 484 cv483_16378.txt     1
## 485 cv484_25054.txt     1
## 490 cv489_17906.txt     1
## 492 cv491_12145.txt     1
## 497 cv496_10530.txt     1
## 498 cv497_26980.txt     1
## 499  cv498_8832.txt     1
## 501 cv500_10251.txt     1
## 502 cv501_11657.txt     2
## 503 cv502_10406.txt     1
## 504 cv503_10558.txt     1
## 506 cv505_12090.txt     1
## 508  cv507_9220.txt     1
## 509 cv508_16006.txt     2
## 510 cv509_15888.txt     2
## 511 cv510_23360.txt     1
## 512 cv511_10132.txt     3
## 513 cv512_15965.txt     1
## 514  cv513_6923.txt     2
## 517 cv516_11172.txt     1
## 519 cv518_13331.txt     1
## 520 cv519_14661.txt     2
## 521 cv520_12295.txt     1
## 523  cv522_5583.txt     1
## 524 cv523_16615.txt     1
## 525 cv524_23627.txt     1
## 527 cv526_12083.txt     2
## 529 cv528_10822.txt     1
## 531 cv530_16212.txt     2
## 532 cv531_26486.txt     1
## 533  cv532_6522.txt     3
## 537 cv536_27134.txt     3
## 538 cv537_12370.txt     1
## 539 cv538_28667.txt     1
## 542 cv541_28835.txt     2
## 543 cv542_18980.txt     1
## 544  cv543_5045.txt     2
## 547 cv546_11767.txt     1
## 548 cv547_16324.txt     1
## 553 cv552_10016.txt     2
## 555 cv554_13151.txt     1
## 558 cv557_11449.txt     1
## 561 cv560_17175.txt     1
## 564 cv563_17257.txt     1
## 566 cv565_29572.txt     2
## 575 cv574_22156.txt     2
## 576 cv575_21150.txt     1
## 577 cv576_14094.txt     1
## 578 cv577_28549.txt     1
## 579 cv578_15094.txt     1
## 580 cv579_11605.txt     1
## 581 cv580_14064.txt     1
## 582 cv581_19381.txt     1
## 587  cv586_7543.txt     3
## 589 cv588_13008.txt     2
## 590 cv589_12064.txt     1
## 592 cv591_23640.txt     1
## 594 cv593_10987.txt     2
## 596 cv595_25335.txt     2
## 597 cv596_28311.txt     3
## 598 cv597_26360.txt     1
## 600 cv599_20988.txt     1
## 604 cv603_17694.txt     1
## 605  cv604_2230.txt     1
## 606 cv605_11800.txt     1
## 607 cv606_15985.txt     1
## 609 cv608_23231.txt     1
## 610 cv609_23877.txt     1
## 614 cv613_21796.txt     2
## 615 cv614_10626.txt     1
## 619  cv618_8974.txt     1
## 620 cv619_12462.txt     1
## 621 cv620_24265.txt     2
## 624 cv623_15356.txt     1
## 628 cv627_11620.txt     1
## 629 cv628_19325.txt     1
## 630 cv629_14909.txt     1
## 632  cv631_4967.txt     1
## 633  cv632_9610.txt     2
## 634 cv633_29837.txt     3
## 636 cv635_10022.txt     1
## 640 cv639_10308.txt     1
## 641  cv640_5378.txt     1
## 645 cv644_17154.txt     1
## 647 cv646_15065.txt     1
## 648 cv647_13691.txt     1
## 650 cv649_12735.txt     1
## 654 cv653_19583.txt     1
## 659 cv658_10532.txt     1
## 661 cv660_21893.txt     1
## 662  cv661_2450.txt     2
## 663 cv662_13320.txt     1
## 667 cv666_18963.txt     3
## 668 cv667_18467.txt     1
## 672  cv671_5054.txt     1
## 675 cv674_10732.txt     1
## 676 cv675_21588.txt     2
## 677 cv676_21090.txt     1
## 681 cv680_10160.txt     1
## 685 cv684_11798.txt     1
## 688 cv687_21100.txt     3
## 689  cv688_7368.txt     1
## 693 cv692_15451.txt     1
## 701 cv700_21947.txt     3
## 704 cv703_16143.txt     2
## 705 cv704_15969.txt     3
## 709 cv708_28729.txt     2
## 713 cv712_22920.txt     2
## 715 cv714_18502.txt     1
## 718 cv717_15953.txt     1
## 719 cv718_11434.txt     1
## 720  cv719_5713.txt     1
## 721  cv720_5389.txt     1
## 722 cv721_29121.txt     1
## 724  cv723_8648.txt     1
## 725 cv724_13681.txt     2
## 727  cv726_4719.txt     1
## 728  cv727_4978.txt     1
## 729 cv728_16133.txt     2
## 731 cv730_10279.txt     1
## 732  cv731_4136.txt     2
## 734  cv733_9839.txt     1
## 736 cv735_18801.txt     1
## 737 cv736_23670.txt     1
## 740 cv739_11209.txt     1
## 741 cv740_12445.txt     1
## 742 cv741_11890.txt     2
## 745 cv744_10038.txt     1
## 746 cv745_12773.txt     1
## 747 cv746_10147.txt     3
## 750 cv749_17765.txt     1
## 751 cv750_10180.txt     1
## 752 cv751_15719.txt     1
## 758 cv757_10189.txt     1
## 759  cv758_9671.txt     2
## 762 cv761_12620.txt     4
## 769 cv768_11751.txt     1
## 774 cv773_18817.txt     2
## 778 cv777_10094.txt     1
## 780 cv779_17881.txt     1
## 781  cv780_7984.txt     3
## 782  cv781_5262.txt     1
## 783 cv782_19526.txt     2
## 786 cv785_22600.txt     3
## 788 cv787_13743.txt     1
## 791 cv790_14600.txt     1
## 795 cv794_15868.txt     3
## 797 cv796_15782.txt     4
## 798  cv797_6957.txt     1
## 799 cv798_23531.txt     1
## 800 cv799_18543.txt     1
## 803 cv802_28664.txt     4
## 806 cv805_19601.txt     1
## 808 cv807_21740.txt     1
## 810  cv809_5009.txt     2
## 812 cv811_21386.txt     1
## 814  cv813_6534.txt     2
## 816 cv815_22456.txt     1
## 818  cv817_4041.txt     1
## 819 cv818_10211.txt     2
## 820  cv819_9364.txt     1
## 821 cv820_22892.txt     1
## 825  cv824_8838.txt     1
## 827 cv826_11834.txt     1
## 828 cv827_18331.txt     1
## 830 cv829_20289.txt     2
## 831  cv830_6014.txt     2
## 835 cv834_22195.txt     2
## 837 cv836_12968.txt     1
## 838 cv837_27325.txt     1
## 840 cv839_21467.txt     1
## 841 cv840_16321.txt     3
## 842  cv841_3967.txt     3
## 846 cv845_14290.txt     1
## 847 cv846_29497.txt     1
## 848  cv847_1941.txt     2
## 850 cv849_15729.txt     1
## 853 cv852_27523.txt     1
## 857 cv856_29013.txt     1
## 859 cv858_18819.txt     1
## 861 cv860_13853.txt     1
## 866  cv865_2895.txt     2
## 867 cv866_29691.txt     1
## 869 cv868_11948.txt     1
## 870 cv869_23611.txt     1
## 871 cv870_16348.txt     2
## 872 cv871_24888.txt     1
## 873 cv872_12591.txt     1
## 874 cv873_18636.txt     1
## 876  cv875_5754.txt     1
## 877  cv876_9390.txt     2
## 879 cv878_15694.txt     1
## 881 cv880_29800.txt     2
## 883 cv882_10026.txt     3
## 885 cv884_13632.txt     1
## 891  cv890_3977.txt     1
## 892  cv891_6385.txt     1
## 894 cv893_26269.txt     3
## 898 cv897_10837.txt     1
## 900 cv899_16014.txt     2
## 901 cv900_10331.txt     1
## 904 cv903_17822.txt     2
## 907 cv906_11491.txt     1
## 909 cv908_16009.txt     2
## 911 cv910_20488.txt     1
## 913  cv912_5674.txt     1
## 914 cv913_29252.txt     2
## 917 cv916_15467.txt     1
## 919  cv918_2693.txt     1
## 920 cv919_16380.txt     1
## 922 cv921_12747.txt     1
## 923 cv922_10073.txt     3
## 925 cv924_29540.txt     1
## 927 cv926_17059.txt     1
## 928 cv927_10681.txt     1
## 930 cv929_16908.txt     2
## 931 cv930_13475.txt     1
## 934 cv933_23776.txt     2
## 935 cv934_19027.txt     1
## 936 cv935_23841.txt     1
## 939 cv938_10220.txt     1
## 943 cv942_17082.txt     2
## 946 cv945_12160.txt     1
## 947 cv946_18658.txt     1
## 950 cv949_20112.txt     2
## 952 cv951_10926.txt     1
## 953 cv952_25240.txt     1
## 955 cv954_18628.txt     1
## 961 cv960_29007.txt     1
## 963  cv962_9803.txt     2
## 964  cv963_6895.txt     2
## 965  cv964_6021.txt     1
## 968  cv967_5788.txt     2
## 971 cv970_18450.txt     1
## 972 cv971_10874.txt     2
## 974 cv973_10066.txt     1
## 979 cv978_20929.txt     1
## 980 cv979_18921.txt     1
## 982 cv981_14989.txt     2
## 986  cv985_6359.txt     1
## 987 cv986_13527.txt     1
## 989 cv988_18740.txt     1
## 991 cv990_11591.txt     3
## 996 cv995_21821.txt     1
## 998  cv997_5046.txt     1
## 999 cv998_14111.txt     2

Let’s see which patterns we have found.

# Count the matches of every pattern and show only the patterns that matched.
countColum <- countMatchesPerColumn(matches.an)
countColum[countColum[["Count"]] != 0, ]
##                     Entity Count
## 3               al pacino      6
## 8            alan rickman      4
## 13          albert finney      1
## 18               alex cox      1
## 25        andie macdowell      3
## 34       antonio banderas      5
## 43            ashley judd      6
## 53            ava gardner      1
## 69          blake edwards      1
## 75              brad pitt     11
## 80              bruce lee      5
## 85          buster keaton      1
## 90           cameron diaz     15
## 105            cary grant      2
## 116       charlton heston      3
## 129       christopher lee      2
## 134           clark gable      1
## 144        clint eastwood      5
## 152           dan aykroyd      2
## 161            dark angel      2
## 167                  data      5
## 177       denise richards      9
## 182      desmond llewelyn      2
## 193        dustin hoffman      8
## 232                  fish     20
## 235     franco zeffirelli      1
## 239           fred gwynne      1
## 241        freddie prinze      3
## 242           gary cooper      1
## 251      gillian anderson      5
## 256           glenn close      9
## 261                goldie      2
## 262           goldie hawn      1
## 272       gwyneth paltrow      8
## 282         harrison ford     13
## 300            helen hunt      5
## 305           henry fonda      1
## 331        ingrid bergman      1
## 358             jean reno      1
## 363          jeff bridges      9
## 368      jennifer aniston      2
## 383         joan crawford      1
## 388        john carpenter      3
## 393           john denver      4
## 405         john travolta     16
## 410           jon pertwee      2
## 413          judy garland      1
## 418      juliette binoche      1
## 428          kate winslet      8
## 439      kenneth williams      6
## 462     leonardo dicaprio      8
## 467        leslie nielsen      2
## 472            luc besson      3
## 487         marty feldman      1
## 490           mary martin      1
## 503              meg ryan     13
## 512            mel gibson     12
## 535          milton berle      1
## 544       natalie portman     14
## 565    olivia newton-john      2
## 584           paul newman      3
## 595                prince     19
## 619        richard burton      1
## 635       roberto benigni      5
## 645        robin williams     18
## 650       rosie o'donnell      5
## 656          rutger hauer      1
## 671             sam raimi      5
## 681        sandra bullock      9
## 695          sergio leone      2
## 719        sophie marceau      2
## 724         spencer tracy      1
## 734             ted demme      1
## 750         terry gilliam      4
## 760     the three stooges      1
## 764         three stooges      1
## 778           thora birch      3
## 788            tim burton     11
## 793             tim curry      3
## 805            tom cruise     15
## 810             tom hanks     19
## 828                tricky      5
## 830          tupac shakur      1
## 835            val kilmer      6
## 857          vivien leigh      1
## 862            voice-over     15
## 875         wallace shawn      3
## 880         werner herzog      2
## 890       whoopi goldberg      4
## 920   william shakespeare      3
## 927          winona ryder      8
## 934           woody allen     16
## 1025         dennis quaid      7
## 1113            joe flynn      1
## 1168         peter weller      2
## 1186          robert hays      1
## 1224       aaron spelling      2
## 1248       adrienne posta      1
## 1285          alice krige      1
## 1289          alicia witt      2
## 1384                 bebe      2
## 1388        bebe neuwirth      2
## 1436   brendan sexton iii      1
## 1502      charles keating      1
## 1543    christopher lloyd      3
## 1569           colm feore      1
## 1620          david morse      7
## 1677           don knotts      1
## 1680       donald stewart      1
## 1804      george marshall      1
## 1874    heather matarazzo      1
## 1905         henry thomas      1
## 2055        jamie kennedy      9
## 2101        jennifer lien      1
## 2163       john henderson      1
## 2168          john landis      1
## 2281     kathleen quinlan      2
## 2307        kieran culkin      1
## 2501          max schreck      2
## 2514      michael chapman      1
## 2538          mika boorem      2
## 2599        nanni moretti      1
## 2656            oded fehr      1
## 2663           orson bean      1
## 2714       peter macneill      2
## 2732         piper laurie      1
## 2890          ron perlman      2
## 2946            sam neill      4
## 2965                santo      1
## 3060          straight up      1
## 3100              thunder      3
## 3109     timothy olyphant      5
## 3114        timothy spall      2
## 3149     tsutomu yamazaki      1
## 3156             twilight      8
## 3170               vanity      8
## 3203       william hickey      1
## 3211     william petersen      1
## 3220                 wink      5
## 3584               eartha      1
## 3618        eriq ebouaney      1
## 3655                  gem     28
## 3784       jerry springer      7
## 4117         oliver stone      7
## 4123        oprah winfrey      1
## 4276        sami bouajila      1

Now we write the results to a file.

# Export the matches as a semicolon-separated file without row names;
# NA values are written as empty fields.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
write.table(
  matches.an,
  file = "allEntitiesGazetteer.csv",
  row.names = FALSE,
  na = "",
  sep = ";"
)

Evaluate using gold standard

Let’s put all matches in a list for comparison with a gold standard.

# Gather the detected matches into the Files/Matches layout used for the
# comparison with the gold standard, then preview the first ten rows.
allMatches <- mergeAllMatchesInLists(matches.an)
head(allMatches, n = 10)
##              Files         Matches
## 1  cv000_29590.txt     tim burton 
## 2  cv001_18431.txt            NULL
## 3  cv002_15918.txt       meg ryan 
## 4  cv003_11664.txt    paul newman 
## 5  cv004_11636.txt      bruce lee 
## 6  cv005_29443.txt  eriq ebouaney 
## 7  cv006_15448.txt  jennifer lien 
## 8   cv007_4968.txt    woody allen 
## 9  cv008_29435.txt            NULL
## 10 cv009_29592.txt            gem

Now we load the gold standard and put all gold standard matches in a list for comparison.

# Read the gold standard as semicolon-separated text: quoting is disabled,
# empty fields become NA, and every column is kept as character.
goldStandard <- read.table(
  file = "goldStandard.csv",
  quote = "",
  na.strings = "",
  colClasses = "character",
  sep = ";"
)

# Gather the gold standard annotations into the same Files/Matches layout,
# then preview the first ten rows.
allMatchesGold <- mergeGoldStandardInLists(goldStandard)
head(allMatchesGold, n = 10)
##              Files
## 1  cv000_29590.txt
## 2  cv001_18431.txt
## 3  cv002_15918.txt
## 4  cv003_11664.txt
## 5  cv004_11636.txt
## 6  cv005_29443.txt
## 7  cv006_15448.txt
## 8   cv007_4968.txt
## 9  cv008_29435.txt
## 10 cv009_29592.txt
##                                                                                                                                                                                                                                                                                                                                                      Matches
## 1  alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2                                                                                                                                                  matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3                                                                                                                                                                                                                                                                                    ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4                                                                           john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5                                                                                                                                                                                                                     herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6                                                                                                                                                                                              raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi
## 7                                                                                                                                                                                         tony kaye, edward norton, norton, derek vinyard, danny, edward furlong, beverly d'angelo, davin, jennifer lien, derek, kaye, avery brooks, furlong, d'angelo, lien
## 8                                                                                                                          betsy, molly ringwald, alan alda, ringwald, alda, dylan walsh, walsh, madeline kahn, ally sheedy, sheedy, anthony lapaglia, lapaglia, stevie dee, robert de niro, alec baldwin, de niro, joe pesci, catherine o'hara, woody allen
## 9                                                                                                                                                                                                                               lumumba, janssens, rudi delhem, moise tshombe, pascal nzonzi, mobutu, joseph kasa vubu, maka kotto, peck, bonitzer, ebouaney
## 10                                                                                                                                                                                                      schwartznager, stallone, van damme, rongguang yu, wong fei-hong, jackie chan, fei-hong, sze-man tsang, wong kei-ying, yen chi dan, yuen wo ping, fox

Finally, we calculate the metrics.

# Compare the detected matches against the gold standard and display the
# resulting precision, recall and F-measure.
metrics <- calculateMetrics(allMatches, allMatchesGold)
metrics
##   Precision     Recall   Fmeasure
## 1 0.7340659 0.03414435 0.06525349