annotations. Once this is done, we evaluate the correctness of the POS tags.
The corpus is read with the encoding "UTF-8".
# Build the corpus from the text files under txt_sentoken/pos (read as UTF-8)
source.pos <- DirSource("txt_sentoken/pos", encoding = "UTF-8")
corpus <- Corpus(source.pos)
We define three helper functions: getAnnotationsFromDocument, getAnnotatedMergedDocument and getAnnotatedPlainTextDocument.
# Annotate a document with sentence, word, POS-tag and parse annotations.
#
# Args:
#   doc: document whose text is obtained via as.String().
#
# Returns:
#   The annotations produced by the final (parse) annotation pass, which is
#   run on top of the sentence/word and POS-tag annotations.
getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)
  tokenizers <- list(
    Maxent_Sent_Token_Annotator(),
    Maxent_Word_Token_Annotator()
  )
  pos_tagger <- Maxent_POS_Tag_Annotator()
  # Sentence and word boundaries first; each later pass receives the
  # annotations of the pass before it.
  tokens <- annotate(text, tokenizers)
  tagged <- annotate(text, pos_tagger, tokens)
  annotate(text, Parse_Annotator(), tagged)
}
# Build a single "word/TAG word/TAG ..." string for a document.
#
# Args:
#   doc: document whose text is obtained via as.String().
#   annotations: annotations for `doc`; every annotation of type "word" must
#     carry a single character "POS" feature.
#
# Returns:
#   A length-one character vector in which each word is joined to its POS tag
#   by "/" and the pairs are separated by single spaces.
getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  # vapply() always yields a character vector and fails loudly if a word has
  # no (or a non-scalar) POS feature; sapply() would silently return a list.
  tags <- vapply(words$features, `[[`, character(1), "POS")
  paste(sprintf("%s/%s", text[words], tags), collapse = " ")
}
# Wrap a document's text and its annotations in an AnnotatedPlainTextDocument.
#
# Args:
#   doc: document whose text is obtained via as.String().
#   annotations: annotations to attach to that text.
#
# Returns:
#   The object created by AnnotatedPlainTextDocument().
getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}
# Inspect the first document of the corpus
inspect(corpus[[1L]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4226
##
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
# Annotate the first document only, then show its first 23 annotations
annotations <- lapply(corpus[1], getAnnotationsFromDocument)
annotations[["cv000_29590.txt"]][1:23]
## id type start end features
## 1 sentence 1 265 constituents=<<integer,54>>,
## parse=<<character,1>>
## 2 sentence 268 439 constituents=<<integer,36>>,
## parse=<<character,1>>
## 3 sentence 442 591 constituents=<<integer,27>>,
## parse=<<character,1>>
## 4 sentence 594 797 constituents=<<integer,44>>,
## parse=<<character,1>>
## 5 sentence 800 939 constituents=<<integer,28>>,
## parse=<<character,1>>
## 6 sentence 942 1299 constituents=<<integer,70>>,
## parse=<<character,1>>
## 7 sentence 1302 1515 constituents=<<integer,38>>,
## parse=<<character,1>>
## 8 sentence 1518 1696 constituents=<<integer,36>>,
## parse=<<character,1>>
## 9 sentence 1699 1820 constituents=<<integer,21>>,
## parse=<<character,1>>
## 10 sentence 1823 2036 constituents=<<integer,39>>,
## parse=<<character,1>>
## 11 sentence 2039 2443 constituents=<<integer,82>>,
## parse=<<character,1>>
## 12 sentence 2445 2522 constituents=<<integer,17>>,
## parse=<<character,1>>
## 13 sentence 2525 2701 constituents=<<integer,34>>,
## parse=<<character,1>>
## 14 sentence 2704 2862 constituents=<<integer,30>>,
## parse=<<character,1>>
## 15 sentence 2864 2868 constituents=<<integer,3>>,
## parse=<<character,1>>
## 16 sentence 2871 2922 constituents=<<integer,14>>,
## parse=<<character,1>>
## 17 sentence 2925 3571 constituents=<<integer,127>>,
## parse=<<character,1>>
## 18 sentence 3574 3705 constituents=<<integer,22>>,
## parse=<<character,1>>
## 19 sentence 3708 3848 constituents=<<integer,26>>,
## parse=<<character,1>>
## 20 sentence 3851 3986 constituents=<<integer,28>>,
## parse=<<character,1>>
## 21 sentence 3989 4112 constituents=<<integer,26>>,
## parse=<<character,1>>
## 22 sentence 4115 4148 constituents=<<integer,9>>,
## parse=<<character,1>>
## 23 sentence 4151 4226 constituents=<<integer,15>>,
## parse=<<character,1>>
# Show annotations 24 to 73 of document cv000_29590.txt
annotations[["cv000_29590.txt"]][24:73]
## id type start end features
## 24 word 1 5 POS=NNS
## 25 word 7 13 POS=VBD
## 26 word 15 18 POS=IN
## 27 word 20 24 POS=JJ
## 28 word 26 30 POS=NNS
## 29 word 32 35 POS=VBP
## 30 word 37 39 POS=VBN
## 31 word 41 46 POS=NN
## 32 word 48 49 POS=IN
## 33 word 51 57 POS=NN
## 34 word 59 59 POS=,
## 35 word 61 67 POS=IN
## 36 word 69 72 POS=PRP
## 37 word 73 75 POS=VBP
## 38 word 77 81 POS=IN
## 39 word 83 93 POS=NNS
## 40 word 95 95 POS=-LRB-
## 41 word 97 102 POS=NN
## 42 word 104 104 POS=,
## 43 word 106 113 POS=NN
## 44 word 115 115 POS=,
## 45 word 117 121 POS=NN
## 46 word 123 123 POS=-RRB-
## 47 word 125 125 POS=,
## 48 word 127 128 POS=CC
## 49 word 130 135 POS=VBN
## 50 word 137 142 POS=IN
## 51 word 144 147 POS=NNS
## 52 word 149 149 POS=-LRB-
## 53 word 151 156 POS=NN
## 54 word 158 158 POS=-RRB-
## 55 word 160 161 POS=CC
## 56 word 163 165 POS=DT
## 57 word 167 174 POS=NN
## 58 word 176 180 POS=NN
## 59 word 182 182 POS=-LRB-
## 60 word 184 188 POS=NN
## 61 word 190 194 POS=NN
## 62 word 196 196 POS=-RRB-
## 63 word 198 198 POS=,
## 64 word 200 202 POS=CC
## 65 word 204 208 POS=EX
## 66 word 209 210 POS=VBZ
## 67 word 212 216 POS=RB
## 68 word 218 223 POS=RB
## 69 word 225 228 POS=VBN
## 70 word 230 230 POS=DT
## 71 word 232 236 POS=JJ
## 72 word 238 241 POS=NN
## 73 word 243 246 POS=IN
The annotations above were computed for corpus[[1]]. Therefore, the first two sentences were already annotated in the annotations list. We print them again.
# Show the first two annotations of document cv000_29590.txt
annotations[["cv000_29590.txt"]][1:2]
## id type start end features
## 1 sentence 1 265 constituents=<<integer,54>>,
## parse=<<character,1>>
## 2 sentence 268 439 constituents=<<integer,36>>,
## parse=<<character,1>>
# Pair each annotated document with its own annotations. Only the documents
# that were annotated are passed to Map(): handing it the whole corpus would
# recycle the shorter `annotations` list across all remaining documents.
# Assumes `annotations` covers the leading documents of `corpus`, in order.
corpus.tagged <- Map(
  getAnnotatedPlainTextDocument,
  corpus[seq_along(annotations)],
  annotations
)
doc <- corpus.tagged[[1]]
# Words of the first two sentences (including punctuation marks)
head(sents(doc), 2)
## [[1]]
## [1] "films" "adapted" "from" "comic" "books"
## [6] "have" "had" "plenty" "of" "success"
## [11] "," "whether" "they" "'re" "about"
## [16] "superheroes" "(" "batman" "," "superman"
## [21] "," "spawn" ")" "," "or"
## [26] "geared" "toward" "kids" "(" "casper"
## [31] ")" "or" "the" "arthouse" "crowd"
## [36] "(" "ghost" "world" ")" ","
## [41] "but" "there" "'s" "never" "really"
## [46] "been" "a" "comic" "book" "like"
## [51] "from" "hell" "before" "."
##
## [[2]]
## [1] "for" "starters" "," "it" "was" "created"
## [7] "by" "alan" "moore" "(" "and" "eddie"
## [13] "campbell" ")" "," "who" "brought" "the"
## [19] "medium" "to" "a" "whole" "new" "level"
## [25] "in" "the" "mid" "'80s" "with" "a"
## [31] "12-part" "series" "called" "the" "watchmen" "."
# Word/tag pairs of the first two sentences
head(tagged_sents(doc), n = 2)
## [[1]]
## films/NNS
## adapted/VBD
## from/IN
## comic/JJ
## books/NNS
## have/VBP
## had/VBN
## plenty/NN
## of/IN
## success/NN
## ,/,
## whether/IN
## they/PRP
## 're/VBP
## about/IN
## superheroes/NNS
## (/-LRB-
## batman/NN
## ,/,
## superman/NN
## ,/,
## spawn/NN
## )/-RRB-
## ,/,
## or/CC
## geared/VBN
## toward/IN
## kids/NNS
## (/-LRB-
## casper/NN
## )/-RRB-
## or/CC
## the/DT
## arthouse/NN
## crowd/NN
## (/-LRB-
## ghost/NN
## world/NN
## )/-RRB-
## ,/,
## but/CC
## there/EX
## 's/VBZ
## never/RB
## really/RB
## been/VBN
## a/DT
## comic/JJ
## book/NN
## like/IN
## from/IN
## hell/NN
## before/IN
## ./.
##
## [[2]]
## for/IN
## starters/NNS
## ,/,
## it/PRP
## was/VBD
## created/VBN
## by/IN
## alan/NN
## moore/NN
## (/-LRB-
## and/CC
## eddie/JJ
## campbell/NN
## )/-RRB-
## ,/,
## who/WP
## brought/VBD
## the/DT
## medium/NN
## to/TO
## a/DT
## whole/JJ
## new/JJ
## level/NN
## in/IN
## the/DT
## mid/JJ
## '80s/NNS
## with/IN
## a/DT
## 12-part/JJ
## series/NN
## called/VBN
## the/DT
## watchmen/NNS
## ./.
precision = \(\frac{\text{number of correct tokens given by the system}}{\text{total number of tokens given by the system}} = \frac{79}{91} \approx 0.8681\) = 86.81 %
recall = \(\frac{\text{number of correct tokens given by the system}}{\text{total number of actual tokens in the text}} = \frac{79}{86} \approx 0.9186\) = 91.86 %