Here we examine two approaches to tagging nouns referred to by definite/indefinite articles. The first is a part-of-speech tagging approach, while the second uses regular expressions. The former is more accurate but less efficient than the latter.
Some resources around definite/indefinite articles include:
Heim, I. R. (1982). The semantics of definite and indefinite noun phrases (Doctoral dissertation). Retrieved from http://eecoppock.info/DynamicSemantics/Readings/Heim1982-Dissertation.pdf.
Epstein, R. (2006). The definite article, accessibility, and the construction of discourse referents. Cognitive Linguistics, 12(4), 333-378. doi: 10.1515/cogl.2002.007
library(qdap)
library(qdapRegex)
library(ggplot2)
library(reshape2)
#' Find the first token with a given part of speech after each target word
#'
#' For every POS-tagged string in `text.var`, locates each occurrence of
#' `words` and picks the nearest following token whose tag is in `pos`.
#'
#' @param text.var An object whose `[["POStagged"]][["POStagged"]]` element
#'   holds strings of whitespace-separated `word/TAG` tokens.
#' @param words Character vector of target words. Matching is exact, and
#'   therefore case-sensitive.
#' @param pos Character vector of part-of-speech tags to search for after
#'   each target word.
#' @return A list with one element per tagged string. Each element is a
#'   character vector of the words found (one per distinct matched position).
#'   It is `NA_character_` when none of `words` occurs in the string; a target
#'   word with no following `pos` token contributes an `NA` entry.
pos_after <- function(text.var, words, pos) {
  tagged <- as.character(text.var[["POStagged"]][["POStagged"]])
  tokens_by_row <- strsplit(tagged, "\\s+")

  lapply(tokens_by_row, function(tokens) {
    # Leading whitespace yields an empty first token; drop it and any NA.
    tokens <- tokens[!is.na(tokens) & nzchar(tokens)]

    # Split each token on its LAST slash only, so a word that itself contains
    # "/" (e.g. "and/or/CC") cannot shift the word/tag alignment.
    tok_words <- sub("/[^/]*$", "", tokens)
    tok_tags <- sub("^.*/", "", tokens)
    tok_tags[!grepl("/", tokens, fixed = TRUE)] <- NA_character_

    word_locs <- which(tok_words %in% words)
    if (length(word_locs) == 0L) {
      return(NA_character_)
    }
    pos_locs <- which(tok_tags %in% pos)

    # which() returns ascending positions, so the first later position is the
    # nearest one. A target with nothing after it maps to NA explicitly rather
    # than through min(integer(0)), which warns and returns Inf.
    next_locs <- vapply(word_locs, function(loc) {
      later <- pos_locs[pos_locs > loc]
      if (length(later) == 0L) NA_integer_ else later[1L]
    }, integer(1))

    tok_words[unique(next_locs)]
  })
}
# Part-of-speech approach: for each article group ("a"/"an" and "the"),
# tabulate the nouns that pos_after() finds after the article in `rajPOS`
# and keep those seen more than 3 times.
# NOTE(review): `rajPOS` is not defined in this file; presumably the
# POS-tagged data set shipped with qdap -- confirm.
out2 <- setNames(lapply(list(a = c("a", "an"), the = "the"), function(x) {
  o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
  # Build the word/frequency frame explicitly. Wrapping a sorted table in
  # data.frame(freq = ...) depends on whether sort() keeps the "table" class,
  # which differs between R versions and can leave no usable `freq` column.
  tab <- sort(table(unlist(o)), decreasing = TRUE)
  m <- data.frame(
    word = as.character(names(tab)),
    freq = as.vector(tab),
    stringsAsFactors = FALSE
  )
  m[m$freq > 3, ]
}), c("a", "the"))

# Full outer join of the two frequency tables on the word; a word that passed
# the threshold for only one article gets NA in the other column.
dat2 <- setNames(
  Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)
  }, out2),
  c("Word", "A", "THE")
)

# Long format: one row per Word/Article pair.
dat2 <- reshape2::melt(
  dat2,
  id = "Word",
  variable.name = "Article",
  value.name = "freq"
)
dat2 <- dat2[order(dat2$freq, dat2$Word), ]

# Order the Word factor levels by total frequency across both articles so the
# plot's y axis is sorted by overall count.
ord2 <- aggregate(freq ~ Word, dat2, sum)
dat2$Word <- factor(dat2$Word, levels = ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL

ggplot(dat2, aes(x = freq, y = Word)) +
  geom_point() +
  facet_grid(~Article) +
  ggtitle("Part Of Speech Parsing Approach")
# Regex approach: extract matches for each pattern from the lower-cased
# dialogue and keep the words seen more than 3 times.
# NOTE(review): "@@after_a" / "@@after_the" are passed straight to
# rm_default() as `pattern`; presumably qdapRegex dictionary keys -- confirm.
# `raj` is not defined in this file; presumably a qdap data set -- confirm.
out <- setNames(lapply(c("@@after_a", "@@after_the"), function(x) {
  # stri_trans_tolower is exported by stringi, so `::` suffices.
  o <- rm_default(
    stringi::stri_trans_tolower(raj$dialogue),
    pattern = x,
    extract = TRUE
  )
  # Build the word/frequency frame explicitly. Wrapping a sorted table in
  # data.frame(freq = ...) depends on whether sort() keeps the "table" class,
  # which differs between R versions and can leave no usable `freq` column.
  tab <- sort(table(unlist(o)), decreasing = TRUE)
  m <- data.frame(
    word = as.character(names(tab)),
    freq = as.vector(tab),
    stringsAsFactors = FALSE
  )
  m[m$freq > 3, ]
}), c("a", "the"))

# Full outer join of the two frequency tables on the word; a word that passed
# the threshold for only one article gets NA in the other column.
dat <- setNames(
  Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)
  }, out),
  c("Word", "A", "THE")
)

# Long format: one row per Word/Article pair.
dat <- reshape2::melt(
  dat,
  id = "Word",
  variable.name = "Article",
  value.name = "freq"
)
dat <- dat[order(dat$freq, dat$Word), ]

# Order the Word factor levels by total frequency across both articles so the
# plot's y axis is sorted by overall count.
ord <- aggregate(freq ~ Word, dat, sum)
dat$Word <- factor(dat$Word, levels = ord[order(ord[[2]]), 1])
rownames(dat) <- NULL

ggplot(dat, aes(x = freq, y = Word)) +
  geom_point() +
  facet_grid(~Article) +
  ggtitle("Regex Approach")