Regex vs. Part of Speech for Identifying Nouns Referred to by Definite/Indefinite Articles

Tyler W. Rinker

Here we examine two approaches to tagging nouns referred to by definite/indefinite articles. The first is a part-of-speech tagging approach, while the second uses regular expressions. The former is more accurate but less efficient than the latter.

Some resources around definite/indefinite articles include:

Heim, I. R. (1982). The semantics of definite and indefinite noun phrases (Doctoral dissertation). Retrieved from http://eecoppock.info/DynamicSemantics/Readings/Heim1982-Dissertation.pdf.

Epstein, R. (2002). The definite article, accessibility, and the construction of discourse referents. Cognitive Linguistics, 12(4), 333-378. doi: 10.1515/cogl.2002.007

https://www.msu.edu/~abbottb/def&inde.pdf

Set Up

library(qdap)
library(qdapRegex)
library(ggplot2)
library(reshape2)

Part of Speech Tagging

#' Find the first word with a given part-of-speech tag after each target word
#'
#' @param text.var An object whose `[["POStagged"]][["POStagged"]]` element is
#'   a vector of tagged strings made of whitespace-separated `word/TAG` tokens
#'   (presumably the output of `qdap::pos()` -- confirm against the caller).
#' @param words Character vector of target words (e.g. articles). Matching is
#'   exact and case-sensitive.
#' @param pos Character vector of part-of-speech tags to search for after each
#'   target word.
#' @return A list with one element per tagged string. Each element is a
#'   character vector holding, for every occurrence of a target word, the
#'   nearest following word whose tag is in `pos` (duplicate positions are
#'   collapsed). A target word with no such following word contributes `NA`;
#'   a string containing no target word yields `NA_character_`.
pos_after <- function(text.var, words, pos){

    tagged <- as.character(text.var[["POStagged"]][["POStagged"]])
    token_list <- strsplit(tagged, "\\s+")

    lapply(token_list, function(tokens){

        # Drop missing rows and the empty token produced by leading whitespace.
        tokens <- tokens[!is.na(tokens) & nzchar(tokens)]

        # Split each token on its LAST slash only, so that words which contain
        # a slash themselves (e.g. "1/2/CD") keep word and tag aligned.
        slash <- regexpr("/[^/]*$", tokens)
        has_tag <- slash > 0L
        token_word <- tokens
        token_word[has_tag] <- substr(tokens[has_tag], 1L, slash[has_tag] - 1L)
        token_tag <- rep(NA_character_, length(tokens))
        token_tag[has_tag] <- substring(tokens[has_tag], slash[has_tag] + 1L)

        locs <- which(token_word %in% words)
        if (length(locs) == 0L) return(NA_character_)

        # which() returns positions in increasing order, so the first position
        # greater than `i` is the nearest following match.
        nounlocs <- which(token_tag %in% pos)
        nextloc <- vapply(locs, function(i){
            following <- nounlocs[nounlocs > i]
            if (length(following) == 0L) NA_integer_ else following[1L]
        }, integer(1))

        # Indexing with NA yields NA, marking target words with nothing after.
        token_word[unique(nextloc)]
    })
}

# For each article class ("a"/"an" and "the"), collect the nouns that follow
# the article according to the part-of-speech tags, tabulate them, and keep
# the nouns seen more than 3 times. The result is a named list ("a", "the")
# of data frames with columns `word` and `freq`.
out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) {
    o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
    # Build the frame explicitly from the table's names and counts: passing a
    # 1-D table straight to data.frame() expands it into Var1/Freq columns on
    # current R, which leaves no `freq` column to filter on.
    counts <- sort(table(unlist(o)), decreasing = TRUE)
    m <- data.frame(word = as.character(names(counts)),
        freq = as.vector(counts), stringsAsFactors = FALSE)
    m[m$freq > 3, ]
}), c("a", "the"))


# Full outer join of the two per-article frequency tables on the noun, so a
# noun seen with only one article still gets a row (NA for the other one).
dat2 <- merge(out2[[1]], out2[[2]], by = "word", all = TRUE)
names(dat2) <- c("Word", "A", "THE")

# Long format: one row per (Word, Article) pair with its frequency.
dat2 <- reshape2::melt(
    dat2,
    id.vars = "Word",
    variable.name = "Article",
    value.name = "freq"
)

dat2 <- dat2[with(dat2, order(freq, Word)), ]

# Total frequency per noun across both articles; used to order the y axis.
ord2 <- aggregate(freq ~ Word, data = dat2, FUN = sum)

dat2$Word <- factor(dat2$Word, levels = ord2$Word[order(ord2$freq)])
rownames(dat2) <- NULL

# Dot plot of noun frequency, one panel per article.
ggplot(dat2, aes(x = freq, y = Word)) +
    geom_point() +
    facet_grid(~Article) +
    ggtitle("Part Of Speech Parsing Approach")

plot of chunk unnamed-chunk-2

Regular Expressions

# For each qdapRegex dictionary pattern ("@@after_a", "@@after_the"), extract
# the matches from the lower-cased dialogue, tabulate them, and keep the words
# seen more than 3 times. The result is a named list ("a", "the") of data
# frames with columns `word` and `freq`.
out <- setNames(lapply(c("@@after_a", "@@after_the"), function(x) {
    # stri_trans_tolower is exported by stringi, so `::` is sufficient.
    o <- rm_default(stringi::stri_trans_tolower(raj$dialogue),
        pattern = x, extract=TRUE)
    # Build the frame explicitly from the table's names and counts: passing a
    # 1-D table straight to data.frame() expands it into Var1/Freq columns on
    # current R, which leaves no `freq` column to filter on.
    counts <- sort(table(unlist(o)), decreasing = TRUE)
    m <- data.frame(word = as.character(names(counts)),
        freq = as.vector(counts), stringsAsFactors = FALSE)
    m[m$freq > 3, ]
}), c("a", "the"))


# Full outer join of the two per-article frequency tables on the word, so a
# word seen with only one article still gets a row (NA for the other one).
dat <- merge(out[[1]], out[[2]], by = "word", all = TRUE)
names(dat) <- c("Word", "A", "THE")

# Long format: one row per (Word, Article) pair with its frequency.
dat <- reshape2::melt(
    dat,
    id.vars = "Word",
    variable.name = "Article",
    value.name = "freq"
)

dat <- dat[with(dat, order(freq, Word)), ]

# Total frequency per word across both articles; used to order the y axis.
ord <- aggregate(freq ~ Word, data = dat, FUN = sum)

dat$Word <- factor(dat$Word, levels = ord$Word[order(ord$freq)])
rownames(dat) <- NULL

# Dot plot of word frequency, one panel per article.
ggplot(dat, aes(x = freq, y = Word)) +
    geom_point() +
    facet_grid(~Article) +
    ggtitle("Regex Approach")

plot of chunk unnamed-chunk-3