source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")
# Collect the paths of the corpus files under "files/" whose names match
# "sgml_", and keep at most the first four of them.
# head() is used instead of [1:4] so that a directory holding fewer than four
# matching files yields a shorter vector rather than one padded with NA
# entries (an NA path would make the later scan() call fail).
corpus.files <- head(
  dir(
    "files",
    pattern = "sgml_",
    full.names = TRUE
  ),
  4
)
corpus.files
## [1] "files/corp_bnc_sgml_1.txt" "files/corp_bnc_sgml_2.txt"
## [3] "files/corp_bnc_sgml_3.txt" "files/corp_bnc_sgml_4.txt"
# Search every corpus file for the tagged sequence
#   <w vv.>WORD <w at0>the <w nn1>TIME-NOUN <w av0>away
# and gather all hits in the character vector `all.matches`.

# The search pattern does not change between files, so it is built once.
# (?x) enables PCRE free-spacing mode: the line breaks and indentation inside
# the string are ignored by the regex engine and only serve readability.
time.away.pattern <- "(?x)
  <w\\svv.>
  [a-z]*
  \\s
  <w\\sat0>the
  \\s
  <w\\snn1>
  (morning|noon|afternoon|evening|night|day|week|month|year)
  \\s
  <w\\sav0>away"

# One slot per file; filled inside the loop and flattened afterwards so the
# result vector is not re-copied on every iteration.
n.files <- length(corpus.files)
matches.per.file <- vector("list", n.files)

for (i in seq_along(corpus.files)) {
  # Read the file line by line (quotes and comment characters are treated as
  # ordinary text) and lower-case everything.
  current.corpus.file <- tolower(
    scan(
      corpus.files[i],
      what = character(),
      sep = "\n",
      quote = "",
      comment.char = "",
      quiet = TRUE
    )
  )

  # Keep only the lines that contain a sentence tag.
  current.sentences <- grep(
    "<s n=",
    current.corpus.file,
    perl = TRUE,
    value = TRUE
  )

  # Delete every tag that is not a <w ...>/<c ...> tag carrying a three-character
  # code (or a three-hyphen-three code), together with the text that follows it
  # up to the next "<".
  current.sentences <- gsub(
    "<(?![wc] (...|...-...)).*?>[^<]*",
    "",
    current.sentences,
    perl = TRUE
  )

  # The first element of the list returned by exact.matches.2() is used as the
  # vector of matches for this file.
  current.matches <- exact.matches.2(
    time.away.pattern,
    current.sentences
  )[[1]]

  # Assign via list() so that a NULL result keeps the slot instead of deleting it.
  matches.per.file[i] <- list(current.matches)

  # Progress indicator: proportion of files processed so far.
  cat("\f", i / n.files)
}

# Flatten the per-file results; c(character(), ...) guarantees a character
# vector even when nothing was found in any file.
all.matches <- c(character(), unlist(matches.per.file))
## 0.25 0.5 0.75 1
# Memory footprint of the collected matches.
print(object.size(all.matches))
## 1640 bytes
# Total number of matches found across the corpus files.
n.instances <- length(all.matches)
print(paste0("There are ", n.instances, " instances"))
## [1] "There are 21 instances"
The search finds 21 instances of this construction in the four corpus files.
# Extract the verb and the time noun from every tagged match.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.

# v: the text that follows the first tag, up to the space before the next tag.
v <- sub("<.*?>([^<]+) <.*", "\\1", all.matches, perl = TRUE)

# n: the text that follows the <w nn1> tag, up to the whitespace before the
# next tag.
n <- sub(".*?<w\\snn1>(.*?)\\s<.*", "\\1", all.matches, perl = TRUE)

# Show the first ten matches side by side with their extracted verb and noun.
head(
  data.frame(
    MATCHES = all.matches,
    VERBS = v,
    NOUNS = n
  ),
  10
)
## MATCHES VERBS NOUNS
## 1 <w vvb>dance <w at0>the <w nn1>night <w av0>away dance night
## 2 <w vvg>dancing <w at0>the <w nn1>night <w av0>away dancing night
## 3 <w vvg>dancing <w at0>the <w nn1>night <w av0>away dancing night
## 4 <w vvi>waltz <w at0>the <w nn1>evening <w av0>away waltz evening
## 5 <w vvb>dance <w at0>the <w nn1>night <w av0>away dance night
## 6 <w vvd>danced <w at0>the <w nn1>night <w av0>away danced night
## 7 <w vvn>talked <w at0>the <w nn1>night <w av0>away talked night
## 8 <w vvb>dance <w at0>the <w nn1>night <w av0>away dance night
## 9 <w vvd>swung <w at0>the <w nn1>night <w av0>away swung night
## 10 <w vvg>prancing <w at0>the <w nn1>night <w av0>away prancing night
# Cross-tabulate the extracted verbs (rows) against the extracted nouns (columns).
tam <- table(v, n)
# Number of distinct verbs and distinct nouns in the table.
dim(tam)
## [1] 12 3
# First rows of the verb-by-noun table.
head(tam)
## n
## v day evening night
## dance 0 0 6
## danced 0 0 1
## dancing 0 0 5
## discoed 0 0 1
## dozing 1 0 0
## love 0 0 1
Among the 21 instances found, 1 is ‘day,’ 1 is ‘evening,’ and 19 are ‘night.’
# Frequency of each extracted noun in n.
table(n)
## n
## day evening night
## 1 1 19
Most of the nouns denote a part of a day rather than a longer time span: ‘evening’ and ‘night,’ which together account for 20 of the 21 instances, are both parts of a day.
# Bar chart of how often each noun occurs among the matches.
noun.counts <- table(n)
barplot(noun.counts, main = "Time distribution")
The time-away construction is very interesting. First, the matched verbs (for example, ‘dance,’ ‘spend,’ and ‘talk’) all denote an action or activity, not a state: the subject must actively do something to make the time pass. Second, in terms of their semantics, all these verbs have a positive meaning: the activity or action they denote is enjoyable. This possibly explains why the time is presented as simply going away.