source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")
# Paths of the files in the "files" directory whose names match "sgml_",
# limited to at most the first four.
# head() is used instead of `[1:4]`: indexing past the end of a shorter vector
# pads the result with NA, and those NA entries would later be passed to
# scan() as file names.
corpus.files <- head(
   dir(
      "files",
      pattern="sgml_",
      full.names=TRUE),
   4)
corpus.files

## [1] "files/corp_bnc_sgml_1.txt" "files/corp_bnc_sgml_2.txt"
## [3] "files/corp_bnc_sgml_3.txt" "files/corp_bnc_sgml_4.txt"

# Search every corpus file for the sequence
#   <w vv?>VERB <w at0>the <w nn1>TIME-NOUN <w av0>away
# and gather all exact matches in all.matches.
all.matches <- character()

# Loop-invariant regular expressions, defined once before the loop.

# Matches every tag that does NOT start with "w " or "c " followed by a
# three-character code (or a hyphenated three-plus-three-character code),
# together with the text that follows that tag up to the next "<".
tag.cleanup.regex <- "<(?![wc] (...|...-...)).*?>[^<]*"

# Free-spacing (?x) pattern: whitespace and line breaks inside the string are
# ignored by the regex engine, so literal whitespace is written as \\s.
time.away.regex <- "(?x)          
      <w\\svv.>        
      [a-z]*          
      \\s              
      <w\\sat0>the
      \\s
      <w\\snn1>
      (morning|noon|afternoon|evening|night|day|week|month|year)
      \\s
      <w\\sav0>away"

# One slot per corpus file; the per-file results are combined once after the
# loop instead of growing all.matches with c() on every iteration.
matches.by.file <- vector("list", length(corpus.files))

# seq_along() always iterates over the positions of corpus.files, including
# the zero-length and length-one cases.
for (i in seq_along(corpus.files)) {
   # Read the file one line per element (sep="\n"), with quote and comment
   # handling switched off, and convert everything to lower case.
   current.corpus.file <- tolower(
      scan(
         corpus.files[i],
         what=character(),
         sep="\n",
         quote="",
         comment.char="",
         quiet=TRUE))

   # Keep only the lines that contain "<s n=".
   current.sentences <- grep(
      "<s n=",
      current.corpus.file,
      perl=TRUE,
      value=TRUE)

   # Strip the unwanted tags (and the text that follows them).
   current.sentences <- gsub(
      tag.cleanup.regex,
      "",
      current.sentences,
      perl=TRUE)

   # exact.matches.2() comes from the script sourced at the top of the file;
   # its first list element is presumably the vector of exact matches --
   # confirm against that script.
   current.matches <- exact.matches.2(
      time.away.regex,
      current.sentences)[[1]]

   # `[i] <- list(...)` rather than `[[i]] <- ...`, so that a NULL result
   # cannot delete the slot.
   matches.by.file[i] <- list(current.matches)

   # Progress indicator: proportion of files processed so far.
   cat("\f", i/length(corpus.files))
}

# Combine the per-file results; c() with the empty character vector keeps the
# result a character vector even when nothing matched.
all.matches <- c(all.matches, unlist(matches.by.file))

##  0.25 0.5 0.75 1

# Report the memory footprint of the collected matches.
print(object.size(all.matches))

## 1640 bytes

How many instances are there?

# Report the number of matches found across all corpus files.
print(sprintf("There are %d instances", length(all.matches)))

## [1] "There are 21 instances"

There are 21 instances of such utterances.

What is the distribution of the time nouns?

# Extract the verb and the time noun from every match via back-references.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.

# Verb: the text between the first tag and the space before the next tag.
v <- sub("<.*?>([^<]+) <.*", "\\1", all.matches, perl = TRUE)

# Noun: the text that follows the <w nn1> tag, up to the whitespace that
# precedes the following tag.
n <- sub(".*?<w\\snn1>(.*?)\\s<.*", "\\1", all.matches, perl = TRUE)

# Show the first ten matches next to the extracted verb and noun.
head(
   data.frame(
      MATCHES = all.matches,
      VERBS = v,
      NOUNS = n),
   10)

##                                                MATCHES    VERBS   NOUNS
## 1     <w vvb>dance <w at0>the <w nn1>night <w av0>away    dance   night
## 2   <w vvg>dancing <w at0>the <w nn1>night <w av0>away  dancing   night
## 3   <w vvg>dancing <w at0>the <w nn1>night <w av0>away  dancing   night
## 4   <w vvi>waltz <w at0>the <w nn1>evening <w av0>away    waltz evening
## 5     <w vvb>dance <w at0>the <w nn1>night <w av0>away    dance   night
## 6    <w vvd>danced <w at0>the <w nn1>night <w av0>away   danced   night
## 7    <w vvn>talked <w at0>the <w nn1>night <w av0>away   talked   night
## 8     <w vvb>dance <w at0>the <w nn1>night <w av0>away    dance   night
## 9     <w vvd>swung <w at0>the <w nn1>night <w av0>away    swung   night
## 10 <w vvg>prancing <w at0>the <w nn1>night <w av0>away prancing   night

# Cross-tabulate the verbs (rows) against the time nouns (columns), then
# report the dimensions of the resulting table.
tam <- table(v, n)
dim(tam)

## [1] 12  3

# First rows of the verb-by-noun table.
head(tam)

##          n
## v         day evening night
##   dance     0       0     6
##   danced    0       0     1
##   dancing   0       0     5
##   discoed   0       0     1
##   dozing    1       0     0
##   love      0       0     1

Among the 21 instances found, 1 is ‘day,’ 1 is ‘evening,’ and 19 are ‘night.’

# Frequency of each time noun across all matches.
print(table(n))

## n
##     day evening   night 
##       1       1      19

What time scale do most of them refer to?

Most of the time nouns refer to a part of a day: ‘evening’ and ‘night’ together account for 20 of the 21 instances, and only ‘day’ (1 instance) names a whole day.

# Bar chart of the time-noun frequencies.
barplot(
   height = table(n),
   main = "Time distribution")

What is the semantic implication of this construction?

The time-away construction is very interesting. First, the matched verbs (for example, ‘dance,’ ‘spend,’ and ‘talk’) all denote an action or activity, not a state: the subject must actively do something to make the time go away. Second, in terms of their semantics, all these verbs have a positive meaning: the activity or action they denote is enjoyable. This possibly explains why the time just went away.

LING 120 Final Assignment

Lexie Wang

June 10, 2021

How many instances are there?

What is the distribution of the time nouns?

What time scale do most of them refer to?

What is the semantic implication of this construction?