Trigger warning: The corpus is based on legislation against sexual violence.

Preparation

library(readxl)
library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
# Switch to the project folder so the relative workbook path below resolves.
# NOTE(review): setwd() ties this script to one machine's directory layout.
setwd("~/Desktop/Rstudio")

# Read the workbook; the corpus step later uses its `text` column.
d <- read_excel(path = "test.xlsx")

# Preview the first six rows to check the import.
head(d, n = 6L)

Creating the corpus

# Build a quanteda corpus from the imported table, taking each document's
# text from the `text` column.
corp <- corpus(x = d, text_field = "text")

# Print an overview of the corpus (document count and the first few texts).
corp
## Corpus consisting of 21 documents and 2 docvars.
## text1 :
## "Any person who by using violence or threat of violence force..."
## 
## text2 :
## "Sentenced for committing the crime of rape, with imprisonmen..."
## 
## text3 :
## "a man who has intercourse with a woman outside of marriage, ..."
## 
## text4 :
## "a man who have intercourse with a woman outside of marriage,..."
## 
## text5 :
## "a man who have intercourse with women, with the consent of t..."
## 
## text6 :
## "a man who has sexual intercourse with a woman, with the woma..."
## 
## [ reached max_ndoc ... 15 more documents ]

Preprocessing

# Build the document-feature matrix step by step with the current quanteda
# API. Passing `stem`, `remove` and tokens() options straight to dfm() on a
# corpus is deprecated and emits warnings.
# 1. Tokenise the corpus without punctuation and count the tokens.
dtm <- dfm(tokens(corp, remove_punct = TRUE))
# 2. Drop English stopwords.
dtm <- dfm_remove(dtm, pattern = stopwords("en"))
# 3. Reduce the remaining features to their word stems.
dtm <- dfm_wordstem(dtm)
dtm
## Document-feature matrix of: 21 documents, 89 features (89.41% sparse) and 2 docvars.
##        features
## docs    person use violenc threat forc woman sexual intercours marriag shall
##   text1      1   1       2      1    1     1      1          1       1     1
##   text2      0   0       0      0    0     0      0          0       0     0
##   text3      0   0       0      0    0     2      0          1       1     0
##   text4      0   0       0      0    0     2      0          1       1     0
##   text5      0   0       0      1    0     1      0          1       0     0
##   text6      0   0       0      0    0     3      1          1       0     0
## [ reached max_ndoc ... 15 more documents, reached max_nfeat ... 79 more features ]

Analysis

A word cloud shows the most frequent words used in the corpus.

# Draw a word cloud from the document-feature matrix.
textplot_wordcloud(dtm)

# List the ten most frequent features.
textstat_frequency(dtm, n = 10)

Comparing Corpora

# For each year of interest, mark the documents from that year as the target
# group, keep a subset of the matrix for them, and plot the keyness of the
# target group against all remaining documents.
# `%in%` is used instead of `==` so that a document with a missing year gives
# FALSE rather than NA in the selector.

# Documents dated 2023 versus the rest.
is_2023 <- docvars(dtm)[["year"]] %in% "2023"
final_dtm <- dtm[is_2023, ]
ts <- textstat_keyness(dtm, target = is_2023)
textplot_keyness(ts)

# Documents dated 2015 versus the rest.
is_2015 <- docvars(dtm)[["year"]] %in% "2015"
semifinal_dtm <- dtm[is_2015, ]
ts2 <- textstat_keyness(dtm, target = is_2015)
textplot_keyness(ts2)

# Documents dated 1915 versus the rest.
is_1915 <- docvars(dtm)[["year"]] %in% "1915"
initial_dtm <- dtm[is_1915, ]
ts3 <- textstat_keyness(dtm, target = is_1915)
textplot_keyness(ts3)

Keyword in context

# Tokenise the corpus once and reuse the result for every keyword search,
# instead of re-tokenising inside each kwic() call.
corp_tokens <- tokens(corp)

# Each search keeps five tokens of context on either side of the keyword.
woman <- kwic(corp_tokens, pattern = "woman", window = 5)
head(woman, 10)    ## only view first 10 results
man <- kwic(corp_tokens, pattern = "man", window = 5)
head(man, 10)    ## only view first 10 results
intercourse <- kwic(corp_tokens, pattern = "intercourse", window = 5)
head(intercourse, 10)    ## only view first 10 results
person <- kwic(corp_tokens, pattern = "person", window = 5)
head(person, 10)    ## only view first 10 results