library(readxl)
library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
## Load the spreadsheet and turn it into a quanteda corpus.
## NOTE(review): the working directory is hard-coded to one machine's path;
## confirm before running elsewhere.
setwd("~/Desktop/Rstudio")
d <- read_excel(path = "test.xlsx")
head(x = d)
## The "text" column supplies the document text of the corpus.
corp <- corpus(x = d, text_field = "text")
corp
## Corpus consisting of 21 documents and 2 docvars.
## text1 :
## "Any person who by using violence or threat of violence force..."
##
## text2 :
## "Sentenced for committing the crime of rape, with imprisonmen..."
##
## text3 :
## "a man who has intercourse with a woman outside of marriage, ..."
##
## text4 :
## "a man who have intercourse with a woman outside of marriage,..."
##
## text5 :
## "a man who have intercourse with women, with the consent of t..."
##
## text6 :
## "a man who has sexual intercourse with a woman, with the woma..."
##
## [ reached max_ndoc ... 15 more documents ]
## Preprocessing
## Build the document-feature matrix with the non-deprecated quanteda
## pipeline: tokenise first (dropping punctuation), convert to a dfm, then
## remove English stopwords and stem the remaining features. This replaces
## the single dfm(corp, stem =, remove =, remove_punct =) call, which relied
## on deprecated arguments and emitted deprecation warnings.
dtm <- dfm(tokens(corp, remove_punct = TRUE))
## Stopwords are removed before stemming so they still match their
## unstemmed dictionary forms.
dtm <- dfm_remove(dtm, pattern = stopwords("en"))
dtm <- dfm_wordstem(dtm)
dtm
## Document-feature matrix of: 21 documents, 89 features (89.41% sparse) and 2 docvars.
## features
## docs person use violenc threat forc woman sexual intercours marriag shall
## text1 1 1 2 1 1 1 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 2 0 1 1 0
## text4 0 0 0 0 0 2 0 1 1 0
## text5 0 0 0 1 0 1 0 1 0 0
## text6 0 0 0 0 0 3 1 1 0 0
## [ reached max_ndoc ... 15 more documents, reached max_nfeat ... 79 more features ]
## Using a word cloud to show the most frequent words in the corpus
## Plot a word cloud of the features in the document-feature matrix.
## (The stray trailing comma, which passed an empty argument, is removed.)
textplot_wordcloud(dtm)
## Tabulate the 10 most frequent features.
textstat_frequency(dtm, n = 10)
## Keyness analysis by year: for each year, flag its documents as the target
## group, keep the matching rows of the dfm, compute keyness of the target
## against the full dfm, and plot the result.

## Year 2023
is_2023 <- docvars(dtm, "year") == "2023"
final_dtm <- dtm[is_2023, ]
ts <- textstat_keyness(dtm, target = is_2023)
textplot_keyness(ts)

## Year 2015
is_2015 <- docvars(dtm, "year") == "2015"
semifinal_dtm <- dtm[is_2015, ]
ts2 <- textstat_keyness(dtm, target = is_2015)
textplot_keyness(ts2)

## Year 1915
is_1915 <- docvars(dtm, "year") == "1915"
initial_dtm <- dtm[is_1915, ]
ts3 <- textstat_keyness(dtm, target = is_1915)
textplot_keyness(ts3)
## Keyword-in-context (KWIC) lookups on the raw corpus tokens.
## The corpus is tokenised once and reused for every keyword instead of
## re-tokenising it on each call.
corp_tokens <- tokens(corp)

## Each lookup shows the keyword with a window of 5 tokens on either side.
woman <- kwic(corp_tokens, pattern = "woman", window = 5)
head(woman, 10) ## only view first 10 results
man <- kwic(corp_tokens, pattern = "man", window = 5)
head(man, 10) ## only view first 10 results
intercourse <- kwic(corp_tokens, pattern = "intercourse", window = 5)
head(intercourse, 10) ## only view first 10 results
person <- kwic(corp_tokens, pattern = "person", window = 5)
head(person, 10) ## only view first 10 results