Doing lexical and stylistic research

This document contains the materials for the lexical and stylistic research workshop organised by the Corpus Linguistics Association of Nigeria (CLAN 2025).

In this workshop session, we will learn how to use corpus methods to analyse texts using R, an open-source tool.

What to do before our session

Install R (https://cran.r-project.org/) and RStudio (https://posit.co/download/rstudio-desktop/) on your computer.

Create a folder for this workshop somewhere on your computer, e.g., clan or lex_research, and also create subfolders for data, output and images.

Download the data files (folder, data_1, data_2) and save them in the folder you created.

Basics

# Installing packages ----
# All packages used in this workshop: the text-analysis packages first,
# followed by the packages used for tables, plots and network analysis.
workshop_pkgs <- c(
  "tidyverse", "tidytext", "quanteda", "tm", "tokenizers", "udpipe", "here",
  "flextable", "GGally", "ggraph", "igraph", "Matrix", "sna", "tidygraph",
  "tibble", "quanteda.textplots", "spacyr", "textplot"
)

# Install only the packages that are not installed yet, so that re-running
# this chunk does not reinstall everything from scratch.
missing_pkgs <- setdiff(workshop_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

#loading packages
library(quanteda) #toolkit for quantitative text data analysis

## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1

## Parallel computing: disabled

## See https://quanteda.io for tutorials and examples.

library(tidyverse) #for text analysis and visualisations

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext) # for tidy conversion of texts
library(tm) # for textmining

## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:quanteda':
## 
##     stopwords

library(tokenizers) #converts natural language into tokens
library(udpipe) #NLP toolkit for 'tokenization', 'parts of speech tagging', 'lemmatization' and 'dependency parsing' of raw text.
library(here) #an easier way to find files

## here() starts at /Users/jolietemi/Documents/lex_research

library(magrittr) #easily use %>% (pipe == and then)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(spacyr)
library(readr)
library(purrr)

2. Text basics: Texts are called strings in R.

# Creating strings: a string can be written with double quotes "" or single
# quotes ''; both forms create a character value.
text1<-"A LITERATURE REVIEW IS A SURVEY"
text2<-'Chinua Achebe, Wole Soyinka'

tolower(text1) # convert to lower case

## [1] "a literature review is a survey"

toupper(text2) # convert to upper case

## [1] "CHINUA ACHEBE, WOLE SOYINKA"

# c() combines the two strings into one character vector with two elements
# (it does not paste them together into a single string)
text1_2<-c(text1, text2) # combine two strings into a vector
print(text1_2) # print the object

## [1] "A LITERATURE REVIEW IS A SURVEY" "Chinua Achebe, Wole Soyinka"

str(text1_2)     # inspect the structure of the object (type, length, values)

##  chr [1:2] "A LITERATURE REVIEW IS A SURVEY" "Chinua Achebe, Wole Soyinka"

summary(text1_2) # generate a summary of the object (length, class, mode)

##    Length     Class      Mode 
##         2 character character

Accessing text files

# Loading a single text file: readLines() returns one element per line
text3 <- readLines(here::here("data/pride_prejudice/chp1.txt"))

# Loading multiple text files from a folder
# The folder is listed once, so the texts and the names assigned below come
# from the same set of files and stay aligned.
pride_files <- list.files(
  here::here("data/pride_prejudice"),
  full.names = TRUE,
  pattern = "\\.txt$"  # only files whose name ends in ".txt"
)
# Read every file into a single string (one element per file)
pride <- purrr::map_chr(pride_files, readr::read_file)

# Add names: the file names without the ".txt" extension
names(pride) <- stringr::str_remove(basename(pride_files), "\\.txt$")

# Look at the first text by indexing the collection
pride[1]

# First and last four elements of the chapter that was read line by line
head(text3, n = 4)
tail(text3, n = 4)
# Structure of the object
str(text3)


# First and last three texts of the collection
head(pride, n = 3)
tail(pride, n = 3)

# The first text once more
pride[1]

# Save a single file
writeLines(pride[1], here::here("output", "pride_text1.txt"))

# Save multiple files into a folder
# Define one output path per text ("result_1.txt", "result_2.txt", ...).
# The folder is resolved with here::here() and joined with file.path(), so no
# leading "/" is needed and no doubled path separator is produced.
outs <- file.path(
  here::here("output"),
  paste0("result_", seq_along(pride), ".txt")
)

# Save the files (lapply() returns one NULL per file that was written)
lapply(seq_along(pride), function(i) {
  writeLines(pride[[i]], outs[i])
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL

Basic text processing using stringr and frequency analysis

# Use the stringr functions for basic text exploration, sorting etc.

stringr::str_count(pride[1:3]) # count the number of characters in elements 1:3
stringr::str_detect(pride, "sensible") # check if a word/pattern occurs in each text
stringr::str_extract_all(pride, "bad") # extract all occurrences of a word/pattern
stringr::str_remove_all(pride, "bad") # remove all occurrences of a word/pattern
stringr::str_replace_all(pride, "the", "THE") # replace all occurrences of a word/pattern with something else
stringr::str_view(pride, "sensible") # view the locations of all occurrences of a word/pattern
stringr::str_sort(pride) # sort the elements alphabetically (not by length)



# Extracting frequencies
# Turn the texts into a one-column data frame called 'text', split the texts
# into word tokens, count each token, and show the 15 most frequent ones.
pride %>%
  as.data.frame() %>%
  stats::setNames("text") %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::slice_head(n = 15)



# Extracting N-grams
# Same steps as for single-word frequencies, but markup in angle brackets is
# removed first and the texts are split into sequences of four words.
pride %>%
  as.data.frame() %>%
  stats::setNames("text") %>%
  dplyr::mutate(text = stringr::str_remove_all(text, "<.*?>")) %>%
  tidytext::unnest_tokens(
    output = word,
    input = text,
    token = "ngrams",
    n = 4
  ) %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::slice_head(n = 15) # the 15 most frequent 4-grams

Advanced text processing using quanteda

# Sentence tokenisation: split each text into sentences and flatten the
# result into one vector
pride_tok <- unlist(quanteda::tokenize_sentence(pride))

# pride_tok # inspect


# Stop word removal (English stop word list from tm)
pride_wostop <- pride %>%
  tm::removeWords(words = tm::stopwords(kind = "english"))
# pride_wostop # inspect

# Stripping extra whitespace from the texts without stop words
pride_wows <- pride_wostop %>%
  tm::stripWhitespace()
# pride_wows # inspect

# Removing punctuation (applied to the original texts)
pride_wopunct <- pride %>%
  tm::removePunctuation()
# pride_wopunct # inspect

# Stemming (applied to the original texts)
# NOTE(review): stemDocument() on a plain character vector may treat each
# element as a single word rather than stemming every word in the text;
# inspect pride_stem to confirm the result is as intended.
pride_stem <- pride %>%
  tm::stemDocument(language = "en")
# pride_stem # inspect

# Tokenisation, lemmatisation, POS tagging and parsing can be easily done
# via udpipe.

# Download the language model for English (English Web Treebank).
# The model is stored in the project root, which is where it is loaded from
# afterwards, and it is not downloaded again if the file already exists.
m_eng <- udpipe::udpipe_download_model(
  language = "english-ewt",
  model_dir = here::here(),
  overwrite = FALSE
)

## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /Users/jolietemi/Documents/lex_research/english-ewt-ud-2.5-191206.udpipe

##  - This model has been trained on version 2.5 of data from https://universaldependencies.org

##  - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0

##  - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.

##  - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')

## Downloading finished, model stored at '/Users/jolietemi/Documents/lex_research/english-ewt-ud-2.5-191206.udpipe'

# Load the language model from the project folder on your computer
m_eng <- here::here("english-ewt-ud-2.5-191206.udpipe") %>%
  udpipe::udpipe_load_model()

# Annotating the texts (tokenise, tag, parse): run the model over the texts,
# convert the result to a data frame and drop the 'sentence' column
pride_annotated <- udpipe::udpipe_annotate(object = m_eng, x = pride)
pride_annotated <- as.data.frame(pride_annotated)
pride_annotated <- dplyr::select(pride_annotated, -sentence)

# head(pride_annotated, 5) # inspect
str(pride_annotated)

## 'data.frame':    11804 obs. of  13 variables:
##  $ doc_id       : chr  "doc1" "doc1" "doc1" "doc1" ...
##  $ paragraph_id : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sentence_id  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ token_id     : chr  "1" "2" "3" "4" ...
##  $ token        : chr  "It" "is" "a" "truth" ...
##  $ lemma        : chr  "it" "be" "a" "truth" ...
##  $ upos         : chr  "PRON" "AUX" "DET" "NOUN" ...
##  $ xpos         : chr  "PRP" "VBZ" "DT" "NN" ...
##  $ feats        : chr  "Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs" "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin" "Definite=Ind|PronType=Art" "Number=Sing" ...
##  $ head_token_id: chr  "4" "4" "4" "0" ...
##  $ dep_rel      : chr  "nsubj" "cop" "det" "root" ...
##  $ deps         : chr  NA NA NA NA ...
##  $ misc         : chr  NA NA NA NA ...

# Combine the POS-tagged tokens back into a single string: every token is
# joined to its tag as "token_TAG", and the pairs are separated by blanks.
pride_postagged <- pride_annotated %>%
  dplyr::summarise(postxt = paste0(token, "_", xpos, collapse = " ")) %>%
  dplyr::pull(postxt)

# head(pride_postagged, 1) # inspect

# Visualising (syntactic) dependencies
# Annotate one example sentence and keep the result as a data frame
text4 <- as.data.frame(
  udpipe::udpipe_annotate(m_eng, x = "Mama gave the baby a gift")
)
# Generate the dependency plot
text4_plot <- textplot::textplot_dependencyparser(text4, size = 2.5)

## Loading required namespace: ggraph

# Show the plot
text4_plot

Text cleaning and Concordancing

# Cleaning the text

pride_clean <- pride %>%
  # replace anything in angle brackets with a blank
  stringr::str_replace_all(pattern = "<.*?>", replacement = " ") %>%
  # convert to lower case
  tolower() %>%
  # replace runs of characters that are neither alphanumeric nor punctuation
  # with a blank
  stringr::str_replace_all(
    pattern = "[^[:alnum:][:punct:]]+",
    replacement = " "
  ) %>%
  # remove double quotation marks
  stringr::str_remove_all(pattern = "\"") %>%
  # remove superfluous white spaces
  stringr::str_squish()

# Keep only the texts that are longer than 10 characters
long_enough <- nchar(pride_clean) > 10
pride_clean <- pride_clean[long_enough]

# Give every remaining text a unique name: text1, text2, ...
names(pride_clean) <- paste0("text", seq_along(pride_clean))

# Number of characters in each cleaned text
nchar(pride_clean)
# The third cleaned text
pride_clean[3]


# Concordancing
# Tokenise the cleaned texts and extract keyword-in-context (KWIC) lines for
# the pattern "father" (read as a regular expression), with three tokens of
# context on each side.
kwic_father <- pride_clean %>%
  quanteda::tokens() %>%
  quanteda::kwic(
    pattern = quanteda::phrase("father"),
    valuetype = "regex",
    window = 3
  ) %>%
  as.data.frame()
# First rows of the concordance
head(kwic_father)

# Save the KWIC output as a tab-separated text file
write.table(
  kwic_father,
  file = here::here("output", "kwic_father.txt"),
  sep = "\t"
)

# --- WORD CLOUD SECTION ---

# Paths of all Pride & Prejudice text files
files <- here::here("data/pride_prejudice") %>%
  list.files(pattern = ".*txt", full.names = TRUE)

# Read each file into one string and name it after its file
# (unique names are required for the quanteda corpus)
pride <- files %>%
  purrr::map_chr(readr::read_file) %>%
  stats::setNames(basename(files))

# Word cloud: build a corpus, tokenise without punctuation, drop English
# stop words, create a document-feature matrix and plot the 50 top words
pride %>%
  quanteda::corpus() %>%
  quanteda::tokens(remove_punct = TRUE) %>%
  quanteda::tokens_remove(pattern = quanteda::stopwords("en")) %>%
  quanteda::dfm() %>%
  quanteda.textplots::textplot_wordcloud(
    min_size = 1,
    max_size = 6,
    max_words = 50,
    color = scales::viridis_pal(option = "A")(10)
  )

Data Visualisation using ggplot2

Requirements: specify data, aesthetics and geometric shapes.

A statistical graphic is a mapping from data to aesthetic attributes (location, color, shape, size) of geometric objects (points, lines, bars).

ggplot(data, aes(x=, y=, color=, shape=, size=)) + geom_point(), or geom_histogram(), or geom_boxplot() or geom_line(), or geom_bar() etc.

# Read the socdat data set and look at its first rows
socdat <- utils::read.csv(file = here::here("data", "socs.csv"))
head(socdat)

# Line plots

# Words by Age
socdat %>%
  ggplot(mapping = aes(x = Age, y = Words)) +
  geom_line()

# Words by Age, one coloured line per Variant
socdat %>%
  ggplot(mapping = aes(x = Age, y = Words, colour = Variant)) +
  geom_line()

# Words by Category, drawn as a line
socdat %>%
  ggplot(mapping = aes(x = Category, y = Words)) +
  geom_line()

# Box plot: distribution of Words within each Category
socdat %>%
  ggplot(mapping = aes(x = Category, y = Words)) +
  geom_boxplot()

# Bar plot: number of rows per Category, filled by Variant
socdat %>%
  ggplot(mapping = aes(x = Category, fill = Variant)) +
  geom_bar()

# R session info (R version and loaded packages)

utils::sessionInfo()

Network Analysis using spacyr, quanteda.textplots, igraph, tidygraph and ggraph.

# activate packages
library(flextable)

## 
## Attaching package: 'flextable'

## The following object is masked from 'package:purrr':
## 
##     compose

library(GGally)
library(ggraph)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:flextable':
## 
##     compose

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(Matrix)

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(quanteda)
library(sna)

## Loading required package: statnet.common

## 
## Attaching package: 'statnet.common'

## The following objects are masked from 'package:base':
## 
##     attr, order, replace

## Loading required package: network

## 
## 'network' 1.20.0 (2026-02-06), part of the Statnet Project
## * 'news(package="network")' for changes since last version
## * 'citation("network")' for citation information
## * 'https://statnet.org' for help, support, and other information

## 
## Attaching package: 'network'

## The following objects are masked from 'package:igraph':
## 
##     %c%, %s%, add.edges, add.vertices, delete.edges, delete.vertices,
##     get.edge.attribute, get.edges, get.vertex.attribute, is.bipartite,
##     is.directed, list.edge.attributes, list.vertex.attributes,
##     set.edge.attribute, set.vertex.attribute

## sna: Tools for Social Network Analysis
## Version 2.8 created on 2024-09-07.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##  For citation information, type citation("sna").
##  Type help(package="sna") to get started.

## 
## Attaching package: 'sna'

## The following objects are masked from 'package:igraph':
## 
##     betweenness, bonpow, closeness, components, degree, dyad.census,
##     evcent, hierarchy, is.connected, neighborhood, triad.census

library(tidygraph)

## 
## Attaching package: 'tidygraph'

## The following object is masked from 'package:igraph':
## 
##     groups

## The following object is masked from 'package:quanteda':
## 
##     convert

## The following object is masked from 'package:stats':
## 
##     filter

library(tidyverse)
library(tibble)
library(spacyr)

Network Analysis - Extracting Named Entities

# Parse the texts with spaCy
pride_parsed <- spacyr::spacy_parse(x = pride)

## successfully initialized (spaCy Version: 3.8.9, language model: en_core_web_sm)

# Extract the named entities from the parsed texts
entities <- spacyr::entity_extract(pride_parsed)




# Attach to every token the token that precedes it within the same sentence
# of the same document (the first token of a sentence gets NA)
pride_parsed2 <- pride_parsed %>%
  dplyr::group_by(doc_id, sentence_id) %>%
  dplyr::mutate(prev_token = dplyr::lag(token, n = 1)) %>%
  dplyr::ungroup()
# Which named-entity tags occur in the data?
unique(pride_parsed2[["entity"]])

##  [1] ""              "ORDINAL_B"     "CARDINAL_B"    "CARDINAL_I"   
##  [5] "PERSON_B"      "DATE_B"        "DATE_I"        "FAC_B"        
##  [9] "FAC_I"         "GPE_B"         "ORG_B"         "PERSON_I"     
## [13] "QUANTITY_B"    "QUANTITY_I"    "TIME_B"        "TIME_I"       
## [17] "WORK_OF_ART_B" "WORK_OF_ART_I" "MONEY_B"       "MONEY_I"      
## [21] "ORG_I"         "NORP_B"        "LOC_B"

# Keep only the tokens whose entity tag marks them as (part of) a PERSON
persons_tokens <- dplyr::filter(
  pride_parsed2,
  entity %in% c("PERSON_B", "PERSON_I", "PERSON")
)

# Titles that may directly precede a name
titles <- c("Mr.", "Mrs.", "Miss", "Ms.", "Lady", "Sir")

# When a PERSON token directly follows one of the titles, prepend the title
# to the name; otherwise keep the token as it is
persons_with_titles <- persons_tokens %>%
  dplyr::mutate(
    entity_clean = dplyr::if_else(
      prev_token %in% titles,
      paste(prev_token, token),
      token
    )
  )

# First rows of the result
head(persons_with_titles)

# Count how many times each character name appears in each document and sort
# the names within every document from most to least frequent
pride_file_summary <- persons_with_titles %>%
  dplyr::count(doc_id, entity_clean, name = "contributions") %>%
  dplyr::arrange(doc_id, dplyr::desc(contributions))

pride_file_summary # inspect

# Export the results as a comma-separated .csv file (without row numbers),
# so that the file content matches its extension
utils::write.csv(
  pride_file_summary,
  file = here::here("output", "pride_file_summary.csv"),
  row.names = FALSE
)

Network Analysis - Co-occurrence matrix

# Load data
# The file is comma-separated, so read.csv() (decimal point) is used, and the
# path is resolved from the project root like the other files in this document
net_dat <- read.csv(here::here("data", "pride.csv"))
# Inspect data
head(net_dat, 15)

# Transform the table into a co-occurrence matrix
# table() cross-tabulates the first two columns; crossprod() then counts, for
# every pair of values in the second column, how often they share a value in
# the first column
net_cmx <- crossprod(table(net_dat[1:2]))
# A value co-occurring with itself is not of interest
diag(net_cmx) <- 0
net_df <- as.data.frame(net_cmx)
# Inspect the first five rows and columns, with the row names as a column
net_df[1:5, 1:5] %>%
  tibble::rownames_to_column("Persona")

Network Analysis - Visualisation

# Node table 'va', built from 'net_dat': rename 'person' to 'node' and
# 'occurrences' to 'n', then add up 'n' for every node
va <- net_dat %>%
  dplyr::rename(node = person, n = occurrences) %>%
  dplyr::group_by(node) %>%
  dplyr::summarise(n = sum(n))
# Inspect the last ten rows
tail(as.data.frame(va), 10)

# Create the edge table 'ed' from the co-occurrence data frame 'net_df'
ed <- net_df %>%
  # add a new column 'from' with the row names
  dplyr::mutate(from = rownames(.)) %>%
  # reshape from wide to long format: every column except 'from' becomes a
  # 'to'/'n' pair, so no column names need to be hard-coded
  # (gather() is kept so that row order and data-frame class stay the same)
  tidyr::gather(to, n, -from) %>%
  # remove zero frequencies
  dplyr::filter(n != 0)
# Inspect the first ten rows
head(ed, 10)

# Build an undirected igraph object from the edge table 'ed' and the node
# table 'va'
ig <- igraph::graph_from_data_frame(
  d = ed,
  directed = FALSE,
  vertices = va
)

# Convert to a tidygraph object and give every node a 'label' column that
# copies its name
tg <- ig %>%
  tidygraph::as_tbl_graph() %>%
  tidygraph::activate(nodes) %>%
  dplyr::mutate(label = name)

# Plot network
# Set seed (so that the exact same network graph is created every time)
set.seed(12345)


# Draw the graph 'tg' with the Fruchterman-Reingold layout
tg %>%
  ggraph::ggraph(layout = "fr") +

  # Add arcs for edges; width and transparency follow the edge attribute 'n'
  # (the third column of 'ed', carried into the graph when it was built)
  geom_edge_arc(
    aes(edge_width = n, edge_alpha = n),
    colour = "gray50",
    lineend = "round",
    strength = .1
  ) +

  # Add points for nodes, sized by the log-transformed node totals in 'va'.
  # The colour is mapped to the constant label "Lucasfamily".
  # NOTE(review): presumably a placeholder for a grouping variable such as a
  # family column; 'va' has no such column here.
  geom_node_point(
    size = log(va$n) * 2,
    aes(color = "Lucasfamily")
  ) +
  # Add text labels for nodes, sized by the square root of the node totals
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines"),
    size = sqrt(va$n),
    colour = "gray10"
  ) +

  # Adjust edge width and alpha scales
  scale_edge_width(range = c(0, 2.5)) +
  scale_edge_alpha(range = c(0, .3)) +

  # Set graph background color to white
  theme_graph(background = "white") +

  # Move the legend to the top and suppress the legend title
  theme(
    legend.position = "top",
    legend.title = element_blank()
  ) +
  # Remove edge width and alpha guides from the legend
  # ("none" replaces the deprecated FALSE)
  guides(
    edge_width = "none",
    edge_alpha = "none"
  )

# Expand the edge table: repeat every from/to pair as many times as its
# frequency 'n', keeping only the two node columns
dg <- ed[rep(seq_len(nrow(ed)), times = ed$n), c("from", "to")]
rownames(dg) <- NULL
# Inspect the first ten rows
head(as.data.frame(dg), 10)

# Build a directed graph from the expanded edge list
dgg <- igraph::graph_from_edgelist(as.matrix(dg), directed = TRUE)
# Extract degree centrality once and reuse it below
# (igraph:: is spelled out because sna also provides a degree() function)
deg <- igraph::degree(dgg)
dc_tbl <- deg %>%
  as.data.frame() %>%
  tibble::rownames_to_column("node") %>%
  dplyr::rename(`degree centrality` = 2) %>%
  dplyr::arrange(dplyr::desc(`degree centrality`))
# Inspect the ten nodes with the highest degree centrality
head(dc_tbl, 10)

# Name(s) of the node(s) with the maximum degree
names(deg)[deg == max(deg)]

## [1] "Lizzy_Elizabeth" "Mr. Bingley"

# Betweenness centrality per node, sorted from highest to lowest
bc_tbl <- igraph::betweenness(dgg) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("node") %>%
  dplyr::rename(`betweenness centrality` = 2) %>%
  dplyr::arrange(dplyr::desc(`betweenness centrality`))
# Inspect the first ten rows
head(bc_tbl, 10)

# Closeness centrality per node, sorted from highest to lowest
c_tbl <- igraph::closeness(dgg) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("node") %>%
  dplyr::rename(closeness = 2) %>%
  dplyr::arrange(dplyr::desc(closeness))
# Inspect the first ten rows
head(c_tbl, 10)

More Resources

One of the best websites for resources and tutorials on data and text analytics is the Language Technology and Data Analysis Laboratory (LADAL): https://ladal.edu.au/tutorials/introta/introta.html.

For more information about quanteda, see https://tutorials.quanteda.io/.

References

Benoit, K., Watanabe, K., Wang, H., Nulty, P., Obeng, A., Müller, S., & Matsuo, A. (2018). Quanteda: An R package for the quantitative analysis of textual data. Journal of Open Source Software, 3, 1-4.

Schweinberger, Martin. 2025. The Language Technology and Data Analysis Laboratory (LADAL). Brisbane: The University of Queensland, School of Languages and Cultures. url: https://ladal.edu.au/ (Version 2025.04.01).

Posit team (2025). RStudio: Integrated Development Environment for R.

R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/.

Schweinberger, Martin. 2024. Introduction to R for Social Science. University of Eastern Finland, Joensuu. url: https://martinschweinberger.github.io/IntroR_WS (Version 2024.06.11).