library(quanteda)
## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library(readtext)
##
## Attaching package: 'readtext'
## The following object is masked from 'package:quanteda':
##
## texts
library(quanteda.textplots)
library(quanteda.textstats)
library(quanteda.textmodels)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(seededlda)
##
## Attaching package: 'seededlda'
## The following object is masked from 'package:quanteda':
##
## info_tbb
## The following object is masked from 'package:stats':
##
## terms
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(lexicon)
library(FactoMineR)
library(ggplot2)
library(topicmodels)
##
## Attaching package: 'topicmodels'
## The following objects are masked from 'package:seededlda':
##
## perplexity, topics
library(spacyr)
files <- list.files("corpus")
imported <- list()
for (i in seq_along(files)) {
  imported[[i]] <- readtext(paste0("corpus/", files[i]))
}
doc_id <- c()
text <- c()
for (i in seq_along(imported)) {
  doc_id <- c(doc_id, imported[[i]]$doc_id)
  text <- c(text, imported[[i]]$text)
}
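As an aside, readtext can also expand a file glob itself, which removes the need for the explicit loops above; a minimal alternative sketch assuming the same corpus/ folder layout (not run; imported_all and data_alt are illustrative names):
#imported_all <- readtext("corpus/*")
#data_alt <- data.frame(doc_id = imported_all$doc_id,
#                       text = imported_all$text,
#                       grp = c("A", "A", "B", "B", "B"))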
grp <- c("A","A","B","B", "B")
data <- data.frame(doc_id, text, grp)
Corpus
corp <- data %>% corpus(text_field = "text")
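A quick sanity check on the corpus and its document variables can be run at this point (not run here):
#summary(corp)
#docvars(corp)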
DFM with punctuation
output <- corp %>%
tokens(remove_punct = FALSE) %>%
dfm() %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id how currently manage your finances ? u sure i’m michael 34
## 1 user#1.docx 6 2 2 10 2 13 15 2 6 2 1
## 2 user#2.docx 5 2 0 6 0 12 13 1 3 0 0
## 3 user#3.docx 5 0 1 3 2 11 12 3 3 0 0
## 4 user#4.docx 5 0 1 3 2 11 12 3 3 0 0
## 5 user#5.docx 4 0 1 1 0 11 12 2 1 0 0
#write.csv(df, file = "1_dfm_with_punct.csv", row.names = FALSE)
DFM without punctuation
output <- corp %>%
tokens(remove_punct = TRUE) %>%
dfm() %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id your finances u sure i’m michael 34 work in marketing i’ve
## 1 user#1.docx 10 2 15 2 6 2 1 3 10 1 2
## 2 user#2.docx 6 0 13 1 3 0 0 4 8 0 1
## 3 user#3.docx 3 2 12 3 3 0 0 0 7 0 1
## 4 user#4.docx 3 2 12 3 3 0 0 0 7 0 1
## 5 user#5.docx 1 0 12 2 1 0 0 2 6 0 0
#write.csv(df, file = "2_dfm_without_punct.csv", row.names = FALSE)
Lemma
output <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
dfm() %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id your finance u sure i’m michael 34 work in market i’ve
## 1 user#1.docx 10 2 15 2 6 2 1 4 10 1 2
## 2 user#2.docx 6 0 13 1 3 0 0 6 8 0 1
## 3 user#3.docx 3 2 12 3 3 0 0 1 7 0 1
## 4 user#4.docx 3 2 12 3 3 0 0 1 7 0 1
## 5 user#5.docx 1 0 12 2 1 0 0 3 6 0 0
#write.csv(df, file = "3_lemma.csv", row.names = FALSE)
Stem
output <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id your financ u sure i’m michael 34 work in market i’ve
## 1 user#1.docx 10 2 15 2 6 2 1 4 10 1 2
## 2 user#2.docx 6 0 13 1 3 0 0 6 8 0 1
## 3 user#3.docx 3 2 12 3 3 0 0 1 7 0 1
## 4 user#4.docx 3 2 12 3 3 0 0 1 7 0 1
## 5 user#5.docx 2 0 12 2 1 0 0 3 6 0 0
#write.csv(df, file = "4_stem.csv", row.names = FALSE)
Remove stopwords
output <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id six year now term good guess i’d say alway kind in-between
## 1 user#1.docx 2 2 4 1 5 2 6 4 2 5 1
## 2 user#2.docx 1 3 3 0 5 1 3 1 3 1 0
## 3 user#3.docx 1 2 1 1 1 0 3 0 1 0 0
## 4 user#4.docx 1 2 1 1 1 0 3 0 1 0 0
## 5 user#5.docx 0 0 1 0 3 0 4 1 1 0 0
#write.csv(df, file = "5_stopwords.csv", row.names = FALSE)
FCM
output <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
fcm() %>%
convert(to = "data.frame")
df <- output[, c(1, 25:35)]
head(df)
## doc_id six year now term good guess i’d say alway kind in-between
## 1 thank 10 18 20 6 30 6 38 12 16 12 2
## 2 much 36 60 75 18 107 28 120 51 55 58 10
## 3 join 4 6 6 3 7 2 12 4 4 5 1
## 4 us 4 6 7 3 10 2 16 5 5 5 1
## 5 today 7 11 14 4 20 5 25 10 10 11 2
## 6 start 5 9 10 3 15 3 19 6 8 6 1
#write.csv(df, file = "6_fcm.csv", row.names = FALSE)
Keywords in context (KWIC)
output <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
kwic(pattern = "need", window = 7) %>%
head(10)
df <- as.data.frame(output)
output
## Keyword-in-context with 6 matches.
## [user#1.docx, 158] the main thing be just visibility I | need |
## [user#1.docx, 1036] notification sometimes they’re too much I don’t | need |
## [user#2.docx, 733] up with me As a freelancer I | need |
## [user#3.docx, 105] when I be work but I still | need |
## [user#3.docx, 252] that’s what I’ll rely on if I | need |
## [user#3.docx, 569] text clear button few distraction I don’t | need |
##
## to know what’s come in what’s go
## a ping every time I buy a
## thing to be super clear and real-time
## to keep a eye on thing I
## something extra I How easy or difficult
## flashy chart or animation Just show me
#write.csv(df, file = "7_keywords_in_context.csv", row.names = FALSE)
Comparison wordcloud (by group)
corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
dfm_group(groups = grp) %>%
textplot_wordcloud(max_words = 80, comparison = TRUE, rotation = 0, min_size = 0.85)
## Warning in wordcloud_comparison(x, min_size, max_size, min_count, max_words, :
## notification could not be fit on page. It will not be plotted.

Textplot Network
corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
fcm() %>%
textplot_network(min_freq = 120)

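If the full network is too dense, one option is to restrict it to the most frequent features before plotting; a sketch following the quanteda examples (not run; fcmat and feat are illustrative names, and the 30-feature cut and min_freq value are arbitrary):
#fcmat <- corp %>%
#tokens(remove_punct = TRUE) %>%
#dfm() %>%
#dfm_remove(pattern = stopwords("en")) %>%
#fcm()
#feat <- names(topfeatures(fcmat, 30))
#textplot_network(fcm_select(fcmat, pattern = feat), min_freq = 0.5)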
Term frequency
d <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
textstat_frequency(n = 20)
d %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()

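textstat_frequency also accepts a groups argument, so the same ranking can be computed per group; a sketch reusing the pipeline above (not run; dfmat_freq is an illustrative name):
#dfmat_freq <- corp %>%
#tokens(remove_punct = TRUE) %>%
#tokens_replace(
#pattern = lexicon::hash_lemmas$token,
#replacement = lexicon::hash_lemmas$lemma
#) %>%
#tokens_wordstem() %>%
#dfm() %>%
#dfm_remove(pattern = stopwords("en"))
#textstat_frequency(dfmat_freq, n = 20, groups = grp)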
Correspondence analysis
tmod_ca <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
textmodel_ca()
dat_ca <- data.frame(dim1 = coef(tmod_ca, doc_dim = 1)$coef_document,
dim2 = coef(tmod_ca, doc_dim = 2)$coef_document)
ggplot(dat_ca, aes(x = dim1, y = dim2, label = rownames(dat_ca))) +
geom_text(color = rgb(0, 0, 0, 0.7), size = 5) +
coord_cartesian(xlim = c(-2, 2), ylim = c(-2, 2)) +
labs(x = "Dimension 1", y = "Dimension 2") +
theme_minimal()

Wordcloud (full corpus)
corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
dfm() %>%
dfm_remove(pattern = stopwords("en")) %>%
textplot_wordcloud(max_words = 80, rotation = 0, min_size = 0.85)

Sentiment (LSD2015 dictionary)
corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
tokens_wordstem() %>%
tokens_lookup(dictionary = data_dictionary_LSD2015) %>%
dfm() %>%
convert(to = "data.frame") %>%
mutate(ratio = round(positive / (negative+positive), 2))
## doc_id negative positive neg_positive neg_negative ratio
## 1 user#1.docx 25 66 0 0 0.73
## 2 user#2.docx 20 57 1 0 0.74
## 3 user#3.docx 15 39 0 0 0.72
## 4 user#4.docx 15 39 0 0 0.72
## 5 user#5.docx 13 38 0 0 0.75
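The same dictionary counts can also be aggregated by group before computing the ratio, if a group-level comparison is wanted; a sketch reusing the pipeline above (not run):
#corp %>%
#tokens(remove_punct = TRUE) %>%
#tokens_lookup(dictionary = data_dictionary_LSD2015) %>%
#dfm() %>%
#dfm_group(groups = grp) %>%
#convert(to = "data.frame") %>%
#mutate(ratio = round(positive / (negative + positive), 2))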
Collocations
corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
textstat_collocations(size = 2) %>%
as_tibble() %>%
head()
## # A tibble: 6 × 6
## collocation count count_nested length lambda z
## <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 the tool 39 0 2 4.50 14.6
## 2 the app 26 0 2 4.55 11.7
## 3 how do 10 0 2 4.90 10.8
## 4 can you 14 0 2 3.64 10.4
## 5 time when 8 0 2 4.95 10.0
## 6 tool u 13 0 2 3.30 9.64
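Strong collocations can be compounded into single tokens before building a DFM, so that pairs such as "the tool" stay together downstream; a sketch following the usual quanteda pattern (not run; toks, colls and toks_comp are illustrative names and the z > 3 cut-off is arbitrary):
#toks <- corp %>%
#tokens(remove_punct = TRUE) %>%
#tokens_replace(
#pattern = lexicon::hash_lemmas$token,
#replacement = lexicon::hash_lemmas$lemma
#)
#colls <- textstat_collocations(toks, size = 2)
#toks_comp <- tokens_compound(toks, pattern = phrase(colls$collocation[colls$z > 3]))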
LDA
dfmat <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
dfm() %>%
dfm_remove(pattern = stopwords("en"))
lda_model <- LDA(convert(dfmat, to = "topicmodels"), k = 8)
terms(lda_model, 10) # the 10 most representative words per topic
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "like" "can" "like" "u" "like" "like" "u"
## [2,] "u" "much" "really" "tool" "easy" "every" "tool"
## [3,] "app" "like" "feel" "app" "little" "can" "can"
## [4,] "much" "it’s" "pot" "like" "much" "month" "family"
## [5,] "tool" "really" "just" "make" "app" "feel" "like"
## [6,] "just" "time" "grocery" "payment" "tool" "much" "kid"
## [7,] "can" "tool" "time" "can" "just" "i’d" "app"
## [8,] "it’s" "tax" "can" "month" "mind" "tool" "doesn’t"
## [9,] "balance" "actually" "u" "tax" "people" "really" "it’s"
## [10,] "i’m" "get" "one" "work" "it’s" "app" "money"
## Topic 8
## [1,] "u"
## [2,] "tool"
## [3,] "like"
## [4,] "use"
## [5,] "need"
## [6,] "make"
## [7,] "just"
## [8,] "help"
## [9,] "can"
## [10,] "bank"
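To see which topic dominates each interview, the document-topic assignments of the fitted model can be inspected with the topicmodels accessors; note that LDA() is stochastic, so a set.seed() call before fitting makes the output reproducible. For example (not run):
#topics(lda_model)                      # most likely topic per document
#round(posterior(lda_model)$topics, 2)  # full document-topic probability matrix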
Keyness
dfmat <- corp %>%
tokens(remove_punct = TRUE) %>%
tokens_replace(
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma
) %>%
dfm() %>%
dfm_remove(pattern = stopwords("en"))
# Keyness test (chi-squared): target = group A
res <- textstat_keyness(dfmat, target = corp$grp == "A", measure = "chi2")
# Top terms most distinctive of group A
head(res, 10)
## feature chi2 p n_target n_reference
## 1 tax 5.774475 0.01626058 7 0
## 2 kind 4.690878 0.03032315 6 0
## 3 late 4.690878 0.03032315 6 0
## 4 app 3.329906 0.06803075 22 13
## 5 month 2.805438 0.09394520 8 3
## 6 currently 2.562188 0.10944704 4 0
## 7 okay 2.562188 0.10944704 4 0
## 8 problem 2.562188 0.10944704 4 0
## 9 super 2.562188 0.10944704 4 0
## 10 much 2.475898 0.11560340 18 11
# Keyness test (chi-squared): target = group B
res <- textstat_keyness(dfmat, target = corp$grp == "B", measure = "chi2")
# Top terms most distinctive of group B
head(res, 10)
## feature chi2 p n_target n_reference
## 1 use 6.317045 0.01195828 20 6
## 2 need 5.360684 0.02059570 13 3
## 3 family 4.598565 0.03199873 7 0
## 4 £ 4.268824 0.03881779 9 1
## 5 simple 3.711352 0.05404359 6 0
## 6 text 3.711352 0.05404359 6 0
## 7 big 3.457313 0.06297259 8 1
## 8 give 3.457313 0.06297259 8 1
## 9 grocery 3.457313 0.06297259 8 1
## 10 kid 2.834428 0.09226366 5 0
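The keyness results can also be plotted directly with quanteda.textplots; for example (not run):
#textplot_keyness(res, n = 15)  # bar chart of the most distinctive terms per group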
Spacy
spacy_install()
## Warning in spacy_install(): Skipping installation. Use `force` to force
## installation or update. Or use `spacy_download_langmodel()` if you just want to
## install a model.
spacy_initialize() # once per session
## successfully initialized (spaCy Version: 3.8.7, language model: en_core_web_sm)
Part-of-Speech (POS) Tagging
parsed <- spacy_parse(text, pos = TRUE, tag = TRUE, lemma = TRUE)
df <- parsed %>%
subset(pos == "ADJ") %>%
count(lemma, sort = TRUE)
# breakdown by POS
#table(parsed$pos)
# rebuild tokens/dfm in quanteda
#toks <- tokens(parsed$token, what = "word")
#dfmat <- dfm(tokens(text)) # or work directly from `parsed`
head(df)
## lemma n
## 1 big 9
## 2 little 9
## 3 easy 8
## 4 last 8
## 5 sure 8
## 6 digital 7
#write.csv(df, file = "pos_tagging.csv", row.names = FALSE)
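The spaCy parse can also be converted straight back into quanteda tokens, which avoids re-tokenising the raw text; a sketch assuming spacyr's as.tokens() method (not run; toks_spacy is an illustrative name):
#toks_spacy <- as.tokens(parsed, use_lemma = TRUE)
#dfm(toks_spacy)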
Named Entity Recognition
# Parse with entity recognition
parsed <- spacy_parse(text, entity = TRUE)
ents <- entity_consolidate(parsed) # merges multi-word entities
# View the entities
df <- subset(ents, entity_type != "") %>% count(entity_type, token, sort = TRUE)
# or per document:
#subset(ents, entity_type != "") %>% count(doc_id, entity_type, sort = TRUE)
head(df)
## entity_type token n
## 1 DATE today 6
## 2 DATE this_month 4
## 3 ORDINAL First 4
## 4 CARDINAL six 3
## 5 CARDINAL ten 3
## 6 CARDINAL three 3
#write.csv(df, file = "named_entity_recognition.csv", row.names = FALSE)
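Once the spaCy analyses are finished, the background Python process can be shut down to free resources:
#spacy_finalize()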