library(quanteda)
## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library(readtext)
## 
## Attaching package: 'readtext'
## The following object is masked from 'package:quanteda':
## 
##     texts
library(quanteda.textplots)
library(quanteda.textstats)
library(quanteda.textmodels)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(seededlda)
## 
## Attaching package: 'seededlda'
## The following object is masked from 'package:quanteda':
## 
##     info_tbb
## The following object is masked from 'package:stats':
## 
##     terms
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(lexicon)
library(FactoMineR)
library(ggplot2)
library(topicmodels)
## 
## Attaching package: 'topicmodels'
## The following objects are masked from 'package:seededlda':
## 
##     perplexity, topics
library(spacyr)
files <- list.files("corpus")

imported <- list()

for (i in 1:length(files)) {
  imported[[i]] <- readtext(paste0("corpus/", files[i]))
}
doc_id <- c()
text <- c()

for (i in 1:length(imported)) {
  doc_id <- c(doc_id, imported[[i]]$doc_id)
  text <- c(text, imported[[i]]$text)
}

grp <- c("A","A","B","B", "B")
data <- data.frame(doc_id, text, grp)
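
As a side note, the per-file loop above can be collapsed into a single call, since readtext accepts glob patterns; a minimal sketch (assuming every file in corpus/ is a format readtext can parse; imported_all and data_alt are just illustrative names):

# Read the whole folder at once; readtext returns a data frame with
# doc_id and text columns, so it can be combined with grp directly.
imported_all <- readtext("corpus/*")
data_alt <- data.frame(imported_all, grp = c("A", "A", "B", "B", "B"))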

Corpus

corp <- data %>% corpus(text_field = "text")

DFM with punct

output <- corp %>%
  tokens(remove_punct = FALSE) %>%
  dfm() %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##        doc_id how currently manage your finances  ?  u sure i’m michael 34
## 1 user#1.docx   6         2      2   10        2 13 15    2   6       2  1
## 2 user#2.docx   5         2      0    6        0 12 13    1   3       0  0
## 3 user#3.docx   5         0      1    3        2 11 12    3   3       0  0
## 4 user#4.docx   5         0      1    3        2 11 12    3   3       0  0
## 5 user#5.docx   4         0      1    1        0 11 12    2   1       0  0
#write.csv(df, file = "1_dfm_with_punct.csv", row.names = FALSE)

Without punct

output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  dfm() %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##        doc_id your finances  u sure i’m michael 34 work in marketing i’ve
## 1 user#1.docx   10        2 15    2   6       2  1    3 10         1    2
## 2 user#2.docx    6        0 13    1   3       0  0    4  8         0    1
## 3 user#3.docx    3        2 12    3   3       0  0    0  7         0    1
## 4 user#4.docx    3        2 12    3   3       0  0    0  7         0    1
## 5 user#5.docx    1        0 12    2   1       0  0    2  6         0    0
#write.csv(df, file = "2_dfm_without_punct.csv", row.names = FALSE)

Lemma

output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##        doc_id your finance  u sure i’m michael 34 work in market i’ve
## 1 user#1.docx   10       2 15    2   6       2  1    4 10      1    2
## 2 user#2.docx    6       0 13    1   3       0  0    6  8      0    1
## 3 user#3.docx    3       2 12    3   3       0  0    1  7      0    1
## 4 user#4.docx    3       2 12    3   3       0  0    1  7      0    1
## 5 user#5.docx    1       0 12    2   1       0  0    3  6      0    0
#write.csv(df, file = "3_lemma.csv", row.names = FALSE)

Stem

output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##        doc_id your financ  u sure i’m michael 34 work in market i’ve
## 1 user#1.docx   10      2 15    2   6       2  1    4 10      1    2
## 2 user#2.docx    6      0 13    1   3       0  0    6  8      0    1
## 3 user#3.docx    3      2 12    3   3       0  0    1  7      0    1
## 4 user#4.docx    3      2 12    3   3       0  0    1  7      0    1
## 5 user#5.docx    2      0 12    2   1       0  0    3  6      0    0
#write.csv(df, file = "4_stem.csv", row.names = FALSE)

Remove stopwords

output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##        doc_id six year now term good guess i’d say alway kind in-between
## 1 user#1.docx   2    2   4    1    5     2   6   4     2    5          1
## 2 user#2.docx   1    3   3    0    5     1   3   1     3    1          0
## 3 user#3.docx   1    2   1    1    1     0   3   0     1    0          0
## 4 user#4.docx   1    2   1    1    1     0   3   0     1    0          0
## 5 user#5.docx   0    0   1    0    3     0   4   1     1    0          0
#write.csv(df, file = "5_stopwords.csv", row.names = FALSE)

FCM

output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  fcm() %>%
  convert(to = "data.frame")

df <- output[, c(1, 25:35)]
head(df)
##   doc_id six year now term good guess i’d say alway kind in-between
## 1  thank  10   18  20    6   30     6  38  12    16   12          2
## 2   much  36   60  75   18  107    28 120  51    55   58         10
## 3   join   4    6   6    3    7     2  12   4     4    5          1
## 4     us   4    6   7    3   10     2  16   5     5    5          1
## 5  today   7   11  14    4   20     5  25  10    10   11          2
## 6  start   5    9  10    3   15     3  19   6     8    6          1
#write.csv(df, file = "6_fcm.csv", row.names = FALSE)
output <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  kwic(pattern = "need", window = 7) %>%
  head(10)

df <- as.data.frame(output)
output
## Keyword-in-context with 6 matches.                                                                             
##   [user#1.docx, 158]             the main thing be just visibility I | need |
##  [user#1.docx, 1036] notification sometimes they’re too much I don’t | need |
##   [user#2.docx, 733]                    up with me As a freelancer I | need |
##   [user#3.docx, 105]                      when I be work but I still | need |
##   [user#3.docx, 252]                   that’s what I’ll rely on if I | need |
##   [user#3.docx, 569]       text clear button few distraction I don’t | need |
##                                         
##  to know what’s come in what’s go       
##  a ping every time I buy a              
##  thing to be super clear and real-time  
##  to keep a eye on thing I               
##  something extra I How easy or difficult
##  flashy chart or animation Just show me
#write.csv(df, file = "7_keywords_in_context.csv", row.names = FALSE)

Wordcloud

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  dfm_group(groups = grp) %>%
  textplot_wordcloud(max_words = 80, comparison = TRUE, rotation = 0, min_size = 0.85)
## Warning in wordcloud_comparison(x, min_size, max_size, min_count, max_words, :
## notification could not be fit on page. It will not be plotted.
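
The warning means the largest term ("notification") did not fit in the plotting region; shrinking the size range usually fixes it. A sketch of the same call with a smaller max_size (the value is a guess to tune by eye):

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  dfm_group(groups = grp) %>%
  textplot_wordcloud(max_words = 80, comparison = TRUE, rotation = 0,
                     min_size = 0.85, max_size = 3)  # cap word size so long terms fit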

Textplot Network

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  fcm() %>%
  textplot_network(min_freq = 120)

Term frequency

d <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  textstat_frequency(n = 20)

d %>%
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
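
textstat_frequency() can also split the counts by group, which makes it easy to compare A and B side by side; a minimal sketch (top 10 stems per group; d_grp is just an illustrative name):

d_grp <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  textstat_frequency(n = 10, groups = grp)  # one ranking per group

head(d_grp, 20)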

Correspondence analysis

tmod_ca <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  textmodel_ca()

dat_ca <- data.frame(dim1 = coef(tmod_ca, doc_dim = 1)$coef_document, 
                     dim2 = coef(tmod_ca, doc_dim = 2)$coef_document)

ggplot(dat_ca, aes(x = dim1, y = dim2, label = rownames(dat_ca))) +
  geom_text(color = rgb(0, 0, 0, 0.7), size = 5) +
  coord_cartesian(xlim = c(-2, 2), ylim = c(-2, 2)) +
  labs(x = "Dimension 1", y = "Dimension 2") +
  theme_minimal()

Wordcloud (stemmed)

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  textplot_wordcloud(max_words = 80, rotation = 0, min_size = 0.85)

Sentiment (LSD2015)

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  tokens_lookup(dictionary =  data_dictionary_LSD2015) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  mutate(ratio = round(positive / (negative+positive), 2))
##        doc_id negative positive neg_positive neg_negative ratio
## 1 user#1.docx       25       66            0            0  0.73
## 2 user#2.docx       20       57            1            0  0.74
## 3 user#3.docx       15       39            0            0  0.72
## 4 user#4.docx       15       39            0            0  0.72
## 5 user#5.docx       13       38            0            0  0.75
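
The LSD2015 lookup also returns neg_positive and neg_negative counts (negated phrases). A sketch of one common adjustment, folding negated terms into the opposite polarity before computing the ratio; treat this as an assumption about how negation should be scored rather than the only option:

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  tokens_lookup(dictionary = data_dictionary_LSD2015) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  mutate(
    pos_adj = positive + neg_negative,  # negated negatives read as positive
    neg_adj = negative + neg_positive,  # negated positives read as negative
    ratio   = round(pos_adj / (pos_adj + neg_adj), 2)
  )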

Collocations

corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  textstat_collocations(size = 2) %>%
  as_tibble() %>%
  head()
## # A tibble: 6 × 6
##   collocation count count_nested length lambda     z
##   <chr>       <int>        <int>  <dbl>  <dbl> <dbl>
## 1 the tool       39            0      2   4.50 14.6 
## 2 the app        26            0      2   4.55 11.7 
## 3 how do         10            0      2   4.90 10.8 
## 4 can you        14            0      2   3.64 10.4 
## 5 time when       8            0      2   4.95 10.0 
## 6 tool u         13            0      2   3.30  9.64
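
Strong collocations can be folded back into the tokens as single multi-word units before building a dfm; a sketch using tokens_compound() on the pairs with the highest z-scores (the min_count and z > 3 cut-offs are arbitrary choices):

toks <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  )

colls <- textstat_collocations(toks, size = 2, min_count = 5)
toks_comp <- tokens_compound(toks, pattern = colls[colls$z > 3, ])  # e.g. "the_tool"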

LDA

dfm <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en"))

lda_model <- LDA(convert(dfm, to = "topicmodels"), k = 8)

terms(lda_model, 10)  # the 10 most representative terms per topic
##       Topic 1   Topic 2    Topic 3   Topic 4   Topic 5  Topic 6  Topic 7  
##  [1,] "like"    "can"      "like"    "u"       "like"   "like"   "u"      
##  [2,] "u"       "much"     "really"  "tool"    "easy"   "every"  "tool"   
##  [3,] "app"     "like"     "feel"    "app"     "little" "can"    "can"    
##  [4,] "much"    "it’s"     "pot"     "like"    "much"   "month"  "family" 
##  [5,] "tool"    "really"   "just"    "make"    "app"    "feel"   "like"   
##  [6,] "just"    "time"     "grocery" "payment" "tool"   "much"   "kid"    
##  [7,] "can"     "tool"     "time"    "can"     "just"   "i’d"    "app"    
##  [8,] "it’s"    "tax"      "can"     "month"   "mind"   "tool"   "doesn’t"
##  [9,] "balance" "actually" "u"       "tax"     "people" "really" "it’s"   
## [10,] "i’m"     "get"      "one"     "work"    "it’s"   "app"    "money"  
##       Topic 8
##  [1,] "u"    
##  [2,] "tool" 
##  [3,] "like" 
##  [4,] "use"  
##  [5,] "need" 
##  [6,] "make" 
##  [7,] "just" 
##  [8,] "help" 
##  [9,] "can"  
## [10,] "bank"

Keyness

dfm <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_replace(
    pattern     = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en"))

# Keyness test (χ²): target = group A
res <- textstat_keyness(dfm, target = corp$grp == "A", measure = "chi2")
# Top terms most distinctive of group A
head(res, 10)
##      feature     chi2          p n_target n_reference
## 1        tax 5.774475 0.01626058        7           0
## 2       kind 4.690878 0.03032315        6           0
## 3       late 4.690878 0.03032315        6           0
## 4        app 3.329906 0.06803075       22          13
## 5      month 2.805438 0.09394520        8           3
## 6  currently 2.562188 0.10944704        4           0
## 7       okay 2.562188 0.10944704        4           0
## 8    problem 2.562188 0.10944704        4           0
## 9      super 2.562188 0.10944704        4           0
## 10      much 2.475898 0.11560340       18          11
# Keyness test (χ²): target = group B
res <- textstat_keyness(dfm, target = corp$grp == "B", measure = "chi2")
# Top terms most distinctive of group B
head(res, 10)
##    feature     chi2          p n_target n_reference
## 1      use 6.317045 0.01195828       20           6
## 2     need 5.360684 0.02059570       13           3
## 3   family 4.598565 0.03199873        7           0
## 4        £ 4.268824 0.03881779        9           1
## 5   simple 3.711352 0.05404359        6           0
## 6     text 3.711352 0.05404359        6           0
## 7      big 3.457313 0.06297259        8           1
## 8     give 3.457313 0.06297259        8           1
## 9  grocery 3.457313 0.06297259        8           1
## 10     kid 2.834428 0.09226366        5           0
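
The keyness results can also be plotted directly; a sketch with textplot_keyness() from quanteda.textplots, applied to the group-B run stored in res:

textplot_keyness(res, n = 15)  # bars for target (B) vs. reference (A) terms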

Spacy

spacy_install()
## Warning in spacy_install(): Skipping installation. Use `force` to force
## installation or update. Or use `spacy_download_langmodel()` if you just want to
## install a model.
spacy_initialize()  # once per session
## successfully initialized (spaCy Version: 3.8.7, language model: en_core_web_sm)

Part-of-Speech (POS) Tagging

parsed <- spacy_parse(text, pos = TRUE, tag = TRUE, lemma = TRUE)

df <- parsed %>%
  subset(pos == "ADJ") %>%
  count(lemma, sort = TRUE)

# breakdown by POS
#table(parsed$pos)

# rebuild tokens/dfm in quanteda
#toks <- tokens(parsed$token, what = "word")
#dfmat <- dfm(tokens(text))  # or work directly from `parsed`

head(df)
##     lemma n
## 1     big 9
## 2  little 9
## 3    easy 8
## 4    last 8
## 5    sure 8
## 6 digital 7
#write.csv(df, file = "pos_tagging.csv", row.names = FALSE)

Named Entity Recognition

# Parse the texts and tag entities
parsed <- spacy_parse(text, entity = TRUE)
ents   <- entity_consolidate(parsed)        # merge multi-word entities into single tokens

# View the entities
df <- subset(ents, entity_type != "") %>% count(entity_type, token, sort = TRUE)
# or per document:
#subset(ents, entity_type != "") %>% count(doc_id, entity_type, sort = TRUE)

head(df)
##   entity_type      token n
## 1        DATE      today 6
## 2        DATE this_month 4
## 3     ORDINAL      First 4
## 4    CARDINAL        six 3
## 5    CARDINAL        ten 3
## 6    CARDINAL      three 3
#write.csv(df, file = "named_entity_recognition.csv", row.names = FALSE)