Technique d’analyse de texte (text mining)

Nous aurons besoin de quelques packages R pour cette analyse.

# Text-mining framework: provides Corpus(), tm_map(), TermDocumentMatrix()
# and findAssocs(), all used further down.
library(tm)
# NOTE(review): no RTextTools function is called in the visible part of this
# script -- confirm whether this package is still needed.
library(RTextTools)
# Presumably the stemming backend behind tm's stemDocument() -- TODO confirm.
library(SnowballC)
# Provides wordcloud(), used to draw the keyword cloud.
library(wordcloud)
# Fix the random seed so that random draws such as sample() are reproducible.
set.seed(1234)

D’abord charger et inspecter nos données.

# Read the search-term export as UTF-8 text, keeping keywords as plain
# character strings and trimming surrounding whitespace.
st.df <- read.csv(
  file = "C:/sb-waq/R/search-terms/search-terms.csv",
  header = TRUE,
  sep = ",",
  quote = "\"",
  encoding = "UTF-8",
  stringsAsFactors = FALSE,
  strip.white = TRUE,
  blank.lines.skip = TRUE
)
# Preview a handful of rows (53 to 59) to check the import.
st.df[53:59, ]

##           searchKeyword searchResultViews
## 53         calculatrice               433
## 54      visa desjardins               429
## 55                acces               426
## 56               voyage               422
## 57     virement interac               420
## 58 assurance collective               419
## 59              acces-d               417

Remplacer les caractères accentués par leur version sans accents.

# Transliterate accented characters to their closest ASCII equivalent
# (e.g. "é" -> "e"). A plain "ASCII" target makes iconv() return NA for every
# keyword that contains a non-ASCII character, which loses the keyword instead
# of stripping its accents; the "//TRANSLIT" suffix asks for a substitution.
# NOTE(review): the exact transliteration is platform-dependent -- confirm the
# output on the machine that runs this analysis.
st.df$searchKeyword <- iconv(
  st.df$searchKeyword,
  from = "UTF-8",
  to = "ASCII//TRANSLIT"
)

Transformer nos données :

A, 3
B, 2
C, 1

Deviendra (chaque mot-clé est répété autant de fois que son nombre de vues) :

A
A
A
B
B
C

# Expand the aggregated table to one row per view: each keyword is repeated
# as many times as its searchResultViews count.
st.df <- data.frame(
  searchKeyword = with(st.df, rep(searchKeyword, times = searchResultViews))
)

Prendre un échantillon (limitation de mémoire).

# Draw a random sample of at most 50000 rows to keep memory usage manageable.
# The size is capped at the number of available rows because sample() without
# replacement fails when asked for more elements than exist, and seq_len()
# stays correct for an empty data frame (1:0 would yield c(1, 0)).
st.rows <- sample(seq_len(nrow(st.df)), min(50000, nrow(st.df)))
# Keep only the sampled keywords, then drop rows whose keyword is NA; the
# result can therefore hold fewer rows than were sampled.
st.sdf <- na.omit(data.frame(searchKeyword = st.df$searchKeyword[st.rows]))

Création et transformation d’un corpus pour diminuer sa taille.

# Build a corpus with one document per sampled keyword. DataframeSource reads
# the doc_id and text columns; seq_len() replaces the redundant seq(1:n),
# which produced the ids 1, 2 for an empty sample.
st.corpus <- Corpus(DataframeSource(data.frame(
  doc_id = seq_len(nrow(st.sdf)),
  text = as.character(st.sdf$searchKeyword)
)))
# Normalise the text to shrink the vocabulary: strip punctuation, lower-case,
# drop French stop words, then reduce words to their French stems.
st.corpus <- tm_map(st.corpus, removePunctuation)
# tolower() is a base function, so it is wrapped to act on document content.
st.corpus <- tm_map(st.corpus, content_transformer(tolower))
# Extra arguments to tm_map() are forwarded to the transformation.
st.corpus <- tm_map(st.corpus, removeWords, stopwords("french"))
st.corpus <- tm_map(st.corpus, stemDocument, language = "french")

Retirons les termes de marque.

# Drop the brand terms, written in the form they take after punctuation
# removal and stemming.
st.corpus <- tm_map(st.corpus, removeWords, c("accesd", "desjardin"))

Créer un nuage de mots-clés.

# Term-document matrix: one row per term, one column per document.
st.tdm <- TermDocumentMatrix(st.corpus)
# Convert to a dense matrix so term totals can be computed with rowSums().
st.m <- as.matrix(st.tdm)
# Total frequency of each term, most frequent first.
st.v <- sort(rowSums(st.m), decreasing = TRUE)
st.d <- data.frame(word = names(st.v), freq = st.v)
# Reversed 8-colour "RdYlBu" palette.
pal2 <- rev(brewer.pal(8, "RdYlBu"))
# Draw the cloud: at most 300 terms seen at least 3 times, plotted in
# decreasing frequency order with no rotated words.
wordcloud(
  words = st.d$word,
  freq = st.d$freq,
  scale = c(6, 0.75),
  min.freq = 3,
  max.words = 300,
  random.order = FALSE,
  rot.per = 0,
  colors = pal2
)

Trouver des associations entre mots.

# For each stemmed term of interest, list the terms whose correlation with it
# across documents is at least 0.04 (same threshold for all four terms).
findAssocs(
  st.tdm,
  terms = c("taux", "calcul", "bonidollar", "cart"),
  corlimit = rep(0.04, 4)
)

## $taux
##        chang      interet  hypothecair     dinteret      dechang 
##         0.63         0.11         0.09         0.09         0.08 
##       echang          cpg         rend      vigueur       reduit 
##         0.07         0.07         0.06         0.06         0.05 
##       billet     marginal   hypotequer preferentiel       bonifi 
##         0.04         0.04         0.04         0.04         0.04 
## 
## $calcul
##    hypothequ        outil         pret  hypothecair         vers 
##         0.19         0.17         0.16         0.14         0.07 
## hypothequair      retrait      hipotec      dinvest    dhypotequ 
##         0.07         0.07         0.07         0.06         0.06 
##  hyphotecair        grill     hypotequ   hypotecair   dhypothequ 
##         0.06         0.05         0.05         0.05         0.05 
## 
## $bonidollar
## catalogu programm   utilis 
##     0.22     0.05     0.04 
## 
## $cart
##     cred    perdu      vis      deb     pert    activ  remplac   cadeau 
##     0.33     0.22     0.15     0.15     0.12     0.12     0.12     0.11 
##  guichet   prepay classiqu   nouvel  renforc  prepaye    annul renouvel 
##     0.09     0.09     0.08     0.08     0.08     0.07     0.06     0.06 
##  reactiv   onglet endommag   prepai  affinit   credit     crid   modulo 
##     0.06     0.06     0.06     0.06     0.06     0.06     0.06     0.05 
##     dacc      puc  tanguay    expir      air      mil     usag 
##     0.04     0.04     0.04     0.04     0.04     0.04     0.04