Nous aurons besoin d’utiliser quelques packages R pour cette analyse.
# Packages attached for this analysis.
library(tm)
library(RTextTools)
library(SnowballC)
library(wordcloud)
# Fix the random seed so that the random steps further down
# (e.g. the sample() draw) give the same result on every run.
set.seed(1234)
D’abord charger et inspecter nos données.
# Read the search-term export into a data frame. Strings are kept as
# character vectors, surrounding white space is stripped and blank lines
# are skipped.
st.df <- read.csv(
  file = "C:/sb-waq/R/search-terms/search-terms.csv",
  header = TRUE,
  sep = ",",
  quote = "\"",
  encoding = "UTF-8",
  stringsAsFactors = FALSE,
  strip.white = TRUE,
  blank.lines.skip = TRUE
)
# Show rows 53 to 59 as a quick sanity check of the loaded table.
st.df[53:59, ]
## searchKeyword searchResultViews
## 53 calculatrice 433
## 54 visa desjardins 429
## 55 acces 426
## 56 voyage 422
## 57 virement interac 420
## 58 assurance collective 419
## 59 acces-d 417
Remplacer les caractères accentués par leur version sans accents.
# Replace accented characters by their unaccented ASCII counterpart.
# With a plain "ASCII" target, iconv() returns NA for every keyword that
# contains a character it cannot convert, so accented keywords would be
# lost instead of being rewritten. The "//TRANSLIT" suffix asks iconv()
# to substitute a similar-looking ASCII character instead.
# NOTE(review): the exact transliteration is platform dependent; check
# the output on the machine where this analysis is run.
st.df$searchKeyword <- iconv(
  st.df$searchKeyword,
  from = "UTF-8",
  to = "ASCII//TRANSLIT"
)
Répéter chaque terme autant de fois qu’il a été consulté (searchResultViews) ; le tableau deviendra :
# Expand the table: each keyword is repeated as many times as its
# searchResultViews count, giving one row per view.
st.df <- data.frame(
  searchKeyword = with(st.df, rep(searchKeyword, times = searchResultViews))
)
Prendre un échantillon (limitation de mémoire).
# Draw a random sample of keywords (memory limitation) and drop the
# missing ones.
# sample() without replacement fails when asked for more rows than exist,
# so the sample size is capped at the number of available rows; with
# 50000 rows or more the draw is the same as before.
# seq_len() is used instead of 1:nrow() so an empty table gives an empty
# index set rather than c(1, 0).
st.sdf <- na.omit(as.data.frame(
  st.df$searchKeyword[
    sample(seq_len(nrow(st.df)), min(50000, nrow(st.df)))
  ]
))
names(st.sdf) <- "searchKeyword"
Retirons les termes de marque.
# Strip the brand terms from every document of the corpus.
# NOTE(review): st.corpus is not created in the visible part of this
# file; it must already exist (built from st.sdf, presumably) before
# this line runs.
st.corpus <- tm_map(st.corpus, removeWords, c("accesd", "desjardin"))
Créer un nuage de mots-clés.
# Build the term-document matrix of the corpus.
st.tdm <- TermDocumentMatrix(st.corpus)
# Dense copy of the matrix, then the total count of each term (row sums),
# ordered from the most to the least frequent.
st.m <- as.matrix(st.tdm)
st.v <- sort(rowSums(st.m), decreasing = TRUE)
st.d <- data.frame(word = names(st.v), freq = st.v)
# Eight-colour "RdYlBu" palette, reversed.
pal2 <- rev(brewer.pal(n = 8, name = "RdYlBu"))
# Draw the cloud: terms seen at least 3 times, at most 300 words, most
# frequent words placed first, no rotated words.
wordcloud(
  words = st.d$word,
  freq = st.d$freq,
  scale = c(6, 0.75),
  min.freq = 3,
  max.words = 300,
  random.order = FALSE,
  rot.per = 0,
  colors = pal2
)

Trouver des associations entre mots.
# For each of the four stemmed terms, list the terms whose correlation
# with it in the term-document matrix is at least 0.04.
findAssocs(
  st.tdm,
  terms = c("taux", "calcul", "bonidollar", "cart"),
  corlimit = rep(0.04, 4L)
)
## $taux
## chang interet hypothecair dinteret dechang
## 0.63 0.11 0.09 0.09 0.08
## echang cpg rend vigueur reduit
## 0.07 0.07 0.06 0.06 0.05
## billet marginal hypotequer preferentiel bonifi
## 0.04 0.04 0.04 0.04 0.04
##
## $calcul
## hypothequ outil pret hypothecair vers
## 0.19 0.17 0.16 0.14 0.07
## hypothequair retrait hipotec dinvest dhypotequ
## 0.07 0.07 0.07 0.06 0.06
## hyphotecair grill hypotequ hypotecair dhypothequ
## 0.06 0.05 0.05 0.05 0.05
##
## $bonidollar
## catalogu programm utilis
## 0.22 0.05 0.04
##
## $cart
## cred perdu vis deb pert activ remplac cadeau
## 0.33 0.22 0.15 0.15 0.12 0.12 0.12 0.11
## guichet prepay classiqu nouvel renforc prepaye annul renouvel
## 0.09 0.09 0.08 0.08 0.08 0.07 0.06 0.06
## reactiv onglet endommag prepai affinit credit crid modulo
## 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.05
## dacc puc tanguay expir air mil usag
## 0.04 0.04 0.04 0.04 0.04 0.04 0.04