Text mining (grouping comments that belong to the same cluster) using the Latent Dirichlet Allocation (LDA) algorithm
The dataset was extracted from customer feedback on a service. I will use the LDA algorithm to create topic clusters and examine the words that are abundant in and/or unique to each cluster.
setwd("C:/Users/Owner/Desktop")
text <- read.csv("text11.csv", header = TRUE)
str(text)
## 'data.frame': 1132 obs. of 1 variable:
## $ Comment: Factor w/ 1035 levels "$330 - Met with JC Sherman, VP of Vendor Mgmt , and Jon Damron , Director of Procurement to review entire list"| __truncated__,..: 912 994 97 398 849 930 937 880 931 58 ...
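The comments are read in as a factor here. The corpus construction below still works, but reading them as plain character strings from the start (a minor sketch, assuming the same file) avoids the factor-level bookkeeping:
# Optional: read the Comment column as character rather than factor (sketch).
text <- read.csv("text11.csv", header = TRUE, stringsAsFactors = FALSE)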
head(text, n = 5)
## Comment
## 1 The missingcomponents were slow to arrive. This would not be a problem if IGF understood.This is annoying because we have to pay fortheir mistakes.
## 2 We have been sent faulty components and deliveries have been slow.Components have been left out of configurations and had to be reordered. IGF did not wait until these were delivered and installed.
## 3 Assembling the package. The attention that NS received during the process. Communication. Working with sourcing, IGF package, and sales support.
## 4 Had a very bad experience with IGF early last year where erroneous information was provided to my VP.
## 5 Sometimes its putting on a lot of pressure for the IGF team to answer.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
text_corpus <- VCorpus(VectorSource(text$Comment))
print(text_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1132
text_corpus_clean <- tm_map(text_corpus, content_transformer(tolower))
text_corpus_clean <- tm_map(text_corpus_clean, stemDocument)
text_corpus_clean <- tm_map(text_corpus_clean, removeNumbers)
text_corpus_clean <- tm_map(text_corpus_clean, removeWords, stopwords())
text_corpus_clean <- tm_map(text_corpus_clean, removePunctuation)
text_corpus_clean <- tm_map(text_corpus_clean, stripWhitespace)
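Note that stemDocument runs before the stopwords are removed in the pipeline above, so stemmed stopwords such as "becaus", "veri", and "onli" survive and show up among the frequent terms further down. A sketch of the more conventional ordering (the results in this post were produced with the original pipeline):
# Alternative cleaning order (sketch): strip stopwords and punctuation first,
# then stem, so that dictionary-form stopwords are matched and removed.
text_corpus_alt <- tm_map(text_corpus, content_transformer(tolower))
text_corpus_alt <- tm_map(text_corpus_alt, removeNumbers)
text_corpus_alt <- tm_map(text_corpus_alt, removeWords, stopwords())
text_corpus_alt <- tm_map(text_corpus_alt, removePunctuation)
text_corpus_alt <- tm_map(text_corpus_alt, stemDocument)
text_corpus_alt <- tm_map(text_corpus_alt, stripWhitespace)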
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
wordcloud(text_corpus_clean, min.freq = 10, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
I will use the DocumentTermMatrix() function from the tm package to determine the frequency of each word in each document.
text_dtm <- DocumentTermMatrix(text_corpus_clean)
text_dtm
## <<DocumentTermMatrix (documents: 1132, terms: 2971)>>
## Non-/sparse entries: 15716/3347456
## Sparsity : 100%
## Maximal term length: 32
## Weighting : term frequency (tf)
findFreqTerms(text_dtm, lowfreq = 20) ## Words that occur at least 20 times in the entire dataset
## [1] "abl" "account" "agreement" "alreadi" "also"
## [6] "alway" "amount" "ani" "approv" "ask"
## [11] "back" "bank" "becaus" "bill" "brand"
## [16] "busi" "call" "can" "case" "chang"
## [21] "check" "clarifi" "clear" "client" "close"
## [26] "coa" "confirm" "contact" "contract" "correct"
## [31] "creat" "credit" "crr" "custom" "date"
## [36] "day" "deal" "deals" "direct" "discuss"
## [41] "disput" "doe" "dont" "due" "email"
## [46] "end" "engag" "ensur" "error" "etc"
## [51] "explain" "file" "final" "financ" "find"
## [56] "follow" "gbs" "get" "good" "got"
## [61] "gts" "help" "ibm" "igf" "improv"
## [66] "includ" "incorrect" "inform" "interest" "intern"
## [71] "invoic" "invoice" "invoices" "involv" "issu"
## [76] "issues" "item" "know" "leas" "like"
## [81] "list" "mail" "mainten" "make" "manag"
## [86] "meet" "met" "miss" "month" "need"
## [91] "new" "now" "number" "offer" "one"
## [96] "onli" "open" "option" "order" "paid"
## [101] "part" "pay" "payment" "plan" "point"
## [106] "price" "problem" "process" "project" "propos"
## [111] "provid" "receiv" "redacted" "refer" "regard"
## [116] "relat" "rep" "report" "request" "requir"
## [121] "review" "sale" "see" "sell" "seller"
## [126] "send" "sent" "servic" "set" "sever"
## [131] "sign" "solut" "specif" "still" "sts"
## [136] "support" "system" "team" "term" "time"
## [141] "trade" "transfer" "tri" "tss" "understand"
## [146] "unpaid" "updat" "use" "valu" "vat"
## [151] "veri" "version" "via" "want" "will"
## [156] "without" "work" "wrong" "year"
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.4.2
text_lda <- LDA(text_dtm, k = 2, method = "VEM", control = NULL)
text_lda
## A LDA_VEM topic model with 2 topics.
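The VEM fit is initialized randomly, so the topic numbering and the exact beta values can change between runs. A minimal sketch for a reproducible fit (the seed value is arbitrary):
# Fix the random seed used to initialize the VEM estimation (sketch).
text_lda_seeded <- LDA(text_dtm, k = 2, method = "VEM",
                       control = list(seed = 1234))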
The 10 most abundant words in each topic
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.2
text_topics <- tidy(text_lda, matrix = "beta")
## Warning: package 'bindrcpp' was built under R version 3.4.2
text_topics
## # A tibble: 5,942 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 aäì 2.572655e-04
## 2 2 aäì 1.898116e-106
## 3 1 abb 1.286327e-04
## 4 2 abb 6.099025e-36
## 5 1 abil 3.858982e-04
## 6 2 abil 1.612118e-49
## 7 1 abl 1.576992e-04
## 8 2 abl 4.499592e-03
## 9 1 abov 1.029062e-03
## 10 2 abov 1.127032e-25
## # ... with 5,932 more rows
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
text_top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
text_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
library(topicmodels)
text_lda1 <- LDA(text_dtm, k = 4, method = "VEM", control = NULL)
text_lda1
## A LDA_VEM topic model with 4 topics.
The 10 most abundant words in each of the four topics
library(tidytext)
text_topics1 <- tidy(text_lda1, matrix = "beta")
text_topics1
## # A tibble: 11,884 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 aäì 4.432470e-04
## 2 2 aäì 2.887501e-187
## 3 3 aäì 8.295412e-185
## 4 4 aäì 4.087324e-183
## 5 1 abb 1.770162e-136
## 6 2 abb 2.486035e-173
## 7 3 abb 2.791759e-167
## 8 4 abb 2.503157e-04
## 9 1 abil 6.705383e-125
## 10 2 abil 2.448762e-04
## # ... with 11,874 more rows
library(ggplot2)
library(dplyr)
text_top_terms1 <- text_topics1 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
text_top_terms1 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
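The choice between k = 2 and k = 4 is somewhat arbitrary. One way to compare candidate values is the perplexity of each fitted model on the document-term matrix (lower is better, though it tends to keep decreasing as k grows). A sketch over a small grid of k values (the grid and seed are assumptions, not part of the original analysis):
# Compare a few topic counts by model perplexity (sketch).
ks <- c(2, 4, 6, 8)
perp <- sapply(ks, function(k) {
  fit <- LDA(text_dtm, k = k, method = "VEM", control = list(seed = 1234))
  perplexity(fit)
})
data.frame(k = ks, perplexity = perp)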
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
beta_spread <- text_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 307 x 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 abl 1.576992e-04 4.499592e-03 4.8345474
## 2 abov 1.029062e-03 1.127032e-25 -72.9512199
## 3 accept 1.169880e-03 7.770886e-04 -0.5902099
## 4 access 1.800858e-03 1.722110e-13 -33.2837883
## 5 accord 1.227476e-04 1.380699e-03 3.4916316
## 6 account 1.889721e-03 7.107999e-03 1.9112705
## 7 addit 3.069265e-04 1.043351e-03 1.7652589
## 8 address 2.845822e-17 1.769402e-03 45.8214097
## 9 age 4.524862e-19 1.179601e-03 51.2112748
## 10 agre 8.921366e-04 1.185938e-03 0.4106925
## # ... with 297 more rows
beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(15, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()
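Since the original goal was to group comments into clusters, the per-document topic proportions (the gamma matrix) can be used to assign each comment to its dominant topic. A minimal sketch using the two-topic model:
# Assign each comment to its most probable topic (sketch).
text_gamma <- tidy(text_lda, matrix = "gamma")
comment_clusters <- text_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()
head(comment_clusters)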