Task:

Text mining (identifying groups of comments that belong to the same cluster) using the Latent Dirichlet Allocation (LDA) algorithm

Get data

The dataset was extracted from customer feedback on a certain service. I will use the LDA algorithm to group the comments into topic clusters and examine the words that are abundant in and/or unique to each cluster.


setwd("C:/Users/Owner/Desktop")
text <- read.csv("text11.csv", header = TRUE)
str(text)
## 'data.frame':    1132 obs. of  1 variable:
##  $ Comment: Factor w/ 1035 levels "$330 -  Met with JC Sherman, VP of Vendor Mgmt , and Jon Damron , Director of Procurement to review entire list"| __truncated__,..: 912 994 97 398 849 930 937 880 931 58 ...
head(text, n =5)
##                                                                                                                                                                                                 Comment
## 1                                                   The missingcomponents were slow to arrive. This would not be a problem if IGF understood.This is annoying because we have to pay fortheir mistakes.
## 2 We have been sent faulty components and deliveries have been slow.Components have been left out of configurations and had to be reordered. IGF did not wait until these were delivered and installed.
## 3                                                      Assembling the package. The attention that NS received during the process. Communication. Working with sourcing, IGF package, and sales support.
## 4                                                                                                 Had a very bad experience with IGF early last year where erroneous information was provided to my VP.
## 5                                                                                                                                Sometimes its putting on a lot of pressure for the IGF team to answer.
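A side note on the str() output above: in R versions before 4.0, read.csv() defaults to stringsAsFactors = TRUE, which is why Comment prints as a factor. If character vectors are preferred downstream, a minimal alternative read looks like this (a sketch, assuming the same file):

text <- read.csv("text11.csv", header = TRUE, stringsAsFactors = FALSE)
str(text$Comment)  # now a character vector rather than a factor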

Data preparation

Create a corpus from the “Comment” column of the data frame, with one document per row.

library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
text_corpus <- VCorpus(VectorSource(text$Comment))

print(text_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1132
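To spot-check what went into the corpus, individual documents can be coerced back to plain character strings, for example:

lapply(text_corpus[1:2], as.character)  # view the first two comments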

Text preparation (using the tm package, I will perform a series of cleaning steps, including lowercase conversion, word stemming, removal of numbers, stop words, and punctuation, and whitespace stripping)

# Convert all text to lowercase
text_corpus_clean <- tm_map(text_corpus,
content_transformer(tolower))
# Stem words to their root form (note: stemming before stop-word removal
# leaves stemmed stop words such as "becaus" and "veri" in the corpus)
text_corpus_clean <- tm_map(text_corpus_clean, stemDocument)

# Remove numbers
text_corpus_clean <- tm_map(text_corpus_clean, removeNumbers)
# Remove common English stop words
text_corpus_clean <- tm_map(text_corpus_clean,
removeWords, stopwords())
# Remove punctuation
text_corpus_clean <- tm_map(text_corpus_clean, removePunctuation)

# Collapse extra whitespace left behind by the removals
text_corpus_clean <- tm_map(text_corpus_clean, stripWhitespace)
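A quick before-and-after comparison of a single document helps confirm the cleaning steps behaved as intended, for example:

as.character(text_corpus[[1]])        # original comment
as.character(text_corpus_clean[[1]])  # lowercased, stemmed, cleaned comment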

Data Exploration

I will examine the words that occur at least 10 times across all the documents.

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
wordcloud(text_corpus_clean, min.freq = 10, random.order = FALSE,
          colors=brewer.pal(8, "Dark2"))
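Because wordcloud() places words with some randomness, setting a seed beforehand makes the figure reproducible (the seed value 123 below is an arbitrary choice):

set.seed(123)
wordcloud(text_corpus_clean, min.freq = 10, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))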

Model training

I will use the DocumentTermMatrix() function from the tm package to determine the frequency of each word in each document.

text_dtm <- DocumentTermMatrix(text_corpus_clean)
text_dtm
## <<DocumentTermMatrix (documents: 1132, terms: 2971)>>
## Non-/sparse entries: 15716/3347456
## Sparsity           : 100%
## Maximal term length: 32
## Weighting          : term frequency (tf)
findFreqTerms(text_dtm, lowfreq = 20) ## Words that occur at least 20 times in the entire dataset
##   [1] "abl"        "account"    "agreement"  "alreadi"    "also"      
##   [6] "alway"      "amount"     "ani"        "approv"     "ask"       
##  [11] "back"       "bank"       "becaus"     "bill"       "brand"     
##  [16] "busi"       "call"       "can"        "case"       "chang"     
##  [21] "check"      "clarifi"    "clear"      "client"     "close"     
##  [26] "coa"        "confirm"    "contact"    "contract"   "correct"   
##  [31] "creat"      "credit"     "crr"        "custom"     "date"      
##  [36] "day"        "deal"       "deals"      "direct"     "discuss"   
##  [41] "disput"     "doe"        "dont"       "due"        "email"     
##  [46] "end"        "engag"      "ensur"      "error"      "etc"       
##  [51] "explain"    "file"       "final"      "financ"     "find"      
##  [56] "follow"     "gbs"        "get"        "good"       "got"       
##  [61] "gts"        "help"       "ibm"        "igf"        "improv"    
##  [66] "includ"     "incorrect"  "inform"     "interest"   "intern"    
##  [71] "invoic"     "invoice"    "invoices"   "involv"     "issu"      
##  [76] "issues"     "item"       "know"       "leas"       "like"      
##  [81] "list"       "mail"       "mainten"    "make"       "manag"     
##  [86] "meet"       "met"        "miss"       "month"      "need"      
##  [91] "new"        "now"        "number"     "offer"      "one"       
##  [96] "onli"       "open"       "option"     "order"      "paid"      
## [101] "part"       "pay"        "payment"    "plan"       "point"     
## [106] "price"      "problem"    "process"    "project"    "propos"    
## [111] "provid"     "receiv"     "redacted"   "refer"      "regard"    
## [116] "relat"      "rep"        "report"     "request"    "requir"    
## [121] "review"     "sale"       "see"        "sell"       "seller"    
## [126] "send"       "sent"       "servic"     "set"        "sever"     
## [131] "sign"       "solut"      "specif"     "still"      "sts"       
## [136] "support"    "system"     "team"       "term"       "time"      
## [141] "trade"      "transfer"   "tri"        "tss"        "understand"
## [146] "unpaid"     "updat"      "use"        "valu"       "vat"       
## [151] "veri"       "version"    "via"        "want"       "will"      
## [156] "without"    "work"       "wrong"      "year"
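With sparsity this high, one optional step before model fitting is to drop very rare terms and then remove any comments left empty after cleaning; LDA() stops with an error if a document contains no terms. A sketch using removeSparseTerms() from tm and row_sums() from slam (a dependency of tm); the 0.99 threshold and the text_dtm_small name are my own choices for illustration:

text_dtm_small <- removeSparseTerms(text_dtm, 0.99)  # keep terms in roughly 1%+ of documents
doc_totals <- slam::row_sums(text_dtm_small)         # number of terms left per document
text_dtm_small <- text_dtm_small[doc_totals > 0, ]   # drop now-empty documents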

(A) I will use the LDA() function from the topicmodels package to fit a two-topic model (k = 2).

library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.4.2
text_lda <- LDA(text_dtm, k = 2, method = "VEM", control = NULL)
text_lda
## A LDA_VEM topic model with 2 topics.
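VEM inference involves random initialization, so refitting can yield different (and relabeled) topics. Passing a seed through the control list makes the fit reproducible; the seed value and the text_lda_repro name below are arbitrary choices of mine:

text_lda_repro <- LDA(text_dtm, k = 2, method = "VEM",
                      control = list(seed = 1234))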

Model Evaluation

The 10 most abundant words in each topic

library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.2
text_topics <- tidy(text_lda, matrix = "beta")
## Warning: package 'bindrcpp' was built under R version 3.4.2
text_topics
## # A tibble: 5,942 x 3
##    topic  term          beta
##    <int> <chr>         <dbl>
##  1     1   aäì  2.572655e-04
##  2     2   aäì 1.898116e-106
##  3     1   abb  1.286327e-04
##  4     2   abb  6.099025e-36
##  5     1  abil  3.858982e-04
##  6     2  abil  1.612118e-49
##  7     1   abl  1.576992e-04
##  8     2   abl  4.499592e-03
##  9     1  abov  1.029062e-03
## 10     2  abov  1.127032e-25
## # ... with 5,932 more rows
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
text_top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

text_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
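Since the stated task is to assign comments to clusters, the per-document topic probabilities (gamma) are also worth extracting. A sketch of labelling each comment with its most probable topic, in the same tidy()/dplyr style as above:

text_documents <- tidy(text_lda, matrix = "gamma")
text_documents %>%
  group_by(document) %>%
  top_n(1, gamma) %>%   # keep each comment's most probable topic
  ungroup()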

(B) I will also create a model with 4 topics.

library(topicmodels)
text_lda1 <- LDA(text_dtm, k = 4, method = "VEM", control = NULL)
text_lda1
## A LDA_VEM topic model with 4 topics.

Model Evaluation

The 10 most abundant words in each group

library(tidytext)
text_topics1 <- tidy(text_lda1, matrix = "beta")
text_topics1
## # A tibble: 11,884 x 3
##    topic  term          beta
##    <int> <chr>         <dbl>
##  1     1   aäì  4.432470e-04
##  2     2   aäì 2.887501e-187
##  3     3   aäì 8.295412e-185
##  4     4   aäì 4.087324e-183
##  5     1   abb 1.770162e-136
##  6     2   abb 2.486035e-173
##  7     3   abb 2.791759e-167
##  8     4   abb  2.503157e-04
##  9     1  abil 6.705383e-125
## 10     2  abil  2.448762e-04
## # ... with 11,874 more rows
library(ggplot2)
library(dplyr)

text_top_terms1 <- text_topics1 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

text_top_terms1 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
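To compare the two-topic and four-topic fits quantitatively, the perplexity() function from topicmodels can be used (lower is better). Evaluating on the training data, as sketched here, is only a rough guide; scoring a held-out split would be more rigorous:

perplexity(text_lda, text_dtm)   # k = 2
perplexity(text_lda1, text_dtm)  # k = 4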

A slightly different approach is to identify the terms with the greatest difference in beta between topic 1 and topic 2 by calculating log2(beta2 / beta1). This assumes the two-topic model from part (A).

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
beta_spread <- text_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
## # A tibble: 307 x 4
##       term       topic1       topic2   log_ratio
##      <chr>        <dbl>        <dbl>       <dbl>
##  1     abl 1.576992e-04 4.499592e-03   4.8345474
##  2    abov 1.029062e-03 1.127032e-25 -72.9512199
##  3  accept 1.169880e-03 7.770886e-04  -0.5902099
##  4  access 1.800858e-03 1.722110e-13 -33.2837883
##  5  accord 1.227476e-04 1.380699e-03   3.4916316
##  6 account 1.889721e-03 7.107999e-03   1.9112705
##  7   addit 3.069265e-04 1.043351e-03   1.7652589
##  8 address 2.845822e-17 1.769402e-03  45.8214097
##  9     age 4.524862e-19 1.179601e-03  51.2112748
## 10    agre 8.921366e-04 1.185938e-03   0.4106925
## # ... with 297 more rows
beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(15, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()

References:

  1. Machine Learning with R (2nd edition), by Brett Lantz
  2. ggplot2: Elegant Graphics for Data Analysis (2nd edition), by Hadley Wickham
  3. Text Mining with R, by Julia Silge and David Robinson