Tokenizing raw text using unnest_tokens()
install.packages(c("tibble", "tidytext", "ggplot2", "gutenbergr", "tidyr"))
library(tibble)
## Warning: package 'tibble' was built under R version 3.6.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(ggplot2)
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
text <- c("A market meltdown. Surging recession fears. And a sudden spotlight on America's health care system. Goldman Sachs is warning Wall Street that the coronavirus could cost President Donald Trump the election.",
"The potential political fallout from the coronavirus adds yet more uncertainty for investors trying to assess the impact of the fast-moving epidemic. ",
"If the coronavirus epidemic materially affects US economic growth it may increase the likelihood of Democratic victory in the 2020 election, Goldman Sachs analysts led by Ben Snider wrote in a report published Wednesday night.",
"That could be a negative for stocks because investors have been hoping for a continuation of the low-tax, light-regulation approach of the Trump administration. And Trump of course has been laser-focused on boosting stock prices."
)
text_df <- tibble(line = 1:4, text = text) # the tibble() function comes from the tibble package (part of the tidyverse); we have four lines of text
# note the data is in tibble format. A tibble is a modern class of data frame in R, available in the tibble and dplyr packages, that has a convenient print method, does not convert strings to factors, and does not use row names.
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 "A market meltdown. Surging recession fears. And a sudden spotligh~
## 2 2 "The potential political fallout from the coronavirus adds yet mor~
## 3 3 "If the coronavirus epidemic materially affects US economic growth~
## 4 4 "That could be a negative for stocks because investors have been h~
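# a minimal illustration (not part of the original analysis) of why tibbles are convenient:
# unlike a base data.frame on R < 4.0, a tibble never converts character columns to factors
df_base <- data.frame(line = 1:4, text = text) # on R 3.6 the text column is silently converted to a factor
class(df_base$text)  # "factor" (on R < 4.0)
class(text_df$text)  # "character"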
# we now need to convert text_df to one-token-per-document-per-row format for further analysis
text_df %>%
unnest_tokens(word, text)
## # A tibble: 127 x 2
## line word
## <int> <chr>
## 1 1 a
## 2 1 market
## 3 1 meltdown
## 4 1 surging
## 5 1 recession
## 6 1 fears
## 7 1 and
## 8 1 a
## 9 1 sudden
## 10 1 spotlight
## # ... with 117 more rows
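# unnest_tokens() is not limited to single words; as an illustration (not run in the original document),
# you can tokenize the same text into bigrams or sentences instead
text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)   # two-word sequences
# text_df %>% unnest_tokens(sentence, text, token = "sentences")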
# download Jane Austen's books
library(janeaustenr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.2
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", # regular expression (regex) to detect chapter headings (digits or roman numerals)
ignore_case = TRUE)))) %>% # match "chapter" regardless of case
ungroup() # remove the grouping by book
# check out output
original_books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ... with 73,412 more rows
# parsing using tidytext
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
# anti_join() with stop_words (a data frame) to remove stopwords. Note: stop_words is a pre-specified list of stopwords stored in a data frame/tibble; you can specify your own stopword list and combine it with stop_words (please refer to the appendix)
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining, by = "word"
# count word frequency
tidy_books %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 13,914 x 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
# plot word frequency
library(ggplot2)
tidy_books %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, such that word appears on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# download four of H.G. Wells's books from the Project Gutenberg archive
library(gutenbergr)
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# parsing
tidy_hgwells <- hgwells %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# count word frequency
tidy_hgwells %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 11,769 x 2
## word n
## <chr> <int>
## 1 time 454
## 2 people 302
## 3 door 260
## 4 heard 249
## 5 black 232
## 6 stood 229
## 7 white 222
## 8 hand 218
## 9 kemp 213
## 10 eyes 210
## # ... with 11,759 more rows
# sort word frequencies for words commonly used by the two authors
library(tidyr)
frequency <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
dplyr::count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(author, proportion) %>%
gather(author, proportion, `H.G. Wells`)
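# note: spread() and gather() still work but have been superseded by pivot_wider() and pivot_longer()
# in tidyr >= 1.0.0; a sketch of the same reshaping with the newer verbs (not run in the original document)
frequency2 <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
                        mutate(tidy_books, author = "Jane Austen")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  dplyr::count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer(`H.G. Wells`, names_to = "author", values_to = "proportion")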
library(scales) # scales provides helper functions for setting breaks and labels on ggplot2 axes
## Warning: package 'scales' was built under R version 3.6.3
# plot the comparison: Jane Austen's word proportions on the y-axis, H.G. Wells's on the x-axis
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
## Warning: Removed 13230 rows containing missing values (geom_point).
## Warning: Removed 13231 rows containing missing values (geom_text).
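# to quantify how similar the two authors' word frequencies are, a simple correlation test on the
# proportions is one option (a sketch, not part of the original document)
cor.test(data = frequency, ~ proportion + `Jane Austen`)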
What kind of financial products were the customers complaining about?
install.packages(c("plyr", "tidyverse", "wordcloud2", "tm", "topicmodels", "stringr", "dplyr", "ldatuning", "snow", "vctrs", "scales"))
library(vctrs)
## Warning: package 'vctrs' was built under R version 3.6.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(dplyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------- tidyverse 1.3.0 --
## v readr 1.3.1 v forcats 0.4.0
## v purrr 0.3.3
## Warning: package 'purrr' was built under R version 3.6.2
## -- Conflicts ------------------------ tidyverse_conflicts() --
## x plyr::arrange() masks dplyr::arrange()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x purrr::discard() masks scales::discard()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x plyr::rename() masks dplyr::rename()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
library(tidytext)
library(tidyr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.6.3
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.6.3
library(ggplot2)
library(stringr)
# load the subsetted customer complaints data (10k random sample)
data <- read.csv("https://www.dropbox.com/s/6kofxbf8f3cq8cw/complaints10k.csv?dl=1", stringsAsFactors=FALSE)
# check out the data
dim(data)
## [1] 10000 18
names(data)
## [1] "Date.received" "Product"
## [3] "Sub.product" "Issue"
## [5] "Sub.issue" "Consumer.complaint.narrative"
## [7] "Company.public.response" "Company"
## [9] "State" "ZIP.code"
## [11] "Tags" "Consumer.consent.provided."
## [13] "Submitted.via" "Date.sent.to.company"
## [15] "Company.response.to.consumer" "Timely.response."
## [17] "Consumer.disputed." "Complaint.ID"
# we'll take a look at the "Consumer.complaint.narrative" (the 6th column)
head(data[6])
## Consumer.complaint.narrative
## 1
## 2
## 3 I submitted a complaint to SPS last month via CFPB. SPS did not respond to the specific questions contained in the Qualified Written Request, nor did they respond to my request to schedule an in-person meeting at SPS HQ in Utah to review their " original copies '' of documents they allege are valid and are associated with my property. SPS is trying to use these so-called documents to foreclose our home based on an assertion of a financial interest that does not exist.
## 4
## 5
## 6
# rename
names(data)[6] <- "text"
text_df <- tibble(document = 1:10000, text = data$text) # use "document" in lieu of "line"
# load stop_words and create our own stopword list
data(stop_words)
word = c("xxxx", "xx")
# create the matching lexicon column: 2 entries (one per stopword), labelled "anonymous"
lexicon = rep("anonymous", 2)
# create a dataframe for "words"
anonymous = data.frame(word, lexicon)
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, anonymous)
# text processing
text_unnested <- text_df %>%
unnest_tokens(word, text) %>% # remove punctuations, convert to lower case, etc.
anti_join(stop_words) # remove stopwords
## Joining, by = "word"
text_counts <- text_unnested %>% # generate and append a word count column
dplyr::count(document, word, sort = TRUE)
# plot word frequency
library(ggplot2)
text_counts %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 400) %>%
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, such that word appears on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# you can also plot these words in the media-savvy "wordcloud" style using the wordcloud2() function
# before that, we need the "word" and "n" (count) columns from the text_counts object, collected into another tidy object
text_cloud <- text_counts %>%
dplyr::count(word, sort = TRUE)
library(wordcloud2) # load wordcloud2 package
wordcloud2(text_cloud) # the default setting
# wordcloud2(text_cloud[1:100,]) # you can control the number of words/terms appearing on the cloud to make it sparser
# wordcloud2(text_cloud, shape = "circle", color = "random-light", backgroundColor="black") # you can also set your preferred shape, color, and background color
## "cast" the tibble object into document term matrix (DTM) to feed into LDA analysis
dtm <- text_counts %>%
cast_dtm(document, word, n)
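# the cast_dtm() result is a tm DocumentTermMatrix; a quick way to peek at it (illustration only)
dtm
# tm::inspect(dtm[1:5, 1:5])   # view a small corner of the matrix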
## choosing the number of topics: use the ldatuning metrics to find a suitable K, conditional on the distribution of words/terms and the number of documents
library(ldatuning)
## Warning: package 'ldatuning' was built under R version 3.6.3
library(ggplot2)
library(scales)
library(snow)
# do not run, just load the data from our Dropbox shared folder
# result <- FindTopicsNumber(
# dtm,
# topics = seq(from = 2, to = 50, by = 1), # candidate number of K: 2 to 50 (must be > 1)
# metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
# method = "Gibbs",
# control = list(seed = 77),
# mc.cores = 2L,
# verbose = TRUE)
# the Cao and Juan (2009) and Arun (2010) metrics should be minimized
# the Griffiths (2004) and Deveaud (2014) metrics should be maximized
# eyeball the plot to find the region of K that best satisfies both criteria
result <- readRDS(url("https://www.dropbox.com/s/ap8omatxqm3v2g3/result.rds?dl=1"))
# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
# 15-topic LDA
library(topicmodels)
# text.lda <- LDA(dtm, k = 15, control = list(seed = 123)) # do not run, just load the data from our dropbox folder
text.lda <- readRDS(url("https://www.dropbox.com/s/4po0veokpr4thju/text.lda.rds?dl=1"))
text.lda
## A LDA_VEM topic model with 15 topics.
# note that you can turn the fitted model back into a one-topic-per-term-per-row format using tidy()
text.td <- tidy(text.lda) # td stands for topic-term; the output is a tidy topic-term table
# term-topic assignment: for each term-topic combination the model estimates a beta -- the probability of that term being generated from that topic.
text.td
## # A tibble: 161,445 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 9.82e- 2
## 2 2 consumer 1.26e- 3
## 3 3 consumer 9.45e-16
## 4 4 consumer 2.62e- 9
## 5 5 consumer 1.30e- 5
## 6 6 consumer 1.47e- 7
## 7 7 consumer 1.52e- 8
## 8 8 consumer 1.13e- 5
## 9 9 consumer 2.51e-28
## 10 10 consumer 6.06e-23
## # ... with 161,435 more rows
# we could use top_n() function from the dplyr package to find the top 10 terms within each topic
top_terms <- text.td %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 150 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 0.0982
## 2 1 information 0.0570
## 3 1 identity 0.0432
## 4 1 theft 0.0423
## 5 1 section 0.0327
## 6 1 reporting 0.0317
## 7 1 agency 0.0274
## 8 1 block 0.0220
## 9 1 report 0.0217
## 10 1 victim 0.0176
## # ... with 140 more rows
# document-topic assignment: here tidytext uses "gamma" for the per-document topic proportions (called "theta" in much of the LDA literature)
text.lda_gamma <- tidy(text.lda, matrix = "gamma")
text.lda_gamma
## # A tibble: 38,265 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 6551 1 0.999
## 2 7005 1 0.000393
## 3 7660 1 0.999
## 4 1815 1 0.998
## 5 2675 1 0.998
## 6 2817 1 0.998
## 7 2981 1 0.998
## 8 4282 1 0.998
## 9 6830 1 0.998
## 10 9016 1 0.998
## # ... with 38,255 more rows
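# if you want a single "most likely" topic per complaint, one option (a sketch, not in the original
# document) is to keep the row with the largest gamma for each document
doc_topics <- text.lda_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()
doc_topics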
# since we have 10,000 documents, it's hard to visualize their distribution across different topics, but we can do that for term-topic distribution, that is, the beta parameter!
term_topics <- tidy(text.lda, matrix = "beta")
# display beta values
term_topics
## # A tibble: 161,445 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 9.82e- 2
## 2 2 consumer 1.26e- 3
## 3 3 consumer 9.45e-16
## 4 4 consumer 2.62e- 9
## 5 5 consumer 1.30e- 5
## 6 6 consumer 1.47e- 7
## 7 7 consumer 1.52e- 8
## 8 8 consumer 1.13e- 5
## 9 9 consumer 2.51e-28
## 10 10 consumer 6.06e-23
## # ... with 161,435 more rows
# using dplyr's top_n() function to find out the top 10 terms within each topic
top_terms <- term_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# finally, we can plot term freq. distribution across all 15 topics using the ggplot() function
library(ggplot2)
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# do you find something insightful regarding what types of financial products customers were complaining about?
## prediction accuracy: let's process the data again but (i) retain the Product labels and (ii) collapse their categories
# collapse Product categories
require("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# values not listed in a recode() call are left unchanged, so no else clause is needed
data$Product <- recode(data$Product, "c('Bank account or service','Checking or savings account') = 'Accounts'")
data$Product <- recode(data$Product, "c('Consumer Loan','Payday loan', 'Payday loan, title loan, or personal loan', 'Student loan', 'Vehicle loan or lease') = 'Loans'")
data$Product <- recode(data$Product, "c('Credit card','Credit card or prepaid card', 'Prepaid card') = 'Credit cards'")
data$Product <- recode(data$Product, "c('Credit reporting','Credit reporting, credit repair services, or other personal consumer reports') = 'Credit reporting'")
data$Product <- recode(data$Product, "c('Money transfer, virtual currency, or money service','Money transfers', 'Other financial service', 'Virtual currency') = 'Money transfers'")
# now we have 7 main product categories
table(data$Product)
##
## Accounts Credit cards Credit reporting Debt collection
## 970 1037 3217 1801
## Loans Money transfers Mortgage
## 729 134 2112
# second, subset the data
names(data)[6] <- "text"
new_data <- data[, c("text", "Product")]
new_data_unnested <- new_data %>%
unnest_tokens(word, text)
new_data_counts <- new_data_unnested %>%
anti_join(stop_words) %>%
dplyr::count(Product, word, sort = TRUE)
## Joining, by = "word"
# display result
new_data_counts
## # A tibble: 25,553 x 3
## Product word n
## <chr> <chr> <int>
## 1 Credit reporting credit 2386
## 2 Credit reporting report 1290
## 3 Credit reporting account 1178
## 4 Credit reporting information 1136
## 5 Debt collection debt 951
## 6 Credit reporting reporting 796
## 7 Debt collection credit 654
## 8 Credit cards card 653
## 9 Accounts account 644
## 10 Credit cards credit 610
## # ... with 25,543 more rows
# compute TF-IDF to increase the product-specific identifiability of terms
new_data_counts <- new_data_counts %>%
bind_tf_idf(word, Product, n)
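# a quick refresher (illustration only): tf_idf = tf * idf, where idf(term) = ln(n_documents / n_documents
# containing the term); here the "documents" are the 7 product categories, so a term that appears under
# only one product gets idf = log(7/1)
log(7 / 1)   # ~1.95, matching the idf of "coinbase" in the output below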
# sort by TF-IDF
new_data_counts %>%
arrange(desc(tf_idf))
## # A tibble: 25,553 x 6
## Product word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Money transfers coinbase 29 0.00651 1.95 0.0127
## 2 Mortgage escrow 182 0.00621 1.25 0.00778
## 3 Money transfers gram 15 0.00337 1.95 0.00655
## 4 Loans navient 122 0.00636 0.847 0.00539
## 5 Mortgage modification 176 0.00601 0.847 0.00509
## 6 Credit reporting transunion 201 0.00351 1.25 0.00440
## 7 Money transfers usd 15 0.00337 1.25 0.00422
## 8 Mortgage homeowner 63 0.00215 1.95 0.00418
## 9 Credit reporting experian 272 0.00475 0.847 0.00403
## 10 Loans pslf 39 0.00203 1.95 0.00396
## # ... with 25,543 more rows
# plot it
new_data_counts %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(Product) %>%
top_n(10) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = Product)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~Product, ncol = 4, scales = "free") +
coord_flip()
## Selecting by tf_idf
# do you find any interpretable patterns?
# cast to dtm for further processing and analysis
new_dtm <- new_data_counts %>%
cast_dtm(Product, word, n)
new_dtm
## <<DocumentTermMatrix (documents: 7, terms: 10763)>>
## Non-/sparse entries: 25553/49788
## Sparsity : 66%
## Maximal term length: 45
## Weighting : term frequency (tf)
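# note: with only 7 "documents" (the collapsed product categories), this DTM is tiny compared with a
# per-complaint DTM; "Sparsity: 66%" means roughly two-thirds of the product-by-term cells are zero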
# fit a 7-topic LDA on the product-level DTM (we set K = 7 because we have 7 product categories)
# new_lda <- LDA(new_dtm, k = 7, control = list(seed = 123)) # do not run, just download the new_lda output from dropbox folder
new_lda <- readRDS(url("https://www.dropbox.com/s/epre5su5i0km5ho/new_lda.rds?dl=1"))
new_lda
## A LDA_VEM topic model with 7 topics.
# use augment() to join the per-word topic assignments back onto the document-term counts, showing which topic each word was assigned to within each Product "document"
# you will need broom package to be able to use augment() function
library(broom)
## Warning: package 'broom' was built under R version 3.6.3
assignments <- augment(new_lda, data = new_dtm)
assignments
## # A tibble: 25,553 x 4
## document term count .topic
## <chr> <chr> <dbl> <dbl>
## 1 Credit reporting credit 2386 6
## 2 Debt collection credit 654 5
## 3 Credit cards credit 610 3
## 4 Accounts credit 79 4
## 5 Mortgage credit 113 2
## 6 Loans credit 198 1
## 7 Money transfers credit 29 3
## 8 Credit reporting report 1290 6
## 9 Debt collection report 305 5
## 10 Credit cards report 92 3
## # ... with 25,543 more rows
# get the document-topic matrix (gamma, i.e., theta) and use it to determine the most likely topic for each Product, which serves as our product classification
new_theta <- tidy(new_lda, matrix = "gamma")
names(new_theta)[1] <- "Product"
new_theta
## # A tibble: 49 x 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Credit reporting 1 0.000000274
## 2 Debt collection 1 0.000000484
## 3 Credit cards 1 0.000000744
## 4 Accounts 1 0.00000109
## 5 Mortgage 1 0.000000535
## 6 Loans 1 0.928
## 7 Money transfers 1 0.00000352
## 8 Credit reporting 2 0.000000274
## 9 Debt collection 2 0.000000484
## 10 Credit cards 2 0.000000744
## # ... with 39 more rows
# sort product classification result
product_classifications <- new_theta %>%
group_by(Product) %>%
top_n(1, gamma) %>%
ungroup() %>%
arrange(gamma)
product_classifications
## # A tibble: 7 x 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Money transfers 7 0.658
## 2 Loans 1 0.928
## 3 Mortgage 2 0.995
## 4 Credit reporting 6 1.00
## 5 Accounts 4 1.00
## 6 Credit cards 3 1.00
## 7 Debt collection 5 1.00
# rename column
names(product_classifications)[1] <- "predicted"
# "join" the original product labels with our product (topic) assignment result side by side
assignments <- assignments %>%
inner_join(product_classifications, by = c(".topic" = "topic"))
assignments
## # A tibble: 25,553 x 6
## document term count .topic predicted gamma
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Credit reporting credit 2386 6 Credit reporting 1.00
## 2 Debt collection credit 654 5 Debt collection 1.00
## 3 Credit cards credit 610 3 Credit cards 1.00
## 4 Accounts credit 79 4 Accounts 1.00
## 5 Mortgage credit 113 2 Mortgage 0.995
## 6 Loans credit 198 1 Loans 0.928
## 7 Money transfers credit 29 3 Credit cards 1.00
## 8 Credit reporting report 1290 6 Credit reporting 1.00
## 9 Debt collection report 305 5 Debt collection 1.00
## 10 Credit cards report 92 3 Credit cards 1.00
## # ... with 25,543 more rows
# finally, we can plot the result, using colour intensity to show the percentage of word assignments from each true Product label that went to each predicted Product
library(scales)
assignments %>%
dplyr::count(document, predicted, wt = count) %>%
group_by(document) %>%
dplyr::mutate(percent = n / sum(n)) %>% # use dplyr::mutate explicitly: plyr::mutate (which masks it here) would ignore the grouping by document
ggplot(aes(predicted, document, fill = percent)) +
geom_tile() +
scale_fill_gradient2(high = "red", label = percent_format()) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank()) +
labs(x = "Complaints assigned to",
y = "Complaints came from",
fill = "% of assignments")
# it seems that, with the exception of Money transfers complaints being assigned to Credit cards, we have done a pretty good job of classifying complaints by product type
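# a rough way to quantify that impression (a sketch, not in the original document): the share of word
# assignments that landed in their true Product category
assignments %>%
  dplyr::mutate(correct = document == predicted) %>%
  dplyr::count(correct, wt = count) %>%
  dplyr::mutate(share = n / sum(n))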
Counting positive and negative sentiments, visualization, and more…
library(ggplot2)
library(plyr)
library(dplyr)
library(tibble)
library(tidyverse)
library(tidytext)
library(tidyr)
library(tm)
library(topicmodels)
library(stringr)
# most common positive and negative words
# preprocessing
new_data_unnested <- new_data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# count number of positive and negative words in the entire corpus
bing_word_counts <- new_data_unnested %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# display output (unsorted)
bing_word_counts
## # A tibble: 1,417 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 debt negative 1413
## 2 dispute negative 562
## 3 complaint negative 404
## 4 fraud negative 400
## 5 issue negative 360
## 6 fraudulent negative 332
## 7 disputed negative 256
## 8 correct positive 232
## 9 inaccurate negative 232
## 10 fair positive 223
## # ... with 1,407 more rows
# visualize it
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
# show it in "comparison" wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
install.packages("reshape2") # we will need it to use acast() function
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.6.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
new_data_unnested %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), # comparison.cloud() comes from wordcloud package
max.words = 100)
## Joining, by = "word"
# distribution of positive and negative sentiments across different types of financial products
new_text <- tibble(document = 1:nrow(new_data), text = new_data$text, Product = new_data$Product) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
group_by(Product, document) %>%
dplyr::mutate(text = paste(word, collapse = ' ')) %>% # use dplyr::mutate explicitly: plyr::mutate (which masks it here) would ignore the per-document grouping
distinct(Product, document, text)
## Joining, by = "word"
# using analyzeSentiment() from the SentimentAnalysis package to count positive and negative words per document and rate the sentiment type of each document
install.packages("SentimentAnalysis")
library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 3.6.3
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
# sentiment <- analyzeSentiment(new_text$text)
sentiment <- readRDS(url("https://www.dropbox.com/s/q4em5vu8s9t9vdd/sentiment.rds?dl=1"))
new_text$sentiment <- as.character(convertToBinaryResponse(sentiment)$SentimentGI)
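# quick sanity check (illustration only): overall counts of complaints rated positive vs negative
table(new_text$sentiment)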
# plot it
new_text %>%
dplyr::count(Product, sentiment, sort = TRUE) %>%
mutate(sentiment = reorder(sentiment, n)) %>% # sort by frequency in descending order
ggplot(aes(sentiment, n, fill = sentiment)) + # plotting
geom_col(show.legend = FALSE) +
facet_wrap(~ Product, scales = "free") +
scale_x_reordered() +
ylab("Frequency") +
theme_bw()
Make your own stopword list
# Note: stop_words (from the tidytext package) is a pre-specified list of stopwords stored in a data frame/tibble; you can specify your own stopword list and combine it with stop_words
library(tidytext)
data(stop_words)
# check out stop_words dimension and content
dim(stop_words)
head(stop_words)
# suppose you are working on political science text, you want to define a set of terms to remove
# you first define your own stopword list
word = c("authoritarianism", "oligarchy", "democratization", "coup")
# create the matching lexicon column: 4 entries because the word vector above contains 4 terms
lexicon = rep("psc", 4)
# create a data frame of the custom stopwords
psc = data.frame(word, lexicon)
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, psc)
## choosing the number of topics (full ldatuning run)
## find a suitable K, conditional on the distribution of words/terms and the number of documents
install.packages(c("ldatuning", "ggplot2", "scales", "snow", "foreach", "doParallel"))
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)
library(foreach)
library(doParallel)
# note: this search can take a long time to run; the main text loads a precomputed result from the Dropbox shared folder instead
result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 50, by = 1), # candidate number of K: 2 to 50 (must be > 1)
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
## perplexity analysis: cross-validation version
## cross-validation
burnin = 1000
iter = 1000
keep = 50
n <- nrow(dtm)
k <- 15 # number of topics
splitter <- sample(1:n, round(n * 0.75))
train_set <- dtm[splitter, ]
valid_set <- dtm[-splitter, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
perplexity(fitted, newdata = train_set)
perplexity(fitted, newdata = valid_set)
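# note (illustration): perplexity is exp(-log-likelihood per token), so lower values indicate a better
# fit to the held-out words; expect the validation-set perplexity to be somewhat higher than the
# training-set value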
## CV with many candidate n of topics
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
folds <- 5
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2, 3, 4, 5, 10, 20, 30, 40, 50, 75, 100) # candidates for how many topics
clusterExport(cluster, c("dtm", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))
system.time({
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0, nrow = folds, ncol = 2)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- dtm[splitfolds != i , ]
valid_set <- dtm[splitfolds == i, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))
}
return(results_1k)
}
})
stopCluster(cluster)
results_df <- as.data.frame(results)
ggplot(results_df, aes(x = k, y = perplexity)) +
geom_point() +
geom_smooth(se = FALSE) +
ggtitle("5-fold CV of topic modeling", "when fitting the trained model to the hold-out set") +
labs(x = "Candidate number of topics", y = "Perplexity")
# which gives us a plot of hold-out perplexity against the candidate number of topics