Stripping raw text using unnest_tokens()
# load required packages
# install.packages(c("tibble", "tidytext", "ggplot2", "gutenbergr", "tidyr"))
library(tibble)
library(tidytext)
library(ggplot2)
library(gutenbergr)
library(tidyr)
# example text
text <- c("A market meltdown. Surging recession fears. And a sudden spotlight on America's health care system. Goldman Sachs is warning Wall Street that the coronavirus could cost President Donald Trump the election.",
"The potential political fallout from the coronavirus adds yet more uncertainty for investors trying to assess the impact of the fast-moving epidemic. ",
"If the coronavirus epidemic materially affects US economic growth it may increase the likelihood of Democratic victory in the 2020 election, Goldman Sachs analysts led by Ben Snider wrote in a report published Wednesday night.",
"That could be a negative for stocks because investors have been hoping for a continuation of the low-tax, light-regulation approach of the Trump administration. And Trump of course has been laser-focused on boosting stock prices."
)
text_df <- tibble(line = 1:4, text = text) # tibble() comes from the tibble package (part of the tidyverse); we use it to arrange the four lines of text
# note the data is in tibble format. A tibble is a modern class of R data frame, available in the dplyr and tibble packages; it has a convenient print method and does not use row names. The text column is of character (chr) class.
text_df
## # A tibble: 4 × 2
## line text
## <int> <chr>
## 1 1 "A market meltdown. Surging recession fears. And a sudden spotlight on …
## 2 2 "The potential political fallout from the coronavirus adds yet more unc…
## 3 3 "If the coronavirus epidemic materially affects US economic growth it m…
## 4 4 "That could be a negative for stocks because investors have been hoping…
# we now need to convert text_df to the one-token-per-row format for further analysis
text_df %>%
unnest_tokens(word, text) # this will break each token (word) from a string of text into its own row
## # A tibble: 127 × 2
## line word
## <int> <chr>
## 1 1 a
## 2 1 market
## 3 1 meltdown
## 4 1 surging
## 5 1 recession
## 6 1 fears
## 7 1 and
## 8 1 a
## 9 1 sudden
## 10 1 spotlight
## # ℹ 117 more rows
# load Jane Austen's books (bundled with the janeaustenr package)
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", # regex to flag chapter headings: "chapter" followed by a digit (\d) or a Roman-numeral letter (i, v, x, l, c)
ignore_case = TRUE)))) %>% # match regardless of upper/lowercase
ungroup() # remove the grouping by book
# check out output
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ℹ 73,412 more rows
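# a quick illustrative check of what the chapter regex matches (the example strings below are made up):
str_detect(c("CHAPTER 1", "Chapter iv", "the chapter ended"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE)) # returns TRUE TRUE FALSE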
# parsing using tidytext
library(tidytext)
tidy_austen <- original_books %>%
unnest_tokens(word, text)
tidy_austen
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ℹ 725,045 more rows
# anti-join with stop_words (a data frame) to remove stopwords. Note: stop_words is a pre-defined list of stopwords stored as a data frame/tibble; you can define your own stopword list and combine it with the existing stop_words data frame (see the appendix)
data(stop_words)
tidy_austen <- tidy_austen %>%
anti_join(stop_words) # use anti_join() to remove stopwords from the tokenized text (stored in tidy_austen)
## Joining with `by = join_by(word)`
# count and sort by word frequency
tidy_austen %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 13,914 × 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ℹ 13,904 more rows
# plot word frequency
library(ggplot2)
tidy_austen %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 600) %>% # select those that appear more than 600 times
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting (word in x-axis, sort by its frequency (n) in the y-axis)
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, such that word appears on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# download H.G. Wells's 4 books from gutenberg archive
library(gutenbergr)
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: ! Could not download a book at http://aleph.gutenberg.org/1/5/159/159.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
# parsing
tidy_hgwells <- hgwells %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
# count word frequency
tidy_hgwells %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 10,320 × 2
## word n
## <chr> <int>
## 1 time 396
## 2 people 249
## 3 door 224
## 4 kemp 213
## 5 invisible 197
## 6 black 178
## 7 stood 174
## 8 night 168
## 9 heard 167
## 10 hall 165
## # ℹ 10,310 more rows
# compare word frequencies for words commonly used by the two authors
library(tidyr)
# create an additional "author" column and use bind_rows() to bind the two data frames together by rows
frequency <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_austen, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>% # keep only the lowercase-letter/apostrophe part of each token (drops leftover formatting such as underscores)
dplyr::count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>% # create a new column "proportion" and append it to existing dataframe
select(-n) %>% # drop the raw count column (we keep only the proportions)
spread(author, proportion) %>% # spread() rotates the "author" column into two columns (one per author), each holding that author's word proportions
gather(author, proportion, `H.G. Wells`) # gather() collapses columns back into key-value pairs; here we use it to re-create an "author" column for H.G. Wells while keeping "Jane Austen" as its own column
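# note: spread() and gather() still work but are superseded in newer versions of tidyr;
# an equivalent of the last two steps (a sketch, assuming tidyr >= 1.0.0) would be:
# pivot_wider(names_from = author, values_from = proportion) %>%
#   pivot_longer(cols = `H.G. Wells`, names_to = "author", values_to = "proportion")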
library(scales) # automatically set breaks and labels for axes on ggplot plotting environment
# plot Jane Austen's word proportions on the y-axis against H.G. Wells's on the x-axis
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
# adding small random variation to the location of each data point
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
## Warning: Removed 12833 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12834 rows containing missing values or values outside the scale range
## (`geom_text()`).
What kind of financial products were customers complaining about?
# install.packages(c("plyr", "tidyverse", "wordcloud2", "tm", "topicmodels", "stringr", "dplyr", "ldatuning", "snow", "vctrs", "scales"))
library(vctrs)
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
##
## data_frame
## The following object is masked from 'package:tibble':
##
## data_frame
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ purrr 1.0.1
## ✔ lubridate 1.9.2 ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plyr::arrange() masks dplyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compact() masks plyr::compact()
## ✖ plyr::count() masks dplyr::count()
## ✖ vctrs::data_frame() masks dplyr::data_frame(), tibble::data_frame()
## ✖ plyr::desc() masks dplyr::desc()
## ✖ purrr::discard() masks scales::discard()
## ✖ plyr::failwith() masks dplyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ plyr::id() masks dplyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ plyr::mutate() masks dplyr::mutate()
## ✖ plyr::rename() masks dplyr::rename()
## ✖ plyr::summarise() masks dplyr::summarise()
## ✖ plyr::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(tidyr)
library(wordcloud2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(topicmodels)
library(ggplot2)
library(stringr)
# load customer complaints data (10k random sample)
data <- read.csv("https://www.dropbox.com/s/6kofxbf8f3cq8cw/complaints10k.csv?dl=1", stringsAsFactors=FALSE)
# check out the data
dim(data)
## [1] 10000 18
names(data)
## [1] "Date.received" "Product"
## [3] "Sub.product" "Issue"
## [5] "Sub.issue" "Consumer.complaint.narrative"
## [7] "Company.public.response" "Company"
## [9] "State" "ZIP.code"
## [11] "Tags" "Consumer.consent.provided."
## [13] "Submitted.via" "Date.sent.to.company"
## [15] "Company.response.to.consumer" "Timely.response."
## [17] "Consumer.disputed." "Complaint.ID"
# we'll take a look at the "Consumer.complaint.narrative" (the 6th column)
head(data[6])
## Consumer.complaint.narrative
## 1
## 2
## 3 I submitted a complaint to SPS last month via CFPB. SPS did not respond to the specific questions contained in the Qualified Written Request, nor did they respond to my request to schedule an in-person meeting at SPS HQ in Utah to review their " original copies '' of documents they allege are valid and are associated with my property. SPS is trying to use these so-called documents to foreclose our home based on an assertion of a financial interest that does not exist.
## 4
## 5
## 6
# rename
names(data)[6] <- "text"
text_df <- tibble(document = 1:10000, text = data$text) # use "document" in lieu of "line"; this creates a 2-column tibble consisting of a document ID and the complaint text
# load stop_words and create our own stopword list
data(stop_words)
word = c("xxxx", "xx") # create a vector containing these two anonymity abbreviations
# create a matching "lexicon" label, repeated twice because we added two words
lexicon = rep("anonymous", 2) # label "xxxx" and "xx" as belonging to our "anonymous" lexicon
# create a dataframe for "words"
anonymous = data.frame(word, lexicon) # combine them into a dataframe
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, anonymous) # combine with existing stop_words list
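# quick sanity check (optional): the last rows of stop_words should now be our two "anonymous" entries
tail(stop_words, 3)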
# text processing
text_unnested <- text_df %>%
unnest_tokens(word, text) %>% # tokenize: remove punctuation, convert to lowercase, etc.
anti_join(stop_words) # remove stopwords
## Joining with `by = join_by(word)`
text_counts <- text_unnested %>% # generate and append a word count column "n" to existing tibble object and sort by freq.
dplyr::count(document, word, sort = TRUE)
# plot word frequency
library(ggplot2)
text_counts %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 400) %>% # only display those that appeared > 400 times
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, word should appear on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# you can also plot these words in the media-savvy "wordcloud" way using wordcloud2() function
# to do that, we need "word" and "n" (count) columns from text_counts object, make them into another tidy object
text_cloud <- text_counts %>%
dplyr::count(word, sort = TRUE) # this will remove the document ID column (which is not needed here)
library(wordcloud2) # load wordcloud2 package
wordcloud2(text_cloud) # the default setting, simply apply wordcloud2() on the new tidy object you just created
# wordcloud2(text_cloud[1:100,]) # you can control the number of words/terms appearing on the cloud to make it sparser (or denser)
# wordcloud2(text_cloud, shape = "circle", color = "random-light", backgroundColor="black") # you can also set your preferred shape (for example, heart shape "cardioid"), color, and background color
## "cast" the tibble object into document term matrix (DTM) to feed into LDA analysis
dtm <- text_counts %>%
cast_dtm(document, word, n)
## see what's inside the dtm object
str(dtm)
## List of 6
## $ i : int [1:117742] 1 3 4 5 6 7 8 9 10 11 ...
## $ j : int [1:117742] 1 1 1 1 1 1 1 1 1 1 ...
## $ v : num [1:117742] 46 46 44 44 44 44 44 44 44 44 ...
## $ nrow : int 2551
## $ ncol : int 10734
## $ dimnames:List of 2
## ..$ Docs : chr [1:2551] "6551" "7005" "7660" "1815" ...
## ..$ Terms: chr [1:10734] "consumer" "inquiry" "debt" "fees" ...
## - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# you see the following components: i, j, v, nrow, ncol, and dimnames
# i and j are integer vectors of row and column indices: the i-th document and the j-th term
# v holds the value for each (i, j) pair, i.e. the frequency of the j-th term in the i-th document
# nrow and ncol give the dimensions of the matrix (number of documents and number of terms)
# dimnames stores the document IDs (Docs) and the terms themselves (Terms)
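# to relate i, j, v back to actual documents and terms, we can look up a single stored entry by hand
# (a minimal sketch using base R on the triplet components):
dtm$dimnames$Docs[dtm$i[1]]  # document ID of the first stored entry
dtm$dimnames$Terms[dtm$j[1]] # the term of that entry
dtm$v[1]                     # its frequency in that document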
## perplexity analysis: find out the ideal number of topics, conditional on the distribution of words/terms and the number of documents
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)
# do not run, just load the data from our Dropbox shared folder
# result <- FindTopicsNumber(
# dtm,
# topics = seq(from = 2, to = 50, by = 1), # candidate number of topic K: 2 to 50 (must be > 1)
# metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
# method = "Gibbs", # Gibbs sampling
# control = list(seed = 123),
# mc.cores = 2L,
# verbose = FALSE)
# for the CaoJuan2009 and Arun2010 metrics, lower values are better (look for the minimum)
# for the Griffiths2004 and Deveaud2014 metrics, higher values are better (look for the maximum)
# eyeball the plot to locate the region where the candidate number of topics (K) best satisfies all four criteria
result <- readRDS(url("https://www.dropbox.com/s/ap8omatxqm3v2g3/result.rds?dl=1"))
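# you can also inspect the raw metric values before plotting; result is the data frame returned
# by FindTopicsNumber(), with one row per candidate K and one column per metric
head(result)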
# the result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the ldatuning package.
## Please report the issue at <https://github.com/nikita-moor/ldatuning/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 15-topic LDA
library(topicmodels)
# text.lda <- LDA(dtm, k = 15, control = list(seed = 123)) # do not run, just load the output from our dropbox folder
text.lda <- readRDS(url("https://www.dropbox.com/s/4po0veokpr4thju/text.lda.rds?dl=1"))
class(text.lda) # note: this is an S4 object; its elements are not subsettable with $ or [[ like a list, so we use tidy() to extract them
## [1] "LDA_VEM"
## attr(,"package")
## [1] "topicmodels"
# note that you can turn the model back to the one-topic-per-term-per-row format using tidy()
text_beta <- tidy(text.lda, matrix = "beta") # the output is the term-topic matrix (often written φ); note: "beta" is the default matrix argument
# term-topic assignment: for each term-topic combination the model estimates beta, the probability of that term being generated from that topic
text_beta
## # A tibble: 161,445 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 9.82e- 2
## 2 2 consumer 1.26e- 3
## 3 3 consumer 9.45e-16
## 4 4 consumer 2.62e- 9
## 5 5 consumer 1.30e- 5
## 6 6 consumer 1.47e- 7
## 7 7 consumer 1.52e- 8
## 8 8 consumer 1.13e- 5
## 9 9 consumer 2.51e-28
## 10 10 consumer 6.06e-23
## # ℹ 161,435 more rows
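# sanity check (optional): within each topic, beta is a probability distribution over the vocabulary,
# so the betas should sum to (approximately) 1 per topic
# (dplyr::summarise is spelled out because plyr also defines summarise and masks it in this session)
text_beta %>%
  group_by(topic) %>%
  dplyr::summarise(total = sum(beta))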
# we could use top_n() function from the dplyr package to find the top 10 terms of each topic
top_terms <- text_beta %>%
group_by(topic) %>%
top_n(10, beta) %>% # top 10 terms of each group ordered by beta
ungroup() %>%
arrange(topic, -beta) # display in descending order
table(top_terms$topic) # 15 unique topics and 10 top terms / each
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
# the structure of the data
top_terms
## # A tibble: 150 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 0.0982
## 2 1 information 0.0570
## 3 1 identity 0.0432
## 4 1 theft 0.0423
## 5 1 section 0.0327
## 6 1 reporting 0.0317
## 7 1 agency 0.0274
## 8 1 block 0.0220
## 9 1 report 0.0217
## 10 1 victim 0.0176
## # ℹ 140 more rows
# topic-document assignment: tidytext calls the per-document-per-topic probabilities "gamma" (the document-topic matrix, often written θ)
text.lda_gamma <- tidy(text.lda, matrix = "gamma")
text.lda_gamma
## # A tibble: 38,265 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 6551 1 0.999
## 2 7005 1 0.000393
## 3 7660 1 0.999
## 4 1815 1 0.998
## 5 2675 1 0.998
## 6 2817 1 0.998
## 7 2981 1 0.998
## 8 4282 1 0.998
## 9 6830 1 0.998
## 10 9016 1 0.998
## # ℹ 38,255 more rows
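# for any individual complaint we can still extract its most likely topic
# (a minimal sketch; top_n(1, gamma) keeps the highest-gamma row per document)
text.lda_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  head(10)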
# since we have 10,000 documents, it's infeasible to visualize their distribution across different topics, but we can do that for term-topic distribution, that is, the beta parameter.
# We have previously generated the term-topic matrix named "text_beta"
# using dplyr's top_n() function to find out the top 10 terms of each topic
top_terms <- text_beta %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta) # arrange in descending order
# finally, we can plot term freq. distribution across all 15 topics using the ggplot() function
library(ggplot2)
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>% # sort term by beta AND within each topic
ggplot(aes(term, beta, fill = factor(topic))) + # term on the x-axis, beta on the y-axis
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + # arrange sub-plots by topic, here we should output 15 sub-plots because we have 15 topics
coord_flip() + # flip x-y axis, so term will be on the y-axis, beta on the x-axis
scale_x_reordered()
# do you find something insightful regarding what types of financial products customers were complaining about?
## prediction accuracy: let's process the data again but (i) retain the Product labels and (ii) collapse the categories into 7 main types
# collapse Product categories
require("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
data$Product <- recode(data$Product, "c('Bank account or service','Checking or savings account') = 'Accounts'; else = data$Product")
data$Product <- recode(data$Product, "c('Consumer Loan','Payday loan', 'Payday loan, title loan, or personal loan', 'Student loan', 'Vehicle loan or lease') = 'Loans'; else = data$Product")
data$Product <- recode(data$Product, "c('Credit card','Credit card or prepaid card', 'Prepaid card') = 'Credit cards'; else = data$Product")
data$Product <- recode(data$Product, "c('Credit reporting','Credit reporting, credit repair services, or other personal consumer reports') = 'Credit reporting'; else = data$Product")
data$Product <- recode(data$Product, "c('Money transfer, virtual currency, or money service','Money transfers', 'Other financial service', 'Virtual currency') = 'Money transfers'; else = data$Product")
# now we have 7 main product categories
table(data$Product) # get a glimpse of their freq. counts
##
## Accounts Credit cards Credit reporting Debt collection
## 970 1037 3217 1801
## Loans Money transfers Mortgage
## 729 134 2112
# subset the data
new_data <- data[, c("text", "Product")]
new_data_unnested <- new_data %>%
unnest_tokens(word, text)
new_data_counts <- new_data_unnested %>%
anti_join(stop_words) %>% # remove stop words
dplyr::count(Product, word, sort = TRUE) # this will generate a freq. count column "n"
## Joining with `by = join_by(word)`
# display result
head(new_data_counts, 10)
## Product word n
## 1 Credit reporting credit 2386
## 2 Credit reporting report 1290
## 3 Credit reporting account 1178
## 4 Credit reporting information 1136
## 5 Debt collection debt 951
## 6 Credit reporting reporting 796
## 7 Debt collection credit 654
## 8 Credit cards card 653
## 9 Accounts account 644
## 10 Credit cards credit 610
# compute TF-IDF to make terms more product-specific: terms that appear often within one product category but rarely in the other categories receive higher weight
new_data_counts <- new_data_counts %>%
bind_tf_idf(word, Product, n)
# sort by TF-IDF
new_data_counts %>%
arrange(desc(tf_idf)) %>% # order by tf_idf in descending order
top_n(100) # display top 100 terms
## Selecting by tf_idf
## Product word n tf idf tf_idf
## 1 Money transfers coinbase 29 0.0065153898 1.9459101 0.0126783631
## 2 Mortgage escrow 182 0.0062107562 1.2527630 0.0077806054
## 3 Money transfers gram 15 0.0033700292 1.9459101 0.0065577740
## 4 Loans navient 122 0.0063690942 0.8472979 0.0053965199
## 5 Mortgage modification 176 0.0060060060 0.8472979 0.0050888760
## 6 Credit reporting transunion 201 0.0035121440 1.2527630 0.0043998839
## 7 Money transfers usd 15 0.0033700292 1.2527630 0.0042218478
## 8 Mortgage homeowner 63 0.0021498771 1.9459101 0.0041834678
## 9 Credit reporting experian 272 0.0047527521 0.8472979 0.0040269966
## 10 Loans pslf 39 0.0020360219 1.9459101 0.0039619157
## 11 Money transfers tech 9 0.0020220175 1.9459101 0.0039346644
## 12 Credit reporting equifax 354 0.0061855670 0.5596158 0.0034615410
## 13 Mortgage pmi 49 0.0016721267 1.9459101 0.0032538083
## 14 Money transfers wire 24 0.0053920467 0.5596158 0.0030174745
## 15 Mortgage appraisal 44 0.0015015015 1.9459101 0.0029217870
## 16 Mortgage homeowners 43 0.0014673765 1.9459101 0.0028553828
## 17 Mortgage nationstar 64 0.0021840022 1.2527630 0.0027360371
## 18 Money transfers 60000.00 6 0.0013480117 1.9459101 0.0026231096
## 19 Money transfers cryptocurrency 6 0.0013480117 1.9459101 0.0026231096
## 20 Money transfers skrill 6 0.0013480117 1.9459101 0.0026231096
## 21 Money transfers western 9 0.0020220175 1.2527630 0.0025331087
## 22 Money transfers paypal 31 0.0069647270 0.3364722 0.0023434373
## 23 Loans loans 277 0.0144609762 0.1541507 0.0022291693
## 24 Mortgage ditech 52 0.0017745018 1.2527630 0.0022230301
## 25 Credit cards citi 55 0.0026118340 0.8472979 0.0022130013
## 26 Money transfers ltc 5 0.0011233431 1.9459101 0.0021859247
## 27 Money transfers receiver 5 0.0011233431 1.9459101 0.0021859247
## 28 Accounts atm 36 0.0024972253 0.8472979 0.0021158937
## 29 Loans repayment 72 0.0037588097 0.5596158 0.0021034893
## 30 Credit cards sam 34 0.0016145883 1.2527630 0.0020226964
## 31 Mortgage realtor 29 0.0009896260 1.9459101 0.0019257233
## 32 Mortgage wf 29 0.0009896260 1.9459101 0.0019257233
## 33 Money transfers seller 10 0.0022466861 0.8472979 0.0019036124
## 34 Money transfers shipping 10 0.0022466861 0.8472979 0.0019036124
## 35 Loans fedloan 29 0.0015139650 1.2527630 0.0018966393
## 36 Accounts umb 14 0.0009711432 1.9459101 0.0018897574
## 37 Loans education 42 0.0021926390 0.8472979 0.0018578183
## 38 Debt collection creditor 106 0.0032729181 0.5596158 0.0018315767
## 39 Debt collection validation 106 0.0032729181 0.5596158 0.0018315767
## 40 Loans ally 40 0.0020882276 0.8472979 0.0017693508
## 41 Money transfers 31000.00 4 0.0008986745 1.9459101 0.0017487397
## 42 Money transfers gbp 4 0.0008986745 1.9459101 0.0017487397
## 43 Money transfers lexington 4 0.0008986745 1.9459101 0.0017487397
## 44 Money transfers litecoin 4 0.0008986745 1.9459101 0.0017487397
## 45 Credit reporting reseller 51 0.0008911410 1.9459101 0.0017340803
## 46 Mortgage fay 26 0.0008872509 1.9459101 0.0017265105
## 47 Mortgage remic 26 0.0008872509 1.9459101 0.0017265105
## 48 Loans nelnet 26 0.0013573480 1.2527630 0.0017004352
## 49 Loans students 26 0.0013573480 1.2527630 0.0017004352
## 50 Mortgage ocwen 88 0.0030030030 0.5596158 0.0016805279
## 51 Loans salliemae 15 0.0007830854 1.9459101 0.0015238137
## 52 Credit reporting section 255 0.0044557050 0.3364722 0.0014992210
## 53 Credit reporting theft 253 0.0044207583 0.3364722 0.0014874624
## 54 Accounts schwab 11 0.0007630411 1.9459101 0.0014848094
## 55 Credit reporting bureaus 248 0.0043333916 0.3364722 0.0014580660
## 56 Loans ibr 22 0.0011485252 1.2527630 0.0014388298
## 57 Credit reporting subsection 65 0.0011357680 1.2527630 0.0014228480
## 58 Loans pheaa 14 0.0007308797 1.9459101 0.0014222262
## 59 Money transfers signup 5 0.0011233431 1.2527630 0.0014072826
## 60 Loans forbearance 48 0.0025058731 0.5596158 0.0014023262
## 61 Debt collection collector 81 0.0025010035 0.5596158 0.0013996010
## 62 Money transfers citibank 11 0.0024713548 0.5596158 0.0013830091
## 63 Credit cards mastercard 23 0.0010922215 1.2527630 0.0013682946
## 64 Money transfers pnc 18 0.0040440350 0.3364722 0.0013607055
## 65 Accounts cd 23 0.0015954495 0.8472979 0.0013518209
## 66 Accounts usaa 23 0.0015954495 0.8472979 0.0013518209
## 67 Accounts bb 10 0.0006936737 1.9459101 0.0013498267
## 68 Credit reporting creditor 136 0.0023763760 0.5596158 0.0013298575
## 69 Mortgage loancare 20 0.0006825007 1.9459101 0.0013280850
## 70 Money transfers 62000.00 3 0.0006740058 1.9459101 0.0013115548
## 71 Money transfers 69.00 3 0.0006740058 1.9459101 0.0013115548
## 72 Money transfers gallery 3 0.0006740058 1.9459101 0.0013115548
## 73 Money transfers grams 3 0.0006740058 1.9459101 0.0013115548
## 74 Money transfers hacking 3 0.0006740058 1.9459101 0.0013115548
## 75 Money transfers moneygram 3 0.0006740058 1.9459101 0.0013115548
## 76 Money transfers ria 3 0.0006740058 1.9459101 0.0013115548
## 77 Debt collection erc 21 0.0006484083 1.9459101 0.0012617443
## 78 Debt collection portfolio 48 0.0014820761 0.8472979 0.0012557599
## 79 Credit cards menards 13 0.0006173426 1.9459101 0.0012012932
## 80 Mortgage phh 18 0.0006142506 1.9459101 0.0011952765
## 81 Loans student 147 0.0076742365 0.1541507 0.0011829888
## 82 Money transfers currency 4 0.0008986745 1.2527630 0.0011258261
## 83 Money transfers hrs 4 0.0008986745 1.2527630 0.0011258261
## 84 Loans gm 11 0.0005742626 1.9459101 0.0011174634
## 85 Credit reporting inquiries 190 0.0033199371 0.3364722 0.0011170667
## 86 Credit cards american 42 0.0019944914 0.5596158 0.0011161489
## 87 Credit reporting verifiable 50 0.0008736677 1.2527630 0.0010944985
## 88 Accounts dormant 8 0.0005549390 1.9459101 0.0010798613
## 89 Credit cards rewards 18 0.0008547820 1.2527630 0.0010708393
## 90 Credit reporting fcra 181 0.0031626769 0.3364722 0.0010641530
## 91 Mortgage planet 16 0.0005460005 1.9459101 0.0010624680
## 92 Loans coast 10 0.0005220569 1.9459101 0.0010158758
## 93 Credit cards amex 17 0.0008072941 1.2527630 0.0010113482
## 94 Mortgage bayview 15 0.0005118755 1.9459101 0.0009960638
## 95 Mortgage preservation 15 0.0005118755 1.9459101 0.0009960638
## 96 Mortgage seterus 15 0.0005118755 1.9459101 0.0009960638
## 97 Mortgage quicken 23 0.0007848758 1.2527630 0.0009832633
## 98 Mortgage freedom 34 0.0011602512 0.8472979 0.0009830783
## 99 Loans aes 22 0.0011485252 0.8472979 0.0009731429
## 100 Accounts citibank 25 0.0017341842 0.5596158 0.0009704769
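# where do these numbers come from? as a sanity check (a sketch), we can recompute tf-idf by hand
# for one term: tf = n / total tokens in the product category, idf = ln(number of categories /
# number of categories containing the term), and tf_idf = tf * idf. For "coinbase", which appears
# only in "Money transfers", idf = ln(7/1):
new_data_counts %>%
  group_by(Product) %>%
  dplyr::mutate(total = sum(n)) %>% # dplyr::mutate so the grouping is respected (plyr masks mutate)
  ungroup() %>%
  filter(word == "coinbase") %>%
  dplyr::mutate(tf_manual     = n / total,
                idf_manual    = log(7 / 1),
                tf_idf_manual = tf_manual * idf_manual)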
# plot it: top terms by product category according to TF-IDF (more product-specific)
new_data_counts %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(Product) %>%
top_n(10) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = Product)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "TF-IDF") +
facet_wrap(~Product, ncol = 4, scales = "free") + # generate 7 sub-plots
coord_flip() # flip x and y axis
## Selecting by tf_idf
# do you find any interpretable patterns?
# cast to dtm for further processing and analysis
new_dtm <- new_data_counts %>%
cast_dtm(Product, word, n)
new_dtm
## <<DocumentTermMatrix (documents: 7, terms: 10734)>>
## Non-/sparse entries: 25522/49616
## Sparsity : 66%
## Maximal term length: 45
## Weighting : term frequency (tf)
# fit a 7-topic LDA (we set K = 7 because we have 7 product categories)
# new_lda <- LDA(new_dtm, k = 7, control = list(seed = 123)) # do not run, just download the new_lda output from dropbox folder
new_lda <- readRDS(url("https://www.dropbox.com/s/epre5su5i0km5ho/new_lda.rds?dl=1"))
# join the term-topic assignments (the 𝜑 matrix of per-topic term probabilities, beta) with the document-term matrix to see which topic (product type) the model assigns to each term occurrence in a document (customer complaint)
# you will need the "broom" package to be able to use augment() function
library(broom)
assignments <- augment(new_lda, data = new_dtm) # this will append a "topic" column to the DTM object (it's the topic assigned to a term by the LDA model)
assignments
## # A tibble: 25,522 × 4
## document term count .topic
## <chr> <chr> <dbl> <dbl>
## 1 Credit reporting credit 2386 6
## 2 Debt collection credit 654 5
## 3 Credit cards credit 610 3
## 4 Accounts credit 79 4
## 5 Mortgage credit 113 2
## 6 Loans credit 198 1
## 7 Money transfers credit 29 3
## 8 Credit reporting report 1290 6
## 9 Debt collection report 305 5
## 10 Credit cards report 92 3
## # ℹ 25,512 more rows
# get the document-topic matrix (often written θ; tidytext calls these probabilities "gamma") and use it to determine the most likely topic (product) for each document
new_theta <- tidy(new_lda, matrix = "gamma")
names(new_theta)[1] <- "Product"
head(new_theta, 10)
## # A tibble: 10 × 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Credit reporting 1 0.000000274
## 2 Debt collection 1 0.000000484
## 3 Credit cards 1 0.000000744
## 4 Accounts 1 0.00000109
## 5 Mortgage 1 0.000000535
## 6 Loans 1 0.928
## 7 Money transfers 1 0.00000352
## 8 Credit reporting 2 0.000000274
## 9 Debt collection 2 0.000000484
## 10 Credit cards 2 0.000000744
# sort product classification result
product_classifications <- new_theta %>%
group_by(Product) %>%
top_n(1, gamma) %>% # highest predicted topic assignment (gamma) for a document
ungroup() %>%
arrange(gamma)
product_classifications
## # A tibble: 7 × 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Money transfers 7 0.658
## 2 Loans 1 0.928
## 3 Mortgage 2 0.995
## 4 Credit reporting 6 1.00
## 5 Accounts 4 1.00
## 6 Credit cards 3 1.00
## 7 Debt collection 5 1.00
# rename column
names(product_classifications)[1] <- "predicted"
# "join" the original product labels with our product (topic) assignment result by "topic"
assignments <- assignments %>%
inner_join(product_classifications, by = c(".topic" = "topic"))
# take a look at the format of this output
head(assignments, 10)
## # A tibble: 10 × 6
## document term count .topic predicted gamma
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Credit reporting credit 2386 6 Credit reporting 1.00
## 2 Debt collection credit 654 5 Debt collection 1.00
## 3 Credit cards credit 610 3 Credit cards 1.00
## 4 Accounts credit 79 4 Accounts 1.00
## 5 Mortgage credit 113 2 Mortgage 0.995
## 6 Loans credit 198 1 Loans 0.928
## 7 Money transfers credit 29 3 Credit cards 1.00
## 8 Credit reporting report 1290 6 Credit reporting 1.00
## 9 Debt collection report 305 5 Debt collection 1.00
## 10 Credit cards report 92 3 Credit cards 1.00
# finally, we can plot the result, using color intensity to show, for each actual product type, the percentage of its word counts assigned to each predicted product type (i.e., how much the model "agrees" with the original labels)
library(scales)
assignments %>%
dplyr::count(document, predicted, wt = count) %>% # wt = count weights each document-predicted pair by term frequency
group_by(document) %>%
dplyr::mutate(percent = n / sum(n)) %>% # share of each actual category's word counts assigned to each predicted category (dplyr::mutate spelled out so the group_by() above is respected; plyr masks mutate)
ggplot(aes(predicted, document, fill = percent)) +
geom_tile() +
scale_fill_gradient2(high = "red", label = percent_format()) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1), # rotate the x-axis labels 90 degrees
panel.grid = element_blank()) +
labs(x = "Predicted complaint type",
y = "Actual complaint type",
fill = "% of agreement")
# it seems that, with the exception of money transfers (which the model tends to confuse with credit cards), we have done a decent job of classifying complaint product types
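# to put a single number on this agreement (a sketch): the share of word occurrences whose
# predicted product category matches the original label
assignments %>%
  dplyr::summarise(word_level_agreement = sum(count[document == predicted]) / sum(count))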
Counting positive and negative sentiments, visualization, and more…
library(ggplot2)
library(plyr)
library(dplyr)
library(tibble)
library(tidyverse)
library(tidytext)
library(tidyr)
library(tm)
library(topicmodels)
library(stringr)
# most common positive and negative words
# preprocessing
new_data_unnested <- new_data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
# count number of positive and negative words in the entire corpus
bing_word_counts <- new_data_unnested %>%
inner_join(get_sentiments("bing")) %>% # inner join with Microsoft bing sentiment dictionary
dplyr::count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining with `by = join_by(word)`
# display the top 50 rows (sorted by frequency)
head(bing_word_counts, 50)
## word sentiment n
## 1 debt negative 1413
## 2 dispute negative 562
## 3 complaint negative 404
## 4 fraud negative 400
## 5 issue negative 360
## 6 fraudulent negative 332
## 7 disputed negative 256
## 8 correct positive 232
## 9 inaccurate negative 232
## 10 fair positive 223
## 11 negative negative 218
## 12 refused negative 212
## 13 error negative 182
## 14 violation negative 178
## 15 failed negative 177
## 16 denied negative 167
## 17 incorrect negative 167
## 18 issues negative 157
## 19 hard negative 154
## 20 unable negative 154
## 21 wrong negative 154
## 22 false negative 146
## 23 refund positive 145
## 24 accurate positive 128
## 25 lost negative 106
## 26 missed negative 101
## 27 freeze negative 100
## 28 recovery positive 100
## 29 limit negative 98
## 30 protection positive 90
## 31 breach negative 85
## 32 support positive 82
## 33 illegal negative 80
## 34 refuse negative 77
## 35 free positive 76
## 36 proper positive 75
## 37 concern negative 73
## 38 savings positive 73
## 39 hung negative 71
## 40 refuses negative 69
## 41 mistake negative 68
## 42 delinquent negative 67
## 43 complaints negative 66
## 44 bad negative 65
## 45 hardship negative 65
## 46 afford positive 64
## 47 debts negative 64
## 48 correctly positive 63
## 49 promised positive 62
## 50 stolen negative 62
# visualize it
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") + # create two subplots by sentiment type
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
# show it in "comparison" wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
# install.packages("reshape2") # we will need it in order to use acast() function
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
new_data_unnested %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>% # reshape into a word-by-sentiment matrix of counts (n)
comparison.cloud(colors = c("gray20", "gray80"), # comparison.cloud() comes from wordcloud package
max.words = 100)
## Joining with `by = join_by(word)`
# distribution of positive and negative sentiments across different types of financial products
# convert the data into tibble format for the ease of data wrangling
new_text <- tibble(document = 1:nrow(new_data), text = new_data$text, Product = new_data$Product) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
group_by(Product, document) %>%
dplyr::mutate(text = paste(word, collapse = ' ')) %>% # paste the remaining tokens back into one cleaned string per document (dplyr::mutate spelled out so the grouping is respected; plyr masks mutate)
distinct(Product, document, text) # collapse the token rows to one row per document
## Joining with `by = join_by(word)`
# using analyzeSentiment() from the SentimentAnalysis package to count positive and negative words per document and rate the sentiment type of each document
#install.packages("SentimentAnalysis")
library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
##
## The following object is masked from 'package:base':
##
## write
# sentiment <- analyzeSentiment(new_text$text) # this will take a while, so just download the output from our shared Dropbox folder
sentiment <- readRDS(url("https://www.dropbox.com/s/q4em5vu8s9t9vdd/sentiment.rds?dl=1"))
# the verdict: take the sentiment rating (SentimentGI) from the output of convertToBinaryResponse(), convert it to a character vector ("positive"/"negative"), and append it as a new column of new_text (a tibble)
new_text$sentiment <- as.character(convertToBinaryResponse(sentiment)$SentimentGI)
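# a quick illustration on a made-up sentence (a sketch): analyzeSentiment() returns several
# dictionary-based scores, and convertToBinaryResponse() maps them to positive/negative
toy <- analyzeSentiment("The bank refused to fix the error and I lost money.")
convertToBinaryResponse(toy)$SentimentGI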
# plot it
new_text %>%
dplyr::count(Product, sentiment, sort = TRUE) %>%
mutate(sentiment = reorder(sentiment, n)) %>% # sort by frequency in descending order
ggplot(aes(sentiment, n, fill = sentiment)) + # plotting
geom_col(show.legend = FALSE) +
facet_wrap(~ Product, scales = "free") + # arrange sub-plots by "Product" type, this will output 7 sub-plots
scale_x_reordered() +
ylab("Frequency") +
xlab("Sentiment") +
theme_bw()
Make your own stopword list
#Note: stop_words (from the tidytext package) is a pre-defined list of stopwords stored as a dataframe/tibble object; you can specify your own stopword list and combine it with stop_words
library(tidytext)
data(stop_words)
# check out stop_words dimension and content
dim(stop_words)
head(stop_words)
# suppose you are working with political science text and want to remove a set of domain-specific terms
# first define your own stopword list
word = c("authoritarianism", "oligarchy", "democratization", "coup")
# set the lexicon label; repeat it 4 times because the word vector above contains 4 terms
lexicon = rep("psc", 4)
# create a dataframe for these words (column names must match stop_words: "word" and "lexicon")
psc = data.frame(word, lexicon)
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, psc)
## perplexity analysis: find out the ideal number of topics, conditional on the distribution of words/terms and the number of documents
# install.packages(c("ldatuning", "ggplot2", "scales", "snow", "foreach", "doParallel"))
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)
library(foreach)
library(doParallel)
# note: this search takes a long time to run; in the main section we loaded a precomputed result from Dropbox instead
result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 50, by = 1), # candidate number of K: 2 to 50 (must be > 1)
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
## perplexity analysis: cross-validation version
burnin = 1000
iter = 1000
keep = 50
n <- nrow(dtm)
k <- 15 # number of topics
splitter <- sample(1:n, round(n * 0.75))
train_set <- dtm[splitter, ]
valid_set <- dtm[-splitter, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
perplexity(fitted, newdata = train_set)
perplexity(fitted, newdata = valid_set)
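# note: perplexity is exp(-log-likelihood / number of tokens), so lower values mean the model
# assigns higher probability to the held-out documents; expect the validation-set perplexity to
# be somewhat higher than the training-set perplexity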
## CV over many candidate numbers of topics
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
folds <- 5
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2, 3, 4, 5, 10, 20, 30, 40, 50, 75, 100) # candidate numbers of topics
clusterExport(cluster, c("dtm", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))
system.time({
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0, nrow = folds, ncol = 2)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- dtm[splitfolds != i , ]
valid_set <- dtm[splitfolds == i, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set)) # record the perplexity of the trained model on the held-out fold (lower is better)
}
return(results_1k)
}
})
stopCluster(cluster)
results_df <- as.data.frame(results)
ggplot(results_df, aes(x = k, y = perplexity)) +
geom_point() +
geom_smooth(se = FALSE) +
ggtitle("5-fold CV of topic modeling", "when fitting the trained model to the hold-out set") +
labs(x = "Candidate number of topics", y = "Perplexity")