Tokenizing raw text

Tokenizing raw text using unnest_tokens()

# load required packages
# install.packages(c("tibble", "tidytext", "ggplot2", "gutenbergr", "tidyr"))
library(tibble)
library(tidytext)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(gutenbergr)
library(tidyr)

# example text
text <- c("A market meltdown. Surging recession fears. And a sudden spotlight on America's health care system. Goldman Sachs is warning Wall Street that the coronavirus could cost President Donald Trump the election.", 
          "The potential political fallout from the coronavirus adds yet more uncertainty for investors trying to assess the impact of the fast-moving epidemic. ",
"If the coronavirus epidemic materially affects US economic growth it may increase the likelihood of Democratic victory in the 2020 election, Goldman Sachs analysts led by Ben Snider wrote in a report published Wednesday night.",
"That could be a negative for stocks because investors have been hoping for a continuation of the low-tax, light-regulation approach of the Trump administration. And Trump of course has been laser-focused on boosting stock prices."
)

text_df <- tibble(line = 1:4, text = text) # the tibble() function (from the tibble package, part of the tidyverse) arranges the four lines of text into a data frame

# note the data is in tibble format. A tibble is a modern class of R data frame, available in the tibble and dplyr packages; it has a convenient print method and does not use row names. The texts in the tibble are stored as character vectors (note the <chr> column type below).
text_df
## # A tibble: 4 × 2
##    line text                                                                    
##   <int> <chr>                                                                   
## 1     1 "A market meltdown. Surging recession fears. And a sudden spotlight on …
## 2     2 "The potential political fallout from the coronavirus adds yet more unc…
## 3     3 "If the coronavirus epidemic materially affects US economic growth it m…
## 4     4 "That could be a negative for stocks because investors have been hoping…
# we now need to convert text_df to the one-token-per-row format for further analysis
text_df %>%
    unnest_tokens(word, text)  # this will break each token (word) from a string of text into its own row
## # A tibble: 127 × 2
##     line word     
##    <int> <chr>    
##  1     1 a        
##  2     1 market   
##  3     1 meltdown 
##  4     1 surging  
##  5     1 recession
##  6     1 fears    
##  7     1 and      
##  8     1 a        
##  9     1 sudden   
## 10     1 spotlight
## # ℹ 117 more rows
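# optional aside: unnest_tokens() is not limited to single words. As a minimal
# sketch (not part of the original example), the same text_df can be split into
# two-word phrases (bigrams) by setting token = "ngrams" and n = 2
text_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2)  # each row now holds a consecutive two-word phrase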
# download Jane Austen's books 

library(janeaustenr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.1
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",  # regex that flags chapter headings; [\divxlc] matches a digit or a Roman-numeral character (i, v, x, l, c)
                                                 ignore_case = TRUE)))) %>%    # match regardless of upper or lower case
  ungroup()     # remove the grouping by book
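# optional sanity check (illustrative strings only): the chapter regex matches lines
# that begin with "chapter" followed by a digit or a Roman-numeral character
str_detect(c("CHAPTER 1", "Chapter XI", "in that chapter"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))   # TRUE TRUE FALSE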

# check out output
original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows
# parsing using tidytext
library(tidytext)

tidy_austen <- original_books %>%
  unnest_tokens(word, text)

tidy_austen
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows
# anti-join with stop_words (a data frame) to remove stopwords. Note: stop_words is a data frame/tibble of pre-defined stopwords; you can specify your own stopword list and combine it with the existing stop_words data frame (please refer to the appendix)
data(stop_words)

tidy_austen <- tidy_austen %>%
  anti_join(stop_words)  # anti_join() removes the stopwords from your text data (stored in tidy_austen)
## Joining with `by = join_by(word)`
# count and sort by word frequency
tidy_austen %>%
  dplyr::count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows
# plot word frequency
library(ggplot2)

tidy_austen %>%
  dplyr::count(word, sort = TRUE) %>%
  filter(n > 600) %>%  # select those that appear more than 600 times
  mutate(word = reorder(word, n)) %>%   # sort by frequency in descending order
  ggplot(aes(word, n)) +        # plotting (word in x-axis, sort by its frequency (n) in the y-axis)
  geom_col() +
  xlab(NULL) +
  coord_flip() +   # flip coordinate, such that word appears on the y-axis, freq on the x-axis
  theme_bw()       # black and white plotting background

# download 4 of H.G. Wells's books from the Project Gutenberg archive

library(gutenbergr)

hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: ! Could not download a book at http://aleph.gutenberg.org/1/5/159/159.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
# parsing
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# count word frequency
tidy_hgwells %>%
  dplyr::count(word, sort = TRUE)
## # A tibble: 10,320 × 2
##    word          n
##    <chr>     <int>
##  1 time        396
##  2 people      249
##  3 door        224
##  4 kemp        213
##  5 invisible   197
##  6 black       178
##  7 stood       174
##  8 night       168
##  9 heard       167
## 10 hall        165
## # ℹ 10,310 more rows
# sort word frequencies for words commonly used by the two authors

library(tidyr)

# create an additional column "author" and use bind_rows() to bind the two data frames together by rows

frequency <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"), 
                       mutate(tidy_austen, author = "Jane Austen")) %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>%  # keep only the run of lowercase letters/apostrophes in each token (drops digits and the underscores Project Gutenberg uses for emphasis)
  dplyr::count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%  # create a new column "proportion" and append it to existing dataframe
  select(-n) %>%  # drop the raw count column, keeping only the proportions
  spread(author, proportion) %>%   # spread() rotates the "author" column into 2 separate columns (one per author), each holding that author's proportions for every term
  gather(author, proportion, `H.G. Wells`)  # gather() collects the H.G. Wells column back into key-value pairs, leaving an "author" column for H.G. Wells alongside the "Jane Austen" column
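# optional note: spread() and gather() still work but have been superseded in newer
# versions of tidyr; a sketch of the same reshaping with pivot_wider()/pivot_longer()
# (frequency2 is just an illustrative name) would be
frequency2 <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
                        mutate(tidy_austen, author = "Jane Austen")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  dplyr::count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%                # one proportion column per author
  pivot_longer(`H.G. Wells`, names_to = "author", values_to = "proportion")     # stack the H.G. Wells column back into key-value pairs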


library(scales)  # automatically set breaks and labels for axes on ggplot plotting environment
## Warning: package 'scales' was built under R version 4.3.2
# plot Jane Austen's word proportions on the y-axis against H.G. Wells's on the x-axis
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  # adding small random variation to the location of each data point
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Jane Austen", x = NULL)
## Warning: Removed 12833 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12834 rows containing missing values or values outside the scale range
## (`geom_text()`).
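# optional aside: as a numeric complement to the plot, the similarity of the two
# word-frequency distributions can be quantified with a correlation test on the
# words the authors share
cor.test(data = frequency[frequency$author == "H.G. Wells", ],
         ~ proportion + `Jane Austen`)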

Inferring customer complaint product types using LDA

What kind of financial products were customers complaining about?

# install.packages(c("plyr", "tidyverse", "wordcloud2", "tm", "topicmodels", "stringr", "dplyr", "ldatuning", "snow", "vctrs", "scales"))

library(vctrs)
## Warning: package 'vctrs' was built under R version 4.3.2
## 
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
## 
##     data_frame
## The following object is masked from 'package:tibble':
## 
##     data_frame
library(plyr)
## Warning: package 'plyr' was built under R version 4.3.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.1
## ✔ lubridate 1.9.2     ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plyr::arrange()     masks dplyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compact()    masks plyr::compact()
## ✖ plyr::count()       masks dplyr::count()
## ✖ vctrs::data_frame() masks dplyr::data_frame(), tibble::data_frame()
## ✖ plyr::desc()        masks dplyr::desc()
## ✖ purrr::discard()    masks scales::discard()
## ✖ plyr::failwith()    masks dplyr::failwith()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ plyr::id()          masks dplyr::id()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ plyr::mutate()      masks dplyr::mutate()
## ✖ plyr::rename()      masks dplyr::rename()
## ✖ plyr::summarise()   masks dplyr::summarise()
## ✖ plyr::summarize()   masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(tidyr)
library(wordcloud2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
library(ggplot2)
library(stringr)

# load customer complaints data (10k random sample)
data <- read.csv("https://www.dropbox.com/s/6kofxbf8f3cq8cw/complaints10k.csv?dl=1", stringsAsFactors=FALSE)

# check out the data
dim(data)
## [1] 10000    18
names(data)
##  [1] "Date.received"                "Product"                     
##  [3] "Sub.product"                  "Issue"                       
##  [5] "Sub.issue"                    "Consumer.complaint.narrative"
##  [7] "Company.public.response"      "Company"                     
##  [9] "State"                        "ZIP.code"                    
## [11] "Tags"                         "Consumer.consent.provided."  
## [13] "Submitted.via"                "Date.sent.to.company"        
## [15] "Company.response.to.consumer" "Timely.response."            
## [17] "Consumer.disputed."           "Complaint.ID"
# we'll take a look at the "Consumer.complaint.narrative" (the 6th column)
head(data[6])
##                                                                                                                                                                                                                                                                                                                                                                                                                                                               Consumer.complaint.narrative
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 3 I submitted a complaint to SPS last month via CFPB. SPS did not respond to the specific questions contained in the Qualified Written Request, nor did they respond to my request to schedule an in-person meeting at SPS HQ in Utah to review their " original copies '' of documents they allege are valid and are associated with my property. SPS is trying to use these so-called documents to foreclose our home based on an assertion of a financial interest that does not exist.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 6
# rename
names(data)[6] <- "text"

text_df <- tibble(document = 1:10000, text = data$text)  # use "document" in lieu of "line"; this creates a 2-column tibble consisting of a document ID and the complaint text

# load stop_words and create our own stopword list
data(stop_words)

word = c("xxxx", "xx")  # create a vector containing these two anonymity abbreviations

# create a matching lexicon label (repeated twice because we added two words)
lexicon = rep("anonymous", 2)  # label "xxxx" and "xx" with a new lexicon name, "anonymous"

# create a dataframe for "words"
anonymous = data.frame(word, lexicon)  # combine them into a dataframe

# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, anonymous)  # combine with existing stop_words list

# text processing
text_unnested <- text_df %>%
         unnest_tokens(word, text) %>%   # tokenize: remove punctuation, convert to lower case, etc.
         anti_join(stop_words)           # remove stopwords
## Joining with `by = join_by(word)`
text_counts <- text_unnested %>%         # generate and append a word count column "n" to existing tibble object and sort by freq.
  dplyr::count(document, word, sort = TRUE)

# plot word frequency
library(ggplot2)

text_counts %>%
  dplyr::count(word, sort = TRUE) %>%
  filter(n > 400) %>%  # only display those that appeared > 400 times
  mutate(word = reorder(word, n)) %>%   # sort by frequency in descending order
  ggplot(aes(word, n)) +        # plotting
  geom_col() +
  xlab(NULL) +
  coord_flip() +   # flip coordinate, word should appear on the y-axis, freq on the x-axis
  theme_bw()       # black and white plotting background

# you can also plot these words in the media-savvy "wordcloud" way using wordcloud2() function
# to do that, we need "word" and "n" (count) columns from text_counts object, make them into another tidy object
text_cloud <- text_counts %>%
    dplyr::count(word, sort = TRUE)  # this will remove the document ID column (which is not needed here)

library(wordcloud2)  # load wordcloud2 package
wordcloud2(text_cloud)    # the default setting, simply apply wordcloud2() on the new tidy object you just created
# wordcloud2(text_cloud[1:100,])    # you can control the number of words/terms appearing on the cloud to make it sparser (or denser)
# wordcloud2(text_cloud, shape = "circle", color = "random-light", backgroundColor="black")   # you can also set your preferred shape (for example, heart shape "cardioid"), color, and background color

## "cast" the tibble object into document term matrix (DTM) to feed into LDA analysis 
dtm <- text_counts %>%
  cast_dtm(document, word, n)

## see what's inside the dtm object
str(dtm)
## List of 6
##  $ i       : int [1:117742] 1 3 4 5 6 7 8 9 10 11 ...
##  $ j       : int [1:117742] 1 1 1 1 1 1 1 1 1 1 ...
##  $ v       : num [1:117742] 46 46 44 44 44 44 44 44 44 44 ...
##  $ nrow    : int 2551
##  $ ncol    : int 10734
##  $ dimnames:List of 2
##   ..$ Docs : chr [1:2551] "6551" "7005" "7660" "1815" ...
##   ..$ Terms: chr [1:10734] "consumer" "inquiry" "debt" "fees" ...
##  - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# you see the following components: i, j, v, nrow, ncol, and dimnames
# i and j are integer vectors of row and column indices: the j-th term appears in the i-th document
# v holds the corresponding value -- here the frequency of the j-th term in the i-th document
# nrow and ncol give the dimensions of the matrix (number of documents and number of terms)
# dimnames stores the document IDs and the terms themselves
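# optional: to see the actual counts, convert a small slice of the sparse DTM into
# an ordinary dense matrix (avoid doing this for the full matrix -- it is large)
as.matrix(dtm[1:3, 1:5])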


## perplexity analysis: find out the ideal number of topics, conditional on the distribution of words/terms and the number of documents 
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)

# do not run, just load the data from our Dropbox shared folder

# result <- FindTopicsNumber(
#  dtm,
#  topics = seq(from = 2, to = 50, by = 1),     # candidate number of topic K: 2 to 50 (must be > 1)
#  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),  # 4 metrics
#  method = "Gibbs", # Gibbs sampling
#  control = list(seed = 123),
#  mc.cores = 2L,
#  verbose = FALSE)

# for the CaoJuan2009 and Arun2010 metrics, lower values are better (look for the minimum)
# for the Griffiths2004 and Deveaud2014 metrics, higher values are better (look for the maximum)
# eyeball the plot to locate the region where the number of topics (k) best satisfies all four criteria

result <- readRDS(url("https://www.dropbox.com/s/ap8omatxqm3v2g3/result.rds?dl=1"))

# the result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the ldatuning package.
##   Please report the issue at <https://github.com/nikita-moor/ldatuning/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 15-topic LDA
library(topicmodels)
# text.lda <- LDA(dtm, k = 15, control = list(seed = 123))  # do not run, just load the output from our dropbox folder
text.lda <- readRDS(url("https://www.dropbox.com/s/4po0veokpr4thju/text.lda.rds?dl=1"))
class(text.lda)  # note: this is an S4 object, so its components live in @ slots rather than being subsettable with $
## [1] "LDA_VEM"
## attr(,"package")
## [1] "topicmodels"
# note that you can turn the model back to the one-topic-per-term-per-row format using tidy() 

text_beta <- tidy(text.lda, matrix = "beta")    # the output is a term-topic matrix φ. Note: the default matrix argument is "beta"


# term-topic assignment: for each term-topic combination the model estimates a beta -- the probability of that term being generated from that topic.
text_beta
## # A tibble: 161,445 × 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 consumer 9.82e- 2
##  2     2 consumer 1.26e- 3
##  3     3 consumer 9.45e-16
##  4     4 consumer 2.62e- 9
##  5     5 consumer 1.30e- 5
##  6     6 consumer 1.47e- 7
##  7     7 consumer 1.52e- 8
##  8     8 consumer 1.13e- 5
##  9     9 consumer 2.51e-28
## 10    10 consumer 6.06e-23
## # ℹ 161,435 more rows
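# optional sketch: to see which topics a particular term loads on most heavily,
# filter the beta matrix for that term and sort by beta ("mortgage" here is just an
# illustrative choice, assuming the term appears in the vocabulary)
text_beta %>%
  filter(term == "mortgage") %>%
  arrange(desc(beta))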
# we can use the top_n() function from the dplyr package to find the top 10 terms of each topic

top_terms <- text_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%  # top 10 terms of each group ordered by beta
  ungroup() %>%
  arrange(topic, -beta)  # display in descending order


table(top_terms$topic) # 15 unique topics and 10 top terms / each
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
# the structure of the data
top_terms
## # A tibble: 150 × 3
##    topic term          beta
##    <int> <chr>        <dbl>
##  1     1 consumer    0.0982
##  2     1 information 0.0570
##  3     1 identity    0.0432
##  4     1 theft       0.0423
##  5     1 section     0.0327
##  6     1 reporting   0.0317
##  7     1 agency      0.0274
##  8     1 block       0.0220
##  9     1 report      0.0217
## 10     1 victim      0.0176
## # ℹ 140 more rows
# document-topic assignment: tidy() labels the per-document-per-topic probabilities "gamma" (i.e., the document-topic matrix, often written theta)

text.lda_gamma <- tidy(text.lda, matrix = "gamma")
text.lda_gamma
## # A tibble: 38,265 × 3
##    document topic    gamma
##    <chr>    <int>    <dbl>
##  1 6551         1 0.999   
##  2 7005         1 0.000393
##  3 7660         1 0.999   
##  4 1815         1 0.998   
##  5 2675         1 0.998   
##  6 2817         1 0.998   
##  7 2981         1 0.998   
##  8 4282         1 0.998   
##  9 6830         1 0.998   
## 10 9016         1 0.998   
## # ℹ 38,255 more rows
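# optional sketch: to reduce these per-document, per-topic rows to each document's
# single most likely topic, keep the row with the highest gamma per document
text.lda_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()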
# since we have 10,000 documents, it's infeasible to visualize their distribution across different topics, but we can do that for term-topic distribution, that is, the beta parameter.
# We have previously generated the term-topic matrix named "text_beta"


# using dplyr's top_n() function to find out the top 10 terms of each topic

top_terms <- text_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)  # arrange in descending order

# finally, we can plot term freq. distribution across all 15 topics using the ggplot() function

library(ggplot2)

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>% # sort term by beta AND within each topic
  ggplot(aes(term, beta, fill = factor(topic))) + # term on the x-axis, beta on the y-axis
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +   # arrange sub-plots by topic, here we should output 15 sub-plots because we have 15 topics
  coord_flip() + # flip x-y axis, so term will be on the y-axis, beta on the x-axis
  scale_x_reordered()

# do you find something insightful regarding what types of financial products customers were complaining about?

## prediction accuracy: let's process the data again but (i) retain the Product labels and (ii) collapse the categories into only 7 main types

# collapse Product categories
require("car")    
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
data$Product <- recode(data$Product, "c('Bank account or service','Checking or savings account') = 'Accounts'; else = data$Product")
data$Product <- recode(data$Product, "c('Consumer Loan','Payday loan', 'Payday loan, title loan, or personal loan', 'Student loan', 'Vehicle loan or lease') = 'Loans'; else = data$Product")
data$Product <- recode(data$Product, "c('Credit card','Credit card or prepaid card', 'Prepaid card') = 'Credit cards'; else = data$Product")
data$Product <- recode(data$Product, "c('Credit reporting','Credit reporting, credit repair services, or other personal consumer reports') = 'Credit reporting'; else = data$Product")
data$Product <- recode(data$Product, "c('Money transfer, virtual currency, or money service','Money transfers', 'Other financial service', 'Virtual currency') = 'Money transfers'; else = data$Product")

# now we have 7 main product categories
table(data$Product) # get a glimpse of their freq. counts
## 
##         Accounts     Credit cards Credit reporting  Debt collection 
##              970             1037             3217             1801 
##            Loans  Money transfers         Mortgage 
##              729              134             2112
# subset the data
new_data <- data[, c("text", "Product")]

new_data_unnested <- new_data %>%
                 unnest_tokens(word, text)

new_data_counts <- new_data_unnested %>%
  anti_join(stop_words) %>%  # remove stop words
  dplyr::count(Product, word, sort = TRUE)  # this will generate a freq. count column "n"
## Joining with `by = join_by(word)`
# display result

head(new_data_counts, 10)
##             Product        word    n
## 1  Credit reporting      credit 2386
## 2  Credit reporting      report 1290
## 3  Credit reporting     account 1178
## 4  Credit reporting information 1136
## 5   Debt collection        debt  951
## 6  Credit reporting   reporting  796
## 7   Debt collection      credit  654
## 8      Credit cards        card  653
## 9          Accounts     account  644
## 10     Credit cards      credit  610
# compute TF-IDF to increase topic-specific term identifiability: terms that appear often within one product category but rarely in the others receive more weight

new_data_counts <- new_data_counts %>%
  bind_tf_idf(word, Product, n)
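# what bind_tf_idf() computes: tf = n / total words in the product category,
# idf = ln(number of categories / number of categories containing the term), and
# tf_idf = tf * idf. With 7 categories, a quick sanity check of the idf values below:
log(7 / 1)   # ~1.946, the idf of terms unique to one product (e.g., "coinbase")
log(7 / 2)   # ~1.253, terms appearing in two of the seven categories
log(7 / 7)   # 0: terms used in every category carry no tf-idf weight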

# sort by TF-IDF
new_data_counts %>%
  arrange(desc(tf_idf)) %>% # order by tf_idf in descending order
  top_n(100)   # display top 100 terms
## Selecting by tf_idf
##              Product           word   n           tf       idf       tf_idf
## 1    Money transfers       coinbase  29 0.0065153898 1.9459101 0.0126783631
## 2           Mortgage         escrow 182 0.0062107562 1.2527630 0.0077806054
## 3    Money transfers           gram  15 0.0033700292 1.9459101 0.0065577740
## 4              Loans        navient 122 0.0063690942 0.8472979 0.0053965199
## 5           Mortgage   modification 176 0.0060060060 0.8472979 0.0050888760
## 6   Credit reporting     transunion 201 0.0035121440 1.2527630 0.0043998839
## 7    Money transfers            usd  15 0.0033700292 1.2527630 0.0042218478
## 8           Mortgage      homeowner  63 0.0021498771 1.9459101 0.0041834678
## 9   Credit reporting       experian 272 0.0047527521 0.8472979 0.0040269966
## 10             Loans           pslf  39 0.0020360219 1.9459101 0.0039619157
## 11   Money transfers           tech   9 0.0020220175 1.9459101 0.0039346644
## 12  Credit reporting        equifax 354 0.0061855670 0.5596158 0.0034615410
## 13          Mortgage            pmi  49 0.0016721267 1.9459101 0.0032538083
## 14   Money transfers           wire  24 0.0053920467 0.5596158 0.0030174745
## 15          Mortgage      appraisal  44 0.0015015015 1.9459101 0.0029217870
## 16          Mortgage     homeowners  43 0.0014673765 1.9459101 0.0028553828
## 17          Mortgage     nationstar  64 0.0021840022 1.2527630 0.0027360371
## 18   Money transfers       60000.00   6 0.0013480117 1.9459101 0.0026231096
## 19   Money transfers cryptocurrency   6 0.0013480117 1.9459101 0.0026231096
## 20   Money transfers         skrill   6 0.0013480117 1.9459101 0.0026231096
## 21   Money transfers        western   9 0.0020220175 1.2527630 0.0025331087
## 22   Money transfers         paypal  31 0.0069647270 0.3364722 0.0023434373
## 23             Loans          loans 277 0.0144609762 0.1541507 0.0022291693
## 24          Mortgage         ditech  52 0.0017745018 1.2527630 0.0022230301
## 25      Credit cards           citi  55 0.0026118340 0.8472979 0.0022130013
## 26   Money transfers            ltc   5 0.0011233431 1.9459101 0.0021859247
## 27   Money transfers       receiver   5 0.0011233431 1.9459101 0.0021859247
## 28          Accounts            atm  36 0.0024972253 0.8472979 0.0021158937
## 29             Loans      repayment  72 0.0037588097 0.5596158 0.0021034893
## 30      Credit cards            sam  34 0.0016145883 1.2527630 0.0020226964
## 31          Mortgage        realtor  29 0.0009896260 1.9459101 0.0019257233
## 32          Mortgage             wf  29 0.0009896260 1.9459101 0.0019257233
## 33   Money transfers         seller  10 0.0022466861 0.8472979 0.0019036124
## 34   Money transfers       shipping  10 0.0022466861 0.8472979 0.0019036124
## 35             Loans        fedloan  29 0.0015139650 1.2527630 0.0018966393
## 36          Accounts            umb  14 0.0009711432 1.9459101 0.0018897574
## 37             Loans      education  42 0.0021926390 0.8472979 0.0018578183
## 38   Debt collection       creditor 106 0.0032729181 0.5596158 0.0018315767
## 39   Debt collection     validation 106 0.0032729181 0.5596158 0.0018315767
## 40             Loans           ally  40 0.0020882276 0.8472979 0.0017693508
## 41   Money transfers       31000.00   4 0.0008986745 1.9459101 0.0017487397
## 42   Money transfers            gbp   4 0.0008986745 1.9459101 0.0017487397
## 43   Money transfers      lexington   4 0.0008986745 1.9459101 0.0017487397
## 44   Money transfers       litecoin   4 0.0008986745 1.9459101 0.0017487397
## 45  Credit reporting       reseller  51 0.0008911410 1.9459101 0.0017340803
## 46          Mortgage            fay  26 0.0008872509 1.9459101 0.0017265105
## 47          Mortgage          remic  26 0.0008872509 1.9459101 0.0017265105
## 48             Loans         nelnet  26 0.0013573480 1.2527630 0.0017004352
## 49             Loans       students  26 0.0013573480 1.2527630 0.0017004352
## 50          Mortgage          ocwen  88 0.0030030030 0.5596158 0.0016805279
## 51             Loans      salliemae  15 0.0007830854 1.9459101 0.0015238137
## 52  Credit reporting        section 255 0.0044557050 0.3364722 0.0014992210
## 53  Credit reporting          theft 253 0.0044207583 0.3364722 0.0014874624
## 54          Accounts         schwab  11 0.0007630411 1.9459101 0.0014848094
## 55  Credit reporting        bureaus 248 0.0043333916 0.3364722 0.0014580660
## 56             Loans            ibr  22 0.0011485252 1.2527630 0.0014388298
## 57  Credit reporting     subsection  65 0.0011357680 1.2527630 0.0014228480
## 58             Loans          pheaa  14 0.0007308797 1.9459101 0.0014222262
## 59   Money transfers         signup   5 0.0011233431 1.2527630 0.0014072826
## 60             Loans    forbearance  48 0.0025058731 0.5596158 0.0014023262
## 61   Debt collection      collector  81 0.0025010035 0.5596158 0.0013996010
## 62   Money transfers       citibank  11 0.0024713548 0.5596158 0.0013830091
## 63      Credit cards     mastercard  23 0.0010922215 1.2527630 0.0013682946
## 64   Money transfers            pnc  18 0.0040440350 0.3364722 0.0013607055
## 65          Accounts             cd  23 0.0015954495 0.8472979 0.0013518209
## 66          Accounts           usaa  23 0.0015954495 0.8472979 0.0013518209
## 67          Accounts             bb  10 0.0006936737 1.9459101 0.0013498267
## 68  Credit reporting       creditor 136 0.0023763760 0.5596158 0.0013298575
## 69          Mortgage       loancare  20 0.0006825007 1.9459101 0.0013280850
## 70   Money transfers       62000.00   3 0.0006740058 1.9459101 0.0013115548
## 71   Money transfers          69.00   3 0.0006740058 1.9459101 0.0013115548
## 72   Money transfers        gallery   3 0.0006740058 1.9459101 0.0013115548
## 73   Money transfers          grams   3 0.0006740058 1.9459101 0.0013115548
## 74   Money transfers        hacking   3 0.0006740058 1.9459101 0.0013115548
## 75   Money transfers      moneygram   3 0.0006740058 1.9459101 0.0013115548
## 76   Money transfers            ria   3 0.0006740058 1.9459101 0.0013115548
## 77   Debt collection            erc  21 0.0006484083 1.9459101 0.0012617443
## 78   Debt collection      portfolio  48 0.0014820761 0.8472979 0.0012557599
## 79      Credit cards        menards  13 0.0006173426 1.9459101 0.0012012932
## 80          Mortgage            phh  18 0.0006142506 1.9459101 0.0011952765
## 81             Loans        student 147 0.0076742365 0.1541507 0.0011829888
## 82   Money transfers       currency   4 0.0008986745 1.2527630 0.0011258261
## 83   Money transfers            hrs   4 0.0008986745 1.2527630 0.0011258261
## 84             Loans             gm  11 0.0005742626 1.9459101 0.0011174634
## 85  Credit reporting      inquiries 190 0.0033199371 0.3364722 0.0011170667
## 86      Credit cards       american  42 0.0019944914 0.5596158 0.0011161489
## 87  Credit reporting     verifiable  50 0.0008736677 1.2527630 0.0010944985
## 88          Accounts        dormant   8 0.0005549390 1.9459101 0.0010798613
## 89      Credit cards        rewards  18 0.0008547820 1.2527630 0.0010708393
## 90  Credit reporting           fcra 181 0.0031626769 0.3364722 0.0010641530
## 91          Mortgage         planet  16 0.0005460005 1.9459101 0.0010624680
## 92             Loans          coast  10 0.0005220569 1.9459101 0.0010158758
## 93      Credit cards           amex  17 0.0008072941 1.2527630 0.0010113482
## 94          Mortgage        bayview  15 0.0005118755 1.9459101 0.0009960638
## 95          Mortgage   preservation  15 0.0005118755 1.9459101 0.0009960638
## 96          Mortgage        seterus  15 0.0005118755 1.9459101 0.0009960638
## 97          Mortgage        quicken  23 0.0007848758 1.2527630 0.0009832633
## 98          Mortgage        freedom  34 0.0011602512 0.8472979 0.0009830783
## 99             Loans            aes  22 0.0011485252 0.8472979 0.0009731429
## 100         Accounts       citibank  25 0.0017341842 0.5596158 0.0009704769
# plot it: word frequency by product according to TF-IDF (more product-specific)
new_data_counts %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(Product) %>% 
  top_n(10) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = Product)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "TF-IDF") +
  facet_wrap(~Product, ncol = 4, scales = "free") + # generate 7 sub-plots
  coord_flip()  # flip x and y axis
## Selecting by tf_idf

# do you find any interpretable patterns?

# cast to dtm for further processing and analysis

new_dtm  <- new_data_counts %>%
  cast_dtm(Product, word, n)

new_dtm
## <<DocumentTermMatrix (documents: 7, terms: 10734)>>
## Non-/sparse entries: 25522/49616
## Sparsity           : 66%
## Maximal term length: 45
## Weighting          : term frequency (tf)
# convert to lda class output (we set K = 7 because we have 7 product categories)
# new_lda <- LDA(new_dtm, k = 7, control = list(seed = 123))   # do not run, just download the new_lda output from dropbox folder
new_lda <- readRDS(url("https://www.dropbox.com/s/epre5su5i0km5ho/new_lda.rds?dl=1"))

# join the term-topic assignments (based on the 𝜑/beta probabilities) with the document-term matrix to see the most likely product type (i.e., topic) for each word in each document (customer complaint category)
# you will need the "broom" package to be able to use augment() function
library(broom)
## Warning: package 'broom' was built under R version 4.3.2
assignments <- augment(new_lda, data = new_dtm) # this appends a ".topic" column to the document-term counts (the topic the LDA model assigned to each term)
assignments
## # A tibble: 25,522 × 4
##    document         term   count .topic
##    <chr>            <chr>  <dbl>  <dbl>
##  1 Credit reporting credit  2386      6
##  2 Debt collection  credit   654      5
##  3 Credit cards     credit   610      3
##  4 Accounts         credit    79      4
##  5 Mortgage         credit   113      2
##  6 Loans            credit   198      1
##  7 Money transfers  credit    29      3
##  8 Credit reporting report  1290      6
##  9 Debt collection  report   305      5
## 10 Credit cards     report    92      3
## # ℹ 25,512 more rows
# get the document-topic matrix (theta; tidy() labels these probabilities "gamma") and use it to determine the most likely topic (product) for each document
new_theta <- tidy(new_lda, matrix = "gamma")
names(new_theta)[1] <- "Product"

head(new_theta, 10)
## # A tibble: 10 × 3
##    Product          topic       gamma
##    <chr>            <int>       <dbl>
##  1 Credit reporting     1 0.000000274
##  2 Debt collection      1 0.000000484
##  3 Credit cards         1 0.000000744
##  4 Accounts             1 0.00000109 
##  5 Mortgage             1 0.000000535
##  6 Loans                1 0.928      
##  7 Money transfers      1 0.00000352 
##  8 Credit reporting     2 0.000000274
##  9 Debt collection      2 0.000000484
## 10 Credit cards         2 0.000000744
# sort product classification result
product_classifications <- new_theta %>%
  group_by(Product) %>%
  top_n(1, gamma) %>%  # highest predicted topic assignment (gamma) for a document
  ungroup() %>%
  arrange(gamma)

product_classifications
## # A tibble: 7 × 3
##   Product          topic gamma
##   <chr>            <int> <dbl>
## 1 Money transfers      7 0.658
## 2 Loans                1 0.928
## 3 Mortgage             2 0.995
## 4 Credit reporting     6 1.00 
## 5 Accounts             4 1.00 
## 6 Credit cards         3 1.00 
## 7 Debt collection      5 1.00
# rename column
names(product_classifications)[1] <- "predicted"

# "join" the original product labels with our product (topic) assignment result by "topic"
assignments <- assignments %>%
  inner_join(product_classifications, by = c(".topic" = "topic"))

# take a look at the format of this output
head(assignments, 10)
## # A tibble: 10 × 6
##    document         term   count .topic predicted        gamma
##    <chr>            <chr>  <dbl>  <dbl> <chr>            <dbl>
##  1 Credit reporting credit  2386      6 Credit reporting 1.00 
##  2 Debt collection  credit   654      5 Debt collection  1.00 
##  3 Credit cards     credit   610      3 Credit cards     1.00 
##  4 Accounts         credit    79      4 Accounts         1.00 
##  5 Mortgage         credit   113      2 Mortgage         0.995
##  6 Loans            credit   198      1 Loans            0.928
##  7 Money transfers  credit    29      3 Credit cards     1.00 
##  8 Credit reporting report  1290      6 Credit reporting 1.00 
##  9 Debt collection  report   305      5 Debt collection  1.00 
## 10 Credit cards     report    92      3 Credit cards     1.00
# finally, we can plot the result, using the color scale intensity to show the percentage of documents whose original product types "agree" with the topics (product) assigned by our model
library(scales)

assignments %>%
  dplyr::count(document, predicted, wt = count) %>%   # wt = count weights each word by its frequency
  group_by(document) %>%
  dplyr::mutate(percent = n / sum(n)) %>%   # share of each document's words assigned to each predicted type (dplyr:: prefix avoids plyr's mutate, which would ignore the grouping)
  ggplot(aes(predicted, document, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", label = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), # rotate the product labels on the horizontal axis by 90 degrees
        panel.grid = element_blank()) +
  labs(x = "Predicted complaint type",
       y = "Actual complaint type",
       fill = "% of agreement")

# it seems that, with the exception of the Money transfers/Credit cards confusion, we have done a decent job of classifying complaint product types

Sentiment Analysis

Counting positive and negative sentiments, visualization, and more…

library(ggplot2)
library(plyr)
library(dplyr)
library(tibble)
library(tidyverse)
library(tidytext)
library(tidyr)
library(tm)
library(topicmodels)
library(stringr)


# most common positive and negative words

# preprocessing
new_data_unnested <- new_data %>%
                 unnest_tokens(word, text) %>%
                 anti_join(stop_words)
## Joining with `by = join_by(word)`
# count number of positive and negative words in the entire corpus
bing_word_counts <- new_data_unnested %>%
  inner_join(get_sentiments("bing")) %>%   # inner join with Microsoft bing sentiment dictionary
  dplyr::count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
# display the 50 most frequent sentiment words
head(bing_word_counts, 50)
##          word sentiment    n
## 1        debt  negative 1413
## 2     dispute  negative  562
## 3   complaint  negative  404
## 4       fraud  negative  400
## 5       issue  negative  360
## 6  fraudulent  negative  332
## 7    disputed  negative  256
## 8     correct  positive  232
## 9  inaccurate  negative  232
## 10       fair  positive  223
## 11   negative  negative  218
## 12    refused  negative  212
## 13      error  negative  182
## 14  violation  negative  178
## 15     failed  negative  177
## 16     denied  negative  167
## 17  incorrect  negative  167
## 18     issues  negative  157
## 19       hard  negative  154
## 20     unable  negative  154
## 21      wrong  negative  154
## 22      false  negative  146
## 23     refund  positive  145
## 24   accurate  positive  128
## 25       lost  negative  106
## 26     missed  negative  101
## 27     freeze  negative  100
## 28   recovery  positive  100
## 29      limit  negative   98
## 30 protection  positive   90
## 31     breach  negative   85
## 32    support  positive   82
## 33    illegal  negative   80
## 34     refuse  negative   77
## 35       free  positive   76
## 36     proper  positive   75
## 37    concern  negative   73
## 38    savings  positive   73
## 39       hung  negative   71
## 40    refuses  negative   69
## 41    mistake  negative   68
## 42 delinquent  negative   67
## 43 complaints  negative   66
## 44        bad  negative   65
## 45   hardship  negative   65
## 46     afford  positive   64
## 47      debts  negative   64
## 48  correctly  positive   63
## 49   promised  positive   62
## 50     stolen  negative   62
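# optional follow-up sketch: net sentiment (positive minus negative word counts)
# per product category, using the same Bing lexicon
new_data_unnested %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  dplyr::count(Product, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(net = positive - negative)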
# visualize it

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +  # create two subplots by sentiment type
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

# show it in "comparison" wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
# install.packages("reshape2") # we will need it in order to use acast() function
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
new_data_unnested %>%
  inner_join(get_sentiments("bing")) %>%
  dplyr::count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%   # reshape into a word-by-sentiment matrix of counts (missing combinations filled with 0)
  comparison.cloud(colors = c("gray20", "gray80"),     # comparison.cloud() comes from wordcloud package
                   max.words = 100)
## Joining with `by = join_by(word)`

# distribution of positive and negative sentiments across different types of financial products
# convert the data into tibble format for the ease of data wrangling
new_text <- tibble(document = 1:nrow(new_data), text = new_data$text, Product = new_data$Product) %>% 
                    unnest_tokens(word, text) %>%
                    anti_join(stop_words)  %>%
                    group_by(Product, document) %>%
                    dplyr::mutate(text = paste(word, collapse = ' ')) %>%  # reconnect the remaining tokens into one string per document (dplyr:: prefix so the grouping is respected even though plyr is loaded)
                    distinct(Product, document, text) # collapse the token-level rows back to one row per document; documents whose text was empty (or all stopwords) have already dropped out, so every remaining row can be fed to the sentiment analyzer
## Joining with `by = join_by(word)`
# using analyzeSentiment() from the SentimentAnalysis package to count positive and negative words per document and rate the sentiment type of each document

#install.packages("SentimentAnalysis")
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## 
## The following object is masked from 'package:base':
## 
##     write
# sentiment <- analyzeSentiment(new_text$text)  # this will take a while, so just download the output from our shared Dropbox folder
sentiment <- readRDS(url("https://www.dropbox.com/s/q4em5vu8s9t9vdd/sentiment.rds?dl=1"))

# the verdict: take the sentiment rating (SentimentGI) from the output of convertToBinaryResponse(), convert it to a character vector of "positive"/"negative", and append it as a new column of new_text (a tibble object)
new_text$sentiment <- as.character(convertToBinaryResponse(sentiment)$SentimentGI)
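# optional quick check of the overall positive/negative split before breaking it down by product
table(new_text$sentiment)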

# plot it

new_text %>%
  dplyr::count(Product, sentiment, sort = TRUE)  %>%
  mutate(sentiment = reorder(sentiment, n)) %>%   # sort by frequency in descending order
  ggplot(aes(sentiment, n, fill = sentiment)) +   # plotting
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Product, scales = "free") +  # arrange sub-plots by "Product" type, this will output 7 sub-plots
  scale_x_reordered() +
  ylab("Frequency") + 
  xlab("Sentiment") +
  theme_bw()

Supplementary materials

Make your own stopword list

#Note: stop_words (from the tidytext package) is a data frame/tibble of pre-defined stopwords; you can specify your own stopword list and combine it with stop_words

library(tidytext)
data(stop_words)

# check out stop_words dimension and content
dim(stop_words)
head(stop_words)

# suppose you are working on political science text and want to define a set of terms to remove

# you first define your own stopword list
words = c("authoritarianism", "oligarchy", "democratization", "coup")

# create a matching lexicon label (repeated 4 times because the word vector above contains 4 terms)
lexicon = rep("psc", 4)

# create a dataframe for "words"
psc = data.frame(word, lexicon)

# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, psc)



## perplexity analysis: find out the ideal number of topics, conditional on the distribution of words/terms and the number of documents
# install.packages(c("ldatuning", "ggplot2", "scales", "snow", "foreach", "doParallel"))
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)
library(foreach)
library(doParallel)

# warning: this takes a long time to run; alternatively, load the precomputed result from our Dropbox shared folder as in the main section above

result <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 50, by = 1),     # candidate number of K: 2 to 50 (must be > 1)
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),  # 4 metrics
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)

## perplexity analysis: cross-validation version

## cross-validation

burnin = 1000
iter = 1000
keep = 50

n <- nrow(dtm)

k <- 15 # number of topics

splitter <- sample(1:n, round(n * 0.75))
train_set <- dtm[splitter, ]
valid_set <- dtm[-splitter, ]

fitted <- LDA(train_set, k = k, method = "Gibbs",
                          control = list(burnin = burnin, iter = iter, keep = keep) )
perplexity(fitted, newdata = train_set)
perplexity(fitted, newdata = valid_set)

## CV over many candidate numbers of topics

cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare
registerDoParallel(cluster)

clusterEvalQ(cluster, {
   library(topicmodels)
})

folds <- 5
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2, 3, 4, 5, 10, 20, 30, 40, 50, 75, 100) # candidate numbers of topics
clusterExport(cluster, c("dtm", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))

system.time({
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
   k <- candidate_k[j]
   results_1k <- matrix(0, nrow = folds, ncol = 2)
   colnames(results_1k) <- c("k", "perplexity")
   for(i in 1:folds){
      train_set <- dtm[splitfolds != i , ]
      valid_set <- dtm[splitfolds == i, ]
      
      fitted <- LDA(train_set, k = k, method = "Gibbs",
                    control = list(burnin = burnin, iter = iter, keep = keep) )
      results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))  # record k and the perplexity of the trained model on the held-out fold (lower perplexity = better generalization)
   }
   return(results_1k)
}
})
stopCluster(cluster)

results_df <- as.data.frame(results)

ggplot(results_df, aes(x = k, y = perplexity)) +
   geom_point() +
   geom_smooth(se = FALSE) +
   ggtitle("5-fold CV of topic modeling", "when fitting the trained model to the hold-out set") +
   labs(x = "Candidate number of topics", y = "Perplexity")