Tokenizing raw text using unnest_tokens()
install.packages(c("tibble", "tidytext", "ggplot2", "gutenbergr", "tidyr"))
library(tibble)
## Warning: package 'tibble' was built under R version 3.6.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(ggplot2)
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
text <- c("A market meltdown. Surging recession fears. And a sudden spotlight on America's health care system. Goldman Sachs is warning Wall Street that the coronavirus could cost President Donald Trump the election.",
"The potential political fallout from the coronavirus adds yet more uncertainty for investors trying to assess the impact of the fast-moving epidemic. ",
"If the coronavirus epidemic materially affects US economic growth it may increase the likelihood of Democratic victory in the 2020 election, Goldman Sachs analysts led by Ben Snider wrote in a report published Wednesday night.",
"That could be a negative for stocks because investors have been hoping for a continuation of the low-tax, light-regulation approach of the Trump administration. And Trump of course has been laser-focused on boosting stock prices."
)
text_df <- tibble(line = 1:4, text = text) # the tibble() function comes from the tibble package (part of the tidyverse); we have four lines of text
# note the data is in tibble format. A tibble is a modern class of data frame in R, available in the tibble and dplyr packages, that has a convenient print method, does not convert strings to factors, and does not use row names.
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 "A market meltdown. Surging recession fears. And a sudden spotligh~
## 2 2 "The potential political fallout from the coronavirus adds yet mor~
## 3 3 "If the coronavirus epidemic materially affects US economic growth~
## 4 4 "That could be a negative for stocks because investors have been h~
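# a minimal illustration (not part of the original analysis) of why tibbles are convenient:
# unlike a base data.frame on R < 4.0, a tibble never converts character columns to factors
df_base <- data.frame(line = 1:4, text = text) # on R 3.6 the text column is silently converted to a factor
class(df_base$text)  # "factor" (on R < 4.0)
class(text_df$text)  # "character"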
# we now need to convert text_df to one-token-per-document-per-row format for further analysis
text_df %>%
unnest_tokens(word, text)
## # A tibble: 127 x 2
## line word
## <int> <chr>
## 1 1 a
## 2 1 market
## 3 1 meltdown
## 4 1 surging
## 5 1 recession
## 6 1 fears
## 7 1 and
## 8 1 a
## 9 1 sudden
## 10 1 spotlight
## # ... with 117 more rows
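# unnest_tokens() is not limited to single words; as an illustration (not run in the original document),
# you can tokenize the same text into bigrams or sentences instead
text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)   # two-word sequences
# text_df %>% unnest_tokens(sentence, text, token = "sentences")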
# download Jane Austen's books
library(janeaustenr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.2
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", # regular expression (regex) to detect chapter headings (digits or roman numerals)
ignore_case = TRUE)))) %>% # match "chapter" regardless of case
ungroup() # remove the grouping by book
# check out output
original_books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ... with 73,412 more rows
# parsing using tidytext
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
# anti_join() with stop_words (a data frame) to remove stopwords. Note: stop_words is a pre-specified list of stopwords stored in a data frame/tibble; you can specify your own stopword list and combine it with stop_words (please refer to the appendix)
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining, by = "word"
# count word frequency
tidy_books %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 13,914 x 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
# plot word frequency
library(ggplot2)
tidy_books %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, such that word appears on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# download four of H.G. Wells's books from the Project Gutenberg archive
library(gutenbergr)
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# parsing
tidy_hgwells <- hgwells %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# count word frequency
tidy_hgwells %>%
dplyr::count(word, sort = TRUE)
## # A tibble: 11,769 x 2
## word n
## <chr> <int>
## 1 time 454
## 2 people 302
## 3 door 260
## 4 heard 249
## 5 black 232
## 6 stood 229
## 7 white 222
## 8 hand 218
## 9 kemp 213
## 10 eyes 210
## # ... with 11,759 more rows
# sort word frequencies for words commonly used by the two authors
library(tidyr)
frequency <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
dplyr::count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(author, proportion) %>%
gather(author, proportion, `H.G. Wells`)
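# note: spread() and gather() still work but have been superseded by pivot_wider() and pivot_longer()
# in tidyr >= 1.0.0; a sketch of the same reshaping with the newer verbs (not run in the original document)
frequency2 <- bind_rows(mutate(tidy_hgwells, author = "H.G. Wells"),
                        mutate(tidy_books, author = "Jane Austen")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  dplyr::count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer(`H.G. Wells`, names_to = "author", values_to = "proportion")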
library(scales) # scales provides helper functions for setting breaks and labels on ggplot2 axes
## Warning: package 'scales' was built under R version 3.6.3
# plot the comparison: Jane Austen's word proportions on the y-axis, H.G. Wells's on the x-axis
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
## Warning: Removed 13230 rows containing missing values (geom_point).
## Warning: Removed 13231 rows containing missing values (geom_text).
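# to quantify how similar the two authors' word frequencies are, a simple correlation test on the
# proportions is one option (a sketch, not part of the original document)
cor.test(data = frequency, ~ proportion + `Jane Austen`)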
What kind of financial products were the customers complaining about?
install.packages(c("plyr", "tidyverse", "wordcloud2", "tm", "topicmodels", "stringr", "dplyr", "ldatuning", "snow", "vctrs", "scales"))
library(vctrs)
## Warning: package 'vctrs' was built under R version 3.6.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(dplyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------- tidyverse 1.3.0 --
## v readr 1.3.1 v forcats 0.4.0
## v purrr 0.3.3
## Warning: package 'purrr' was built under R version 3.6.2
## -- Conflicts ------------------------ tidyverse_conflicts() --
## x plyr::arrange() masks dplyr::arrange()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x purrr::discard() masks scales::discard()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x plyr::rename() masks dplyr::rename()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
library(tidytext)
library(tidyr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.6.3
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.6.3
library(ggplot2)
library(stringr)
# load the subsetted customer complaints data (10k random sample)
data <- read.csv("https://www.dropbox.com/s/6kofxbf8f3cq8cw/complaints10k.csv?dl=1", stringsAsFactors=FALSE)
# check out the data
dim(data)
## [1] 10000 18
names(data)
## [1] "Date.received" "Product"
## [3] "Sub.product" "Issue"
## [5] "Sub.issue" "Consumer.complaint.narrative"
## [7] "Company.public.response" "Company"
## [9] "State" "ZIP.code"
## [11] "Tags" "Consumer.consent.provided."
## [13] "Submitted.via" "Date.sent.to.company"
## [15] "Company.response.to.consumer" "Timely.response."
## [17] "Consumer.disputed." "Complaint.ID"
# we'll take a look at the "Consumer.complaint.narrative" (the 6th column)
head(data[6])
## Consumer.complaint.narrative
## 1
## 2
## 3 I submitted a complaint to SPS last month via CFPB. SPS did not respond to the specific questions contained in the Qualified Written Request, nor did they respond to my request to schedule an in-person meeting at SPS HQ in Utah to review their " original copies '' of documents they allege are valid and are associated with my property. SPS is trying to use these so-called documents to foreclose our home based on an assertion of a financial interest that does not exist.
## 4
## 5
## 6
# rename
names(data)[6] <- "text"
text_df <- tibble(document = 1:10000, text = data$text) # use "document" in lieu of "line"
# load stop_words and create our own stopword list
data(stop_words)
word = c("xxxx", "xx")
# create the matching lexicon column: 2 entries (one per stopword), labelled "anonymous"
lexicon = rep("anonymous", 2)
# create a dataframe for "words"
anonymous = data.frame(word, lexicon)
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, anonymous)
# text processing
text_unnested <- text_df %>%
unnest_tokens(word, text) %>% # remove punctuations, convert to lower case, etc.
anti_join(stop_words) # remove stopwords
## Joining, by = "word"
text_counts <- text_unnested %>% # generate and append a word count column
dplyr::count(document, word, sort = TRUE)
# plot word frequency
library(ggplot2)
text_counts %>%
dplyr::count(word, sort = TRUE) %>%
filter(n > 400) %>%
mutate(word = reorder(word, n)) %>% # sort by frequency in descending order
ggplot(aes(word, n)) + # plotting
geom_col() +
xlab(NULL) +
coord_flip() + # flip coordinate, such that word appears on the y-axis, freq on the x-axis
theme_bw() # black and white plotting background
# you can also plot these words in the media-savvy "wordcloud" style using the wordcloud2() function
# before that, we need the "word" and "n" (count) columns from the text_counts object, collected into another tidy object
text_cloud <- text_counts %>%
dplyr::count(word, sort = TRUE)
library(wordcloud2) # load wordcloud2 package
wordcloud2(text_cloud) # the default setting
# wordcloud2(text_cloud[1:100,]) # you can control the number of words/terms appearing on the cloud to make it sparser
# wordcloud2(text_cloud, shape = "circle", color = "random-light", backgroundColor="black") # you can also set your preferred shape, color, and background color
## "cast" the tibble object into document term matrix (DTM) to feed into LDA analysis
dtm <- text_counts %>%
cast_dtm(document, word, n)
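# the cast_dtm() result is a tm DocumentTermMatrix; a quick way to peek at it (illustration only)
dtm
# tm::inspect(dtm[1:5, 1:5])   # view a small corner of the matrix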
## choosing the number of topics: use the ldatuning metrics to find a suitable K, conditional on the distribution of words/terms and the number of documents
library(ldatuning)
## Warning: package 'ldatuning' was built under R version 3.6.3
library(ggplot2)
library(scales)
library(snow)
# do not run, just load the data from our Dropbox shared folder
# result <- FindTopicsNumber(
# dtm,
# topics = seq(from = 2, to = 50, by = 1), # candidate number of K: 2 to 50 (must be > 1)
# metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
# method = "Gibbs",
# control = list(seed = 77),
# mc.cores = 2L,
# verbose = TRUE)
# the Cao and Juan (2009) and Arun (2010) metrics should be minimized
# the Griffiths (2004) and Deveaud (2014) metrics should be maximized
# eyeball the plot to find the region of K that best satisfies both criteria
result <- readRDS(url("https://www.dropbox.com/s/ap8omatxqm3v2g3/result.rds?dl=1"))
# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
# 15-topic LDA
library(topicmodels)
# text.lda <- LDA(dtm, k = 15, control = list(seed = 123)) # do not run, just load the data from our dropbox folder
text.lda <- readRDS(url("https://www.dropbox.com/s/4po0veokpr4thju/text.lda.rds?dl=1"))
text.lda
## A LDA_VEM topic model with 15 topics.
# note that you can turn the fitted model back into a one-topic-per-term-per-row format using tidy()
text.td <- tidy(text.lda) # td stands for topic-term; the output is a tidy topic-term table
# term-topic assignment: for each term-topic combination the model estimates a beta -- the probability of that term being generated from that topic.
text.td
## # A tibble: 161,445 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 9.82e- 2
## 2 2 consumer 1.26e- 3
## 3 3 consumer 9.45e-16
## 4 4 consumer 2.62e- 9
## 5 5 consumer 1.30e- 5
## 6 6 consumer 1.47e- 7
## 7 7 consumer 1.52e- 8
## 8 8 consumer 1.13e- 5
## 9 9 consumer 2.51e-28
## 10 10 consumer 6.06e-23
## # ... with 161,435 more rows
# we could use top_n() function from the dplyr package to find the top 10 terms within each topic
top_terms <- text.td %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 150 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 0.0982
## 2 1 information 0.0570
## 3 1 identity 0.0432
## 4 1 theft 0.0423
## 5 1 section 0.0327
## 6 1 reporting 0.0317
## 7 1 agency 0.0274
## 8 1 block 0.0220
## 9 1 report 0.0217
## 10 1 victim 0.0176
## # ... with 140 more rows
# document-topic assignment: here tidytext uses "gamma" for the per-document topic proportions (called "theta" in much of the LDA literature)
text.lda_gamma <- tidy(text.lda, matrix = "gamma")
text.lda_gamma
## # A tibble: 38,265 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 6551 1 0.999
## 2 7005 1 0.000393
## 3 7660 1 0.999
## 4 1815 1 0.998
## 5 2675 1 0.998
## 6 2817 1 0.998
## 7 2981 1 0.998
## 8 4282 1 0.998
## 9 6830 1 0.998
## 10 9016 1 0.998
## # ... with 38,255 more rows
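# if you want a single "most likely" topic per complaint, one option (a sketch, not in the original
# document) is to keep the row with the largest gamma for each document
doc_topics <- text.lda_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()
doc_topics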
# since we have 10,000 documents, it's hard to visualize their distribution across different topics, but we can do that for term-topic distribution, that is, the beta parameter!
term_topics <- tidy(text.lda, matrix = "beta")
# display beta values
term_topics
## # A tibble: 161,445 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 consumer 9.82e- 2
## 2 2 consumer 1.26e- 3
## 3 3 consumer 9.45e-16
## 4 4 consumer 2.62e- 9
## 5 5 consumer 1.30e- 5
## 6 6 consumer 1.47e- 7
## 7 7 consumer 1.52e- 8
## 8 8 consumer 1.13e- 5
## 9 9 consumer 2.51e-28
## 10 10 consumer 6.06e-23
## # ... with 161,435 more rows
# using dplyr's top_n() function to find out the top 10 terms within each topic
top_terms <- term_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# finally, we can plot term freq. distribution across all 15 topics using the ggplot() function
library(ggplot2)
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# do you find something insightful regarding what types of financial products customers were complaining about?
## prediction accuracy: let's process the data again but (i) retain the Product labels and (ii) collapse their categories
# collapse Product categories
require("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# values not listed in a recode() call are left unchanged, so no else clause is needed
data$Product <- recode(data$Product, "c('Bank account or service','Checking or savings account') = 'Accounts'")
data$Product <- recode(data$Product, "c('Consumer Loan','Payday loan', 'Payday loan, title loan, or personal loan', 'Student loan', 'Vehicle loan or lease') = 'Loans'")
data$Product <- recode(data$Product, "c('Credit card','Credit card or prepaid card', 'Prepaid card') = 'Credit cards'")
data$Product <- recode(data$Product, "c('Credit reporting','Credit reporting, credit repair services, or other personal consumer reports') = 'Credit reporting'")
data$Product <- recode(data$Product, "c('Money transfer, virtual currency, or money service','Money transfers', 'Other financial service', 'Virtual currency') = 'Money transfers'")
# now we have 7 main product categories
table(data$Product)
##
## Accounts Credit cards Credit reporting Debt collection
## 970 1037 3217 1801
## Loans Money transfers Mortgage
## 729 134 2112
# second, subset the data
names(data)[6] <- "text"
new_data <- data[, c("text", "Product")]
new_data_unnested <- new_data %>%
unnest_tokens(word, text)
new_data_counts <- new_data_unnested %>%
anti_join(stop_words) %>%
dplyr::count(Product, word, sort = TRUE)
## Joining, by = "word"
# display result
new_data_counts
## # A tibble: 25,553 x 3
## Product word n
## <chr> <chr> <int>
## 1 Credit reporting credit 2386
## 2 Credit reporting report 1290
## 3 Credit reporting account 1178
## 4 Credit reporting information 1136
## 5 Debt collection debt 951
## 6 Credit reporting reporting 796
## 7 Debt collection credit 654
## 8 Credit cards card 653
## 9 Accounts account 644
## 10 Credit cards credit 610
## # ... with 25,543 more rows
# compute TF-IDF to increase the product-specific identifiability of terms
new_data_counts <- new_data_counts %>%
bind_tf_idf(word, Product, n)
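# a quick refresher (illustration only): tf_idf = tf * idf, where idf(term) = ln(n_documents / n_documents
# containing the term); here the "documents" are the 7 product categories, so a term that appears under
# only one product gets idf = log(7/1)
log(7 / 1)   # ~1.95, matching the idf of "coinbase" in the output below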
# sort by TF-IDF
new_data_counts %>%
arrange(desc(tf_idf))
## # A tibble: 25,553 x 6
## Product word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Money transfers coinbase 29 0.00651 1.95 0.0127
## 2 Mortgage escrow 182 0.00621 1.25 0.00778
## 3 Money transfers gram 15 0.00337 1.95 0.00655
## 4 Loans navient 122 0.00636 0.847 0.00539
## 5 Mortgage modification 176 0.00601 0.847 0.00509
## 6 Credit reporting transunion 201 0.00351 1.25 0.00440
## 7 Money transfers usd 15 0.00337 1.25 0.00422
## 8 Mortgage homeowner 63 0.00215 1.95 0.00418
## 9 Credit reporting experian 272 0.00475 0.847 0.00403
## 10 Loans pslf 39 0.00203 1.95 0.00396
## # ... with 25,543 more rows
# plot it
new_data_counts %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(Product) %>%
top_n(10) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = Product)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~Product, ncol = 4, scales = "free") +
coord_flip()
## Selecting by tf_idf
# do you find any interpretable patterns?
# cast to dtm for further processing and analysis
new_dtm <- new_data_counts %>%
cast_dtm(Product, word, n)
new_dtm
## <<DocumentTermMatrix (documents: 7, terms: 10763)>>
## Non-/sparse entries: 25553/49788
## Sparsity : 66%
## Maximal term length: 45
## Weighting : term frequency (tf)
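# note: with only 7 "documents" (the collapsed product categories), this DTM is tiny compared with a
# per-complaint DTM; "Sparsity: 66%" means roughly two-thirds of the product-by-term cells are zero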
# fit a 7-topic LDA on the product-level DTM (we set K = 7 because we have 7 product categories)
# new_lda <- LDA(new_dtm, k = 7, control = list(seed = 123)) # do not run, just download the new_lda output from dropbox folder
new_lda <- readRDS(url("https://www.dropbox.com/s/epre5su5i0km5ho/new_lda.rds?dl=1"))
new_lda
## A LDA_VEM topic model with 7 topics.
# use augment() to join the per-word topic assignments back onto the document-term counts, showing which topic each word was assigned to within each Product "document"
# you will need broom package to be able to use augment() function
library(broom)
## Warning: package 'broom' was built under R version 3.6.3
assignments <- augment(new_lda, data = new_dtm)
assignments
## # A tibble: 25,553 x 4
## document term count .topic
## <chr> <chr> <dbl> <dbl>
## 1 Credit reporting credit 2386 6
## 2 Debt collection credit 654 5
## 3 Credit cards credit 610 3
## 4 Accounts credit 79 4
## 5 Mortgage credit 113 2
## 6 Loans credit 198 1
## 7 Money transfers credit 29 3
## 8 Credit reporting report 1290 6
## 9 Debt collection report 305 5
## 10 Credit cards report 92 3
## # ... with 25,543 more rows
# get the document-topic matrix (gamma, i.e., theta) and use it to determine the most likely topic for each Product, which serves as our product classification
new_theta <- tidy(new_lda, matrix = "gamma")
names(new_theta)[1] <- "Product"
new_theta
## # A tibble: 49 x 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Credit reporting 1 0.000000274
## 2 Debt collection 1 0.000000484
## 3 Credit cards 1 0.000000744
## 4 Accounts 1 0.00000109
## 5 Mortgage 1 0.000000535
## 6 Loans 1 0.928
## 7 Money transfers 1 0.00000352
## 8 Credit reporting 2 0.000000274
## 9 Debt collection 2 0.000000484
## 10 Credit cards 2 0.000000744
## # ... with 39 more rows
# sort product classification result
product_classifications <- new_theta %>%
group_by(Product) %>%
top_n(1, gamma) %>%
ungroup() %>%
arrange(gamma)
product_classifications
## # A tibble: 7 x 3
## Product topic gamma
## <chr> <int> <dbl>
## 1 Money transfers 7 0.658
## 2 Loans 1 0.928
## 3 Mortgage 2 0.995
## 4 Credit reporting 6 1.00
## 5 Accounts 4 1.00
## 6 Credit cards 3 1.00
## 7 Debt collection 5 1.00
# rename column
names(product_classifications)[1] <- "predicted"
# "join" the original product labels with our product (topic) assignment result side by side
assignments <- assignments %>%
inner_join(product_classifications, by = c(".topic" = "topic"))
assignments
## # A tibble: 25,553 x 6
## document term count .topic predicted gamma
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Credit reporting credit 2386 6 Credit reporting 1.00
## 2 Debt collection credit 654 5 Debt collection 1.00
## 3 Credit cards credit 610 3 Credit cards 1.00
## 4 Accounts credit 79 4 Accounts 1.00
## 5 Mortgage credit 113 2 Mortgage 0.995
## 6 Loans credit 198 1 Loans 0.928
## 7 Money transfers credit 29 3 Credit cards 1.00
## 8 Credit reporting report 1290 6 Credit reporting 1.00
## 9 Debt collection report 305 5 Debt collection 1.00
## 10 Credit cards report 92 3 Credit cards 1.00
## # ... with 25,543 more rows
# finally, we can plot the result, using colour intensity to show the percentage of word assignments from each true Product label that went to each predicted Product
library(scales)
assignments %>%
dplyr::count(document, predicted, wt = count) %>%
group_by(document) %>%
dplyr::mutate(percent = n / sum(n)) %>% # use dplyr::mutate explicitly: plyr::mutate (which masks it here) would ignore the grouping by document
ggplot(aes(predicted, document, fill = percent)) +
geom_tile() +
scale_fill_gradient2(high = "red", label = percent_format()) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank()) +
labs(x = "Complaints assigned to",
y = "Complaints came from",
fill = "% of assignments")
# it seems that, with the exception of Money transfers complaints being assigned to Credit cards, we have done a pretty good job of classifying complaints by product type
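# a rough way to quantify that impression (a sketch, not in the original document): the share of word
# assignments that landed in their true Product category
assignments %>%
  dplyr::mutate(correct = document == predicted) %>%
  dplyr::count(correct, wt = count) %>%
  dplyr::mutate(share = n / sum(n))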
Counting positive and negative sentiments, visualization, and more…
library(ggplot2)
library(plyr)
library(dplyr)
library(tibble)
library(tidyverse)
library(tidytext)
library(tidyr)
library(tm)
library(topicmodels)
library(stringr)
# most common positive and negative words
# preprocessing
new_data_unnested <- new_data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# count number of positive and negative words in the entire corpus
bing_word_counts <- new_data_unnested %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# display output (unsorted)
bing_word_counts
## # A tibble: 1,417 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 debt negative 1413
## 2 dispute negative 562
## 3 complaint negative 404
## 4 fraud negative 400
## 5 issue negative 360
## 6 fraudulent negative 332
## 7 disputed negative 256
## 8 correct positive 232
## 9 inaccurate negative 232
## 10 fair positive 223
## # ... with 1,407 more rows
# visualize it
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
# show it in "comparison" wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
install.packages("reshape2") # we will need it to use acast() function
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.6.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
new_data_unnested %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), # comparison.cloud() comes from wordcloud package
max.words = 100)
## Joining, by = "word"
# distribution of positive and negative sentiments across different types of financial products
new_text <- tibble(document = 1:nrow(new_data), text = new_data$text, Product = new_data$Product) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
group_by(Product, document) %>%
dplyr::mutate(text = paste(word, collapse = ' ')) %>% # use dplyr::mutate explicitly: plyr::mutate (which masks it here) would ignore the per-document grouping
distinct(Product, document, text)
## Joining, by = "word"
# using analyzeSentiment() from the SentimentAnalysis package to count positive and negative words per document and rate the sentiment type of each document
install.packages("SentimentAnalysis")
library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 3.6.3
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
# sentiment <- analyzeSentiment(new_text$text)
sentiment <- readRDS(url("https://www.dropbox.com/s/q4em5vu8s9t9vdd/sentiment.rds?dl=1"))
new_text$sentiment <- as.character(convertToBinaryResponse(sentiment)$SentimentGI)
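# quick sanity check (illustration only): overall counts of complaints rated positive vs negative
table(new_text$sentiment)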
# plot it
new_text %>%
dplyr::count(Product, sentiment, sort = TRUE) %>%
mutate(sentiment = reorder(sentiment, n)) %>% # sort by frequency in descending order
ggplot(aes(sentiment, n, fill = sentiment)) + # plotting
geom_col(show.legend = FALSE) +
facet_wrap(~ Product, scales = "free") +
scale_x_reordered() +
ylab("Frequency") +
theme_bw()
Make your own stopword list
# Note: stop_words (from the tidytext package) is a pre-specified list of stopwords stored in a data frame/tibble; you can specify your own stopword list and combine it with stop_words
library(tidytext)
data(stop_words)
# check out stop_words dimension and content
dim(stop_words)
head(stop_words)
# suppose you are working on political science text, you want to define a set of terms to remove
# you first define your own stopword list
word = c("authoritarianism", "oligarchy", "democratization", "coup")
# create the matching lexicon column: 4 entries because the word vector above contains 4 terms
lexicon = rep("psc", 4)
# create a data frame of the custom stopwords
psc = data.frame(word, lexicon)
# append to tidytext's stop_words list, now you have a new stop_words list
stop_words <- rbind(stop_words, psc)
## choosing the number of topics (full ldatuning run)
## find a suitable K, conditional on the distribution of words/terms and the number of documents
install.packages(c("ldatuning", "ggplot2", "scales", "snow", "foreach", "doParallel"))
library(ldatuning)
library(ggplot2)
library(scales)
library(snow)
library(foreach)
library(doParallel)
# note: this search can take a long time to run; the main text loads a precomputed result from the Dropbox shared folder instead
result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 50, by = 1), # candidate number of K: 2 to 50 (must be > 1)
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), # 4 metrics
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
# the plotting result suggests that K = 15 should be adequate, so we will train a 15-topic LDA
FindTopicsNumber_plot(result)
## perplexity analysis: cross-validation version
## cross-validation
burnin = 1000
iter = 1000
keep = 50
n <- nrow(dtm)
k <- 15 # number of topics
splitter <- sample(1:n, round(n * 0.75))
train_set <- dtm[splitter, ]
valid_set <- dtm[-splitter, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
perplexity(fitted, newdata = train_set)
perplexity(fitted, newdata = valid_set)
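# note (illustration): perplexity is exp(-log-likelihood per token), so lower values indicate a better
# fit to the held-out words; expect the validation-set perplexity to be somewhat higher than the
# training-set value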
## CV with many candidate n of topics
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
folds <- 5
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2, 3, 4, 5, 10, 20, 30, 40, 50, 75, 100) # candidates for how many topics
clusterExport(cluster, c("dtm", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))
system.time({
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0, nrow = folds, ncol = 2)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- dtm[splitfolds != i , ]
valid_set <- dtm[splitfolds == i, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(burnin = burnin, iter = iter, keep = keep) )
results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))
}
return(results_1k)
}
})
stopCluster(cluster)
results_df <- as.data.frame(results)
ggplot(results_df, aes(x = k, y = perplexity)) +
geom_point() +
geom_smooth(se = FALSE) +
ggtitle("5-fold CV of topic modeling", "when fitting the trained model to the hold-out set") +
labs(x = "Candidate number of topics", y = "Perplexity")
# which gives us a plot of hold-out perplexity against the candidate number of topics