Text Analysis in R

R Markdown

library("tidyverse")
library("data.table")
library("tidytext")
library("DT")
library("wordcloud")
library("topicmodels")

Including Plots

You can also embed plots, for example:

review_data %>% 
  group_by(Product) %>% 
  summarise(starts_mean = mean(Stars))

## # A tibble: 2 x 2
##   Product                                  starts_mean
##   <chr>                                          <dbl>
## 1 iRobot Roomba 650 for Pets                      4.49
## 2 iRobot Roomba 880 for Pets and Allergies        4.42

Text Data is categorical…

review_data %>% 
  group_by(Product) %>% 
  summarise(number_rows = n()) %>% 
  arrange(desc(number_rows))

## # A tibble: 2 x 2
##   Product                                  number_rows
##   <chr>                                          <int>
## 1 iRobot Roomba 880 for Pets and Allergies        1200
## 2 iRobot Roomba 650 for Pets                       633

Some natural NLP vocabulary:

Bag of words: Words in a document are independent.
Every separate body of text is a document
Each unique word is a term
Every ocurrence of a term is a token
Creating bags of words is called tokenizing

unnest_tokens Is like a gather() function but for all words in the text column, remove punctuation,each word is lowercase and white space has been removed too.

tidy_review <- review_data %>% 
                unnest_tokens(word,Review)

datatable(data =tidy_review %>% 
  count(word) %>% 
  arrange(desc(n))  ,style = "Bootstrap")

Removing stop words

head(stop_words)

## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART

tidy_review2 <- review_data %>% 
            mutate(id = row_number()) %>% 
            unnest_tokens(word, Review) %>% 
            anti_join(stop_words)

## Joining, by = "word"

tidy_review2 %>% 
      count(word) %>% 
      arrange(desc(n)) %>%
      filter(n > 300) %>% 
      ggplot(
        aes(x = word, y = n)
      ) +
      geom_col() +
      theme_minimal() + 
      coord_flip() + 
      ggtitle("Review Word Counts")

Custom stop words

custom_stopwords <-  tribble(~word, ~lexicon,
         "roomba", "CUSTOM",
         "2", "CUSTOM")

stop_words_c <- stop_words %>% 
                bind_rows(custom_stopwords)

tidy_review3 <- review_data %>% 
            mutate(id = row_number()) %>% 
            unnest_tokens(word, Review) %>% 
            anti_join(stop_words_c)

## Joining, by = "word"

tidy_review3 %>% 
      count(word, Product) %>% 
      group_by(Product) %>%
      top_n(10,n) %>% 
      ungroup() %>% 
  mutate(word = fct_reorder(word,n)) %>% 
      ggplot(
        aes(x = word, y = n, fill = Product)
      ) +
      geom_col(show.legend = F) +
      facet_wrap(~ Product, scales = "free_y") +
      theme_minimal() + 
      coord_flip() + 
      ggtitle("Review Word Counts")

Word Clouds

tidy_words <- tidy_review3 %>% count(word)

wordcloud(
  words = tidy_words$word,
  freq = tidy_words$n,
  max.words = 30,
  random.order = FALSE,
  colors = brewer.pal(6,"Dark2")
  
)

Sentiment analysis

# Bing dictionary: Positive and negative words

get_sentiments("bing") %>% 
      count(sentiment)

## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

# Afinn dictionary:  assigns scores to words, positive scores, positive words and negative scores for negative words

get_sentiments("afinn") %>% 
  summarize(
min = min(value),
max = max(value)
)

## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1    -5     5

#Loughran dictionary:

sentiment_counts <- get_sentiments("loughran") %>%
              count(sentiment) %>%
              mutate(sentiment2 = fct_reorder(sentiment, n))


ggplot(sentiment_counts, aes(x = sentiment2, y = n)) +
      geom_col() +
      coord_flip() +
      labs(
      title = "Sentiment Counts in Loughran",
      x = "Counts",
      y = "Sentiment"
)

# nrc dictionary

# Count the number of words associated with each sentiment in nrcs
sentiment_counts2 <- get_sentiments("nrc") %>% 
  count(sentiment) %>% 
            mutate(sentiment2 = fct_reorder(sentiment, n))


ggplot(sentiment_counts2, aes(x = sentiment2, y = n)) +
      geom_col() +
      coord_flip() +
      labs(
      title = "Sentiment Counts in nrc",
      x = "Counts",
      y = "Sentiment"
)

tidy_review_sentiment <- tidy_review3 %>%
  inner_join(get_sentiments("loughran")) %>% 
  count(word, sentiment) %>% 
  filter(sentiment %in% c("positive", "negative", "uncertainty")) %>% 
  group_by(sentiment) %>% 
  top_n(10,n) %>% 
  ungroup() %>% 
  mutate(word = fct_reorder(word, n))


ggplot(tidy_review_sentiment,
       aes(x = word, y = n , fill = sentiment)) +
      geom_col() +
      facet_wrap(~sentiment, scales = "free")+
      coord_flip()

## Improving sentiment analysis

sentiment_stars <- tidy_review %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(Stars,sentiment) %>% 
  spread(sentiment,n) %>% 
    mutate(overall_sentiment = positive - negative,
         Stars = fct_reorder(as.character(Stars), overall_sentiment))

## Joining, by = "word"

ggplot(sentiment_stars,
       aes(x = Stars, y = overall_sentiment, fill = as.factor(Stars))) +
    geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Overall sentiment stars")

Latent dirichlet allocation

Searches for patterns of words ocurring together within and across a collection documents, also known as corpus.

LDA is a standard topic model
Bag of words is treating every word in a document separately
Topic models find patterns of words appearing to together
Searching for patters rather than predicting is known as unsupervised learning

Topic modeling

Topics are covered based on word frequency, which is discrete.
Every document is a mixture (i.e. partial member) of every topic.

Document term matrix and LDA modeling

Adding topics that are different is good
If we start repeating topics , we’ve gone too far
Name the topics based on the combination of high probability words

dtm_matrix<- tidy_review %>%
  mutate(id = row_number()) %>% 
   inner_join(get_sentiments("bing")) %>% 
  count(word, id) %>% 
  sample_n(size = 500) %>% 
  cast_dtm(id, word, n) %>%  as.matrix()

## Joining, by = "word"

lda_out <-  LDA(
              dtm_matrix,
              k = 2,
              method = 'Gibbs',
              control = list(seed = 42)
  
)

glimpse(lda_out)

## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:500] 1 1 2 2 1 1 1 1 1 1 ...
##   ..@ alpha          : num 25
##   ..@ call           : language LDA(x = dtm_matrix, k = 2, method = "Gibbs", control = list(seed = 42))
##   ..@ Dim            : int [1:2] 500 211
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 2
##   ..@ terms          : chr [1:211] "thank" "enthusiasm" "hung" "well" ...
##   ..@ documents      : chr [1:500] "222819" "93079" "26657" "51889" ...
##   ..@ beta           : num [1:2, 1:211] -4.89 -7.87 -5.54 -7.87 -7.94 ...
##   ..@ gamma          : num [1:500, 1:2] 0.51 0.51 0.49 0.49 0.51 ...
##   ..@ wordassignments:List of 5
##   .. ..$ i   : int [1:500] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ j   : int [1:500] 1 2 3 4 5 6 7 6 8 9 ...
##   .. ..$ v   : num [1:500] 1 1 2 2 1 1 1 1 1 2 ...
##   .. ..$ nrow: int 500
##   .. ..$ ncol: int 211
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -2492
##   ..@ iter           : int 2000
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 500

lda_topics <- lda_out %>% 
            tidy(matrix = "beta") %>% 
            arrange(desc(beta)) %>% 
            group_by(topic) %>% 
            top_n(15,beta) %>% 
            ungroup() %>% 
            mutate(term = fct_reorder(term,beta))

ggplot(lda_topics,
       aes(x = term,
           y =beta,
           fill = as.factor(topic))) +
      geom_col() +
      facet_wrap(~topic, scales = "free") +
      coord_flip()

Text Analysis in R - NLP

Miguel Arquez Abdala

24/10/2019