The twitter_data data frame has over 7,000 tweets about airlines. The tweets have already been classified as either complaints or non-complaints in the complaint_label column. Let’s get a sense of how many of these tweets are complaints.
## Parsed with column specification:
## cols(
## index = col_double(),
## tweet_id = col_double(),
## date = col_datetime(format = ""),
## complaint_label = col_character(),
## tweet_text = col_character(),
## usr_followers_count = col_double(),
## usr_verified = col_logical()
## )
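Before summarizing follower counts, it helps to see the complaint/non-complaint split itself. This is a minimal sketch using dplyr’s count(); the resulting totals aren’t reproduced in the output here.
# Count the number of complaints and non-complaints
twitter_data %>%
count(complaint_label)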
# Start with the data frame
twitter_data %>%
# Group the data by whether or not the tweet is a complaint
group_by(complaint_label) %>%
# Compute the mean, min, and max follower counts
summarize(
avg_followers = mean(usr_followers_count),
min_followers = min(usr_followers_count),
max_followers = max(usr_followers_count)
)
## `summarise()` ungrouping output (override with `.groups` argument)
twitter_data %>%
# Group by whether or not a user is verified
group_by(usr_verified) %>%
summarize(
# Compute the average number of followers
avg_followers = mean(usr_followers_count),
# Count the number of users in each category
n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Load the tidyverse and tidytext packages
library(tidyverse)
library(tidytext)
tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text)
tidy_twitter %>%
# Compute word counts
count(word) %>%
# Arrange the counts in descending order
arrange(desc(n))

tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text) %>%
# Remove stop words
anti_join(stop_words)
## Joining, by = "word"
tidy_twitter %>%
# Filter to keep complaints only
filter(complaint_label == "Complaint") %>%
# Compute word counts and arrange in descending order
count(word) %>%
arrange(desc(n))

It looks like complaints include frequent references to time, delays, and service. However, many of the most frequent words are simply the names of specific airlines. These could be treated as stop words specific to this data, and we’ll see how to remove them below.
word_counts <- tidy_twitter %>%
filter(complaint_label == "Complaint") %>%
count(word) %>%
# Keep words with count greater than 100
filter(n > 100)
# Create a bar plot using word_counts with x = word
ggplot(word_counts, aes(x = word, y = n)) +
geom_col() +
# Flip the plot coordinates
coord_flip()

word_counts <- tidy_twitter %>%
# Only keep the non-complaints
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
filter(n > 150)
# Create a bar plot using the new word_counts
ggplot(word_counts, aes(x = word, y = n)) +
geom_col() +
coord_flip() +
# Title the plot "Non-Complaint Word Counts"
ggtitle("Non-Complaint Word Counts")custom_stop_words <- tribble(
# Column names should match stop_words
~word, ~lexicon,
# Add http, win, and t.co as custom stop words
"http", "CUSTOM",
"win", "CUSTOM",
"t.co", "CUSTOM"
)
# Bind the custom stop words to stop_words
stop_words2 <- stop_words %>%
rbind(custom_stop_words)

tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text) %>%
# Remove stop words
anti_join(stop_words2)
## Joining, by = "word"
word_counts <- tidy_twitter %>%
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
# Keep terms that occur more than 100 times
filter(n > 100) %>%
# Reorder word as an ordered factor by word counts
mutate(word2 = fct_reorder(word, n))
# Plot the new word column with type factor
ggplot(word_counts, aes(x = word2, y = n)) +
geom_col() +
coord_flip() +
ggtitle("Non-Complaint Word Counts")word_counts <- tidy_twitter %>%
# Count words by whether or not its a complaint
count(word, complaint_label) %>%
# Group by whether or not its a complaint
group_by(complaint_label) %>%
# Keep the top 20 words
top_n(20, n) %>%
# Ungroup before reordering word as a factor by the count
ungroup() %>%
mutate(word2 = fct_reorder(word, n))

# Include a color aesthetic tied to whether or not it's a complaint
ggplot(word_counts, aes(x = word2, y = n, fill = complaint_label)) +
# Don't include the legend for the column plot
geom_col(show.legend = FALSE) +
# Facet by whether or not it's a complaint and make the y-axis free
facet_wrap(~ complaint_label, scales = "free_y") +
# Flip the coordinates and add a title: "Twitter Word Counts"
coord_flip() +
ggtitle("Twitter Word Counts")# Load the wordcloud package
library(wordcloud)
# Compute word counts and assign to word_counts
word_counts <- tidy_twitter %>%
count(word)
wordcloud(
# Assign the word column to words
words = word_counts$word,
# Assign the count column to freq
freq = word_counts$n,
max.words = 30
)

# Compute complaint word counts and assign to word_counts
word_counts <- tidy_twitter %>%
filter(complaint_label == "Complaint") %>%
count(word)
# Create a complaint word cloud of the top 50 terms, colored red
wordcloud(
words = word_counts$word,
freq = word_counts$n,
max.words = 50,
color = "red"
)

# Count the number of words associated with each sentiment in nrc
get_sentiments('nrc') %>%
count(sentiment) %>%
# Arrange the counts in descending order
arrange(desc(n))

# Join tidy_twitter and the NRC sentiment dictionary
sentiment_twitter <- tidy_twitter %>%
inner_join(get_sentiments("nrc"))## Joining, by = "word"
# Count the sentiments in sentiment_twitter
sentiment_twitter %>%
count(sentiment) %>%
# Arrange the sentiment counts in descending order
arrange(desc(n))

word_counts <- tidy_twitter %>%
# Append the NRC dictionary and filter for positive, fear, and trust
inner_join(get_sentiments('nrc')) %>%
filter(sentiment %in% c("positive", "fear", "trust")) %>%
# Count by word and sentiment and keep the top 10 of each
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
# Create a factor called word2 that has each word ordered by the count
mutate(word2 = fct_reorder(word, n))
## Joining, by = "word"
tidy_twitter %>%
# Append the NRC sentiment dictionary
inner_join(get_sentiments('nrc')) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
spread(sentiment, n)
## Joining, by = "word"
tidy_twitter %>%
# Append the afinn sentiment dictionary
inner_join(get_sentiments('afinn')) %>%
# Group by both complaint label and whether or not the user is verified
group_by(complaint_label, usr_verified) %>%
# Summarize the data with an aggregate_value = sum(value)
summarize(aggregate_value = sum(value)) %>%
# Spread the complaint_label and aggregate_value columns
spread(complaint_label, aggregate_value) %>%
mutate(overall_sentiment = Complaint + `Non-Complaint`)
## Joining, by = "word"
## `summarise()` regrouping output by 'complaint_label' (override with `.groups` argument)
With the grouped summary spread into columns, we can easily use mutate() to create a new overall_sentiment column. It looks like unverified users complain more often, in aggregate.
sentiment_twitter <- tidy_twitter %>%
# Append the bing sentiment dictionary
inner_join(get_sentiments("bing")) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
spread(sentiment, n) %>%
# Compute overall_sentiment = positive - negative
mutate(overall_sentiment = positive - negative)
## Joining, by = "word"
# Create a bar plot out of overall sentiment by complaint label, colored by complaint label as a factor
ggplot(
sentiment_twitter,
aes(x = complaint_label, y = overall_sentiment, fill = as.factor(complaint_label))
) +
geom_col(show.legend = FALSE) +
coord_flip() +
# Title the plot "Overall Sentiment by Complaint Label" with an "Airline Twitter Data" subtitle
labs(
title = "Overall Sentiment by Complaint Label",
subtitle = "Airline Twitter Data"
)

Complaints are very negative while non-complaints are neutral at best.

LDA is a standard topic model. Topic models find patterns of words appearing together.

Creating a DTM

Create a DTM using our tidy_twitter data. In this case, each tweet is considered a document. Print tidy_twitter in the console to confirm the column names.
# Start with the tidied Twitter data
tidy_twitter %>%
# Count each word used in each tweet
count(word, tweet_id) %>%
# Use the word counts by tweet to create a DTM
cast_dtm(tweet_id, word, n)
## <<DocumentTermMatrix (documents: 7044, terms: 17994)>>
## Non-/sparse entries: 59122/126690614
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
# Assign the DTM to dtm_twitter
dtm_twitter <- tidy_twitter %>%
count(word, tweet_id) %>%
# Cast the word counts by tweet into a DTM
cast_dtm(tweet_id, word, n)
# Coerce dtm_twitter into a matrix called matrix_twitter
matrix_twitter <- as.matrix(dtm_twitter)
# Print rows 1 through 5 and columns 90 through 95
# print(matrix_twitter[1:5, 90:95])

Fitting an LDA

It’s time to run your first topic model! As discussed, the three additional arguments of the LDA() function are critical for properly running a topic model. Note that running the LDA() function could take about 10 seconds. The tidyverse and tidytext packages along with the tidy_twitter dataset have been loaded for you.
library(topicmodels)
# Run an LDA with 2 topics and a Gibbs sampler
lda_out <- LDA(
dtm_twitter,
k = 2,
method = 'Gibbs',
control = list(seed = 42)
)

# Glimpse the topic model output
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:60688] 2 1 1 1 1 1 2 1 2 1 ...
## ..@ alpha : num 25
## ..@ call : language LDA(x = dtm_twitter, k = 2, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 7044 17994
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 2
## ..@ terms : chr [1:17994] "_adowaa_" "_arzar" "_austrian" "_bbbb_" ...
## ..@ documents : chr [1:7044] "486973619952971776" "478816318784036864" "477008545637224448" "477077022695768064" ...
## ..@ beta : num [1:2, 1:17994] -12.65 -10.31 -10.26 -12.71 -7.94 ...
## ..@ gamma : num [1:7044, 1:2] 0.509 0.516 0.517 0.491 0.441 ...
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:59122] 1 1 1 2 2 2 2 2 2 2 ...
## .. ..$ j : int [1:59122] 1 5306 17631 2 2155 9755 10134 10337 10974 12281 ...
## .. ..$ v : num [1:59122] 2 1 1 1 1 1 2 1 2 1 ...
## .. ..$ nrow: int 7044
## .. ..$ ncol: int 17994
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -506519
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 60688
# Tidy the matrix of word probabilities
lda_topics <- lda_out %>%
tidy("beta")
# Arrange the topics by word probabilities in descending order
lda_topics %>%
arrange(desc(beta))
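The arranged beta values show which terms carry the most probability within each topic. As a follow-up sketch (not part of the output above), the same top_n(), fct_reorder(), and facet_wrap() pattern used for the word-count plots could be reused to view the top terms per topic; the topic, term, and beta columns come from the tidied model.
# Keep the 15 highest-probability terms within each topic
lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
# Reorder terms by beta so the bars sort within each facet
mutate(term2 = fct_reorder(term, beta)) %>%
# Plot the top terms, faceted by topic
ggplot(aes(x = term2, y = beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free_y") +
coord_flip() +
ggtitle("Top Terms by Topic")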