The twitter_data data frame has over 7,000 tweets about airlines. The tweets have already been classified as either complaints or non-complaints in the complaint_label column. Let’s get a sense of how many of these tweets are complaints.
## Parsed with column specification:
## cols(
## index = col_double(),
## tweet_id = col_double(),
## date = col_datetime(format = ""),
## complaint_label = col_character(),
## tweet_text = col_character(),
## usr_followers_count = col_double(),
## usr_verified = col_logical()
## )
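Before summarizing follower counts, it helps to see the complaint/non-complaint split itself. This is a minimal sketch using dplyr’s count(); the resulting totals aren’t reproduced in the output here.
# Count the number of complaints and non-complaints
twitter_data %>%
count(complaint_label)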
# Start with the data frame
twitter_data %>%
# Group the data by whether or not the tweet is a complaint
group_by(complaint_label) %>%
# Compute the mean, min, and max follower counts
summarize(
avg_followers = mean(usr_followers_count),
min_followers = min(usr_followers_count),
max_followers = max(usr_followers_count)
)
## `summarise()` ungrouping output (override with `.groups` argument)
twitter_data %>%
# Group by whether or not a user is verified
group_by(usr_verified) %>%
summarize(
# Compute the average number of followers
avg_followers = mean(usr_followers_count),
# Count the number of users in each category
n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Load the tidyverse and tidytext packages
library(tidyverse)
library(tidytext)
tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text)
tidy_twitter %>%
# Compute word counts
count(word) %>%
# Arrange the counts in descending order
arrange(desc(n))

tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text) %>%
# Remove stop words
anti_join(stop_words)
## Joining, by = "word"
tidy_twitter %>%
# Filter to keep complaints only
filter(complaint_label == "Complaint") %>%
# Compute word counts and arrange in descending order
count(word) %>%
arrange(desc(n))

It looks like complaints include frequent references to time, delays, and service. However, many of the most frequent words are simply the names of specific airlines. These could be treated as stop words specific to this data, and we’ll see how to remove them below.
word_counts <- tidy_twitter %>%
filter(complaint_label == "Complaint") %>%
count(word) %>%
# Keep words with count greater than 100
filter(n > 100)
# Create a bar plot using word_counts with x = word
ggplot(word_counts, aes(x = word, y = n)) +
geom_col() +
# Flip the plot coordinates
coord_flip()

word_counts <- tidy_twitter %>%
# Only keep the non-complaints
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
filter(n > 150)
# Create a bar plot using the new word_counts
ggplot(word_counts, aes(x = word, y = n)) +
geom_col() +
coord_flip() +
# Title the plot "Non-Complaint Word Counts"
ggtitle("Non-Complaint Word Counts")custom_stop_words <- tribble(
# Column names should match stop_words
~word, ~lexicon,
# Add http, win, and t.co as custom stop words
"http", "CUSTOM",
"win", "CUSTOM",
"t.co", "CUSTOM"
)
# Bind the custom stop words to stop_words
stop_words2 <- stop_words %>%
rbind(custom_stop_words)

tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
unnest_tokens(word, tweet_text) %>%
# Remove stop words
anti_join(stop_words2)
## Joining, by = "word"
word_counts <- tidy_twitter %>%
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
# Keep terms that occur more than 100 times
filter(n > 100) %>%
# Reorder word as an ordered factor by word counts
mutate(word2 = fct_reorder(word, n))
# Plot the new word column with type factor
ggplot(word_counts, aes(x = word2, y = n)) +
geom_col() +
coord_flip() +
ggtitle("Non-Complaint Word Counts")word_counts <- tidy_twitter %>%
# Count words by whether or not its a complaint
count(word, complaint_label) %>%
# Group by whether or not its a complaint
group_by(complaint_label) %>%
# Keep the top 20 words
top_n(20, n) %>%
# Ungroup before reordering word as a factor by the count
ungroup() %>%
mutate(word2 = fct_reorder(word, n))

# Include a color aesthetic tied to whether or not it's a complaint
ggplot(word_counts, aes(x = word2, y = n, fill = complaint_label)) +
# Don't include the legend for the column plot
geom_col(show.legend = FALSE) +
# Facet by whether or not it's a complaint and make the y-axis free
facet_wrap(~ complaint_label, scales = "free_y") +
# Flip the coordinates and add a title: "Twitter Word Counts"
coord_flip() +
ggtitle("Twitter Word Counts")# Load the wordcloud package
library(wordcloud)
# Compute word counts and assign to word_counts
word_counts <- tidy_twitter %>%
count(word)
wordcloud(
# Assign the word column to words
words = word_counts$word,
# Assign the count column to freq
freq = word_counts$n,
max.words = 30
)

# Compute complaint word counts and assign to word_counts
word_counts <- tidy_twitter %>%
filter(complaint_label == "Complaint") %>%
count(word)
# Create a complaint word cloud of the top 50 terms, colored red
wordcloud(
words = word_counts$word,
freq = word_counts$n,
max.words = 50,
color = "red"
)

# Count the number of words associated with each sentiment in nrc
get_sentiments('nrc') %>%
count(sentiment) %>%
# Arrange the counts in descending order
arrange(desc(n))

# Join tidy_twitter and the NRC sentiment dictionary
sentiment_twitter <- tidy_twitter %>%
inner_join(get_sentiments("nrc"))## Joining, by = "word"
# Count the sentiments in sentiment_twitter
sentiment_twitter %>%
count(sentiment) %>%
# Arrange the sentiment counts in descending order
arrange(desc(n))

word_counts <- tidy_twitter %>%
# Append the NRC dictionary and filter for positive, fear, and trust
inner_join(get_sentiments('nrc')) %>%
filter(sentiment %in% c("positive", "fear", "trust")) %>%
# Count by word and sentiment and keep the top 10 of each
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
# Create a factor called word2 that has each word ordered by the count
mutate(word2 = fct_reorder(word, n))
## Joining, by = "word"
tidy_twitter %>%
# Append the NRC sentiment dictionary
inner_join(get_sentiments('nrc')) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
spread(sentiment, n)
## Joining, by = "word"
tidy_twitter %>%
# Append the afinn sentiment dictionary
inner_join(get_sentiments('afinn')) %>%
# Group by both complaint label and whether or not the user is verified
group_by(complaint_label, usr_verified) %>%
# Summarize the data with an aggregate_value = sum(value)
summarize(aggregate_value = sum(value)) %>%
# Spread the complaint_label and aggregate_value columns
spread(complaint_label, aggregate_value) %>%
mutate(overall_sentiment = Complaint + `Non-Complaint`)
## Joining, by = "word"
## `summarise()` regrouping output by 'complaint_label' (override with `.groups` argument)
With the grouped summary spread into columns, we can easily use mutate() to create a new overall_sentiment column. It looks like unverified users complain more often, in aggregate.
sentiment_twitter <- tidy_twitter %>%
# Append the bing sentiment dictionary
inner_join(get_sentiments("bing")) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
spread(sentiment, n) %>%
# Compute overall_sentiment = positive - negative
mutate(overall_sentiment = positive - negative)
## Joining, by = "word"
# Create a bar plot out of overall sentiment by complaint label, colored by complaint label as a factor
ggplot(
sentiment_twitter,
aes(x = complaint_label, y = overall_sentiment, fill = as.factor(complaint_label))
) +
geom_col(show.legend = FALSE) +
coord_flip() +
# Title the plot "Overall Sentiment by Complaint Label" with an "Airline Twitter Data" subtitle
labs(
title = "Overall Sentiment by Complaint Label",
subtitle = "Airline Twitter Data"
)

Complaints are very negative while non-complaints are neutral at best.

LDA is a standard topic model. Topic models find patterns of words appearing together.

Creating a DTM

Create a DTM using our tidy_twitter data. In this case, each tweet is considered a document. Print tidy_twitter in the console to confirm the column names.
# Start with the tidied Twitter data
tidy_twitter %>%
# Count each word used in each tweet
count(word, tweet_id) %>%
# Use the word counts by tweet to create a DTM
cast_dtm(tweet_id, word, n)
## <<DocumentTermMatrix (documents: 7044, terms: 17994)>>
## Non-/sparse entries: 59122/126690614
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
# Assign the DTM to dtm_twitter
dtm_twitter <- tidy_twitter %>%
count(word, tweet_id) %>%
# Cast the word counts by tweet into a DTM
cast_dtm(tweet_id, word, n)
# Coerce dtm_twitter into a matrix called matrix_twitter
matrix_twitter <- as.matrix(dtm_twitter)
# Print rows 1 through 5 and columns 90 through 95
# print(matrix_twitter[1:5, 90:95])

Fitting an LDA

It’s time to run your first topic model! As discussed, the three additional arguments of the LDA() function are critical for properly running a topic model. Note that running the LDA() function could take about 10 seconds. The tidyverse and tidytext packages along with the tidy_twitter dataset have been loaded for you.
library(topicmodels)
# Run an LDA with 2 topics and a Gibbs sampler
lda_out <- LDA(
dtm_twitter,
k = 2,
method = 'Gibbs',
control = list(seed = 42)
)

# Glimpse the topic model output
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:60688] 2 1 1 1 1 1 2 1 2 1 ...
## ..@ alpha : num 25
## ..@ call : language LDA(x = dtm_twitter, k = 2, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 7044 17994
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 2
## ..@ terms : chr [1:17994] "_adowaa_" "_arzar" "_austrian" "_bbbb_" ...
## ..@ documents : chr [1:7044] "486973619952971776" "478816318784036864" "477008545637224448" "477077022695768064" ...
## ..@ beta : num [1:2, 1:17994] -12.65 -10.31 -10.26 -12.71 -7.94 ...
## ..@ gamma : num [1:7044, 1:2] 0.509 0.516 0.517 0.491 0.441 ...
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:59122] 1 1 1 2 2 2 2 2 2 2 ...
## .. ..$ j : int [1:59122] 1 5306 17631 2 2155 9755 10134 10337 10974 12281 ...
## .. ..$ v : num [1:59122] 2 1 1 1 1 1 2 1 2 1 ...
## .. ..$ nrow: int 7044
## .. ..$ ncol: int 17994
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -506519
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 60688
# Tidy the matrix of word probabilities
lda_topics <- lda_out %>%
tidy("beta")
# Arrange the topics by word probabilities in descending order
lda_topics %>%
arrange(desc(beta))
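The arranged beta values show which terms carry the most probability within each topic. As a follow-up sketch (not part of the output above), the same top_n(), fct_reorder(), and facet_wrap() pattern used for the word-count plots could be reused to view the top terms per topic; the topic, term, and beta columns come from the tidied model.
# Keep the 15 highest-probability terms within each topic
lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
# Reorder terms by beta so the bars sort within each facet
mutate(term2 = fct_reorder(term, beta)) %>%
# Plot the top terms, faceted by topic
ggplot(aes(x = term2, y = beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free_y") +
coord_flip() +
ggtitle("Top Terms by Topic")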