The CrowdFlower data can be found and downloaded here.
library(readr)
library(dplyr)
library(tidytext)
library(tidyr)
library(stringr)
# Load the airline-sentiment tweets and make every column name a syntactically
# valid R identifier so columns can be referenced as bare names
# (e.g. `X_unit_id`).
airline_tweets <- read_csv("~/Downloads/Airline-Sentiment-2-w-AA.csv")
colnames(airline_tweets) <- make.names(colnames(airline_tweets))

# clean text and separate into words, similar to
# http://varianceexplained.org/r/trump-tweets/
#
# `reg` is the token delimiter: any character that is not a letter, digit,
# `#`, `@` or apostrophe, plus any apostrophe that is not followed by one of
# those characters. Hashtags, @-mentions and contractions therefore stay
# intact as single tokens.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

# One row per (tweet, word) with stop words removed.
airline_words <- airline_tweets %>%
  # Drop t.co short links and the HTML-escaped ampersand. The entity has to be
  # removed as a whole (`&amp;`): stripping only `&` would leave "amp" behind
  # as a spurious token. The dot in `t.co` is escaped so it matches literally.
  mutate(
    text = str_replace_all(text, "https?://t\\.co/[A-Za-z\\d]+|&amp;", "")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  anti_join(stop_words, by = "word")
The most common words in these tweets are:
library(ggplot2)
theme_set(theme_bw())

# Horizontal bar chart of the 20 most frequent words in `airline_words`.
airline_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  # Reverse the level order so that, once the axes are flipped, the most
  # frequent word is drawn at the top of the chart.
  mutate(word = factor(word, levels = rev(word))) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip()
Words with the most positive or negative sentiment, compared to frequency:
# Per-word sentiment profile: how many negative / neutral / positive tweets
# contain each word, the total across the three classes, and a score equal to
# the share of positive minus the share of negative. Only words with a total
# of at least 25 are kept.
word_sentiment <- airline_words %>%
  # Count a word at most once per X_unit_id (presumably one id per tweet).
  distinct(X_unit_id, airline_sentiment, word) %>%
  count(airline_sentiment, word) %>%
  # One column per sentiment class; absent combinations count as 0.
  spread(key = airline_sentiment, value = n, fill = 0) %>%
  mutate(
    total = negative + neutral + positive,
    sentiment_score = positive / total - negative / total
  ) %>%
  filter(total >= 25)

word_sentiment
## # A tibble: 658 x 6
## word negative neutral positive total sentiment_score
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 @ 49 14 14 77 -0.45454545
## 2 @americanair 2100 499 351 2950 -0.59288136
## 3 @delta 56 8 2 66 -0.81818182
## 4 @imaginedragons 0 28 17 45 0.37777778
## 5 @jetblue 963 702 542 2207 -0.19075668
## 6 @jetblue's 2 22 3 27 0.03703704
## 7 @southwestair 1207 668 572 2447 -0.25950143
## 8 @united 2667 699 497 3863 -0.56173958
## 9 @usairways 2317 390 274 2981 -0.68534049
## 10 @virginamerica 190 174 152 516 -0.07364341
## # ... with 648 more rows
# Scatter plot of how common a word is (log scale) against how positive or
# negative the tweets containing it are. Both quantities are multiplied by
# 100 so that the plotted values really are percentages, as the axis labels
# state (previously fractions were plotted under "%" labels).
# Assumes `airline_tweets` has one row per tweet -- TODO confirm.
word_sentiment %>%
  # Most frequent words first: with check_overlap = TRUE a label that would
  # overlap an earlier one is skipped, so this ordering gives frequent words
  # priority when labels collide.
  arrange(desc(total)) %>%
  ggplot(aes(100 * total / nrow(airline_tweets), 100 * sentiment_score)) +
  geom_point() +
  geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
  scale_x_log10() +
  # Reference line separating net-positive from net-negative words.
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  xlab("% of tweets containing word") +
  ylab("% positive - % negative")
Or compared to the AFINN sentiment score:
# AFINN lexicon: one numeric sentiment score per word.
# Fetched through get_sentiments() rather than by filtering `sentiments` on a
# `lexicon` column, because current tidytext releases no longer ship that
# column (AFINN is obtained via the textdata package, which may ask to
# download the lexicon on first use).
AFINN <- get_sentiments("afinn")
# Newer releases name the numeric column `value`, older ones `score`;
# normalise to `score` so the plot below works with either.
names(AFINN)[names(AFINN) == "value"] <- "score"

# Compare each word's lexicon score with its observed sentiment in the tweets.
# The tweet-based score is multiplied by 100 so the y axis is in percentage
# points, matching its label.
word_sentiment %>%
  inner_join(AFINN, by = "word") %>%
  ggplot(aes(score, 100 * sentiment_score)) +
  geom_point() +
  geom_text(aes(label = word), vjust = 1, hjust = 1) +
  xlab("AFINN Sentiment Score") +
  ylab("% positive - % negative in tweets")