Analysis of CrowdFlower Airline tweet sentiment data

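The CrowdFlower data can be found and downloaded here (the hyperlink is not preserved in this copy); the code below reads it from the file `Airline-Sentiment-2-w-AA.csv`.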

library(readr)
library(dplyr)
library(tidytext)
library(tidyr)
library(stringr)

# Read the raw CrowdFlower export from the local Downloads folder.
airline_tweets <- read_csv("~/Downloads/Airline-Sentiment-2-w-AA.csv")

# Turn the CSV headers into syntactically valid R names so columns can be
# referenced unquoted later on (presumably this is where the `X_unit_id`
# column used further down gets its `X` prefix -- confirm against the CSV).
names(airline_tweets) <- make.names(names(airline_tweets))

# Clean the tweet text and split it into one row per word, similar to
# http://varianceexplained.org/r/trump-tweets/
#
# `reg` describes the separators to split on: any character that is not a
# letter, digit, `#`, `@` or apostrophe, plus any apostrophe that is not
# followed by one of those characters. Hashtags, @-mentions and contractions
# therefore survive as single tokens.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
airline_words <- airline_tweets %>%
  # Remove t.co short links and HTML-escaped ampersands before tokenizing.
  # The dot is escaped so only a literal "t.co" host is matched.
  mutate(
    text = str_replace_all(text, "https?://t\\.co/[A-Za-z\\d]+|&amp;", "")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # Drop every token that appears in tidytext's `stop_words` table.
  anti_join(stop_words, by = "word")

The most common words in these tweets are:

library(ggplot2)
theme_set(theme_bw())

# Horizontal bar chart of the 20 most frequent words.
airline_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  # Fix the factor levels in reverse frequency order so that, after the
  # coordinate flip, the most frequent word is drawn at the top.
  mutate(word = factor(word, levels = rev(word))) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_bar(stat = "identity") +
  coord_flip()

Words with the most positive or negative sentiment, compared to frequency:

# Per-word sentiment table: one row per word, with the number of tweets of
# each labelled sentiment that contain it.
word_sentiment <- airline_words %>%
  # Keep one row per (unit id, sentiment, word) so a word repeated within
  # the same unit is only counted once.
  distinct(X_unit_id, airline_sentiment, word) %>%
  count(airline_sentiment, word) %>%
  # One column per sentiment label; combinations that never occur become 0.
  spread(key = airline_sentiment, value = n, fill = 0) %>%
  mutate(
    total = negative + neutral + positive,
    # Share of positive occurrences minus share of negative occurrences.
    sentiment_score = positive / total - negative / total
  ) %>%
  # Discard rare words (fewer than 25 occurrences overall).
  filter(total >= 25)

word_sentiment

## # A tibble: 658 x 6
##               word negative neutral positive total sentiment_score
##              <chr>    <dbl>   <dbl>    <dbl> <dbl>           <dbl>
## 1                @       49      14       14    77     -0.45454545
## 2     @americanair     2100     499      351  2950     -0.59288136
## 3           @delta       56       8        2    66     -0.81818182
## 4  @imaginedragons        0      28       17    45      0.37777778
## 5         @jetblue      963     702      542  2207     -0.19075668
## 6       @jetblue's        2      22        3    27      0.03703704
## 7    @southwestair     1207     668      572  2447     -0.25950143
## 8          @united     2667     699      497  3863     -0.56173958
## 9       @usairways     2317     390      274  2981     -0.68534049
## 10  @virginamerica      190     174      152   516     -0.07364341
## # ... with 648 more rows

# Scatter plot of each word's sentiment score against how often it appears.
# Both quantities are multiplied by 100 so the plotted values really are
# percentages, matching the "%" axis labels.
word_sentiment %>%
  # Put the most frequent words first; with check_overlap = TRUE, labels
  # drawn earlier take precedence over later ones that would overlap them.
  arrange(desc(total)) %>%
  ggplot(aes(100 * total / nrow(airline_tweets), 100 * sentiment_score)) +
  geom_point() +
  geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
  # Word frequencies span several orders of magnitude, hence the log scale.
  scale_x_log10() +
  # Dashed reference line at a neutral score.
  geom_hline(yintercept = 0, color = "red", lty = 2) +
  xlab("% of tweets containing word") +
  ylab("% positive - % negative")

We can also compare each word's sentiment score in these tweets to its AFINN sentiment score:

# Keep only the AFINN rows of tidytext's combined `sentiments` table.
# NOTE(review): this relies on `sentiments` having a `lexicon` column; newer
# tidytext releases may expose lexicons differently -- confirm against the
# installed version.
AFINN <- filter(sentiments, lexicon == "AFINN")

# Compare the tweet-derived sentiment score of each word with its AFINN
# score. Only words present in both tables are plotted (inner join).
# The tweet-derived score is multiplied by 100 so the plotted values are
# percentages, matching the "%" in the y-axis label.
word_sentiment %>%
  inner_join(AFINN, by = "word") %>%
  ggplot(aes(score, 100 * sentiment_score)) +
  geom_point() +
  geom_text(aes(label = word), vjust = 1, hjust = 1) +
  xlab("AFINN Sentiment Score") +
  ylab("% positive - % negative in tweets")

Analysis of CrowdFlower Airline tweet sentiment data

David Robinson

September 8, 2016