1 Word Frequency Summary

Load data

# Run the project's utility script, then make theme_light() the default
# ggplot2 theme for every plot below.
source(file = "R/utils.R")
theme_set(theme_light())

# Read the saved tweet and user objects used throughout the report.
tw <- readRDS(file = "data/tweets_simp_522.rds")
users <- readRDS(file = "data/users_simp_521.rds")

1.1 Top Hashtags (other than #Monsanto)

# Build the hashtag frequency table with the project helper, sort it by
# frequency, drop the "MONSANTO" tag itself, and keep the 15 most frequent
# remaining tags (top_n() keeps ties).
ht.top <- tw %>%
  related_hashtags_freq(upper = TRUE) %>%
  sort() %>%
  as.data.frame() %>%
  filter(tags.all != "MONSANTO") %>%
  top_n(15, wt = Freq)

# Horizontal bar chart of the retained hashtags.
ht.top %>%
  ggplot(aes(tags.all, Freq)) +
  geom_col() +
  labs(x = "", y = "Number of appearances") +
  coord_flip()

1.1.1 Hashtag cooccurrences

# Build the weighted, undirected hashtag co-occurrence graph from the saved
# adjacency matrix; diagonal entries are not used.
ht.net <-
  readRDS("data/ht_adj_mat.rds") %>%
  graph_from_adjacency_matrix(
    mode = "undirected",
    weighted = TRUE,
    diag = FALSE
  )

# Restrict the graph to the top hashtags and lay them out on a circle.
htop <- induced_subgraph(ht.net, as.vector(ht.top$tags.all))
graph_attr(htop, "layout") <- layout_in_circle

plot(
  htop,
  vertex.shape = "none",
  edge.color = "orange",
  # Edge widths must come from the plotted subgraph: the full network's edge
  # list has a different length and order, so E(ht.net)$weight would attach
  # the wrong weights to the drawn edges.
  edge.width = E(htop)$weight / 3,
  vertex.label.dist = 0,
  vertex.label.color = "steel blue",
  vertex.label.font = 1.5,
  vertex.label.cex = .7,
  vertex.color = "gray50"
)

The edge weight (drawn as line width) indicates how frequently a pair of hashtags appears together.

1.2 Basic Word Count

1.2.1 For all tweets

# Run the project's clean_tweets() helper over every tweet and keep the
# result; later sections reuse it for word counts and the sentiment join.
tw.words <- clean_tweets(tw)
plot_word_freq(tw.words)

1.2.1.1 Word cloud for all tweets

# Tally how often each word occurs, most frequent first.
words.count <- tw.words %>% count(word, sort = TRUE)

# Draw at most 200 words, placed in decreasing order of frequency
# (random.order = FALSE), with 15% of them rotated by 90 degrees.
wordcloud(
  words = words.count$word,
  freq = words.count$n,
  random.order = FALSE,
  max.words = 200,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)

1.2.2 For all tweets except tweets from bots

# Word-frequency plot restricted to rows where is.bot is FALSE.
filter(tw, !is.bot) %>%
  clean_tweets() %>%
  plot_word_freq()

1.2.3 For all unique tweets except tweets from bots

# Drop bot rows, then collapse duplicate tweet texts to one row each before
# counting words. distinct(text) keeps only the `text` column.
filter(tw, !is.bot) %>%
  distinct(text) %>%
  clean_tweets() %>%
  plot_word_freq()

1.2.4 For all retweets except tweets from bots

# Word-frequency plot for rows that are retweets and are not flagged as bots.
filter(tw, is_retweet & !is.bot) %>%
  clean_tweets() %>%
  plot_word_freq()

1.2.5 For all tweets by bots

# Word-frequency plot restricted to rows where is.bot is TRUE.
filter(tw, is.bot) %>%
  clean_tweets() %>%
  plot_word_freq()

2 Sentiment Extraction

2.1 Unigram-based Analysis

2.1.1 Most common positive and negative words

# Attach the Bing lexicon's sentiment label to every word it contains; words
# that are not in the lexicon are dropped by the inner join.
words.bing <- tw.words %>%
  inner_join(get_sentiments("bing"), by = "word")

# For each sentiment, plot the 10 most frequent words (ties kept) as
# horizontal bars, one facet per sentiment.
words.bing %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10, wt = n) %>%
  ungroup() %>%
  # Reorder the word factor by count so the bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

2.1.2 Sentiment Wordclouds

# Cast the (word, sentiment) counts into a word-by-sentiment matrix, using 0
# for missing combinations, and draw it as a comparison cloud of up to 150
# words.
count(words.bing, word, sentiment, sort = TRUE) %>%
  reshape2::acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 150
  )

2.2 Sentence-based Analysis

The sentence-based sentiment analysis is performed in advance using the sentimentr package. Each tweet is assigned a polarity score (positive/negative) by a sentence-based sentiment mining algorithm. The Jockers (2017) dictionary (the `hash_sentiment_jockers_rinker` lexicon) is used for our analysis.

# Split each tweet's text into sentences and compute one averaged sentiment
# score per tweet, using the Jockers-Rinker polarity table.
senti_score <- sentimentr::sentiment_by(
  sentimentr::get_sentences(tw$text),
  polarity_dt = lexicon::hash_sentiment_jockers_rinker
)

# Store the per-tweet average as a new `senti` column next to the tweet data.
tw.ex <- tw %>% mutate(senti = senti_score$ave_sentiment)

2.2.1 Histogram of sentiment score for all tweets

# Distribution of the per-tweet sentiment score over all tweets.
tw.ex %>%
  plot_density("senti") +
  labs(x = "sentiment score")

Retweet may not mean endorsement.

2.2.2 Histogram of sentiment score for all unique tweets

# Keep one row per distinct tweet text (all other columns are retained via
# .keep_all) and plot the distribution of its sentiment score.
tw.ex %>%
  distinct(text, .keep_all = TRUE) %>%
  plot_density("senti") +
  labs(x = "sentiment score")

2.2.3 Histogram of sentiment score of all users

# Per-user mean and standard deviation of the tweet sentiment score, joined
# onto the user table; users missing from either side are dropped.
users.ex <- group_by(tw.ex, user_id) %>%
  summarise(senti.mean = mean(senti), senti.sd = sd(senti)) %>%
  inner_join(users, by = "user_id")

# Distribution of the per-user mean sentiment.
users.ex %>%
  plot_density("senti.mean", bins = 40) +
  labs(x = "sentiment score")

2.2.4 Sentiment variation over time

# Daily mean sentiment drawn as a line, with a grey band of one standard
# deviation on either side of the mean.
group_by(tw.ex, date = lubridate::floor_date(created_at, unit = "day")) %>%
  summarise(senti.mean = mean(senti), senti.sd = sd(senti)) %>%
  ggplot(aes(x = date, y = senti.mean)) +
  geom_ribbon(
    aes(ymin = senti.mean - senti.sd, ymax = senti.mean + senti.sd),
    fill = "grey70"
  ) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +
  labs(x = "", y = "avg. sentiment score")

2.2.5 Comparison of sentiment score between user groups

2.2.5.1 Bots vs. Normal users

# Box plots of per-user mean sentiment, split by the is.bot flag
# (FALSE is labelled "users", TRUE is labelled "bots").
users.ex %>%
  ggplot(aes(x = is.bot, y = senti.mean)) +
  geom_boxplot() +
  coord_flip() +
  scale_x_discrete(labels = c("users", "bots")) +
  labs(x = "", y = "sentiment score")

# Welch two-sample t-test: bots (x) against all other users (y).
t.test(
  x = filter(users.ex, is.bot)$senti.mean,
  y = filter(users.ex, !is.bot)$senti.mean
)

## 
##  Welch Two Sample t-test
## 
## data:  filter(users.ex, is.bot)$senti.mean and filter(users.ex, !is.bot)$senti.mean
## t = 6.6519, df = 452.66, p-value = 8.368e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.02016328 0.03707310
## sample estimates:
##  mean of x  mean of y 
## -0.1232990 -0.1519172

The difference in mean sentiment score between the group “Bots” and the group “Normal users” is significant at α = 0.001; the tweets posted/retweeted by bots tend to have more neutral sentiment.

2.2.5.2 Active users vs. Less active users

We treat the users that contributed > 7 tweets (1 sd. above the mean) as active users.

# Box plots of per-user mean sentiment for users with at most 7 tweets
# ("less") versus users with more than 7 tweets ("more").
users.ex %>%
  ggplot(aes(x = ntweets > 7, y = senti.mean)) +
  geom_boxplot() +
  coord_flip() +
  scale_x_discrete(labels = c("less", "more")) +
  labs(x = "", y = "sentiment score")

# Welch two-sample t-test: active users (x) against less active users (y).
t.test(
  x = filter(users.ex, ntweets > 7)$senti.mean,
  y = filter(users.ex, ntweets <= 7)$senti.mean
)

## 
##  Welch Two Sample t-test
## 
## data:  filter(users.ex, ntweets > 7)$senti.mean and filter(users.ex, ntweets <= 7)$senti.mean
## t = 13.575, df = 1376.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03065838 0.04101555
## sample estimates:
##  mean of x  mean of y 
## -0.1165339 -0.1523709

The difference in mean sentiment score between the group “Active users” and the group “Less active users” is significant at α = 0.001; the tweets posted/retweeted by active users tend to have more neutral sentiment. Note that the bots we identified are also a subset of active users.

2.2.5.3 Creator vs. ReTweeter vs. Mixed behaviour users

Quick definition: a creator is a user who never retweets; a retweeter is a user who always retweets. Everyone else shows mixed behaviour.

# Label each user by the share of their tweets that are original:
# 0 gives "retweeter", 1 gives "creator", anything in between gives "mix".
users.ex <- users.ex %>%
  mutate(
    role = ifelse(
      ncreate / ntweets == 0,
      "retweeter",
      ifelse(ncreate / ntweets == 1, "creator", "mix")
    )
  )

# Box plots of per-user mean sentiment for each role.
users.ex %>%
  ggplot(aes(x = role, y = senti.mean)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "", y = "sentiment score")

# One-way ANOVA of mean sentiment across the three roles.
role.aov <- aov(senti.mean ~ role, data = users.ex)
summary(role.aov)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## role            2   11.6   5.786   180.1 <2e-16 ***
## Residuals   66637 2141.1   0.032                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Tukey HSD post-hoc comparisons on the model currently stored in role.aov.
TukeyHSD(x = role.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = senti.mean ~ role, data = users.ex)
## 
## $role
##                          diff          lwr        upr     p adj
## mix-creator       0.026947064  0.018405989 0.03548814 0.0000000
## retweeter-creator 0.030367237  0.026607380 0.03412709 0.0000000
## retweeter-mix     0.003420173 -0.004725811 0.01156616 0.5869729

The difference in mean sentiment score among the three groups is significant (p < 2e-16). Furthermore, our post-hoc test shows that the mix–creator and retweeter–creator differences are both significant (adjusted p < 0.001), whereas the retweeter–mix difference is not (adjusted p ≈ 0.59). We conclude that the tweets posted by the “creators” tend to have a more negative sentiment.

2.2.5.4 Compare by gender group

# Box plots of per-user mean sentiment for each gender category.
users.ex %>%
  ggplot(aes(x = gender, y = senti.mean)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "", y = "sentiment score")

# One-way ANOVA of mean sentiment across gender categories.
# NOTE: the fitted model is stored under the existing name role.aov, which
# overwrites whatever model that name held before.
role.aov <- aov(senti.mean ~ gender, data = users.ex)
summary(role.aov)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## gender          3    3.2  1.0775   33.41 <2e-16 ***
## Residuals   66636 2149.4  0.0323                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Tukey HSD post-hoc comparisons on the model currently stored in role.aov.
TukeyHSD(x = role.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = senti.mean ~ gender, data = users.ex)
## 
## $gender
##                         diff          lwr           upr     p adj
## MALE-FEMALE    -0.0156877802 -0.020724003 -1.065156e-02 0.0000000
## UNISEX-FEMALE  -0.0085301278 -0.016866177 -1.940784e-04 0.0425731
## UNKNOWN-FEMALE -0.0165390657 -0.021045649 -1.203248e-02 0.0000000
## UNISEX-MALE     0.0071576524 -0.001135818  1.545112e-02 0.1185321
## UNKNOWN-MALE   -0.0008512856 -0.005278613  3.576042e-03 0.9604574
## UNKNOWN-UNISEX -0.0080089380 -0.015991883 -2.599348e-05 0.0489000

The difference in mean sentiment score among gender groups is significant (p < 2e-16). Furthermore, our post-hoc test shows that the male–female and unknown–female differences are both significant (adjusted p < 0.001), which indicates that the tweets posted by the (known) female users tend to have a more neutral sentiment.

3 Emotion Extraction

We use the NRC lexicon for emotion analysis. It categorizes words in a binary fashion (“yes”/“no”) into 8 basic emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust) and 2 sentiments (positive/negative). Each tweet is therefore associated with a 10-dimensional vector, which stores the count of its words belonging to each of these 10 categories. Here we only look at the first 8 dimensions (the emotion part). We then apply the k-means algorithm to cluster the latent “emotion groups” in the Twitter discussion, and the result is visualized with an Euler diagram. The whole procedure is encapsulated in one simple function plot_nrc().

3.1 For all unique tweets

set.seed(1000) # fixed seed so the plot layout is reproducible

# Read the saved NRC table, drop its last two columns (the sentiment
# attributes), and pass the rest through the project's emotion-extraction and
# Euler/k-means helpers before plotting.
nrc <- fread("data/nrc_uniq.csv")
as.matrix(nrc) %>%
  drop_lastN(2) %>% # leave out the sentiment attribute
  extract_emo() %>%
  euler_km() %>%
  plot()

The dominant emotion across the whole discussion is “fear”, both on its own and in combination with “anger” and “sadness”.

Text Analysis

Steven Liu

May 2019