Motivation

In the run-up to Malaysia's 14th general election (GE14), both sides of the campaign pushed hashtags on Twitter: #HebatkanNegaraku and #KitaMestiMenang. This post scrapes tweets carrying each hashtag and compares them on engagement, word usage, account age, time zones, and the users doing the tweeting.

Setting up

import tweepy
import pandas as pd

# Fill in your own Twitter app credentials
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# wait_on_rate_limit makes tweepy sleep through rate-limit windows instead of failing
# (tweepy v3 API shown; v4 renamed api.search to api.search_tweets and
# dropped wait_on_rate_limit_notify)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

def df_Search(searchTerm):
    """Page through recent English tweets matching searchTerm and flatten
    each tweet's nested JSON into one wide row."""
    frames = []
    for tweet in tweepy.Cursor(api.search, q=searchTerm, count=100,
                               result_type="recent", include_entities=True,
                               lang="en").items():
        frames.append(pd.json_normalize(tweet._json))
    # one concat at the end is much faster than appending row-by-row
    return pd.concat(frames, ignore_index=True)

hebatnya, menangbesar = df_Search('#HebatkanNegaraku'), df_Search('#KitaMestiMenang')

# export for the R analysis that follows
hebatnya.to_csv('hebatnya.csv', index=False)
menangbesar.to_csv('menangbesar.csv', index=False)
The analysis continues in R, reading the scraped tweets back in. The dotted column names used below, such as user.created_at, come from json_normalize flattening the nested tweet JSON.

library(DescTools) # Entropy()
library(readr)
library(patchwork) # for composing ggplots with + and plot_layout()
library(tidyverse)

hebatnya <- read_csv("hebatnya.csv")
menangbesar <- read_csv("menangbesar.csv")
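One of the files has favorite_count read in as character (hence the coercion warnings further down); readr can pin the column type at import time to head that off. A sketch, assuming the column name matches the CSV:

hebatnya <- read_csv("hebatnya.csv",
                     col_types = cols(favorite_count = col_number(),
                                      .default = col_guess()))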

Top tweets by average favourite count

menangbesar %>% 
  group_by(text) %>% # grouping by text pools identical retweets of the same tweet
  summarise(mean_favourites = mean(as.numeric(favorite_count), na.rm = TRUE), 
            retweets = sum(retweet_count)) %>% 
  arrange(desc(mean_favourites)) %>% head(5) %>% 
  knitr::kable(format = 'html')
text mean_favourites retweets
#MayThe4th be with you, join the resistance! #KitaMestiMenang https://t.co/Kg8lE5y6f4 3014 1904
May the fourth be with you! Join the resistance. #KitaMestiMenang https://t.co/j4fIoa1fpN 1903 1130
From the bottom of my heart, give the Pakatan Harapan a chance to prove themselves worth enough for Malaysia. #GE14… https://t.co/PvOpEuWTAO 461 342
Good luck @chedetofficial - if anyone can reach out to the Malay voters that another govt is possible other than Um… https://t.co/RsQzDVv3Mn 278 156
DECIDE WHO WILL DETERMINE THE FUTURE OF MALAYSIA! Download the game now! #PRU14 #GE14 #1MALAYSIA #jomBN #IniKaliLah… https://t.co/O6HN7n7QH8 228 25
hebatnya %>% 
  group_by(text) %>% 
  summarise(mean_favourites = mean(as.numeric(favorite_count), na.rm = TRUE), 
            retweets = sum(retweet_count)) %>% 
  arrange(desc(mean_favourites)) %>% head(5) %>% 
  knitr::kable(format = 'html')
## Warning: NAs introduced by coercion

(favorite_count contains some non-numeric values in this extract, so as.numeric still warns.)
text mean_favourites retweets
“For the Prime Minister to think about this, that means he’s very detailed. Targetting at the suffering of the peop… https://t.co/gEmRmtaLi1 480 313
“Once the High Speed Rail is completed, you have less travelling time, increased productivity, enhancing the trade… https://t.co/bkZyPQxGG3 467 317
“The Manifesto is driven towards getting women to hold 30% of decision-making powers across all sectors.” - Dato’ B… https://t.co/Ky37pnXNCs 458 314
“Right now this manifesto is very much looking at giving this opportunity to youths, handing down the reigns of the… https://t.co/g7fIXBjApv 412 273
I’m glad Wong Tack apologised. Let’s hope there are no more baseless allegations coming this way.… https://t.co/czk1kCAj1o 226 179

Wordclouds

library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Shared cleaning pipeline: returns a named vector of word frequencies,
# sorted in decreasing order
word_freqs <- function(text) {
  text %>% 
    iconv("latin1", "ASCII", sub = "") %>% # remove Chinese and other non-ASCII characters
    VectorSource %>% 
    Corpus %>%
    tm_map(toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") %>% # strip URLs
    tm_map(toSpace, "@") %>% 
    tm_map(toSpace, "\\|") %>% 
    tm_map(content_transformer(tolower)) %>% 
    tm_map(removeNumbers) %>% 
    tm_map(removeWords, stopwords("english")) %>% 
    tm_map(removePunctuation) %>% 
    tm_map(stripWhitespace) %>% 
    TermDocumentMatrix %>%
    as.matrix %>% 
    rowSums %>% 
    sort(decreasing = TRUE)
}

v <- word_freqs(hebatnya$text)
d <- data.frame(word = names(v), freq = v)

v2 <- word_freqs(menangbesar$text)
d2 <- data.frame(word = names(v2), freq = v2)

head(d, 10)
head(d2, 10)
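To see what the cleaning steps actually do, here are the core transformations applied directly to a single made-up tweet (a hypothetical example string; the output comment is approximate):

"RT @someone: Vote wisely, vote now! #GE14 https://t.co/xyz" %>%
  gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", " ", .) %>% # same URL regex as toSpace above
  gsub("@", " ", .) %>%
  tolower %>%
  removeNumbers %>%
  removeWords(stopwords("english")) %>% # drops "now" and friends
  removePunctuation %>%
  stripWhitespace
# -> roughly: "rt someone vote wisely vote ge xyz"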
d <- d[d$word != 'hebatkannegaraku',] # drop the search hashtag itself before plotting
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), scale=c(3,0.25))
title('#HebatkanNegaraku', sub = 'Top 50 words by frequency')

d2 <- d2[d2$word != 'kitamestimenang',] # drop the search hashtag itself before plotting
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
        max.words=50, random.order=FALSE, rot.per=0.35, 
        colors=brewer.pal(8, "Dark2"), scale=c(3,0.25))
title('#KitaMestiMenang', sub = 'Top 50 words by frequency')

User account creation dates

Shannon entropy of the account-creation timestamps gives a rough diversity score: the lower it is, the more the tweets are concentrated among accounts created at the same instants, which can hint at batch-created accounts.

hebatnya %>% 
  select('user.created_at') %>% 
  table %>% 
  Entropy %>% round(2)
## [1] 9.89
menangbesar %>% 
  select('user.created_at') %>% 
  table %>% 
  Entropy %>% round(2)
## [1] 11.2
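DescTools' Entropy() takes the table of counts and computes Shannon entropy. A minimal hand-rolled equivalent, assuming base-2 logs (the DescTools default), would be:

shannon <- function(counts) {
  p <- counts / sum(counts) # turn the frequency table into probabilities
  -sum(p * log2(p))         # H = -sum(p_i * log2(p_i))
}
shannon(table(hebatnya$user.created_at)) # ~9.89

The higher value for #KitaMestiMenang (11.2 vs 9.89) means its tweets are spread over a more varied set of creation timestamps.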
hebatnya %>% group_by(user.created_at) %>% summarise(count = n()) -> a

menangbesar %>% group_by(user.created_at) %>% summarise(count = n()) -> b

Twitter's created_at strings end in the year, so the last four characters give the account's creation year. Note that patchwork overloads +, so wrapping the second ggplot in braces keeps it a separate panel rather than adding layers to the first.

str_sub(a$user.created_at, -4, -1) %>% # year is the last four characters
  table() %>% as.data.frame() %>% # as.data.frame(table(.)) names the grouping column "."
  ggplot(aes(x = ., y = Freq, fill = .)) + geom_bar(stat = 'identity') +
  theme_minimal() + theme(legend.position = "none") + coord_flip() + 
  labs(title = 'Distribution of account age', subtitle = '#HebatkanNegaraku', 
       x = 'Year user created', y = 'Tweets') + {
    str_sub(b$user.created_at, -4, -1) %>% 
      table() %>% as.data.frame() %>%
      ggplot(aes(x = ., y = Freq, fill = .)) + geom_bar(stat = 'identity') +
      theme_minimal() + theme(legend.position = "none") + coord_flip() +
      labs(subtitle = '#KitaMestiMenang', x = 'Year user created', y = 'Tweets') }
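If you would rather not rely on string position, the timestamp can be parsed outright. A sketch, where the format string assumes the raw Twitter form "Mon May 07 10:10:10 +0000 2018" and an English locale:

# parse the whole timestamp, then pull out the year
created <- as.POSIXct(a$user.created_at,
                      format = "%a %b %d %H:%M:%S %z %Y", tz = "UTC")
years <- format(created, "%Y")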

User time zone

hebatnya %>% 
  select('user.time_zone') %>% 
  table %>% Entropy %>% 
  round(2)
## [1] 2.02
menangbesar %>% 
  select('user.time_zone') %>% 
  table %>% Entropy %>% 
  round(2)
## [1] 2.3
hebatnya %>% 
  group_by(user.time_zone) %>% 
  summarise(count = n()) %>%
  na.omit %>% 
  mutate(percent = count/nrow(hebatnya)*100, 
         pct = paste0(round(count/nrow(hebatnya)*100, 1),'%')) -> a1

menangbesar %>% 
  group_by(user.time_zone) %>% 
  summarise(count = n()) %>%
  na.omit %>%
  mutate(percent = count/nrow(menangbesar)*100, 
         pct = paste0(round(count/nrow(menangbesar)*100, 1),'%')) -> b1

a1[a1$percent > 1,] %>% # filter on the numeric column; pct is a label string like "3.4%"
  ggplot(aes(x = reorder(user.time_zone, -count), y = count, 
             fill = user.time_zone)) + 
  geom_bar(stat = 'identity') + coord_flip() + 
  geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") + 
  labs(title = 'Distribution of time zones', subtitle = '#HebatkanNegaraku', 
       x = 'Time zone', y = 'Number of tweets') +  {
    b1[b1$percent > 1,] %>% 
    ggplot(aes(x = reorder(user.time_zone, -count), y = count, 
               fill = user.time_zone)) + 
    geom_bar(stat = 'identity') + coord_flip() + 
    geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") + 
    labs(subtitle = '#KitaMestiMenang', x = 'Time zone', y = 'Number of tweets', 
         caption = 'Only percentages over 1% reported.')
    } + 
  plot_layout(ncol = 1, heights = c(.4, .6))

User screen name

hebatnya %>% 
  select('user.screen_name') %>% 
  table %>% Entropy %>% round(2)
## [1] 9.89
menangbesar %>% 
  select('user.screen_name') %>% 
  table %>% Entropy %>% round(2)
## [1] 11.2
hebatnya %>% 
  group_by(user.screen_name) %>% 
  summarise(count = n()) %>%
  na.omit %>%
  mutate(pct = paste0(round(count/nrow(hebatnya)*100, 1),'%')) %>% 
  arrange(desc(count)) %>% 
  head(10) -> a2

menangbesar %>% 
  group_by(user.screen_name) %>% 
  summarise(count = n()) %>%
  na.omit %>%
  mutate(pct = paste0(round(count/nrow(menangbesar)*100, 1),'%')) %>% 
  arrange(desc(count)) %>% 
  head(10) -> b2

a2[a2$pct != '0%',] %>% 
  ggplot(aes(x = reorder(user.screen_name, -count), y = count, 
             fill = user.screen_name)) + 
  geom_bar(stat = 'identity') + coord_flip() + 
  geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") + 
  labs(title = "Breakdown of users", 
       subtitle = '#HebatkanNegaraku', x = 'User screen name', y = 'Number of tweets') +
  {
    b2[b2$pct != '0%',] %>% 
      ggplot(aes(x = reorder(user.screen_name, -count), y = count, 
             fill = user.screen_name)) + 
      geom_bar(stat = 'identity') + coord_flip() + 
      geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") +
      labs(subtitle = '#KitaMestiMenang', 
           x = 'User screen name', y = 'Number of tweets') 
    }

Retweets and favourites for each hashtag

menangbesar$geomtext <- ifelse(menangbesar$retweet_count > 1000 & menangbesar$favorite_count > 1000, 
                               menangbesar$user.screen_name, "") # label only the heavy hitters

ggplot() + 
  geom_jitter(data = menangbesar, 
              aes(favorite_count, retweet_count, color = '#KitaMestiMenang')) + 
  geom_jitter(data = hebatnya, # favorite_count came in as character for this file
              aes(as.numeric(favorite_count), retweet_count, color = '#HebatkanNegaraku')) +
  geom_text(data = menangbesar, 
            aes(favorite_count, retweet_count, label = geomtext), nudge_x = -400) + 
  theme_minimal() +
  theme(legend.position = 'bottom') + 
  labs(title = 'Retweets and favourites', 
       subtitle = "'#MayThe4th be with you' and friends", 
       x = 'Favourite count', y = 'Retweet count', 
       caption = "Tweets above 1000 retweets are retweets of SyedSaddiq's and chedetofficial's #MayThe4th tweets.\nEach data point represents a tweet.") +
  scale_color_manual(values = c('blue', 'red')) + xlim(0, 3500)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning: Removed 1666 rows containing missing values (geom_point).
## Warning: Removed 1587 rows containing missing values (geom_point).
## Warning: Removed 3432 rows containing missing values (geom_text).
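The nudged geom_text labels can still collide when points sit close together. If that becomes a problem, the ggrepel package (an assumption here; it is not used elsewhere in this post) offers a drop-in alternative:

library(ggrepel)

# geom_text_repel pushes labels away from points and from each other;
# filtering out empty labels first keeps blanks from being placed
labelled <- subset(menangbesar, geomtext != "")
ggplot(menangbesar, aes(favorite_count, retweet_count)) +
  geom_jitter(colour = 'red') +
  geom_text_repel(data = labelled, aes(label = geomtext)) +
  theme_minimal()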

Conclusions