The original plan was to collect the tweets in R (rtweet), but it couldn't be set up in time. Neither could the reticulate package, but we could run Python directly, so the tweets were collected with tweepy's Cursor function.

import json
import numpy as np
import tweepy
import pandas as pd
from pandas.io.json import json_normalize  # in pandas >= 1.0, use: from pandas import json_normalize
# Twitter API credentials (redacted)
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

# Authenticate and build the API client, sleeping whenever the rate limit is hit
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
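# Note: this code targets tweepy v3. In tweepy v4, wait_on_rate_limit_notify
# was removed and API.search was renamed to API.search_tweets.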
def df_Search(searchTerm):
    # Page through all matching tweets, flatten each tweet's JSON into a
    # one-row data frame, and stack them together.
    frames = []
    for tweet in tweepy.Cursor(api.search, q=searchTerm, count=100,
                               result_type="recent", include_entities=True,
                               lang="en").items():
        frames.append(json_normalize(tweet._json))
    return pd.concat(frames, ignore_index=True)
hebatnya, menangbesar = df_Search('#HebatkanNegaraku'), df_Search('#KitaMestiMenang')
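The two data frames were then exported (presumably with pandas' `to_csv`) as `hebatnya.csv` and `menangbesar.csv`, which the R session below reads back in.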
library(DescTools)
library(readr)
library(patchwork) # for composing ggplots with the + operator
library(tidyverse)
hebatnya <- read_csv("hebatnya.csv")
menangbesar <- read_csv("menangbesar.csv")
menangbesar %>%
  group_by(text) %>%
  summarise(mean_favourites = mean(as.numeric(favorite_count), na.rm = TRUE),
            retweets = sum(retweet_count)) %>%
  arrange(desc(mean_favourites)) %>% head(5) %>%
  knitr::kable(format = 'html')
| text | mean_favourites | retweets |
|---|---|---|
| #MayThe4th be with you, join the resistance! #KitaMestiMenang https://t.co/Kg8lE5y6f4 | 3014 | 1904 |
| May the fourth be with you! Join the resistance. #KitaMestiMenang https://t.co/j4fIoa1fpN | 1903 | 1130 |
| From the bottom of my heart, give the Pakatan Harapan a chance to prove themselves worth enough for Malaysia. #GE14… https://t.co/PvOpEuWTAO | 461 | 342 |
| Good luck @chedetofficial - if anyone can reach out to the Malay voters that another govt is possible other than Um… https://t.co/RsQzDVv3Mn | 278 | 156 |
| DECIDE WHO WILL DETERMINE THE FUTURE OF MALAYSIA! Download the game now! #PRU14 #GE14 #1MALAYSIA #jomBN #IniKaliLah… https://t.co/O6HN7n7QH8 | 228 | 25 |
hebatnya %>%
  group_by(text) %>%
  summarise(mean_favourites = mean(as.numeric(favorite_count), na.rm = TRUE),
            retweets = sum(retweet_count)) %>%
  arrange(desc(mean_favourites)) %>% head(5) %>%
  knitr::kable(format = 'html')
## Warning in mean(as.numeric(favorite_count), na.rm = TRUE): NAs introduced
## by coercion
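The coercion warning means some `favorite_count` values could not be parsed as numbers and became `NA`; `mean(..., na.rm = TRUE)` simply skips them.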
| text | mean_favourites | retweets |
|---|---|---|
| “For the Prime Minister to think about this, that means he’s very detailed. Targetting at the suffering of the peop… https://t.co/gEmRmtaLi1 | 480 | 313 |
| “Once the High Speed Rail is completed, you have less travelling time, increased productivity, enhancing the trade… https://t.co/bkZyPQxGG3 | 467 | 317 |
| “The Manifesto is driven towards getting women to hold 30% of decision-making powers across all sectors.” - Dato’ B… https://t.co/Ky37pnXNCs | 458 | 314 |
| “Right now this manifesto is very much looking at giving this opportunity to youths, handing down the reigns of the… https://t.co/g7fIXBjApv | 412 | 273 |
| I’m glad Wong Tack apologised. Let’s hope there are no more baseless allegations coming this way.… https://t.co/czk1kCAj1o | 226 | 179 |
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
hebatnya$text %>%
iconv("latin1", "ASCII", sub="") %>% # remove Chinese and othe non-ASCII characters
VectorSource %>%
Corpus %>%
tm_map(toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") %>%
tm_map(toSpace, "@") %>%
tm_map(toSpace, "\\|") %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
TermDocumentMatrix %>%
as.matrix %>%
rowSums %>%
sort(decreasing=TRUE) -> v
d <- data.frame(word = names(v), freq=v)
menangbesar$text %>%
iconv("latin1", "ASCII", sub="") %>% # remove Chinese and othe non-ASCII characters
VectorSource %>%
Corpus %>%
tm_map(toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") %>%
tm_map(toSpace, "@") %>%
tm_map(toSpace, "\\|") %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
TermDocumentMatrix %>%
as.matrix %>%
rowSums %>%
sort(decreasing=TRUE) -> v
d2 <- data.frame(word = names(v), freq=v)
head(d, 10)
head(d2, 10)
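The same cleaning pipeline is written out twice above; it could be factored into a helper. A minimal sketch, assuming the libraries already loaded (`term_freqs` is a name introduced here, not from the original post):

```r
# Hypothetical helper wrapping the cleaning pipeline used for both hashtags.
term_freqs <- function(text) {
  text %>%
    iconv("latin1", "ASCII", sub = "") %>%               # drop non-ASCII characters
    VectorSource() %>%
    Corpus() %>%
    tm_map(toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") %>%  # strip URLs
    tm_map(toSpace, "@") %>%
    tm_map(toSpace, "\\|") %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    TermDocumentMatrix() %>%
    as.matrix() %>%
    rowSums() %>%
    sort(decreasing = TRUE)
}

v <- term_freqs(hebatnya$text)
d <- data.frame(word = names(v), freq = v)
```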
d <- d[d$word != 'hebatkannegaraku',]  # drop the search hashtag itself
d$word <- factor(d$word)               # refresh the factor levels
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"), scale=c(3,0.25))
title('#HebatkanNegaraku', sub = 'Top 50 words by frequency')
d2 <- d2[d2$word != 'kitamestimenang',]  # drop the search hashtag itself
d2$word <- factor(d2$word)               # refresh the factor levels
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"), scale=c(3,0.25))
title('#KitaMestiMenang', sub = 'Top 50 words by frequency')
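wordcloud() rotates a random subset of words (`rot.per`), so the layout changes between runs; fixing the seed before each call makes the clouds reproducible:

```r
set.seed(14)  # hypothetical seed; any fixed value makes the layout reproducible
```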
hebatnya %>%
select('user.created_at') %>%
table %>%
Entropy %>% round(2)
## [1] 9.89
menangbesar %>%
select('user.created_at') %>%
table %>%
Entropy %>% round(2)
## [1] 11.2
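For context: DescTools' `Entropy()` returns the Shannon entropy of the frequency table in bits (base 2 by default). The more evenly the tweets spread over distinct account-creation timestamps, the higher the value; a low value would point at tweets concentrated in a few batch-created accounts. A quick sanity check of the measure:

```r
library(DescTools)
Entropy(table(c("a", "a", "b", "b")))  # two equally likely outcomes -> 1 bit
Entropy(table(rep("a", 4)))            # a single outcome -> 0 bits
```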
hebatnya %>% group_by(user.created_at) %>% summarise(count = n()) -> a
menangbesar %>% group_by(user.created_at) %>% summarise(count = n()) -> b
str_sub(a$user.created_at, -4,-1) %>%
table() %>% as.data.frame() %>%
ggplot(aes(x = ., Freq, fill = . )) + geom_bar(stat = 'identity') +
theme_minimal() + theme(legend.position="none") + coord_flip() +
labs(title = 'Distribution of account age', subtitle = '#HebatkanNegaraku',
x = 'Year user created', y = 'Tweets') + {
str_sub(b$user.created_at, -4,-1) %>%
table() %>% as.data.frame() %>%
ggplot(aes(x = ., Freq, fill = . )) + geom_bar(stat = 'identity') +
theme_minimal() + theme(legend.position="none") + coord_flip() +
labs(subtitle = '#KitaMestiMenang', x = 'Year user created', y = 'Tweets') }
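Note the braces around the second chain: they group the whole pipe-and-ggplot expression into a single operand, so patchwork's `+` composes the two plots rather than R's operator precedence splitting the expression apart.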
hebatnya %>%
select('user.time_zone') %>%
table %>% Entropy %>%
round(2)
## [1] 2.02
menangbesar %>%
select('user.time_zone') %>%
table %>% Entropy %>%
round(2)
## [1] 2.3
hebatnya %>%
group_by(user.time_zone) %>%
summarise(count = n()) %>%
na.omit %>%
mutate(percent = count/nrow(hebatnya)*100,
pct = paste0(round(count/nrow(hebatnya)*100, 1),'%')) -> a1
menangbesar %>%
group_by(user.time_zone) %>%
summarise(count = n()) %>%
na.omit %>%
mutate(percent = count/nrow(menangbesar)*100,
pct = paste0(round(count/nrow(menangbesar)*100, 1),'%')) -> b1
a1[a1$percent > 1,] %>%
ggplot(aes(x = reorder(user.time_zone, -count), y = count,
fill = user.time_zone)) +
geom_bar(stat = 'identity') + coord_flip() +
geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") +
labs(title = 'Distribution of time zones', subtitle = '#HebatkanNegaraku',
x = 'Time zone', y = 'Number of tweets') + {
b1[b1$percent >1,] %>%
ggplot(aes(x = reorder(user.time_zone, -count), y = count,
fill = user.time_zone)) +
geom_bar(stat = 'identity') + coord_flip() +
geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") +
labs(subtitle = '#KitaMestiMenang', x = 'Time zone', y = 'Number of tweets',
caption = 'Only percentages over 1% reported.')
} +
plot_layout(ncol = 1, heights = c(.4, .6))
hebatnya %>%
select('user.screen_name') %>%
table %>% Entropy %>% round(2)
## [1] 9.89
menangbesar %>%
select('user.screen_name') %>%
table %>% Entropy %>% round(2)
## [1] 11.2
hebatnya %>%
group_by(user.screen_name) %>%
summarise(count = n()) %>%
na.omit %>%
mutate(pct = paste0(round(count/nrow(hebatnya)*100, 1),'%')) %>%
arrange(desc(count)) %>%
head(10) -> a2
menangbesar %>%
group_by(user.screen_name) %>%
summarise(count = n()) %>%
na.omit %>%
mutate(pct = paste0(round(count/nrow(menangbesar)*100, 1),'%')) %>%
arrange(desc(count)) %>%
head(10) -> b2
a2[a2$pct != '0%',] %>%
ggplot(aes(x = reorder(user.screen_name, -count), y = count,
fill = user.screen_name)) +
geom_bar(stat = 'identity') + coord_flip() +
geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") +
labs(title = "Breakdown of users",
subtitle = '#HebatkanNegaraku', x = 'User screen name', y = 'Number of tweets') +
{
b2[b2$pct != '0%',] %>%
ggplot(aes(x = reorder(user.screen_name, -count), y = count,
fill = user.screen_name)) +
geom_bar(stat = 'identity') + coord_flip() +
geom_text(aes(label = pct)) + theme_minimal() + theme(legend.position="none") +
labs(subtitle = '#KitaMestiMenang',
x = 'User screen name', y = 'Number of tweets')
}
menangbesar$geomtext <- ifelse(menangbesar$retweet_count > 1000 &
                                 menangbesar$favorite_count > 1000,
                               menangbesar$user.screen_name, "")
ggplot() +
  geom_jitter(data = menangbesar,
              aes(favorite_count, retweet_count, color = '#KitaMestiMenang')) +
  geom_jitter(data = hebatnya,
              aes(as.numeric(favorite_count), retweet_count,
                  color = '#HebatkanNegaraku')) +
  geom_text(data = menangbesar,
            aes(favorite_count, retweet_count, label = geomtext),
            nudge_x = -400) +
  theme_minimal() +
  theme(legend.position = 'bottom') +
  labs(title = 'Retweets and favourites',
       subtitle = "'#MayThe4th be with you' and friends",
       x = 'Favourite count',
       y = 'Retweet count',
       caption = "Tweets above 1000 retweet counts are retweets of SyedSaddiq and chedetofficial's #MayThe4th tweets.\nEach data point represents a tweet.") +
  scale_color_manual(values = c('blue', 'red')) + xlim(0, 3500)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning: Removed 1666 rows containing missing values (geom_point).
## Warning: Removed 1587 rows containing missing values (geom_point).
## Warning: Removed 3432 rows containing missing values (geom_text).
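The "Removed ... rows" warnings come from `xlim()`: setting scale limits this way drops every point outside the range before plotting. If the goal is only to zoom, `coord_cartesian()` keeps all the data; a generic illustration with the built-in mtcars data:

```r
library(ggplot2)
p <- ggplot(mtcars, aes(wt, mpg)) + geom_point()
p + xlim(2, 4)                       # filters: out-of-range points dropped, with a warning
p + coord_cartesian(xlim = c(2, 4))  # zooms: all points kept
```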