Drew Conway has proposed an interesting variant of the word cloud for comparing the words and terms of two texts (see his "better word cloud": https://github.com/drewconway/ZIA/tree/master/R/better_word_cloud). Following his example, we can try his approach on tweets from @realDonaldTrump and @cnn.
# load packages
library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
library(tm)
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
You need to use your own Twitter API keys: fill them in and uncomment the lines below before authenticating.
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
## [1] "Using direct authentication"
# collect tweets
trump_tweets = userTimeline("realDonaldTrump", n=1500)
cnn_tweets = userTimeline("cnn", n=1500)
# get text
trump_txt = sapply(trump_tweets, function(x) x$getText())
cnn_txt = sapply(cnn_tweets, function(x) x$getText())
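Each element of the returned list is a status object, so sapply() with getText() yields a character vector of tweet texts. If you also want metadata (dates, retweet counts), twitteR's twListToDF() flattens the list into a data frame; a sketch, not needed for the rest of the analysis:
# alternative: one data frame per account, tweet text in the $text column
# trump_raw = twListToDF(trump_tweets)
# trump_txt = trump_raw$text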
# clean text function
clean.text <- function(some_txt)
{
  # remove retweet entities and @-mentions
  some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  some_txt = gsub("@\\w+", "", some_txt)
  # remove links before stripping punctuation, while URLs are still intact
  some_txt = gsub("http\\S+", "", some_txt)
  # remove punctuation and digits
  some_txt = gsub("[[:punct:]]", "", some_txt)
  some_txt = gsub("[[:digit:]]", "", some_txt)
  # collapse runs of spaces/tabs into one space, then trim both ends
  some_txt = gsub("[ \t]{2,}", " ", some_txt)
  some_txt = gsub("^\\s+|\\s+$", "", some_txt)
  # "tolower error handling" function: tolower() can fail on tweets
  # with unusual encodings, so return NA instead of aborting
  try.tolower = function(x)
  {
    y = NA
    try_error = tryCatch(tolower(x), error=function(e) e)
    if (!inherits(try_error, "error"))
      y = tolower(x)
    return(y)
  }
  some_txt = sapply(some_txt, try.tolower)
  # drop empty strings and the names that sapply attaches
  some_txt = some_txt[some_txt != ""]
  names(some_txt) = NULL
  return(some_txt)
}
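A quick test on a made-up tweet shows what the cleaning strips out (the input string is hypothetical; the output assumes the function above):
# toy example on a hypothetical tweet
clean.text("RT @someone: Big news!! http://t.co/abc 100%")
## [1] "big news"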
# clean text
trump_clean = clean.text(trump_txt)
cnn_clean = clean.text(cnn_txt)
# collapse each account's tweets into one document, then join them in a single vector
trump = paste(trump_clean, collapse=" ")
cnn = paste(cnn_clean, collapse=" ")
trump_cnn = c(trump, cnn)
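trump_cnn is now a two-element character vector; VectorSource() below treats each element as a separate document:
# sanity check: one document per account
length(trump_cnn)
## [1] 2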
# Corpus
or_corpus = Corpus(VectorSource(trump_cnn))
# remove stopwords
skipwords = c(stopwords("english"), "president", "presidents",
"trump", "cnn", "video", "todays", "reads", "live", "watch")
or_corpus = tm_map(or_corpus, removeWords, skipwords)
# term-document matrix
tdm = TermDocumentMatrix(or_corpus)
# note: inspect(tdm) only prints a preview in recent tm versions;
# as.matrix() returns the full term counts
or_df = as.data.frame(as.matrix(tdm))
names(or_df) = c("trump.txt", "cnn.txt")
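Before filtering, a quick peek at the dominant terms on each side helps confirm the cleaning worked (plain base R, inspection only):
# top five terms for each account
head(or_df[order(or_df$trump.txt, decreasing=TRUE), ], 5)
head(or_df[order(or_df$cnn.txt, decreasing=TRUE), ], 5)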
# get rid of low frequency words: keep only words that
# both accounts tweet more than 5 times
or_df = subset(or_df, trump.txt>5 & cnn.txt>5)
# calculate frequency differences
or_df$freq.dif = or_df$trump.txt - or_df$cnn.txt
# tweeted more often by trump
trump_df = subset(or_df, freq.dif > 0)
# tweeted more often by cnn
cnn_df = subset(or_df, freq.dif < 0)
# tweeted equally often
both_df = subset(or_df, freq.dif == 0)
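The three group sizes depend on whichever tweets the API returned, so it is worth checking that none of the groups is empty before computing spacings:
# words per group (counts will vary with the collected tweets)
c(trump=nrow(trump_df), cnn=nrow(cnn_df), both=nrow(both_df))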
# given n words sharing the same frequency difference, spread
# them evenly along the y axis around zero
optimal.spacing <- function(spaces)
{
  if(spaces > 1) {
    spacing <- 1 / spaces
    if(spaces %% 2 > 0) {
      # odd count: symmetric positions including zero
      lim = spacing * floor(spaces/2)
      return(seq(-lim, lim, spacing))
    }
    else {
      # even count: symmetric positions straddling zero
      lim = spacing * (spaces-1)
      return(seq(-lim, lim, spacing*2))
    }
  }
  else {
    # a single word: jitter it so repeated singletons don't overlap
    return(jitter(0, amount=0.2))
  }
}
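Two small inputs show the pattern the function produces:
optimal.spacing(3)  # odd:  -0.3333  0.0000  0.3333
optimal.spacing(4)  # even: -0.75 -0.25  0.25  0.75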
# Get the y-axis spacings for each frequency-difference value
trump_spacing = sapply(table(trump_df$freq.dif), optimal.spacing)
cnn_spacing = sapply(table(cnn_df$freq.dif), optimal.spacing)
both_spacing = sapply(table(both_df$freq.dif), optimal.spacing)
# map the computed spacings back onto the rows of each data frame
trump_optim = rep(0, nrow(trump_df))
for(n in names(trump_spacing)) {
  trump_optim[trump_df$freq.dif == as.numeric(n)] <- trump_spacing[[n]]
}
trump_df = transform(trump_df, Spacing=trump_optim)
cnn_optim = rep(0, nrow(cnn_df))
for(n in names(cnn_spacing)) {
  cnn_optim[cnn_df$freq.dif == as.numeric(n)] <- cnn_spacing[[n]]
}
cnn_df = transform(cnn_df, Spacing=cnn_optim)
# both_df has a single freq.dif value (zero), so one vector suffices
both_df$Spacing = as.vector(both_spacing)
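Before plotting, a trivial check that every word received a y position:
# no missing Spacing values in any group
stopifnot(!any(is.na(trump_df$Spacing)),
          !any(is.na(cnn_df$Spacing)),
          !any(is.na(both_df$Spacing)))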
# suppress warnings printed by the plotting calls
options(warn=-1)
# use ggplot
ggplot(trump_df, aes(x=freq.dif, y=Spacing)) +
  geom_text(aes(size=trump.txt, label=row.names(trump_df),
                colour=freq.dif), alpha=0.7, family="Times") +
  geom_text(data=cnn_df, aes(x=freq.dif, y=Spacing,
                label=row.names(cnn_df), size=cnn.txt, colour=freq.dif),
            alpha=0.7, family="Times") +
  geom_text(data=both_df, aes(x=freq.dif, y=Spacing,
                label=row.names(both_df), size=trump.txt, colour=freq.dif),
            alpha=0.7, family="Times") +
  scale_size(range=c(3,11)) +
  scale_colour_gradient(low="red3", high="blue3", guide="none") +
  scale_x_continuous(breaks=c(min(cnn_df$freq.dif), 0, max(trump_df$freq.dif)),
                     labels=c("Tweeted More by cnn", "Tweeted Equally",
                              "Tweeted More by trump")) +
  scale_y_continuous(breaks=c(0), labels=c("")) +
  labs(x="", y="", size="Word Frequency",
       title="Conway's Word Cloud, Tweets (trump -vs- cnn)") +
  theme_bw() +
  # grid and title styling belong in theme(), not labs()
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(family="Times", size=18))
# save the plot as a jpg (ggsave() writes the last plot displayed)
ggsave("trump_cnn_ModifyCloud.jpg", width=13, height=8, units="in")