Drew Conway’s Comparative Cloud

Drew Conway has proposed a very interesting variation of the word cloud for comparing the words and terms of two texts (see his "better word cloud": https://github.com/drewconway/ZIA/tree/master/R/better_word_cloud). Following his example, we can try his approach on tweets from @realDonaldTrump and @cnn.

Step 1: Load Packages and Set Twitter API Credentials

# load packages
library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
library(tm)
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

You need to use your own Twitter API keys:

# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

Step 2: Collect tweets from Trump and CNN

setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
# collect tweets
trump_tweets = userTimeline("realDonaldTrump", n=1500)
cnn_tweets = userTimeline("cnn", n=1500)
# get text
trump_txt = sapply(trump_tweets, function(x) x$getText())
cnn_txt = sapply(cnn_tweets, function(x) x$getText())
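
It is worth a quick sanity check that the API calls actually returned tweets; userTimeline often returns fewer than n statuses because retweets are excluded by default and the API paginates:

# quick sanity check (counts will vary by when you run this)
length(trump_txt)
length(cnn_txt)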

Step 3: Create function to clean text

# clean text function
clean.text <- function(some_txt)
{
  # remove retweet entities
  some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # remove @mentions
  some_txt = gsub("@\\w+", "", some_txt)
  # remove punctuation
  some_txt = gsub("[[:punct:]]", "", some_txt)
  # remove digits
  some_txt = gsub("[[:digit:]]", "", some_txt)
  # remove URLs (punctuation is already gone, so "httpstco..." still matches)
  some_txt = gsub("http\\w+", "", some_txt)
  # collapse runs of spaces/tabs into a single space
  some_txt = gsub("[ \t]{2,}", " ", some_txt)
  # trim leading and trailing whitespace
  some_txt = gsub("^\\s+|\\s+$", "", some_txt)
  
  # define "tolower error handling" function, since tolower() can fail on odd characters
  try.tolower = function(x)
  {
    y = NA
    # attempt the conversion, catching any error
    try_error = tryCatch(tolower(x), error=function(e) e)
    if (!inherits(try_error, "error"))
      y = tolower(x)
    return(y)
  }
  
  some_txt = sapply(some_txt, try.tolower)
  some_txt = some_txt[some_txt != ""]
  names(some_txt) = NULL
  return(some_txt)
}
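
To see what clean.text does, here is a quick check on an invented example tweet (the input string is made up for illustration):

# try the cleaner on a made-up tweet
clean.text("RT @someone: Breaking News!!! 100% true https://t.co/abc123")
## [1] "breaking news true"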

Step 4: Use function to clean text

# clean text
trump_clean = clean.text(trump_txt)
cnn_clean = clean.text(cnn_txt)

# join cleaned texts in a single vector
trump = paste(trump_clean, collapse=" ")
cnn = paste(cnn_clean, collapse=" ")
trump_cnn = c(trump, cnn)
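
The result is a two-element character vector, so VectorSource will treat each account's combined text as one document in the next step:

# one "document" per account
length(trump_cnn)
## [1] 2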

Step 5: Create corpus and term-document matrix

# Corpus
or_corpus = Corpus(VectorSource(trump_cnn))

# remove stopwords
skipwords = c(stopwords("english"), "president", "presidents",
              "trump", "cnn", "video", "todays", "reads", "live", "watch")
or_corpus = tm_map(or_corpus, removeWords, skipwords)

# term-document matrix
# (use as.matrix: in current tm versions inspect() prints rather than returns)
tdm = TermDocumentMatrix(or_corpus)
or_df = as.data.frame(as.matrix(tdm))
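
A quick look at the resulting data frame confirms its shape: one row per term and one column per document (the exact dimensions depend on the tweets collected):

dim(or_df)
head(or_df)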

Step 6: Check word frequencies from the tweets

names(or_df) = c("trump.txt", "cnn.txt")
# get rid of low frequency words (keep terms used more than 5 times by each account)
or_df = subset(or_df, trump.txt > 5 & cnn.txt > 5)

# calculate frequency differences
or_df$freq.dif = or_df$trump.txt - or_df$cnn.txt

# tweeted more often by trump
trump_df = subset(or_df, freq.dif > 0)

# tweeted more often by cnn
cnn_df = subset(or_df, freq.dif < 0)

# tweeted equally often
both_df = subset(or_df, freq.dif == 0)
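
Before plotting, a quick peek at the most lopsided terms helps verify the split (the actual terms will vary with the tweets collected):

# terms most skewed towards trump and cnn, respectively
head(trump_df[order(-trump_df$freq.dif), ], 5)
head(cnn_df[order(cnn_df$freq.dif), ], 5)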

Step 7: Create a function to compute word spacing for the plot

# function to spread n words evenly along the y-axis
optimal.spacing <- function(spaces)
{
  if(spaces > 1) {
    spacing <- 1 / spaces
    if(spaces %% 2 > 0) {
      # odd number of words: one sits exactly at zero
      lim = spacing * floor(spaces/2)
      return(seq(-lim, lim, spacing))
    }
    else {
      # even number of words: straddle zero symmetrically
      lim = spacing * (spaces-1)
      return(seq(-lim, lim, spacing*2))
    }
  }
  else {
    # single word: jitter it slightly so identical points don't overlap
    return(jitter(0, amount=0.2))
  }
}
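
A couple of test calls illustrate the two branches: an odd count places one word at zero, while an even count straddles it:

# three words sharing a frequency are spread evenly around zero
optimal.spacing(3)
## [1] -0.3333333  0.0000000  0.3333333
# four words are spaced symmetrically, skipping zero
optimal.spacing(4)
## [1] -0.75 -0.25  0.25  0.75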

Step 8: Apply the optimal.spacing function

# Get spacing for each frequency type
trump_spacing = sapply(table(trump_df$freq.dif),
                       function(x) optimal.spacing(x))

cnn_spacing = sapply(table(cnn_df$freq.dif),
                        function(x) optimal.spacing(x))

both_spacing = sapply(table(both_df$freq.dif),
                      function(x) optimal.spacing(x))

Step 9: Add spacing column to data frames

# add spacings
trump_optim = rep(0, nrow(trump_df))
for(n in names(trump_spacing)) {
  trump_optim[trump_df$freq.dif == as.numeric(n)] <- trump_spacing[[n]]
}
trump_df = transform(trump_df, Spacing=trump_optim)

cnn_optim = rep(0, nrow(cnn_df))
for(n in names(cnn_spacing)) {
  cnn_optim[cnn_df$freq.dif == as.numeric(n)] <- cnn_spacing[[n]]
}
cnn_df = transform(cnn_df, Spacing=cnn_optim)

both_df$Spacing = as.vector(both_spacing)
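
Each data frame should now carry a Spacing column, which the plot below maps to the y-axis:

# confirm the Spacing column was added
head(trump_df, 3)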

Step 10: Let's visualize the word cloud

# suppress warnings
options(warn=-1)
# use ggplot
ggplot(trump_df, aes(x=freq.dif, y=Spacing)) +
  geom_text(aes(size=trump.txt, label=row.names(trump_df),
                colour=freq.dif), alpha=0.7, family='Times') +
  geom_text(data=cnn_df, aes(x=freq.dif, y=Spacing,
                             label=row.names(cnn_df), size=cnn.txt, color=freq.dif),
            alpha=0.7, family='Times') +
  geom_text(data=both_df, aes(x=freq.dif, y=Spacing,
                              label=row.names(both_df), size=trump.txt, color=freq.dif),
            alpha=0.7, family='Times') +
  scale_size(range=c(3,11)) +
  scale_colour_gradient(low="red3", high="blue3", guide="none") +
  scale_x_continuous(breaks=c(min(cnn_df$freq.dif), 0, max(trump_df$freq.dif)),
                     labels=c("Tweeted More by cnn", "Tweeted Equally", "Tweeted More by trump")) +
  scale_y_continuous(breaks=c(0), labels=c("")) +
  labs(x="", y="", size="Word Frequency",
       title="Conway's Word Cloud, Tweets (trump -vs- cnn)") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(family="Times", size=18))

# save plot as a jpg
ggsave("trump_cnn_ModifyCloud.jpg", width=13, height=8, units="in")