Setup

library(tidyverse)    # data wrangling and ggplot2
library(rtweet)       # Twitter API: get_trends(), search_tweets()
library(httpuv)       # browser-based authentication for rtweet
library(qdapRegex)    # rm_twitter_url()
library(tm)           # corpus tools: Corpus(), tm_map(), removeWords
library(qdap)         # freq_terms()
library(wordcloud)    # wordcloud()
library(RColorBrewer) # color palettes
library(plyr)         # note: loaded after tidyverse, so plyr masks some dplyr verbs

Problem 1

Get a list of current trending topics in the United States and pick one that interests you. Indicate your choice.

I chose the term “Cubs”

gtny = get_trends("United States")

gtny %>% arrange(desc(tweet_volume)) %>% 
  select(query, tweet_volume) %>% 
  distinct() %>% 
  head(10)
## # A tibble: 10 x 2
##    query                   tweet_volume
##    <chr>                          <int>
##  1 Tiger                         173654
##  2 %23OpeningDay                 112730
##  3 SCOTUS                        112482
##  4 Cubs                           58487
##  5 McConnell                      56511
##  6 %22Tel+Aviv%22                 52562
##  7 %22National+Treasure%22        50970
##  8 Mets                           43933
##  9 Braves                         42734
## 10 Cardinals                      38401
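
As a quick check (an added snippet, not part of the assignment code), the chosen topic's volume can be pulled straight from the same trends table:

# Confirm the tweet volume for the chosen trend
gtny %>% 
  filter(query == "Cubs") %>% 
  select(query, tweet_volume) %>% 
  distinct()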

Problem 2

Use the code we developed in class to get a cleaned corpus consisting of the texts in 6,000 tweets on your topic. Make sure that your tweets are original and in English. Also produce a list of the 60 most common terms in these tweets.

term = "Cubs"
ntweets = 6000
# Keep only original tweets: no replies, retweets, or quote tweets
twts = search_tweets(term, n = ntweets, lang = "en") %>% 
  filter(is.na(reply_to_screen_name)  &
         is_retweet == FALSE &
         is_quote == FALSE)
# Pull out just the tweet text
twt_txt = twts$text
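
Note that search_tweets() returns at most n matching tweets and the filter then drops replies, retweets, and quotes, so fewer than 6,000 texts may remain. A quick sanity check (added here, not in the original):

# How many original English tweets survived the filter?
nrow(twts)

rtweet's search_tweets() also accepts include_rts = FALSE, which avoids retrieving retweets in the first place.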

# Remove URLs from the tweet
twt_txt_url <- rm_twitter_url(twt_txt)
# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs  <- gsub("[^A-Za-z]"," " , twt_txt_url)
# Convert the cleaned text in "twt_txt_chrs" to a text corpus
twt_corpus <- twt_txt_chrs %>% 
                VectorSource() %>% 
                Corpus() 
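
To spot-check the cleaned documents before further processing, tm's inspect() can be used (an added check, not in the original):

# View the first few documents in the corpus
inspect(twt_corpus[1:3])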

# Convert the corpus to lowercase
twt_corpus_final <- tm_map(twt_corpus, tolower) 
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
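
The "transformation drops documents" warning is a known message from tm_map() on a SimpleCorpus; the text is still lower-cased, as the terms in Problem 4 confirm. The problem also asks for the 60 most common terms; a minimal sketch using qdap's freq_terms() (this listing is not shown in the original):

# List the 60 most frequent terms in the cleaned corpus
termfreq_60 <- freq_terms(twt_corpus_final, 60)
termfreq_60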

Problem 3

Refine your corpus by removing a set of custom stopwords.

custom_stopwds <- c("ukraine", " s", "amp", "can", "t", "via")

# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_final,removeWords, custom_stopwds) 
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
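
Standard English stopwords ("the", "to", "a", ...) still dominate the counts in Problem 4. If desired, they can be stripped with tm's built-in list; this extra step is a sketch, not part of the original solution (corp_refined2 is a hypothetical name):

# Optional further refinement: drop standard English stopwords as well
corp_refined2 <- tm_map(corp_refined, removeWords, stopwords("english"))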

Problem 4

Produce a barplot of the 10 most common terms in your refined corpus.

# Extract term frequencies for the top 20 words (not used further below)
termfreq_clean <- freq_terms(corp_refined, 20)

# Extract term frequencies for the top 10 words
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
##    WORD FREQ
## 1  cubs  948
## 2  the   771
## 3  to    293
## 4  a     283
## 5  and   261
## 6  day   221
## 7  in    211
## 8  i     204
## 9  win   198
## 10 on    182
term60 <- subset(termfreq_10w, FREQ > 60)

# Create a bar plot using terms with more than 60 counts
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) + 
        geom_bar(stat = "identity", fill = "red") + 
        theme(axis.text.x = element_text(angle = 15, hjust = 1))

Problem 5

Produce a wordcloud of the 10 most common terms in your refined corpus.

wordcloud(corp_refined, max.words = 10, colors = "green", 
    scale = c(3,0.5),random.order = FALSE)
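
RColorBrewer is loaded but unused by the single-color call above. As an added variant (a sketch, not the required output), explicit term frequencies can be computed from a term-document matrix and the cloud colored with a Brewer palette:

# Build explicit term frequencies and recolor the wordcloud
tdm <- TermDocumentMatrix(corp_refined)
word_freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(words = names(word_freqs), freq = word_freqs,
          max.words = 10, colors = brewer.pal(8, "Dark2"),
          scale = c(3, 0.5), random.order = FALSE)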