library(tidyverse)
library(rtweet)
library(httpuv)
library(qdapRegex)
library(tm)
library(qdap)
library(wordcloud)
library(RColorBrewer)
library(plyr)
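Note: rtweet needs an authorized token before get_trends() or search_tweets() will return data. With httpuv loaded, rtweet can authenticate interactively in the browser on the first API call; the sketch below is an alternative using create_token() (rtweet 0.7.x style), where the app name and environment-variable names are placeholder assumptions, not values used in this analysis.
# Optional non-interactive authentication (a sketch; the app name and
# environment-variable names are placeholders)
token <- create_token(
  app             = "my_twitter_app",
  consumer_key    = Sys.getenv("TWITTER_API_KEY"),
  consumer_secret = Sys.getenv("TWITTER_API_SECRET"),
  access_token    = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret   = Sys.getenv("TWITTER_ACCESS_SECRET")
)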
Get a list of current trending topics in the United States and pick one that interests you. Indicate your choice.
I chose the term “Cubs”.
# Pull current US trending topics and list the 10 highest-volume queries
gtny <- get_trends("United States")
gtny %>%
  arrange(desc(tweet_volume)) %>%
  select(query, tweet_volume) %>%
  distinct() %>%
  head(10)
## # A tibble: 10 x 2
## query tweet_volume
## <chr> <int>
## 1 Tiger 173654
## 2 %23OpeningDay 112730
## 3 SCOTUS 112482
## 4 Cubs 58487
## 5 McConnell 56511
## 6 %22Tel+Aviv%22 52562
## 7 %22National+Treasure%22 50970
## 8 Mets 43933
## 9 Braves 42734
## 10 Cardinals 38401
Use the code we developed in class to get a cleaned corpus consisting of the texts in 6,000 tweets on your topic. Make sure that your tweets are original and in English. Also produce a list of the 60 most common terms in these tweets.
term = "Cubs"
ntweets = 6000
twts = search_tweets(term, n = ntweets, lang = "en") %>%
filter(is.na(reply_to_screen_name) &
is_retweet == FALSE &
is_quote == FALSE)
twt_txt = twts$text
# Remove URLs from the tweet
twt_txt_url <- rm_twitter_url(twt_txt)
# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]"," " , twt_txt_url)
# Convert the cleaned text in "twt_txt_chrs" to a text corpus
twt_corpus <- twt_txt_chrs %>%
  VectorSource() %>%
  Corpus()
# Convert the corpus to lowercase
twt_corpus_final <- tm_map(twt_corpus, tolower)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
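The prompt also asks for the 60 most common terms; a minimal sketch using qdap's freq_terms() on the cleaned corpus (output omitted here):
# List the 60 most frequent terms in the cleaned corpus (output not shown)
termfreq_60 <- freq_terms(twt_corpus_final, 60)
termfreq_60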
Refine your corpus by removing a set of custom stopwords.
# Define a set of custom stopwords to strip from the corpus
custom_stopwds <- c("ukraine", " s", "amp", "can", "t", "via")
# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_final, removeWords, custom_stopwds)
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
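The frequency table below still contains common English words such as "the", "to", and "a". A minimal sketch of how tm's built-in English stopword list could also be stripped (not applied to the results shown below):
# Optionally remove the standard English stopwords as well
# (corp_no_stop is a hypothetical object; the results below use corp_refined)
corp_no_stop <- tm_map(corp_refined, removeWords, stopwords("english"))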
Produce a barplot of the 10 most common terms in your refined corpus.
# Extract term frequencies for the top 20 words
termfreq_clean <- freq_terms(corp_refined, 20)
# Extract term frequencies for the top 10 words
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
## WORD FREQ
## 1 cubs 948
## 2 the 771
## 3 to 293
## 4 a 283
## 5 and 261
## 6 day 221
## 7 in 211
## 8 i 204
## 9 win 198
## 10 on 182
# Keep terms that appear more than 60 times
term60 <- subset(termfreq_10w, FREQ > 60)
# Create a bar plot of those terms, ordered by frequency
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
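Equivalently, geom_col() is ggplot2 shorthand for geom_bar(stat = "identity"), so the same plot can be written as:
# geom_col() is equivalent to geom_bar(stat = "identity")
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_col(fill = "red") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))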
Produce a wordcloud of the 10 most common terms in your refined corpus.
wordcloud(corp_refined, max.words = 10, colors = "green",
          scale = c(3, 0.5), random.order = FALSE)
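Since RColorBrewer is already loaded, a multi-colour variant of the same cloud is a small change (a sketch, not the plot produced above):
# Same cloud with a Brewer palette instead of a single colour
wordcloud(corp_refined, max.words = 10,
          colors = brewer.pal(8, "Dark2"),
          scale = c(3, 0.5), random.order = FALSE)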