library(tidyverse)
library(rtweet)
library(httpuv)
library(qdapRegex)
library(tm)
library(qdap)
library(wordcloud)
library(RColorBrewer)
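The rest of the script assumes an authenticated rtweet session (httpuv is loaded for the browser-based flow). A minimal sketch of one way to authenticate with a Twitter developer app; the app name and the four credential strings are placeholders, not values used in this analysis.
# Authenticate rtweet with a Twitter developer app (placeholder values only)
twitter_token <- create_token(
  app             = "my_app_name",
  consumer_key    = "YOUR_CONSUMER_KEY",
  consumer_secret = "YOUR_CONSUMER_SECRET",
  access_token    = "YOUR_ACCESS_TOKEN",
  access_secret   = "YOUR_ACCESS_SECRET"
)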
Problem 1 Get a list of current trending topics in the United States and pick one that interests you. Indicate your choice.
gtny <- get_trends("USA")
gtny %>%
  arrange(desc(tweet_volume)) %>%
  select(query, tweet_volume) %>%
  distinct() %>%
  head(20)
## # A tibble: 20 x 2
## query tweet_volume
## <chr> <int>
## 1 NFTs 1058989
## 2 Biden 604979
## 3 Apple 455743
## 4 iPhone 387849
## 5 Pakistan 304662
## 6 Europe 209637
## 7 %22Ministry+of+Truth%22 175227
## 8 Messi 163857
## 9 Nigeria 103243
## 10 Patrick 92479
## 11 Christ 72389
## 12 Afghanistan 53855
## 13 Iraq 22886
## 14 Donbass 21652
## 15 Conseil 21442
## 16 Shay 20908
## 17 Organisation 16543
## 18 Russes 15887
## 19 %23offenerbrief 11734
## 20 Draghi 10794
# The trending topic I chose is Suga, the stage name of a member of BTS, the worldwide K-pop boy group (trending in the US as of 4/28/2022).
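The top-20 list changes hour to hour, so it helps to confirm the chosen topic directly from the trends table. A short sketch, assuming get_trends() returns the trend and tweet_volume columns:
# Look up the chosen topic in the full trends table, not just the top 20
gtny %>%
  filter(str_detect(trend, "Suga")) %>%
  select(trend, tweet_volume) %>%
  distinct()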
Problem 2 Use the code we developed in class to get a cleaned corpus consisting of the texts in 6,000 tweets on your topic. Make sure that your tweets are original and in English. Also produce a list of the 60 most common terms in these tweets.
term = "Suga"
ntweets = 6000
twts = search_tweets(term, n = ntweets, lang = "en") %>% filter(is.na(reply_to_screen_name) & is_retweet == FALSE & is_quote == FALSE)
twt_txt = twts$text
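Because the reply/retweet/quote filter runs after the download, fewer than 6,000 tweets typically remain (search_tweets() also accepts include_rts = FALSE to keep retweets out of the download cap). A quick sanity check on what survived:
# Count the original (non-reply, non-retweet, non-quote) tweets kept
nrow(twts)
# Peek at a few raw tweet texts before cleaning
head(twt_txt, 3)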
Remove URLs from the tweet text
twt_txt_url <- rm_twitter_url(twt_txt)
Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)
Convert the cleaned text in twt_txt_chrs to a text corpus and view the output
twt_corpus <- twt_txt_chrs %>%
VectorSource() %>%
Corpus()
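To actually view the corpus that was just built, two lines suffice:
# Summarise the corpus object and print the first document's text
print(twt_corpus)
as.character(twt_corpus[[1]])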
Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, tolower)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
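The warning is harmless: tm_map() on the SimpleCorpus created by Corpus() drops per-document metadata, not the text itself. If the warning is a nuisance, one option is to build a VCorpus instead; a sketch of the same lowercasing step under that alternative (not used in the rest of the analysis):
# Optional alternative: a VCorpus avoids the "transformation drops documents" warning
twt_vcorpus <- twt_txt_chrs %>%
  VectorSource() %>%
  VCorpus()
twt_vcorpus_lwr <- tm_map(twt_vcorpus, content_transformer(tolower))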
Remove English stop words from the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents
Remove additional spaces from the corpus
twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
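Before counting terms, it is worth spot-checking the pipeline by comparing one raw tweet with its cleaned counterpart:
# Compare the first raw tweet with its fully cleaned version
twt_txt[1]
as.character(twt_corpus_final[[1]])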
Produce a list of the 60 most common terms in these tweets.
termfreq <- freq_terms(twt_corpus_final, 60)
termfreq
## WORD FREQ
## 1 bts 262
## 2 suga 216
## 3 twt 150
## 4 prod 140
## 5 psy 130
## 6 thatthat 113
## 7 listening 109
## 8 amp 84
## 9 jungkook 72
## 10 ft 47
## 11 stayalive 43
## 12 feat 40
## 13 m 26
## 14 featuring 25
## 15 music 25
## 16 fun 23
## 17 vocalist 22
## 18 brilliant 19
## 19 exceptionally 19
## 20 gifted 19
## 21 song 19
## 22 super 19
## 23 yoongi 18
## 24 listen 16
## 25 thatthatfeatsuga 15
## 26 can 14
## 27 love 14
## 28 t 14
## 29 produced 12
## 30 de 11
## 31 jimin 10
## 32 trending 10
## 33 party 9
## 34 stream 9
## 35 army 8
## 36 choice 8
## 37 keep 8
## 38 psyxsuga 8
## 39 s 8
## 40 day 7
## 41 good 7
## 42 king 7
## 43 streaming 7
## 44 thatthatprodbysuga 7
## 45 agustd 6
## 46 itunes 6
## 47 jin 6
## 48 min 6
## 49 new 6
## 50 oppa 6
## 51 ufcvegas 6
## 52 bad 5
## 53 doggy 5
## 54 dogs 5
## 55 gt 5
## 56 jhope 5
## 57 mv 5
## 58 namjoon 5
## 59 record 5
## 60 worldwide 5
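freq_terms() from qdap is convenient, but the counts can be cross-checked with tm alone through a term-document matrix; a short sketch using the cleaned corpus built above:
# Cross-check the term counts with a tm term-document matrix
tdm <- TermDocumentMatrix(twt_corpus_final)
term_counts <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
head(term_counts, 10)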
Problem 3 Refine your corpus by removing a set of custom stopwords.
custom_stopwds <- c("suga", "r", "amp", "can", "t", "via")
corp_refined <- tm_map(twt_corpus_final, removeWords, custom_stopwds)
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
Problem 4 Produce a barplot of the 10 most common terms in your refined corpus.
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
## WORD FREQ
## 1 bts 262
## 2 twt 150
## 3 prod 140
## 4 psy 130
## 5 thatthat 113
## 6 listening 109
## 7 jungkook 72
## 8 ft 47
## 9 stayalive 43
## 10 feat 40
ggplot(termfreq_10w, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_bar(stat = "identity", fill = "red") +
  labs(x = "Term", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
Problem 5 Produce a wordcloud of the 10 most common terms in your refined corpus.
wordcloud(corp_refined, min.freq = 20, max.words = 10, colors = "purple",
          scale = c(3, 0.5), random.order = FALSE)
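Passing the corpus to wordcloud() lets the function tally frequencies itself; to guarantee that exactly the ten terms from the barplot appear, the frequency table can be supplied directly. A sketch using the termfreq_10w table built above and a brewer palette from RColorBrewer:
# Wordcloud built directly from the ten-term frequency table
wordcloud(words = termfreq_10w$WORD, freq = termfreq_10w$FREQ,
          colors = brewer.pal(8, "Dark2"), scale = c(3, 0.5),
          random.order = FALSE)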