library(tidyverse)
library(rtweet)
library(httpuv)
library(qdapRegex)
library(tm)
library(qdap)
library(wordcloud)
library(RColorBrewer)

Problem 1

Get a list of current trending topics in the United States and pick one that interests you. Indicate your choice.

gtny <- get_trends("USA")

gtny %>% arrange(desc(tweet_volume)) %>%
  select(query, tweet_volume) %>%
  distinct() %>%
  head(20)
## # A tibble: 20 x 2
##    query                   tweet_volume
##    <chr>                          <int>
##  1 NFTs                         1058989
##  2 Biden                         604979
##  3 Apple                         455743
##  4 iPhone                        387849
##  5 Pakistan                      304662
##  6 Europe                        209637
##  7 %22Ministry+of+Truth%22       175227
##  8 Messi                         163857
##  9 Nigeria                       103243
## 10 Patrick                        92479
## 11 Christ                         72389
## 12 Afghanistan                    53855
## 13 Iraq                           22886
## 14 Donbass                        21652
## 15 Conseil                        21442
## 16 Shay                           20908
## 17 Organisation                   16543
## 18 Russes                         15887
## 19 %23offenerbrief                11734
## 20 Draghi                         10794
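
get_trends() also accepts a numeric WOEID instead of a place name; an equivalent sketch, assuming 23424977 (the WOEID Twitter uses for the United States):

# Equivalent sketch: request US trends by WOEID rather than by name.
gtny_woeid <- get_trends(woeid = 23424977)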

# My chosen trending topic is Suga, the stage name of a member of BTS, the worldwide K-pop boy group (as of 4/28/2022).

Problem 2

Use the code we developed in class to get a cleaned corpus consisting of the texts in 6,000 tweets on your topic. Make sure that your tweets are original and in English. Also produce a list of the 60 most common terms in these tweets.

term <- "Suga"
ntweets <- 6000
twts <- search_tweets(term, n = ntweets, lang = "en") %>%
  # keep only original tweets: drop replies, retweets, and quotes
  filter(is.na(reply_to_screen_name), !is_retweet, !is_quote)
twt_txt <- twts$text
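
Because the filter runs after the search, fewer than 6,000 tweets survive. As an alternative sketch, rtweet's include_rts argument lets the API drop retweets up front, so more of the request quota goes to original tweets (replies and quotes still need the local filter):

# Alternative sketch: exclude retweets at the API level, then drop
# replies and quoted tweets locally as before.
twts_alt <- search_tweets(term, n = ntweets, lang = "en", include_rts = FALSE) %>%
  filter(is.na(reply_to_screen_name), !is_quote)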

Remove URLs from the tweets

twt_txt_url <- rm_twitter_url(twt_txt)

Replace special characters, punctuation, & numbers with spaces

twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)

Convert the cleaned text in “twt_txt_chrs” to a text corpus and view the output

twt_corpus <- twt_txt_chrs %>% 
                VectorSource() %>% 
                Corpus() 
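
tm's inspect() prints a few documents, which gives a quick view of the output:

# Peek at the first three documents to confirm the corpus was created.
inspect(twt_corpus[1:3])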

Convert the corpus to lowercase

twt_corpus_lwr <- tm_map(twt_corpus, tolower)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents

Remove English stop words from the corpus and view the corpus

twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents

Remove additional spaces from the corpus

twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
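
The repeated “transformation drops documents” warnings are a known, harmless quirk of tm_map() on the SimpleCorpus that Corpus() builds; no documents are actually dropped. A minimal alternative sketch that avoids the warnings by building a VCorpus instead, assuming the same cleaned text in twt_txt_chrs (content_transformer() wraps base functions such as tolower() so they return proper tm documents):

# Warning-free alternative: VCorpus keeps full document objects, and
# content_transformer() adapts plain string functions for tm_map().
twt_vcorpus <- VCorpus(VectorSource(twt_txt_chrs))
twt_vcorpus <- tm_map(twt_vcorpus, content_transformer(tolower))
twt_vcorpus <- tm_map(twt_vcorpus, removeWords, stopwords("english"))
twt_vcorpus <- tm_map(twt_vcorpus, stripWhitespace)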

Produce a list of the 60 most common terms in these tweets.

termfreq  <- freq_terms(twt_corpus_final, 60)
termfreq
##    WORD               FREQ
## 1  bts                 262
## 2  suga                216
## 3  twt                 150
## 4  prod                140
## 5  psy                 130
## 6  thatthat            113
## 7  listening           109
## 8  amp                  84
## 9  jungkook             72
## 10 ft                   47
## 11 stayalive            43
## 12 feat                 40
## 13 m                    26
## 14 featuring            25
## 15 music                25
## 16 fun                  23
## 17 vocalist             22
## 18 brilliant            19
## 19 exceptionally        19
## 20 gifted               19
## 21 song                 19
## 22 super                19
## 23 yoongi               18
## 24 listen               16
## 25 thatthatfeatsuga     15
## 26 can                  14
## 27 love                 14
## 28 t                    14
## 29 produced             12
## 30 de                   11
## 31 jimin                10
## 32 trending             10
## 33 party                 9
## 34 stream                9
## 35 army                  8
## 36 choice                8
## 37 keep                  8
## 38 psyxsuga              8
## 39 s                     8
## 40 day                   7
## 41 good                  7
## 42 king                  7
## 43 streaming             7
## 44 thatthatprodbysuga    7
## 45 agustd                6
## 46 itunes                6
## 47 jin                   6
## 48 min                   6
## 49 new                   6
## 50 oppa                  6
## 51 ufcvegas              6
## 52 bad                   5
## 53 doggy                 5
## 54 dogs                  5
## 55 gt                    5
## 56 jhope                 5
## 57 mv                    5
## 58 namjoon               5
## 59 record                5
## 60 worldwide             5
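
freq_terms() is a qdap helper; the same counts can be computed with tm alone via a term-document matrix. A minimal sketch, assuming the cleaned corpus from above:

# Sketch: term frequencies from a TermDocumentMatrix instead of qdap.
tdm <- TermDocumentMatrix(twt_corpus_final)
term_counts <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
head(term_counts, 60)  # the 60 most frequent terms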

Problem 3

Refine your corpus by removing a set of custom stopwords.

custom_stopwds <- c("suga", "r", "amp", "can", "t", "via")
corp_refined <- tm_map(twt_corpus_final, removeWords, custom_stopwds)
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
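
A quick sanity check confirms that none of the custom stopwords survive among the refined corpus's top terms:

# Expect FALSE: no custom stopword should remain among the top terms.
top_terms <- freq_terms(corp_refined, 60)
any(custom_stopwds %in% top_terms$WORD)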

Problem 4

Produce a barplot of the 10 most common terms in your refined corpus.

termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
##    WORD      FREQ
## 1  bts        262
## 2  twt        150
## 3  prod       140
## 4  psy        130
## 5  thatthat   113
## 6  listening  109
## 7  jungkook    72
## 8  ft          47
## 9  stayalive   43
## 10 feat        40
ggplot(termfreq_10w, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
        geom_col(fill = "red") +
        labs(x = "term", y = "frequency") +
        theme(axis.text.x = element_text(angle = 15, hjust = 1))

Problem 5

Produce a wordcloud of the 10 most common terms in your refined corpus.

wordcloud(corp_refined, max.words = 10, colors = "purple",
          scale = c(3, 0.5), random.order = FALSE)
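
Because wordcloud() tokenizes the corpus itself, its counts can drift slightly from freq_terms(). A minimal alternative sketch that draws the cloud from the exact Problem 4 frequencies (brewer.pal() comes from the already-loaded RColorBrewer):

# Sketch: plot explicit word/frequency pairs so the cloud matches the
# freq_terms() ranking exactly.
top10 <- freq_terms(corp_refined, 10)
wordcloud(words = top10$WORD, freq = top10$FREQ,
          colors = brewer.pal(8, "Dark2"),
          scale = c(3, 0.5), random.order = FALSE)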