Harold Nelson
3/29/2022
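The setup chunk is not echoed; a minimal sketch of it, assuming the packages implied by the startup messages below (plus wordcloud, which the word cloud code later relies on), would be:
library(tidyverse)   # ggplot2, dplyr, purrr, ...
library(plotly)
library(rtweet)      # search_tweets()
library(qdapRegex)   # rm_twitter_url()
library(tm)          # Corpus(), tm_map(), stopwords()
library(qdap)        # freq_terms(); also attaches RColorBrewer
library(wordcloud)   # wordcloud()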
## Attached packages: tidyverse 1.3.1 (ggplot2, tibble, tidyr, readr, purrr, dplyr, stringr, forcats), plotly, rtweet, qdapRegex, tm, and qdap (which loads NLP, qdapDictionaries, qdapTools, RColorBrewer).
## The usual masking conflicts apply: dplyr::filter() masks stats::filter(), and plotly, qdap, and NLP mask a few ggplot2, dplyr, and base functions.
I want to take the code snippets in the third chapter of the DataCamp course and turn them into usable code that performs similar tasks easily.
Combine all of the code, from the specification of a search term and the number of tweets through the point where we have termfreq and twt_corpus_final. Display termfreq.
term = "Ukraine"
ntweets = 6000
twts = search_tweets(term, n = ntweets, lang = "en") %>%
filter(is.na(reply_to_screen_name) &
is_retweet == FALSE &
is_quote == FALSE)
twt_txt = twts$text
# Remove URLs from the tweet
twt_txt_url <- rm_twitter_url(twt_txt)
# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]"," " , twt_txt_url)
# Convert text in "twt_gsub" dataset to a text corpus and view output
twt_corpus <- twt_txt_chrs %>%
VectorSource() %>%
Corpus()
# Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, tolower)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
# Remove English stop words from the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents
# Remove additional spaces from the corpus
twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
## WORD FREQ
## 1 ukraine 482
## 2 s 178
## 3 russia 157
## 4 war 117
## 5 russian 82
## 6 putin 73
## 7 amp 46
## 8 news 46
## 9 will 44
## 10 via 43
## 11 invasion 32
## 12 ukrainian 32
## 13 says 31
## 14 us 31
## 15 new 26
## 16 now 26
## 17 trump 26
## 18 biden 25
## 19 t 23
## 20 people 22
## 21 can 21
## 22 like 21
## 23 peace 21
## 24 u 21
## 25 kyiv 19
## 26 forces 18
## 27 youtube 18
## 28 m 17
## 29 president 17
## 30 back 16
## 31 country 16
## 32 oil 16
## 33 military 15
## 34 nato 15
## 35 one 15
## 36 support 14
## 37 ukrainewar 14
## 38 zelensky 14
## 39 going 13
## 40 just 13
## 41 may 13
## 42 need 13
## 43 world 13
## 44 countries 12
## 45 go 12
## 46 help 12
## 47 poland 12
## 48 said 12
## 49 stop 12
## 50 talks 12
## 51 amid 11
## 52 children 11
## 53 day 11
## 54 europe 11
## 55 latest 11
## 56 mariupol 11
## 57 soldiers 11
## 58 truth 11
## 59 ukrainerussianwar 11
## 60 update 11
Create a list of custom stop words, remove them from the corpus, and produce a bar plot of the most frequent remaining words (those appearing more than 60 times among the top 10).
# Create a vector of custom stop words
custom_stopwds <- c("ukraine", " s", "amp", "can", "t", "via")
# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_final, removeWords, custom_stopwds)
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
# Extract term frequencies for the top 20 words
termfreq_clean <- freq_terms(corp_refined, 20)
# Extract term frequencies for the top 10 words
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
## WORD FREQ
## 1 russia 157
## 2 war 117
## 3 russian 82
## 4 putin 73
## 5 news 46
## 6 will 44
## 7 invasion 32
## 8 ukrainian 32
## 9 says 31
## 10 us 31
# Identify terms with more than 60 counts from the top 10 list
term60 <- subset(termfreq_10w, FREQ > 60)
# Create a bar plot using terms with more than 60 counts
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
Create a few wordclouds.
# Create a word cloud in red with min frequency of 20
wordcloud(corp_refined, min.freq = 20, colors = "red",
          scale = c(3, 0.5), random.order = FALSE)
# Create word cloud with 6 colors and max 50 words
wordcloud(corp_refined, max.words = 50,
          colors = brewer.pal(6, "Dark2"),
          scale = c(4, 1), random.order = FALSE)
Turn the code in Part 1 into a function with two arguments: term and ntweets. The function should return twt_corpus_final. It should also print the list of common terms. This function can then be used in conjunction with Parts 2 and 3 in a compact workflow.
Test the function with the term “Covid” and ask for 6,000 tweets.
my_corpus <- function(term, ntweets) {
  twts <- search_tweets(term, n = ntweets, lang = "en", retryonratelimit = TRUE) %>%
    filter(is.na(reply_to_screen_name) &
           is_retweet == FALSE &
           is_quote == FALSE)
  twt_txt <- twts$text
  # Remove URLs from the tweet
  twt_txt_url <- rm_twitter_url(twt_txt)
  # Replace special characters, punctuation, & numbers with spaces
  twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)
  # Convert the cleaned text to a text corpus
  twt_corpus <- twt_txt_chrs %>%
    VectorSource() %>%
    Corpus()
  # Convert the corpus to lowercase
  twt_corpus_lwr <- tm_map(twt_corpus, tolower)
  # Remove English stop words from the corpus
  twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
  # Remove additional spaces from the corpus
  twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
  # Print the most frequent terms, then return the cleaned corpus
  termfreq <- freq_terms(twt_corpus_final, 60)
  print(termfreq)
  return(twt_corpus_final)
}
twt_corpus_final <- my_corpus("Covid", 6000)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
## WORD FREQ
## 1 covid 1515
## 2 s 275
## 3 t 208
## 4 new 144
## 5 via 126
## 6 just 116
## 7 people 105
## 8 now 104
## 9 will 100
## 10 pandemic 96
## 11 can 95
## 12 m 95
## 13 get 92
## 14 biden 89
## 15 ivermectin 87
## 16 risk 86
## 17 cases 85
## 18 study 84
## 19 health 83
## 20 like 83
## 21 vaccine 80
## 22 amp 76
## 23 don 73
## 24 booster 70
## 25 still 70
## 26 one 66
## 27 large 60
## 28 got 58
## 29 us 58
## 30 vaccines 58
## 31 know 55
## 32 news 54
## 33 hospitalization 53
## 34 reduce 53
## 35 test 53
## 36 time 52
## 37 ve 52
## 38 may 51
## 39 nyt 51
## 40 deaths 50
## 41 finds 49
## 42 back 48
## 43 omicron 48
## 44 cdc 47
## 45 re 47
## 46 today 47
## 47 first 45
## 48 years 44
## 49 end 43
## 50 day 41
## 51 public 41
## 52 even 40
## 53 positive 40
## 54 week 40
## 55 go 39
## 56 mask 39
## 57 second 39
## 58 going 38
## 59 emergency 37
## 60 long 37
## 61 march 37
## 62 never 37
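As a sketch of that compact workflow, the Part 2 and Part 3 steps can be reused on the returned Covid corpus. The custom stop word list below is only illustrative, not something run above.
# Illustrative custom stop words for the Covid corpus (an assumption, not output from the run above)
covid_stopwds <- c("covid", "s", "t", "amp", "can", "via")
corp_refined <- tm_map(twt_corpus_final, removeWords, covid_stopwds)
# Top 10 remaining terms, as in Part 2
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
# Word cloud, as in Part 3
wordcloud(corp_refined, max.words = 50,
          colors = brewer.pal(6, "Dark2"),
          scale = c(4, 1), random.order = FALSE)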