Harold Nelson
3/29/2022
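The setup chunk is not echoed; a minimal sketch of it, assuming the packages implied by the startup messages below (plus wordcloud, which the word cloud code later relies on), would be:
library(tidyverse)   # ggplot2, dplyr, purrr, ...
library(plotly)
library(rtweet)      # search_tweets()
library(qdapRegex)   # rm_twitter_url()
library(tm)          # Corpus(), tm_map(), stopwords()
library(qdap)        # freq_terms(); also attaches RColorBrewer
library(wordcloud)   # wordcloud()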
## Attached packages: tidyverse 1.3.1 (ggplot2, tibble, tidyr, readr, purrr, dplyr, stringr, forcats), plotly, rtweet, qdapRegex, tm, and qdap (which loads NLP, qdapDictionaries, qdapTools, RColorBrewer).
## The usual masking conflicts apply: dplyr::filter() masks stats::filter(), and plotly, qdap, and NLP mask a few ggplot2, dplyr, and base functions.
I want to take the code snippets in the third chapter of the DataCamp course and turn them into usable code that performs similar tasks easily.
Combine all of the code, from the specification of a search term and the number of tweets through the point where we have termfreq and twt_corpus_final. Display termfreq.
term = "Ukraine"
ntweets = 6000
twts = search_tweets(term, n = ntweets, lang = "en") %>%
filter(is.na(reply_to_screen_name) &
is_retweet == FALSE &
is_quote == FALSE)
twt_txt = twts$text
# Remove URLs from the tweet
twt_txt_url <- rm_twitter_url(twt_txt)
# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]"," " , twt_txt_url)
# Convert text in "twt_gsub" dataset to a text corpus and view output
twt_corpus <- twt_txt_chrs %>%
VectorSource() %>%
Corpus()
# Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, tolower)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
# Remove English stop words from the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents
# Remove additional spaces from the corpus
twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
## WORD FREQ
## 1 ukraine 482
## 2 s 178
## 3 russia 157
## 4 war 117
## 5 russian 82
## 6 putin 73
## 7 amp 46
## 8 news 46
## 9 will 44
## 10 via 43
## 11 invasion 32
## 12 ukrainian 32
## 13 says 31
## 14 us 31
## 15 new 26
## 16 now 26
## 17 trump 26
## 18 biden 25
## 19 t 23
## 20 people 22
## 21 can 21
## 22 like 21
## 23 peace 21
## 24 u 21
## 25 kyiv 19
## 26 forces 18
## 27 youtube 18
## 28 m 17
## 29 president 17
## 30 back 16
## 31 country 16
## 32 oil 16
## 33 military 15
## 34 nato 15
## 35 one 15
## 36 support 14
## 37 ukrainewar 14
## 38 zelensky 14
## 39 going 13
## 40 just 13
## 41 may 13
## 42 need 13
## 43 world 13
## 44 countries 12
## 45 go 12
## 46 help 12
## 47 poland 12
## 48 said 12
## 49 stop 12
## 50 talks 12
## 51 amid 11
## 52 children 11
## 53 day 11
## 54 europe 11
## 55 latest 11
## 56 mariupol 11
## 57 soldiers 11
## 58 truth 11
## 59 ukrainerussianwar 11
## 60 update 11
Create a list of custom stop words, remove them from the corpus, and produce a bar plot of the most frequent remaining words (those appearing more than 60 times among the top 10).
# Create a vector of custom stop words
custom_stopwds <- c("ukraine", " s", "amp", "can", "t", "via")
# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_final, removeWords, custom_stopwds)
## Warning in tm_map.SimpleCorpus(twt_corpus_final, removeWords, custom_stopwds):
## transformation drops documents
# Extract term frequencies for the top 20 words
termfreq_clean <- freq_terms(corp_refined, 20)
# Extract term frequencies for the top 10 words
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
## WORD FREQ
## 1 russia 157
## 2 war 117
## 3 russian 82
## 4 putin 73
## 5 news 46
## 6 will 44
## 7 invasion 32
## 8 ukrainian 32
## 9 says 31
## 10 us 31
# Identify terms with more than 60 counts from the top 10 list
term60 <- subset(termfreq_10w, FREQ > 60)
# Create a bar plot using terms with more than 60 counts
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
Create a few wordclouds.
# Create a word cloud in red with min frequency of 20
wordcloud(corp_refined, min.freq = 20, colors = "red",
          scale = c(3, 0.5), random.order = FALSE)
# Create word cloud with 6 colors and max 50 words
wordcloud(corp_refined, max.words = 50,
          colors = brewer.pal(6, "Dark2"),
          scale = c(4, 1), random.order = FALSE)
Turn the code in Part 1 into a function with two arguments: term and ntweets. The function should return twt_corpus_final. It should also print the list of common terms. This function can then be used in conjunction with Parts 2 and 3 in a compact workflow.
Test the function with the term “Covid” and ask for 6,000 tweets.
my_corpus <- function(term, ntweets) {
  twts <- search_tweets(term, n = ntweets, lang = "en", retryonratelimit = TRUE) %>%
    filter(is.na(reply_to_screen_name) &
           is_retweet == FALSE &
           is_quote == FALSE)
  twt_txt <- twts$text
  # Remove URLs from the tweet
  twt_txt_url <- rm_twitter_url(twt_txt)
  # Replace special characters, punctuation, & numbers with spaces
  twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)
  # Convert the cleaned text to a text corpus
  twt_corpus <- twt_txt_chrs %>%
    VectorSource() %>%
    Corpus()
  # Convert the corpus to lowercase
  twt_corpus_lwr <- tm_map(twt_corpus, tolower)
  # Remove English stop words from the corpus
  twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("english"))
  # Remove additional spaces from the corpus
  twt_corpus_final <- tm_map(twt_corpus_stpwd, stripWhitespace)
  # Print the most frequent terms, then return the cleaned corpus
  termfreq <- freq_terms(twt_corpus_final, 60)
  print(termfreq)
  return(twt_corpus_final)
}
twt_corpus_final <- my_corpus("Covid", 6000)
## Warning in tm_map.SimpleCorpus(twt_corpus, tolower): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(twt_corpus_lwr, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(twt_corpus_stpwd, stripWhitespace):
## transformation drops documents
## WORD FREQ
## 1 covid 1515
## 2 s 275
## 3 t 208
## 4 new 144
## 5 via 126
## 6 just 116
## 7 people 105
## 8 now 104
## 9 will 100
## 10 pandemic 96
## 11 can 95
## 12 m 95
## 13 get 92
## 14 biden 89
## 15 ivermectin 87
## 16 risk 86
## 17 cases 85
## 18 study 84
## 19 health 83
## 20 like 83
## 21 vaccine 80
## 22 amp 76
## 23 don 73
## 24 booster 70
## 25 still 70
## 26 one 66
## 27 large 60
## 28 got 58
## 29 us 58
## 30 vaccines 58
## 31 know 55
## 32 news 54
## 33 hospitalization 53
## 34 reduce 53
## 35 test 53
## 36 time 52
## 37 ve 52
## 38 may 51
## 39 nyt 51
## 40 deaths 50
## 41 finds 49
## 42 back 48
## 43 omicron 48
## 44 cdc 47
## 45 re 47
## 46 today 47
## 47 first 45
## 48 years 44
## 49 end 43
## 50 day 41
## 51 public 41
## 52 even 40
## 53 positive 40
## 54 week 40
## 55 go 39
## 56 mask 39
## 57 second 39
## 58 going 38
## 59 emergency 37
## 60 long 37
## 61 march 37
## 62 never 37
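As a sketch of that compact workflow, the Part 2 and Part 3 steps can be reused on the returned Covid corpus. The custom stop word list below is only illustrative, not something run above.
# Illustrative custom stop words for the Covid corpus (an assumption, not output from the run above)
covid_stopwds <- c("covid", "s", "t", "amp", "can", "via")
corp_refined <- tm_map(twt_corpus_final, removeWords, covid_stopwds)
# Top 10 remaining terms, as in Part 2
termfreq_10w <- freq_terms(corp_refined, 10)
termfreq_10w
# Word cloud, as in Part 3
wordcloud(corp_refined, max.words = 50,
          colors = brewer.pal(6, "Dark2"),
          scale = c(4, 1), random.order = FALSE)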