- Groups: As you arrive
- Look at the news blurbs
- How is word usage different across the articles?
- Which words carry strongest emotion?
- What “topics” are covered in each?
2026-05-20
tidytexttidytext From Van Loon, 2022
From Van Loon, 2022
- `tidytext` to explore these on Wednesday!
- `gutenbergr`
- `guardianapi`
- `tidytext`
- `RedditExtractoR`
- `gutenberg_download` to download your book
- `gutenberg_id` field
# install.packages("gutenbergr")
# Attach gutenbergr, which provides gutenberg_works() and gutenberg_download().
library(gutenbergr)
# Look at Gutenberg books written by Durkheim; the printed result includes
# the gutenberg_id that gutenberg_download() is called with further down.
gutenberg_works(author == "Durkheim, Émile")
## # A tibble: 1 × 8
##   gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
##          <int> <chr>     <chr>                <int> <fct>    <chr>
## 1        41360 The Elem… Durkh…               40654 en       Browsing: Culture/…
## # ℹ 2 more variables: rights <fct>, has_text <lgl>
# download book - notice that we need the id number (shown by
# gutenberg_works() and also on the gutenberg website).
# The identical call appeared twice in a row; a single download yields the
# same `efrl` without fetching the book a second time.
efrl <- gutenberg_download(
  gutenberg_id = 41360,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)
| Row | Person | Birthday | Occupation |
|---|---|---|---|
| 1 | Joe | 12/3/1963 | Carpenter |
| 2 | Malik | 6/8/1978 | Architect |
| 3 | Suzanna | 4/3/2001 | Student |
| Row | County | Temperature | PM2.5 |
|---|---|---|---|
| 1 | Santa Clara | 78.1 | 12.1 |
| 2 | San Mateo | 82.3 | 32.1 |
| 3 | San Francisco | 65.4 | 44.7 |
This is nice …
| Row | Paper | Article | Text |
|---|---|---|---|
| 1 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | Using a single gas-stove burner can raise indoor concentrations of benzene, … |
| 2 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | For the peer-reviewed study, researchers at Stanford’s Doerr School of Sustainability … |
| 3 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | In about a third of the homes, a single gas burner … |
But we often prefer this
| Row | Paper | Article | Text |
|---|---|---|---|
| 1 | New York Times | Study Compares Gas Stove Pollu… | Using |
| 2 | New York Times | Study Compares Gas Stove Pollu… | a |
| 3 | New York Times | Study Compares Gas Stove Pollu… | single |
# [slide text] tidytext format?
# [slide text] tidytext format with unnest_tokens
library(tidytext)
library(dplyr)
library(magrittr)

# try to tokenize into single words
# (%<>% pipes efrl through unnest_tokens() and assigns the result back to efrl)
efrl %<>%
  unnest_tokens(word, text)
# [slide text] View(stop_words)
library(tidytext)
library(dplyr)
library(magrittr)

# Count how often each word occurs in the book, most frequent first.
# (The original comment said 'words that contain "law"', but no filter is
# applied: every word is counted.)
efrl %>%
  count(word, sort = TRUE)
## # A tibble: 11,336 × 2
##    word      n
##    <chr> <int>
##  1 the   16907
##  2 of     9685
##  3 is     5874
##  4 to     5681
##  5 and    4653
##  6 it     4597
##  7 in     4569
##  8 a      4277
##  9 which  3337
## 10 that   3138
## # ℹ 11,326 more rows
# [slide text] guardianapi
library(guardianapi)

# Register the Guardian API key for this session.
# NOTE(review): the guardianapi reference documents this function as
# gu_api_key(check_env = FALSE) — confirm that passing the key as a string is
# accepted by the installed version.
gu_api_key("your key here")

# Query Guardian content with the search string '"San Jose" AND "California"'.
# Dates are written zero-padded as YYYY-MM-DD; the original "2025-01-1" was not.
guardian <- gu_content(
  '"San Jose" AND "California"',
  from_date = "2025-01-01",
  to_date = "2026-05-11"
)
# [slide text] tidytext format?
# look at your guardian data
guardian %>%
  head()
## # A tibble: 6 × 45
##   id        type  section_id section_name web_publication_date web_title web_url
##   <chr>     <chr> <chr>      <chr>        <dttm>               <chr>     <chr>
## 1 us-news/… arti… us-news    US news      2026-01-30 05:54:00  Matt Mah… https:…
## 2 us-news/… arti… us-news    US news      2025-12-09 05:58:55  Communit… https:…
## 3 us-news/… arti… us-news    US news      2026-04-08 08:16:37  ICE agen… https:…
## 4 sport/20… arti… sport      Sport        2026-01-29 03:37:36  ICE agen… https:…
## 5 us-news/… arti… us-news    US news      2026-04-23 12:44:09  Leading … https:…
## 6 us-news/… arti… us-news    US news      2026-02-27 21:00:18  Californ… https:…
## # ℹ 38 more variables: api_url <chr>, tags <lgl>, is_hosted <lgl>,
## #   pillar_id <chr>, pillar_name <chr>, headline <chr>, standfirst <chr>,
## #   trail_text <chr>, byline <chr>, main <chr>, body <chr>, wordcount <dbl>,
## #   first_publication_date <dttm>, is_inappropriate_for_sponsorship <lgl>,
## #   is_premoderated <lgl>, last_modified <dttm>, production_office <chr>,
## #   publication <chr>, short_url <chr>, should_hide_adverts <lgl>,
## #   show_in_related_content <lgl>, thumbnail <chr>, legally_sensitive <lgl>, …
# [slide text] tidytext format / filter()
# first, set up liveblog dataframe
tidy_blogs <- guardian %>%
  filter(type == "liveblog")

# [slide text] unnest_tokens to put in tidytext format / stop_words? / anti_join
# unnest tokens: split body_text into one word per row, then drop every row
# whose word appears in stop_words (explicit `by` avoids the join-key message)
tidy_blogs %<>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words, by = "word")

# [slide text] stop_words? / View(stop_words) / anti_join the stop_words?
# look at examples
tidy_blogs %>%
  select(type, word) %>%
  head()
## # A tibble: 6 × 2
##   type     word
##   <chr>    <chr>
## 1 liveblog concludes
## 2 liveblog coverage
## 3 liveblog politics
## 4 liveblog day
## 5 liveblog reading
## 6 liveblog developments
# [slide text] count() function to get word frequencies
# look at blog word frequencies
tidy_blogs %>%
  count(word, sort = TRUE)
## # A tibble: 7,880 × 2
##    word               n
##    <chr>          <int>
##  1 trump            729
##  2 president        251
##  3 donald           245
##  4 house            237
##  5 trump’s          208
##  6 white            182
##  7 people           147
##  8 federal          133
##  9 administration   122
## 10 war              119
## # ℹ 7,870 more rows
# keep only the rows whose type is "article"
tidy_articles <- guardian %>%
  filter(type == "article")

# make tidytext format, remove stop words
# (on the collapsed original line this step sat behind a `#` and never ran)
tidy_articles %<>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words, by = "word")

# look at article word frequencies
tidy_articles %>%
  count(word, sort = TRUE)
library(tidyr)

# Build a wide table with one row per word and one column per content type
# holding that word's share of all (non-stop-word) tokens of that type.
frequency <- tidy_blogs %>%
  bind_rows(tidy_articles) %>%
  count(type, word) %>%
  group_by(type) %>%
  mutate(proportion = n / sum(n)) %>%
  # drop the grouping once the per-type proportions are computed so it does
  # not leak into the reshaping steps below
  ungroup() %>%
  select(-n) %>%
  pivot_wider(names_from = type, values_from = proportion)
library(scales)
library(ggplot2)

# Scatter of each word's proportion in articles (x) against its proportion in
# liveblogs (y), both on log10 axes labelled as percentages. Point and label
# colour is mapped to abs(article - liveblog); the legend is hidden.
# expect a warning about rows with missing values being removed
ggplot(
  data = frequency,
  mapping = aes(
    x = article,
    y = liveblog,
    color = abs(article - liveblog)
  )
) +
  geom_abline(color = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  labs(x = "Articles", y = "Blogs") +
  theme(legend.position = "none")
# Count how often each word occurs within each section (section_name), most
# frequent first. The original comments said "each section" in one place and
# "each article" in the other; the grouping column is section_name.
# count() already keeps only section_name, word and n, so the preceding
# select() was redundant and is dropped.
tf_articles <- tidy_articles %>%
  count(section_name, word, sort = TRUE)

# create tfidf, using word as the term, section_name as the document and n as
# the count
articles_tf_idf <- tf_articles %>%
  bind_tf_idf(word, section_name, n)
# [slide text] tidytext package: AFINN, Bing, and NRC
# The three lookups below appeared twice verbatim; each is kept once.
# look at afinn lexicon
get_sentiments("afinn")
# look at bing lexicon
get_sentiments("bing")
# look at nrc lexicon
get_sentiments("nrc")
# [slide text] tidy_articles with the NRC dictionary / inner_join
# join nrc with tidy articles (the original comment said "tidy comments"):
# keeps only rows whose word is in the NRC lexicon and adds the lexicon's
# columns. The join key is stated explicitly instead of being guessed.
# NOTE(review): if a word has several NRC rows, its article rows are repeated
# once per match, and recent dplyr may warn about a many-to-many join — confirm.
tidy_articles %<>%
  inner_join(get_sentiments("nrc"), by = "word")
# [slide text] ggplot() to plot sentiments
library(ggplot2)

# bar chart with sentiment on the y axis and one bar per sentiment,
# filled by sentiment
ggplot(tidy_articles, aes(y = sentiment)) +
  geom_bar(aes(fill = sentiment)) +
  theme_minimal() +
  labs(title = "Sentiments in Guardian Articles about San Jose")

# [slide text] facet_wrap to plot sentiments for each category
# same chart, split into one panel per section_name with free x scales
ggplot(tidy_articles, aes(y = sentiment)) +
  geom_bar(aes(fill = sentiment)) +
  theme_minimal() +
  labs(title = "Sentiments in Articles about San Jose") +
  facet_wrap(~section_name, scales = "free_x")
# [slide text] tidytext format (one word per row)
# [slide text] RedditExtractoR package!
library(RedditExtractoR)

# extract sj subreddits
sj_subreddits <- find_subreddits("san jose")

# alternatively, we can find urls of all pages related to san jose.
# Left commented out: in the original sequence this result was overwritten by
# the subreddit query below before it was ever used.
# sj_urls <- find_thread_urls(keywords = "san jose", period = "day")

# we can get urls of the san jose subreddit
sj_urls <- find_thread_urls(
  subreddit = "SanJose",
  period = "day"
)

# extract comments from these pages
# (the duplicated copies of the two calls above are collapsed into one each,
# so every request is made only once)
sj_comments <- get_thread_content(sj_urls$url)
# Inspect the object returned by get_thread_content(). The original lines
# referred to `climate_comments`, which is never created in this file; the
# object built above is `sj_comments`.
class(sj_comments)
# names of the `comments` element
names(sj_comments$comments)
# names of the `threads` element
names(sj_comments$threads)
library(dplyr)
library(tidytext)

# split each comment into one word per row, then drop rows whose word appears
# in stop_words (join key given explicitly)
tidy_comments <- sj_comments$comments %>%
  unnest_tokens(word, comment) %>%
  anti_join(stop_words, by = "word")

# look at words and timestamps
tidy_comments %>%
  select(timestamp, word)