library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(httr)
This tutorial is based heavily on http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#bags-of-words and has been adapted to an R pipeline.
I would like you to create your own document from scratch (don’t copy and paste the whole thing) and work through this activity on your own. Note that for most of the analysis the instructions will ask you to use your own data rather than the data I use below.
Suggested reading: Text Mining with R: Tidy Text Mining
us_dec_sentence <- 'We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.'
# Show the number of characters in the sentence.
nchar(us_dec_sentence)
## [1] 209
# Show the sentence itself.
us_dec_sentence
## [1] "We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness."
Reading .txt files:
library(readr)
us_dec <- read_file('https://ia800305.us.archive.org/29/items/unitedstatesdecl00001gut/when12.txt')
nchar(us_dec)
## [1] 24863
strtrim(us_dec, 200)
## [1] "The Project Gutenberg EBook of The Declaration of Independence\r\n\r\nCopyright laws are changing all over the world. Be sure to check the\r\ncopyright laws for your country before downloading or redistribu"
GetArticleText <- function(langCode, titles) {
  # Given a langCode ("en", "de", etc.) and a vector of article titles,
  # returns a data frame with the text of the specified articles in
  # the specified language.
  texts <- sapply(titles, function(t) {
    resp <- GET(
      paste("https://", langCode, ".wikipedia.org/w/api.php", sep = ""),
      query = list(
        action = "query",
        prop = "extracts",
        format = "json",
        explaintext = "",
        titles = t
      )
    )
    js <- content(resp, "parsed")
    return(js$query$pages[[1]]$extract)
  })
  return(data.frame(title = titles, text = texts, stringsAsFactors = FALSE, row.names = NULL))
}
# Get the text for
# https://en.wikipedia.org/wiki/Macalester_College,
# https://en.wikipedia.org/wiki/Carleton_College, and
# https://en.wikipedia.org/wiki/University_of_Minnesota in English ("en").
# We could also get the text for the Spanish ("es") or German ("de") articles.
school_wiki_titles <- c('Macalester College', 'Carleton College', 'University of Minnesota')
school_df <- GetArticleText('en', school_wiki_titles)
| title | text |
|---|---|
| Macalester College | Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f… |
| Carleton College | Carleton College ( KARL-tin) is a private liberal arts college in Northfield, Minnesota, US. Founded in 1866, the college enrolled 2,105 undergraduate students and employed 269 faculty members in fall… |
| University of Minnesota | The University of Minnesota, Twin Cities (the U of M, UMN, Minnesota) is a public research university in the Twin Cities of Minneapolis and Saint Paul, Minnesota. The Twin Cities campus comprises loca… |
We’ll analyze these documents further below.
Exercise: Create a data frame with just the text for some article that interests you. I will use Macalester College, but you should pick something you are interested in!
mac_df <- GetArticleText("en", c("Macalester College"))
| title | text |
|---|---|
| Macalester College | Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f… |
Convert to tidy representation:
tidy_mac <- mac_df %>%
  unnest_tokens(word, text)
head(tidy_mac)
## title word
## 1 Macalester College macalester
## 1.1 Macalester College college
## 1.2 Macalester College is
## 1.3 Macalester College a
## 1.4 Macalester College private
## 1.5 Macalester College liberal
nrow(tidy_mac)
## [1] 3802
We can also find the most frequently used words by using dplyr’s count function, which creates a frequency table for (in our case) words:
# Create and display frequency count table
all_mac_counts <- tidy_mac %>%
  count(word, sort = TRUE)
all_mac_counts %>% head(5)
## word n
## 1 the 219
## 2 and 137
## 3 of 124
## 4 in 101
## 5 macalester 85
Stop words are words that are so frequent they provide no real informational signal. We will typically ignore them.
# Load stop words dataset and display it
data(stop_words)
head(stop_words)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
dim(stop_words)
## [1] 1149 2
# Create and display frequency count table after removing stop words from the dataset
mac_counts <- tidy_mac %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
head(mac_counts)
## word n
## 1 macalester 85
## 2 students 37
## 3 college 34
## 4 campus 28
## 5 student 22
## 6 minnesota 19
Another way to analyze words is through visualization. The code below uses a common tool, a word cloud (sometimes called a Wordle), to visualize word frequencies.
library(wordcloud)
# Show a word cloud with some customized options
wordcloud(mac_counts$word,          # column of words
          mac_counts$n,             # column of frequencies
          scale = c(5, 0.2),        # range of font sizes of words
          min.freq = 2,             # minimum word frequency to show
          max.words = 200,          # show the 200 most frequent words
          random.order = FALSE,     # position the most popular words first
          colors = brewer.pal(8, "Dark2"))  # color palette
Let’s now create a tidy text data frame with the three Wikipedia documents we collected above via the API. Remember that in the tidy text format, each row corresponds to a single word in a single document.
| title | text |
|---|---|
| Macalester College | Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f… |
| Carleton College | Carleton College ( KARL-tin) is a private liberal arts college in Northfield, Minnesota, US. Founded in 1866, the college enrolled 2,105 undergraduate students and employed 269 faculty members in fall… |
| University of Minnesota | The University of Minnesota, Twin Cities (the U of M, UMN, Minnesota) is a public research university in the Twin Cities of Minneapolis and Saint Paul, Minnesota. The Twin Cities campus comprises loca… |
# Unnest the data frame so each row corresponds to a single word in a single document.
tidy_school_df <- school_df %>%
  unnest_tokens(word, text)
head(tidy_school_df)
## title word
## 1 Macalester College macalester
## 1.1 Macalester College college
## 1.2 Macalester College is
## 1.3 Macalester College a
## 1.4 Macalester College private
## 1.5 Macalester College liberal
# Generate counts
school_counts <-
  tidy_school_df %>%
  anti_join(stop_words) %>%
  count(title, word, sort = TRUE)
## Joining, by = "word"
head(school_counts)
## title word n
## 1 University of Minnesota university 89
## 2 Macalester College macalester 85
## 3 University of Minnesota minnesota 71
## 4 Carleton College carleton 68
## 5 University of Minnesota campus 68
## 6 Carleton College college 44
interesting_words <- c(
  "liberal",
  "education",
  "research",
  "teaching",
  "lgbtq",
  "football"
)
school_counts %>%
  filter(word %in% interesting_words) %>%
  spread(title, n)
## word Carleton College Macalester College University of Minnesota
## 1 education 6 8 5
## 2 football 1 5 9
## 3 lgbtq NA 4 NA
## 4 liberal 13 9 1
## 5 research 3 4 16
## 6 teaching 3 1 1
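Note: spread() still works, but it has been superseded in recent versions of tidyr. If you prefer the newer interface, the equivalent call (a sketch that should produce the same table) uses pivot_wider():
school_counts %>%
  filter(word %in% interesting_words) %>%
  pivot_wider(names_from = title, values_from = n)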
A common way to analyze single documents is to look at how often different words occur:
mac_freqs <-
  school_counts %>%
  filter(title == "Macalester College")

mac_freqs %>%
  head(10)
## title word n
## 1 Macalester College macalester 85
## 2 Macalester College students 37
## 3 Macalester College college 34
## 4 Macalester College campus 28
## 5 Macalester College student 22
## 6 Macalester College minnesota 19
## 7 Macalester College house 15
## 8 Macalester College hall 14
## 9 Macalester College university 14
## 10 Macalester College arts 13
# We often normalize by the number of words in a document.
mac_freqs %>%
  mutate(tf = n / sum(n)) %>%
  head(10)
## title word n tf
## 1 Macalester College macalester 85 0.037215412
## 2 Macalester College students 37 0.016199650
## 3 Macalester College college 34 0.014886165
## 4 Macalester College campus 28 0.012259194
## 5 Macalester College student 22 0.009632224
## 6 Macalester College minnesota 19 0.008318739
## 7 Macalester College house 15 0.006567426
## 8 Macalester College hall 14 0.006129597
## 9 Macalester College university 14 0.006129597
## 10 Macalester College arts 13 0.005691769
We will now see how to compare documents. To start, we need to create a corpus, which is a collection of documents.
I will use the Wikipedia API to collect Wikipedia text describing ACM schools, but…
Exercise: I would like you to build up your own corpus that is interesting to you containing at least five different documents that are somehow related.
school_wiki_titles <- c(
  'Beloit College',
  'Carleton College',
  'Coe College',
  'Colorado College',
  'Cornell College',
  'Grinnell College',
  'Knox College',
  'Lake Forest College',
  'Lawrence University',
  'Luther College',
  'Macalester College',
  'Monmouth College',
  'Ripon College',
  'St. Olaf College')

all_colleges <- GetArticleText("en", school_wiki_titles)
| title | text |
|---|---|
| Beloit College | Beloit College is a private liberal arts college in Beloit, Wisconsin. Foun… |
| Carleton College | Carleton College ( KARL-tin) is a private liberal arts college in Northfiel… |
| Coe College | Coe College is a private liberal arts college in Cedar Rapids, Iowa. Founde… |
| Colorado College | The Colorado College is a private liberal arts college in Colorado Springs,… |
| Cornell College | Cornell College is a private liberal arts college in Mount Vernon, Iowa. Or… |
| Grinnell College | Grinnell College is a private liberal arts college in Grinnell, Iowa. It wa… |
| Knox College | Knox College may refer to: Knox College (Illinois), a four-year coeducatio… |
| Lake Forest College | Lake Forest College is a private liberal arts college in Lake Forest, Illin… |
| Lawrence University | Lawrence University is a private liberal arts college and conservatory of m… |
| Luther College | Luther College is the name of several educational institutions: == Austra… |
| Macalester College | Macalester College () is a private liberal arts college in Saint Paul, Minn… |
| Monmouth College | Monmouth College is a private Presbyterian liberal arts college in Monmouth… |
| Ripon College | Ripon College may refer to: Ripon College (Wisconsin), a liberal arts coll… |
| St. Olaf College | St. Olaf College is a private liberal arts college in Northfield, Minnesota… |
We will first calculate term frequency, which is the number of times each word occurs:
all_school_counts <-
  all_colleges %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(title, word, sort = TRUE)
## Joining, by = "word"
While counts are a reasonable start, they are not perfect. For example, the word "students" probably has a high frequency count for every college, so it is not particularly distinctive within this corpus.
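As a quick check of that intuition, we can look at the raw count of "students" in each article (a sketch using the all_school_counts table computed above):
# How often does "students" appear in each article?
all_school_counts %>%
  filter(word == "students") %>%
  arrange(desc(n))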
A better measure of distinctiveness is TF-IDF ("term frequency-inverse document frequency").
TF (term frequency) measures the number of times a word appears in a document. It is often normalized by dividing by the length of the document, so that it represents the fraction of the document's words that are that particular word.
IDF (inverse document frequency), on the other hand, measures how widespread a word is across the corpus and penalizes overly general words: it is the log of the total number of documents divided by the number of documents in which the term appears. A word's TF-IDF score in a document is its TF multiplied by its IDF.
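To make the definition concrete, here is a rough by-hand sketch using dplyr and the all_school_counts data frame from above (the names n_docs and manual_tf_idf are just illustrative):
# By-hand TF-IDF, for intuition only.
n_docs <- n_distinct(all_school_counts$title)

manual_tf_idf <- all_school_counts %>%
  group_by(title) %>%
  mutate(tf = n / sum(n)) %>%                        # fraction of this document's words
  ungroup() %>%
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(title))) %>%  # log(total docs / docs containing the word)
  ungroup() %>%
  mutate(tf_idf = tf * idf)

manual_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  head(5)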
Luckily, tidytext does this entirely for us!
with_tf_idf <-
  all_school_counts %>%
  bind_tf_idf(word, title, n) %>%
  arrange(desc(tf_idf))
with_tf_idf %>% filter(title=='Macalester College') %>% head(10)
## title word n tf idf tf_idf
## 1 Macalester College macalester 85 0.037215412 2.639057 0.098213605
## 2 Macalester College mac 13 0.005691769 2.639057 0.015020904
## 3 Macalester College minnesota 19 0.008318739 1.252763 0.010421408
## 4 Macalester College engagement 7 0.003064799 2.639057 0.008088179
## 5 Macalester College macalester's 7 0.003064799 2.639057 0.008088179
## 6 Macalester College sustainability 13 0.005691769 1.252763 0.007130437
## 7 Macalester College civic 6 0.002626970 2.639057 0.006932725
## 8 Macalester College summit 5 0.002189142 2.639057 0.005777271
## 9 Macalester College semester 10 0.004378284 1.252763 0.005484952
## 10 Macalester College gender 8 0.003502627 1.540445 0.005395604
Exercise: Use the TF-IDF scores to identify the five highest-scoring TF-IDF words in each document. Compare these to the words with the highest raw counts in each document. What do you notice? (One possible starting point is sketched below.)
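Here is one possible starting point, a sketch that assumes the with_tf_idf and all_school_counts data frames built above (slice_max() requires dplyr 1.0.0 or later):
# Five highest-scoring TF-IDF words per document.
with_tf_idf %>%
  group_by(title) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup()

# For comparison: five most frequent (non-stop) words per document.
all_school_counts %>%
  group_by(title) %>%
  slice_max(n, n = 5) %>%
  ungroup()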