library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(httr)

This tutorial is based heavily on http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#bags-of-words and is adapted from the R pipeline.

I would like you to create your own document from scratch (don’t copy and paste the whole thing) and work through this activity on your own. Note that for most of the analysis the instructions will ask you to use your own data rather than the data I use below.

Text Acquisition

Suggested reading: Text Mining with R: Tidy Text Mining

String Literals

# A single sentence from the U.S. Declaration of Independence, stored as a
# string literal.
us_dec_sentence <- "We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness."

# How many characters does the sentence contain?
nchar(us_dec_sentence)
## [1] 209
# Print the sentence.
us_dec_sentence
## [1] "We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness."

Reading .txt Files

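The `read_file()` function from readr reads an entire text file, given as a local path or a URL, into a single string.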

library(readr)

# Read the Project Gutenberg copy of the Declaration of Independence from the
# Internet Archive into one string.
us_dec <- read_file(
  "https://ia800305.us.archive.org/29/items/unitedstatesdecl00001gut/when12.txt"
)
# Number of characters in the downloaded text.
nchar(us_dec)
## [1] 24863
# Preview the beginning of the text, trimmed to a width of 200.
strtrim(us_dec, 200)
## [1] "The Project Gutenberg EBook of The Declaration of Independence\r\n\r\nCopyright laws are changing all over the world. Be sure to check the\r\ncopyright laws for your country before downloading or redistribu"

Via Web APIs

GetArticleText <- function(langCode, titles) {
  # Fetch the plain-text extract of Wikipedia articles through the
  # MediaWiki query API, issuing one GET request per title.
  #
  # Args:
  #   langCode: Wikipedia language code ("en", "de", etc.); it becomes the
  #             subdomain of the API URL.
  #   titles:   Character vector of article titles.
  #
  # Returns:
  #   A data frame with one row per element of `titles` and two character
  #   columns: `title` (the title as given) and `text` (the extract, or NA
  #   when the response contains no extract for that title).
  #
  # Raises:
  #   An httr HTTP error if any request returns a failing status code.
  endpoint <- paste0("https://", langCode, ".wikipedia.org/w/api.php")

  fetch_extract <- function(title) {
    resp <- GET(
      endpoint,
      query = list(
        action = "query",
        prop = "extracts",
        format = "json",
        explaintext = "",
        titles = title
      )
    )
    # Fail with a clear HTTP error rather than a cryptic subscript error
    # when the body of a failed request cannot be navigated below.
    stop_for_status(resp)

    js <- content(resp, as = "parsed")
    pages <- js$query$pages
    # Only the first page of the response is used, one title per request.
    extract <- if (length(pages) > 0) pages[[1]]$extract else NULL
    # A response without an extract would otherwise yield NULL, which
    # cannot be stored in a character vector; record it as missing.
    if (is.null(extract)) NA_character_ else extract
  }

  # vapply() always yields a character vector (even for zero titles), so the
  # `text` column is well-formed in every case.
  texts <- vapply(titles, fetch_extract, character(1))

  data.frame(
    title = titles,
    text = texts,
    stringsAsFactors = FALSE,
    row.names = NULL
  )
}
# Fetch the English ("en") text of three Wikipedia articles:
# https://en.wikipedia.org/wiki/Macalester_College,
# https://en.wikipedia.org/wiki/Carleton_College, and
# https://en.wikipedia.org/wiki/University_of_Minnesota.
# Passing "es" or "de" instead would request the Spanish or German articles.

school_wiki_titles <- c(
  "Macalester College",
  "Carleton College",
  "University of Minnesota"
)
school_df <- GetArticleText("en", school_wiki_titles)
title text
Macalester College Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f…
Carleton College Carleton College ( KARL-tin) is a private liberal arts college in Northfield, Minnesota, US. Founded in 1866, the college enrolled 2,105 undergraduate students and employed 269 faculty members in fall…
University of Minnesota The University of Minnesota, Twin Cities (the U of M, UMN, Minnesota) is a public research university in the Twin Cities of Minneapolis and Saint Paul, Minnesota. The Twin Cities campus comprises loca…

We’ll analyze these documents further below.

Analyzing Single Documents

Exercise: Create a data frame with just the text for some article that interests you. I will use Macalester College, but you should pick something you are interested in!

# One-row data frame holding the text of the Macalester College article.
mac_df <- GetArticleText("en", "Macalester College")
title text
Macalester College Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f…

Convert to tidy representation:

# Split the `text` column into one lowercase token per row, stored in `word`.
tidy_mac <- unnest_tokens(mac_df, word, text)

tidy_mac %>%
  head()
##                  title       word
## 1   Macalester College macalester
## 1.1 Macalester College    college
## 1.2 Macalester College         is
## 1.3 Macalester College          a
## 1.4 Macalester College    private
## 1.5 Macalester College    liberal
# Total number of tokens in the article.
nrow(tidy_mac)
## [1] 3802

We can also find the most frequently used words by using dplyr’s count function, which creates a frequency table for (in our case) words:

# Tabulate how often each word occurs, most frequent first, and show the top 5.
all_mac_counts <- count(tidy_mac, word, sort = TRUE)
head(all_mac_counts, 5)
##         word   n
## 1        the 219
## 2        and 137
## 3         of 124
## 4         in 101
## 5 macalester  85

Stop Words

Stop words are words that are so frequent they provide no real informational signal. We will typically ignore them.

# Load the stop word dataset and look at its first rows.
data(stop_words)
stop_words %>%
  head()
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>
## 1 a         SMART
## 2 a's       SMART
## 3 able      SMART
## 4 about     SMART
## 5 above     SMART
## 6 according SMART
# Rows and columns in the stop word table.
dim(stop_words)
## [1] 1149    2
# Drop every token that appears in the stop word list, then rebuild the
# frequency table. The join key is left implicit (the shared `word` column).
mac_counts <- count(anti_join(tidy_mac, stop_words), word, sort = TRUE)
## Joining, by = "word"
mac_counts %>%
  head()
##         word  n
## 1 macalester 85
## 2   students 37
## 3    college 34
## 4     campus 28
## 5    student 22
## 6  minnesota 19

Word Clouds

Another way to analyze words is through visualization. The code below shows a common tool (a Word cloud or Wordle) that can visualize word frequencies.

library(wordcloud)

# Draw a word cloud of the stop-word-filtered counts with customized options.
wordcloud(
  words = mac_counts$word,           # column of words
  freq = mac_counts$n,               # column of frequencies
  scale = c(5, 0.2),                 # range of font sizes of words
  min.freq = 2,                      # minimum word frequency to show
  max.words = 200,                   # show at most the 200 most frequent words
  random.order = FALSE,              # position the most popular words first
  colors = brewer.pal(8, "Dark2")    # color palette
)

Comparing the text in two (or more) documents.

Let’s now create a TidyText data frame with the three Wikipedia documents we collected above via the API. Remember that the TidyText data frame has one row for each word.

title text
Macalester College Macalester College () is a private liberal arts college in Saint Paul, Minnesota. Founded in 1874, Macalester is exclusively an undergraduate four-year institution and enrolled 2,174 students in the f…
Carleton College Carleton College ( KARL-tin) is a private liberal arts college in Northfield, Minnesota, US. Founded in 1866, the college enrolled 2,105 undergraduate students and employed 269 faculty members in fall…
University of Minnesota The University of Minnesota, Twin Cities (the U of M, UMN, Minnesota) is a public research university in the Twin Cities of Minneapolis and Saint Paul, Minnesota. The Twin Cities campus comprises loca…
# Tokenize so that each row is a single word from a single document.
tidy_school_df <- unnest_tokens(school_df, word, text)
tidy_school_df %>%
  head()
##                  title       word
## 1   Macalester College macalester
## 1.1 Macalester College    college
## 1.2 Macalester College         is
## 1.3 Macalester College          a
## 1.4 Macalester College    private
## 1.5 Macalester College    liberal
# Remove stop words, then count each word within each document.
school_counts <- count(
  anti_join(tidy_school_df, stop_words),
  title, word,
  sort = TRUE
)
## Joining, by = "word"
school_counts %>%
  head()
##                     title       word  n
## 1 University of Minnesota university 89
## 2      Macalester College macalester 85
## 3 University of Minnesota  minnesota 71
## 4        Carleton College   carleton 68
## 5 University of Minnesota     campus 68
## 6        Carleton College    college 44

Brainstorm how to “score” the importance of the following words:

# Words whose per-school counts we want to compare side by side.
interesting_words <- c(
  "liberal",
  "education",
  "research",
  "teaching",
  "lgbtq",
  "football"
)

# Reshape to one row per word and one column per school. `names_sort` orders
# the school columns alphabetically and `arrange()` orders the rows by word.
# A word with no count row for a school shows up as NA.
school_counts %>%
  filter(word %in% interesting_words) %>%
  pivot_wider(names_from = title, values_from = n, names_sort = TRUE) %>%
  arrange(word)
##        word Carleton College Macalester College University of Minnesota
## 1 education                6                  8                       5
## 2  football                1                  5                       9
## 3     lgbtq               NA                  4                      NA
## 4   liberal               13                  9                       1
## 5  research                3                  4                      16
## 6  teaching                3                  1                       1

Term Frequency

A common way to analyze single documents is to look at how often different words occur:

# Keep only the Macalester rows of the per-school word counts.
mac_freqs <- filter(school_counts, title == "Macalester College")
head(mac_freqs, 10)
##                 title       word  n
## 1  Macalester College macalester 85
## 2  Macalester College   students 37
## 3  Macalester College    college 34
## 4  Macalester College     campus 28
## 5  Macalester College    student 22
## 6  Macalester College  minnesota 19
## 7  Macalester College      house 15
## 8  Macalester College       hall 14
## 9  Macalester College university 14
## 10 Macalester College       arts 13
# Normalize each count by the total of all counts in the document.
head(mutate(mac_freqs, tf = n / sum(n)), 10)
##                 title       word  n          tf
## 1  Macalester College macalester 85 0.037215412
## 2  Macalester College   students 37 0.016199650
## 3  Macalester College    college 34 0.014886165
## 4  Macalester College     campus 28 0.012259194
## 5  Macalester College    student 22 0.009632224
## 6  Macalester College  minnesota 19 0.008318739
## 7  Macalester College      house 15 0.006567426
## 8  Macalester College       hall 14 0.006129597
## 9  Macalester College university 14 0.006129597
## 10 Macalester College       arts 13 0.005691769

Document Frequency

We will now see how to compare documents. To start, we need to create a corpus, which is a collection of documents.

I will use the Wikipedia API to collect Wikipedia text describing ACM schools, but…

Exercise: I would like you to build up your own corpus that is interesting to you containing at least five different documents that are somehow related.

# Wikipedia article titles for the corpus, one per school.
school_wiki_titles <- c(
  "Beloit College",
  "Carleton College",
  "Coe College",
  "Colorado College",
  "Cornell College",
  "Grinnell College",
  "Knox College",
  "Lake Forest College",
  "Lawrence University",
  "Luther College",
  "Macalester College",
  "Monmouth College",
  "Ripon College",
  "St. Olaf College"
)

# Fetch the English article text for every title in the corpus.
all_colleges <- GetArticleText("en", school_wiki_titles)
title text
Beloit College Beloit College is a private liberal arts college in Beloit, Wisconsin. Foun…
Carleton College Carleton College ( KARL-tin) is a private liberal arts college in Northfiel…
Coe College Coe College is a private liberal arts college in Cedar Rapids, Iowa. Founde…
Colorado College The Colorado College is a private liberal arts college in Colorado Springs,…
Cornell College Cornell College is a private liberal arts college in Mount Vernon, Iowa. Or…
Grinnell College Grinnell College is a private liberal arts college in Grinnell, Iowa. It wa…
Knox College Knox College may refer to: Knox College (Illinois), a four-year coeducatio…
Lake Forest College Lake Forest College is a private liberal arts college in Lake Forest, Illin…
Lawrence University Lawrence University is a private liberal arts college and conservatory of m…
Luther College Luther College is the name of several educational institutions: == Austra…
Macalester College Macalester College () is a private liberal arts college in Saint Paul, Minn…
Monmouth College Monmouth College is a private Presbyterian liberal arts college in Monmouth…
Ripon College Ripon College may refer to: Ripon College (Wisconsin), a liberal arts coll…
St. Olaf College St. Olaf College is a private liberal arts college in Northfield, Minnesota…

Term Frequency

We will first calculate term frequency, which is the number of times each word occurs:

# Tokenize every article, drop stop words, and count each word per school.
all_school_words <- NULL
all_school_counts <- count(
  anti_join(unnest_tokens(all_colleges, word, text), stop_words),
  title, word,
  sort = TRUE
)
## Joining, by = "word"

Tf-Idf

While counts are a reasonable start, they are not perfect. For example, student probably has a high frequency count for all colleges, so it is not particularly distinctive in this corpus.

A better measure of distinctiveness is Tf-IDF (“Term-Frequency Inverse-Document Frequency”).

TF (term frequency) measures the number of times a word appears in a document. It is often normalized by dividing by the length of the document, so that it represents the fraction of the document’s words that are that particular word.

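IDF, on the other hand, measures how widespread a word is across the corpus and penalizes “overly-general” words. The actual measure is the “Inverse Document Frequency” (idf): the natural log of the inverse of the fraction of documents the term appears in, $\mathrm{idf}(t) = \ln(N / n_t)$, where $N$ is the number of documents in the corpus and $n_t$ is the number of documents that contain the term $t$. For example, a word that appears in only one of our 14 documents has $\mathrm{idf} = \ln(14/1) \approx 2.64$.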

Luckily, tidytext does this entirely for us!

# Add tf, idf and tf_idf columns to the per-school counts, highest tf-idf first.
with_tf_idf <- all_school_counts %>%
  bind_tf_idf(term = word, document = title, n = n) %>%
  arrange(desc(tf_idf))

# The ten highest-scoring words for Macalester.
head(filter(with_tf_idf, title == "Macalester College"), 10)
##                 title           word  n          tf      idf      tf_idf
## 1  Macalester College     macalester 85 0.037215412 2.639057 0.098213605
## 2  Macalester College            mac 13 0.005691769 2.639057 0.015020904
## 3  Macalester College      minnesota 19 0.008318739 1.252763 0.010421408
## 4  Macalester College     engagement  7 0.003064799 2.639057 0.008088179
## 5  Macalester College   macalester's  7 0.003064799 2.639057 0.008088179
## 6  Macalester College sustainability 13 0.005691769 1.252763 0.007130437
## 7  Macalester College          civic  6 0.002626970 2.639057 0.006932725
## 8  Macalester College         summit  5 0.002189142 2.639057 0.005777271
## 9  Macalester College       semester 10 0.004378284 1.252763 0.005484952
## 10 Macalester College         gender  8 0.003502627 1.540445 0.005395604

Analyzing your Tf-Idf matrix

Exercise: Use the Tf-Idf matrix to identify the five highest scoring Tf-Idf words in each document. Compare these to the words with the highest counts in each document. What do you notice?