Source: University of Virginia Library, Leah Malkovich. Nov 27, 2018.

Beginner’s Guide

Quanteda works with three major text components: the corpus, the document-feature matrix (dfm), and tokens.

The corpus is the entire body of text, such as a book or a chapter of a book. The document-feature matrix (dfm) organizes the tokenized words into columns, which makes analysis easier, and it can be inspected with the view() function. Tokens are the individual words split out of the corpus.

Load the libraries

The corpus used in this guide will be from Project Gutenberg and will use Pride and Prejudice by Jane Austen and A Tale of Two Cities by Charles Dickens.

# if you need to install
# install.packages('quanteda')
library(quanteda)

library(tidyverse)

# readtext allows for .txt, .csv, .json, .doc and .pdf files
library(readtext)

# Project Gutenberg books
library(gutenbergr)

Load the Corpus

# Create a scratch directory named "tmp" (relative to the working directory)
# to hold the downloaded texts.
dir.create("tmp")
# Download Pride and Prejudice from Project Gutenberg. The destination file
# name is written as title_author_year_language.
download.file(
  "https://www.gutenberg.org/files/1342/1342-0.txt",
  "tmp/Pride and Prejudice_Jane Austen_2008_English.txt"
)
trying URL 'https://www.gutenberg.org/files/1342/1342-0.txt'
Content type 'text/plain' length 798774 bytes (780 KB)
==================================================
downloaded 780 KB
# Download A Tale of Two Cities from Project Gutenberg, using the same
# title_author_year_language file-name layout.
download.file(
  "https://www.gutenberg.org/files/98/98-0.txt",
  "tmp/A Tale of Two Cities_Charles Dickens_2009_English.txt"
)
trying URL 'https://www.gutenberg.org/files/98/98-0.txt'
Content type 'text/plain' length 807231 bytes (788 KB)
==================================================
downloaded 788 KB
# Read every .txt file in tmp/ into a data frame. The document variables are
# taken from the "_"-separated parts of each file name.
dataframe <- readtext(
  file = "tmp/*.txt",
  encoding = "UTF-8",
  dvsep = "_",
  docvarsfrom = "filenames",
  docvarnames = c("title", "author", "year uploaded", "language")
)

# The texts have been read in, so the scratch directory can be removed.
unlink("tmp", recursive = TRUE)

# Build the quanteda corpus from the data frame and show its summary.
doc_corpus <- corpus(dataframe)
summary(doc_corpus)
Corpus consisting of 2 documents, showing 2 documents:

                                                  Text Types Tokens Sentences
 A Tale of Two Cities_Charles Dickens_2009_English.txt 11584 170042      7931
      Pride and Prejudice_Jane Austen_2008_English.txt  7469 147567      6213
                title          author year.uploaded language
 A Tale of Two Cities Charles Dickens          2009  English
  Pride and Prejudice     Jane Austen          2008  English

Clean & Tokenize

Next we need to clean and tokenize the corpus. Tokenizing splits the text into individual tokens (words and, at first, punctuation marks as well).

# Split each document of the corpus into tokens, then display the result.
doc_tokens <- tokens(doc_corpus)
doc_tokens
Tokens consisting of 2 documents and 4 docvars.
A Tale of Two Cities_Charles Dickens_2009_English.txt :
 [1] "The"       "Project"   "Gutenberg" "eBook"     "of"        "A"         "Tale"     
 [8] "of"        "Two"       "Cities"    ","         "by"       
[ ... and 170,030 more ]

Pride and Prejudice_Jane Austen_2008_English.txt :
 [1] "The"       "Project"   "Gutenberg" "eBook"     "of"        "Pride"     "and"      
 [8] "Prejudice" ","         "by"        "Jane"      "Austen"   
[ ... and 147,555 more ]

Now that the corpus is tokenized, we can call tokens() again on the result to clean it further by removing punctuation, numbers and symbols.

# Run tokens() on the existing tokens object, this time dropping symbols,
# numbers and punctuation, then display the result.
doc_tokens <- tokens(
  doc_tokens,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_punct = TRUE
)
doc_tokens
Tokens consisting of 2 documents and 4 docvars.
A Tale of Two Cities_Charles Dickens_2009_English.txt :
 [1] "The"       "Project"   "Gutenberg" "eBook"     "of"        "A"         "Tale"     
 [8] "of"        "Two"       "Cities"    "by"        "Charles"  
[ ... and 139,561 more ]

Pride and Prejudice_Jane Austen_2008_English.txt :
 [1] "The"       "Project"   "Gutenberg" "eBook"     "of"        "Pride"     "and"      
 [8] "Prejudice" "by"        "Jane"      "Austen"    "This"     
[ ... and 124,919 more ]

Now we need to remove stop words from our tokens. Stop words are very common words such as “the”, “and”, “it”, etc.; removing them leads to a more meaningful analysis.

# Drop the English stop words. tokens_remove() is the shorthand for
# tokens_select(..., selection = "remove"). Check the spelling of the
# language name passed to stopwords().
doc_tokens <- tokens_remove(doc_tokens, stopwords("english"))
doc_tokens
Tokens consisting of 2 documents and 4 docvars.
A Tale of Two Cities_Charles Dickens_2009_English.txt :
 [1] "Project"   "Gutenberg" "eBook"     "Tale"      "Two"       "Cities"    "Charles"  
 [8] "Dickens"   "eBook"     "use"       "anyone"    "anywhere" 
[ ... and 65,212 more ]

Pride and Prejudice_Jane Austen_2008_English.txt :
 [1] "Project"   "Gutenberg" "eBook"     "Pride"     "Prejudice" "Jane"      "Austen"   
 [8] "eBook"     "use"       "anyone"    "anywhere"  "United"   
[ ... and 56,325 more ]

Stem the tokens

Stemming reduces each word to its stem. For example, the stem of the word “dance” is danc, to which endings such as ing or ed may be attached in a text.

# Reduce every token to its word stem, then display the result.
# NOTE(review): stemming runs before lowercasing, so capitalised tokens are
# stemmed as written (the printed output shows "United" -> "Unite"); check
# whether lowercasing first is the intended order -- TODO confirm.
doc_tokens <- tokens_wordstem(doc_tokens)
doc_tokens
Tokens consisting of 2 documents and 4 docvars.
A Tale of Two Cities_Charles Dickens_2009_English.txt :
 [1] "Project"   "Gutenberg" "eBook"     "Tale"      "Two"       "Citi"      "Charl"    
 [8] "Dicken"    "eBook"     "use"       "anyon"     "anywher"  
[ ... and 65,212 more ]

Pride and Prejudice_Jane Austen_2008_English.txt :
 [1] "Project"   "Gutenberg" "eBook"     "Pride"     "Prejudic"  "Jane"      "Austen"   
 [8] "eBook"     "use"       "anyon"     "anywher"   "Unite"    
[ ... and 56,325 more ]

Now we can make all the word stems lowercase for standardization.

# Convert every stem to lower case, then display the result.
doc_tokens <- tokens_tolower(doc_tokens)
doc_tokens
Tokens consisting of 2 documents and 4 docvars.
A Tale of Two Cities_Charles Dickens_2009_English.txt :
 [1] "project"   "gutenberg" "ebook"     "tale"      "two"       "citi"      "charl"    
 [8] "dicken"    "ebook"     "use"       "anyon"     "anywher"  
[ ... and 65,212 more ]

Pride and Prejudice_Jane Austen_2008_English.txt :
 [1] "project"   "gutenberg" "ebook"     "pride"     "prejudic"  "jane"      "austen"   
 [8] "ebook"     "use"       "anyon"     "anywher"   "unite"    
[ ... and 56,325 more ]

Summary of the word tokens

# Summarise the tokens object; the printed Length column is the number of
# tokens remaining in each document.
summary(doc_tokens)
                                                      Length Class  Mode     
A Tale of Two Cities_Charles Dickens_2009_English.txt 65224  -none- character
Pride and Prejudice_Jane Austen_2008_English.txt      56337  -none- character

Convert to DFM

After tokenizing the corpus and cleaning the tokens down to stems, convert them to a document-feature matrix (dfm).

# Build the document-feature matrix from the cleaned tokens and display it.
doc_dfm <- dfm(doc_tokens)
doc_dfm
Document-feature matrix of: 2 documents, 7,972 features (31.27% sparse) and 4 docvars.
                                                       features
docs                                                    project gutenberg ebook tale two
  A Tale of Two Cities_Charles Dickens_2009_English.txt      91        31    20    6 214
  Pride and Prejudice_Jane Austen_2008_English.txt           90        31    21    0 131
                                                       features
docs                                                    citi charl dicken use anyon
  A Tale of Two Cities_Charles Dickens_2009_English.txt   41   102      3  78     5
  Pride and Prejudice_Jane Austen_2008_English.txt         2     7      0  63    26
[ reached max_nfeat ... 7,962 more features ]

Analysis

We can use the kwic() (keywords-in-context) function, which shows specific words together with the context in which they appear.

If we want to know where the word love appears in the corpus, together with a short context around each occurrence, we set window = n, where n is the number of words shown on either side of the keyword. The kwic() function also returns the location of each instance of the word.

# kwic() needs the tokenized corpus; passing doc_dfm would return an error.
# Show the first matches for "love" with three words of context per side.
head(
  kwic(doc_tokens, pattern = "love", window = 3)
)
Keyword-in-context with 6 matches.
NA

Top Features

What are the most used words in the corpus? We can use the topfeatures() function. It takes the number of features to return as n, so for a top 10, set n = 10.

# need to use the dfm
# topfeatures() works on the dfm; list the ten features with the highest
# totals, largest first.
topfeatures(doc_dfm, n = 10, decreasing = TRUE)
       mr      said       one      look elizabeth      time      know      miss      much 
     1406      1063       721       646       634       541       540       525       513 
      now 
      465 

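If you want document frequencies instead (the number of documents in which each feature appears, rather than its total count), set scheme = 'docfreq'.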

# need to use the dfm
# scheme = "docfreq" ranks features by the number of documents they occur in
# (at most 2 here, since the corpus holds two documents).
topfeatures(doc_dfm, scheme = "docfreq", n = 10, decreasing = TRUE)
  project gutenberg     ebook       two      citi     charl       use     anyon   anywher 
        2         2         2         2         2         2         2         2         2 
    unite 
        2 

The end

This has covered the basics of text analysis using Quanteda; there is much more to learn from here.

LS0tCnRpdGxlOiAiVGV4dCBBbmFseXNpcyB3aXRoIFF1YW50ZWRhIgpvdXRwdXQ6IAogIGh0bWxfbm90ZWJvb2s6IAogICAgdG9jOiB5ZXMKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgdGhlbWU6CiAgICAgIGJnOiAiIzIwMjEyMyIKICAgICAgZmc6ICIjQjhCQ0MyIgogICAgICBwcmltYXJ5OiAiI0VBODBGQyIKICAgICAgc2Vjb25kYXJ5OiAiIzAwREFDNiIKICAgICAgYmFzZV9mb250OgogICAgICAgIGdvb2dsZTogTGF0bwogICAgICBoZWFkaW5nX2ZvbnQ6CiAgICAgICAgZ29vZ2xlOiBVYnVudHUKCi0tLQoKYGBge3IgZXZhbD1GQUxTRSwgaW5jbHVkZT1GQUxTRX0KIyBzb3VyY2U6IGh0dHBzOi8vZGF0YS5saWJyYXJ5LnZpcmdpbmlhLmVkdS9hLWJlZ2lubmVycy1ndWlkZS10by10ZXh0LWFuYWx5c2lzLXdpdGgtcXVhbnRlZGEvCgoKYGBgCgpTb3VyY2U6ICpVbml2ZXJzaXR5IG9mIFZpcmdpbmlhIExpYnJhcnkqLCBMZWFoIE1hbGtvdmljaC4gTm92IDI3LCAyMDE4LgoKCiMjIEJlZ2lubmVyJ3MgR3VpZGUKClF1YW50ZWRhIGhhcyAzIG1ham9yIGNvbXBvbmVudHMgb2YgdGV4dDoKCi0gdGhlIGNvcnB1cwotICoqZG9jdW1lbnQtZmVhdHVyZS1tYXRyaXgqKiAoZGZtKSAKLSB0b2tlbnMKClRoZSBjb3JwdXMgaXMgdGhlIGVudGlyZSB0ZXh0IGJvZHkgb2JqZWN0LCBzdWNoIGFzIGEgYm9vayBvciBjaGFwdGVyIG9mIGEgYm9vay4gCkRvY3VtZW50IGZlYXR1cmUgbWF0cml4IG9yZ2FuaXplcyB0b2tlbml6ZWQgd29yZHMgaW50byBjb2x1bW5zLCB3aGljaCBtYWtlcyBkb2luZyBhbmFseXNpcyBlYXNpZXIgYW5kIGlzIHZpZXdhYmxlIHdpdGggYHZpZXcoKWAgZnVuY3Rpb24uIFRva2VucyBhcmUgZWFjaCB3b3JkIGluIHRoZSBjb3JwdXMgc2VwYXJhdGVkIGZyb20gdGhlIGNvcnB1cy4gCgojIyBMb2FkIHRoZSBsaWJyYXJpZXMKClRoZSBjb3JwdXMgdXNlZCBpbiB0aGlzIGd1aWRlIHdpbGwgYmUgZnJvbSBbUHJvamVjdCBHdXRlbmJlcmddKGh0dHBzOi8vd3d3Lmd1dGVuYmVyZy5vcmcvKSBhbmQgd2lsbCB1c2UgKlByaWRlIGFuZCBQcmVqdWRpY2UqIGJ5IEphbmUgQXVzdGVuIGFuZCAqQSBUYWxlIG9mIFR3byBDaXRpZXMqIGJ5IENoYXJsZXMgRGlja2Vucy4gCgoKYGBge3IgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQojIGlmIHlvdSBuZWVkIHRvIGluc3RhbGwKIyBpbnN0YWxsLnBhY2thZ2VzKCdxdWFudGVkYScpCmxpYnJhcnkocXVhbnRlZGEpCgpsaWJyYXJ5KHRpZHl2ZXJzZSkKCiMgcmVhZHRleHQgYWxsb3dzIGZvciAudHh0LCAuY3N2LCAuanNvbiwgLmRvYyBhbmQgLnBkZiBmaWxlcwpsaWJyYXJ5KHJlYWR0ZXh0KQoKIyBQcm9qZWN0IEd1dGVuYmVyZyBib29rcwpsaWJyYXJ5KGd1dGVuYmVyZ3IpCmBgYAoKPCEtLSBMb2FkIHRoZSBuZWVkZWQgbGlicmFyaWVzLiBUaGlzIG5vdGVib29rIGRldG91cnMgZnJvbSBzb3VyY2UgbWF0ZXJpYWwgb3ZlciByZXRyaWV2aW5nIHRoZSBib29rcyBieSB1
c2luZyB0aGUgYGd1dGVuYmVyZ3JgIGxpYnJhcnkgaW5zdGVhZCBvZiBkb3dubG9hZGluZyB0aGUgYm9va3MgbG9jYWxseSB0aGVuIGZlZWRpbmcgdGhlIGZpbGVzIGludG8gUi4gIC0tPgoKIyMgTG9hZCB0aGUgQ29ycHVzCgpgYGB7ciBldmFsPUZBTFNFLCBpbmNsdWRlPUZBTFNFfQojIGJvb2sgdGl0bGVzIHdlIHdhbnQuIFlvdSBjYW4gYWxzbyByZXRyaWV2ZSBib29rcyBieSB0aGVpciBJRHMgKGNhbiBzZWFyY2ggb24gd2Vic2l0ZSkKdGl0bGVzID0gYygiUHJpZGUgYW5kIFByZWp1ZGljZSIsICJBIFRhbGUgb2YgVHdvIENpdGllcyIpCgojIGdyYWIgdGhlIGJvb2tzCmJvb2tzID0gZ3V0ZW5iZXJncjo6Z3V0ZW5iZXJnX3dvcmtzKGxhbmd1YWdlcyA9ICdlbicsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHRpdGxlICVpbiUgdGl0bGVzCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICkgJT4lIAogIGd1dGVuYmVyZ3I6Omd1dGVuYmVyZ19kb3dubG9hZChtZXRhX2ZpZWxkcyA9ICJ0aXRsZSIpCgojIHJldHVybnMgYSBkYXRhZnJhbWUKZ2xpbXBzZShib29rcykKYGBgCgoKYGBge3J9CiMgY3JlYXRlIGEgdGVtcHJhcnkgZGlyZWN0b3J5IHRvIHN0b3JlIHRleHRzCmRpci5jcmVhdGUoInRtcCIpCiMgZG93bmxvYWQgdGV4dHMKZG93bmxvYWQuZmlsZSh1cmwgPSAiaHR0cHM6Ly93d3cuZ3V0ZW5iZXJnLm9yZy9maWxlcy8xMzQyLzEzNDItMC50eHQiLCAKICAgICAgICAgICAgICBkZXN0ZmlsZSA9ICJ0bXAvUHJpZGUgYW5kIFByZWp1ZGljZV9KYW5lIEF1c3Rlbl8yMDA4X0VuZ2xpc2gudHh0IikKZG93bmxvYWQuZmlsZSh1cmwgPSAiaHR0cHM6Ly93d3cuZ3V0ZW5iZXJnLm9yZy9maWxlcy85OC85OC0wLnR4dCIsIAogICAgICAgICAgICAgIGRlc3RmaWxlID0gInRtcC9BIFRhbGUgb2YgVHdvIENpdGllc19DaGFybGVzIERpY2tlbnNfMjAwOV9FbmdsaXNoLnR4dCIpCiMgcmVhZCBpbiB0ZXh0cwpkYXRhZnJhbWUgPC0gcmVhZHRleHQoInRtcC8qLnR4dCIsCiAgICAgICAgICAgICAgICAgICAgICBkb2N2YXJzZnJvbSA9ICJmaWxlbmFtZXMiLAogICAgICAgICAgICAgICAgICAgICAgZG9jdmFybmFtZXMgPSBjKCJ0aXRsZSIsICJhdXRob3IiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAieWVhciB1cGxvYWRlZCIsICJsYW5ndWFnZSIpLAogICAgICAgICAgICAgICAgICAgICAgZHZzZXAgPSAiXyIsCiAgICAgICAgICAgICAgICAgICAgICBlbmNvZGluZyA9ICJVVEYtOCIpCiMgZGVsZXRlIHRtcCBkaXJlY3RvcnkKdW5saW5rKCJ0bXAiLCByZWN1cnNpdmUgPSBUUlVFKQoKCgpkb2NfY29ycHVzID0gY29ycHVzKGRhdGFmcmFtZSkKCnN1bW1hcnkoZG9jX2NvcnB1cykKYGBgCgoKIyMgQ2xlYW4gJiBUb2tlbml6ZQoKTmVlZCB0byBjbGVhbiBhbmQgdG9rZW5pemUgdGhlIGNvcnB1cy4gVG9rZW5pemUgaXMgd2hlbiB0aGUgc2VudGVuY2VzIGFyZSBzcGxpdCB1cCBpbnRvIGlu
ZGl2aWR1YWwgd29yZHMuCgpgYGB7cn0KZG9jX3Rva2VucyA9IHRva2Vucyhkb2NfY29ycHVzKQoKZG9jX3Rva2VucwpgYGAKCk5vdyB3ZSBoYXZlIHRoZSBkb2N1bWVudCBjb3JwdXMgdG9rZW5pemVkLCB3ZSBjYW4gcmVydW4gdGhlIGNvZGUgdG8gZnVydGhlciBjbGVhbiB0aGUgdG9rZW5zIGJ5IHJlbW92aW5nIHB1bmN0dWF0aW9ucyBhbmQgbnVtYmVycwoKYGBge3J9CmRvY190b2tlbnMgPSB0b2tlbnMoZG9jX3Rva2VucywgCiAgICAgICAgICAgICAgICAgICAgcmVtb3ZlX3B1bmN0ID0gVFJVRSwgCiAgICAgICAgICAgICAgICAgICAgcmVtb3ZlX251bWJlcnMgPSBUUlVFLAogICAgICAgICAgICAgICAgICAgIHJlbW92ZV9zeW1ib2xzID0gVFJVRQogICAgICAgICAgICAgICAgICAgICkKZG9jX3Rva2VucwpgYGAKCgpOb3cgd2UgbmVlZCB0byByZW1vdmUgU1RPUFdPUkRTIGZyb20gb3VyIHRva2Vucywgc3RvcCB3b3JkcyBhcmUgInRoZSIsICJhbmQiLCAiaXQiLCBldGMgYW5kIGJ5IHJlbW92aW5nIHRoZSB0aGVzZSB3b3JkcyBoZWxwcyBoYXZlIGJldHRlciBhbmFseXNpcy4KCmBgYHtyfQpkb2NfdG9rZW5zID0gdG9rZW5zX3NlbGVjdChkb2NfdG9rZW5zLAogICAgICAgICAgICAgICAgICAgICAgICAgICBzdG9wd29yZHMoJ2VuZ2xpc2gnKSwgIyBtYWtlIHN1cmUgb2Ygc3BlbGxpbmcgZm9yIHN0b3B3b3JkcwogICAgICAgICAgICAgICAgICAgICAgICAgICBzZWxlY3Rpb24gPSAncmVtb3ZlJwogICAgICAgICAgICAgICAgICAgICAgICAgICApCmRvY190b2tlbnMKYGBgCgoKIyMgU3RlbSB0aGUgdG9rZW5zCgpTdGVtbWluZyBpcyB0aGUgc3RlbSBvZiBhIHdvcmQsIHN1Y2ggYXMgIHRoZSB3b3JkICJkYW5jZSIsIHRoZSBzdGVtIGlzIGBkYW5jYCBhbmQgdGhlIGVuZGluZ3MgY291bGQgYmUgKyBgaW5nYCAsIGBlZGAgaW4gYSB0ZXh0LiAKCgpgYGB7cn0KZG9jX3Rva2VucyA9IHRva2Vuc193b3Jkc3RlbShkb2NfdG9rZW5zKQpkb2NfdG9rZW5zCgpgYGAKTm93IHdlIGNvdWxkIG1ha2UgYWxsIHRoZSB3b3JkIHN0ZW1zIGxvd2VyY2FzZSBmb3Igc3RhbmRhcmRpemF0aW9uCgpgYGB7cn0KZG9jX3Rva2VucyA9IHRva2Vuc190b2xvd2VyKGRvY190b2tlbnMpCmRvY190b2tlbnMKYGBgCgpTdW1tYXJ5IG9mIHRoZSB3b3JkIHRva2VucwpgYGB7cn0Kc3VtbWFyeShkb2NfdG9rZW5zKQpgYGAKCgoKIyMgQ29udmVydCB0byBERk0KCkFmdGVyIHRva2VuaXplZCBhbmQgY2xlYW5pbmcgdGhlIHRva2VucyBpbnRvIHN0ZW1zLCBjb252ZXJ0IHRvIGRmbQoKYGBge3J9CmRvY19kZm0gPSBkZm0oZG9jX3Rva2VucykKZG9jX2RmbQpgYGAKIyMjIEFuYWx5c2lzCgpXZSBjb3VsZCB1c2UgdGhlIGBrd2ljKClgIGZ1bmN0aW9uIHdoaWNoIGlzIHRoZSAqKmtleXdvcmRzLWluLWNvbnRleHQqKiBmdW5jdGlvbiB3aGljaCBzaG93cyBzcGVjaWZpYyB3b3JkcyBpbiB0aGUgY29udGV4dCBpbiB3aGljaCB0aGV5IGFwcGVhci4KCklmIHdlIHdhbnRlZCB0
byBrbm93IHdoZXJlIHRoZSB3b3JkIGBsb3ZlYCBpcyBpbiB0aGUgZG9jdW1lbnQgY29ycHVzLCBhbmQgdG8gZ2V0IGEgc21hbGwgc2VudGVuY2UgY29udGV4dCBvZiB3aGVyZSB0aGUgd29yZCBpcyB1c2VkIHdlIHVzZSBgd2luZG93ID0gbmAuIEhvdyBtYW55IHdpbmRvd3MgaXMgaG93IG1hbnkgd29yZHMgc3Vycm91bmQgdGhlIGtleXdvcmQgaW4gc2VhcmNoLiBUaGUgYGt3aWMoKWAgZnVuY3Rpb24gcmV0dXJucyB0aGUgbG9jYXRpb24gb2Ygd2hlcmUgZWFjaCBzcGVjaWZpYyBpbnN0YW5jZSBvZiB0aGUgd29yZCBpcy4KCmBgYHtyfQojIHRoZSBkb2NfZGZtIHdpbGwgcmV0dXJuIGFuIGVycm9yLCB0aGlzIGZ1bmN0aW9uIG5lZWRzIHRoZSB0b2tlbml6ZWQgY29ycHVzCmhlYWQoIGt3aWMoZG9jX3Rva2VucywgcGF0dGVybiA9ICJsb3ZlIiwgd2luZG93ID0gMyApKQpgYGAKCgojIyMgVG9wIEZlYXR1cmVzCgpXaGF0IGFyZSB0aGUgbW9zdCB1c2VkIHdvcmRzIGluIHRoZSBkb2N1bWVudCBjb3JwdXMgPyAKV2UgY2FuIHVzZSB0aGUgYHRvcGZlYXR1cmVzKClgIGZ1bmN0aW9uLiBUaGlzIGZ1bmN0aW9uIHRha2VzIGBuYCBudW1iZXIgb2YgZmVhdHVyZXMsIHNvIGZvciBhIHRvcCAxMCwgbiA9IDEwLgoKYGBge3J9CiMgbmVlZCB0byB1c2UgdGhlIGRmbQp0b3BmZWF0dXJlcyhkb2NfZGZtLCAKICAgICAgICAgICAgbj0gMTAsIAogICAgICAgICAgICBkZWNyZWFzaW5nID0gVFJVRQogICAgICAgICAgICApCiMgcmV0dXJucyB0aGUgY291bnQgb2YgZWFjaCB3b3JkCmBgYApJZiB5b3Ugd2FudGVkIHdvcmQgZnJlcXVlbmNpZXMKCmBgYHtyfQojIG5lZWQgdG8gdXNlIHRoZSBkZm0KdG9wZmVhdHVyZXMoZG9jX2RmbSwgCiAgICAgICAgICAgIG49IDEwLCAKICAgICAgICAgICAgZGVjcmVhc2luZyA9IFRSVUUsCiAgICAgICAgICAgIHNjaGVtZSA9ICdkb2NmcmVxJwogICAgICAgICAgICApCmBgYAoKIyMgVGhlIGVuZAoKVGhpcyBoYXMgYmVlbiB0aGUgYmFzaWMgb2YgdGV4dCBhbmFseXNpcyB1c2luZyBRdWFudGVkYSwgd2hpY2ggbGVhdmVzIG1vcmUgdG8gbGVhcm4gZnJvbSBoZXJlLgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoK