The Tidy Text Format

A tidy text format, as described by Hadley Wickham (2014), is a table with one token per row. A token is a meaningful unit of text, such as a word, that we are interested in using for analysis. In this guide, we provide the basic R functions to tokenize the text data and convert it to a one-term-per-row format. We also show how to construct the document-term matrix (DTM) of text data.

Tidy Text Format Using the {tm} R package

# Five sample customer comments used as the toy data set in this guide.
Comments <- c("This is cool, 555 I like the place! Really nice!",
              "AAAAAAAAHHHHH I don't like it, I need a refund!!!!!!", 
              "Probably needs work, they should check the faucet, I can't find my sock.", 
              "I'm never coming back, I hate the place. Zero stars for me!", 
              "I like to stay here again sometime, hopefully with my family.")
# One row per comment. IDs are derived from the number of comments so they stay
# in sync if comments are added or removed; stringsAsFactors = FALSE keeps the
# text column as character on R versions older than 4.0 as well.
data <- data.frame(
  ID = seq_along(Comments),
  Comments,
  stringsAsFactors = FALSE
)

# Option 1: Using the {tm} package
library(tm)
# Build a corpus with one document per comment, clean it step by step, and
# convert the result to a dense document-term matrix.
corpus <- Corpus(VectorSource(data$Comments))   # Create the corpus (list)
corpus[[1]]$content                             # print raw text
## [1] "This is cool, 555 I like the place! Really nice!"
corpus <- tm_map(corpus, removeNumbers)         # Remove numbers in the text
# tolower() is a base R function rather than a tm transformation, so it is
# wrapped in content_transformer() as the tm documentation prescribes; this
# keeps the step valid for any corpus class, not only the simple corpus
# created above.
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert text to lower case
# Stop words are removed while punctuation is still present, i.e. before the
# removePunctuation step below strips apostrophes from contractions.
corpus <- tm_map(corpus, removeWords, stopwords(kind="en")) # Remove stopwords
stopwords2 <- c("aaaaaaaahhhhh")                # Lower case, to match the lower-cased text
corpus <- tm_map(corpus, removeWords, stopwords2) # Remove customized stopwords
corpus <- tm_map(corpus, removePunctuation)     # Remove punctuation marks
#corpus <- tm_map(corpus, stemDocument)          # Stem words (optional)
corpus <- tm_map(corpus, stripWhitespace)       # Remove extra spaces
corpus[[1]]$content                             # print the processed text
## [1] " cool like place really nice"
dtm_comment <- DocumentTermMatrix(corpus)       # Create the document-term-matrix
dtm_comment <- as.matrix(dtm_comment)           # Dense matrix: rows = documents, columns = terms
dtm_comment
##     Terms
## Docs cool like nice place really need refund check faucet find needs probably
##    1    1    1    1     1      1    0      0     0      0    0     0        0
##    2    0    1    0     0      0    1      1     0      0    0     0        0
##    3    0    0    0     0      0    0      0     1      1    1     1        1
##    4    0    0    0     1      0    0      0     0      0    0     0        0
##    5    0    1    0     0      0    0      0     0      0    0     0        0
##     Terms
## Docs sock work back coming hate never stars zero family hopefully sometime stay
##    1    0    0    0      0    0     0     0    0      0         0        0    0
##    2    0    0    0      0    0     0     0    0      0         0        0    0
##    3    1    1    0      0    0     0     0    0      0         0        0    0
##    4    0    0    1      1    1     1     1    1      0         0        0    0
##    5    0    0    0      0    0     0     0    0      1         1        1    1

Tidy Text Format Using the {tidyverse} R package

# Option 2: Using the {tidyverse} and {tidytext} packages
library(tidyverse)
library(tidytext)
library(dplyr)

# Split every comment into one word per row, then drop each row whose word
# appears in the stop_words table.
tidy_comment <- anti_join(
  unnest_tokens(data, word, Comments),       # unnest_tokens() "tokenizes" the data
  stop_words,
  by = "word"
)
head(tidy_comment)
##   ID          word
## 1  1          cool
## 2  1           555
## 3  1          nice
## 4  2 aaaaaaaahhhhh
## 5  2        refund
## 6  3         check
# Create Custom Stop Words
# Extra tokens to discard, laid out with the same two columns (word, lexicon)
# as the stop_words table so the two can be stacked.
custom_stop_words <- tibble(
  word = c("555", "aaaaaaaahhhhh"),          # Add additional stopwords if necessary
  lexicon = "word"
)

# Update the stopwords dictionary: built-in entries followed by the custom ones
stop_words2 <- bind_rows(stop_words, custom_stop_words)
 
# Tokenize again, this time filtering against the extended dictionary
tidy_comment <- anti_join(
  unnest_tokens(data, word, Comments),
  stop_words2,                               # This time use stop_words2
  by = "word"
)
head(tidy_comment)
##   ID   word
## 1  1   cool
## 2  1   nice
## 3  2 refund
## 4  3  check
## 5  3 faucet
## 6  3   sock
# Create the document-term-matrix (DTM)
# Count how often each word occurs in each comment, cast the counts to a
# document-term matrix keyed by ID, and convert it to a dense base R matrix.
dtm_comment <- as.matrix(
  cast_dtm(
    count(tidy_comment, word, ID),
    ID, word, n
  )
)

dtm_comment
##     Terms
## Docs check coming cool family faucet hate nice refund sock stars stay
##    3     1      0    0      0      1    0    0      0    1     0    0
##    4     0      1    0      0      0    1    0      0    0     1    0
##    1     0      0    1      0      0    0    1      0    0     0    0
##    5     0      0    0      1      0    0    0      0    0     0    1
##    2     0      0    0      0      0    0    0      1    0     0    0

Although we can create the DTM with either of the two packages, we notice that the document-term matrix produced with the tidyverse approach is smaller than the one produced with the tm package. This is because the stop_words dictionary used in the tidyverse approach (provided by tidytext) is a longer and more comprehensive list of stopwords than the stopwords list of tm. This is particularly useful in trimming large quantities of text data. To illustrate this, we analyse the data on YouTube comments from selected YT videos related to the World Cup Finals held in December 2022. The finals game between Argentina and France was held in Qatar.

Tidying World Cup Finals YouTube Comments

# Load the YouTube comments and keep a reproducible random subset.
# NOTE: the path below is an absolute path on the author's machine; point it to
# your own copy of the CSV file before running.
WC <- read.csv("C:\\Users\\Asus\\Downloads\\YouTube-World-Cup-Comments.csv")
set.seed(193)                                       # Fix the seed so the same rows are drawn each run
# Select only 1000 rows (for illustration only). The sample size is capped at
# the number of available rows so that a smaller file does not make the
# sampling step fail.
WC <- WC[sample.int(nrow(WC), min(1000L, nrow(WC))), ]
WC$ID <- seq_len(nrow(WC))                          # Sequential document ID for the sampled rows

# Tokenize the sampled comments into one word per row and drop the words
# listed in stop_words.
tidy_comment <- anti_join(
  unnest_tokens(WC, word, Comment),
  stop_words,
  by = "word"
)
# Per-document word counts cast to a document-term matrix, then made dense.
dtm_comment <- as.matrix(
  cast_dtm(count(tidy_comment, word, ID), ID, word, n)
)
dim(dtm_comment)                                   # Check the dimension of the DTM
## [1]  904 2797
dtm_comment[300:310, 200:210]                      # Print only a portion of the matrix
##      Terms
## Docs  annoying ans antarctica antes anthem anti ap apa apost apporter apps
##   166        0   0          0     0      0    0  0   0     0        0    0
##   170        0   0          0     0      0    0  0   0     0        0    0
##   172        0   0          0     0      0    0  0   0     0        0    0
##   182        0   0          0     0      0    0  0   0     0        0    0
##   185        0   0          0     0      0    0  0   0     0        0    0
##   201        0   0          0     0      0    0  0   0     0        0    0
##   202        0   0          0     0      0    0  0   0     0        0    0
##   203        0   0          0     0      0    0  0   0     0        0    0
##   211        0   0          0     0      0    0  0   0     0        0    0
##   212        0   0          0     0      0    0  0   0     0        0    0
##   228        0   0          0     0      0    0  0   0     0        0    0
# Creating Word Cloud Based on Term Frequency
library(wordcloud)
# Total number of occurrences of each remaining word across all comments.
word_counts <- tidy_comment %>%
  count(word)
# Draw the cloud. The call ends without a trailing comma: in an ordinary R
# function call a trailing comma passes an extra empty argument, which only
# works when it happens to land on a parameter that has a default.
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  random.order = FALSE,                 # Plot words in decreasing frequency
  max.words = 70,                       # Plot at most 70 words
  min.freq = 10,                        # Skip words occurring fewer than 10 times
  colors = brewer.pal(8, "Dark2"),      # 8-colour "Dark2" palette
  scale = c(3, 0.8)                     # Range of word sizes
)