Introduction

The first step in building a predictive model for text is understanding the distribution and relationship between the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships you observe in the data and prepare to build your first linguistic models.

Exploratory Data Analysis

Note: It would be easier to use a package such as “tm” for the text mining. However, I decided to do it mostly with my own code for practice. Also, although there are news, blogs and Twitter texts, I only processed the Twitter texts here to see where the performance limitations would be. For the actual models, I would use as much of the text as I could.
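
For reference, a minimal sketch of what the tm-based alternative might look like (illustrative only, not run in this report; note that removePunctuation would also strip the in-word apostrophes that my own code keeps):

# Illustrative sketch of a tm-based cleaning pipeline (not used in this report).
# Assumes `lines` is a character vector of raw tweets.
library(tm)
corpus <- VCorpus(VectorSource(lines))
corpus <- tm_map(corpus, content_transformer(tolower))    # lower case
corpus <- tm_map(corpus, removePunctuation)               # drops punctuation, including in-word apostrophes
corpus <- tm_map(corpus, stripWhitespace)                 # collapse extra spaces
# corpus <- tm_map(corpus, removeWords, stopwords("en"))  # slow on a corpus this large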

library(stringr)
library(tm)
library(ggplot2)
library(ngram)

# Create data directory if it doesn't exist
data_dir <- "~/Documents/R/data/capstone"
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)

# Constants - Updated for Mac paths
co_text_attr_en <- file.path(data_dir, "text_attr_en.rds")
co_tidy_twitter_en <- file.path(data_dir, "tidy_twitter_en.rds")
co_tidy_nostop_twitter_en <- file.path(data_dir, "tidy_nostop_twitter_en.rds")
co_1gram_twitter_en <- file.path(data_dir, "1gram_twitter_en.rds")
co_2gram_twitter_en <- file.path(data_dir, "2gram_twitter_en.rds")
co_3gram_twitter_en <- file.path(data_dir, "3gram_twitter_en.rds")
co_1gram_nostop_twitter_en <- file.path(data_dir, "1gram_nostop_twitter_en.rds")
co_2gram_nostop_twitter_en <- file.path(data_dir, "2gram_nostop_twitter_en.rds")
co_3gram_nostop_twitter_en <- file.path(data_dir, "3gram_nostop_twitter_en.rds")
co_1gram_nostop_news_en <- file.path(data_dir, "1gram_nostop_news_en.rds")
co_1gram_nostop_blogs_en <- file.path(data_dir, "1gram_nostop_blogs_en.rds")

Data attributes

Refer to this report: https://rpubs.com/Nov05/450458

# Update this path to where your data is stored on Mac
folder <- "~/Documents/R/data/capstone/en_US/"
filelist <- list.files(path=folder)

l <- lapply(paste0(folder, filelist), function(filepath) {
  size <- file.info(filepath)$size/1024^2 # file size in MB
  con <- file(filepath, open="r")
  lines <- readLines(con, skipNul=TRUE)
  close(con)

  maxchars <- max(nchar(lines)) # number of characters in the longest line
  nwords <- sum(sapply(strsplit(lines, "\\s+"), length))

  return(c(filepath, format(round(size, 2), nsmall=2), length(lines), maxchars, nwords))
})

df <- data.frame(matrix(unlist(l), nrow=length(filelist), byrow=TRUE))
names(df) <- c('File Name', 'Size (MB)', 'Entries', 'Longest Line (chars)', 'Total Words')
saveRDS(df, co_text_attr_en)
if(file.exists(co_text_attr_en)) {
  df <- readRDS(co_text_attr_en)
  print(df)
} else {
  cat("Data file not found. Run the data processing chunk above first.\n")
}
## Data file not found. Run the data processing chunk above first.

Data Cleanse (English Twitter texts)

  1. Convert all letters to lower case to simplify the problem;
  2. Split lines at “.”, “,”, etc.;
  3. Remove non-alphanumeric characters at the beginning or end of a word, but retain in-word special characters such as the apostrophes in “i’m”, “we’ve”, etc.;
  4. Remove extra spaces;
  5. Split words by space.
# Update this path to where your Twitter data is stored on Mac
filepath <- "~/Documents/R/data/capstone/en_US/en_US.twitter.txt"
con <- file(filepath) 
lines <- readLines(con, skipNul=TRUE) # 2360148 lines
close(con)

lines <- tolower(lines)
# split at ".", ",", etc.
lines <- unlist(strsplit(lines, "[.,:;!?(){}<>]+")) # 5398319 lines

# replace all non-alphanumeric characters with a space at the beginning/end of a word.
lines <- gsub("^[^a-z0-9]+|[^a-z0-9]+$", " ", lines) # at the begining/end of a line
lines <- gsub("[^a-z0-9]+\\s", " ", lines) # before space
lines <- gsub("\\s[^a-z0-9]+", " ", lines) # after space
lines <- gsub("\\s+", " ", lines) # remove mutiple spaces
lines <- str_trim(lines) # remove spaces at the beginning/end of the line
saveRDS(lines, file=co_tidy_twitter_en)

Check what the texts look like now.

if(file.exists(co_tidy_twitter_en)) {
  lines <- readRDS(file=co_tidy_twitter_en)
  head(lines, 20)
} else {
  cat("Data file not found. Run the data processing chunk above first.\n")
}
## Data file not found. Run the data processing chunk above first.

Remove the stop words.

# remove stop words (There are 5398319 lines, 356.3 MB in size. With 16 GB of memory it took about 3 hours. According to this post https://stackoverflow.com/questions/50635341/removing-stop-words-from-corpus-in-r-is-too-slow , it took 2 hours to process 31 MB of data using package "tm".)
lines <- unlist(lapply(lines, function(line){removeWords(line, stopwords("en"))}))
lines <- str_trim(lines) # remove spaces at the beginning/end of the line
lines <- gsub("\\s+", " ", lines) # remove multiple spaces
lines <- lines[nchar(lines)>0] # remove blank lines, reducing the elements from 5398319 to 5059787
saveRDS(lines, file=co_tidy_nostop_twitter_en)

Count word frequency (1-gram)

# split words by space
words <- unlist(strsplit(lines, "\\s+"))

# count word frequency
word.freq <- table(words)

# convert to data frame
df <- data.frame(word=names(word.freq), freq=as.integer(word.freq),
                 stringsAsFactors=FALSE)
row.names(df) <- df$word

# sort words by frequency, descending
df <- df[order(-df$freq),]

# save as RDS file
saveRDS(df, file=co_1gram_twitter_en)

Locate a word in the list.

if(file.exists(co_1gram_twitter_en)) {
  # read word frequency data
  df <- readRDS(file=co_1gram_twitter_en)
  # locate "i'm" 
  if("i'm" %in% rownames(df)) {
    print(df["i'm",])
  } else {
    cat("Word 'i'm' not found in dataset.\n")
  }
} else {
  cat("Data file not found. Run the data processing chunks first.\n")
}
## Data file not found. Run the data processing chunks first.

Plotting

if(file.exists(co_1gram_twitter_en)) {
  df <- readRDS(file=co_1gram_twitter_en)
  ggplot(df[1:20,], aes(x=reorder(word,freq), freq)) +
    geom_col() +
    xlab(NULL) +
    coord_flip() +
    labs(title="Twitter Top 20 Word Frequence (with Stop Words)", x=NULL, y="Word Frequency")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.
if(file.exists(co_1gram_twitter_en)) {
  df <- readRDS(file=co_1gram_twitter_en)
  plot(df[1:500,]$freq,
       main='Twitter Top 500 Word Frequency',
       ylab="Frequency",
       xlab="Word Rank")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.
if(file.exists(co_1gram_twitter_en)) {
  df <- readRDS(file=co_1gram_twitter_en)
  ggplot(data=df[1:250,], aes(x=freq)) + 
    geom_histogram(colour="black", fill="white", breaks=seq(0, 900000, by=3000)) + 
    labs(title="Histogram of Twitter Top 250 Word Frequency", x="Word Frequency", y="Count")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.

Often there are words that are frequent but provide little information. These are called stop words, and we may want to remove them from the analysis. Some common English stop words are “I”, “she’ll”, “the”, etc. The tm package provides a list of 174 common English stop words. The following method of removing the stop words is not recommended in this case because it takes a long time.

# NOT recommended!
lines <- lapply(lines, function(line){removeWords(line, stopwords("en"))})

Instead, the stop words are removed from the frequency data frame directly.

df <- readRDS(file=co_1gram_twitter_en)
df <- df[!(df$word %in% stopwords("en")),] # drop rows whose word is a stop word
saveRDS(df, co_1gram_nostop_twitter_en)
if(file.exists(co_1gram_nostop_twitter_en)) {
  df <- readRDS(co_1gram_nostop_twitter_en)
  ggplot(df[1:20,], aes(x=reorder(word,freq), freq)) +
    geom_col() +
    xlab(NULL) +
    coord_flip() +
    labs(title="Twitter Top 20 Word Frequence (without Stop Words)", x=NULL, y="Word Frequency")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.
if(file.exists(co_1gram_nostop_twitter_en)) {
  library("RColorBrewer")
  library("wordcloud")
  df <- readRDS(co_1gram_nostop_twitter_en)
  # generate word cloud
  set.seed(1234)
  wordcloud(words = df$word, freq = df$freq, min.freq = 1,
            max.words=200, random.order=FALSE, rot.per=0.35, 
            colors=brewer.pal(8, "Dark2"))
} else {
  cat("Data file not found. Skipping wordcloud.\n")
}
## Data file not found. Skipping wordcloud.

2-grams

For example, generate 2-grams from the sentence “how are you”. Refer to the package vignette: https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
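
As a minimal standalone illustration (using the same ngram package), the 2-grams of that sentence can be inspected directly:

# minimal example: the 2-grams of "how are you" are "how are" and "are you"
ng <- ngram("how are you", n=2)
print(ng, output="full")
get.phrasetable(ng)  # frequency table of the 2-grams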

if(file.exists(co_tidy_twitter_en)) {
  lines <- readRDS(file=co_tidy_twitter_en)
  if(length(lines) > 0) {
    print(lines[1])
    print(ngram(lines[1], n=2), output="full")
  }
} else {
  cat("Data file not found. Skipping 2-gram example.\n")
}
## Data file not found. Skipping 2-gram example.

Now generate 2-grams from the whole Twitter text.

# remove lines that contain fewer than 2 words, or ngram() will throw errors.
lines <- lines[str_count(lines, "\\s+")>0] # 4375507 lines
bigram <- ngram(lines, n=2) # this line takes a long time; one could sample the text first (see the sketch after this chunk)
df <- get.phrasetable(bigram)
saveRDS(df, co_2gram_twitter_en)
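
If building the bigrams over the full corpus is too slow, one option (not used above) is to work from a random sample of the tidied lines first; a rough sketch, where the 10% sampling rate is an arbitrary choice:

# Illustrative only: build 2-grams from a 10% random sample of the tidied lines.
set.seed(2019)                                      # arbitrary seed, for reproducibility
sampled <- sample(lines, round(length(lines) * 0.1))
sampled <- sampled[str_count(sampled, "\\s+") > 0]  # ngram() still needs at least 2 words per line
bigram_sample <- ngram(sampled, n=2)
head(get.phrasetable(bigram_sample), 10)            # top 10 bigrams in the sample
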
if(file.exists(co_2gram_twitter_en)) {
  df <- readRDS(co_2gram_twitter_en)
  ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
    geom_col() +
    xlab(NULL) +
    coord_flip() +
    labs(title="Twitter Top 20 2-Gram Frequence (with Stop Words)", x=NULL, y="Word Frequency")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.

3-grams

Generate 3-grams for the whole Twitter text.

lines <- readRDS(co_tidy_twitter_en)
# remove lines that contain fewer than 3 words, or ngram() will throw errors.
lines <- lines[str_count(lines, "\\s+")>1] # 3803575 lines
trigram <- ngram(lines, n=3) # surprisingly, this doesn't take long.
df <- get.phrasetable(trigram)
saveRDS(df, co_3gram_twitter_en)
if(file.exists(co_3gram_twitter_en)) {
  df <- readRDS(co_3gram_twitter_en)
  ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
    geom_col() +
    xlab(NULL) +
    coord_flip() +
    labs(title="Twitter Top 20 3-Gram Frequence (with Stop Words)", x=NULL, y="Word Frequency")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.
lines <- readRDS(co_tidy_nostop_twitter_en)
# remove lines that contain fewer than 3 words, or ngram() will throw errors.
lines <- lines[str_count(lines, "\\s+")>1] # 2780871 lines left
trigram <- ngram(lines, n=3) # surprisingly, this took less than a minute.
df <- get.phrasetable(trigram)
saveRDS(df, co_3gram_nostop_twitter_en)
if(file.exists(co_3gram_nostop_twitter_en)) {
  df <- readRDS(co_3gram_nostop_twitter_en)
  ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
    geom_col() +
    xlab(NULL) +
    coord_flip() +
    labs(title="Twitter Top 20 3-Gram Frequence (without Stop Words)", x=NULL, y="Word Frequency")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.

Unique words needed to cover the text

if(file.exists(co_1gram_twitter_en)) {
  df <- readRDS(co_1gram_twitter_en) # with stop words
  df$count <- seq_len(nrow(df))  # rank of the word by frequency
  df$coverage <- cumsum(df$freq) / sum(df$freq) * 100
  df <- df[df$coverage <= 91,]

  # find the word counts for 50% and 90% coverage 
  points <- rbind(tail(df[df$coverage <= 50,], 1), tail(df[df$coverage <= 90,], 1))

  ggplot(data=df, aes(x=count, y=coverage, group=1)) +
    geom_line()+
    geom_point(data=points, colour="red", size=3) +
    geom_text(data=points, aes(label=count), hjust=-1, vjust=1) +
    ggtitle("Number of Words to Cover Twitter Text (with Stop Words)") +
    xlab("Number of Words") +
    ylab("Coverage Percentage")
} else {
  cat("Data file not found. Skipping plot.\n")
}
## Data file not found. Skipping plot.

Removing the stop words would increase the number of unique words needed to cover the same fraction of the text. To decrease that number, lemmatization and/or stemming could be used. See https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
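
As a rough illustration of stemming (not applied in this report), the SnowballC package could be used to collapse word variants in the 1-gram table before recomputing the coverage curve:

# Illustrative only: stem the 1-gram table with SnowballC and re-aggregate frequencies.
library(SnowballC)
df <- readRDS(co_1gram_twitter_en)
df$stem <- wordStem(df$word, language="english")    # e.g. "running", "runs" -> "run"
df_stemmed <- aggregate(freq ~ stem, data=df, FUN=sum)
df_stemmed <- df_stemmed[order(-df_stemmed$freq),]
nrow(df_stemmed)  # fewer unique tokens than nrow(df), so fewer entries for the same coverage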

For the other two English files

Do the same to see whether there are differences.

# Update these paths to your actual data locations
df_news <- readRDS(co_1gram_nostop_news_en)
df_blogs <- readRDS(co_1gram_nostop_blogs_en)

# par(mfrow=c(1,2)) does not apply to ggplot2 plots; arrange them side by side with gridExtra instead
library(gridExtra)

p_news <- ggplot(df_news[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="News Top 20 Word Frequency (without Stop Words)", x=NULL, y="Word Frequency")

p_blogs <- ggplot(df_blogs[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Blogs Top 20 Word Frequency (without Stop Words)", x=NULL, y="Word Frequency")

grid.arrange(p_news, p_blogs, ncol=2)