Introduction

The first step in building a predictive model for text is understanding the distribution and relationship between the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships you observe in the data and prepare to build your first linguistic models.

Exploratory Data Analysis

Note: It would probably be easier to use packages such as “tm” for the text mining. However, I decided to do most of it with my own code for practice. Also, the corpus contains news, blog and Twitter texts; I only processed the Twitter texts here to see where the performance limitations would be. For the actual models, I would use as much text as I could.

library(stringr)
library(tm)
library(ggplot2)
library(ngram)

# constants
co_text_attr_en = "D:/R/capstone/data/text_attr_en.rds"
co_tidy_twitter_en = "D:/R/capstone/data/tidy_twitter_en.rds"
co_tidy_nostop_twitter_en = "D:/R/capstone/data/tidy_nostop_twitter_en.rds"
co_1gram_twitter_en = "D:/R/capstone/data/1gram_twitter_en.rds"
co_2gram_twitter_en = "D:/R/capstone/data/2gram_twitter_en.rds"
co_3gram_twitter_en = "D:/R/capstone/data/3gram_twitter_en.rds"
co_1gram_nostop_twitter_en = "D:/R/capstone/data/1gram_nostop_twitter_en.rds"
co_2gram_nostop_twitter_en = "D:/R/capstone/data/2gram_nostop_twitter_en.rds"
co_3gram_nostop_twitter_en = "D:/R/capstone/data/3gram_nostop_twitter_en.rds"

Data attributes

Refer to this report: https://rpubs.com/Nov05/450458

folder <- "D:/R/data/capstone/en_US/"
filelist <- list.files(path=folder)

l <- lapply(paste0(folder, filelist), function(filepath) {
  size <- file.info(filepath)$size/1024/1000 # file size for the Size(MB) column
  con <- file(filepath, open="r")
  lines <- readLines(con)
  maxchars <- max(nchar(lines)) # length of the longest line
  nwords <- sum(sapply(strsplit(lines, "\\s+"), length)) # total word count
  close(con)
  close(con)
  return(c(filepath, format(round(size, 2), nsmall=2), length(lines), maxchars, nwords))
})

df <- data.frame(matrix(unlist(l), nrow=3, byrow=TRUE))
names(df) <- c('File Name', 'Size(MB)', 'Entries', 'Longest Line', 'Total Words')
saveRDS(df, co_text_attr_en)
df <- readRDS(co_text_attr_en); df
##                                    File Name Size(MB) Entries Longest Line
## 1   D:/R/data/capstone/en_US/en_US.blogs.txt   205.23  899288        40835
## 2    D:/R/data/capstone/en_US/en_US.news.txt   200.99   77259         5760
## 3 D:/R/data/capstone/en_US/en_US.twitter.txt   163.19 2360148          213
##   Total Words
## 1    37334441
## 2     2643972
## 3    30373792

Data Cleansing (English Twitter texts)

  1. Convert all letters to lower case to simplify the problem;
  2. Split lines at “.”, “,”, etc.;
  3. Remove non-alphanumeric characters at the beginning or end of a word, but retain special characters inside words like “i’m”, “we’ve”, etc.;
  4. Remove extra spaces;
  5. Split words by space, as illustrated on a sample tweet below.
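
As a quick illustration, here is how these steps act on a single made-up sample tweet (the tweet and the variable names below are only for demonstration):

sample <- "OMG!! Can't wait... see you @ 5pm, right?!" # made-up example tweet
sample <- tolower(sample)                               # 1. lower case
pieces <- unlist(strsplit(sample, "[.,:;!?(){}<>]+"))   # 2. split at punctuation
pieces <- gsub("^[^a-z0-9]+|[^a-z0-9]+$", " ", pieces)  # 3. strip symbols at the edges,
pieces <- gsub("[^a-z0-9]+\\s", " ", pieces)            #    before a space,
pieces <- gsub("\\s[^a-z0-9]+", " ", pieces)            #    and after a space
pieces <- str_trim(gsub("\\s+", " ", pieces))           # 4. collapse extra spaces
pieces # roughly: "omg" "can't wait" "see you 5pm" "right"
words <- unlist(strsplit(pieces, "\\s+"))               # 5. split into words

The full Twitter file is processed the same way:
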
# read file
filepath <- "D:/R/data/capstone/en_US/en_US.twitter.txt"
con <- file(filepath) 
lines <- readLines(con, skipNul=TRUE) # 2360148 lines
close(con)

lines <- tolower(lines)
# split at ".", ",", etc.
lines <- unlist(strsplit(lines, "[.,:;!?(){}<>]+")) # 5398319 lines

# replace non-alphanumeric characters at the beginning/end of a word with a space
lines <- gsub("^[^a-z0-9]+|[^a-z0-9]+$", " ", lines) # at the beginning/end of a line
lines <- gsub("[^a-z0-9]+\\s", " ", lines) # before a space
lines <- gsub("\\s[^a-z0-9]+", " ", lines) # after a space
lines <- gsub("\\s+", " ", lines) # collapse multiple spaces
lines <- str_trim(lines) # remove spaces at the beginning/end of the line
saveRDS(lines, file=co_tidy_twitter_en)

Check what the texts look like now.

lines <- readRDS(file=co_tidy_twitter_en)
head(lines, 20)
##  [1] "how are you"                                                     
##  [2] "btw thanks for the rt"                                           
##  [3] "you gonna be in dc anytime soon"                                 
##  [4] "love to see you"                                                 
##  [5] "been way"                                                        
##  [6] "way too long"                                                    
##  [7] "when you meet someone special"                                   
##  [8] "you'll know"                                                     
##  [9] "your heart will beat more rapidly and you'll smile for no reason"
## [10] "they've decided its more fun if i don't"                         
## [11] "so tired d"                                                      
## [12] "played lazer tag ran a lot d"                                    
## [13] "ughh going to sleep like in 5 minutes"                           
## [14] "words from a complete stranger"                                  
## [15] "made my birthday even better"                                    
## [16] "first cubs game ever"                                            
## [17] "wrigley field is gorgeous"                                       
## [18] "this is perfect"                                                 
## [19] "go cubs go"                                                      
## [20] "i no"

Remove the stop words.

# remove stop words (There are 5398319 lines, 356.3 MB in memory. With 16 GB of memory it took about 3 hours. According to this post https://stackoverflow.com/questions/50635341/removing-stop-words-from-corpus-in-r-is-too-slow , it took 2 hours to process 31 MB of data using the "tm" package.)
lines <- unlist(lapply(lines, function(line){removeWords(line, stopwords("en"))}))
lines <- str_trim(lines) # remove spaces at the beginning/end of the line
lines <- gsub("\\s+", " ", lines) # collapse multiple spaces
lines <- lines[nchar(lines)>0] # remove blank lines, reducing the elements from 5398319 to 5059787
saveRDS(lines, file=co_tidy_nostop_twitter_en)
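
A possibly faster alternative (an untested assumption, not what produced the file saved above) is to call removeWords() once on the whole character vector instead of line by line inside lapply(), since removeWords() accepts a character vector:

# sketch: one vectorized call instead of one call per line (not benchmarked here)
lines <- removeWords(lines, stopwords("en"))
# ... followed by the same str_trim()/gsub()/empty-line clean-up as above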

Count word frequency (1-gram)

# split words by space
words <- unlist(strsplit(lines, "\\s+"))

# count word frequency
word.freq <- table(words)

# convert to data frame
df <- cbind.data.frame(names(word.freq), as.integer(word.freq))
names(df) <- c('word', 'freq')
row.names(df) <- df[,1]

# sort words by frequency, descending
df <- df[order(-df$freq),]

# save as RDS file
saveRDS(df, file=co_1gram_twitter_en)

Locate a word in the list.

# read word frequency data
df <- readRDS(file=co_1gram_twitter_en)
# locate "i'm" 
df["i'm",]
##     word   freq
## i'm  i'm 129302

Plotting

ggplot(df[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Twitter Top 20 Word Frequence (with Stop Words)", x=NULL, y="Word Frequency")

plot(df[1:500,]$freq,
     main='Twitter Top 500 Word Frequency',
     ylab="Frequency",
     xlab="Word Rank")

ggplot(data=df[1:250,], aes(x=freq)) + 
  geom_histogram(colour="black", fill="white", breaks=seq(0, 900000, by=3000)) + 
  labs(title="Histogram of Twitter Top 250 Word Frequencies", x="Word Frequency", y="Count")

Often there are words that are frequent but provide little information. These are called stop words, and we want to remove them from our analysis. Some common English stop words include “I”, “she’ll”, “the”, etc. The tm package ships a list of 174 common English stop words. The following method of removing the stop words is not recommended in this case because it takes a long time.

# NOT recommended!
lines <- lapply(lines, function(line){removeWords(line, stopwords("en"))})
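
For reference, the stop word list itself can be inspected directly (a quick check, assuming tm is loaded as above):

sw <- stopwords("en")
length(sw) # 174 common English stop words shipped with tm
head(sw)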

Instead, the stop words are removed directly from the 1-gram frequency data frame.

df <- readRDS(file=co_1gram_twitter_en)
df <- df[!(df$word %in% stopwords("en")),] # drop rows whose word is a stop word
saveRDS(df, co_1gram_nostop_twitter_en)
df <- readRDS(co_1gram_nostop_twitter_en)
ggplot(df[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Twitter Top 20 Word Frequence (without Stop Words)", x=NULL, y="Word Frequency")

library("RColorBrewer")
library("wordcloud")
# generate word cloud
set.seed(1234)
wordcloud(words = df$word, freq = df$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

2-grams

For example, generate 2-grams from the sentence “how are you”. Refer to the ngram package vignette: https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf

lines <- readRDS(co_tidy_twitter_en) # reload the tidy text that still contains stop words
lines[1]
## [1] "how are you"
print(ngram(lines[1], n=2), output="full")
## are you | 1 
## NULL {1} | 
## 
## how are | 1 
## you {1} |
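
The rest of this report works with the phrase table of an ngram object; get.phrasetable() should return a data frame with the columns ngrams, freq and prop, e.g. for the toy sentence above:

# phrase table of the toy example: each 2-gram with its count and proportion
get.phrasetable(ngram(lines[1], n=2))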

Now generate 2-grams from the whole Twitter text.

# remove lines that contain fewer than 2 words, or ngram() would throw an error
lines <- lines[str_count(lines, "\\s+")>0] # 4375507 lines
bigram <- ngram(lines, n=2) # this line takes a long time; probably should sample the text first
df <- get.phrasetable(bigram)
saveRDS(df, co_2gram_twitter_en)
df <- readRDS(co_2gram_twitter_en)
ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Twitter Top 20 2-Gram Frequence (with Stop Words)", x=NULL, y="Word Frequency")

3-grams

Generate 3-grams for the whole Twitter text.

lines <- readRDS(co_tidy_twitter_en)
# remove lines that contain fewer than 3 words, or ngram() would throw an error
lines <- lines[str_count(lines, "\\s+")>1] # 3803575 lines
trigram <- ngram(lines, n=3) # surprisingly, this doesn't take long
df <- get.phrasetable(trigram)
saveRDS(df, co_3gram_twitter_en)
df <- readRDS(co_3gram_twitter_en)
ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Twitter Top 20 3-Gram Frequence (with Stop Words)", x=NULL, y="Word Frequency")

Now do the same for the Twitter text without stop words.

lines <- readRDS(co_tidy_nostop_twitter_en)
# remove lines that contain fewer than 3 words, or ngram() would throw an error
lines <- lines[str_count(lines, "\\s+")>1] # 2780871 lines left
trigram <- ngram(lines, n=3) # this took less than a minute, surprisingly
df <- get.phrasetable(trigram)
saveRDS(df, co_3gram_nostop_twitter_en)
df <- readRDS(co_3gram_nostop_twitter_en)
ggplot(df[1:20,], aes(x=reorder(ngrams,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Twitter Top 20 3-Gram Frequence (without Stop Words)", x=NULL, y="Word Frequency")

Unique words needed to cover the text

df <- readRDS(co_1gram_twitter_en) # with stop words
df$count <- seq_len(nrow(df)) # rank of each word (1 = most frequent)
df$coverage <- cumsum(df$freq) / sum(df$freq) * 100 # cumulative coverage in percent
df <- df[df$coverage <= 91,] # keep only what is needed for the plot

# find the word counts for 50% and 90% coverage 
points <- rbind(tail(df[df$coverage <= 50,], 1), tail(df[df$coverage <= 90,], 1))

ggplot(data=df, aes(x=count, y=coverage, group=1)) +
  geom_line()+
  geom_point(data=points, colour="red", size=3) +
  geom_text(data=points, aes(label=count), hjust=-1, vjust=1) +
  ggtitle("Number of Words to Cover Twitter Text (with Stop Words)") +
  xlab("Number of Words") +
  ylab("Coverage Percentage")

Removing the stop words would increase the number of unique words needed to cover the whole text. To decrease that number, lemmatization and/or stemming could be used. https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
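
For example, stemming with the SnowballC package (which tm uses for stemDocument()) maps inflected word forms onto a common stem, so their counts could be merged before computing coverage. A minimal sketch, not applied to the results above (df_stemmed is a hypothetical name):

library(SnowballC)
wordStem(c("run", "runs", "running"), language="english") # all map to "run"

# merge the 1-gram counts by stem, then recompute coverage as above
df <- readRDS(co_1gram_twitter_en)
df$stem <- wordStem(as.character(df$word), language="english")
df_stemmed <- aggregate(freq ~ stem, data=df, FUN=sum)
df_stemmed <- df_stemmed[order(-df_stemmed$freq),]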

The other two English files

Do the same to see whether there are differences.
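
The two .rds files read below are assumed to have been produced by running the same cleansing and 1-gram steps shown above for the Twitter text on en_US.news.txt and en_US.blogs.txt; a condensed sketch (the function name count_1gram_nostop is hypothetical):

count_1gram_nostop <- function(filepath) {
  con <- file(filepath)
  lines <- tolower(readLines(con, skipNul=TRUE))
  close(con)
  lines <- unlist(strsplit(lines, "[.,:;!?(){}<>]+"))      # split at punctuation
  lines <- gsub("^[^a-z0-9]+|[^a-z0-9]+$", " ", lines)     # strip symbols at the edges
  lines <- gsub("[^a-z0-9]+\\s|\\s[^a-z0-9]+", " ", lines) # ... and next to spaces
  lines <- str_trim(gsub("\\s+", " ", lines))              # collapse extra spaces
  words <- unlist(strsplit(lines[nchar(lines) > 0], "\\s+"))
  word.freq <- table(words)
  df <- data.frame(word=names(word.freq), freq=as.integer(word.freq))
  df <- df[!(df$word %in% stopwords("en")),] # drop stop words
  df[order(-df$freq),]                       # sort by frequency, descending
}
# e.g. saveRDS(count_1gram_nostop("D:/R/data/capstone/en_US/en_US.news.txt"),
#              "D:/R/capstone/data/1gram_nostop_news_en.rds")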

df_news <- readRDS("D:/R/capstone/data/1gram_nostop_news_en.rds")
df_blogs <- readRDS("D:/R/capstone/data/1gram_nostop_blogs_en.rds")

ggplot(df_news[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="News Top 20 Word Frequency (without Stop Words)", x=NULL, y="Word Frequency")

ggplot(df_blogs[1:20,], aes(x=reorder(word,freq), freq)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(title="Blogs Top 20 Word Frequency (without Stop Words)", x=NULL, y="Word Frequency")