Introduction

We will perform two tasks in this report.

TASK 1 - EXPLORATORY DATA ANALYSIS ON TEXT DATA

The first step in building a predictive model for text is understanding the distribution of, and relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building our first linguistic models.

Tasks to accomplish:

(i) Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and the relationships between words in the corpora.

(ii) Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

TASK 2 - MODELING

The goal here is to build our first simple model for the relationship between words. This is the first step in building a predictive text-mining application. We will explore simple models before moving on to more complicated modeling techniques.

Tasks to accomplish:

(i) Build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words.

(ii) Build a model to handle unseen n-grams - in some cases people will want to type a combination of words that does not appear in the corpora, so the model must also handle cases where a particular n-gram is not observed (a simple backoff sketch is given at the end of this report).

Loading all libraries

Using suppressMessages() to suppress the start-up messages printed when the packages load.

suppressMessages(library(NLP))
suppressMessages(library(tm))
suppressMessages(library(RColorBrewer))
suppressMessages(library(wordcloud))
suppressMessages(library(dplyr))
suppressMessages(library(stringi))
suppressMessages(library(RWeka))
suppressMessages(library(ggplot2))
suppressMessages(library(ngram))
suppressMessages(library(quanteda))
suppressMessages(library(gridExtra))
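
As an aside, base R also provides suppressPackageStartupMessages(), which silences only the package start-up messages; a minimal, equivalent-for-our-purposes sketch:

suppressPackageStartupMessages(library(tm)) # silences package start-up messages only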

Loading the data

# File path
file1 <- "./final/en_US/en_US.blogs.txt"
file2 <- "./final/en_US/en_US.news.txt"
file3 <- "./final/en_US/en_US.twitter.txt"
# Read blogs
connect <- file(file1, open="rb")
blogs <- readLines(connect, encoding="UTF-8",skipNul = TRUE); close(connect)
# Read news
connect <- file(file2, open="rb")
news <- readLines(connect, encoding="UTF-8",skipNul = TRUE); close(connect)
# Read twitter
connect <- file(file3, open="rb")
twitter <- readLines(connect, encoding="UTF-8",skipNul = TRUE); close(connect)

Examining the data

summaryData <- sapply(list(blogs,news,twitter),function(x) summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(summaryData) <- c('Min','Mean','Max')
stats <- data.frame(
  FileName=c("en_US.blogs","en_US.news","en_US.twitter"),      
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general)[c('Lines','Chars'), ],
          Words = sapply(list(blogs, news, twitter), stri_stats_latex)['Words', ],
          summaryData)))
head(stats)
##        FileName   Lines     Chars    Words Min     Mean  Max
## 1   en_US.blogs  899288 206824382 37570839   0 41.75107 6726
## 2    en_US.news 1010242 203223154 34494539   1 34.40997 1796
## 3 en_US.twitter 2360148 162096241 30451170   1 12.75065   47
# Get file sizes
blogs.size <- file.info(file1)$size / 1024 ^ 2
news.size <- file.info(file2)$size / 1024 ^ 2
twitter.size <- file.info(file3)$size / 1024 ^ 2
# Summary of dataset
df <- data.frame(Doc = c("blogs", "news", "twitter"),
                 Size.MB = c(blogs.size, news.size, twitter.size),
                 Num.Lines = c(length(blogs), length(news), length(twitter)),
                 Num.Chars = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))))
df
##       Doc  Size.MB Num.Lines Num.Chars
## 1   blogs 200.4242    899288 206824505
## 2    news 196.2775   1010242 203223159
## 3 twitter 159.3641   2360148 162096241

Sampling and Cleaning the Data

Since these files are large and my machine has limited RAM, I sample the data before cleaning it. I take 0.1% of each data set so that the sample can be processed comfortably in memory.

set.seed(34)
# Sampling
sampleBlogs <- blogs[sample(1:length(blogs), 0.001*length(blogs), replace=FALSE)]
sampleNews <- news[sample(1:length(news), 0.001*length(news), replace=FALSE)]
sampleTwitter <- twitter[sample(1:length(twitter), 0.001*length(twitter), replace=FALSE)]
# Cleaning
sampleBlogs <- iconv(sampleBlogs, "UTF-8", "ASCII", sub="")
sampleNews <- iconv(sampleNews, "UTF-8", "ASCII", sub="")
sampleTwitter <- iconv(sampleTwitter, "UTF-8", "ASCII", sub="")
data.sample <- c(sampleBlogs,sampleNews,sampleTwitter)
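
To avoid re-reading the full files on every run, the combined sample could also be cached to disk; a minimal sketch (the ./sample directory and file name are assumptions, not part of the pipeline above):

# Cache the sampled, ASCII-converted lines so later runs can skip the full read
if (!dir.exists("./sample")) dir.create("./sample")
writeLines(data.sample, "./sample/en_US.sample.txt")
# Later: data.sample <- readLines("./sample/en_US.sample.txt", skipNul = TRUE)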

Building Corpus

Additional cleaning steps are applied while building the corpus: lowercasing, and removing punctuation, numbers, and extra whitespace.

build_corpus <- function (x = data.sample) {
  sample_c <- VCorpus(VectorSource(x)) # Create corpus dataset
  sample_c <- tm_map(sample_c, content_transformer(tolower)) # All lowercase
  sample_c <- tm_map(sample_c, removePunctuation) # Eliminate punctuation
  sample_c <- tm_map(sample_c, removeNumbers) # Eliminate numbers
  sample_c <- tm_map(sample_c, stripWhitespace) # Strip whitespace
  sample_c # Return the cleaned corpus
}
corpusData <- build_corpus(data.sample)
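
If further cleaning were desired, for example dropping English stop words or a profanity list, tm_map() with removeWords() could be added to the same pipeline. A sketch only (the profanity file path is hypothetical); for a next-word prediction model, stop words are usually kept, so this step is optional:

# Optional extra cleaning, applied to the corpus built above
corpusData <- tm_map(corpusData, removeWords, stopwords("en"))  # drop common English stop words
# profanity <- readLines("./profanity.txt", skipNul = TRUE)     # hypothetical word list
# corpusData <- tm_map(corpusData, removeWords, profanity)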

Tokenizing and building n-grams

getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  # Create a term-document matrix tokenized on n-grams
  tokenizer <- function(x) { NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams)) }
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
  # Find the top n-grams with a minimum number of occurrences in the corpus
  top_terms <- findFreqTerms(tdm, lowfreq)
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms,]))
  top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
  top_terms_freq <- arrange(top_terms_freq, desc(frequency))
  top_terms_freq # Return the sorted frequency table
}
    
tt.Data <- vector("list", 3) # one slot each for uni-, bi-, and trigram tables
for (i in 1:3) {
  tt.Data[[i]] <- getTermTable(corpusData, ngrams = i, lowfreq = 10)
}
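
quanteda is loaded above but not used in the tm/RWeka pipeline; as an optional cross-check, roughly equivalent bigram counts could be obtained with it. A minimal sketch (object names toks and bigram_dfm are illustrative; API as in recent quanteda releases):

toks <- tokens(data.sample, remove_punct = TRUE, remove_numbers = TRUE) # tokenize the sample
toks <- tokens_tolower(toks)                                            # lowercase, as in build_corpus()
bigram_dfm <- dfm(tokens_ngrams(toks, n = 2, concatenator = " "))       # document-feature matrix of bigrams
topfeatures(bigram_dfm, 10)                                             # ten most frequent bigrams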

Building WordCloud

# Set random seed for reproducibility
set.seed(34)
options(warn=-1)
# Set Plotting in 1 row 3 columns
par(mfrow=c(1, 3))
for (i in 1:3) {
  wordcloud(tt.Data[[i]]$word, tt.Data[[i]]$frequency, scale = c(3,1), max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
}

The word clouds above show the most frequent unigrams, bigrams, and trigrams in the sampled corpus.

Building n-gram models and histograms

plot.Grams <- function (x = tt.Data, N=10) {
  g1 <- ggplot(data = head(x[[1]],N), aes(x = reorder(word, -frequency), y = frequency)) + 
        geom_bar(stat = "identity", fill = "green") + 
        ggtitle(paste("Unigrams")) + 
        xlab("Unigrams") + ylab("Frequency") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
  g2 <- ggplot(data = head(x[[2]],N), aes(x = reorder(word, -frequency), y = frequency)) + 
        geom_bar(stat = "identity", fill = "blue") + 
        ggtitle(paste("Bigrams")) + 
        xlab("Bigrams") + ylab("Frequency") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
  g3 <- ggplot(data = head(x[[3]],N), aes(x = reorder(word, -frequency), y = frequency)) + 
        geom_bar(stat = "identity", fill = "darkgreen") + 
        ggtitle(paste("Trigrams")) + 
        xlab("Trigrams") + ylab("Frequency") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
  # Put three plots into 1 row 3 columns
  gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
plot.Grams(x = tt.Data, N = 10)

The n-gram histograms show which words and word combinations occur most often in the sampled data.
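
Looking ahead to Task 2, the frequency tables in tt.Data could already back a very simple next-word predictor with a stupid-backoff-style fallback for unseen n-grams. The sketch below is illustrative only; predict_next() is a hypothetical helper, not part of the pipeline above:

# Look up the last one or two typed words in the trigram and bigram tables,
# falling back to the most frequent unigrams when no match is found.
predict_next <- function(phrase, tt = tt.Data, n_candidates = 3) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- tt[[3]][startsWith(as.character(tt[[3]]$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", as.character(hits$word)), n_candidates))
  }
  if (length(words) >= 1) {
    prefix <- tail(words, 1)
    hits <- tt[[2]][startsWith(as.character(tt[[2]]$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", as.character(hits$word)), n_candidates))
  }
  head(as.character(tt[[1]]$word), n_candidates) # unigram fallback for unseen n-grams
}
predict_next("thanks for the")

Because the tables were built from a 0.1% sample with a minimum frequency of 10, many prefixes will fall through to the unigram fallback; a fuller model would use larger samples, lower frequency cut-offs, and proper smoothing.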