Synopsis

The goal of this report is to load the text data set, clean it, perform exploratory data analysis, and create unigrams, bigrams and trigrams. The ultimate goal is to prepare for developing a Shiny application that performs next-word prediction.

Loading the Data

First, the global environment is cleaned, the working directory is set and required packages are loaded.

rm(list = ls())
setwd("~/Documents/classes/dataScSpec/capstone")
require(RWeka)
require(ggplot2)
require(stringi)
require(tm)
require(SnowballC)
require(wordcloud)
require(dplyr)
require(knitr)

Then the training data set file is downloaded and unzipped. Training data sets are provided in 4 languages: Finnish, Russian, English and German. This report focuses on analyzing the English data set. Its folder contains 3 files with samples from blogs, Twitter messages and news. All 3 are read into separate variables.

# URL for the data file
file <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# downloading and unzipping the file
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file(file, destfile="Coursera-SwiftKey.zip", method="curl")
}
unzip("Coursera-SwiftKey.zip")

# reading US twitter file
twCon <- file("./final/en_US/en_US.twitter.txt", "r")
tw <- readLines(con = twCon, skipNul = TRUE)
close(twCon)

# reading US blogs file
blCon <- file("./final/en_US/en_US.blogs.txt", "r")
bl <- readLines(con = blCon, skipNul = TRUE)
close(blCon)

# reading US news file
nCon <- file("./final/en_US/en_US.news.txt", "r")
news <- readLines(con = nCon, skipNul = TRUE)
close(nCon)

Exploratory Data Analysis of the complete data sets

Preliminary exploratory data analysis is performed by checking whether any elements contain no data. File sizes, word counts and line counts are also calculated and shown in the table below.

# Are there lines with missing data?
sum(bl == "")
## [1] 0
sum(tw == "")
## [1] 0
sum(news == "")
## [1] 0
kable(data.frame(
    File = c("twitter", "blog", "news"),
    File.Size.In.MB = round(c(object.size(tw) / 1024 ^ 2,
                     object.size(bl) / 1024 ^ 2,
                     object.size(news)/ 1024 ^ 2), 1),
    Word.Count = c(
        sum(stri_count(tw,regex="\\S+")),
        sum(stri_count(bl,regex="\\S+")),
        sum(stri_count(news,regex="\\S+"))),
    Line.Count = c(length(tw), length(bl), length(news))
), col.names = c("File", "File size in Mb", "Word count", "Line count"))
File      File size in Mb   Word count   Line count
twitter             301.4     30373583      2360148
blog                248.5     37334131       899288
news                249.6     34372530      1010242

Sampling the data

Since each file of the training set is large, roughly 0.1% of the lines of each file were sampled and combined into a single vector, which was saved to disk for easier retrieval later.

sampleFactor <- 1000
set.seed(1)
twSample <- sample(tw, length(tw) / sampleFactor)
blSample <- sample(bl, length(bl) / sampleFactor)
nSample <- sample(news, length(news) / sampleFactor)

combSample <- c(twSample, blSample, nSample)

# Saving combined sample, cleaning memory
writeLines(combSample, "combSample.txt")
rm(tw, bl, news, twSample, blSample, nSample)

Data cleaning

The saved sample of the training data set was read back and converted into a corpus. It was then cleaned by removing punctuation and numbers and converting all text to lowercase, after which stop words and extra white space were removed. Since the samples may contain profanity, and it is undesirable for the algorithm to suggest profane words, these also need to be removed from the corpus. A list of profanity was downloaded from https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/badwordslist/badwords.txt and the listed words were eliminated from the corpus.

# reading saved sample
sampCon <- file("./combSample.txt", "r")
combSample <- readLines(sampCon)
close(sampCon)

sampCorp <- VCorpus(VectorSource(list(combSample)))

sampCorp <- tm_map(sampCorp, removePunctuation)
sampCorp <- tm_map(sampCorp, removeNumbers)
sampCorp <- tm_map(sampCorp, content_transformer(tolower))
sampCorp <- tm_map(sampCorp, removeWords, stopwords("english"))
sampCorp <- tm_map(sampCorp, stripWhitespace)
#sampCorp <- tm_map(sampCorp, stemDocument)


# remove profanity
profFile <- "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/badwordslist/badwords.txt"
if (!file.exists("badwords.txt")) {
    download.file(profFile, destfile="badwords.txt", method="curl")
}

profCon <- file("./badwords.txt", "r")
prof <- readLines(profCon)
close(profCon)
prof_new <- gsub("[*()]", "", prof)  # strip the *, ( and ) characters used in the word list
sampCorp <- tm_map(sampCorp, removeWords, prof_new)
rm(prof, prof_new)

The most frequently occurring words are depicted in a word cloud.

wordcloud(sampCorp, min.freq=110, max.words=80, colors=brewer.pal(8, "Set1"))

Building n-grams

Unigrams, bigrams and trigrams are built using the RWeka and tm packages.

tokenizer1 <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
tdm1 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer1))

tokenizer2 <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
tdm2 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer2))

tokenizer3 <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
tdm3 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer3))

Top 20 unigrams, bigrams and trigrams

The 20 most frequent unigrams, bigrams and trigrams were plotted.

freqTerms1 <- findFreqTerms(tdm1)
termFreq1 <- rowSums(as.matrix(tdm1[freqTerms1, ]))
termFreq1 <- data.frame(unigram=names(termFreq1), frequency=termFreq1)
topFreq1 <- termFreq1 %>% arrange(desc(frequency)) %>% head(20)
topFreq1$unigram <- factor(as.character(topFreq1$unigram), 
    levels = (as.character(topFreq1$unigram)))


p1 <- ggplot(topFreq1, aes(x=unigram, y=frequency)) +
    geom_bar(stat = "identity") +  
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top 20 Unigrams by Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p1)

freqTerms2 <- findFreqTerms(tdm2)
termFreq2 <- rowSums(as.matrix(tdm2[freqTerms2, ]))
termFreq2 <- data.frame(bigram=names(termFreq2), frequency=termFreq2)
topFreq2 <- termFreq2 %>% arrange(desc(frequency)) %>% head(20)
topFreq2$bigram <- factor(as.character(topFreq2$bigram), 
    levels = (as.character(topFreq2$bigram)))

p2 <- ggplot(topFreq2, aes(x=bigram, y=frequency)) +
    geom_bar(stat = "identity") +  
    xlab("Bigram") + ylab("Frequency") +
    labs(title = "Top 20 Bigrams by Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p2)

freqTerms3 <- findFreqTerms(tdm3)
termFreq3 <- rowSums(as.matrix(tdm3[freqTerms3, ]))
termFreq3 <- data.frame(trigram=names(termFreq3), frequency=termFreq3)
topFreq3 <- termFreq3 %>% arrange(desc(frequency)) %>% head(20)
topFreq3$trigram <- factor(as.character(topFreq3$trigram), 
    levels = (as.character(topFreq3$trigram)))

p3 <- ggplot(topFreq3, aes(x=trigram, y=frequency)) +
    geom_bar(stat = "identity") +  
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top 20 Trigrams by Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p3)

Plan for creating the final algorithm and the Shiny app

  1. Create and analyse n-grams with a larger sample of the training set.
  2. The algorithm should convert n-gram frequencies into probabilities.
  3. If a trigram is unavailable, back off to a bigram; if a bigram is unavailable, back off to a unigram (see the sketch below).
  4. Assign low, non-zero probabilities to word combinations not found in the n-grams.
  5. The Shiny app will have 2 boxes: a text input and a suggested output (showing a few of the most likely suggestions).
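
As a rough illustration of steps 2 to 4, the sketch below derives sorted frequency tables from the term-document matrices built earlier and backs off from trigrams to bigrams to unigrams. The function name predictNextWord and the vectors freq1, freq2 and freq3 are illustrative assumptions, not the final algorithm; the final version will work with probabilities, smoothing and a larger sample.

# Sorted frequency tables derived from the term-document matrices built above
freq1 <- sort(rowSums(as.matrix(tdm1)), decreasing = TRUE)
freq2 <- sort(rowSums(as.matrix(tdm2)), decreasing = TRUE)
freq3 <- sort(rowSums(as.matrix(tdm3)), decreasing = TRUE)

# Hypothetical backoff predictor: look for trigrams starting with the last two
# words of the input, then bigrams starting with the last word, then fall back
# to the most frequent unigrams. Regex metacharacters in the input are not
# handled; this is only a sketch.
predictNextWord <- function(input, n = 3) {
    words <- unlist(strsplit(tolower(input), "\\s+"))
    words <- words[words != ""]

    if (length(words) >= 2) {
        pattern <- paste("^", paste(tail(words, 2), collapse = " "), " ", sep = "")
        hits <- freq3[grepl(pattern, names(freq3))]
        if (length(hits) > 0)
            return(head(sapply(strsplit(names(hits), " "), tail, 1), n))
    }
    if (length(words) >= 1) {
        pattern <- paste("^", tail(words, 1), " ", sep = "")
        hits <- freq2[grepl(pattern, names(freq2))]
        if (length(hits) > 0)
            return(head(sapply(strsplit(names(hits), " "), tail, 1), n))
    }
    head(names(freq1), n)
}

# Example call (results depend on the sampled data)
predictNextWord("happy new")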