Overview

The aim of the Capstone project is to build a predictive text model using Natural Language Processing (NLP), along with a predictive text application that determines the most likely next word when a user inputs a word or phrase. The purpose of this milestone report is to demonstrate how the data was downloaded, imported into R, and cleaned. It also contains exploratory analysis of the text data, which comes in three sets (blogs, news, and Twitter).

Libraries Required

library(tm)
## Loading required package: NLP
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.3
library(stringi)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pryr)
## Warning: package 'pryr' was built under R version 3.6.3
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## 
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
## 
##     inspect
library(RColorBrewer)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3

Import the Data

setwd("E:/JHU Data Science/Capstone")
blogs <- readLines("./final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines("./final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("./final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

Summary Statistics

stats <- data.frame(
  FileName = c("blogs", "news", "twitter"),
  FileSize = sapply(list(blogs, news, twitter), function(x) {format(object.size(x), "MB")}),
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
          Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ])
  )
)

stats
##   FileName FileSize   Lines LinesNEmpty     Chars CharsNWhite    Words
## 1    blogs 255.4 Mb  899288      899288 206824382   170389539 37570839
## 2     news  19.8 Mb   77259       77259  15639408    13072698  2651432
## 3  twitter   319 Mb 2360148     2360148 162096241   134082806 30451170

Sample the Data

We are going to subset the data into three new data sets, each containing a 1% sample of one of the original files. We will start with a 1% sample and check the size of the VCorpus (Virtual Corpus) object that will be loaded into memory.

set.seed(1001)
sampleSize <- 0.01

# Draw a 1% random sample of lines from each source
blogsSub <- sample(blogs, length(blogs) * sampleSize)
newsSub <- sample(news, length(news) * sampleSize)
twitterSub <- sample(twitter, length(twitter) * sampleSize)

# Combine the three samples into a single data set
sampleData <- c(blogsSub, newsSub, twitterSub)

sampleStats <- data.frame(
  FileName = c("blogsSub", "newsSub", "twitterSub", "sampleData"),
  FileSize = sapply(list(blogsSub, newsSub, twitterSub, sampleData), function(x) {format(object.size(x), "MB")}),
  t(rbind(sapply(list(blogsSub, newsSub, twitterSub, sampleData), stri_stats_general),
          Words = sapply(list(blogsSub, newsSub, twitterSub, sampleData), stri_stats_latex)[4, ])
  )
)

sampleStats
##     FileName FileSize Lines LinesNEmpty   Chars CharsNWhite  Words
## 1   blogsSub   2.6 Mb  8992        8992 2083795     1717050 377945
## 2    newsSub   0.2 Mb   772         772  154332      128930  26336
## 3 twitterSub   3.2 Mb 23601       23601 1621004     1340228 304838
## 4 sampleData     6 Mb 33365       33365 3835341     3167327 704226

Data Cleaning

# Corpus Building
corpus <- VCorpus(VectorSource(sampleData))
object_size(corpus)
## 77.8 MB

The VCorpus object is quite large (77.8 MB), even though the sample size is only 1%. This may become an issue due to memory constraints when it comes time to build the predictive model, but we will start here and see where this approach leads.
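
One simple way to ease the memory pressure, assuming the full raw vectors are no longer needed once the sample has been drawn, is to release them before building the corpus:

# A minimal sketch (assumes blogs, news and twitter are not needed again):
# drop the full raw vectors and ask R to reclaim the memory.
rm(blogs, news, twitter)
gc()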

We next need to clean the corpus data using functions from the tm package. Common text-mining cleaning tasks include:

Converting everything to lower case

Removing punctuation marks, numbers, and extra whitespace

Removing stopwords (common words like “and”, “or”, “is”, “in”, etc.) and filtering out other unwanted words such as profanity

For now we apply only the case, punctuation, number, and whitespace transformations; whether to remove stopwords and filter profanity is deferred to the Next Steps section.

cleanCorpus <- tm_map(corpus, content_transformer(tolower)) # Convert all to lower case
cleanCorpus <- tm_map(cleanCorpus, removePunctuation) # Remove punctuation marks
cleanCorpus <- tm_map(cleanCorpus, removeNumbers) # Remove numbers
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace) # Remove whitespace
cleanCorpus <- tm_map(cleanCorpus, PlainTextDocument) # Convert all to plain text document
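
Note that removePunctuation() does not reliably strip Unicode punctuation such as curly quotes and dashes, which is why terms like “the and – and still appear in the N-gram tables below. One possible fix, sketched here but not applied in this report, is a custom transformer that keeps only ASCII letters and spaces (removeSpecialChars is an illustrative helper name, not part of tm):

# A hedged sketch (not applied here): keep only ASCII letters and spaces.
removeSpecialChars <- content_transformer(function(x) gsub("[^a-zA-Z ]", " ", x))
# cleanCorpus <- tm_map(cleanCorpus, removeSpecialChars)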

Tokenize and Construct the N-Grams

We next need to tokenize the clean corpus (i.e., break the text up into words and short phrases) and construct a set of N-grams. We will start with the following three N-grams:

Unigram - A matrix containing individual words

Bigram - A matrix containing two-word patterns

Trigram - A matrix containing three-word patterns
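
As a quick illustration of what the tokenizers produce (toy input, not drawn from the corpus), the bigram tokenizer splits a phrase into overlapping two-word sequences:

# Illustration only: bigrams from a toy phrase; the output should look like
# "thanks for" "for the" "the follow"
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))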

# Tokenizer functions for one-, two- and three-word N-grams
uniTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Term-document matrices built with each tokenizer
uniMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = uniTokenizer))
biMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = biTokenizer))
triMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = triTokenizer))

Calculate the N-Gram Frequencies

# Keep only the terms that appear at least 20 times
uniCorpus <- findFreqTerms(uniMatrix, lowfreq = 20)
biCorpus <- findFreqTerms(biMatrix, lowfreq = 20)
triCorpus <- findFreqTerms(triMatrix, lowfreq = 20)

# Sum each term's counts across documents and build frequency data frames
uniCorpusFreq <- rowSums(as.matrix(uniMatrix[uniCorpus, ]))
uniCorpusFreq <- data.frame(word = names(uniCorpusFreq), frequency = uniCorpusFreq)
biCorpusFreq <- rowSums(as.matrix(biMatrix[biCorpus, ]))
biCorpusFreq <- data.frame(word = names(biCorpusFreq), frequency = biCorpusFreq)
triCorpusFreq <- rowSums(as.matrix(triMatrix[triCorpus, ]))
triCorpusFreq <- data.frame(word = names(triCorpusFreq), frequency = triCorpusFreq)

head(uniCorpusFreq)
##            word frequency
## “the       “the        60
## “we         “we        29
## ability ability        36
## able       able       191
## about     about      2179
## above     above       107
head(biCorpusFreq)
##                    word frequency
## – and             – and        34
## – i                 – i        20
## – the             – the        25
## a baby           a baby        29
## a bad             a bad        66
## a beautiful a beautiful        45
head(triCorpusFreq)
##                    word frequency
## a bit of       a bit of        36
## a bunch of   a bunch of        31
## a chance to a chance to        33
## a couple of a couple of        69
## a few days   a few days        30
## a few weeks a few weeks        20

Sort Each Frequency Table in Descending Order

# Sort each frequency table from most to least frequent
uniCorpusFreqDescend <- arrange(uniCorpusFreq, desc(frequency))
biCorpusFreqDescend <- arrange(biCorpusFreq, desc(frequency))
triCorpusFreqDescend <- arrange(triCorpusFreq, desc(frequency))

Visualising the Data

uniBar <- ggplot(data = uniCorpusFreqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "black") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle(paste("Top 20 Unigrams")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
biBar <- ggplot(data = biCorpusFreqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "springgreen") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle(paste("Top 20 Bigrams")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
triBar <- ggplot(data = triCorpusFreqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle(paste("Top 20 Trigrams")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
uniBar

biBar

triBar

Word Cloud

uniCloud <- wordcloud(uniCorpusFreq$word, uniCorpusFreq$frequency, scale = c(2, 0.5), max.words = 100, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

biCloud <- wordcloud(biCorpusFreq$word, biCorpusFreq$frequency, scale = c(2, 0.5), max.words = 100, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

triCloud <- wordcloud(triCorpusFreq$word, triCorpusFreq$frequency, scale = c(2, 0.5), max.words = 100, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

Next Steps

  1. Build and test different prediction models and evaluate each based on its performance.

  2. Make and test any necessary modifications to resolve any issues encountered during modeling.

  3. Build, test and deploy a Shiny app with a simple user interface that has acceptable run time and reliably and accurately predicts the next word based on a word or phrase entered by the user.

  4. Decide whether to remove stopwords and filter out profanity, if necessary; one possible approach is sketched below.
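
A hedged sketch of how stopword removal and profanity filtering could be done with tm (profanityWords is a placeholder vector; an actual word list would still need to be sourced):

# Remove common English stopwords, then a list of profane words
profanityWords <- c("badword1", "badword2") # placeholder, not a real list
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords("english"))
cleanCorpus <- tm_map(cleanCorpus, removeWords, profanityWords)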