Executive Summary

This report documents the steps of the algorithm used to build a predictive text model. Three files, containing blog posts, news articles, and Twitter feeds respectively, are downloaded and analysed.
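
For reference, the raw files can be obtained roughly as follows (a sketch: the URL and archive layout are assumed to be the standard Coursera-SwiftKey distribution and are not taken from the analysis itself).

# Sketch: download and unpack the English corpus files (URL and paths assumed).
if (!file.exists("en_US.blogs.txt")) {
    url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
    unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
          files = c("final/en_US/en_US.blogs.txt",
                    "final/en_US/en_US.news.txt",
                    "final/en_US/en_US.twitter.txt"))
}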

Basic Statistics

Basic data tables and row counts

USB <- read.table('en_US.blogs.txt', sep='\t', quote='', stringsAsFactors=FALSE)
USN <- read.table('en_US.news.txt', sep='\t', quote='', stringsAsFactors=FALSE)
UST <- read.table('en_US.twitter.txt', sep='\t', quote='', stringsAsFactors=FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
class(USB); class(USN); class(UST)
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
dim(USB); dim(USN); dim(UST)
## [1] 898384      1
## [1] 77258     1
## [1] 2302307       1

Word counts

The totals below are rough approximations, obtained by counting word separators on each line and adding one word per line.

sum(sapply(gregexpr("[A-z]\\W+", USB[1:898384,]), length) + 1L)
## [1] 38022025
sum(sapply(gregexpr("[A-z]\\W+", USN[1:77258,]), length) + 1L)
## [1] 2724759
sum(sapply(gregexpr("[A-z]\\W+", UST[1:2302307,]), length) + 1L)
## [1] 30193276
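
These totals are sensitive to how lines and separators are parsed (read.table drops blank lines, for example). As a rough cross-check, words can also be counted by splitting each raw line on whitespace (a sketch; the counts will differ slightly from the regex-based figures above).

# Sketch: approximate word count by splitting raw lines on whitespace.
count_words <- function(path) {
    lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
    sum(vapply(strsplit(lines, "\\s+"), length, integer(1)))
}
# count_words("en_US.blogs.txt"); count_words("en_US.news.txt"); count_words("en_US.twitter.txt")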

Sampling

To balance accuracy against scalability, a 0.1% sample is taken from each of the three data sets. The three samples are then combined into a single corpus, from which the most frequent unigrams (single words), bigrams (word pairs) and trigrams (word triples) are extracted.

library(dplyr); set.seed(3277)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
SUSB <- sample_n(USB, 898)
SUSN <- sample_n(USN, 77)
SUST <- sample_n(UST, 2302)
SUSC <- rbind(SUSB, SUSN, SUST)
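
The hard-coded sample sizes above are simply 0.1% of the row counts reported earlier; they could be derived directly (a sketch using the same data frames):

# Sketch: derive the 0.1% sample sizes from the row counts instead of hard-coding them.
frac <- 0.001
round(nrow(USB) * frac)  # 898
round(nrow(USN) * frac)  # 77
round(nrow(UST) * frac)  # 2302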

Removing profanity from the data

BW <- read.csv("profanity.csv", header = FALSE, stringsAsFactors = FALSE) # one profane term per row, in column V1
library(tm); library(SnowballC)
## Loading required package: NLP
SUSC.corpus <- Corpus(VectorSource(SUSC$V1)) # create a VCorpus with one document per sampled line
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, BW$V1) # drop any term on the profanity list
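
Note that removeWords matches terms case-sensitively and the corpus is only lower-cased in the next section, so capitalised variants of a listed word would survive this step. One option is to also remove title-case forms (a sketch; tools::toTitleCase is just one way to build those variants):

# Sketch: also strip title-case variants of each profane term.
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, tools::toTitleCase(BW$V1))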

Cleaning and stemming the data

SUSC.corpus <- tm_map(SUSC.corpus, content_transformer(tolower)) # lower-case first so stop words match
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, stopwords("english")) # drop common English stop words
SUSC.corpus <- tm_map(SUSC.corpus, removePunctuation)
SUSC.corpus <- tm_map(SUSC.corpus, removeNumbers)
SUSC.corpus <- tm_map(SUSC.corpus, stripWhitespace) # collapse whitespace left behind by the removals
SUSC.corpus <- tm_map(SUSC.corpus, PlainTextDocument) # ensure documents are PlainTextDocument before stemming
SUSC.corpus <- tm_map(SUSC.corpus, stemDocument) # reduce words to their stems
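
As a quick sanity check, the same transformations can be applied to a single made-up sentence (a minimal sketch, assuming tm's character methods for these functions):

# Sketch: the cleaning steps applied to one made-up sentence.
x <- "The 3 Quick Brown Foxes were jumping!"
x <- tolower(x)
x <- removeWords(x, stopwords("english"))   # drops "the", "were", ...
x <- removeNumbers(removePunctuation(x))
stemDocument(stripWhitespace(x))            # roughly "quick brown fox jump"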

Tokenization

if(Sys.getenv("JAVA_HOME") != ""){
      Sys.setenv(JAVA_HOME = "") # clear JAVA_HOME so rJava does not pick up an incompatible JDK
}
library(rJava); library(RWeka)
options(mc.cores = 1) # run tm on a single core; rJava-based tokenizers fail in forked parallel workers
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1)) # single words
bi_token  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) # word pairs
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) # word triples
unigram <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = uni_token))
bigram  <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = bi_token))
trigram <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = tri_token))
removeSparseTerms(unigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
removeSparseTerms(bigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
removeSparseTerms(trigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
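
With a sparsity threshold of 0.1, only terms that appear in at least ~90% of the 3,277 sampled documents are kept, and no n-gram is that common, so every term is dropped (as the output above shows); the frequency tables below are therefore built from the unpruned matrices. Pruning only the rarest terms requires a much gentler threshold (a sketch; 0.999 is an arbitrary illustrative value):

# Sketch: keep only terms appearing in more than ~0.1% of the documents.
unigram.pruned <- removeSparseTerms(unigram, 0.999)
unigram.pruned  # prints the reduced TermDocumentMatrix summary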

Creating a tokenized database

## Finding the totals for each term
freq.uni <- rowSums(as.matrix(unigram))
freq.bi  <- rowSums(as.matrix(bigram))
freq.tri <- rowSums(as.matrix(trigram))
## Sorting the totals
freq.uni <- sort(freq.uni, decreasing = TRUE)
freq.bi  <- sort(freq.bi, decreasing = TRUE)
freq.tri <- sort(freq.tri, decreasing = TRUE)
## Create the top n data frames from the matrices
df40.freq.uni <- data.frame("Term"=names(head(freq.uni,40)), "Frequency"=head(freq.uni,40))
df20.freq.bi  <- data.frame("Term"=names(head(freq.bi,20)), "Frequency"=head(freq.bi,20))
df10.freq.tri <- data.frame("Term"=names(head(freq.tri,10)), "Frequency"=head(freq.tri,10))
## Reorder levels for better plotting
df40.freq.uni$uniTerm <- reorder(df40.freq.uni$Term, df40.freq.uni$Frequency)
df20.freq.bi$biTerm  <- reorder(df20.freq.bi$Term, df20.freq.bi$Frequency)
df10.freq.tri$triTerm <- reorder(df10.freq.tri$Term, df10.freq.tri$Frequency)
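
Before plotting, the tables can be inspected directly as a quick check (a small usage sketch):

# Sketch: peek at the most frequent terms of each n-gram table.
head(df40.freq.uni, 5); head(df20.freq.bi, 5); head(df10.freq.tri, 5)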

Plotting

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
g1 <- ggplot(df40.freq.uni, aes(x = uniTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 40 UniGram Tokenized Word Frequency") +
    coord_flip()
print(g1)

g2 <- ggplot(df20.freq.bi, aes(x = biTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 20 BiGram Tokenized Word Frequency") +
    coord_flip()
print(g2)

g3 <- ggplot(df10.freq.tri, aes(x = triTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 10 TriGram Tokenized Word Frequency") +
    coord_flip()
print(g3)

Next Steps