Functions that are used repeatedly are collected in the file UtilityFunctions.R, which is reproduced at the end of this report.

source("UtilityFunctions.R")

Explore the data

First inspect the data to see how many lines and words are in each file (en_US.twitter.txt, en_US.blogs.txt, en_US.news.txt) and how large (in MB) each file is.

bcon<-file("D:/capstone/en_US/en_US.blogs.txt", "rb")#blogs
tcon<-file("D:/capstone/en_US/en_US.twitter.txt", "rb")#twitter
ncon<-file("D:/capstone/en_US/en_US.news.txt", "rb")#news

set.seed(917)

Open the files, get basic info with file.info and stri_count_words.

mbb<-round(file.info("D:/capstone/en_US/en_US.blogs.txt")$size/10^6, 2)
blogs<-readLines(bcon, skipNul = TRUE)
bsize<-length(blogs)
bcount<-sum(stri_count_words(blogs))

mbt<-round(file.info("D:/capstone/en_US/en_US.twitter.txt")$size/10^6, 2)
twitter<-readLines(tcon, skipNul = TRUE)
tsize<-length(twitter)
tcount<-sum(stri_count_words(twitter))

mbn<-round(file.info("D:/capstone/en_US/en_US.news.txt")$size/10^6, 2)
news<-readLines(ncon, skipNul = TRUE)
nsize<-length(news)
ncount<-sum(stri_count_words(news))

Summary table of original data set.

kable(table)
        BLOGS        TWITTER      NEWS
lines   899,288      2,360,148    1,010,242
words   38,154,238   30,218,166   35,010,782
MB      210.16       167.11       205.81
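
The construction of the table object passed to kable() is not shown above; a minimal sketch, assuming it is a matrix assembled from the values computed earlier, would be:

#assemble the summary matrix from the line, word and size figures computed above
table<-rbind(lines=c(bsize, tsize, nsize),
             words=c(bcount, tcount, ncount),
             mb=c(mbb, mbt, mbn))
colnames(table)<-c("BLOGS", "TWITTER", "NEWS")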

Subset to 10% of the data to ease future exploratory analysis.

blogs<-sample(blogs, length(blogs)*0.1) #sample 10% of the data
writeLines(text = blogs, "D:/capstone/blogs.txt", sep = "\n")
close(bcon)

twitter<-sample(twitter, length(twitter)*0.1) #sample 10% of the data
writeLines(text=twitter, con="D:/capstone/twitter.txt", sep="\n")
close(tcon)

news<-sample(news, length(news)*0.1) #sample 10% of the data
writeLines(text=news, con="D:/capstone/news.txt", sep="\n")
close(ncon)

GENERAL NOTES AND OBSERVATIONS:

Method

Examine these three data types and see how the number of unique and frequent words differs. We can use a statistical cutoff (qnorm) to find the frequency rate at the 90th percentile, under a normal approximation, and use it to identify the most frequent words in each data type.
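
As a small illustration of this cutoff (with made-up rates, not the corpus data):

rates<-c(0.001, 0.002, 0.002, 0.003, 0.010, 0.040) #hypothetical term rates
cutoff<-qnorm(0.90, mean(rates), sd=sd(rates)) #90th-percentile rate under a normal approximation
rates[rates>cutoff] #rates treated as "frequent"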

Sample the Data

The original dataset is too large to process comfortably on a consumer PC with the following attributes: Windows 10 x64, build 10586, SURFACE2, x86-64.

#read in the newly sampled data with the UtilityFunctions.R function readFast()
news<-readFast("news.txt")
blogs<-readFast("blogs.txt")
twitter<-readFast("twitter.txt")

Ten percent of the original dataset is still too large, so subsample again at 10%; the working data set therefore contains just 1% of the original data.

set.seed(917)
#still too big, resample the data to another 10% to provide 1% of the original data
sblogs<-sample(blogs, length(blogs)*0.1)
stwitter<-sample(twitter, length(twitter)*0.1)
snews<-sample(news, length(news)*0.1)
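
As a quick sanity check, each working sample should be roughly 1% of the original line counts (assuming bsize, tsize and nsize from the first chunk are still in the workspace):

round(c(blogs=length(sblogs)/bsize,
        twitter=length(stwitter)/tsize,
        news=length(snews)/nsize), 3)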

Clean and reformat the data with a Utility Function

sblogs<-cleanSample(sblogs)
stwitter<-cleanSample(stwitter)
snews<-cleanSample(snews)

Create a corpus for each of the three samples.

Profanities are filtered using the list from https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en.
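
sampleCorpus() (in UtilityFunctions.R) reads that list from a local file, D:/capstone/carlin.txt; a minimal sketch of how the file could be created from the URL above is:

#download the profanity list once and cache it locally for sampleCorpus()
download.file("https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en",
              destfile="D:/capstone/carlin.txt")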

In cleaning and tokenizing the corpora, I have removed "stopwords": words that are extremely common but lexically not useful for prediction, such as articles, possessives, prepositions and conjunctions. See the Appendix for the full list as provided by the tm library.

#use tm to create the corpora and clean tokens using tm_map

corpB<-sampleCorpus(sblogs)
## Loading required package: SnowballC
corpT<-sampleCorpus(stwitter)
corpN<-sampleCorpus(snews)

corpU<-sampleCorpus(c(sample(sblogs, length(sblogs)/3), sample(stwitter, length(stwitter)/3), sample(snews, length(snews)/3)))

Find the most frequent single words in each corpus. Use the frequency rate rather than the raw frequency count to account for different corpus sizes.

Use qnorm to find the rate cutoff at the 90th percentile of the unique-word rates.

The minimum frequency is set to 3 to help eliminate misspelled words.

Histograms are presented on normal and log scales because the distributions are heavily skewed, with most terms occurring only a few times. The histograms display the frequency of frequencies; the red line marks the mean frequency and the blue line the median.

#Twitter
tdmT<-getTDM(corpT, 1)
unT<-length(tdmT$dimnames$Terms)
lowT<-3
freqT<-sampleFreq(tdmT,lowT)
par(mfrow=c(2,1))
hist(freqT$frequency, breaks=1000, right = TRUE)
abline(v=mean(freqT$frequency), col="red")
abline(v=median(freqT$frequency), col="blue")

hist(log(freqT$frequency), breaks=1000, right = TRUE)
abline(v=mean(log(freqT$frequency)), col="red")
abline(v=median(log(freqT$frequency)), col="blue")

freqT$rate<-freqT$frequency/unT
qT<-qnorm(0.90,mean(freqT$rate),sd = sd(x = freqT$rate, na.rm=T))
freqT<-freqT[order(freqT$rate>qT, decreasing = TRUE), ] #terms above the cutoff first
plotT<-freqT[order(freqT[1:25, "rate"], decreasing=FALSE),] #top 25 terms, ascending for the barplot

#News
tdmN<-getTDM(corpN, 1)
unN<-length(tdmN$dimnames$Terms)
lowN<-3
freqN<-sampleFreq(tdmN,lowN)
par(mfrow=c(2,1))
hist(freqN$frequency, breaks=1000, right = TRUE)
abline(v=mean(freqN$frequency), col="red")
abline(v=median(freqN$frequency), col="blue")

hist(log(freqN$frequency), breaks=1000, right = TRUE)
abline(v=mean(log(freqN$frequency)), col="red")
abline(v=median(log(freqN$frequency)), col="blue")

freqN$rate<-freqN$frequency/unN
qN<-qnorm(0.90,mean(freqN$rate),sd = sd(x = freqN$rate, na.rm=T))
freqN<-freqN[order(freqN$rate>qN, decreasing = TRUE), ]
plotN<-freqN[order(freqN[1:25, "rate"], decreasing = FALSE), ]

#Blogs
tdmB<-getTDM(corpB, 1)
unB<-length(tdmB$dimnames$Terms)
lowB<-3
freqB<-sampleFreq(tdmB,lowB)
par(mfrow=c(2,1))
hist(freqB$frequency, breaks=1000, right = TRUE)
abline(v=mean(freqB$frequency), col="red")
abline(v=median(freqB$frequency), col="blue")

hist(log(freqB$frequency), breaks=1000, right = TRUE)
abline(v=mean(log(freqB$frequency)), col="red")
abline(v=median(log(freqB$frequency)), col="blue")

freqB$rate<-freqB$frequency/unB
qB<-qnorm(0.90,mean(freqB$rate),sd = sd(x = freqB$rate, na.rm=T))
freqB<-freqB[order(freqB$rate>qB, decreasing = TRUE), ]
plotB<-freqB[order(freqB[1:25, "rate"], decreasing = FALSE), ]

#Union of all three
tdmU<-getTDM(corpU, 1)
unU<-length(tdmU$dimnames$Terms)
lowU<-3
freqU<-sampleFreq(tdmU, lowU)
par(mfrow=c(2,1))
hist(freqU$frequency, breaks=1000, right = TRUE)
hist(log(freqU$frequency), breaks=1000, right = TRUE)

freqU$rate<-freqU$frequency/unU
qU<-qnorm(0.9, mean(freqU$rate), sd = sd(x=freqU$rate, na.rm=TRUE))
freqU<-freqU[order(freqU$rate>qU, decreasing = TRUE),]
plotU<-freqU[order(freqU[1:25, "rate"], decreasing = FALSE),]

#Intersection of all three
uniI<-merge(freqB, freqN, by="term") #find the terms that appear in all three
uniI<-merge(uniI, freqT, by="term")
names(uniI)<-c("term", "BlogCount", "BlogRate", "NewsCount", "NewsRate","TwitterCount" ,"TwitterRate")
uniI$Mean<-rowMeans(uniI[,c(3,5,7)])
uniI<-uniI[order(uniI$Mean, decreasing = TRUE),]
plotI<-uniI[order(uniI[1:25, "Mean"], decreasing=FALSE),]

par(mfrow=c(3,2), mar=c(5,6,2,2))
barplot(plotU$rate, names.arg=plotU$term, las=2, horiz=TRUE, main="Union", cex.names = .75, cex.axis = .5)
barplot(plotI$Mean, names.arg = plotI$term, las=2, horiz=TRUE, main="Intersection", cex.names = .75, cex.axis = .5)
barplot(plotT$rate, names.arg = plotT$term, las=2, horiz=TRUE, main="Twitter", cex.names = .75, cex.axis = .5)
barplot(plotN$rate, names.arg=plotN$term, las=2, horiz=TRUE, main = "News", cex.names = .75, cex.axis = .5)
barplot(plotB$rate, names.arg = plotB$term, las=2, horiz=TRUE, main="Blogs", cex.names = .75, cex.axis = .5)

Find the most frequent bigrams in each corpus and the most frequent bigrams that occur in the intersection of all three types.

#twitter
tdmTbi<-getTDM(corpT, 2)
unTbi<-length(tdmTbi$dimnames$Terms)
lowT<-3
freqTbi<-sampleFreq(tdmTbi,lowT)
freqTbi$rate<-freqTbi$frequency/unTbi
qTbi<-qnorm(0.90,mean(freqTbi$rate),sd = sd(x = freqTbi$rate, na.rm=T))
freqTbi<-freqTbi[order(freqTbi$rate>qTbi, decreasing = TRUE), ]
plotTbi<-freqTbi[order(freqTbi[1:25, "rate"], decreasing=FALSE),]

#news
tdmNbi<-getTDM(corpN, 2)
unNbi<-length(tdmNbi$dimnames$Terms)
lowN<-3
freqNbi<-sampleFreq(tdmNbi,lowN)
freqNbi$rate<-freqNbi$frequency/unNbi
qNbi<-qnorm(0.90,mean(freqNbi$rate),sd = sd(x = freqNbi$rate, na.rm=T))
freqNbi<-freqNbi[order(freqNbi$rate>qNbi, decreasing = TRUE), ]
plotNbi<-freqNbi[order(freqNbi[1:25, "rate"], decreasing = FALSE), ]

#blogs
tdmBbi<-getTDM(corpB, 2)
unBbi<-length(tdmBbi$dimnames$Terms)
lowB<-3
freqBbi<-sampleFreq(tdmBbi,lowB)
freqBbi$rate<-freqBbi$frequency/unBbi
qB<-qnorm(0.90,mean(freqBbi$rate),sd = sd(x = freqBbi$rate, na.rm=T))
freqBbi<-freqBbi[order(freqBbi$rate>qB, decreasing = TRUE), ]
plotBbi<-freqBbi[order(freqBbi[1:25, "rate"], decreasing = FALSE), ]

#union of twitter, news and blogs
tdmUbi<-getTDM(corpU, 2)
unUbi<-length(tdmUbi$dimnames$Terms)
lowU<-3
freqUbi<-sampleFreq(tdmUbi,lowU)
freqUbi$rate<-freqUbi$frequency/unUbi
qU<-qnorm(0.90,mean(freqUbi$rate),sd = sd(x = freqUbi$rate, na.rm=T))
freqUbi<-freqUbi[order(freqUbi$rate>qU, decreasing = TRUE), ]
plotUbi<-freqUbi[order(freqUbi[1:25, "rate"], decreasing = FALSE), ]

#intersection of twitter, news and blogs
biI<-merge(freqBbi, freqNbi, by="term") #find the terms that appear in all three
biI<-merge(biI, freqTbi, by="term")
names(biI)<-c("term", "BlogCount", "BlogRate", "NewsCount", "NewsRate","TwitterCount" ,"TwitterRate")
biI$Mean<-rowMeans(biI[,c(3,5,7)])
biI<-biI[order(biI$Mean, decreasing = TRUE),]
plotIbi<-biI[order(biI[1:25, "Mean"], decreasing=FALSE),]


#barplot of top 25 terms by rate.
par(mfrow=c(3,2), mar=c(5,10,2,2))
barplot(plotUbi$rate, names.arg=plotUbi$term, las=2, horiz=TRUE, main = "Union", cex.names = .75, cex.axis = .5)
barplot(plotIbi$Mean, names.arg=plotIbi$term, las=2, horiz=TRUE, main="Intersection", cex.names = .75, cex.axis = .5)
barplot(plotTbi$rate, names.arg = plotTbi$term, las=2, horiz=TRUE, main="Twitter", cex.names = .75, cex.axis = .5)
barplot(plotNbi$rate, names.arg=plotNbi$term, las=2, horiz=TRUE, main = "News", cex.names = .75, cex.axis = .5)
barplot(plotBbi$rate, names.arg = plotBbi$term, las=2, horiz=TRUE, main="Blogs", cex.names = .75, cex.axis = .5)

And finally, find the most frequent trigrams.

tdmTtri<-getTDM(corpT, 3)
unTtri<-length(tdmTtri$dimnames$Terms)
lowT<-2
freqTtri<-sampleFreq(tdmTtri,lowT)
freqTtri$rate<-freqTtri$frequency/unTtri
qTtri<-qnorm(0.90,mean(freqTtri$rate),sd = sd(x = freqTtri$rate, na.rm=T))
freqTtri<-freqTtri[order(freqTtri$rate>qTtri, decreasing = TRUE), ]
plotTtri<-freqTtri[order(freqTtri[1:25, "rate"], decreasing=FALSE),]


tdmNtri<-getTDM(corpN, 3)
unNtri<-length(tdmNtri$dimnames$Terms)
lowN<-2
freqNtri<-sampleFreq(tdmNtri,lowN)
freqNtri$rate<-freqNtri$frequency/unNtri
qNtri<-qnorm(0.90,mean(freqNtri$rate),sd = sd(x = freqNtri$rate, na.rm=T))
freqNtri<-freqNtri[order(freqNtri$rate>qNtri, decreasing = TRUE), ]
plotNtri<-freqNtri[order(freqNtri[1:25, "rate"], decreasing = FALSE), ]

tdmBtri<-getTDM(corpB, 3)
unBtri<-length(tdmBtri$dimnames$Terms)
lowB<-2
freqBtri<-sampleFreq(tdmBtri,lowB)
freqBtri$rate<-freqBtri$frequency/unBtri
qB<-qnorm(0.90,mean(freqBtri$rate),sd = sd(x = freqBtri$rate, na.rm=T))
freqBtri<-freqBtri[order(freqBtri$rate>qB, decreasing = TRUE), ]
plotBtri<-freqBtri[order(freqBtri[1:25, "rate"], decreasing = FALSE), ]

tdmUtri<-getTDM(corpU, 3)
unUtri<-length(tdmUtri$dimnames$Terms)
lowU<-2
freqUtri<-sampleFreq(tdmUtri,lowU)
freqUtri$rate<-freqUtri$frequency/unUtri
qU<-qnorm(0.90,mean(freqUtri$rate),sd = sd(x = freqUtri$rate, na.rm=T))
freqUtri<-freqUtri[order(freqUtri$rate>qU, decreasing = TRUE), ]
plotUtri<-freqUtri[order(freqUtri[1:25, "rate"], decreasing = FALSE), ]

triI<-merge(freqBtri, freqNtri, by="term") #find the terms that appear in all three
triI<-merge(triI, freqTtri, by="term")
names(triI)<-c("term", "BlogCount", "BlogRate", "NewsCount", "NewsRate","TwitterCount" ,"TwitterRate")
triI$Mean<-rowMeans(triI[,c(3,5,7)])
triI<-triI[order(triI$Mean, decreasing = TRUE),]
plotItri<-triI[order(triI[1:25, "Mean"], decreasing=FALSE),]

par(mfrow=c(3,2), mar=c(5,10,2,2))
barplot(plotUtri$rate, names.arg=plotUtri$term, las=2, horiz=TRUE, main="Union", cex.names = .75, cex.axis = .5)
barplot(plotItri$Mean, names.arg = plotItri$term, las=2, horiz=TRUE, main="Intersection", cex.names = .75, cex.axis = .5)
barplot(plotTtri$rate, names.arg = plotTtri$term, las=2, horiz=TRUE, main="Twitter", cex.names = .75, cex.axis = .5)
barplot(plotNtri$rate, names.arg=plotNtri$term, las=2, horiz=TRUE, main = "News", cex.names = .75, cex.axis = .5)
barplot(plotBtri$rate, names.arg = plotBtri$term, las=2, horiz=TRUE, main="Blogs", cex.names = .75, cex.axis = .5)

Next Steps

Additional exploration of forward and backward n-grams, using the top 1,000 non-stopword English words to determine the likely "next word".

Experiment with machine learning techniques, such as random forests, to best estimate the next word from the top 1,000 non-stopword English words. Add stopwords back in to see how the results change.

Going forward I would like to create a Shiny app that predicts the next word in two different contexts: Twitter or blog/news. The user types a word into a text field and the app returns two columns of three possible next words, one column for Twitter and the other for blog/news; a rough sketch of such an interface follows.
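
A minimal sketch of that interface, assuming hypothetical placeholder predictors (predictTwitter() and predictBlogNews() stand in for the eventual n-gram models):

library(shiny)

#placeholder predictors; the real versions would look up the typed word in the
#Twitter and blog/news n-gram tables built above
predictTwitter<-function(word) rep("?", 3)
predictBlogNews<-function(word) rep("?", 3)

ui<-fluidPage(
        textInput("word", "Type a word:"),
        fluidRow(
                column(6, h4("Twitter"), tableOutput("twitterNext")),
                column(6, h4("Blog/News"), tableOutput("blogNewsNext"))
        )
)

server<-function(input, output){
        output$twitterNext<-renderTable(data.frame(next.word=predictTwitter(input$word)))
        output$blogNewsNext<-renderTable(data.frame(next.word=predictBlogNews(input$word)))
}

shinyApp(ui, server)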

Appendix

Stopwords removed:

stopwords("en")
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"

UtilityFunctions.R functions

library(knitr)
library(stringi)
library(tm)
library(RWeka)

readFast<-function(x){
        #read the whole file as a single character buffer, then split on
        #newlines; much faster than readLines() for large files
        s<-file.info(x)$size
        buf<-readChar(x, s, useBytes = TRUE)
        strsplit(buf, "\n", fixed = TRUE, useBytes = TRUE)[[1]]
}

cleanSample<-function(x){
        require(stringi)

        #convert to ASCII, dropping characters that cannot be represented
        file<-iconv(x, from="ISO-8859-1", to="ASCII", sub="")
        file<-file[!is.na(file)]

        #punctuation removal and lower-casing are handled later by tm_map in
        #sampleCorpus(); here only trim whitespace, strip quotation marks and
        #remove digits
        file<-stri_trim_both(file)
        file<-gsub('\"', "", file)
        file<-gsub('[[:digit:]]+', '', file)

        file
}



sampleCorpus<-function(sample){
        require(tm)
        require(SnowballC)

        #profanity list saved locally from the Shutterstock bad-words list
        carlin<-readLines("D:/capstone/carlin.txt")

        sample<-Corpus(VectorSource(sample))
        sample<-tm_map(sample, removePunctuation)
        sample<-tm_map(sample, tolower)
        sample<-tm_map(sample, stripWhitespace)
        sample<-tm_map(sample, removeWords, stopwords("en")) #drop stopwords (see Appendix)
        sample<-tm_map(sample, stemDocument)
        sample<-tm_map(sample, removeWords, carlin)          #drop profanities
        sample<-tm_map(sample, PlainTextDocument)

        sample
}
        
getTDM<-function(corpsample, n){
        require(RWeka)
        options(mc.cores=1) #use a single core; the RWeka tokenizer can misbehave when tm runs in parallel

        #tokenize into n-grams of exactly length n
        token <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
        tdm <- TermDocumentMatrix(corpsample, control = list(tokenize = token))
        #no sparse-term filtering is applied; the full term-document matrix is returned
        tdm
}


sampleFreq<-function(tdm, low=50){
        #keep terms occurring at least 'low' times and return them with their
        #counts, sorted by frequency in decreasing order
        freqTerms <- findFreqTerms(tdm, lowfreq = low)
        termFrequency <- rowSums(as.matrix(tdm[freqTerms, ]))
        termFrequency <- data.frame(term=names(termFrequency), frequency=termFrequency, row.names=NULL)
        termFrequency<-termFrequency[order(termFrequency$frequency, decreasing = TRUE), ]
        termFrequency
}