Loading the Data

First we will load the three data sets (blogs, news, and Twitter) into R:

#library(readr)

#Setup the blogs file
blogsFile <- "en_US.blogs.txt"
con <- file(blogsFile, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

#Setup the news file
newsFile <- "en_US.news.txt"
con <- file(newsFile, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(con, encoding = "UTF-8", skipNul = TRUE): incomplete final
## line found on 'en_US.news.txt'
close(con)

#Setup the twitter file
twitterFile <- "en_US.twitter.txt"
con <- file(twitterFile, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

rm(con)
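
Since the three reads share the same open/read/close pattern, they could also be wrapped in a small helper function; the sketch below is only illustrative, and readTextFile is a name introduced here rather than part of the original script.

#optional helper to avoid repeating the connection handling
readTextFile <- function(path) {
  con <- file(path, open = "r")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

#for example: blogs <- readTextFile(blogsFile)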

Basic Data Summary

Using the data, we will provide a quick summary of each file: file size, number of lines, word count, and words per line (minimum, maximum, and mean).

library(stringi)
#File Size
filesize_MB=round(file.info(c(blogsFile,newsFile,twitterFile))$size/1024^2)

##Word Counts
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]


##Line Counts
numLines=sapply(list(blogs,news,twitter),length)

##Data Tables - summarize the minimum, maximum, and mean words per line
wplInfo=sapply(list(blogs,news,twitter),function(x)summary(stri_count_words(x))[c('Min.','Max.','Mean')])
rownames(wplInfo)=c('WPL.Min','WPL.Max','WPL.AVG')

DataTable=data.frame(File=c("blogs.txt","news.txt","twitter.txt"),FileSize=paste(filesize_MB,"MB"),Lines=numLines,Words=numWords,t(round(wplInfo)))

DataTable
##          File FileSize   Lines    Words WPL.Min WPL.Max WPL.AVG
## 1   blogs.txt   200 MB  899288 37570839       0    6726      42
## 2    news.txt   196 MB   77259  2651432       1    1123      35
## 3 twitter.txt   159 MB 2360148 30451170       1      47      13

Cleaning and Processing the Data

Next we take a small sample of about 5% of the full data set for this initial analysis. The cleaning steps convert all text to lower case and remove punctuation, numbers, URLs, Twitter handles, and email addresses. We then build a corpus from the cleaned sample and plot histograms of unigrams, bigrams, and trigrams to examine the most frequently used words and word combinations.

set.seed(101120)


#create sample sizes of the data sets
sampleSize=0.05
sampleblogs=sample(blogs,round(length(blogs)*sampleSize),replace=FALSE)
samplenews=sample(news,round(length(news)*sampleSize),replace=FALSE)
sampletwitter=sample(twitter,round(length(twitter)*sampleSize),replace=FALSE)

#remove non-ASCII characters from the sample data sets
sampleblogs=iconv(sampleblogs,"latin1","ASCII",sub="")
samplenews=iconv(samplenews,"latin1","ASCII",sub="")
sampletwitter=iconv(sampletwitter,"latin1","ASCII",sub="")

#combine all 3 data sets into one large dataset
sampledata=c(sampleblogs,samplenews,sampletwitter)
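
As a quick check, the combined sample should contain roughly 5% of the total line count reported above; the exact number depends on the random draw.

#number of lines in the combined 5% sample
length(sampledata)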

#create the corpus
library(tm)
## Loading required package: NLP
#function to perform all the cleaning steps at once
createCorpus=function(dataSet){
  docs=VCorpus(VectorSource(dataSet))
  toSpace=content_transformer(function(x,pattern)gsub(pattern," ",x))
  docs=tm_map(docs,toSpace,"(f|ht)tps?://[^[:space:]]+")                          #URLs
  docs=tm_map(docs,toSpace,"[[:alnum:]._%+-]+@[[:alnum:].-]+\\.[[:alpha:]]{2,}")  #email addresses
  docs=tm_map(docs,toSpace,"@[[:alnum:]_]+")                                      #Twitter handles
  docs=tm_map(docs,removeNumbers)
  docs=tm_map(docs,removePunctuation)
  docs=tm_map(docs,content_transformer(tolower))
  docs=tm_map(docs,removeWords,stopwords("english"))
  docs=tm_map(docs,stripWhitespace)
  return(docs)
}

#apply the function and build the corpus
sampleCorpus=createCorpus(sampledata)
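
To confirm that the cleaning steps behaved as intended, a few documents of the corpus can be inspected directly; the exact text will vary with the sample.

#inspect the first few cleaned documents
inspect(sampleCorpus[1:3])
#or view the text of a single document
content(sampleCorpus[[1]])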

Histograms of Unigrams, Bigrams, and Trigrams

library(RWeka)

#Tokenizing Functions
uniTokenizer=function(x) NGramTokenizer(x, Weka_control(min=1,max=1))
biTokenizer=function(x) NGramTokenizer(x, Weka_control(min=2,max=2))
triTokenizer=function(x) NGramTokenizer(x, Weka_control(min=3,max=3))
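
As a quick illustration of what these tokenizers return, applying the bigram tokenizer to a toy string should produce its consecutive two-word sequences; this is only an illustration, not part of the analysis.

#the four consecutive bigrams of the toy phrase
biTokenizer("this is a short test")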

#Creating Term Document Matrices for the Corpus
Unigram=TermDocumentMatrix(sampleCorpus,control=list(tokenize=uniTokenizer))
Bigram=TermDocumentMatrix(sampleCorpus,control=list(tokenize=biTokenizer))
Trigram=TermDocumentMatrix(sampleCorpus,control=list(tokenize=triTokenizer))

#Remove sparse terms and compute sorted term frequencies
Unigram=sort(rowSums(as.matrix(removeSparseTerms(Unigram,0.99))),decreasing = TRUE)
Bigram=sort(rowSums(as.matrix(removeSparseTerms(Bigram,0.999))),decreasing = TRUE)
Trigram=sort(rowSums(as.matrix(removeSparseTerms(Trigram,0.9999))),decreasing = TRUE)

#build frequency data frames for the most common n-grams
Unigramfreq=data.frame(word=names(Unigram),freq=Unigram)
Bigramfreq=data.frame(word=names(Bigram),freq=Bigram)
Trigramfreq=data.frame(word=names(Trigram),freq=Trigram)
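
Before plotting, it is worth glancing at the top of each frequency table; the exact counts will vary with the random sample.

#top rows of each n-gram frequency table
head(Unigramfreq, 10)
head(Bigramfreq, 10)
head(Trigramfreq, 10)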

#Histograms
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
#Unigrams
U=ggplot(Unigramfreq[1:25,],aes(x=reorder(word,-freq),y=freq))
U=U + geom_bar(stat = "identity", fill = I("grey50"))
U=U + geom_text(aes(label = freq ), vjust = -0.20, size = 3)
U=U+xlab("")
U=U+ylab("Frequency")
U=U+theme(axis.text.x=element_text(hjust=1.0,angle=90))
U=U+ggtitle("Top 25 Common Unigrams")
print(U)

#Bigrams
B=ggplot(Bigramfreq[1:25,],aes(x=reorder(word,-freq),y=freq))
B=B + geom_bar(stat = "identity", fill = I("grey50"))
B=B + geom_text(aes(label = freq ), vjust = -0.20, size = 3)
B=B+xlab("")
B=B+ylab("Frequency")
B=B+theme(axis.text.x=element_text(hjust=1.0,angle=90))
B=B+ggtitle("Top 25 Common Bigrams")
print(B)

#Trigrams
Tri=ggplot(Trigramfreq[1:25,],aes(x=reorder(word,-freq),y=freq))
Tri=Tri + geom_bar(stat = "identity", fill = I("grey50"))
Tri=Tri + geom_text(aes(label = freq ), vjust = -0.20, size = 3)
Tri=Tri+xlab("")
Tri=Tri+ylab("Frequency")
Tri=Tri+theme(axis.text.x=element_text(hjust=1.0,angle=90))
Tri=Tri+ggtitle("Top 25 Common Trigrams")
print(Tri)

Future Actions

The goal of the assignment is to create an app that predicts the next word given a phrase of a few words as input. The predictive algorithm will build on the n-gram models explored above, using 2-grams, 3-grams, and possibly 4-grams fitted to training and test data sets. Implemented as a set of R functions, it will take an n-gram as input and return the word that most frequently follows it in the training data. A further step will be to make sure the algorithm runs efficiently, so that a prediction is returned without excessive delay.
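
As an early illustration of the idea, a minimal lookup over the bigram and trigram frequency tables built above might look like the sketch below; predictNextWord is a hypothetical name introduced here, and the final algorithm will add a proper train/test split, 4-grams, and a more careful back-off strategy.

#minimal sketch: return the most frequent continuation found in the trigram
#table, falling back to the bigram table when there is no trigram match
predictNextWord <- function(phrase, bigrams = Bigramfreq, trigrams = Trigramfreq) {
  words <- tolower(unlist(strsplit(phrase, "[[:space:]]+")))
  n <- length(words)
  if (n == 0) return(NA_character_)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigrams[grepl(paste0("^", prefix, " "), trigrams$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  hits <- bigrams[grepl(paste0("^", words[n], " "), bigrams$word), ]
  if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  NA_character_
}

#example call (the result depends on the sampled data):
#predictNextWord("thanks for the")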