In the following sections we will outline the plan for the Shiny app, give a basic count summary of the three source files, and explore word and bigram frequencies.

Plan for the Shiny App

The algorithm will use phrase (n-gram) frequencies (see the graphs below) to predict the probabilities of the next word. The Shiny app will take in a phrase and return the most probable next terms together with their respective probabilities.
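As a rough illustration of the prediction step, here is a minimal sketch (not the final implementation), assuming a bigram table with the columns f, w1 and w2 used later in this report and a hypothetical predict_next helper; the final app may use higher-order n-grams and smoothing:

# Minimal sketch of the lookup idea, assuming a bigram table like the ones
# built later in this report (f = count, w1 = first word, w2 = next word).
predict_next <- function(phrase, bigrams, n = 3) {
  last <- tolower(tail(strsplit(phrase, "\\s+")[[1]], 1))  # last word of the input
  cand <- bigrams[bigrams$w1 == last, ]                    # bigrams starting with it
  cand <- head(cand[order(-cand$f), ], n)                  # n most frequent
  data.frame(term = cand$w2,
             prob = cand$f / sum(bigrams$f[bigrams$w1 == last]))
}
# e.g. predict_next("looking forward to the", bgnews)  # hypothetical usage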

Basic Counting Summary

We have three files: Blogs, News, and Twitter.

We count the number of lines, words, and characters in each file:

library(stringr)

# Twitter: lines, words (approximated as spaces per line + 1) and characters
twitter <- readLines("en_US.twitter.txt")
t_lines <- length(twitter)
t_word  <- sum(str_count(twitter, pattern = " ")) + t_lines
t_nchar <- sum(nchar(twitter))

# Blogs
blogs <- readLines("en_US.blogs.txt")
b_lines <- length(blogs)
b_word  <- sum(str_count(blogs, pattern = " ")) + b_lines
b_nchar <- sum(nchar(blogs))

# News: opened in binary mode so that embedded control characters
# do not truncate the read
news <- readLines(file("en_US.news.txt", "rb"))
n_lines <- length(news)
n_word  <- sum(str_count(news, pattern = " ")) + n_lines
n_nchar <- sum(nchar(news))

summ <- data.frame(c("blogs", "news", "twitter"),
                   c(b_lines, n_lines, t_lines),
                   c(b_word, n_word, t_word),
                   c(b_nchar, n_nchar, t_nchar))
names(summ) <- c("file", "#lines", "#words", "#char")

summ
##      file  #lines   #words     #char
## 1   blogs  899288 37334131 208361438
## 2    news 1010242 34372530 203791405
## 3 twitter 2360148 30373545 162384825
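
As a side note, the three blocks above repeat the same steps; a small helper function (a sketch, not part of the code used to produce the table above) could compute the same summary for any file:

# Hypothetical helper: line, word and character counts for one file.
count_file <- function(path) {
  con <- file(path, "rb")   # binary mode, as used for the News file above
  on.exit(close(con))
  lines <- readLines(con)
  data.frame(file  = basename(path),
             lines = length(lines),
             words = sum(str_count(lines, " ")) + length(lines),
             chars = sum(nchar(lines)))
}
# e.g. do.call(rbind, lapply(c("en_US.blogs.txt", "en_US.news.txt",
#                              "en_US.twitter.txt"), count_file))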

Word and Bigram Frequency

After this basic summary, we want to explore word and bigram frequencies. Using News as an example (the same commands were run for all three files), we compute word frequencies with the following bash command, which lowercases the text, splits it into one word per line, and then counts and sorts the unique words:

tr 'A-Z' 'a-z' < en_US.news.txt | tr -sc 'A-Za-z' '\n' | sort | uniq -c | sort -n -r > wfNews.txt
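
For readers more comfortable with R, a rough equivalent of this pipeline (a sketch, not the code that produced wfNews.txt) would be:

# Sketch of an R equivalent: lowercase, split on non-letters, tabulate, sort.
news_tokens <- unlist(strsplit(tolower(news), "[^a-z]+"))
news_tokens <- news_tokens[news_tokens != ""]
head(sort(table(news_tokens), decreasing = TRUE), 5)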

For bigrams we run:

# one lowercased word per line
tr 'A-Z' 'a-z' < en_US.news.txt | tr -sc 'A-Za-z' '\n' > newswords.txt;
# the same list shifted up by one line, so line i holds the following word
cp newswords.txt ./tmp1.txt;
sed '1d' tmp1.txt > tmpfile; mv tmpfile newsnext.txt;
# pair each word with its successor, then count and sort the bigrams
paste newswords.txt newsnext.txt | sort | uniq -c | sort -n -r > 2gNews.txt;
rm tmp1.txt newsnext.txt newswords.txt
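
The same idea, sketched in R for comparison (again, not the code that produced 2gNews.txt):

# Sketch: pair each token with its successor, then count the pairs.
toks <- unlist(strsplit(tolower(news), "[^a-z]+"))
toks <- toks[toks != ""]
bigram_strings <- paste(head(toks, -1), tail(toks, -1))
head(sort(table(bigram_strings), decreasing = TRUE), 5)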

We now have a word-frequency file and a bigram-frequency file for each of the three sources.

# uniq -c output: frequency first (f), then the word (w)
wfnews <- read.table("wfNews.txt", col.names = c("f", "w"))
wfblogs <- read.table("wfBlogs.txt", col.names = c("f", "w"))
wftwitter <- read.table("wfTwitter.txt", col.names = c("f", "w"))
par(las = 2)  # draw axis labels perpendicular to the axes
barplot(head(wfblogs$f,30),names.arg=head(wfblogs$w,30),col="orange", main = "Frequency of first 30 words in Blogs")

(Plot: Frequency of first 30 words in Blogs)

barplot(head(wfnews$f,30),names.arg=head(wfnews$w,30),col="orange", main = "Frequency of first 30 words in News")

(Plot: Frequency of first 30 words in News)

barplot(head(wftwitter$f,30),names.arg=head(wftwitter$w,30),col="orange", main = "Frequency of first 30 words in Twitter")

(Plot: Frequency of first 30 words in Twitter)

And this is what we get for bigram frequency distributions:

# only the 1000 most frequent bigrams are read; w1 and w2 are pasted
# together into a single label (w) for plotting
bgnews <- read.table("2gNews.txt", col.names = c("f", "w1", "w2"), nrows = 1000)
bgnews$w <- paste(bgnews$w1, bgnews$w2)
bgblogs <- read.table("2gBlogs.txt", col.names = c("f", "w1", "w2"), nrows = 1000)
bgblogs$w <- paste(bgblogs$w1, bgblogs$w2)
bgtwitter <- read.table("2gTwitter.txt", col.names = c("f", "w1", "w2"), nrows = 1000)
bgtwitter$w <- paste(bgtwitter$w1, bgtwitter$w2)
par(las = 2)
barplot(head(bgblogs$f,30),names.arg=head(bgblogs$w,30),col="orange", main = "Frequency of first 30 Bigrams in Blogs")

(Plot: Frequency of first 30 Bigrams in Blogs)

barplot(head(bgnews$f,30),names.arg=head(bgnews$w,30),col="orange", main = "Frequency of first 30 Bigrams in News")

(Plot: Frequency of first 30 Bigrams in News)

barplot(head(bgtwitter$f,30),names.arg=head(bgtwitter$w,30),col="orange", main = "Frequency of first 30 Bigrams in Twitter")

(Plot: Frequency of first 30 Bigrams in Twitter)