In the following sections we summarize the three data files and explore their word and bigram frequencies, which will feed the prediction algorithm. The algorithm will use phrase (n-gram) frequencies (see the graphs below) to predict the probability of the next word. The Shiny app will take in a phrase and return the most probable next words with their respective probabilities; a minimal sketch of that lookup appears at the end of this report.
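The simplest version of this estimate is the maximum-likelihood bigram probability: the chance of seeing a word next is the count of the word pair divided by the count of the first word (the final model may well use longer n-grams or smoothing; this is just the core idea):

P(next | previous) = count(previous next) / count(previous)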
We have three files: Blogs, News, and Twitter.
We count the number of lines, words, and characters in each file:
library(stringr)

# Twitter
twitter <- readLines("en_US.twitter.txt")
t_lines <- length(twitter)
# Approximate word count: one word per space, plus one word per line
t_word <- sum(str_count(twitter, " ")) + t_lines
t_nchar <- sum(nchar(twitter))

# Blogs
blogs <- readLines("en_US.blogs.txt")
b_lines <- length(blogs)
b_word <- sum(str_count(blogs, " ")) + b_lines
b_nchar <- sum(nchar(blogs))

# News: read in binary mode so an embedded control character does not cut the file short
news <- readLines(file("en_US.news.txt", "rb"))
n_lines <- length(news)
n_word <- sum(str_count(news, " ")) + n_lines
n_nchar <- sum(nchar(news))
summ <- data.frame(c("blogs", "news", "twitter"),
                   c(b_lines, n_lines, t_lines),
                   c(b_word, n_word, t_word),
                   c(b_nchar, n_nchar, t_nchar))
names(summ) <- c("file", "#lines", "#words", "#char")
summ
## file #lines #words #char
## 1 blogs 899288 37334131 208361438
## 2 news 1010242 34372530 203791405
## 3 twitter 2360148 30373545 162384825
After this basic summary, we want to explore word and bigram frequencies. We use News as the example here, but the same commands were run for all three files. For word frequencies we run the following in bash:
tr 'A-Z' 'a-z' < en_US.news.txt | tr -sc 'A-Za-z' '\n' | sort | uniq -c | sort -n -r > wfNews.txt
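As a cross-check, much the same word-frequency table can be built without leaving R. This is only a sketch of the idea, not the pipeline actually used (and it can be memory-hungry on the full corpus); it assumes the news object loaded earlier:

# Sketch: approximate the bash word-frequency pipeline in R (assumes `news` from above)
words <- tolower(unlist(strsplit(news, "[^A-Za-z]+")))  # lowercase, split on non-letters
words <- words[words != ""]                             # drop empty tokens
wf_r <- sort(table(words), decreasing = TRUE)           # most frequent first
head(wf_r, 10)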
For bigrams we run:
tr 'A-Z' 'a-z' < en_US.news.txt | tr -sc 'A-Za-z' '\n' > newswords.txt
sed '1d' newswords.txt > newsnext.txt   # the same word stream shifted up by one
paste newswords.txt newsnext.txt | sort | uniq -c | sort -n -r > 2gNews.txt
rm newsnext.txt newswords.txt
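The pair-and-count idea behind this pipeline translates directly to R as well. Again only a sketch, assuming the words vector from the previous snippet: pair each word with its successor, then tabulate the pairs.

# Sketch: bigram counts in R (assumes `words` from the previous sketch)
w1 <- words[-length(words)]          # every word except the last
w2 <- words[-1]                      # the same stream shifted by one position
bg_r <- sort(table(paste(w1, w2)), decreasing = TRUE)
head(bg_r, 10)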
We now have word- and bigram-frequency files for each of the three sources. Loading the word frequencies back into R and plotting the most common words:
wfnews <- read.table("wfNews.txt",col.names=c("f","w"))
wfblogs <- read.table("wfBlogs.txt",col.names=c("f","w"))
wftwitter <- read.table("wfTwitter.txt",col.names=c("f","w"))
par(las=2)
barplot(head(wfblogs$f,30), names.arg=head(wfblogs$w,30), col="orange", main = "30 most frequent words in Blogs")
barplot(head(wfnews$f,30), names.arg=head(wfnews$w,30), col="orange", main = "30 most frequent words in News")
barplot(head(wftwitter$f,30), names.arg=head(wftwitter$w,30), col="orange", main = "30 most frequent words in Twitter")
And this is what we get for bigram frequency distributions:
# Read only the 1,000 most frequent bigrams from each file
bgnews <- read.table("2gNews.txt", col.names=c("f","w1","w2"), nrows=1000)
bgnews$w <- paste(bgnews$w1, bgnews$w2)   # glue the pair back into one label for plotting
bgblogs <- read.table("2gBlogs.txt", col.names=c("f","w1","w2"), nrows=1000)
bgblogs$w <- paste(bgblogs$w1, bgblogs$w2)
bgtwitter <- read.table("2gTwitter.txt", col.names=c("f","w1","w2"), nrows=1000)
bgtwitter$w <- paste(bgtwitter$w1, bgtwitter$w2)
par(las=2)
barplot(head(bgblogs$f,30), names.arg=head(bgblogs$w,30), col="orange", main = "30 most frequent bigrams in Blogs")
barplot(head(bgnews$f,30), names.arg=head(bgnews$w,30), col="orange", main = "30 most frequent bigrams in News")
barplot(head(bgtwitter$f,30), names.arg=head(bgtwitter$w,30), col="orange", main = "30 most frequent bigrams in Twitter")
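Finally, these bigram tables already contain everything needed for the simple predictor described at the start. Below is a minimal sketch of that lookup, not the final app: predict_next is an illustrative helper of our own, and since only the 1,000 most frequent bigrams were loaded, the probabilities it reports are rough. Given a word, it filters the bigrams that start with it and turns their counts into relative frequencies:

# Sketch of the bigram lookup behind the planned predictor (illustrative only)
predict_next <- function(word, bigrams, n = 5) {
  cand <- bigrams[bigrams$w1 == tolower(word), ]  # bigrams starting with `word`
  cand$p <- cand$f / sum(cand$f)                  # counts -> relative frequencies
  head(cand[order(-cand$p), c("w2", "p")], n)     # top n candidate next words
}
predict_next("new", bgnews)   # e.g. should rank "york" highly in the News data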