The goal of this exercise is to build and evaluate your first predictive model. You will combine the n-gram and backoff models built in the previous tasks, with the aim of making the model both efficient and accurate.
Tasks to accomplish
Build a predictive model based on the previous data modeling steps - you may combine the models in any way you think is appropriate.
Evaluate the model for efficiency and accuracy - use timing software to evaluate the computational complexity of your model, and evaluate its accuracy using different metrics such as perplexity and accuracy at the first, second, and third predicted word (a sketch of such an evaluation harness follows the questions below).
Questions to consider
How does the model perform for different choices of the parameters and size of the model?
How much does the model slow down for the performance you gain?
Does perplexity correlate with the other measures of accuracy?
Can you reduce the size of the model (number of parameters) without reducing performance?
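A minimal sketch of such an evaluation harness is given below. It is only an illustration: predictFun stands for any next-word prediction function (one possible backoff predictor is sketched at the end of this report) and testLines for a held-out set of sentences; neither object is created elsewhere in this report.
library(stringi)
#top-k accuracy: hide the last word of each test line and check whether it
#appears among the first, second or third predicted word
evaluateModel<-function(testLines,predictFun,k=3){
  testLines<-testLines[stri_count_words(testLines)>=2]
  hits<-matrix(FALSE,nrow=length(testLines),ncol=k)
  for(i in seq_along(testLines)){
    words<-unlist(stri_extract_all_words(tolower(testLines[i])))
    context<-paste(head(words,-1),collapse=" ")
    target<-tail(words,1)
    preds<-predictFun(context,k)
    for(j in seq_len(k))hits[i,j]<-target%in%head(preds,j)
  }
  colMeans(hits)  #accuracy at the first, second and third word
}
#timing: wall-clock cost of a batch of predictions
#system.time(evaluateModel(testLines,predictNextWord))
Perplexity would additionally require the model to return probabilities rather than a ranked list of words, so it is left out of this sketch.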
library(stringi)
library(stringr)
library(ggplot2)
library(NLP)
library(tm)
library(RWeka)
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(tau)
library(Matrix)
library(data.table)
library(parallel)
library(reshape2)
setwd("C:/Users/PK/Desktop/final/en_US")
#read in blogs and twitter datasets
blogs<-readLines("C:/Users/PK/Desktop/final/en_US/en_US.blogs.txt",encoding="UTF-8")
twitter<-readLines("C:/Users/PK/Desktop/final/en_US/en_US.twitter.txt",encoding="UTF-8")
#read in news dataset in binary mode
tmpV<-file("C:/Users/PK/Desktop/final/en_US/en_US.news.txt",open="rb")
news<-readLines(tmpV,encoding="UTF-8")
close(tmpV)
rm(tmpV)
blogs<-iconv(blogs,"latin1","ASCII",sub="")
news<-iconv(news,"latin1","ASCII",sub="")
twitter<-iconv(twitter,"latin1","ASCII",sub="")
#length of datasets
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
#words count of datasets
blogsWC<-stri_count_words(blogs)
sum(blogsWC)
## [1] 37510168
summary(blogsWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.71 60.00 6725.00
newsWC<-stri_count_words(news)
sum(newsWC)
## [1] 34749301
summary(newsWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 31.0 34.4 46.0 1796.0
twitterWC<-stri_count_words(twitter)
sum(twitterWC)
## [1] 30088564
summary(twitterWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
#number of characters of datasets
ncharblogs<-sum(nchar(blogs))
ncharblogs
## [1] 206043906
ncharnews<-sum(nchar(news))
ncharnews
## [1] 202917604
nchartwitter<-sum(nchar(twitter))
nchartwitter
## [1] 161961345
#Take a sample of 500 from each dataset
blogsSample<-sample(blogs,500,replace=FALSE)
newsSample<-sample(news,500,replace=FALSE)
twitterSample<-sample(twitter,500,replace=FALSE)
#combine samples
sample<-c(blogsSample,newsSample,twitterSample)
#load sample data as corpus
corpus<-VCorpus(VectorSource(sample))
corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1500
#Text transformation
toSpace<-content_transformer(function(x,pattern)gsub(pattern,"",x))
corpus<-tm_map(corpus,toSpace,"/")
corpus<-tm_map(corpus,toSpace,"@")
corpus<-tm_map(corpus,toSpace,"\\|")
#Cleaning the text
corpus<-tm_map(corpus,content_transformer(tolower))
corpus<-tm_map(corpus,removeNumbers)
corpus<-tm_map(corpus,removePunctuation)
corpus<-tm_map(corpus,stripWhitespace)
corpus<-tm_map(corpus,stemDocument)
It is important to note that stopwords were not removed because of their relevance to the prediction model. I hope the model will not be skewed by the presence of these stopwords.
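Because the n-gram tables below are built from lower-cased, number-free, punctuation-free, stemmed text, any phrase passed to the prediction model later needs the same normalisation. A minimal sketch of such a helper, mirroring the tm_map steps above with base R and SnowballC (the function name cleanInput is a placeholder of my own, not something defined elsewhere in this report):
library(SnowballC)
#apply the same normalisation used on the corpus to a single input phrase
cleanInput<-function(x){
  x<-iconv(x,"latin1","ASCII",sub="")
  x<-tolower(x)
  x<-gsub("[0-9]+","",x)              #remove numbers
  x<-gsub("[[:punct:]]","",x)         #remove punctuation
  x<-gsub("\\s+"," ",trimws(x))       #strip extra whitespace
  words<-unlist(strsplit(x," "))
  paste(wordStem(words,language="english"),collapse=" ")  #stem as stemDocument does
}
cleanInput("I'd like to be able to predict")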
library(NLP)
library(tm)
tdm<-TermDocumentMatrix(corpus)
tdm
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 138 21 340 372 387 478 494 569 798 930
## and 8 3 4 3 3 4 7 6 3 11
## but 2 0 1 1 0 4 1 1 2 3
## for 0 1 3 3 2 1 2 1 3 2
## have 0 1 0 4 0 5 0 0 1 2
## that 6 3 5 6 3 1 9 1 5 4
## the 12 6 6 8 13 6 14 11 11 10
## this 0 0 1 5 0 0 4 0 1 1
## was 0 3 6 4 5 2 4 1 1 0
## with 1 1 2 1 0 0 3 2 1 1
## you 0 10 0 0 0 1 0 0 0 2
m<-as.matrix(tdm)
m1<-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(m1),freq=m1)
head(d,20)
## word freq
## the the 2121
## and and 1071
## that that 477
## for for 457
## you you 363
## was was 295
## with with 294
## have have 245
## this this 235
## but but 230
## not not 203
## are are 180
## will will 175
## they they 173
## from from 166
## said said 161
## his his 158
## your your 145
## has has 140
## one one 138
library(wordcloud)
set.seed(123)
wordcloud(words=d$word,d$freq,min.freq=10,max.words=200,random.order=FALSE,scale=c(8,0.5),
colors=brewer.pal(8,"Dark2"))
barplot(d[1:20,]$freq,las=2,names.arg=d[1:20,]$word,col="yellow",main="Most Frequent Words",
ylab="Word Frequencies")
#create a unigram
UnigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=1,max=1))
unigram<-TermDocumentMatrix(corpus,control=list(tokenize=UnigramTokenizer))
#extract frequencies of the unigram
unifreq<-sort(rowSums(as.matrix(unigram)),decreasing=TRUE)
unifreqDF<-data.frame(word1=names(unifreq),freq=unifreq)
head(unifreqDF,10)
## word1 freq
## the the 2121
## and and 1071
## that that 477
## for for 457
## you you 363
## was was 295
## with with 294
## have have 245
## this this 235
## but but 230
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigram<-TermDocumentMatrix(corpus,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
bifreq<-sort(rowSums(as.matrix(bigram)),decreasing=TRUE)
bifreqDF<-data.frame(word2=names(bifreq),freq=bifreq)
head(bifreqDF,10)
## word2 freq
## of the of the 200
## in the in the 187
## to the to the 95
## for the for the 92
## on the on the 80
## to be to be 69
## at the at the 64
## and the and the 62
## in a in a 57
## with the with the 53
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigram<-TermDocumentMatrix(corpus,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
trifreq<-sort(rowSums(as.matrix(trigram)),decreasing=TRUE)
trifreqDF<-data.frame(word3=names(trifreq),freq=trifreq)
head(trifreqDF,10)
## word3 freq
## one of the one of the 16
## a lot of a lot of 12
## i want to i want to 12
## be abl to be abl to 10
## part of the part of the 10
## thank for the thank for the 9
## if you have if you have 8
## it was a it was a 8
## out of the out of the 8
## to be a to be a 8
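To turn these frequency tables into something the predictive model can query quickly, each n-gram can be split into its prefix (the words typed so far) and the word that follows, and very rare n-grams can be dropped to keep the model small, which speaks to the question above about reducing the number of parameters. A minimal sketch with data.table (already loaded above); the object names bi2 and tri3 are my own:
library(data.table)
#split bigrams into prefix + next word
bi2<-as.data.table(bifreqDF)
bi2[,c("prefix","nextword"):=tstrsplit(as.character(word2)," ",keep=1:2)]
bi2<-bi2[freq>1]                #optional cut: trades coverage for a smaller model
setkey(bi2,prefix)
#split trigrams into a two-word prefix + next word
tri3<-as.data.table(trifreqDF)
tri3[,prefix:=sub("\\s+\\S+$","",as.character(word3))]   #first two words
tri3[,nextword:=sub("^.*\\s","",as.character(word3))]    #last word
tri3<-tri3[freq>1]
setkey(tri3,prefix)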
set.seed(123)
wordcloud(words=unifreqDF$word1,unifreqDF$freq,min.freq=10,max.words=200,random.order=FALSE,
scale=c(8,0.5),colors=brewer.pal(5,"Dark2"))
wordcloud(words=bifreqDF$word2,bifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
scale=c(3,0.5),colors=brewer.pal(5,"Dark2"))
wordcloud(words=trifreqDF$word3,trifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
scale=c(2,0.1),colors=brewer.pal(5,"Dark2"))
#barplot of the ten most frequent unigrams (bar labels are taken from names.arg)
barplot(unifreqDF[1:10,]$freq,las=2,names.arg=unifreqDF[1:10,]$word1,col="yellow",cex.names=0.8,
ylab="Frequency",main="Most Frequent Unigrams")
#barplot of the ten most frequent bigrams (bar labels are taken from names.arg)
barplot(bifreqDF[1:10,]$freq,las=2,names.arg=bifreqDF[1:10,]$word2,col="yellow",cex.names=0.8,
ylab="Frequency",main="Most Frequent Bigrams")
#barplot of the ten most frequent trigrams (bar labels are taken from names.arg)
barplot(trifreqDF[1:10,]$freq,las=2,names.arg=trifreqDF[1:10,]$word3,col="yellow",cex.names=0.7,
ylab="Frequency",main="Most Frequent Trigrams")