Introduction

The goal of this exercise is to build and evaluate a first predictive text model, building on the n-gram and backoff models developed in the previous tasks. The aim is to make the model both efficient and accurate.

Load Needed Packages

library(stringi)
library(stringr)
library(ggplot2)
library(NLP)
library(tm)
library(RWeka)
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(tau)
library(Matrix)
library(data.table)
library(parallel)
library(reshape2)

Read in the data

setwd("C:/Users/PK/Desktop/final/en_US")
#read in the blogs and twitter datasets
blogs<-readLines("en_US.blogs.txt",encoding="UTF-8")
twitter<-readLines("en_US.twitter.txt",encoding="UTF-8")
#read in the news dataset in binary mode (a text-mode read can stop early at embedded control characters)
tmpV<-file("en_US.news.txt",open="rb")
news<-readLines(tmpV,encoding="UTF-8")
close(tmpV)
rm(tmpV)
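
Before reading the files, it can be useful to check how large they are. The sketch below is not part of the original run (so no output is shown); it simply applies file.info() to the three file names:

#Sketch (not run): file sizes in megabytes
#round(file.info(c("en_US.blogs.txt","en_US.news.txt","en_US.twitter.txt"))$size/1024^2,1)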

Remove non-ASCII characters

blogs<-iconv(blogs,"latin1","ASCII",sub="")
news<-iconv(news,"latin1","ASCII",sub="")
twitter<-iconv(twitter,"latin1","ASCII",sub="")
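
The iconv() calls above drop every character that cannot be represented in ASCII. An equivalent idea, shown only as a sketch and not used for the results below, is a regular-expression replacement with stringi (already loaded):

#Alternative sketch (not run): strip non-ASCII characters with stringi
#blogs<-stri_replace_all_regex(blogs,"[^\\x20-\\x7E]","")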

Summaries of datasets

#length of datasets
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
#word counts of the datasets
blogsWC<-stri_count_words(blogs)
sum(blogsWC)
## [1] 37510168
summary(blogsWC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.71   60.00 6725.00
newsWC<-stri_count_words(news)
sum(newsWC)
## [1] 34749301
summary(newsWC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    31.0    34.4    46.0  1796.0
twitterWC<-stri_count_words(twitter)
sum(twitterWC)
## [1] 30088564
summary(twitterWC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
#number of characters of datasets
ncharblogs<-sum(nchar(blogs))
ncharblogs
## [1] 206043906
ncharnews<-sum(nchar(news))
ncharnews
## [1] 202917604
nchartwitter<-sum(nchar(twitter))
nchartwitter
## [1] 161961345
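
The counts above can be gathered into one comparison table. This is only a convenience sketch built from the values already computed; its printed output is not shown here:

#Sketch: combine line, word and character counts into one table
sizeSummary<-data.frame(dataset=c("blogs","news","twitter"),
                        lines=c(length(blogs),length(news),length(twitter)),
                        words=c(sum(blogsWC),sum(newsWC),sum(twitterWC)),
                        characters=c(ncharblogs,ncharnews,nchartwitter))
#sizeSummary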

Take a sample of the datasets

#Take a random sample of 500 lines from each dataset
blogsSample<-sample(blogs,500,replace=FALSE)
newsSample<-sample(news,500,replace=FALSE)
twitterSample<-sample(twitter,500,replace=FALSE)
#combine the samples (the object is not called "sample", to avoid masking base::sample)
sampledData<-c(blogsSample,newsSample,twitterSample)
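
Note that 500 lines is well under 0.1% of each dataset. A reproducible, size-proportional alternative is sketched below; it is not run here, and the 1% fraction and the seed value are arbitrary choices for illustration:

#Alternative sketch (not run): reproducible sampling of a fixed fraction of each corpus
#set.seed(123)
#sampleFraction<-0.01
#blogsSample<-sample(blogs,round(length(blogs)*sampleFraction))
#newsSample<-sample(news,round(length(news)*sampleFraction))
#twitterSample<-sample(twitter,round(length(twitter)*sampleFraction))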

#load the sampled data as a corpus
corpus<-VCorpus(VectorSource(sampledData))
corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1500

Preprocessing and cleaning

#Text transformation: replace special characters with a space (extra spaces are collapsed later by stripWhitespace)
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
corpus<-tm_map(corpus,toSpace,"/")
corpus<-tm_map(corpus,toSpace,"@")
corpus<-tm_map(corpus,toSpace,"\\|")
#Cleaning the text
corpus<-tm_map(corpus,content_transformer(tolower))
corpus<-tm_map(corpus,removeNumbers)
corpus<-tm_map(corpus,removePunctuation)
corpus<-tm_map(corpus,stripWhitespace)
corpus<-tm_map(corpus,stemDocument)

It is important to note that stopwords were not removed, because they are exactly the kind of words a next-word prediction model needs to be able to suggest. The trade-off is that the frequency counts below are dominated by stopwords, which should be kept in mind when interpreting the plots.
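
For reference, the transformation that was deliberately skipped is the usual tm stopword removal shown below; it is included only to document the choice and is not applied to the corpus. Note also that stemDocument has already been applied, which is why stems such as "abl" appear in the trigram table later.

#Not applied: removing English stopwords in tm would look like this
#corpus<-tm_map(corpus,removeWords,stopwords("english"))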

Term Document Matrix

tdm<-TermDocumentMatrix(corpus)
tdm
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity           : 100%
## Maximal term length: 36
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity           : 100%
## Maximal term length: 36
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  138 21 340 372 387 478 494 569 798 930
##   and    8  3   4   3   3   4   7   6   3  11
##   but    2  0   1   1   0   4   1   1   2   3
##   for    0  1   3   3   2   1   2   1   3   2
##   have   0  1   0   4   0   5   0   0   1   2
##   that   6  3   5   6   3   1   9   1   5   4
##   the   12  6   6   8  13   6  14  11  11  10
##   this   0  0   1   5   0   0   4   0   1   1
##   was    0  3   6   4   5   2   4   1   1   0
##   with   1  1   2   1   0   0   3   2   1   1
##   you    0 10   0   0   0   1   0   0   0   2
m<-as.matrix(tdm)
m1<-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(m1),freq=m1)
head(d,20)
##      word freq
## the   the 2121
## and   and 1071
## that that  477
## for   for  457
## you   you  363
## was   was  295
## with with  294
## have have  245
## this this  235
## but   but  230
## not   not  203
## are   are  180
## will will  175
## they they  173
## from from  166
## said said  161
## his   his  158
## your your  145
## has   has  140
## one   one  138
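
Converting the term-document matrix with as.matrix() is fine for a 1,500-document sample, but it turns a sparse matrix into a dense one and will not scale to larger samples. A memory-friendlier alternative, sketched below but not used above, computes the row sums directly on the sparse matrix with the slam package (a dependency of tm):

#Alternative sketch (not run): frequencies without densifying the matrix
#library(slam)
#m1<-sort(row_sums(tdm),decreasing=TRUE)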

Generate wordcloud

set.seed(123)
wordcloud(words=d$word,d$freq,min.freq=10,max.words=200,random.order=FALSE,scale=c(8,0.5),
          colors=brewer.pal(8,"Dark2"))

Plot word frequencies

barplot(d[1:20,]$freq,las=2,names.arg=d[1:20,]$word,col="yellow",main="Most Frequent Words",
        ylab="Word Frequencies")

N-gram Modeling

#Create a unigram term-document matrix
UnigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=1,max=1))
unigram<-TermDocumentMatrix(corpus,control=list(tokenize=UnigramTokenizer))
#extract frequencies of the unigram
unifreq<-sort(rowSums(as.matrix(unigram)),decreasing=TRUE)
unifreqDF<-data.frame(word1=names(unifreq),freq=unifreq)
head(unifreqDF,10)
##      word1 freq
## the    the 2121
## and    and 1071
## that  that  477
## for    for  457
## you    you  363
## was    was  295
## with  with  294
## have  have  245
## this  this  235
## but    but  230
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigram<-TermDocumentMatrix(corpus,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
bifreq<-sort(rowSums(as.matrix(bigram)),decreasing=TRUE)
bifreqDF<-data.frame(word2=names(bifreq),freq=bifreq)
head(bifreqDF,10)
##             word2 freq
## of the     of the  200
## in the     in the  187
## to the     to the   95
## for the   for the   92
## on the     on the   80
## to be       to be   69
## at the     at the   64
## and the   and the   62
## in a         in a   57
## with the with the   53
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigram<-TermDocumentMatrix(corpus,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
trifreq<-sort(rowSums(as.matrix(trigram)),decreasing=TRUE)
trifreqDF<-data.frame(word3=names(trifreq),freq=trifreq)
head(trifreqDF,10)
##                       word3 freq
## one of the       one of the   16
## a lot of           a lot of   12
## i want to         i want to   12
## be abl to         be abl to   10
## part of the     part of the   10
## thank for the thank for the    9
## if you have     if you have    8
## it was a           it was a    8
## out of the       out of the    8
## to be a             to be a    8
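
To turn these frequency tables into a predictor, each bigram and trigram needs to be split into a prefix (its first word or two) and the word that follows it. A minimal sketch using stringr::word() (stringr is already loaded) follows; the prefix and nextword column names are introduced here purely for illustration:

#Split each n-gram into a prefix and the word it predicts
bifreqDF$prefix<-word(bifreqDF$word2,1)
bifreqDF$nextword<-word(bifreqDF$word2,2)
trifreqDF$prefix<-word(trifreqDF$word3,1,2)
trifreqDF$nextword<-word(trifreqDF$word3,3)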

Wordclouds of unigrams, bigrams and trigrams

set.seed(123)
wordcloud(words=unifreqDF$word1,unifreqDF$freq,min.freq=10,max.words=200,random.order=FALSE,
          scale=c(8,0.5),colors=brewer.pal(5,"Dark2"))

wordcloud(words=bifreqDF$word2,bifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
          scale=c(3,0.5),colors=brewer.pal(5,"Dark2"))

wordcloud(words=trifreqDF$word3,trifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
          scale=c(2,0.1),colors=brewer.pal(5,"Dark2"))

Frequency plots of unigrams, bigrams and trigrams

#barplot of the ten most frequent unigrams
barplot(unifreqDF[1:10,]$freq,las=2,names.arg=unifreqDF[1:10,]$word1,col="yellow",cex.names=0.8,
        ylab="Frequency",main="Most Frequent Unigrams")

#barplot of the ten most frequent bigrams
barplot(bifreqDF[1:10,]$freq,las=2,names.arg=bifreqDF[1:10,]$word2,col="yellow",cex.names=0.8,
        ylab="Frequency",main="Most Frequent Bigrams")

#barplot of the ten most frequent trigrams
barplot(trifreqDF[1:10,]$freq,las=2,names.arg=trifreqDF[1:10,]$word3,col="yellow",cex.names=0.7,
        ylab="Frequency",main="Most Frequent Trigrams")