The goal of this exercise is to build and evaluate your first predictive model. You will combine the n-gram and backoff models built in the previous tasks, with the aim of making the model both efficient and accurate.
Tasks to accomplish
Build a predictive model based on the previous data modeling steps - you may combine the models in any way you think is appropriate.
Evaluate the model for efficiency and accuracy - use timing software to evaluate the computational complexity of your model, and evaluate its accuracy using different metrics such as perplexity and accuracy at the first, second, and third predicted word (a sketch of such an evaluation harness follows the questions below).
Questions to consider
How does the model perform for different choices of the parameters and size of the model?
How much does the model slow down for the performance you gain?
Does perplexity correlate with the other measures of accuracy?
Can you reduce the size of the model (number of parameters) without reducing performance?
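A minimal sketch of such an evaluation harness is given below. It is only an illustration: predictFun stands for any next-word prediction function (one possible backoff predictor is sketched at the end of this report) and testLines for a held-out set of sentences; neither object is created elsewhere in this report.
library(stringi)
#top-k accuracy: hide the last word of each test line and check whether it
#appears among the first, second or third predicted word
evaluateModel<-function(testLines,predictFun,k=3){
  testLines<-testLines[stri_count_words(testLines)>=2]
  hits<-matrix(FALSE,nrow=length(testLines),ncol=k)
  for(i in seq_along(testLines)){
    words<-unlist(stri_extract_all_words(tolower(testLines[i])))
    context<-paste(head(words,-1),collapse=" ")
    target<-tail(words,1)
    preds<-predictFun(context,k)
    for(j in seq_len(k))hits[i,j]<-target%in%head(preds,j)
  }
  colMeans(hits)  #accuracy at the first, second and third word
}
#timing: wall-clock cost of a batch of predictions
#system.time(evaluateModel(testLines,predictNextWord))
Perplexity would additionally require the model to return probabilities rather than a ranked list of words, so it is left out of this sketch.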
library(stringi)
library(stringr)
library(ggplot2)
library(NLP)
library(tm)
library(RWeka)
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(tau)
library(Matrix)
library(data.table)
library(parallel)
library(reshape2)
setwd("C:/Users/PK/Desktop/final/en_US")
#read in blogs and twitter datasets
blogs<-readLines("C:/Users/PK/Desktop/final/en_US/en_US.blogs.txt",encoding="UTF-8")
twitter<-readLines("C:/Users/PK/Desktop/final/en_US/en_US.twitter.txt",encoding="UTF-8")
#read in news dataset in binary mode
tmpV<-file("C:/Users/PK/Desktop/final/en_US/en_US.news.txt",open="rb")
news<-readLines(tmpV,encoding="UTF-8")
close(tmpV)
rm(tmpV)
blogs<-iconv(blogs,"latin1","ASCII",sub="")
news<-iconv(news,"latin1","ASCII",sub="")
twitter<-iconv(twitter,"latin1","ASCII",sub="")
#length of datasets
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
#words count of datasets
blogsWC<-stri_count_words(blogs)
sum(blogsWC)
## [1] 37510168
summary(blogsWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.71 60.00 6725.00
newsWC<-stri_count_words(news)
sum(newsWC)
## [1] 34749301
summary(newsWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 31.0 34.4 46.0 1796.0
twitterWC<-stri_count_words(twitter)
sum(twitterWC)
## [1] 30088564
summary(twitterWC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
#number of characters of datasets
ncharblogs<-sum(nchar(blogs))
ncharblogs
## [1] 206043906
ncharnews<-sum(nchar(news))
ncharnews
## [1] 202917604
nchartwitter<-sum(nchar(twitter))
nchartwitter
## [1] 161961345
#Take a sample of 500 from each dataset
blogsSample<-sample(blogs,500,replace=FALSE)
newsSample<-sample(news,500,replace=FALSE)
twitterSample<-sample(twitter,500,replace=FALSE)
#combine samples
sample<-c(blogsSample,newsSample,twitterSample)
#load sample data as corpus
corpus<-VCorpus(VectorSource(sample))
corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1500
#Text transformation
toSpace<-content_transformer(function(x,pattern)gsub(pattern,"",x))
corpus<-tm_map(corpus,toSpace,"/")
corpus<-tm_map(corpus,toSpace,"@")
corpus<-tm_map(corpus,toSpace,"\\|")
#Cleaning the text
corpus<-tm_map(corpus,content_transformer(tolower))
corpus<-tm_map(corpus,removeNumbers)
corpus<-tm_map(corpus,removePunctuation)
corpus<-tm_map(corpus,stripWhitespace)
corpus<-tm_map(corpus,stemDocument)
It is important to note that stopwords were not removed because of their relevance to the prediction model. I hope the model will not be skewed by the presence of these stopwords.
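Because the n-gram tables below are built from lower-cased, number-free, punctuation-free, stemmed text, any phrase passed to the prediction model later needs the same normalisation. A minimal sketch of such a helper, mirroring the tm_map steps above with base R and SnowballC (the function name cleanInput is a placeholder of my own, not something defined elsewhere in this report):
library(SnowballC)
#apply the same normalisation used on the corpus to a single input phrase
cleanInput<-function(x){
  x<-iconv(x,"latin1","ASCII",sub="")
  x<-tolower(x)
  x<-gsub("[0-9]+","",x)              #remove numbers
  x<-gsub("[[:punct:]]","",x)         #remove punctuation
  x<-gsub("\\s+"," ",trimws(x))       #strip extra whitespace
  words<-unlist(strsplit(x," "))
  paste(wordStem(words,language="english"),collapse=" ")  #stem as stemDocument does
}
cleanInput("I'd like to be able to predict")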
library(NLP)
library(tm)
tdm<-TermDocumentMatrix(corpus)
tdm
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 6636, documents: 1500)>>
## Non-/sparse entries: 28925/9925075
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 138 21 340 372 387 478 494 569 798 930
## and 8 3 4 3 3 4 7 6 3 11
## but 2 0 1 1 0 4 1 1 2 3
## for 0 1 3 3 2 1 2 1 3 2
## have 0 1 0 4 0 5 0 0 1 2
## that 6 3 5 6 3 1 9 1 5 4
## the 12 6 6 8 13 6 14 11 11 10
## this 0 0 1 5 0 0 4 0 1 1
## was 0 3 6 4 5 2 4 1 1 0
## with 1 1 2 1 0 0 3 2 1 1
## you 0 10 0 0 0 1 0 0 0 2
m<-as.matrix(tdm)
m1<-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(m1),freq=m1)
head(d,20)
## word freq
## the the 2121
## and and 1071
## that that 477
## for for 457
## you you 363
## was was 295
## with with 294
## have have 245
## this this 235
## but but 230
## not not 203
## are are 180
## will will 175
## they they 173
## from from 166
## said said 161
## his his 158
## your your 145
## has has 140
## one one 138
library(wordcloud)
set.seed(123)
wordcloud(words=d$word,d$freq,min.freq=10,max.words=200,random.order=FALSE,scale=c(8,0.5),
colors=brewer.pal(8,"Dark2"))
barplot(d[1:20,]$freq,las=2,names.arg=d[1:20,]$word,col="yellow",main="Most Frequent Words",
ylab="Word Frequencies")
#create a unigram
UnigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=1,max=1))
unigram<-TermDocumentMatrix(corpus,control=list(tokenize=UnigramTokenizer))
#extract frequencies of the unigram
unifreq<-sort(rowSums(as.matrix(unigram)),decreasing=TRUE)
unifreqDF<-data.frame(word1=names(unifreq),freq=unifreq)
head(unifreqDF,10)
## word1 freq
## the the 2121
## and and 1071
## that that 477
## for for 457
## you you 363
## was was 295
## with with 294
## have have 245
## this this 235
## but but 230
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigram<-TermDocumentMatrix(corpus,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
bifreq<-sort(rowSums(as.matrix(bigram)),decreasing=TRUE)
bifreqDF<-data.frame(word2=names(bifreq),freq=bifreq)
head(bifreqDF,10)
## word2 freq
## of the of the 200
## in the in the 187
## to the to the 95
## for the for the 92
## on the on the 80
## to be to be 69
## at the at the 64
## and the and the 62
## in a in a 57
## with the with the 53
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigram<-TermDocumentMatrix(corpus,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
trifreq<-sort(rowSums(as.matrix(trigram)),decreasing=TRUE)
trifreqDF<-data.frame(word3=names(trifreq),freq=trifreq)
head(trifreqDF,10)
## word3 freq
## one of the one of the 16
## a lot of a lot of 12
## i want to i want to 12
## be abl to be abl to 10
## part of the part of the 10
## thank for the thank for the 9
## if you have if you have 8
## it was a it was a 8
## out of the out of the 8
## to be a to be a 8
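To turn these frequency tables into something the predictive model can query quickly, each n-gram can be split into its prefix (the words typed so far) and the word that follows, and very rare n-grams can be dropped to keep the model small, which speaks to the question above about reducing the number of parameters. A minimal sketch with data.table (already loaded above); the object names bi2 and tri3 are my own:
library(data.table)
#split bigrams into prefix + next word
bi2<-as.data.table(bifreqDF)
bi2[,c("prefix","nextword"):=tstrsplit(as.character(word2)," ",keep=1:2)]
bi2<-bi2[freq>1]                #optional cut: trades coverage for a smaller model
setkey(bi2,prefix)
#split trigrams into a two-word prefix + next word
tri3<-as.data.table(trifreqDF)
tri3[,prefix:=sub("\\s+\\S+$","",as.character(word3))]   #first two words
tri3[,nextword:=sub("^.*\\s","",as.character(word3))]    #last word
tri3<-tri3[freq>1]
setkey(tri3,prefix)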
set.seed(123)
wordcloud(words=unifreqDF$word1,unifreqDF$freq,min.freq=10,max.words=200,random.order=FALSE,
scale=c(8,0.5),colors=brewer.pal(5,"Dark2"))
wordcloud(words=bifreqDF$word2,bifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
scale=c(3,0.5),colors=brewer.pal(5,"Dark2"))
wordcloud(words=trifreqDF$word3,trifreqDF$freq,min.freq=5,max.words=100,random.order=FALSE,
scale=c(2,0.1),colors=brewer.pal(5,"Dark2"))
#barplot of the ten most frequent unigrams (bar labels are taken from names.arg)
barplot(unifreqDF[1:10,]$freq,las=2,names.arg=unifreqDF[1:10,]$word1,col="yellow",cex.names=0.8,
ylab="Frequency",main="Most Frequent Unigrams")
#barplot of the ten most frequent bigrams (bar labels are taken from names.arg)
barplot(bifreqDF[1:10,]$freq,las=2,names.arg=bifreqDF[1:10,]$word2,col="yellow",cex.names=0.8,
ylab="Frequency",main="Most Frequent Bigrams")
#barplot of the ten most frequent trigrams (bar labels are taken from names.arg)
barplot(trifreqDF[1:10,]$freq,las=2,names.arg=trifreqDF[1:10,]$word3,col="yellow",cex.names=0.7,
ylab="Frequency",main="Most Frequent Trigrams")