The goal of this project is to perform exploratory analysis on a corpus of text documents in preparation for developing and implementing a text prediction algorithm.
library(stringi)    # string manipulation and string statistics
library(ggplot2)    # plotting
library(tm)         # text mining (also attaches NLP, used by the tokenizers below)
library(RWeka)      # Weka-based text mining utilities (similar to quanteda)
library(wordcloud)  # word-cloud plots
library(tau)        # text analysis utilities
library(Matrix)     # sparse matrices
library(data.table) # fast tabular data manipulation
library(parallel)   # parallel processing
library(reshape2)   # data reshaping
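The three en_US source files are assumed to sit in the working directory. A quick existence check (my addition, not part of the original pipeline) fails fast if they do not:
srcfiles <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
stopifnot(all(file.exists(srcfiles)))  # stop early if any source file is missing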
# Read the three English-language source files line by line
blogfile    <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
newsfile    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitterfile <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Strip non-ASCII characters from the blog and news text, and normalise
# curly quotes in the Twitter text
blogs   <- iconv(blogfile, "latin1", "ASCII", sub = "")
news    <- iconv(newsfile, "latin1", "ASCII", sub = "")
twitter <- iconv(twitterfile, from = "latin1", to = "UTF-8", sub = "")
twitter <- stri_replace_all_regex(twitter, "\u2019|`", "'")
twitter <- stri_replace_all_regex(twitter, "\u201c|\u201d|\u201f|``", '"')
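As a quick illustration of what the two substitutions do, here is their effect on a made-up string (toy input, not from the corpus):
s <- "I\u2019m \u201cvery\u201d happy"                      # toy string with curly quotes
s <- stri_replace_all_regex(s, "\u2019|`", "'")            # normalise apostrophes
stri_replace_all_regex(s, "\u201c|\u201d|\u201f|``", '"')  # normalise double quotes
## [1] "I'm \"very\" happy"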
bsize <- file.size("en_US.blogs.txt")
nsize <- file.size("en_US.news.txt")
tsize <- file.size("en_US.twitter.txt")
# Columns are filled in this order: line counts, character counts, file sizes in MB
sumcount <- matrix(c(NROW(blogfile), NROW(newsfile), NROW(twitterfile),
                     sum(nchar(blogfile)), sum(nchar(newsfile)), sum(nchar(twitterfile)),
                     bsize/1024^2, nsize/1024^2, tsize/1024^2),
                   byrow = FALSE, nrow = 3, ncol = 3,
                   dimnames = list(c("blogs", "news", "twitter"),
                                   c("Lines", "Characters", "FileSizeMB")))
Words <- sapply(list(blogfile,newsfile,twitterfile),stri_stats_latex)['Words',]
Summarycount <- cbind(sumcount, Words)
Summarycount
##           Lines Characters FileSizeMB    Words
## blogs    899288  206824505   200.4242 37570839
## news    1010242  203223159   196.2775 34494539
## twitter 2360148  162096241   159.3641 30451170
set.seed(666)       # seed before sampling so the sample is reproducible
samplefrac <- 0.01  # work with a 1% sample of each file to keep the analysis tractable
blogs1   <- sample(blogs,   round(samplefrac * length(blogs)))
news1    <- sample(news,    round(samplefrac * length(news)))
twitter1 <- sample(twitter, round(samplefrac * length(twitter)))
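An alternative worth noting: flipping a biased coin per line with rbinom() yields a sample whose size is only approximately 1%, but it extends naturally to streaming over files too large to hold in memory. A minimal sketch, assuming the same rate (blogs_alt is my own name, not used elsewhere):
keep <- as.logical(rbinom(length(blogs), size = 1, prob = samplefrac))  # one coin flip per line
blogs_alt <- blogs[keep]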
BasicSummary1 <- matrix(c(NROW(blogs1), NROW(news1), NROW(twitter1)),
                        byrow = TRUE, nrow = 3, ncol = 1,
                        dimnames = list(c("blogs1", "news1", "twitter1"), "No. of rows"))
BasicSummary1
##          No. of rows
## blogs1          8993
## news1          10102
## twitter1       23601
trainingset <- c(blogs1, news1, twitter1)
trainingcorpus <- VCorpus(VectorSource(trainingset))
# Apply standard tm cleaning steps to the corpus
pprocess <- function(document){
  document <- tm_map(document, removePunctuation)            # drop punctuation
  document <- tm_map(document, removeNumbers)                # drop digits
  document <- tm_map(document, stripWhitespace)              # collapse repeated whitespace
  document <- tm_map(document, content_transformer(tolower)) # lower-case everything
  document <- tm_map(document, PlainTextDocument)            # coerce back to plain text
  return(document)
}
trainingcorpus <- pprocess(trainingcorpus)
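It is worth spot-checking the cleaned corpus before tokenizing; a single document can be inspected like this (purely a sanity check, output will vary with the sample):
content(trainingcorpus[[1]])  # show the first cleaned document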
# n-gram tokenizers built on NLP::ngrams(), which tm attaches
Onegramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
Bigramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
Trigramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
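A quick check of the tokenizers on a toy document confirms they behave as intended (toy input, not corpus text):
Trigramtokenizer(PlainTextDocument("the quick brown fox jumps"))
## [1] "the quick brown" "quick brown fox" "brown fox jumps"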
dtm1g <- TermDocumentMatrix(trainingcorpus, control = list(tokenize = Onegramtokenizer))
dtm2g <- TermDocumentMatrix(trainingcorpus, control = list(tokenize = Bigramtokenizer))
dtm3g <- TermDocumentMatrix(trainingcorpus, control = list(tokenize = Trigramtokenizer))
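Converting a full TermDocumentMatrix with as.matrix() can exhaust memory on larger samples; removeSparseTerms() is one standard way to shrink the matrices first. Not required here, since we subset to frequent terms below, but shown as a sketch (dtm2g_small and the 0.999 threshold are my own choices):
dtm2g_small <- removeSparseTerms(dtm2g, 0.999)  # drop terms absent from >99.9% of documents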
We look at the most frequently occurring n-grams in our sample set, in this case those that occur at least 50 times.
unigramf <- findFreqTerms(dtm1g, lowfreq = 50)
bigramf  <- findFreqTerms(dtm2g, lowfreq = 50)
trigramf <- findFreqTerms(dtm3g, lowfreq = 50)
Unigramfreq <- rowSums(as.matrix(dtm1g[unigramf, ]))
Unigramfreq <- data.frame(word = names(Unigramfreq), frequency = Unigramfreq, row.names = NULL)
# One plotting helper, parameterised by bar colour and reused for all three n-gram sizes
plotthegraph <- function(data, title, num, color = "green"){
  df <- data[order(-data$frequency), ][1:num, ]  # top `num` terms by frequency
  barplot(df$frequency, las = 2, names.arg = df$word,
          col = color, main = title,
          ylab = "Frequencies", cex.axis = 0.8)
}
par(mar=c(10,4,4,2))
plotthegraph(Unigramfreq,"Unigrams",20)
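The wordcloud package was loaded above but not yet used; the same unigram frequencies can also be shown as a word cloud (illustrative, parameter choices are my own):
wordcloud(words = Unigramfreq$word, freq = Unigramfreq$frequency,
          max.words = 50, random.order = FALSE)  # most frequent words drawn largest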
Bigramfreq <- rowSums(as.matrix(dtm2g[bigramf, ]))
Bigramfreq <- data.frame(word = names(Bigramfreq), frequency = Bigramfreq, row.names = NULL)
par(mar = c(10, 4, 4, 2))
plotthegraph(Bigramfreq, "Bigrams", 20, color = "blue")
Trigramfreq <- rowSums(as.matrix(dtm3g[trigramf, ]))
Trigramfreq <- data.frame(word = names(Trigramfreq), frequency = Trigramfreq, row.names = NULL)
head(Trigramfreq)
##               word frequency
## 1         a bit of        78
## 2      a couple of        94
## 3         a lot of       316
## 4 according to the        83
## 5       all of the        69
## 6       and i have        53
par(mar = c(10, 4, 4, 2))
plotthegraph(Trigramfreq, "Trigrams", 20, color = "black")
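As a pointer toward the prediction algorithm this analysis prepares for, the trigram counts can be reshaped into a prefix-to-next-word lookup table. The sketch below uses data.table (already loaded); all column and object names are my own choices, not part of the analysis above:
# Sketch: "two-word prefix -> most frequent next word" table from the trigram counts
tri <- data.table(ngram = as.character(Trigramfreq$word),
                  freq  = Trigramfreq$frequency)
tri[, c("w1", "w2", "w3") := tstrsplit(ngram, " ", fixed = TRUE)]  # split each trigram
tri[, prefix := paste(w1, w2)]                                     # first two words are the prefix
# keep only the highest-frequency completion for each prefix
predictions <- tri[order(-freq), .SD[1], by = prefix][, .(prefix, next_word = w3, freq)]
predictions[prefix == "a lot"]  # should suggest "of", given the counts above
A full model would back off to the bigram and unigram tables when a two-word prefix has not been seen, which is where the corresponding Bigramfreq and Unigramfreq tables come in.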