Introduction

The goal of this project is to perform exploratory analysis on a corpus of documents in preparation for developing and implementing a text prediction algorithm.

Tasks to accomplish

  1. Download the HC Corpora data set provided by SwiftKey and load it successfully into R (a download sketch is included after this list).
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings discovered so far.
  4. Create basic plots, such as histograms, to illustrate features of the data.
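
Before the analysis, the data set has to be downloaded and unzipped. A minimal sketch of that step is shown below; the download URL and the paths inside the zip are assumptions based on the Coursera Capstone distribution of the data and are not re-verified here.

# Download/extract sketch (URL and zip paths are assumptions)
dataurl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("en_US.blogs.txt")) {
        download.file(dataurl, destfile = "Coursera-SwiftKey.zip", mode = "wb")
        unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
              files = c("final/en_US/en_US.blogs.txt",
                        "final/en_US/en_US.news.txt",
                        "final/en_US/en_US.twitter.txt"))
}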

Load Libraries

library(stringi)    # string processing and summary statistics
library(ggplot2)    # plotting
library(tm)         # text mining (corpus handling, term-document matrices)
library(RWeka)      # Weka-based tokenizers (e.g. NGramTokenizer)
library(wordcloud)  # word-cloud plots
library(tau)        # text analysis utilities
library(Matrix)     # sparse matrices
library(data.table) # fast data manipulation
library(parallel)   # parallel processing
library(reshape2)   # data reshaping

Loading the Data Sets and Removing Non-ASCII Characters

blogfile<- readLines("en_US.blogs.txt",encoding = "UTF-8", skipNul = TRUE)
newsfile <- readLines("en_US.news.txt",encoding = "UTF-8", skipNul = TRUE)
twitterfile <- readLines("en_US.twitter.txt",encoding = "UTF-8", skipNul = TRUE)

blogs <- iconv(blogfile, "latin1", "ASCII", sub = "")    # drop non-ASCII characters
news <- iconv(newsfile, "latin1", "ASCII", sub = "")     # drop non-ASCII characters
twitter <- iconv(twitterfile, from = "latin1", to = "UTF-8", sub = "")
twitter <- stri_replace_all_regex(twitter, "\u2019|`", "'")                 # normalise curly apostrophes and backticks
twitter <- stri_replace_all_regex(twitter, "\u201c|\u201d|\u201f|``", '"')  # normalise curly double quotes

Counts of Lines, Words, and Characters

bsize <- file.size("en_US.blogs.txt")
nsize <- file.size("en_US.news.txt")
tsize <- file.size("en_US.twitter.txt")

sumcount <- matrix(c(NROW(blogfile), NROW(newsfile), NROW(twitterfile),
                     sum(nchar(blogfile)), sum(nchar(newsfile)), sum(nchar(twitterfile)),
                     bsize/1024^2, nsize/1024^2, tsize/1024^2),
                   byrow = FALSE, nrow = 3, ncol = 3,
                   dimnames = list(c("blogs", "news", "twitter"),
                                   c("Lines", "Characters", "FileSize(MB)")))
Words <- sapply(list(blogfile, newsfile, twitterfile), stri_stats_latex)['Words',]
Summarycount <- cbind(sumcount, Words)
Summarycount
##            Lines Characters FileSize(MB)    Words
## blogs     899288  206824505     200.4242 37570839
## news     1010242  203223159     196.2775 34494539
## twitter  2360148  162096241     159.3641 30451170

Creating a Training Set with 1% of the Lines from Each File

set.seed(666)   # set the seed before sampling so the training set is reproducible
factor <- 0.01
blogs1 <- sample(blogs, round(factor*length(blogs)))
news1 <- sample(news, round(factor*length(news)))
twitter1 <- sample(twitter, round(factor*length(twitter)))
BasicSummary1 <- matrix(c(NROW(blogs1),NROW(news1),NROW(twitter1)),byrow = TRUE,nrow=3,ncol=1,dimnames = list(c("blogs1","news1","twitter1"),"No.Of Rows"))
BasicSummary1
##          No.Of Rows
## blogs1         8993
## news1         10102
## twitter1      23601

Merging the sampled blogs, news, and twitter lines to create the corpus.

trainingset <- c(blogs1, news1, twitter1)
trainingcorpus <- VCorpus(VectorSource(trainingset))

Preprocessing the Corpus

pprocess <- function(document){
        document <- tm_map(document, removePunctuation)             # strip punctuation
        document <- tm_map(document, removeNumbers)                 # strip digits
        document <- tm_map(document, stripWhitespace)               # collapse repeated whitespace
        document <- tm_map(document, content_transformer(tolower))  # convert to lower case
        document <- tm_map(document, PlainTextDocument)             # coerce to plain text documents
        return(document)
}
trainingcorpus <- pprocess(trainingcorpus)

Creating Tokenizers

# ngrams() and words() come from the NLP package, attached as a dependency of tm
Onegramtokenizer <- function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
Bigramtokenizer <- function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
Trigramtokenizer <- function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

Creating Term-Document Matrices

dtm1g <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Onegramtokenizer))
dtm2g <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Bigramtokenizer))
dtm3g <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Trigramtokenizer))

We look at the most frequently occurring n-grams in our sample set; in this case, those appearing at least 50 times (findFreqTerms with lowfreq = 50).

unigramf <- findFreqTerms(dtm1g,lowfreq =50)
bigramf <- findFreqTerms(dtm2g,lowfreq = 50)
trigramf <- findFreqTerms(dtm3g,lowfreq = 50)

Plotting Frequencies For Uni-grams

Unigramfreq <- rowSums(as.matrix(dtm1g[unigramf,]))
Unigramfreq <- data.frame(word = names(Unigramfreq), frequency = Unigramfreq)

# Bar chart of the 'num' most frequent terms; reused below for the bigrams and trigrams
plotthegraph <- function(data, title, num, colour){
        df <- data[order(-data$frequency),][1:num,]
        barplot(df$frequency, las = 2, names.arg = df$word,
                col = colour, main = title,
                ylab = "Frequencies", cex.axis = 0.8)
}
par(mar = c(10,4,4,2))
plotthegraph(Unigramfreq, "Unigrams", 20, "green")
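
Since the wordcloud package is loaded above but not otherwise used, the same unigram frequencies can also be visualised as a word cloud; a minimal sketch, where the seed and the max.words cutoff are arbitrary choices:

set.seed(1234)   # arbitrary seed so the cloud layout is reproducible
wordcloud(words = Unigramfreq$word, freq = Unigramfreq$frequency,
          max.words = 100, random.order = FALSE, colors = "darkgreen")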

Plotting Frequencies For Bi-grams

Bigramfreq <- rowSums(as.matrix(dtm2g[bigramf,]))
Bigramfreq <- data.frame(word = names(Bigramfreq), frequency = Bigramfreq)
par(mar = c(10,4,4,2))
plotthegraph(Bigramfreq, "Bigrams", 20, "blue")

Plotting Frequencies For Tri-grams

Trigramfreq <- rowSums(as.matrix(dtm3g[trigramf,]))
Trigramfreq <- data.frame(word = names(Trigramfreq), frequency = Trigramfreq)
head(Trigramfreq)
##                              word frequency
## a bit of                 a bit of        78
## a couple of           a couple of        94
## a lot of                 a lot of       316
## according to the according to the        83
## all of the             all of the        69
## and i have             and i have        53
par(mar = c(10,4,4,2))
plotthegraph(Trigramfreq, "Trigrams", 20, "black")